// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/omnibox/browser/on_device_tail_tokenizer.h"
#include <string>
#include <vector>
#include "base/files/file_path.h"
#include "base/files/file_util.h"
#include "base/path_service.h"
#include "base/strings/utf_string_conversions.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
using ::testing::ElementsAreArray;
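
// Exercises OnDeviceTailTokenizer against the small test vocabularies checked
// in under components/test/data/omnibox.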
class OnDeviceTailTokenizerTest : public ::testing::Test {
 public:
  OnDeviceTailTokenizerTest() { tokenizer_.Reset(); }

 protected:
  // Loads the given vocabulary file from the test data directory and checks
  // that the tokenizer initializes successfully.
  void InitializeTokenizer(const std::string& filename) {
    base::FilePath file_path;
    base::PathService::Get(base::DIR_SRC_TEST_DATA_ROOT, &file_path);
    file_path =
        file_path.AppendASCII("components/test/data/omnibox/" + filename);
    ASSERT_TRUE(base::PathExists(file_path));
    tokenizer_.Init(file_path);
    EXPECT_TRUE(tokenizer_.IsReady());
  }

  OnDeviceTailTokenizer tokenizer_;
};

TEST_F(OnDeviceTailTokenizerTest, IsTokenPrintable) {
  InitializeTokenizer("vocab_test.txt");
  // In the test vocabulary, single-character ASCII tokens use their character
  // code as the token ID (e.g. 33 is "!"), and IDs such as 260 map to
  // multi-character tokens, so both should be printable. IDs 1 and 257 are
  // presumably reserved control/sentinel tokens, and 600 lies beyond the
  // vocabulary, so none of them should be printable.
  EXPECT_TRUE(tokenizer_.IsTokenPrintable(33));
  EXPECT_TRUE(tokenizer_.IsTokenPrintable(260));
  EXPECT_FALSE(tokenizer_.IsTokenPrintable(1));
  EXPECT_FALSE(tokenizer_.IsTokenPrintable(257));
  EXPECT_FALSE(tokenizer_.IsTokenPrintable(600));
}

TEST_F(OnDeviceTailTokenizerTest, CreatePrefixTokenization) {
  {
    SCOPED_TRACE("Test for ASCII vocab #1.");
    InitializeTokenizer("vocab_test.txt");
    OnDeviceTailTokenizer::Tokenization tokenization;
    // Expect tokens ["n", "j", " ", "do", "c"].
    // See OnDeviceTailTokenizer::EncodeRawString for details and simplified
    // examples of how ID sequences are determined.
    tokenizer_.CreatePrefixTokenization("nj doc", &tokenization);
    // The leading ID 257 is presumably a non-printable begin-of-query
    // sentinel (see the IsTokenPrintable test above); the trailing token "c"
    // is ambiguous, so it is held out as the constraint prefix instead of
    // being included in unambiguous_ids.
    EXPECT_THAT(tokenization.unambiguous_ids,
                ElementsAreArray({257, 110, 106, 32, 297}));
    EXPECT_EQ("c", tokenization.constraint_prefix);
    EXPECT_EQ("nj do", tokenization.unambiguous_prefix);
  }
  {
    SCOPED_TRACE("Test for ASCII vocab #2.");
    InitializeTokenizer("vocab_test.txt");
    OnDeviceTailTokenizer::Tokenization tokenization;
    // Expect tokens ["re", "mi", "t", "ly", " ", "log", "in"].
    tokenizer_.CreatePrefixTokenization("remitly login", &tokenization);
    EXPECT_THAT(tokenization.unambiguous_ids,
                ElementsAreArray({257, 414, 366, 116, 363, 32, 521}));
    EXPECT_EQ("in", tokenization.constraint_prefix);
    EXPECT_EQ("remitly log", tokenization.unambiguous_prefix);
  }
  {
    SCOPED_TRACE("Test for ASCII vocab #3.");
    InitializeTokenizer("vocab_test.txt");
    OnDeviceTailTokenizer::Tokenization tokenization;
    // Expect tokens
    // ["us", " ", "pa", "ss", "po", "rt", " ", "ap", "pl", "ica", "tio", "n"].
    tokenizer_.CreatePrefixTokenization("us passport application",
                                        &tokenization);
    EXPECT_THAT(tokenization.unambiguous_ids,
                ElementsAreArray({257, 456, 32, 402, 434, 407, 424, 32, 270,
                                  406, 507, 549}));
    EXPECT_EQ("n", tokenization.constraint_prefix);
    EXPECT_EQ("us passport applicatio", tokenization.unambiguous_prefix);
  }
  {
    SCOPED_TRACE("Test for i18n languages.");
    InitializeTokenizer("vocab_i18n_test.txt");
    OnDeviceTailTokenizer::Tokenization tokenization;
    // Expect tokens
    // ["us", "ल्", "वावि", "てる", "a", "वा"].
    tokenizer_.CreatePrefixTokenization("usल्वाविてるaवा", &tokenization);
    EXPECT_THAT(tokenization.unambiguous_ids,
                ElementsAreArray({257, 259, 260, 263, 264, 97}));
    EXPECT_EQ("वा", tokenization.constraint_prefix);
    EXPECT_EQ("usल्वाविてるa", tokenization.unambiguous_prefix);
  }
}
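
// A hedged sketch, not part of the original suite: given the expectations
// above (e.g. "nj do" + "c" == "nj doc"), the unambiguous prefix and the
// constraint prefix are assumed to jointly reassemble the raw input.
TEST_F(OnDeviceTailTokenizerTest, PrefixTokenizationPartsCoverInput) {
  InitializeTokenizer("vocab_test.txt");
  OnDeviceTailTokenizer::Tokenization tokenization;
  tokenizer_.CreatePrefixTokenization("nj doc", &tokenization);
  EXPECT_EQ("nj doc",
            tokenization.unambiguous_prefix + tokenization.constraint_prefix);
}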

TEST_F(OnDeviceTailTokenizerTest, TokenizePrevQuery) {
  {
    SCOPED_TRACE("Test for ASCII vocab #1.");
    InitializeTokenizer("vocab_test.txt");
    OnDeviceTailTokenizer::TokenIds token_ids;
    tokenizer_.TokenizePrevQuery("facebook", &token_ids);
    // Expect tokens: ["fa", "ce", "bo", "ok"].
    EXPECT_EQ(4u, token_ids.size());
    EXPECT_THAT(token_ids, ElementsAreArray({317, 285, 281, 390}));
    EXPECT_EQ("fa", tokenizer_.IdToToken(token_ids[0]));
  }
  {
    SCOPED_TRACE("Test for ASCII vocab #2.");
    InitializeTokenizer("vocab_test.txt");
    OnDeviceTailTokenizer::TokenIds token_ids;
    tokenizer_.TokenizePrevQuery("matching gym outfits", &token_ids);
    // Expect tokens:
    // ["ma", "t", "chi", "ng", " ", "g", "y", "m", " ", "out", "fi", "ts"].
    EXPECT_EQ(12u, token_ids.size());
    EXPECT_THAT(token_ids, ElementsAreArray({364, 116, 488, 375, 32, 103, 121,
                                             109, 32, 533, 320, 443}));
    EXPECT_EQ("ma", tokenizer_.IdToToken(token_ids[0]));
  }
  {
    SCOPED_TRACE("Test for i18n languages.");
    InitializeTokenizer("vocab_i18n_test.txt");
    OnDeviceTailTokenizer::TokenIds token_ids;
    tokenizer_.TokenizePrevQuery("usल्वाविてるaवा", &token_ids);
    // Expect tokens:
    // ["us", "ल्", "वावि", "てる", "a", "वा"].
    EXPECT_EQ(6u, token_ids.size());
    EXPECT_THAT(token_ids, ElementsAreArray({259, 260, 263, 264, 97, 261}));
    EXPECT_EQ("us", tokenizer_.IdToToken(token_ids[0]));
  }
}
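
// A hedged round-trip sketch, not in the original file: assuming IdToToken
// returns the exact substring each printable token covers (as the "fa" and
// "ma" checks above suggest), concatenating the decoded tokens of a previous
// query should reproduce the raw query string.
TEST_F(OnDeviceTailTokenizerTest, TokenizePrevQueryRoundTrip) {
  InitializeTokenizer("vocab_test.txt");
  OnDeviceTailTokenizer::TokenIds token_ids;
  tokenizer_.TokenizePrevQuery("facebook", &token_ids);
  std::string reconstructed;
  for (const auto id : token_ids) {
    reconstructed += tokenizer_.IdToToken(id);
  }
  EXPECT_EQ("facebook", reconstructed);
}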