utils/grammar/parsing/lexer.cc - chromiumos/third_party/libtextclassifier - Git at Google

 // Copyright 2020 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //

 #include "utils/grammar/parsing/lexer.h"

 namespace libtextclassifier3::grammar {

 Symbol::Type Lexer::GetSymbolType(const UnicodeText::const_iterator& it) const {
   if (unilib_.IsPunctuation(*it)) {
     return Symbol::Type::TYPE_PUNCTUATION;
   } else if (unilib_.IsDigit(*it)) {
     return Symbol::Type::TYPE_DIGITS;
   } else {
     return Symbol::Type::TYPE_TERM;
   }
 }

 void Lexer::AppendTokenSymbols(const StringPiece value, int match_offset,
                                const CodepointSpan codepoint_span,
                                std::vector<Symbol>* symbols) const {
   // Possibly split token.
   UnicodeText token_unicode = UTF8ToUnicodeText(value.data(), value.size(),
                                                 /*do_copy=*/false);
   int next_match_offset = match_offset;
   auto token_end = token_unicode.end();
   auto it = token_unicode.begin();
   Symbol::Type type = GetSymbolType(it);
   CodepointIndex sub_token_start = codepoint_span.first;
   while (it != token_end) {
     auto next = std::next(it);
     int num_codepoints = 1;
     Symbol::Type next_type;
     while (next != token_end) {
       next_type = GetSymbolType(next);
       if (type == Symbol::Type::TYPE_PUNCTUATION || next_type != type) {
         break;
       }
       ++next;
       ++num_codepoints;
     }
     symbols->emplace_back(
         type, CodepointSpan{sub_token_start, sub_token_start + num_codepoints},
         /*match_offset=*/next_match_offset,
         /*lexeme=*/
         StringPiece(it.utf8_data(), next.utf8_data() - it.utf8_data()));
     next_match_offset = sub_token_start + num_codepoints;
     it = next;
     type = next_type;
     sub_token_start = next_match_offset;
   }
 }

 }  // namespace libtextclassifier3::grammar
	// Copyright 2020 Google LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//

	#include "utils/grammar/parsing/lexer.h"

	namespace libtextclassifier3::grammar {

	Symbol::Type Lexer::GetSymbolType(const UnicodeText::const_iterator& it) const {
	if (unilib_.IsPunctuation(*it)) {
	return Symbol::Type::TYPE_PUNCTUATION;
	} else if (unilib_.IsDigit(*it)) {
	return Symbol::Type::TYPE_DIGITS;
	} else {
	return Symbol::Type::TYPE_TERM;
	}
	}

	void Lexer::AppendTokenSymbols(const StringPiece value, int match_offset,
	const CodepointSpan codepoint_span,
	std::vector<Symbol>* symbols) const {
	// Possibly split token.
	UnicodeText token_unicode = UTF8ToUnicodeText(value.data(), value.size(),
	/do_copy=/false);
	int next_match_offset = match_offset;
	auto token_end = token_unicode.end();
	auto it = token_unicode.begin();
	Symbol::Type type = GetSymbolType(it);
	CodepointIndex sub_token_start = codepoint_span.first;
	while (it != token_end) {
	auto next = std::next(it);
	int num_codepoints = 1;
	Symbol::Type next_type;
	while (next != token_end) {
	next_type = GetSymbolType(next);
	if (type == Symbol::Type::TYPE_PUNCTUATION \|\| next_type != type) {
	break;
	}
	++next;
	++num_codepoints;
	}
	symbols->emplace_back(
	type, CodepointSpan{sub_token_start, sub_token_start + num_codepoints},
	/match_offset=/next_match_offset,
	/lexeme=/
	StringPiece(it.utf8_data(), next.utf8_data() - it.utf8_data()));
	next_match_offset = sub_token_start + num_codepoints;
	it = next;
	type = next_type;
	sub_token_start = next_match_offset;
	}
	}

	} // namespace libtextclassifier3::grammar