// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/speech_recognition_engine.h"

#include <stddef.h>
#include <stdint.h>

#include <memory>

#include "base/big_endian.h"
#include "base/containers/queue.h"
#include "base/numerics/safe_conversions.h"
#include "base/run_loop.h"
#include "base/strings/utf_string_conversions.h"
#include "base/sys_byteorder.h"
#include "base/test/task_environment.h"
#include "content/browser/speech/audio_buffer.h"
#include "content/public/browser/google_streaming_api.pb.h"
#include "mojo/public/cpp/bindings/remote.h"
#include "net/base/net_errors.h"
#include "net/http/http_response_headers.h"
#include "net/http/http_util.h"
#include "services/network/public/cpp/url_loader_completion_status.h"
#include "services/network/public/cpp/weak_wrapper_shared_url_loader_factory.h"
#include "services/network/public/mojom/url_response_head.mojom.h"
#include "services/network/test/test_url_loader_factory.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/blink/public/mojom/speech/speech_recognition_error.mojom.h"
#include "third_party/blink/public/mojom/speech/speech_recognition_result.mojom.h"
using base::HostToNet32;
using base::checked_cast;
namespace content {
// Frame types for framed POST data.
static const uint32_t kFrameTypePreamble = 0;
static const uint32_t kFrameTypeRecognitionAudio = 1;
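// Each framed chunk is an 8-byte big-endian header -- the payload size
// (excluding the header itself) followed by the frame type -- and then the
// payload bytes; see ExpectFramedChunk() below.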

// Note: the terms upstream and downstream are from the point-of-view of the
// client (engine_under_test_).
class SpeechRecognitionEngineTest
: public SpeechRecognitionEngine::Delegate,
public testing::Test {
public:
SpeechRecognitionEngineTest()
: last_number_of_upstream_chunks_seen_(0U),
error_(blink::mojom::SpeechRecognitionErrorCode::kNone),
        end_of_utterance_counter_(0) {}

  // SpeechRecognitionRequestDelegate methods.
void OnSpeechRecognitionEngineResults(
const std::vector<blink::mojom::SpeechRecognitionResultPtr>& results)
override {
results_.push(mojo::Clone(results));
}
void OnSpeechRecognitionEngineEndOfUtterance() override {
++end_of_utterance_counter_;
}
void OnSpeechRecognitionEngineError(
const blink::mojom::SpeechRecognitionError& error) override {
error_ = error.code;
  }

  // testing::Test methods.
  void SetUp() override;
  void TearDown() override;

 protected:
enum DownstreamError {
DOWNSTREAM_ERROR_NONE,
DOWNSTREAM_ERROR_HTTP500,
DOWNSTREAM_ERROR_NETWORK,
DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH
};
static bool ResultsAreEqual(
const std::vector<blink::mojom::SpeechRecognitionResultPtr>& a,
const std::vector<blink::mojom::SpeechRecognitionResultPtr>& b);
static std::string SerializeProtobufResponse(
const proto::SpeechRecognitionEvent& msg);
const network::TestURLLoaderFactory::PendingRequest* GetUpstreamRequest();
const network::TestURLLoaderFactory::PendingRequest* GetDownstreamRequest();
void StartMockRecognition();
void EndMockRecognition();
void InjectDummyAudioChunk();
void ProvideMockResponseStartDownstreamIfNeeded();
void ProvideMockProtoResultDownstream(
const proto::SpeechRecognitionEvent& result);
void ProvideMockResultDownstream(
const blink::mojom::SpeechRecognitionResultPtr& result);
void ExpectResultsReceived(
const std::vector<blink::mojom::SpeechRecognitionResultPtr>& result);
void ExpectFramedChunk(const std::string& chunk, uint32_t type);

  // Reads and returns all pending upload data from |upstream_data_pipe_|,
  // initializing the pipe from |GetUpstreamRequest()| if needed.
  std::string ConsumeChunkedUploadData();

  void CloseMockDownstream(DownstreamError error);

  base::test::SingleThreadTaskEnvironment task_environment_;
network::TestURLLoaderFactory url_loader_factory_;
mojo::ScopedDataPipeProducerHandle downstream_data_pipe_;
mojo::Remote<network::mojom::ChunkedDataPipeGetter> chunked_data_pipe_getter_;
mojo::ScopedDataPipeConsumerHandle upstream_data_pipe_;
std::unique_ptr<SpeechRecognitionEngine> engine_under_test_;
size_t last_number_of_upstream_chunks_seen_;
std::string response_buffer_;
blink::mojom::SpeechRecognitionErrorCode error_;
int end_of_utterance_counter_;
base::queue<std::vector<blink::mojom::SpeechRecognitionResultPtr>> results_;
};
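
// The tests below drive a mock recognition session end to end: they start the
// engine, feed dummy audio through the chunked upstream upload, stream
// serialized proto::SpeechRecognitionEvent messages down the mock downstream,
// and verify the results, errors and shutdown behavior reported back to the
// delegate.
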
TEST_F(SpeechRecognitionEngineTest, SingleDefinitiveResult) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamRequest());
ASSERT_EQ("", ConsumeChunkedUploadData());

  // Inject some dummy audio chunks and check that a corresponding chunked
  // upload to the server is performed each time.
for (int i = 0; i < 3; ++i) {
InjectDummyAudioChunk();
ASSERT_FALSE(ConsumeChunkedUploadData().empty());
}

  // Ensure that a final (empty) audio chunk is uploaded when the audio chunks
  // end.
engine_under_test_->AudioChunksEnded();
ASSERT_FALSE(ConsumeChunkedUploadData().empty());
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Simulate a protobuf message streamed from the server containing a single
// result with two hypotheses.
std::vector<blink::mojom::SpeechRecognitionResultPtr> results;
results.push_back(blink::mojom::SpeechRecognitionResult::New());
blink::mojom::SpeechRecognitionResultPtr& result = results.back();
result->is_provisional = false;
result->hypotheses.push_back(blink::mojom::SpeechRecognitionHypothesis::New(
base::UTF8ToUTF16("hypothesis 1"), 0.1F));
result->hypotheses.push_back(blink::mojom::SpeechRecognitionHypothesis::New(
base::UTF8ToUTF16("hypothesis 2"), 0.2F));
ProvideMockResultDownstream(result);
ExpectResultsReceived(results);
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Ensure everything is closed cleanly after the downstream is closed.
CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
ASSERT_EQ(blink::mojom::SpeechRecognitionErrorCode::kNone, error_);
ASSERT_EQ(0U, results_.size());
}

TEST_F(SpeechRecognitionEngineTest, SeveralStreamingResults) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamRequest());
ASSERT_EQ("", ConsumeChunkedUploadData());
for (int i = 0; i < 4; ++i) {
InjectDummyAudioChunk();
ASSERT_NE("", ConsumeChunkedUploadData());
std::vector<blink::mojom::SpeechRecognitionResultPtr> results;
results.push_back(blink::mojom::SpeechRecognitionResult::New());
blink::mojom::SpeechRecognitionResultPtr& result = results.back();
result->is_provisional = (i % 2 == 0); // Alternate result types.
float confidence = result->is_provisional ? 0.0F : (i * 0.1F);
result->hypotheses.push_back(blink::mojom::SpeechRecognitionHypothesis::New(
base::UTF8ToUTF16("hypothesis"), confidence));
ProvideMockResultDownstream(result);
ExpectResultsReceived(results);
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
}

  // Ensure that a final (empty) audio chunk is uploaded when the audio chunks
  // end.
engine_under_test_->AudioChunksEnded();
ASSERT_NE("", ConsumeChunkedUploadData());
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Simulate a final definitive result.
std::vector<blink::mojom::SpeechRecognitionResultPtr> results;
results.push_back(blink::mojom::SpeechRecognitionResult::New());
blink::mojom::SpeechRecognitionResultPtr& result = results.back();
result->is_provisional = false;
result->hypotheses.push_back(blink::mojom::SpeechRecognitionHypothesis::New(
base::UTF8ToUTF16("The final result"), 1.0F));
ProvideMockResultDownstream(result);
ExpectResultsReceived(results);
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Ensure everything is closed cleanly after the downstream is closed.
CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
ASSERT_EQ(blink::mojom::SpeechRecognitionErrorCode::kNone, error_);
ASSERT_EQ(0U, results_.size());
}

TEST_F(SpeechRecognitionEngineTest, NoFinalResultAfterAudioChunksEnded) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamRequest());
ASSERT_EQ("", ConsumeChunkedUploadData());
// Simulate one pushed audio chunk.
InjectDummyAudioChunk();
ASSERT_NE("", ConsumeChunkedUploadData());
// Simulate the corresponding definitive result.
std::vector<blink::mojom::SpeechRecognitionResultPtr> results;
results.push_back(blink::mojom::SpeechRecognitionResult::New());
blink::mojom::SpeechRecognitionResultPtr& result = results.back();
result->hypotheses.push_back(blink::mojom::SpeechRecognitionHypothesis::New(
base::UTF8ToUTF16("hypothesis"), 1.0F));
ProvideMockResultDownstream(result);
ExpectResultsReceived(results);
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Simulate a silent downstream closure after |AudioChunksEnded|.
engine_under_test_->AudioChunksEnded();
ASSERT_NE("", ConsumeChunkedUploadData());
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
CloseMockDownstream(DOWNSTREAM_ERROR_NONE);

  // Expect an empty result, notifying that recognition ended with no actual
  // results or errors.
std::vector<blink::mojom::SpeechRecognitionResultPtr> empty_results;
ExpectResultsReceived(empty_results);
// Ensure everything is closed cleanly after the downstream is closed.
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
ASSERT_EQ(blink::mojom::SpeechRecognitionErrorCode::kNone, error_);
ASSERT_EQ(0U, results_.size());
}

// Simulate the network service repeatedly re-requesting data (due to using a
// stale socket, for instance).
TEST_F(SpeechRecognitionEngineTest, ReRequestData) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamRequest());
ASSERT_EQ("", ConsumeChunkedUploadData());
// Simulate one pushed audio chunk.
InjectDummyAudioChunk();
std::string uploaded_data = ConsumeChunkedUploadData();
ASSERT_NE(uploaded_data, ConsumeChunkedUploadData());
// The network service closes the data pipe.
upstream_data_pipe_.reset();
// Re-opening the data pipe should result in the data being re-uploaded.
ASSERT_EQ(uploaded_data, ConsumeChunkedUploadData());
// Simulate the corresponding definitive result.
std::vector<blink::mojom::SpeechRecognitionResultPtr> results;
results.push_back(blink::mojom::SpeechRecognitionResult::New());
blink::mojom::SpeechRecognitionResultPtr& result = results.back();
result->hypotheses.push_back(blink::mojom::SpeechRecognitionHypothesis::New(
base::UTF8ToUTF16("hypothesis"), 1.0F));
ProvideMockResultDownstream(result);
ExpectResultsReceived(results);
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Simulate a silent downstream closure after |AudioChunksEnded|.
engine_under_test_->AudioChunksEnded();
std::string new_uploaded_data = ConsumeChunkedUploadData();
ASSERT_NE(new_uploaded_data, ConsumeChunkedUploadData());
uploaded_data += new_uploaded_data;
// The network service closes the data pipe.
upstream_data_pipe_.reset();
// Re-opening the data pipe should result in the data being re-uploaded.
ASSERT_EQ(uploaded_data, ConsumeChunkedUploadData());
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
  // Expect an empty result, notifying that recognition ended with no actual
  // results or errors.
std::vector<blink::mojom::SpeechRecognitionResultPtr> empty_results;
ExpectResultsReceived(empty_results);
// Ensure everything is closed cleanly after the downstream is closed.
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
ASSERT_EQ(blink::mojom::SpeechRecognitionErrorCode::kNone, error_);
ASSERT_EQ(0U, results_.size());
}

TEST_F(SpeechRecognitionEngineTest, NoMatchError) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamRequest());
ASSERT_EQ("", ConsumeChunkedUploadData());
for (int i = 0; i < 3; ++i) {
InjectDummyAudioChunk();
ASSERT_NE("", ConsumeChunkedUploadData());
}
engine_under_test_->AudioChunksEnded();
ASSERT_NE("", ConsumeChunkedUploadData());
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Simulate only a provisional result.
std::vector<blink::mojom::SpeechRecognitionResultPtr> results;
results.push_back(blink::mojom::SpeechRecognitionResult::New());
blink::mojom::SpeechRecognitionResultPtr& result = results.back();
result->is_provisional = true;
result->hypotheses.push_back(blink::mojom::SpeechRecognitionHypothesis::New(
base::UTF8ToUTF16("The final result"), 0.0F));
ProvideMockResultDownstream(result);
ExpectResultsReceived(results);
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH);
// Expect an empty result.
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
std::vector<blink::mojom::SpeechRecognitionResultPtr> empty_result;
ExpectResultsReceived(empty_result);
}

TEST_F(SpeechRecognitionEngineTest, HTTPError) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamRequest());
ASSERT_EQ("", ConsumeChunkedUploadData());
InjectDummyAudioChunk();
ASSERT_NE("", ConsumeChunkedUploadData());

  // Close the downstream with an HTTP 500 error.
CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500);
// Expect a blink::mojom::SpeechRecognitionErrorCode::kNetwork error to be
// raised.
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
ASSERT_EQ(blink::mojom::SpeechRecognitionErrorCode::kNetwork, error_);
ASSERT_EQ(0U, results_.size());
}

TEST_F(SpeechRecognitionEngineTest, NetworkError) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamRequest());
ASSERT_EQ("", ConsumeChunkedUploadData());
InjectDummyAudioChunk();
ASSERT_NE("", ConsumeChunkedUploadData());

  // Close the downstream, simulating a network failure.
CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK);
// Expect a blink::mojom::SpeechRecognitionErrorCode::kNetwork error to be
// raised.
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
ASSERT_EQ(blink::mojom::SpeechRecognitionErrorCode::kNetwork, error_);
ASSERT_EQ(0U, results_.size());
}

TEST_F(SpeechRecognitionEngineTest, Stability) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamRequest());
ASSERT_EQ("", ConsumeChunkedUploadData());
// Upload a dummy audio chunk.
InjectDummyAudioChunk();
ASSERT_NE("", ConsumeChunkedUploadData());
engine_under_test_->AudioChunksEnded();

  // Simulate a protobuf message with an intermediate result that has no
  // confidence, but does have stability.
proto::SpeechRecognitionEvent proto_event;
proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
proto_result->set_stability(0.5);
  proto::SpeechRecognitionAlternative* proto_alternative =
      proto_result->add_alternative();
proto_alternative->set_transcript("foo");
ProvideMockProtoResultDownstream(proto_event);
// Set up expectations.
std::vector<blink::mojom::SpeechRecognitionResultPtr> results;
results.push_back(blink::mojom::SpeechRecognitionResult::New());
blink::mojom::SpeechRecognitionResultPtr& result = results.back();
result->is_provisional = true;
result->hypotheses.push_back(blink::mojom::SpeechRecognitionHypothesis::New(
base::UTF8ToUTF16("foo"), 0.5));
// Check that the protobuf generated the expected result.
ExpectResultsReceived(results);
// Since it was a provisional result, recognition is still pending.
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Shut down.
CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
// Since there was no final result, we get an empty "no match" result.
std::vector<blink::mojom::SpeechRecognitionResultPtr> empty_result;
ExpectResultsReceived(empty_result);
ASSERT_EQ(blink::mojom::SpeechRecognitionErrorCode::kNone, error_);
ASSERT_EQ(0U, results_.size());
}

TEST_F(SpeechRecognitionEngineTest, EndOfUtterance) {
StartMockRecognition();
ASSERT_TRUE(GetUpstreamRequest());

  // Simulate an END_OF_UTTERANCE proto event with |continuous| set to true.
SpeechRecognitionEngine::Config config;
config.continuous = true;
engine_under_test_->SetConfig(config);
proto::SpeechRecognitionEvent proto_event;
proto_event.set_endpoint(proto::SpeechRecognitionEvent::END_OF_UTTERANCE);
ASSERT_EQ(0, end_of_utterance_counter_);
ProvideMockProtoResultDownstream(proto_event);
ASSERT_EQ(0, end_of_utterance_counter_);

  // Simulate an END_OF_UTTERANCE proto event with |continuous| set to false.
config.continuous = false;
engine_under_test_->SetConfig(config);
ProvideMockProtoResultDownstream(proto_event);
ASSERT_EQ(1, end_of_utterance_counter_);
// Shut down.
CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
EndMockRecognition();
}

TEST_F(SpeechRecognitionEngineTest, SendPreamble) {
const size_t kPreambleLength = 100;
scoped_refptr<SpeechRecognitionSessionPreamble> preamble =
new SpeechRecognitionSessionPreamble();
preamble->sample_rate = 16000;
preamble->sample_depth = 2;
preamble->sample_data.assign(kPreambleLength, 0);
SpeechRecognitionEngine::Config config;
config.auth_token = "foo";
config.auth_scope = "bar";
config.preamble = preamble;
engine_under_test_->SetConfig(config);
StartMockRecognition();
ASSERT_TRUE(GetUpstreamRequest());

  // The first chunk uploaded should be the preamble.
std::string chunk = ConsumeChunkedUploadData();
ASSERT_NE("", chunk);
ExpectFramedChunk(chunk, kFrameTypePreamble);
for (int i = 0; i < 3; ++i) {
InjectDummyAudioChunk();
chunk = ConsumeChunkedUploadData();
ASSERT_NE("", chunk);
ExpectFramedChunk(chunk, kFrameTypeRecognitionAudio);
}
engine_under_test_->AudioChunksEnded();
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Simulate a protobuf message streamed from the server containing a single
  // result with one hypothesis.
std::vector<blink::mojom::SpeechRecognitionResultPtr> results;
results.push_back(blink::mojom::SpeechRecognitionResult::New());
blink::mojom::SpeechRecognitionResultPtr& result = results.back();
result->is_provisional = false;
result->hypotheses.push_back(blink::mojom::SpeechRecognitionHypothesis::New(
base::UTF8ToUTF16("hypothesis 1"), 0.1F));
ProvideMockResultDownstream(result);
ExpectResultsReceived(results);
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
// Ensure everything is closed cleanly after the downstream is closed.
CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
EndMockRecognition();
ASSERT_EQ(blink::mojom::SpeechRecognitionErrorCode::kNone, error_);
ASSERT_EQ(0U, results_.size());
}

void SpeechRecognitionEngineTest::SetUp() {
engine_under_test_.reset(new SpeechRecognitionEngine(
base::MakeRefCounted<network::WeakWrapperSharedURLLoaderFactory>(
&url_loader_factory_),
"" /* accept_language */));
engine_under_test_->set_delegate(this);
}

void SpeechRecognitionEngineTest::TearDown() {
engine_under_test_.reset();
}

const network::TestURLLoaderFactory::PendingRequest*
SpeechRecognitionEngineTest::GetUpstreamRequest() {
for (const auto& pending_request : *url_loader_factory_.pending_requests()) {
if (pending_request.request.url.spec().find("/up") != std::string::npos)
return &pending_request;
}
return nullptr;
}

const network::TestURLLoaderFactory::PendingRequest*
SpeechRecognitionEngineTest::GetDownstreamRequest() {
for (const auto& pending_request : *url_loader_factory_.pending_requests()) {
if (pending_request.request.url.spec().find("/down") != std::string::npos)
return &pending_request;
}
return nullptr;
}

// Starts recognition on the engine, ensuring that both the upstream and
// downstream requests are created.
void SpeechRecognitionEngineTest::StartMockRecognition() {
DCHECK(engine_under_test_.get());
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
engine_under_test_->StartRecognition();
ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
ASSERT_TRUE(GetUpstreamRequest());
ASSERT_TRUE(GetDownstreamRequest());
}

void SpeechRecognitionEngineTest::EndMockRecognition() {
DCHECK(engine_under_test_.get());
engine_under_test_->EndRecognition();
ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  // TODO(primiano): In order to be very pedantic we should check that both the
  // upstream and downstream URL loaders have been disposed at this time.
  // Unfortunately there seems to be no direct way to detect (in tests) whether
  // a URL loader has been freed, since loaders are not automatically
  // de-registered from the TestURLLoaderFactory on destruction.
}

void SpeechRecognitionEngineTest::InjectDummyAudioChunk() {
  // Enough data so that the encoder will output something, since we can't read
  // 0 bytes from a Mojo stream.
unsigned char dummy_audio_buffer_data[2000 * 2] = {'\0'};
scoped_refptr<AudioChunk> dummy_audio_chunk(
new AudioChunk(&dummy_audio_buffer_data[0],
sizeof(dummy_audio_buffer_data),
2 /* bytes per sample */));
DCHECK(engine_under_test_.get());
engine_under_test_->TakeAudioChunk(*dummy_audio_chunk.get());
}

void SpeechRecognitionEngineTest::ProvideMockResponseStartDownstreamIfNeeded() {
if (downstream_data_pipe_.get())
return;
const network::TestURLLoaderFactory::PendingRequest* downstream_request =
GetDownstreamRequest();
ASSERT_TRUE(downstream_request);
auto head = network::mojom::URLResponseHead::New();
std::string headers("HTTP/1.1 200 OK\n\n");
head->headers = base::MakeRefCounted<net::HttpResponseHeaders>(
net::HttpUtil::AssembleRawHeaders(headers));
downstream_request->client->OnReceiveResponse(std::move(head));
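  // Create the response body pipe; hand the consumer end to the client and
  // keep the producer end so tests can stream protobuf messages down later.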
mojo::DataPipe data_pipe;
downstream_request->client->OnStartLoadingResponseBody(
std::move(data_pipe.consumer_handle));
downstream_data_pipe_ = std::move(data_pipe.producer_handle);
}

void SpeechRecognitionEngineTest::ProvideMockProtoResultDownstream(
const proto::SpeechRecognitionEvent& result) {
ProvideMockResponseStartDownstreamIfNeeded();
ASSERT_TRUE(downstream_data_pipe_.get());
ASSERT_TRUE(downstream_data_pipe_.is_valid());
std::string response_string = SerializeProtobufResponse(result);
response_buffer_.append(response_string);
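  // Write the whole serialized response into the downstream pipe, pumping the
  // message loop whenever the pipe is full.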
uint32_t written = 0;
while (written < response_string.size()) {
uint32_t write_bytes = response_string.size() - written;
MojoResult result = downstream_data_pipe_->WriteData(
response_string.data() + written, &write_bytes,
MOJO_WRITE_DATA_FLAG_NONE);
if (result == MOJO_RESULT_OK) {
written += write_bytes;
continue;
}
if (result == MOJO_RESULT_SHOULD_WAIT) {
base::RunLoop().RunUntilIdle();
continue;
}
FAIL() << "Mojo pipe unexpectedly closed";
}
base::RunLoop().RunUntilIdle();
}

void SpeechRecognitionEngineTest::ProvideMockResultDownstream(
const blink::mojom::SpeechRecognitionResultPtr& result) {
proto::SpeechRecognitionEvent proto_event;
proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
proto_result->set_final(!result->is_provisional);
for (size_t i = 0; i < result->hypotheses.size(); ++i) {
proto::SpeechRecognitionAlternative* proto_alternative =
proto_result->add_alternative();
const blink::mojom::SpeechRecognitionHypothesisPtr& hypothesis =
result->hypotheses[i];
proto_alternative->set_confidence(hypothesis->confidence);
proto_alternative->set_transcript(base::UTF16ToUTF8(hypothesis->utterance));
}
ProvideMockProtoResultDownstream(proto_event);
base::RunLoop().RunUntilIdle();
}

void SpeechRecognitionEngineTest::CloseMockDownstream(
DownstreamError error) {
if (error == DOWNSTREAM_ERROR_HTTP500) {
    // Can't provide an HTTP error response if the consumer was already given a
    // 200 response.
ASSERT_FALSE(downstream_data_pipe_.get());
const network::TestURLLoaderFactory::PendingRequest* downstream_request =
GetDownstreamRequest();
ASSERT_TRUE(downstream_request);
auto head = network::mojom::URLResponseHead::New();
std::string headers("HTTP/1.1 500 Server Sad\n\n");
head->headers = base::MakeRefCounted<net::HttpResponseHeaders>(
net::HttpUtil::AssembleRawHeaders(headers));
downstream_request->client->OnReceiveResponse(std::move(head));
// Wait for the response to be handled.
base::RunLoop().RunUntilIdle();
return;
}
ProvideMockResponseStartDownstreamIfNeeded();
const network::TestURLLoaderFactory::PendingRequest* downstream_request =
GetDownstreamRequest();
ASSERT_TRUE(downstream_request);
network::URLLoaderCompletionStatus status;
status.decoded_body_length = response_buffer_.size();
status.error_code =
(error == DOWNSTREAM_ERROR_NETWORK) ? net::ERR_FAILED : net::OK;
downstream_request->client->OnComplete(status);
downstream_data_pipe_.reset();
// Wait for the completion events to be handled.
base::RunLoop().RunUntilIdle();
}

void SpeechRecognitionEngineTest::ExpectResultsReceived(
const std::vector<blink::mojom::SpeechRecognitionResultPtr>& results) {
  ASSERT_GE(results_.size(), 1U);
ASSERT_TRUE(ResultsAreEqual(results, results_.front()));
results_.pop();
}

bool SpeechRecognitionEngineTest::ResultsAreEqual(
const std::vector<blink::mojom::SpeechRecognitionResultPtr>& a,
const std::vector<blink::mojom::SpeechRecognitionResultPtr>& b) {
if (a.size() != b.size())
return false;
auto it_a = a.begin();
auto it_b = b.begin();
for (; it_a != a.end() && it_b != b.end(); ++it_a, ++it_b) {
if ((*it_a)->is_provisional != (*it_b)->is_provisional ||
(*it_a)->hypotheses.size() != (*it_b)->hypotheses.size()) {
return false;
}
for (size_t i = 0; i < (*it_a)->hypotheses.size(); ++i) {
const blink::mojom::SpeechRecognitionHypothesisPtr& hyp_a =
(*it_a)->hypotheses[i];
const blink::mojom::SpeechRecognitionHypothesisPtr& hyp_b =
(*it_b)->hypotheses[i];
if (hyp_a->utterance != hyp_b->utterance ||
hyp_a->confidence != hyp_b->confidence) {
return false;
}
}
}
return true;
}

void SpeechRecognitionEngineTest::ExpectFramedChunk(
const std::string& chunk, uint32_t type) {
uint32_t value;
base::ReadBigEndian(&chunk[0], &value);
EXPECT_EQ(chunk.size() - 8, value);
base::ReadBigEndian(&chunk[4], &value);
EXPECT_EQ(type, value);
}

std::string SpeechRecognitionEngineTest::ConsumeChunkedUploadData() {
std::string result;
base::RunLoop().RunUntilIdle();
if (!upstream_data_pipe_.get()) {
if (!chunked_data_pipe_getter_) {
const network::TestURLLoaderFactory::PendingRequest* upstream_request =
GetUpstreamRequest();
EXPECT_TRUE(upstream_request);
EXPECT_TRUE(upstream_request->request.request_body);
EXPECT_EQ(1u, upstream_request->request.request_body->elements()->size());
auto& element =
(*upstream_request->request.request_body->elements_mutable())[0];
if (element.type() != network::DataElement::Tag::kChunkedDataPipe) {
ADD_FAILURE() << "element type mismatch";
return "";
}
chunked_data_pipe_getter_.Bind(
element.As<network::DataElementChunkedDataPipe>()
.ReleaseChunkedDataPipeGetter());
}
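    // (Re)start reading the chunked upload; the engine re-sends any previously
    // buffered data into the newly opened pipe (see the ReRequestData test).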
mojo::DataPipe data_pipe;
chunked_data_pipe_getter_->StartReading(
std::move(data_pipe.producer_handle));
upstream_data_pipe_ = std::move(data_pipe.consumer_handle);
}
EXPECT_TRUE(upstream_data_pipe_.is_valid());
std::string out;
while (true) {
base::RunLoop().RunUntilIdle();
const void* data;
uint32_t num_bytes;
MojoResult result = upstream_data_pipe_->BeginReadData(
&data, &num_bytes, MOJO_READ_DATA_FLAG_NONE);
if (result == MOJO_RESULT_OK) {
out.append(static_cast<const char*>(data), num_bytes);
upstream_data_pipe_->EndReadData(num_bytes);
continue;
}
if (result == MOJO_RESULT_SHOULD_WAIT)
break;
ADD_FAILURE() << "Mojo pipe unexpectedly closed";
break;
}
return out;
}

std::string SpeechRecognitionEngineTest::SerializeProtobufResponse(
const proto::SpeechRecognitionEvent& msg) {
std::string msg_string;
msg.SerializeToString(&msg_string);
  // Prepend a 4-byte big-endian length prefix to the protobuf message, as
  // required by the Google streaming recognition web service protocol.
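  // For example, a 3-byte serialized message {0x01, 0x02, 0x03} goes over the
  // wire as {0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0x03}.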
uint32_t prefix = HostToNet32(checked_cast<uint32_t>(msg_string.size()));
msg_string.insert(0, reinterpret_cast<char*>(&prefix), sizeof(prefix));
return msg_string;
}

}  // namespace content