// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "content/browser/speech/google_one_shot_remote_engine.h"

#include <stddef.h>
#include <stdint.h>

#include <vector>

| #include "base/json/json_reader.h" |
| #include "base/strings/string_number_conversions.h" |
| #include "base/strings/string_util.h" |
| #include "base/values.h" |
| #include "content/browser/speech/audio_buffer.h" |
| #include "content/public/common/speech_recognition_error.h" |
| #include "content/public/common/speech_recognition_result.h" |
| #include "google_apis/google_api_keys.h" |
| #include "net/base/escape.h" |
| #include "net/base/load_flags.h" |
| #include "net/url_request/http_user_agent_settings.h" |
| #include "net/url_request/url_fetcher.h" |
| #include "net/url_request/url_request_context.h" |
| #include "net/url_request/url_request_context_getter.h" |
| #include "net/url_request/url_request_status.h" |

namespace content {
namespace {

const char* const kDefaultSpeechRecognitionUrl =
    "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";
const char* const kStatusString = "status";
const char* const kHypothesesString = "hypotheses";
const char* const kUtteranceString = "utterance";
const char* const kConfidenceString = "confidence";
const int kWebServiceStatusNoError = 0;
const int kWebServiceStatusNoSpeech = 4;
const int kWebServiceStatusNoMatch = 5;

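// Shape of a successful response, inferred from the parsing code below (an
// illustrative sketch, not an official schema):
//
//   {
//     "status": 0,
//     "hypotheses": [
//       {"utterance": "hello world", "confidence": 0.9}
//     ]
//   }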
bool ParseServerResponse(const std::string& response_body,
                         SpeechRecognitionResult* result,
                         SpeechRecognitionError* error) {
  if (response_body.empty()) {
    LOG(WARNING) << "ParseServerResponse: Response was empty.";
    return false;
  }
  DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;

  // Parse the response, ignoring comments.
  std::string error_msg;
  scoped_ptr<base::Value> response_value =
      base::JSONReader::ReadAndReturnError(response_body, base::JSON_PARSE_RFC,
                                           NULL, &error_msg);
  if (response_value == NULL) {
    LOG(WARNING) << "ParseServerResponse: JSONReader failed: " << error_msg;
    return false;
  }

  if (!response_value->IsType(base::Value::TYPE_DICTIONARY)) {
    DVLOG(1) << "ParseServerResponse: Unexpected response type "
             << response_value->GetType();
    return false;
  }
  const base::DictionaryValue* response_object =
      static_cast<const base::DictionaryValue*>(response_value.get());

  // Get the status.
  int status;
  if (!response_object->GetInteger(kStatusString, &status)) {
    DVLOG(1) << "ParseServerResponse: " << kStatusString
             << " is not a valid integer value.";
    return false;
  }

  // Process the status.
  switch (status) {
    case kWebServiceStatusNoError:
      break;
    case kWebServiceStatusNoSpeech:
      error->code = SPEECH_RECOGNITION_ERROR_NO_SPEECH;
      return false;
    case kWebServiceStatusNoMatch:
      error->code = SPEECH_RECOGNITION_ERROR_NO_MATCH;
      return false;
    default:
      error->code = SPEECH_RECOGNITION_ERROR_NETWORK;
      // Other status codes should not be returned by the server.
      DVLOG(1) << "ParseServerResponse: unexpected status code " << status;
      return false;
  }

  // Get the hypotheses.
  const base::Value* hypotheses_value = NULL;
  if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
    DVLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";
    return false;
  }

  DCHECK(hypotheses_value);
  if (!hypotheses_value->IsType(base::Value::TYPE_LIST)) {
    DVLOG(1) << "ParseServerResponse: Unexpected hypotheses type "
             << hypotheses_value->GetType();
    return false;
  }

  const base::ListValue* hypotheses_list =
      static_cast<const base::ListValue*>(hypotheses_value);

  // For now we support only one-shot recognition, so we return a single final
  // result, consisting of one fragment (with one or more hypotheses).
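  // |index| is declared outside the loop so the check after the loop can tell
  // whether we broke out early on a malformed hypothesis.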
  size_t index = 0;
  for (; index < hypotheses_list->GetSize(); ++index) {
    const base::Value* hypothesis = NULL;
    if (!hypotheses_list->Get(index, &hypothesis)) {
      LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";
      break;
    }
    DCHECK(hypothesis);
    if (!hypothesis->IsType(base::Value::TYPE_DICTIONARY)) {
      LOG(WARNING) << "ParseServerResponse: Unexpected value type "
                   << hypothesis->GetType();
      break;
    }

    const base::DictionaryValue* hypothesis_value =
        static_cast<const base::DictionaryValue*>(hypothesis);
    base::string16 utterance;

    if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
      LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
      break;
    }

    // It is not an error if the 'confidence' field is missing.
    double confidence = 0.0;
    hypothesis_value->GetDouble(kConfidenceString, &confidence);
    result->hypotheses.push_back(
        SpeechRecognitionHypothesis(utterance, confidence));
  }

  if (index < hypotheses_list->GetSize()) {
    result->hypotheses.clear();
    return false;
  }
  return true;
}

}  // namespace

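// Desired duration of each audio chunk (see GetDesiredAudioChunkDurationMs());
// also used to size the trailing silence packet in AudioChunksEnded().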
const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;

GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
    net::URLRequestContextGetter* context)
    : url_context_(context) {
}

GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}

void GoogleOneShotRemoteEngine::SetConfig(
    const SpeechRecognitionEngineConfig& config) {
  config_ = config;
}

void GoogleOneShotRemoteEngine::StartRecognition() {
  DCHECK(delegate());
  DCHECK(!url_fetcher_.get());
  std::string lang_param = config_.language;

  if (lang_param.empty() && url_context_.get()) {
    // If no language is provided, use the first entry from the accept-language
    // list; if that list is empty too, fall back to "en-US" below.
    // Example of the contents of this list: "es,en-GB;q=0.8", ""
    net::URLRequestContext* request_context =
        url_context_->GetURLRequestContext();
    DCHECK(request_context);
    // TODO(pauljensen): GoogleOneShotRemoteEngine should be constructed with
    // a reference to the HttpUserAgentSettings rather than accessing the
    // accept language through the URLRequestContext.
    if (request_context->http_user_agent_settings()) {
      std::string accepted_language_list =
          request_context->http_user_agent_settings()->GetAcceptLanguage();
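      // E.g. "es,en-GB;q=0.8" yields "es" below; an empty list yields "".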
      size_t separator = accepted_language_list.find_first_of(",;");
      lang_param = accepted_language_list.substr(0, separator);
    }
  }

  if (lang_param.empty())
    lang_param = "en-US";

  std::vector<std::string> parts;
  parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));

  if (!config_.grammars.empty()) {
    DCHECK_EQ(config_.grammars.size(), 1U);
    parts.push_back(
        "lm=" + net::EscapeQueryParamValue(config_.grammars[0].url, true));
  }

  if (!config_.hardware_info.empty()) {
    parts.push_back(
        "xhw=" + net::EscapeQueryParamValue(config_.hardware_info, true));
  }
  parts.push_back("maxresults=" + base::UintToString(config_.max_hypotheses));
  parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");

  std::string api_key = google_apis::GetAPIKey();
  parts.push_back("key=" + net::EscapeQueryParamValue(api_key, true));

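  // Illustrative final URL (actual values depend on |config_|; key elided):
  //   .../speech-api/v1/recognize?xjerr=1&client=chromium&lang=en-US&
  //   maxresults=1&pfilter=0&key=...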
  GURL url(std::string(kDefaultSpeechRecognitionUrl) +
           base::JoinString(parts, "&"));

  encoder_.reset(new AudioEncoder(config_.audio_sample_rate,
                                  config_.audio_num_bits_per_sample));
  DCHECK(encoder_.get());
  url_fetcher_ = net::URLFetcher::Create(url_fetcher_id_for_tests, url,
                                         net::URLFetcher::POST, this);
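  // The request body is streamed as a chunked HTTP POST: TakeAudioChunk()
  // appends intermediate chunks and AudioChunksEnded() appends the final one.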
  url_fetcher_->SetChunkedUpload(encoder_->GetMimeType());
  url_fetcher_->SetRequestContext(url_context_.get());
  url_fetcher_->SetReferrer(config_.origin_url);

  // The speech recognition API does not require user identification as part
  // of requests, so we don't send cookies or auth data for these requests to
  // prevent any accidental connection between users who are logged into the
  // domain for other services (e.g. bookmark sync) with the speech requests.
  url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
                             net::LOAD_DO_NOT_SEND_COOKIES |
                             net::LOAD_DO_NOT_SEND_AUTH_DATA);
  url_fetcher_->Start();
}

void GoogleOneShotRemoteEngine::EndRecognition() {
  url_fetcher_.reset();
}

void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
  DCHECK(url_fetcher_.get());
  DCHECK(encoder_.get());
  DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
  encoder_->Encode(data);
  scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
  url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
}

void GoogleOneShotRemoteEngine::AudioChunksEnded() {
  DCHECK(url_fetcher_.get());
  DCHECK(encoder_.get());

  // The final upload chunk must be non-empty, so we encode a packet of
  // silence in case the encoder has no pending data.
  size_t sample_count =
      config_.audio_sample_rate * kAudioPacketIntervalMs / 1000;
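  // E.g. at a 16 kHz sample rate this is 16000 * 100 / 1000 = 1600 samples
  // of 16-bit silence, i.e. 3200 bytes.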
  scoped_refptr<AudioChunk> dummy_chunk(new AudioChunk(
      sample_count * sizeof(int16_t), encoder_->GetBitsPerSample() / 8));
  encoder_->Encode(*dummy_chunk.get());
  encoder_->Flush();
  scoped_refptr<AudioChunk> encoded_dummy_data(
      encoder_->GetEncodedDataAndClear());
  DCHECK(!encoded_dummy_data->IsEmpty());
  encoder_.reset();

  url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
}

void GoogleOneShotRemoteEngine::OnURLFetchComplete(
    const net::URLFetcher* source) {
  DCHECK_EQ(url_fetcher_.get(), source);
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  SpeechRecognitionError error(SPEECH_RECOGNITION_ERROR_NETWORK);
  std::string data;

  // The default error in case of a failed request or a parse error is
  // SPEECH_RECOGNITION_ERROR_NETWORK; ParseServerResponse() can replace it
  // with a more appropriate code.
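  // Note the short-circuit below: ParseServerResponse() runs only if the
  // request itself succeeded with HTTP 200.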
  bool error_occurred = (!source->GetStatus().is_success() ||
                         source->GetResponseCode() != 200 ||
                         !source->GetResponseAsString(&data) ||
                         !ParseServerResponse(data, &result, &error));
  url_fetcher_.reset();
  if (error_occurred) {
    DVLOG(1) << "GoogleOneShotRemoteEngine: Error " << error.code;
    delegate()->OnSpeechRecognitionEngineError(error);
  } else {
    DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
    delegate()->OnSpeechRecognitionEngineResults(results);
  }
}

bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
  return url_fetcher_ != NULL;
}

int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
  return kAudioPacketIntervalMs;
}

}  // namespace content