services/image_annotation/annotator.cc - chromium/src - Git at Google

 // Copyright 2019 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "services/image_annotation/annotator.h"

 #include <utility>

 #include "base/base64.h"
 #include "base/feature_list.h"
 #include "base/json/json_reader.h"
 #include "base/json/json_writer.h"
 #include "net/base/load_flags.h"
 #include "net/traffic_annotation/network_traffic_annotation.h"
 #include "url/gurl.h"

 namespace image_annotation {

 namespace {

 constexpr size_t kMaxResponseSize = 1024 * 1024;  // 1MB.

 // The minimum confidence value needed to return an OCR result.
 // TODO(crbug.com/916420): tune this value.
 constexpr double kMinOcrConfidence = 0.7;

 // Constructs and returns a JSON string representing an OCR request for the
 // given image bytes.
 std::string FormatJsonOcrRequest(const std::string& source_id,
                                  const std::vector<uint8_t>& image_bytes) {
   // Re-encode image bytes into base64, which can be represented in JSON.
   std::string base64_data;
   Base64Encode(
       base::StringPiece(reinterpret_cast<const char*>(image_bytes.data()),
                         image_bytes.size()),
       &base64_data);

   base::Value image_request(base::Value::Type::DICTIONARY);
   image_request.SetKey("image_id", base::Value(source_id));
   image_request.SetKey("image_bytes", base::Value(std::move(base64_data)));

   // TODO(crbug.com/916420): batch multiple images into one request.
   base::Value image_request_list(base::Value::Type::LIST);
   image_request_list.GetList().push_back(std::move(image_request));

   // TODO(crbug.com/916420): accept and propagate page language info to improve
   //                         OCR accuracy.
   base::Value feature_request(base::Value::Type::DICTIONARY);
   feature_request.SetKey("ocr_feature",
                          base::Value(base::Value::Type::DICTIONARY));

   base::Value request(base::Value::Type::DICTIONARY);
   request.SetKey("image_requests", base::Value(std::move(image_request_list)));
   request.SetKey("feature_request", std::move(feature_request));

   std::string json_request;
   base::JSONWriter::Write(request, &json_request);

   return json_request;
 }

 // Creates a URL loader that calls the image annotation server with an OCR
 // request for the given image bytes.
 std::unique_ptr<network::SimpleURLLoader> MakeOcrRequestLoader(
     const GURL& server_url,
     const std::string& source_id,
     const std::vector<uint8_t>& image_bytes) {
   auto resource_request = std::make_unique<network::ResourceRequest>();
   resource_request->method = "POST";

   // TODO(crbug.com/916420): accept and pass API key when the server is
   //                         configured to require it.
   resource_request->url = server_url;

   resource_request->load_flags = net::LOAD_DO_NOT_SAVE_COOKIES |
                                  net::LOAD_DO_NOT_SEND_COOKIES |
                                  net::LOAD_DO_NOT_SEND_AUTH_DATA;

   // TODO(crbug.com/916420): update this annotation to be more general and to
   //                         reflect specfics of the UI when it is implemented.
   const net::NetworkTrafficAnnotationTag traffic_annotation =
       net::DefineNetworkTrafficAnnotation("image_annotation", R"(
         semantics {
           sender: "Image Annotation"
           description:
             "Chrome can identify text inside images and provide this text to "
             "screen readers (for visually-impaired users) by sending images to "
             "Google's servers. If image text extraction is enabled for a page, "
             "Chrome will send the URLs and pixels of all images on the "
             "page to Google's servers, which will return any textual content "
             "identified inside the images. This content is made accessible to "
             "screen reading software."
           trigger: "A page containing images is loaded for a user who has "
                    "automatic image text extraction enabled."
           data: "Image pixels and URLs. No user identifier is sent along with "
                 "the data."
           destination: GOOGLE_OWNED_SERVICE
         }
         policy {
           cookies_allowed: NO
           setting:
             "You can enable or disable this feature via the context menu "
             "for images, or via 'Image Labeling' in Chrome's settings under "
             "Accessibility. This feature is disabled by default."
           policy_exception_justification: "Policy to come; feature not yet "
                                           "complete."
         })");

   auto url_loader = network::SimpleURLLoader::Create(
       std::move(resource_request), traffic_annotation);

   url_loader->AttachStringForUpload(
       FormatJsonOcrRequest(source_id, image_bytes), "application/json");

   return url_loader;
 }

 // Attempts to extract OCR results from the server response, returning true and
 // setting |out| to these results if successful.
 base::Optional<std::string> ParseJsonOcrResponse(
     const std::string* const json_response) {
   if (!json_response)
     return base::nullopt;

   const std::unique_ptr<base::Value> response =
       base::JSONReader::Read(*json_response);
   if (!response || !response->is_dict())
     return base::nullopt;

   const base::Value* const results = response->FindKey("results");
   if (!results || !results->is_list() || results->GetList().size() != 1 ||
       !results->GetList()[0].is_dict())
     return base::nullopt;

   const base::Value* const annotations =
       results->GetList()[0].FindKey("annotations");
   if (!annotations || !annotations->is_dict())
     return base::nullopt;

   const base::Value* const ocr_list = annotations->FindKey("ocr");
   if (!ocr_list || !ocr_list->is_list())
     return base::nullopt;

   // The server returns separate OCR results for each region of the image; we
   // naively concatenate these into one response string.
   std::string out;
   for (const base::Value& ocr : ocr_list->GetList()) {
     if (!ocr.is_dict())
       return base::nullopt;

     const base::Value* const detected_text = ocr.FindKey("detected_text");
     if (!detected_text || !detected_text->is_string())
       return base::nullopt;

     const base::Value* const confidence = ocr.FindKey("confidence_score");
     if (!confidence || !confidence->is_double())
       return base::nullopt;

     if (confidence->GetDouble() < kMinOcrConfidence)
       continue;

     const std::string& detected_text_str = detected_text->GetString();

     if (!out.empty() && !detected_text_str.empty())
       out += "\n";
     out += detected_text_str;
   }

   return out;
 }

 }  // namespace

 Annotator::Annotator(
     GURL server_url,
     scoped_refptr<network::SharedURLLoaderFactory> url_loader_factory)
     : url_loader_factory_(std::move(url_loader_factory)),
       server_url_(std::move(server_url)) {}

 Annotator::~Annotator() {}

 void Annotator::BindRequest(mojom::AnnotatorRequest request) {
   bindings_.AddBinding(this, std::move(request));
 }

 void Annotator::AnnotateImage(const std::string& source_id,
                               mojom::ImageProcessorPtr image_processor,
                               AnnotateImageCallback callback) {
   // Return cached results if they exist.
   const auto cache_lookup = cached_results_.find(source_id);
   if (cache_lookup != cached_results_.end()) {
     std::move(callback).Run(
         mojom::AnnotateImageResult::NewOcrText(cache_lookup->second));
     return;
   }

   // Register the ImageProcessor and callback to be used for this request.
   RequestInfoList& request_info_list = request_infos_[source_id];
   request_info_list.push_back(
       {std::move(image_processor), std::move(callback)});

   // If the image processor dies: automatically delete the request info and
   // reassign local processing (for other interested clients) if the dead image
   // processor was responsible for some ongoing work.
   request_info_list.back().first.set_connection_error_handler(base::BindOnce(
       &Annotator::RemoveRequestInfo, base::Unretained(this), source_id,
       --request_info_list.end(), mojom::AnnotateImageError::kCanceled));

   // Don't start local work if it would duplicate some ongoing or already-
   // completed work.
   if (base::ContainsKey(local_processors_, source_id) ||
       base::ContainsKey(url_loaders_, source_id))
     return;

   local_processors_.insert(
       std::make_pair(source_id, &request_info_list.back().first));

   // TODO(crbug.com/916420): first query the public result cache by URL to
   // improve latency.

   request_info_list.back().first->GetJpgImageData(
       base::BindOnce(&Annotator::OnJpgImageDataReceived, base::Unretained(this),
                      source_id, --request_info_list.end()));
 }

 void Annotator::OnJpgImageDataReceived(
     const std::string& source_id,
     const RequestInfoList::iterator request_info_it,
     const std::vector<uint8_t>& image_bytes) {
   // Failed to retrieve bytes from local processor; remove dead processor and
   // reschedule processing.
   if (image_bytes.empty()) {
     RemoveRequestInfo(source_id, request_info_it,
                       mojom::AnnotateImageError::kFailure);
     return;
   }

   // Local processing is no longer ongoing.
   local_processors_.erase(source_id);

   // Kick off server communication.
   // TODO(crbug.com/916420): add request to a queue here and batch them up
   //                         to limit number of concurrent HTTP requests made.
   std::unique_ptr<network::SimpleURLLoader> url_loader =
       MakeOcrRequestLoader(server_url_, source_id, image_bytes);
   const auto url_loader_it =
       url_loaders_.insert({source_id, std::move(url_loader)}).first;
   url_loader_it->second->DownloadToString(
       url_loader_factory_.get(),
       base::BindOnce(&Annotator::OnServerResponseReceived,
                      base::Unretained(this), source_id, url_loader_it),
       kMaxResponseSize);
 }

 void Annotator::OnServerResponseReceived(
     const std::string& source_id,
     const URLLoaderMap::iterator url_loader_it,
     const std::unique_ptr<std::string> json_response) {
   url_loaders_.erase(url_loader_it);

   // Extract OCR results into a string.
   const base::Optional<std::string> ocr_text =
       ParseJsonOcrResponse(json_response.get());

   if (ocr_text.has_value())
     cached_results_.insert({source_id, *ocr_text});

   const auto request_info_it = request_infos_.find(source_id);
   if (request_info_it == request_infos_.end())
     return;

   // Notify clients of success or failure.
   // TODO(crbug.com/916420): explore server retry strategies.
   for (auto& info : request_info_it->second) {
     auto result = ocr_text.has_value()
                       ? mojom::AnnotateImageResult::NewOcrText(*ocr_text)
                       : mojom::AnnotateImageResult::NewErrorCode(
                             mojom::AnnotateImageError::kFailure);
     std::move(info.second).Run(std::move(result));
   }
   request_infos_.erase(request_info_it);
 }

 void Annotator::RemoveRequestInfo(
     const std::string& source_id,
     const RequestInfoList::iterator request_info_it,
     const mojom::AnnotateImageError error) {
   // Check whether we are deleting the ImageProcessor responsible for current
   // local processing.
   auto lookup = local_processors_.find(source_id);
   const bool should_reassign = lookup != local_processors_.end() &&
                                lookup->second == &request_info_it->first;

   // Notify client of cancellation / failure.
   std::move(request_info_it->second)
       .Run(mojom::AnnotateImageResult::NewErrorCode(error));

   // Delete the specified ImageProcessor.
   RequestInfoList& request_info_list = request_infos_[source_id];
   request_info_list.erase(request_info_it);

   // If necessary, reassign local processing.
   if (!request_info_list.empty() && should_reassign) {
     lookup->second = &request_info_list.front().first;

     request_info_list.front().first->GetJpgImageData(base::BindOnce(
         &Annotator::OnJpgImageDataReceived, base::Unretained(this), source_id,
         request_info_list.begin()));
   }
 }

 }  // namespace image_annotation
	// Copyright 2019 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "services/image_annotation/annotator.h"

	#include <utility>

	#include "base/base64.h"
	#include "base/feature_list.h"
	#include "base/json/json_reader.h"
	#include "base/json/json_writer.h"
	#include "net/base/load_flags.h"
	#include "net/traffic_annotation/network_traffic_annotation.h"
	#include "url/gurl.h"

	namespace image_annotation {

	namespace {

	constexpr size_t kMaxResponseSize = 1024 * 1024; // 1MB.

	// The minimum confidence value needed to return an OCR result.
	// TODO(crbug.com/916420): tune this value.
	constexpr double kMinOcrConfidence = 0.7;

	// Constructs and returns a JSON string representing an OCR request for the
	// given image bytes.
	std::string FormatJsonOcrRequest(const std::string& source_id,
	const std::vector<uint8_t>& image_bytes) {
	// Re-encode image bytes into base64, which can be represented in JSON.
	std::string base64_data;
	Base64Encode(
	base::StringPiece(reinterpret_cast<const char*>(image_bytes.data()),
	image_bytes.size()),
	&base64_data);

	base::Value image_request(base::Value::Type::DICTIONARY);
	image_request.SetKey("image_id", base::Value(source_id));
	image_request.SetKey("image_bytes", base::Value(std::move(base64_data)));

	// TODO(crbug.com/916420): batch multiple images into one request.
	base::Value image_request_list(base::Value::Type::LIST);
	image_request_list.GetList().push_back(std::move(image_request));

	// TODO(crbug.com/916420): accept and propagate page language info to improve
	// OCR accuracy.
	base::Value feature_request(base::Value::Type::DICTIONARY);
	feature_request.SetKey("ocr_feature",
	base::Value(base::Value::Type::DICTIONARY));

	base::Value request(base::Value::Type::DICTIONARY);
	request.SetKey("image_requests", base::Value(std::move(image_request_list)));
	request.SetKey("feature_request", std::move(feature_request));

	std::string json_request;
	base::JSONWriter::Write(request, &json_request);

	return json_request;
	}

	// Creates a URL loader that calls the image annotation server with an OCR
	// request for the given image bytes.
	std::unique_ptr<network::SimpleURLLoader> MakeOcrRequestLoader(
	const GURL& server_url,
	const std::string& source_id,
	const std::vector<uint8_t>& image_bytes) {
	auto resource_request = std::make_unique<network::ResourceRequest>();
	resource_request->method = "POST";

	// TODO(crbug.com/916420): accept and pass API key when the server is
	// configured to require it.
	resource_request->url = server_url;

	resource_request->load_flags = net::LOAD_DO_NOT_SAVE_COOKIES \|
	net::LOAD_DO_NOT_SEND_COOKIES \|
	net::LOAD_DO_NOT_SEND_AUTH_DATA;

	// TODO(crbug.com/916420): update this annotation to be more general and to
	// reflect specfics of the UI when it is implemented.
	const net::NetworkTrafficAnnotationTag traffic_annotation =
	net::DefineNetworkTrafficAnnotation("image_annotation", R"(
	semantics {
	sender: "Image Annotation"
	description:
	"Chrome can identify text inside images and provide this text to "
	"screen readers (for visually-impaired users) by sending images to "
	"Google's servers. If image text extraction is enabled for a page, "
	"Chrome will send the URLs and pixels of all images on the "
	"page to Google's servers, which will return any textual content "
	"identified inside the images. This content is made accessible to "
	"screen reading software."
	trigger: "A page containing images is loaded for a user who has "
	"automatic image text extraction enabled."
	data: "Image pixels and URLs. No user identifier is sent along with "
	"the data."
	destination: GOOGLE_OWNED_SERVICE
	}
	policy {
	cookies_allowed: NO
	setting:
	"You can enable or disable this feature via the context menu "
	"for images, or via 'Image Labeling' in Chrome's settings under "
	"Accessibility. This feature is disabled by default."
	policy_exception_justification: "Policy to come; feature not yet "
	"complete."
	})");

	auto url_loader = network::SimpleURLLoader::Create(
	std::move(resource_request), traffic_annotation);

	url_loader->AttachStringForUpload(
	FormatJsonOcrRequest(source_id, image_bytes), "application/json");

	return url_loader;
	}

	// Attempts to extract OCR results from the server response, returning true and
	// setting \|out\| to these results if successful.
	base::Optional<std::string> ParseJsonOcrResponse(
	const std::string* const json_response) {
	if (!json_response)
	return base::nullopt;

	const std::unique_ptr<base::Value> response =
	base::JSONReader::Read(*json_response);
	if (!response \|\| !response->is_dict())
	return base::nullopt;

	const base::Value* const results = response->FindKey("results");
	if (!results \|\| !results->is_list() \|\| results->GetList().size() != 1 \|\|
	!results->GetList()[0].is_dict())
	return base::nullopt;

	const base::Value* const annotations =
	results->GetList()[0].FindKey("annotations");
	if (!annotations \|\| !annotations->is_dict())
	return base::nullopt;

	const base::Value* const ocr_list = annotations->FindKey("ocr");
	if (!ocr_list \|\| !ocr_list->is_list())
	return base::nullopt;

	// The server returns separate OCR results for each region of the image; we
	// naively concatenate these into one response string.
	std::string out;
	for (const base::Value& ocr : ocr_list->GetList()) {
	if (!ocr.is_dict())
	return base::nullopt;

	const base::Value* const detected_text = ocr.FindKey("detected_text");
	if (!detected_text \|\| !detected_text->is_string())
	return base::nullopt;

	const base::Value* const confidence = ocr.FindKey("confidence_score");
	if (!confidence \|\| !confidence->is_double())
	return base::nullopt;

	if (confidence->GetDouble() < kMinOcrConfidence)
	continue;

	const std::string& detected_text_str = detected_text->GetString();

	if (!out.empty() && !detected_text_str.empty())
	out += "\n";
	out += detected_text_str;
	}

	return out;
	}

	} // namespace

	Annotator::Annotator(
	GURL server_url,
	scoped_refptr<network::SharedURLLoaderFactory> url_loader_factory)
	: url_loader_factory_(std::move(url_loader_factory)),
	server_url_(std::move(server_url)) {}

	Annotator::~Annotator() {}

	void Annotator::BindRequest(mojom::AnnotatorRequest request) {
	bindings_.AddBinding(this, std::move(request));
	}

	void Annotator::AnnotateImage(const std::string& source_id,
	mojom::ImageProcessorPtr image_processor,
	AnnotateImageCallback callback) {
	// Return cached results if they exist.
	const auto cache_lookup = cached_results_.find(source_id);
	if (cache_lookup != cached_results_.end()) {
	std::move(callback).Run(
	mojom::AnnotateImageResult::NewOcrText(cache_lookup->second));
	return;
	}

	// Register the ImageProcessor and callback to be used for this request.
	RequestInfoList& request_info_list = request_infos_[source_id];
	request_info_list.push_back(
	{std::move(image_processor), std::move(callback)});

	// If the image processor dies: automatically delete the request info and
	// reassign local processing (for other interested clients) if the dead image
	// processor was responsible for some ongoing work.
	request_info_list.back().first.set_connection_error_handler(base::BindOnce(
	&Annotator::RemoveRequestInfo, base::Unretained(this), source_id,
	--request_info_list.end(), mojom::AnnotateImageError::kCanceled));

	// Don't start local work if it would duplicate some ongoing or already-
	// completed work.
	if (base::ContainsKey(local_processors_, source_id) \|\|
	base::ContainsKey(url_loaders_, source_id))
	return;

	local_processors_.insert(
	std::make_pair(source_id, &request_info_list.back().first));

	// TODO(crbug.com/916420): first query the public result cache by URL to
	// improve latency.

	request_info_list.back().first->GetJpgImageData(
	base::BindOnce(&Annotator::OnJpgImageDataReceived, base::Unretained(this),
	source_id, --request_info_list.end()));
	}

	void Annotator::OnJpgImageDataReceived(
	const std::string& source_id,
	const RequestInfoList::iterator request_info_it,
	const std::vector<uint8_t>& image_bytes) {
	// Failed to retrieve bytes from local processor; remove dead processor and
	// reschedule processing.
	if (image_bytes.empty()) {
	RemoveRequestInfo(source_id, request_info_it,
	mojom::AnnotateImageError::kFailure);
	return;
	}

	// Local processing is no longer ongoing.
	local_processors_.erase(source_id);

	// Kick off server communication.
	// TODO(crbug.com/916420): add request to a queue here and batch them up
	// to limit number of concurrent HTTP requests made.
	std::unique_ptr<network::SimpleURLLoader> url_loader =
	MakeOcrRequestLoader(server_url_, source_id, image_bytes);
	const auto url_loader_it =
	url_loaders_.insert({source_id, std::move(url_loader)}).first;
	url_loader_it->second->DownloadToString(
	url_loader_factory_.get(),
	base::BindOnce(&Annotator::OnServerResponseReceived,
	base::Unretained(this), source_id, url_loader_it),
	kMaxResponseSize);
	}

	void Annotator::OnServerResponseReceived(
	const std::string& source_id,
	const URLLoaderMap::iterator url_loader_it,
	const std::unique_ptr<std::string> json_response) {
	url_loaders_.erase(url_loader_it);

	// Extract OCR results into a string.
	const base::Optional<std::string> ocr_text =
	ParseJsonOcrResponse(json_response.get());

	if (ocr_text.has_value())
	cached_results_.insert({source_id, *ocr_text});

	const auto request_info_it = request_infos_.find(source_id);
	if (request_info_it == request_infos_.end())
	return;

	// Notify clients of success or failure.
	// TODO(crbug.com/916420): explore server retry strategies.
	for (auto& info : request_info_it->second) {
	auto result = ocr_text.has_value()
	? mojom::AnnotateImageResult::NewOcrText(*ocr_text)
	: mojom::AnnotateImageResult::NewErrorCode(
	mojom::AnnotateImageError::kFailure);
	std::move(info.second).Run(std::move(result));
	}
	request_infos_.erase(request_info_it);
	}

	void Annotator::RemoveRequestInfo(
	const std::string& source_id,
	const RequestInfoList::iterator request_info_it,
	const mojom::AnnotateImageError error) {
	// Check whether we are deleting the ImageProcessor responsible for current
	// local processing.
	auto lookup = local_processors_.find(source_id);
	const bool should_reassign = lookup != local_processors_.end() &&
	lookup->second == &request_info_it->first;

	// Notify client of cancellation / failure.
	std::move(request_info_it->second)
	.Run(mojom::AnnotateImageResult::NewErrorCode(error));

	// Delete the specified ImageProcessor.
	RequestInfoList& request_info_list = request_infos_[source_id];
	request_info_list.erase(request_info_it);

	// If necessary, reassign local processing.
	if (!request_info_list.empty() && should_reassign) {
	lookup->second = &request_info_list.front().first;

	request_info_list.front().first->GetJpgImageData(base::BindOnce(
	&Annotator::OnJpgImageDataReceived, base::Unretained(this), source_id,
	request_info_list.begin()));
	}
	}

	} // namespace image_annotation