webnn: Implement tracing for the TFLite backend
Copies the design of the trace events recently added to the Core ML
backend over to the TFLite backend.

To reduce overhead, webnn::ScopedTrace no longer requires a memory
allocation to create a substep.
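For reference, the usage pattern this enables looks roughly like the
following sketch (simplified; the base::BindOnce plumbing used in the
diff is reduced to a hypothetical RunLater() helper):

  ScopedTrace scoped_trace("GraphImplTflite::DispatchImpl");
  // Begins a nested "Acquire resources" event. AddStep() now just
  // records the string literal and emits the trace event; it no
  // longer heap-allocates a nested ScopedTrace.
  scoped_trace.AddStep("Acquire resources");
  RunLater(base::BindOnce(
      [](ScopedTrace scoped_trace) {
        // Ends the previous step's event and begins the next one,
        // both under the same local trace id as the outer event.
        scoped_trace.AddStep("Run inference");
      },
      // Moving transfers ownership of the trace; the moved-from
      // object's destructor ends nothing.
      std::move(scoped_trace)));

Since AddStep() stores the const char* directly, step names must be
string literals (or otherwise outlive the ScopedTrace object).
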
Change-Id: I2d62f0fcc111e9914abfba595c88ffc79d19ee59
Bug: 41486052
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/6191420
Auto-Submit: Reilly Grant <reillyg@chromium.org>
Commit-Queue: Reilly Grant <reillyg@chromium.org>
Commit-Queue: Austin Sullivan <asully@chromium.org>
Reviewed-by: Austin Sullivan <asully@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1410067}
diff --git a/services/webnn/public/cpp/webnn_trace.cc b/services/webnn/public/cpp/webnn_trace.cc
index 73ca78c..61a3d2f7 100644
--- a/services/webnn/public/cpp/webnn_trace.cc
+++ b/services/webnn/public/cpp/webnn_trace.cc
@@ -5,7 +5,6 @@
#include "services/webnn/public/cpp/webnn_trace.h"
#include "base/logging.h"
-#include "base/memory/ptr_util.h"
#include "base/trace_event/trace_event.h"
#include "base/trace_event/trace_id_helper.h"
@@ -13,15 +12,19 @@
constexpr char kWebNNTraceCategory[] = "webnn";
-// Reset the |id_| so the moved `ScopedTrace` object won't end the trace
-// prematurely on destruction.
+// Reset the `id_` and `step_name_` fields so the moved `ScopedTrace` object
+// won't end the trace prematurely on destruction.
ScopedTrace::ScopedTrace(ScopedTrace&& other)
: name_(other.name_),
id_(std::exchange(other.id_, std::nullopt)),
- step_(std::move(other.step_)) {}
+ step_name_(std::exchange(other.step_name_, std::nullopt)) {}
ScopedTrace::~ScopedTrace() {
if (id_.has_value()) {
+ if (step_name_.has_value()) {
+ TRACE_EVENT_NESTABLE_ASYNC_END0(kWebNNTraceCategory, *step_name_,
+ TRACE_ID_LOCAL(id_.value()));
+ }
TRACE_EVENT_NESTABLE_ASYNC_END0(kWebNNTraceCategory, name_,
TRACE_ID_LOCAL(id_.value()));
}
@@ -31,7 +34,7 @@
if (this != &other) {
name_ = other.name_;
id_ = std::exchange(other.id_, std::nullopt);
- step_ = std::move(other.step_);
+ step_name_ = std::exchange(other.step_name_, std::nullopt);
}
return *this;
}
@@ -39,8 +42,13 @@
void ScopedTrace::AddStep(const char* step_name) {
// Calling AddStep() after move is not allowed.
CHECK(id_.has_value());
- step_.reset();
- step_ = base::WrapUnique(new ScopedTrace(step_name, id_.value()));
+ if (step_name_.has_value()) {
+ TRACE_EVENT_NESTABLE_ASYNC_END0(kWebNNTraceCategory, *step_name_,
+ TRACE_ID_LOCAL(id_.value()));
+ }
+ step_name_ = step_name;
+ TRACE_EVENT_NESTABLE_ASYNC_BEGIN0(kWebNNTraceCategory, *step_name_,
+ TRACE_ID_LOCAL(id_.value()));
}
ScopedTrace::ScopedTrace(const char* name)
diff --git a/services/webnn/public/cpp/webnn_trace.h b/services/webnn/public/cpp/webnn_trace.h
index 5686d60..2005e989 100644
--- a/services/webnn/public/cpp/webnn_trace.h
+++ b/services/webnn/public/cpp/webnn_trace.h
@@ -47,7 +47,13 @@
// 'ScopedTrace' object, and stops 'this''s destruction from ending the
// trace.
std::optional<uint64_t> id_;
- std::unique_ptr<ScopedTrace> step_;
+
+  // The name of the current step, if any.
+  //
+  // `std::nullopt` means that either the trace has been transferred to
+  // another `ScopedTrace` object or there is no active sub-trace; in
+  // either case, `this`'s destruction must not end a sub-trace event.
+ std::optional<const char*> step_name_;
};
} // namespace webnn
diff --git a/services/webnn/tflite/graph_impl_tflite.cc b/services/webnn/tflite/graph_impl_tflite.cc
index 30df4ba..5b961922 100644
--- a/services/webnn/tflite/graph_impl_tflite.cc
+++ b/services/webnn/tflite/graph_impl_tflite.cc
@@ -19,6 +19,7 @@
#include "mojo/public/cpp/bindings/self_owned_associated_receiver.h"
#include "services/webnn/buildflags.h"
#include "services/webnn/error.h"
+#include "services/webnn/public/cpp/webnn_trace.h"
#include "services/webnn/public/mojom/webnn_context_provider.mojom.h"
#include "services/webnn/public/mojom/webnn_error.mojom.h"
#include "services/webnn/public/mojom/webnn_graph.mojom.h"
@@ -181,12 +182,13 @@
#endif
}
- void DoDispatch(base::flat_map<int, raw_ref<const BufferContent>> tensors) {
- TfLiteStatus status;
- bool needs_reallocate_tensors = false;
+ void DoDispatch(base::flat_map<int, raw_ref<const BufferContent>> tensors,
+ ScopedTrace scoped_trace) {
+    scoped_trace.AddStep("Set up interpreter");
// TODO: Detect when `tensors` hasn't changed since the last invocation and
// this step can be skipped.
+ bool needs_reallocate_tensors = false;
for (auto& [tensor_idx, buffer] : tensors) {
TfLiteTensor* tensor = interpreter_->tensor(tensor_idx);
if (tensor->allocation_type == kTfLitePersistentRo) {
@@ -196,7 +198,7 @@
}
base::span<uint8_t> data = buffer->AsSpan();
- status = interpreter_->SetCustomAllocationForTensor(
+ TfLiteStatus status = interpreter_->SetCustomAllocationForTensor(
tensor_idx, {data.data(), data.size()});
if (status != kTfLiteOk) {
LOG(ERROR) << "Unable set custom tensor allocation: "
@@ -207,7 +209,7 @@
}
if (needs_reallocate_tensors) {
- status = interpreter_->AllocateTensors();
+ TfLiteStatus status = interpreter_->AllocateTensors();
if (status != kTfLiteOk) {
LOG(ERROR) << "Unable to allocate tensors: "
<< TfLiteStatusToString(status);
@@ -215,10 +217,11 @@
}
}
+ scoped_trace.AddStep("Run inference");
#if BUILDFLAG(WEBNN_ENABLE_TFLITE_PROFILER)
profiler_.StartProfiling();
#endif
- status = interpreter_->Invoke();
+ TfLiteStatus status = interpreter_->Invoke();
#if BUILDFLAG(WEBNN_ENABLE_TFLITE_PROFILER)
profiler_.StopProfiling();
#endif
@@ -229,6 +232,7 @@
}
// Copy the outputs that weren't configured as custom allocations.
+ scoped_trace.AddStep("Process outputs");
for (int tensor_idx : interpreter_->outputs()) {
TfLiteTensor* tensor = interpreter_->tensor(tensor_idx);
if (tensor->allocation_type == kTfLitePersistentRo) {
@@ -323,6 +327,8 @@
void GraphImplTflite::DispatchImpl(
const base::flat_map<std::string_view, WebNNTensorImpl*>& named_inputs,
const base::flat_map<std::string_view, WebNNTensorImpl*>& named_outputs) {
+ ScopedTrace scoped_trace("GraphImplTflite::DispatchImpl");
+
std::vector<
std::pair<int, scoped_refptr<QueueableResourceState<BufferContent>>>>
input_buffer_states, output_buffer_states;
@@ -359,6 +365,7 @@
exclusive_resources.push_back(buffer_state);
}
+ scoped_trace.AddStep("Acquire resources");
auto task = base::MakeRefCounted<ResourceTask>(
std::move(shared_resources), std::move(exclusive_resources),
base::BindOnce(
@@ -370,7 +377,7 @@
base::flat_map<
int, scoped_refptr<QueueableResourceState<BufferContent>>>
output_buffer_states,
- base::OnceClosure completion_closure) {
+ ScopedTrace scoped_trace, base::OnceClosure completion_closure) {
ComputeResources* raw_compute_resources =
compute_resources_state->GetExclusivelyLockedResource();
@@ -388,12 +395,12 @@
// a `QueueableResourceState` corresponding to
// `raw_compute_resources` is held by the
// `ResourceTask` until `completion_closure` is run below.
- base::Unretained(raw_compute_resources),
- std::move(buffers)),
+ base::Unretained(raw_compute_resources), std::move(buffers),
+ std::move(scoped_trace)),
std::move(completion_closure));
},
compute_resources_state_, std::move(input_buffer_states),
- std::move(output_buffer_states)));
+ std::move(output_buffer_states), std::move(scoped_trace)));
task->Enqueue();
}
diff --git a/services/webnn/tflite/tensor_impl_tflite.cc b/services/webnn/tflite/tensor_impl_tflite.cc
index 95454fd..c470099 100644
--- a/services/webnn/tflite/tensor_impl_tflite.cc
+++ b/services/webnn/tflite/tensor_impl_tflite.cc
@@ -9,6 +9,7 @@
#include "base/compiler_specific.h"
#include "base/memory/ptr_util.h"
#include "base/memory/scoped_refptr.h"
+#include "services/webnn/public/cpp/webnn_trace.h"
#include "services/webnn/public/mojom/webnn_tensor.mojom.h"
#include "services/webnn/queueable_resource_state.h"
#include "services/webnn/queueable_resource_state_base.h"
@@ -59,10 +60,14 @@
void TensorImplTflite::ReadTensorImpl(ReadTensorCallback callback) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
+
+ ScopedTrace scoped_trace("TensorImplTflite::ReadTensorImpl");
+
// Lock the buffer contents as shared/read-only.
std::vector<scoped_refptr<QueueableResourceStateBase>> shared_resources = {
buffer_state_};
+ scoped_trace.AddStep("Wait for tensor");
auto task = base::MakeRefCounted<ResourceTask>(
std::move(shared_resources),
/*exclusive_resources=*/
@@ -70,25 +75,32 @@
base::BindOnce(
[](scoped_refptr<QueueableResourceState<BufferContent>>
content_handle,
- ReadTensorCallback callback,
+ ReadTensorCallback callback, ScopedTrace scoped_trace,
base::OnceClosure completion_closure) {
+ scoped_trace.AddStep("Begin read");
// Memory copies are fast, avoid the overhead of posting a task
// to the thread pool and do the work synchronously.
std::move(callback).Run(
mojom::ReadTensorResult::NewBuffer(mojo_base::BigBuffer(
content_handle->GetSharedLockedResource().AsSpan())));
+
+ scoped_trace.AddStep("End read");
std::move(completion_closure).Run();
},
- buffer_state_, std::move(callback)));
+ buffer_state_, std::move(callback), std::move(scoped_trace)));
task->Enqueue();
}
void TensorImplTflite::WriteTensorImpl(mojo_base::BigBuffer src_buffer) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
+
+ ScopedTrace scoped_trace("TensorImplTflite::WriteTensorImpl");
+
// Take an exclusive lock to the buffer contents while reading.
std::vector<scoped_refptr<QueueableResourceStateBase>> exclusive_resources = {
buffer_state_};
+ scoped_trace.AddStep("Wait for tensor");
auto task = base::MakeRefCounted<ResourceTask>(
/*shared_resources=*/std::vector<
scoped_refptr<QueueableResourceStateBase>>(),
@@ -96,16 +108,19 @@
base::BindOnce(
[](scoped_refptr<QueueableResourceState<BufferContent>>
content_handle,
- mojo_base::BigBuffer src_buffer,
+ mojo_base::BigBuffer src_buffer, ScopedTrace scoped_trace,
base::OnceClosure completion_closure) {
+ scoped_trace.AddStep("Begin write");
// Memory copies are fast, avoid the overhead of posting a task to
// the thread pool and do the work synchronously.
content_handle->GetExclusivelyLockedResource()
->AsSpan()
.copy_prefix_from(src_buffer);
+
+ scoped_trace.AddStep("End write");
std::move(completion_closure).Run();
},
- buffer_state_, std::move(src_buffer)));
+ buffer_state_, std::move(src_buffer), std::move(scoped_trace)));
task->Enqueue();
}