| // Copyright 2024 The Chromium Authors |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #import "ios/chrome/browser/intelligence/proto_wrappers/page_context_wrapper.h" |
| |
| #import <Foundation/Foundation.h> |
| |
| #import <memory> |
| #import <optional> |
| #import <string> |
| #import <utility> |
| |
| #import "base/barrier_closure.h" |
| #import "base/check.h" |
| #import "base/check_op.h" |
| #import "base/logging.h" |
| #import "base/memory/weak_ptr.h" |
| #import "base/strings/string_util.h" |
| #import "base/strings/sys_string_conversions.h" |
| #import "base/strings/utf_string_conversions.h" |
| #import "base/time/time.h" |
| #import "base/timer/timer.h" |
| #import "base/token.h" |
| #import "components/optimization_guide/core/page_content_proto_serializer.h" |
| #import "components/optimization_guide/proto/features/common_quality_data.pb.h" |
| #import "ios/chrome/browser/intelligence/features/features.h" |
| #import "ios/chrome/browser/intelligence/proto_wrappers/page_context_wrapper_metrics.h" |
| #import "ios/chrome/browser/snapshots/model/snapshot_tab_helper.h" |
| #import "ios/public/provider/chrome/browser/bwg/bwg_api.h" |
| #import "ios/web/find_in_page/find_in_page_java_script_feature.h" |
| #import "ios/web/public/js_messaging/web_frame.h" |
| #import "ios/web/public/js_messaging/web_frames_manager.h" |
| #import "ios/web/public/web_state.h" |
| #import "url/origin.h" |
| |
| namespace { |
| |
// The default Page Context execution timeout, used by
// `-populatePageContextFieldsAsync`. Declared `constexpr` so the value cannot
// be mutated at runtime and needs no dynamic initialization.
constexpr base::TimeDelta kDefaultPageContextTimeout = base::Seconds(1);
| |
// Keys of the object returned by the innerText tree JavaScript. Each comment
// states the type of the value stored under the key.

// Bool: whether the PageContext should be detached.
constexpr char kShouldDetachPageContext[] = "shouldDetachPageContext";

// String: the innerText of the current node.
constexpr char kCurrentNodeInnerTextDictKey[] = "currentNodeInnerText";

// Array of objects: the children frames of the current frame.
constexpr char kChildrenFramesDictKey[] = "children";

// String: the source URL of the frame.
constexpr char kSourceURLDictKey[] = "sourceURL";

// String: the title of the frame.
constexpr char kFrameTitleDictKey[] = "title";

// Array of objects: the links found in the frame.
constexpr char kFrameLinksDictKey[] = "links";

// String: a link's HREF/URL.
constexpr char kLinkHREFDictKey[] = "href";

// String: a link's text.
constexpr char kLinkTextDictKey[] = "linkText";
| |
| // Script template run in each of the WebState's WebFrames. It builds a tree of |
| // innerTexts: the document body's text plus, recursively, the text of nested |
| // same-origin iframes. Nodes are tagged with a per-run nonce so that a frame's |
| // text is not collected twice within one run. It returns early when the |
| // PageContext should be detached, or when the frame is not the top-most |
| // same-origin frame. Placeholders: $1 is the detachment check body, $2 the |
| // anchor tag retrieval logic, and $3 the nonce string. |
| // TODO(crbug.com/423681226): Write this in TypeScript and create a JS Feature |
| // for it. |
| constexpr const char16_t* kInnerTextTreeJavaScript = uR"DELIM( |
| (() => { |
| // Checks whether the PageContext should be detached. |
| const shouldDetachPageContext = () => { |
| // PageContext detachment logic injected below. |
| $1 |
| }; |
| |
| // If the PageContext should be detached, early return. |
| if (shouldDetachPageContext()) { |
| return { shouldDetachPageContext: true }; |
| } |
| |
| // The script should only run if it has no same-origin parent. (The script |
| // should only start execution on top-most nodes of a given origin). |
| if (window.self !== window.top && |
| location.ancestorOrigins?.[0] === location.origin) { |
| // Not the top-most same-origin frame, early exit. |
| return null; |
| } |
| |
| // Recursively constructs the innerText tree for the passed node and its |
| // children same-origin iframes. |
| const constructSameOriginInnerTextTree = (node, frameURL, frameTitle, nonceAttributeValue) => { |
| // Early return if the node is null or already processed. |
| if (!node || node.getAttribute('data-__gCrWeb-innerText-processed') === nonceAttributeValue) { |
| return null; |
| } |
| |
| // Mark node as processed. |
| node.setAttribute('data-__gCrWeb-innerText-processed', nonceAttributeValue); |
| |
| // Get all nested iframes within the current node. |
| const nestedIframes = node.getElementsByTagName('iframe'); |
| const childNodeInnerTexts = [...nestedIframes].map(iframe => { |
| if (!iframe) { |
| return null; |
| } |
| |
| // Try to access the iframe's body, failure is possible (cross-origin iframes). |
| let iframeBody; |
| try { |
| iframeBody = iframe.contentDocument ? iframe.contentDocument.body : null; |
| } catch (error) { |
| return null; |
| } |
| |
| // Recursively construct the innerText tree for the iframe's body. |
| return iframeBody ? constructSameOriginInnerTextTree(iframeBody, iframe.src, iframe.title, |
| nonceAttributeValue) : null; |
| }); |
| |
| const result = { |
| currentNodeInnerText: node.innerText, |
| children: childNodeInnerTexts.filter(item => item !== null), |
| sourceURL: frameURL, |
| title: frameTitle, |
| }; |
| |
| // Anchor tag retrieval logic injected below. |
| $2 |
| |
| return result; |
| }; |
| |
| return constructSameOriginInnerTextTree(document.body, window.location.href, document.title, "$3"); |
| })(); |
| )DELIM"; |
| |
| // Script fragment injected into the main script, where it relies on the |
| // enclosing `node` and `result` variables. Collects every `a[href]` element |
| // under `node` into an array of {href, linkText} objects, where linkText is |
| // textContent (includes text not visually rendered), stored as `result.links`. |
| constexpr const char16_t* kAnchorTagsJavaScript = uR"DELIM( |
| // Add all the frame's anchor tags to a links array with their HREF/URL and |
| // textContent. |
| const linksArray = []; |
| const anchorElements = node.querySelectorAll('a[href]'); |
| anchorElements.forEach((anchor) => { |
| linksArray.push({ |
| href: anchor.href, |
| linkText: anchor.textContent |
| }); |
| }); |
| |
| result.links = linksArray; |
| )DELIM"; |
| |
| } // namespace |
| |
| // TODO(crbug.com/424258248): Add a timeout for the execution of the async tasks |
| // in the PageContextWrapper. |
| @implementation PageContextWrapper { |
| base::WeakPtr<web::WebState> _webState; |
| |
| // The amount of async tasks this specific instance of the PageContext wrapper |
| // needs to complete before executing the `completionCallback`. |
| NSInteger _asyncTasksToComplete; |
| |
| // The timer which keeps track of the overall execution timeout. |
| base::OneShotTimer _timeoutTimer; |
| |
| // The root node of the PageContext's AnnotatedPageContent (APC) tree. This |
| // tree is constructed on the fly as values are returned from JavaScript. |
| std::unique_ptr<optimization_guide::proto::AnnotatedPageContent> _rootAPCNode; |
| |
| // Whether the PageContext should be detached. Likely a protected page. |
| BOOL _forceDetachPageContext; |
| |
| // The callback to execute once all async work is complete, which |
| // relinquishes ownership of the PageContext proto to the callback's handler. |
| base::OnceCallback<void(PageContextWrapperCallbackResponse)> |
| _completionCallback; |
| |
| // Unique pointer to the PageContext proto. |
| std::unique_ptr<optimization_guide::proto::PageContext> _pageContext; |
| |
| // The current PageContext instance's metrics logger. Only created when async |
| // tasks execution is started. |
| PageContextWrapperMetrics* _pageContextMetrics; |
| } |
| |
// Initializer. Keeps a weak reference to `webState`, takes ownership of
// `completionCallback`, and seeds a new PageContext proto with the WebState's
// visible URL and title. No async task is enabled at this point.
- (instancetype)initWithWebState:(web::WebState*)webState
              completionCallback:
                  (base::OnceCallback<void(PageContextWrapperCallbackResponse)>)
                      completionCallback {
  if ((self = [super init])) {
    _webState = webState->GetWeakPtr();
    _completionCallback = std::move(completionCallback);
    _asyncTasksToComplete = 0;

    // Build the proto locally, then hand it over to the ivar.
    auto pageContext =
        std::make_unique<optimization_guide::proto::PageContext>();
    pageContext->set_url(webState->GetVisibleURL().spec());
    pageContext->set_title(base::UTF16ToUTF8(webState->GetTitle()));
    _pageContext = std::move(pageContext);
  }
  return self;
}
| |
// Tears down non-memory state: the timeout timer must not fire after this
// object is gone, and any find-in-page highlight started for a snapshot is
// removed from the page.
- (void)dealloc {
  // Cancel a still-pending timeout first.
  _timeoutTimer.Stop();
  // Then clear text highlighting (a no-op when the WebState is gone).
  [self stopTextHighlighting];
}
| |
// Convenience entry point: populates the PageContext using the default
// execution timeout (`kDefaultPageContextTimeout`).
- (void)populatePageContextFieldsAsync {
  const base::TimeDelta defaultTimeout = kDefaultPageContextTimeout;
  [self populatePageContextFieldsAsyncWithTimeout:defaultTimeout];
}
| |
// Starts every enabled async task (snapshot, AnnotatedPageContent, full page
// PDF) and arms a one-shot timer which reports a timeout if the tasks have not
// all completed within `timeout`. When no task is enabled, or when the WebState
// has already been destroyed, the completion path runs immediately instead of
// dereferencing an invalidated WebState.
- (void)populatePageContextFieldsAsyncWithTimeout:(base::TimeDelta)timeout {
  CHECK_GE(_asyncTasksToComplete, 0);
  _pageContextMetrics = [[PageContextWrapperMetrics alloc] init];
  __weak PageContextWrapper* weakSelf = self;

  // Start the timer.
  _timeoutTimer.Start(FROM_HERE, timeout, base::BindOnce(^{
                        [weakSelf onTimeout];
                      }));

  // Nothing to wait for, or nothing left to query: finish right away.
  // `asyncWorkCompletedForPageContext` stops the timer and reports an error for
  // any enabled field that is missing from the PageContext.
  if (_asyncTasksToComplete == 0 || !_webState) {
    [self asyncWorkCompletedForPageContext];
    return;
  }

  // Use a `BarrierClosure` to ensure all async tasks are completed before
  // executing the overall completion callback. The BarrierClosure will wait
  // until the `pageContextBarrier` callback is itself run
  // `_asyncTasksToComplete` times.
  base::RepeatingClosure pageContextBarrier =
      base::BarrierClosure(_asyncTasksToComplete, base::BindOnce(^{
                             [weakSelf asyncWorkCompletedForPageContext];
                           }));

  // Asynchronous work. *IMPORTANT NOTES*:
  // When adding async tasks below, an accompanying setter should also be
  // created to follow the disabled-by-default pattern (which
  // increments/decrements `_asyncTasksToComplete` accordingly). Also, if a
  // given task is enabled, every code path for that task should eventually
  // execute the `pageContextBarrier` callback, otherwise the `BarrierClosure`
  // will never execute its completion block.

  if (_shouldGetSnapshot) {
    [self processSnapshotWithBarrier:pageContextBarrier];
  }

  if (_shouldGetAnnotatedPageContent) {
    [self processAnnotatedPageContentWithBarrier:pageContextBarrier];
  }

  // Create full page PDF representation of the WebState, if enabled.
  if (_shouldGetFullPagePDF) {
    [_pageContextMetrics executionStartedForTask:PageContextTask::kPDF];

    _webState->CreateFullPagePdf(base::BindOnce(^(NSData* PDFData) {
      [weakSelf encodeAndSetFullPagePDF:PDFData];
      pageContextBarrier.Run();
    }));
  }
}
| |
| #pragma mark - Setters |
| |
// Enables or disables the snapshot task. On an actual change of value, the
// count of async tasks gating the completion callback is adjusted by one in
// the matching direction; setting the same value again is a no-op.
- (void)setShouldGetSnapshot:(BOOL)shouldGetSnapshot {
  if (shouldGetSnapshot != _shouldGetSnapshot) {
    if (shouldGetSnapshot) {
      ++_asyncTasksToComplete;
    } else {
      --_asyncTasksToComplete;
    }
    _shouldGetSnapshot = shouldGetSnapshot;
  }
}
| |
// Enables or disables the full page PDF task. On an actual change of value,
// the count of async tasks gating the completion callback is adjusted by one
// in the matching direction; setting the same value again is a no-op.
- (void)setShouldGetFullPagePDF:(BOOL)shouldGetFullPagePDF {
  if (shouldGetFullPagePDF != _shouldGetFullPagePDF) {
    if (shouldGetFullPagePDF) {
      ++_asyncTasksToComplete;
    } else {
      --_asyncTasksToComplete;
    }
    _shouldGetFullPagePDF = shouldGetFullPagePDF;
  }
}
| |
// Enables or disables the AnnotatedPageContent task. On an actual change of
// value, the count of async tasks gating the completion callback is adjusted
// by one in the matching direction; setting the same value again is a no-op.
- (void)setShouldGetAnnotatedPageContent:(BOOL)shouldGetAnnotatedPageContent {
  if (shouldGetAnnotatedPageContent != _shouldGetAnnotatedPageContent) {
    if (shouldGetAnnotatedPageContent) {
      ++_asyncTasksToComplete;
    } else {
      --_asyncTasksToComplete;
    }
    _shouldGetAnnotatedPageContent = shouldGetAnnotatedPageContent;
  }
}
| |
| #pragma mark - Private |
| |
// Retrieves the WebState's snapshot and stores it on the PageContext. The
// barrier's callback is executed on every code path for which this object is
// still alive, including when the WebState has already been destroyed (the
// screenshot task is then reported as failed).
//
// If the WebState is visible, the snapshot is refreshed (optionally after
// highlighting `_textToHighlight` via find-in-page); otherwise the latest
// stored color snapshot is retrieved.
- (void)processSnapshotWithBarrier:(base::RepeatingClosure)barrier {
  [_pageContextMetrics executionStartedForTask:PageContextTask::kScreenshot];

  // The WebState may have been destroyed since this wrapper was created.
  // Record the failure and release the barrier instead of dereferencing an
  // invalidated weak pointer.
  if (!_webState) {
    [self encodeImageAndSetTabScreenshot:nil];
    barrier.Run();
    return;
  }

  __weak PageContextWrapper* weakSelf = self;
  auto callback = ^(UIImage* image) {
    __strong __typeof(weakSelf) strongSelf = weakSelf;
    if (!strongSelf) {
      return;
    }

    // A missing image may be retried once by forcing a snapshot update.
    if ([strongSelf shouldUpdateSnapshotWithImage:image]) {
      [strongSelf updateSnapshotWithBarrier:barrier];
      return;
    }

    [strongSelf encodeImageAndSetTabScreenshot:image];
    barrier.Run();
  };

  // If the WebState is currently visible, update the snapshot in case the
  // user was scrolling, otherwise retrieve the latest version in cache or on
  // disk.
  if (!_webState->IsVisible()) {
    SnapshotTabHelper::FromWebState(_webState.get())
        ->RetrieveColorSnapshot(callback);
    return;
  }

  auto updateSnapshotCallback =
      base::BindOnce(^(std::optional<int> result_matches) {
        // TODO(crbug.com/401282824): Log the matches count to measure text
        // highlighting precision.
        [weakSelf updateSnapshotWithCallback:callback];
      });

  // If there is text to highlight, do it before capturing the screenshot.
  // Highlighting needs the main frame, which may not be available; in that
  // case the snapshot is captured without highlighting rather than handing a
  // null frame to find-in-page.
  web::WebFrame* mainFrame =
      _textToHighlight != nil
          ? _webState->GetPageWorldWebFramesManager()->GetMainWebFrame()
          : nullptr;
  if (mainFrame) {
    web::FindInPageJavaScriptFeature* findInPageFeature =
        web::FindInPageJavaScriptFeature::GetInstance();
    findInPageFeature->Search(mainFrame,
                              base::SysNSStringToUTF8(_textToHighlight),
                              std::move(updateSnapshotCallback));
  } else {
    std::move(updateSnapshotCallback).Run(std::nullopt);
  }
}
| |
// Builds the WebState's AnnotatedPageContent (APC) filled with innerTexts by
// executing the innerText tree script in every WebFrame. The barrier's
// callback is executed on every code path of this method.
//
// The task fails immediately when the WebState is gone, when there are no
// WebFrames, or when there is no main frame. If any frame's URL is protected,
// `_forceDetachPageContext` is set and no further script is executed.
- (void)processAnnotatedPageContentWithBarrier:(base::RepeatingClosure)barrier {
  [_pageContextMetrics
      executionStartedForTask:PageContextTask::kAnnotatedPageContent];

  // The WebState may have been destroyed since this wrapper was created; in
  // that case there are no frames to query.
  std::set<web::WebFrame*> webFrames;
  web::WebFrame* mainFrame = nullptr;
  if (_webState) {
    web::WebFramesManager* framesManager =
        _webState->GetPageWorldWebFramesManager();
    webFrames = framesManager->GetAllWebFrames();
    mainFrame = framesManager->GetMainWebFrame();
  }

  if (webFrames.empty() || !mainFrame) {
    [_pageContextMetrics
        executionFinishedForTask:PageContextTask::kAnnotatedPageContent
            withCompletionStatus:PageContextCompletionStatus::kFailure];
    barrier.Run();
    return;
  }

  // Create the root node of the APC tree and its first root ContentNode.
  _rootAPCNode =
      std::make_unique<optimization_guide::proto::AnnotatedPageContent>();
  _rootAPCNode->set_version(
      optimization_guide::proto::AnnotatedPageContentVersion::
          ANNOTATED_PAGE_CONTENT_VERSION_1_0);
  _rootAPCNode->mutable_root_node()
      ->mutable_content_attributes()
      ->set_attribute_type(optimization_guide::proto::CONTENT_ATTRIBUTE_ROOT);

  // Use a `BarrierClosure` to ensure the JavaScript is done executing in
  // all WebFrames before executing the page context barrier `barrier`,
  // which in turn signals to the PageContextWrapper that the APC is done being
  // processed. The BarrierClosure will wait until the
  // `annotatedPageContentBarrier` callback is itself run once per WebFrame (+1
  // since we execute the JS explicitly on the main frame first).
  __weak PageContextWrapper* weakSelf = self;
  base::RepeatingClosure annotatedPageContentBarrier = base::BarrierClosure(
      webFrames.size() + 1, base::BindOnce(^{
        [weakSelf webFramesAnnotatedPageContentFetchCompleted];
        barrier.Run();
      }));

  // Callback to aggregate values from the JS execution.
  auto callback = [](PageContextWrapper* weakWrapper,
                     base::RepeatingClosure barrier, BOOL isMainFrame,
                     const url::Origin& securityOrigin,
                     const base::Value* value, NSError* error) {
    [weakWrapper aggregateJavaScriptValue:value
                                withError:error
                              isMainFrame:isMainFrame
                           securityOrigin:securityOrigin];
    barrier.Run();
  };

  // Construct the JavaScript script to be executed on each Web Frame with a
  // random token as nonce to differentiate between runs/executions.
  base::Token nonce = base::Token::CreateRandom();
  std::u16string nonceString = base::UTF8ToUTF16(nonce.ToString());
  std::u16string maybeAnchorTagsJavaScript =
      IsPageContextAnchorTagsEnabled() ? kAnchorTagsJavaScript : u"";
  std::u16string script = base::ReplaceStringPlaceholders(
      kInnerTextTreeJavaScript,
      base::span<const std::u16string>(
          {ios::provider::GetPageContextShouldDetachScript(),
           maybeAnchorTagsJavaScript, nonceString}),
      nullptr);

  // If the page is not protected, execute the JavaScript on the main WebFrame
  // first and pass in the callback (which executes the barrier when run).
  if (ios::provider::IsProtectedUrl(mainFrame->GetUrl().spec())) {
    _forceDetachPageContext = YES;
    annotatedPageContentBarrier.Run();
  } else {
    mainFrame->ExecuteJavaScript(
        script,
        base::BindOnce(callback, weakSelf, annotatedPageContentBarrier,
                       /*isMainFrame=*/YES, mainFrame->GetSecurityOrigin()));
  }

  // Execute the JavaScript on each other WebFrame and pass in the callback
  // (which executes the barrier when run).
  for (web::WebFrame* webFrame : webFrames) {
    // Only inspect the URL of a non-null frame; the null check must come
    // before any dereference.
    if (webFrame &&
        ios::provider::IsProtectedUrl(webFrame->GetUrl().spec())) {
      _forceDetachPageContext = YES;
    }

    // Skip if the frame is null, if it's the main frame since it was already
    // processed above, or if Page Context should already be force detached.
    // The barrier still has to be run once for the skipped frame.
    if (!webFrame || webFrame->IsMainFrame() || _forceDetachPageContext) {
      annotatedPageContentBarrier.Run();
      continue;
    }

    webFrame->ExecuteJavaScript(
        script,
        base::BindOnce(callback, weakSelf, annotatedPageContentBarrier,
                       /*isMainFrame=*/NO, webFrame->GetSecurityOrigin()));
  }
}
| |
// Final step once all async tasks are done (or there were none): stops the
// timeout timer and, unless the completion callback has already been consumed,
// clears text highlighting, logs the overall completion status and runs the
// completion callback. On success, ownership of the PageContext proto is
// relinquished to the callback's handler.
- (void)asyncWorkCompletedForPageContext {
  _timeoutTimer.Stop();

  // The callback has already been consumed; nothing left to report.
  if (!_completionCallback) {
    return;
  }

  [self stopTextHighlighting];

  // Pick the first applicable error, in priority order. No error means
  // success.
  std::optional<PageContextWrapperError> error;
  PageContextCompletionStatus completionStatus =
      PageContextCompletionStatus::kFailure;
  if (_forceDetachPageContext) {
    error = PageContextWrapperError::kForceDetachError;
    completionStatus = PageContextCompletionStatus::kProtected;
  } else if (_shouldGetAnnotatedPageContent &&
             !_pageContext->has_annotated_page_content()) {
    error = PageContextWrapperError::kAPCError;
  } else if (_shouldGetSnapshot && !_pageContext->has_tab_screenshot()) {
    error = PageContextWrapperError::kScreenshotError;
  } else if (_shouldGetFullPagePDF && !_pageContext->has_pdf_data()) {
    error = PageContextWrapperError::kPDFDataError;
  } else {
    completionStatus = PageContextCompletionStatus::kSuccess;
  }

  // Construct the response, either with the expected value or the error.
  PageContextWrapperCallbackResponse response;
  if (error.has_value()) {
    response = base::unexpected(error.value());
  } else {
    response = base::ok(std::move(_pageContext));
  }

  [_pageContextMetrics executionFinishedForTask:PageContextTask::kOverall
                           withCompletionStatus:completionStatus];

  std::move(_completionCallback).Run(std::move(response));
}
| |
// Whether a snapshot update should be forced: YES only when forcing the update
// of missing snapshots is enabled and `image` is nil.
- (BOOL)shouldUpdateSnapshotWithImage:(UIImage*)image {
  if (image) {
    return NO;
  }
  return _shouldForceUpdateMissingSnapshots;
}
| |
// Updates the snapshot for the current WebState, stores the resulting image on
// the PageContext, and executes the `barrier` callback when finished. This
// method can be reached from an asynchronous snapshot callback, by which time
// the WebState may have been destroyed; the screenshot task is then recorded
// as failed and the barrier is still run.
- (void)updateSnapshotWithBarrier:(base::RepeatingClosure)barrier {
  if (!_webState) {
    [self encodeImageAndSetTabScreenshot:nil];
    barrier.Run();
    return;
  }

  __weak PageContextWrapper* weakSelf = self;
  SnapshotTabHelper::FromWebState(_webState.get())
      ->UpdateSnapshotWithCallback(^(UIImage* image) {
        __strong __typeof(weakSelf) strongSelf = weakSelf;
        if (!strongSelf) {
          return;
        }
        [strongSelf encodeImageAndSetTabScreenshot:image];
        barrier.Run();
      });
}
| |
// Asks the current WebState's SnapshotTabHelper to update the snapshot and to
// report the result to `callback`. If the WebState no longer exists, nothing
// happens and `callback` is not invoked.
- (void)updateSnapshotWithCallback:(void (^)(UIImage*))callback {
  web::WebState* webState = _webState.get();
  if (!webState) {
    return;
  }

  SnapshotTabHelper::FromWebState(webState)->UpdateSnapshotWithCallback(
      callback);
}
| |
// Stops text highlighting, then encodes `image` as PNG and as a base64 string,
// and sets it as the tab screenshot of the current PageContext. A nil image or
// a failed PNG conversion is logged as a screenshot task failure and leaves the
// PageContext untouched.
- (void)encodeImageAndSetTabScreenshot:(UIImage*)image {
  [self stopTextHighlighting];

  // Work out whether, and why, the screenshot cannot be encoded.
  NSData* imageData = nil;
  const char* failureMessage = nullptr;
  if (!image) {
    failureMessage = "Failed to fetch webpage screenshot.";
  } else {
    imageData = UIImagePNGRepresentation(image);
    if (!imageData) {
      failureMessage = "Failed to convert the screenshot to PNG.";
    }
  }

  if (failureMessage) {
    [_pageContextMetrics
        executionFinishedForTask:PageContextTask::kScreenshot
            withCompletionStatus:PageContextCompletionStatus::kFailure];
    DLOG(WARNING) << failureMessage;
    return;
  }

  NSString* encodedScreenshot = [imageData base64EncodedStringWithOptions:0];
  _pageContext->set_tab_screenshot(base::SysNSStringToUTF8(encodedScreenshot));

  [_pageContextMetrics
      executionFinishedForTask:PageContextTask::kScreenshot
          withCompletionStatus:PageContextCompletionStatus::kSuccess];
}
| |
// Stores `PDFData` on the PageContext proto as a base64 encoded string and
// logs the PDF task as successful. Nil data is logged as a PDF task failure
// and leaves the PageContext untouched.
- (void)encodeAndSetFullPagePDF:(NSData*)PDFData {
  if (PDFData) {
    NSString* encodedPDF = [PDFData base64EncodedStringWithOptions:0];
    _pageContext->set_pdf_data(base::SysNSStringToUTF8(encodedPDF));

    [_pageContextMetrics
        executionFinishedForTask:PageContextTask::kPDF
            withCompletionStatus:PageContextCompletionStatus::kSuccess];
    return;
  }

  [_pageContextMetrics
      executionFinishedForTask:PageContextTask::kPDF
          withCompletionStatus:PageContextCompletionStatus::kFailure];
  DLOG(WARNING) << "Failed to fetch webpage PDF data.";
}
| |
// Handles the result of the innerText tree script for one WebFrame. A usable
// result is a dictionary returned without error; anything else is logged and
// ignored. If the result asks for the PageContext to be detached, the force
// detach flag is set and the APC task is logged as protected. Otherwise the
// frame's ContentNode subtree is built and inserted under the APC root node:
// the main frame contributes its own data, and every frame contributes the
// subtrees of its children frames.
- (void)aggregateJavaScriptValue:(const base::Value*)value
                       withError:(NSError*)error
                     isMainFrame:(BOOL)isMainFrame
                  securityOrigin:(const url::Origin&)securityOrigin {
  const bool hasUsableValue = !error && value && value->is_dict();
  if (!hasUsableValue) {
    DLOG(WARNING) << "Failed to fetch frame's innerText tree.";
    if (error) {
      // TODO(crbug.com/401282824): Log the failure rate of aggregation.
      DLOG(WARNING) << base::SysNSStringToUTF8([error localizedDescription]);
    }
    return;
  }

  // Once detached, results from the remaining frames are dropped.
  if (_forceDetachPageContext) {
    return;
  }

  const base::Value::Dict& frameDict = value->GetDict();

  // Check if PageContext should be force detached.
  // TODO(crbug.com/423681226): Force detaching PageContext shouldn't depend on
  // fetching innerText/APC, it should always be enabled.
  if (frameDict.FindBool(kShouldDetachPageContext).value_or(false)) {
    _forceDetachPageContext = YES;
    [_pageContextMetrics
        executionFinishedForTask:PageContextTask::kAnnotatedPageContent
            withCompletionStatus:PageContextCompletionStatus::kProtected];
    return;
  }

  if (isMainFrame) {
    [self populateMainFrameSubtreeWithValue:value origin:securityOrigin];
  }

  // Recursively populate the ContentNode subtree for any of the WebFrame's
  // iframes. Entries that are not dictionaries are skipped.
  const base::Value::List* childFrames =
      frameDict.FindList(kChildrenFramesDictKey);
  if (!childFrames) {
    return;
  }
  for (const base::Value& childFrame : *childFrames) {
    if (childFrame.is_dict()) {
      [self populateIframeSubtreeWithValue:&childFrame
                                    origin:securityOrigin
                                parentNode:_rootAPCNode->mutable_root_node()];
    }
  }
}
| |
// Transfers ownership of the constructed APC tree to the PageContext proto and
// logs the AnnotatedPageContent task as successful.
- (void)webFramesAnnotatedPageContentFetchCompleted {
  // The proto takes ownership of the released pointer.
  optimization_guide::proto::AnnotatedPageContent* annotatedPageContent =
      _rootAPCNode.release();
  _pageContext->set_allocated_annotated_page_content(annotatedPageContent);

  [_pageContextMetrics
      executionFinishedForTask:PageContextTask::kAnnotatedPageContent
          withCompletionStatus:PageContextCompletionStatus::kSuccess];
}
| |
// Fills in the main frame's part of the APC tree from `value`: the main frame
// FrameData, a text ContentNode under the root node and, when anchor tags are
// enabled, the anchor ContentNodes under the root node. Does nothing unless
// `value` is a dictionary.
- (void)populateMainFrameSubtreeWithValue:(const base::Value*)value
                                   origin:(const url::Origin&)origin {
  const bool isDict = value && value->is_dict();
  if (!isDict) {
    return;
  }

  // Main frame data (security origin, title, URL).
  [self populateFrameDataNode:_rootAPCNode->mutable_main_frame_data()
                    withValue:value
                       origin:origin];

  // Child text node of the root node.
  [self populateTextInfoNodeWithValue:value
                               origin:origin
                           parentNode:_rootAPCNode->mutable_root_node()];

  // Children anchor nodes of the root node, only when the feature is enabled.
  if (!IsPageContextAnchorTagsEnabled()) {
    return;
  }
  [self populateAnchorNodeChildrenWithValue:value
                                 parentNode:_rootAPCNode->mutable_root_node()];
}
| |
// Fills `frameDataNode` with the serialized security `origin` and, when they
// are present as strings in `value`, the frame's title and source URL. Does
// nothing if `frameDataNode` is null or `value` is not a dictionary.
- (void)populateFrameDataNode:
            (optimization_guide::proto::FrameData*)frameDataNode
                    withValue:(const base::Value*)value
                       origin:(const url::Origin&)origin {
  if (!frameDataNode || !value || !value->is_dict()) {
    return;
  }

  optimization_guide::SecurityOriginSerializer::Serialize(
      origin, frameDataNode->mutable_security_origin());

  const base::Value::Dict& frameDict = value->GetDict();

  if (const std::string* title = frameDict.FindString(kFrameTitleDictKey)) {
    frameDataNode->set_title(*title);
  }

  if (const std::string* url = frameDict.FindString(kSourceURLDictKey)) {
    frameDataNode->set_url(*url);
  }
}
| |
// Appends a text ContentNode to `parentNode`, holding the innerText found in
// `value` with leading and trailing ASCII whitespace trimmed. No node is added
// when the innerText is missing or is empty after trimming, when `value` is
// not a dictionary, or when `parentNode` is null.
- (void)populateTextInfoNodeWithValue:(const base::Value*)value
                               origin:(const url::Origin&)origin
                           parentNode:(optimization_guide::proto::ContentNode*)
                                          parentNode {
  if (!parentNode || !value || !value->is_dict()) {
    return;
  }

  // Early return if there is no text to add.
  const std::string* innerText =
      value->GetDict().FindString(kCurrentNodeInnerTextDictKey);
  if (!innerText) {
    return;
  }
  std::string_view textContent =
      base::TrimWhitespaceASCII(*innerText, base::TRIM_ALL);
  if (textContent.empty()) {
    return;
  }

  // Create and add the text node.
  auto* textAttributes =
      parentNode->add_children_nodes()->mutable_content_attributes();
  textAttributes->set_attribute_type(
      optimization_guide::proto::CONTENT_ATTRIBUTE_TEXT);
  textAttributes->mutable_text_data()->set_text_content(textContent);
}
| |
// Appends an iframe ContentNode to `parentNode` for the frame described by
// `value`. The iframe node carries the frame's FrameData and a single nested
// root ContentNode, under which the frame's text node, its anchor nodes (when
// enabled) and, recursively, the subtrees of its children iframes are added.
// Does nothing if `value` is not a dictionary or `parentNode` is null.
- (void)populateIframeSubtreeWithValue:(const base::Value*)value
                                origin:(const url::Origin&)origin
                            parentNode:(optimization_guide::proto::ContentNode*)
                                           parentNode {
  if (!parentNode || !value || !value->is_dict()) {
    return;
  }

  // Create the child iframe node.
  optimization_guide::proto::ContentNode* iframeNode =
      parentNode->add_children_nodes();
  auto* iframeAttributes = iframeNode->mutable_content_attributes();
  iframeAttributes->set_attribute_type(
      optimization_guide::proto::CONTENT_ATTRIBUTE_IFRAME);

  // Set its FrameData values.
  [self populateFrameDataNode:iframeAttributes->mutable_iframe_data()
                                  ->mutable_frame_data()
                    withValue:value
                       origin:origin];

  // Create the nested root child ContentNode.
  optimization_guide::proto::ContentNode* nestedRootNode =
      iframeNode->add_children_nodes();
  nestedRootNode->mutable_content_attributes()->set_attribute_type(
      optimization_guide::proto::CONTENT_ATTRIBUTE_ROOT);

  // Create the nested text node.
  [self populateTextInfoNodeWithValue:value
                               origin:origin
                           parentNode:nestedRootNode];

  // Create the children anchor nodes.
  if (IsPageContextAnchorTagsEnabled()) {
    [self populateAnchorNodeChildrenWithValue:value parentNode:nestedRootNode];
  }

  // Recursively populate the ContentNode subtree for any children iframes.
  // Entries that are not dictionaries are skipped.
  const base::Value::List* nestedFrames =
      value->GetDict().FindList(kChildrenFramesDictKey);
  if (!nestedFrames) {
    return;
  }
  for (const base::Value& nestedFrame : *nestedFrames) {
    if (!nestedFrame.is_dict()) {
      continue;
    }
    [self populateIframeSubtreeWithValue:&nestedFrame
                                  origin:origin
                              parentNode:nestedRootNode];
  }
}
| |
// Adds one anchor node, as a direct child of `parentNode`, for every entry of
// the links list found in `value`. Does nothing if `value` is not a dictionary,
// `parentNode` is null, or there is no links list.
- (void)populateAnchorNodeChildrenWithValue:(const base::Value*)value
                                 parentNode:
                                     (optimization_guide::proto::ContentNode*)
                                         parentNode {
  if (!parentNode || !value || !value->is_dict()) {
    return;
  }

  const base::Value::List* frameLinks =
      value->GetDict().FindList(kFrameLinksDictKey);
  if (!frameLinks) {
    return;
  }

  // An empty list simply results in no iteration.
  for (const base::Value& link : *frameLinks) {
    [self populateAnchorNodeWithValue:&link parentNode:parentNode];
  }
}
| |
// Appends an anchor ContentNode to `parentNode` holding the link's URL. If the
// link also has text that is not empty after trimming ASCII whitespace, a text
// ContentNode holding that (untrimmed) text is added as the anchor's child. In
// the anchor tags script visible in this file, that text is the anchor's
// textContent. Does nothing if `linkData` is not a dictionary, `parentNode` is
// null, or the link has no non-empty HREF.
- (void)populateAnchorNodeWithValue:(const base::Value*)linkData
                         parentNode:(optimization_guide::proto::ContentNode*)
                                        parentNode {
  if (!parentNode || !linkData || !linkData->is_dict()) {
    return;
  }

  const base::Value::Dict& linkDict = linkData->GetDict();

  const std::string* linkURL = linkDict.FindString(kLinkHREFDictKey);
  if (!linkURL || linkURL->empty()) {
    return;
  }

  // Create the anchor node and set the anchor data (the HREF).
  optimization_guide::proto::ContentNode* anchorNode =
      parentNode->add_children_nodes();
  auto* anchorAttributes = anchorNode->mutable_content_attributes();
  anchorAttributes->set_attribute_type(
      optimization_guide::proto::CONTENT_ATTRIBUTE_ANCHOR);
  anchorAttributes->mutable_anchor_data()->set_url(*linkURL);

  // Only links with visible (non-whitespace) text get a child text node. An
  // empty string is also empty after trimming, so one check covers both.
  const std::string* anchorText = linkDict.FindString(kLinkTextDictKey);
  if (!anchorText ||
      base::TrimWhitespaceASCII(*anchorText, base::TRIM_ALL).empty()) {
    return;
  }

  // Create the child text node and set its text value.
  auto* textAttributes =
      anchorNode->add_children_nodes()->mutable_content_attributes();
  textAttributes->set_attribute_type(
      optimization_guide::proto::CONTENT_ATTRIBUTE_TEXT);
  textAttributes->mutable_text_data()->set_text_content(*anchorText);
}
| |
// Stops find-in-page on the WebState's main frame, which removes the text
// highlighting. Does nothing if the WebState no longer exists or has no main
// frame.
- (void)stopTextHighlighting {
  web::WebFrame* mainFrame =
      _webState ? _webState->GetPageWorldWebFramesManager()->GetMainWebFrame()
                : nullptr;
  if (!mainFrame) {
    return;
  }

  web::FindInPageJavaScriptFeature::GetInstance()->Stop(mainFrame);
}
| |
// Called by the timeout timer when the overall execution takes too long. Unless
// the completion callback has already been consumed, stops text highlighting,
// logs the overall task as timed out and runs the completion callback with
// `kTimeout`.
- (void)onTimeout {
  if (_completionCallback) {
    [self stopTextHighlighting];

    DLOG(WARNING) << "PageContextWrapper execution timed out.";

    [_pageContextMetrics
        executionFinishedForTask:PageContextTask::kOverall
            withCompletionStatus:PageContextCompletionStatus::kTimeout];

    // Consuming the callback here makes a later completion a no-op.
    std::move(_completionCallback)
        .Run(base::unexpected(PageContextWrapperError::kTimeout));
  }
}
| |
| @end |