| // Copyright 2014 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| package org.chromium.distiller; |
| |
| import org.chromium.distiller.document.TextDocument; |
| import org.chromium.distiller.document.TextDocumentStatistics; |
| import org.chromium.distiller.extractors.ArticleExtractor; |
| import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo; |
| import org.chromium.distiller.proto.DomDistillerProtos.TimingEntry; |
| import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo; |
| import org.chromium.distiller.webdocument.DomConverter; |
| import org.chromium.distiller.webdocument.WebDocument; |
| import org.chromium.distiller.webdocument.WebDocumentBuilder; |
| import org.chromium.distiller.webdocument.WebImage; |
| import org.chromium.distiller.webdocument.filters.RelevantElements; |
| import org.chromium.distiller.webdocument.filters.LeadImageFinder; |
| import org.chromium.distiller.webdocument.filters.NestedElementRetainer; |
| |
| import com.google.gwt.dom.client.Document; |
| import com.google.gwt.dom.client.Element; |
| import com.google.gwt.dom.client.Node; |
| import com.google.gwt.dom.client.NodeList; |
| |
| import java.util.ArrayList; |
| import java.util.LinkedList; |
| import java.util.List; |
| import java.util.Set; |
| |
| public class ContentExtractor { |
| private final Element documentElement; |
| private final List<String> candidateTitles; |
| private final TimingInfo mTimingInfo; |
| private final StatisticsInfo mStatisticsInfo; |
| private final MarkupParser parser; |
| private final List<String> imageUrls; |
| private String textDirection; |
| |
| private class WebDocumentInfo { |
| WebDocument document; |
| Set<Node> hiddenElements; |
| } |
| |
| public ContentExtractor(Element root) { |
| documentElement = root; |
| candidateTitles = new LinkedList<String>(); |
| mTimingInfo = TimingInfo.create(); |
| mStatisticsInfo = StatisticsInfo.create(); |
| imageUrls = new ArrayList<String>(); |
| |
| double startTime = DomUtil.getTime(); |
| parser = new MarkupParser(root, mTimingInfo); |
| mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime); |
| textDirection = ""; |
| } |
| |
| // Grabs a list of candidate titles in descending priority order: |
| // 1) meta-information |
| // 2) The document's title element, modified based on some readability heuristics |
| // 3) The document's title element, if it's a string |
| private void ensureTitleInitialized() { |
| if (candidateTitles.size() > 0) return; |
| |
| String title = parser.getTitle(); |
| if (!title.isEmpty()) { |
| candidateTitles.add(title); |
| } |
| candidateTitles.add(DocumentTitleGetter.getDocumentTitle( |
| Document.get().getTitle(), Document.get().getDocumentElement())); |
| if (Document.get().getTitle().getClass() == String.class) { |
| candidateTitles.add(Document.get().getTitle()); |
| } |
| } |
| |
| public MarkupParser getMarkupParser() { return parser; } |
| |
| public String extractTitle() { |
| ensureTitleInitialized(); |
| assert candidateTitles.size() > 0; |
| return candidateTitles.get(0); |
| } |
| |
| public String extractContent() { |
| return extractContent(false); |
| } |
| |
| public String extractContent(boolean textOnly) { |
| double now = DomUtil.getTime(); |
| WebDocumentInfo documentInfo = createWebDocumentInfoFromPage(); |
| mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); |
| |
| now = DomUtil.getTime(); |
| processDocument(documentInfo.document); |
| RelevantElements.process(documentInfo.document); |
| LeadImageFinder.process(documentInfo.document); |
| NestedElementRetainer.process(documentInfo.document); |
| |
| List<WebImage> images = documentInfo.document.getContentImages(); |
| for (WebImage wi : images) { |
| imageUrls.add(wi.getSrc()); |
| } |
| mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now); |
| |
| now = DomUtil.getTime(); |
| String html = documentInfo.document.generateOutput(textOnly); |
| mTimingInfo.setFormattingTime(DomUtil.getTime() - now); |
| |
| if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) { |
| for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) { |
| TimingEntry entry = mTimingInfo.getOtherTimes(i); |
| LogUtil.logToConsole("Timing: " + entry.getName() + " = " + entry.getTime()); |
| } |
| |
| LogUtil.logToConsole( |
| "Timing: MarkupParsingTime = " + |
| mTimingInfo.getMarkupParsingTime() + |
| "\nTiming: DocumentConstructionTime = " + |
| mTimingInfo.getDocumentConstructionTime() + |
| "\nTiming: ArticleProcessingTime = " + |
| mTimingInfo.getArticleProcessingTime() + |
| "\nTiming: FormattingTime = " + |
| mTimingInfo.getFormattingTime() |
| ); |
| } |
| return html; |
| } |
| |
| /** |
| * Returns timing information about the most recent extraction run. |
| * @return an instance of DomDistillerProtos.TimingInfo with detailed timing statistics. |
| */ |
| public TimingInfo getTimingInfo() { |
| return mTimingInfo; |
| } |
| |
| /** |
| * Returns statistical information about the most recent extraction run. |
| * @return an instance of DomDistillerProtos.StatisticsInfo with detailed statistics. |
| */ |
| public StatisticsInfo getStatisticsInfo() { |
| return mStatisticsInfo; |
| } |
| |
| /** |
| * Get the page's text directionality ("ltr", "rtl", or "auto"). |
| * @return The page's text direction (default is "auto"). |
| */ |
| public String getTextDirection() { |
| if (textDirection == null || textDirection.isEmpty()) { |
| textDirection = "auto"; |
| } |
| return textDirection; |
| } |
| |
| /** |
| * Get a list of the content image URLs in the provided document. |
| * @return A list of image URLs. |
| */ |
| public List<String> getImageUrls() { |
| return imageUrls; |
| } |
| |
| /** |
| * Get the element of the main article, if any. |
| * @return An element of article (not necessarily the html5 article element). |
| */ |
| private Element getArticleElement(Element root) { |
| NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE"); |
| // Having multiple article elements usually indicates a bad case for this shortcut. |
| // TODO(wychen): some sites exclude things like title and author in article element. |
| if (allArticles.getLength() == 1) { |
| return allArticles.getItem(0); |
| } |
| // Note that the CSS property matching is case sensitive, and "Article" is the correct |
| // capitalization. |
| String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]"; |
| allArticles = DomUtil.querySelectorAll(root, query); |
| // It is commonly seen that the article is wrapped separately or in multiple layers. |
| if (allArticles.getLength() > 0) { |
| return Element.as(DomUtil.getNearestCommonAncestor(allArticles)); |
| } |
| return null; |
| } |
| |
| /** |
| * Converts the original HTML page into a WebDocument for analysis. |
| */ |
| private WebDocumentInfo createWebDocumentInfoFromPage() { |
| WebDocumentInfo info = new WebDocumentInfo(); |
| WebDocumentBuilder documentBuilder = new WebDocumentBuilder(); |
| DomConverter converter = new DomConverter(documentBuilder); |
| Element walkerRoot = getArticleElement(documentElement); |
| if (walkerRoot == null) { |
| walkerRoot = documentElement; |
| } |
| new DomWalker(converter).walk(walkerRoot); |
| info.document = documentBuilder.toWebDocument(); |
| ensureTitleInitialized(); |
| info.hiddenElements = converter.getHiddenElements(); |
| |
| return info; |
| } |
| |
| /** |
| * Implements the actual analysis of the page content, identifying the core elements of the |
| * page. |
| * |
| * @param document the WebDocument representation of the page extracted from the DOM. |
| */ |
| private void processDocument(WebDocument document) { |
| TextDocument textDocument = document.createTextDocumentView(); |
| ArticleExtractor.INSTANCE.process(textDocument, candidateTitles); |
| mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument)); |
| textDocument.applyToModel(); |
| } |
| } |