java/org/chromium/distiller/ContentExtractor.java - chromium/dom-distiller - Git at Google

 // Copyright 2014 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 package org.chromium.distiller;

 import org.chromium.distiller.document.TextDocument;
 import org.chromium.distiller.document.TextDocumentStatistics;
 import org.chromium.distiller.extractors.ArticleExtractor;
 import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;
 import org.chromium.distiller.proto.DomDistillerProtos.TimingEntry;
 import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;
 import org.chromium.distiller.webdocument.DomConverter;
 import org.chromium.distiller.webdocument.WebDocument;
 import org.chromium.distiller.webdocument.WebDocumentBuilder;
 import org.chromium.distiller.webdocument.WebImage;
 import org.chromium.distiller.webdocument.filters.RelevantElements;
 import org.chromium.distiller.webdocument.filters.LeadImageFinder;
 import org.chromium.distiller.webdocument.filters.NestedElementRetainer;

 import com.google.gwt.dom.client.Document;
 import com.google.gwt.dom.client.Element;
 import com.google.gwt.dom.client.Node;
 import com.google.gwt.dom.client.NodeList;

 import java.util.ArrayList;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;

 /**
  * Extracts the title and the distilled content of the page rooted at a DOM element, and records
  * timing and statistics information along the way.
  */
 public class ContentExtractor {
     private final Element documentElement;
     private final List<String> candidateTitles;
     private final TimingInfo mTimingInfo;
     private final StatisticsInfo mStatisticsInfo;
     private final MarkupParser parser;
     private final List<String> imageUrls;
     private String textDirection;

     /**
      * Holder for the outcome of converting the page: the WebDocument built from the DOM walk and
      * the hidden elements reported by the converter. Static because it needs no access to the
      * enclosing ContentExtractor.
      */
     private static class WebDocumentInfo {
         WebDocument document;
         Set<Node> hiddenElements;
     }

     /**
      * Creates an extractor for the tree under {@code root}. The markup parser is constructed right
      * away and the time that takes is stored as the markup parsing time.
      *
      * @param root the element the extraction starts from.
      */
     public ContentExtractor(Element root) {
         documentElement = root;
         candidateTitles = new LinkedList<String>();
         mTimingInfo = TimingInfo.create();
         mStatisticsInfo = StatisticsInfo.create();
         imageUrls = new ArrayList<String>();

         double startTime = DomUtil.getTime();
         parser = new MarkupParser(root, mTimingInfo);
         mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);
         textDirection = "";
     }

     // Grabs a list of candidate titles in descending priority order:
     // 1) meta-information
     // 2) The document's title element, modified based on some readability heuristics
     // 3) The document's title element, if it's a string
     // The list is filled only once; later calls return immediately.
     private void ensureTitleInitialized() {
         if (candidateTitles.size() > 0) return;

         String title = parser.getTitle();
         if (!title.isEmpty()) {
             candidateTitles.add(title);
         }
         candidateTitles.add(DocumentTitleGetter.getDocumentTitle(
                     Document.get().getTitle(), Document.get().getDocumentElement()));
         // NOTE(review): presumably guards against the DOM handing back a non-string title
         // value -- confirm.
         if (Document.get().getTitle().getClass() == String.class) {
             candidateTitles.add(Document.get().getTitle());
         }
     }

     /**
      * @return the markup parser created for the root element in the constructor.
      */
     public MarkupParser getMarkupParser() { return parser; }

     /**
      * Returns the highest-priority candidate title, computing the candidates on first use.
      * @return the first entry of the candidate title list.
      */
     public String extractTitle() {
         ensureTitleInitialized();
         assert candidateTitles.size() > 0;
         return candidateTitles.get(0);
     }

     /**
      * Runs the extraction with {@code textOnly} set to false.
      * @return the generated output for the page.
      */
     public String extractContent() {
         return extractContent(false);
     }

     /**
      * Converts the page into a WebDocument, runs the article extraction and the document filters
      * on it, collects the content image URLs and generates the output. The duration of each
      * phase is stored in the timing info.
      *
      * @param textOnly forwarded to WebDocument.generateOutput; presumably selects plain-text
      *     rather than HTML output -- confirm against WebDocument.
      * @return the output generated from the processed document.
      */
     public String extractContent(boolean textOnly) {
         double now = DomUtil.getTime();
         WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();
         mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);

         now = DomUtil.getTime();
         processDocument(documentInfo.document);
         RelevantElements.process(documentInfo.document);
         LeadImageFinder.process(documentInfo.document);
         NestedElementRetainer.process(documentInfo.document);

         // Start from an empty list so that a repeated call reports only this run's images
         // instead of appending duplicates to the URLs of the previous run.
         imageUrls.clear();
         for (WebImage wi : documentInfo.document.getContentImages()) {
             imageUrls.add(wi.getSrc());
         }
         mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);

         now = DomUtil.getTime();
         String html = documentInfo.document.generateOutput(textOnly);
         mTimingInfo.setFormattingTime(DomUtil.getTime() - now);

         logTimingInfo();
         return html;
     }

     /**
      * Writes the recorded timings to the console when timing-level debug logging is enabled.
      */
     private void logTimingInfo() {
         if (!LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) return;

         for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) {
             TimingEntry entry = mTimingInfo.getOtherTimes(i);
             LogUtil.logToConsole("Timing: " + entry.getName() + " = " + entry.getTime());
         }

         LogUtil.logToConsole(
                 "Timing: MarkupParsingTime = " +
                 mTimingInfo.getMarkupParsingTime() +
                 "\nTiming: DocumentConstructionTime = " +
                 mTimingInfo.getDocumentConstructionTime() +
                 "\nTiming: ArticleProcessingTime = " +
                 mTimingInfo.getArticleProcessingTime() +
                 "\nTiming: FormattingTime = " +
                 mTimingInfo.getFormattingTime()
                 );
     }

     /**
      * Returns timing information about the most recent extraction run.
      * @return an instance of DomDistillerProtos.TimingInfo with detailed timing statistics.
      */
     public TimingInfo getTimingInfo() {
         return mTimingInfo;
     }

     /**
      * Returns statistical information about the most recent extraction run.
      * @return an instance of DomDistillerProtos.StatisticsInfo with detailed statistics.
      */
     public StatisticsInfo getStatisticsInfo() {
         return mStatisticsInfo;
     }

     /**
      * Get the page's text directionality ("ltr", "rtl", or "auto").
      * @return The page's text direction (default is "auto").
      */
     public String getTextDirection() {
         if (textDirection == null || textDirection.isEmpty()) {
             textDirection = "auto";
         }
         return textDirection;
     }

     /**
      * Get a list of the content image URLs collected by the most recent extraction run.
      * @return A list of image URLs; this is the extractor's own list, not a copy.
      */
     public List<String> getImageUrls() {
         return imageUrls;
     }

     /**
      * Get the element of the main article, if any.
      * @return An element of article (not necessarily the html5 article element), or null when
      *     neither a single ARTICLE element nor any matching microdata element is found.
      */
     private Element getArticleElement(Element root) {
         NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
         // Having multiple article elements usually indicates a bad case for this shortcut.
         // TODO(wychen): some sites exclude things like title and author in article element.
         if (allArticles.getLength() == 1) {
             return allArticles.getItem(0);
         }
         // Note that the CSS property matching is case sensitive, and "Article" is the correct
         // capitalization. The *= operator matches "Article"/"Post" anywhere in the itemtype
         // value.
         String query = "[itemscope][itemtype*=\"Article\"],[itemscope][itemtype*=\"Post\"]";
         allArticles = DomUtil.querySelectorAll(root, query);
         // It is commonly seen that the article is wrapped separately or in multiple layers.
         if (allArticles.getLength() > 0) {
             return Element.as(DomUtil.getNearestCommonAncestor(allArticles));
         }
         return null;
     }

     /**
      * Converts the original HTML page into a WebDocument for analysis. The walk starts at the
      * article element when one is found and at the root element otherwise.
      */
     private WebDocumentInfo createWebDocumentInfoFromPage() {
         WebDocumentInfo info = new WebDocumentInfo();
         WebDocumentBuilder documentBuilder = new WebDocumentBuilder();
         DomConverter converter = new DomConverter(documentBuilder);
         Element walkerRoot = getArticleElement(documentElement);
         if (walkerRoot == null) {
             walkerRoot = documentElement;
         }
         new DomWalker(converter).walk(walkerRoot);
         info.document = documentBuilder.toWebDocument();
         ensureTitleInitialized();
         info.hiddenElements = converter.getHiddenElements();

         return info;
     }

     /**
      * Implements the actual analysis of the page content, identifying the core elements of the
      * page. Also stores the word count of the content in the statistics info.
      *
      * @param document the WebDocument representation of the page extracted from the DOM.
      */
     private void processDocument(WebDocument document) {
         TextDocument textDocument = document.createTextDocumentView();
         ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);
         mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));
         textDocument.applyToModel();
     }
 }
	// Copyright 2014 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	package org.chromium.distiller;

	import org.chromium.distiller.document.TextDocument;
	import org.chromium.distiller.document.TextDocumentStatistics;
	import org.chromium.distiller.extractors.ArticleExtractor;
	import org.chromium.distiller.proto.DomDistillerProtos.StatisticsInfo;
	import org.chromium.distiller.proto.DomDistillerProtos.TimingEntry;
	import org.chromium.distiller.proto.DomDistillerProtos.TimingInfo;
	import org.chromium.distiller.webdocument.DomConverter;
	import org.chromium.distiller.webdocument.WebDocument;
	import org.chromium.distiller.webdocument.WebDocumentBuilder;
	import org.chromium.distiller.webdocument.WebImage;
	import org.chromium.distiller.webdocument.filters.RelevantElements;
	import org.chromium.distiller.webdocument.filters.LeadImageFinder;
	import org.chromium.distiller.webdocument.filters.NestedElementRetainer;

	import com.google.gwt.dom.client.Document;
	import com.google.gwt.dom.client.Element;
	import com.google.gwt.dom.client.Node;
	import com.google.gwt.dom.client.NodeList;

	import java.util.ArrayList;
	import java.util.LinkedList;
	import java.util.List;
	import java.util.Set;

	public class ContentExtractor {
	private final Element documentElement;
	private final List<String> candidateTitles;
	private final TimingInfo mTimingInfo;
	private final StatisticsInfo mStatisticsInfo;
	private final MarkupParser parser;
	private final List<String> imageUrls;
	private String textDirection;

	private class WebDocumentInfo {
	WebDocument document;
	Set<Node> hiddenElements;
	}

	public ContentExtractor(Element root) {
	documentElement = root;
	candidateTitles = new LinkedList<String>();
	mTimingInfo = TimingInfo.create();
	mStatisticsInfo = StatisticsInfo.create();
	imageUrls = new ArrayList<String>();

	double startTime = DomUtil.getTime();
	parser = new MarkupParser(root, mTimingInfo);
	mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime);
	textDirection = "";
	}

	// Grabs a list of candidate titles in descending priority order:
	// 1) meta-information
	// 2) The document's title element, modified based on some readability heuristics
	// 3) The document's title element, if it's a string
	private void ensureTitleInitialized() {
	if (candidateTitles.size() > 0) return;

	String title = parser.getTitle();
	if (!title.isEmpty()) {
	candidateTitles.add(title);
	}
	candidateTitles.add(DocumentTitleGetter.getDocumentTitle(
	Document.get().getTitle(), Document.get().getDocumentElement()));
	if (Document.get().getTitle().getClass() == String.class) {
	candidateTitles.add(Document.get().getTitle());
	}
	}

	public MarkupParser getMarkupParser() { return parser; }

	public String extractTitle() {
	ensureTitleInitialized();
	assert candidateTitles.size() > 0;
	return candidateTitles.get(0);
	}

	public String extractContent() {
	return extractContent(false);
	}

	public String extractContent(boolean textOnly) {
	double now = DomUtil.getTime();
	WebDocumentInfo documentInfo = createWebDocumentInfoFromPage();
	mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now);

	now = DomUtil.getTime();
	processDocument(documentInfo.document);
	RelevantElements.process(documentInfo.document);
	LeadImageFinder.process(documentInfo.document);
	NestedElementRetainer.process(documentInfo.document);

	List<WebImage> images = documentInfo.document.getContentImages();
	for (WebImage wi : images) {
	imageUrls.add(wi.getSrc());
	}
	mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now);

	now = DomUtil.getTime();
	String html = documentInfo.document.generateOutput(textOnly);
	mTimingInfo.setFormattingTime(DomUtil.getTime() - now);

	if (LogUtil.isLoggable(LogUtil.DEBUG_LEVEL_TIMING_INFO)) {
	for (int i = 0; i < mTimingInfo.getOtherTimesCount(); i++) {
	TimingEntry entry = mTimingInfo.getOtherTimes(i);
	LogUtil.logToConsole("Timing: " + entry.getName() + " = " + entry.getTime());
	}

	LogUtil.logToConsole(
	"Timing: MarkupParsingTime = " +
	mTimingInfo.getMarkupParsingTime() +
	"\nTiming: DocumentConstructionTime = " +
	mTimingInfo.getDocumentConstructionTime() +
	"\nTiming: ArticleProcessingTime = " +
	mTimingInfo.getArticleProcessingTime() +
	"\nTiming: FormattingTime = " +
	mTimingInfo.getFormattingTime()
	);
	}
	return html;
	}

	/**
	* Returns timing information about the most recent extraction run.
	* @return an instance of DomDistillerProtos.TimingInfo with detailed timing statistics.
	*/
	public TimingInfo getTimingInfo() {
	return mTimingInfo;
	}

	/**
	* Returns statistical information about the most recent extraction run.
	* @return an instance of DomDistillerProtos.StatisticsInfo with detailed statistics.
	*/
	public StatisticsInfo getStatisticsInfo() {
	return mStatisticsInfo;
	}

	/**
	* Get the page's text directionality ("ltr", "rtl", or "auto").
	* @return The page's text direction (default is "auto").
	*/
	public String getTextDirection() {
	if (textDirection == null \|\| textDirection.isEmpty()) {
	textDirection = "auto";
	}
	return textDirection;
	}

	/**
	* Get a list of the content image URLs in the provided document.
	* @return A list of image URLs.
	*/
	public List<String> getImageUrls() {
	return imageUrls;
	}

	/**
	* Get the element of the main article, if any.
	* @return An element of article (not necessarily the html5 article element).
	*/
	private Element getArticleElement(Element root) {
	NodeList<Element> allArticles = root.getElementsByTagName("ARTICLE");
	// Having multiple article elements usually indicates a bad case for this shortcut.
	// TODO(wychen): some sites exclude things like title and author in article element.
	if (allArticles.getLength() == 1) {
	return allArticles.getItem(0);
	}
	// Note that the CSS property matching is case sensitive, and "Article" is the correct
	// capitalization.
	String query = "[itemscope][itemtype=\"Article\"],[itemscope][itemtype=\"Post\"]";
	allArticles = DomUtil.querySelectorAll(root, query);
	// It is commonly seen that the article is wrapped separately or in multiple layers.
	if (allArticles.getLength() > 0) {
	return Element.as(DomUtil.getNearestCommonAncestor(allArticles));
	}
	return null;
	}

	/**
	* Converts the original HTML page into a WebDocument for analysis.
	*/
	private WebDocumentInfo createWebDocumentInfoFromPage() {
	WebDocumentInfo info = new WebDocumentInfo();
	WebDocumentBuilder documentBuilder = new WebDocumentBuilder();
	DomConverter converter = new DomConverter(documentBuilder);
	Element walkerRoot = getArticleElement(documentElement);
	if (walkerRoot == null) {
	walkerRoot = documentElement;
	}
	new DomWalker(converter).walk(walkerRoot);
	info.document = documentBuilder.toWebDocument();
	ensureTitleInitialized();
	info.hiddenElements = converter.getHiddenElements();

	return info;
	}

	/**
	* Implements the actual analysis of the page content, identifying the core elements of the
	* page.
	*
	* @param document the WebDocument representation of the page extracted from the DOM.
	*/
	private void processDocument(WebDocument document) {
	TextDocument textDocument = document.createTextDocumentView();
	ArticleExtractor.INSTANCE.process(textDocument, candidateTitles);
	mStatisticsInfo.setWordCount(TextDocumentStatistics.countWordsInContent(textDocument));
	textDocument.applyToModel();
	}
	}