// third_party/readability/js/readability.js - chromium/src - Git at Google

 // Copyright 2014 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 // Local modifications to this file are described in the README.chromium
 // file.

 var dbg = (typeof console !== 'undefined') ? function(s) {
     console.log("Readability: " + s);
 } : function() {};

 /*
  * Readability. An Arc90 Lab Experiment.
  * Website: http://lab.arc90.com/experiments/readability
  * Source:  http://code.google.com/p/arc90labs-readability
  *
  * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
  *
  * Copyright (c) 2010 Arc90 Inc
  * Readability is licensed under the Apache License, Version 2.0.
 **/
 var readability = {
     readStyle: "style-newspaper",
     readSize: "size-medium",
     readMargin: "margin-wide",

     distilledHTML: '',
     distilledArticleContent: null,
     nextPageLink: '',

     version:                '1.7.1',
     iframeLoads:             0,
     convertLinksToFootnotes: false,
     reversePageScroll:       false, /* If they hold shift and hit space, scroll up */
     frameHack:               false, /**
                                       * The frame hack is to workaround a firefox bug where if you
                                       * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
                                       * So we fake a scrollbar in the wrapping div.
                                      **/
     biggestFrame:            false,
     flags:                   0x1 | 0x2 | 0x4,   /* Start with all flags set. */

     /* constants */
     FLAG_STRIP_UNLIKELYS:     0x1,
     FLAG_WEIGHT_CLASSES:      0x2,
     FLAG_CLEAN_CONDITIONALLY: 0x4,

     maxPages:    30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
     parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
     pageETags:   {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */

     /**
      * All of the regular expressions in use within readability.
      * Defined up here so we don't instantiate them repeatedly in loops.
      **/
     regexps: {
         unlikelyCandidates:    /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
         okMaybeItsACandidate:  /and|article|body|column|main|shadow/i,
         positive:              /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
         negative:              /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
         extraneous:            /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
         divToPElements:        /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
         replaceBrs:            /(<br[^>]*>[ \n\r\t]*){2,}/gi,
         replaceFonts:          /<(\/?)font[^>]*>/gi,
         trim:                  /^\s+|\s+$/g,
         normalize:             /\s{2,}/g,
         killBreaks:            /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
         videos:                /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
         skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
         nextLink:              /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
         prevLink:              /(prev|earl|old|new|<|«)/i
     },

     /**
      * Runs readability.
      *
      * Workflow:
      *  1. Prep the document by removing script tags, css, etc.
      *  2. Build readability's DOM tree.
      *  3. Grab the article content from the current dom tree.
      *  4. Replace the current DOM tree with the new one.
      *  5. Read peacefully.
      *
      * @return void
      **/
     init: function() {
         /* Before we do anything, remove all scripts that are not readability. */
         window.onload = window.onunload = function() {};

         readability.removeScripts(document);

         /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
         readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;

         /* Pull out any possible next page link first */
         readability.nextPageLink = readability.findNextPageLink(document.body);

         /* We handle processing of nextPage from C++ set nextPageLink to null */
         var nextPageLink = null;

         readability.prepDocument();

         /* Build readability's DOM tree */
         var overlay        = document.createElement("DIV");
         var innerDiv       = document.createElement("DIV");
         var articleTools   = readability.getArticleTools();
         var articleTitleText   = readability.getArticleTitle();
         var articleContent = readability.grabArticle();

         if(!articleContent) {
             articleContent    = document.createElement("DIV");
             articleContent.id = "readability-content";
             articleContent.innerHTML = [
                 "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>",
                 (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames." : ""),
                 "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>"
             ].join('');

             nextPageLink = null;
         }

         overlay.id              = "readOverlay";
         innerDiv.id             = "readInner";

         /* Apply user-selected styling */
         document.body.className = readability.readStyle;
         document.dir            = readability.getSuggestedDirection(articleTitleText);

         if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura"){
             overlay.className = readability.readStyle + " rdbTypekit";
         } else {
             overlay.className = readability.readStyle;
         }
         innerDiv.className    = readability.readMargin + " " + readability.readSize;

         if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) {
             readability.convertLinksToFootnotes = true;
         }

         readability.distilledHTML = articleContent.innerHTML;

         if(readability.frameHack) {
             var readOverlay = document.getElementById('readOverlay');
             readOverlay.style.height = '100%';
             readOverlay.style.overflow = 'auto';
         }

         /**
          * If someone tries to use Readability on a site's root page, give them a warning about usage.
         **/
         if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) {
             articleContent.style.display = "none";
             var rootWarning = document.createElement('p');
                 rootWarning.id = "readability-warning";
                 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
                 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";

             innerDiv.insertBefore( rootWarning, articleContent );
         }

         readability.postProcessContent(articleContent);

         window.scrollTo(0, 0);

         if (nextPageLink) {
             /**
              * Append any additional pages after a small timeout so that people
              * can start reading without having to wait for this to finish processing.
             **/
             window.setTimeout(function() {
                 readability.appendNextPage(nextPageLink);
             }, 500);
         }

         /** Smooth scrolling **/
         document.onkeydown = function(e) {
             var code = (window.event) ? event.keyCode : e.keyCode;
             if (code === 16) {
                 readability.reversePageScroll = true;
                 return;
             }

             if (code === 32) {
                 readability.curScrollStep = 0;
                 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);

                 if(readability.reversePageScroll) {
                     readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
                 }
                 else {
                     readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
                 }

                 return false;
             }
         };

         document.onkeyup = function(e) {
             var code = (window.event) ? event.keyCode : e.keyCode;
             if (code === 16) {
                 readability.reversePageScroll = false;
                 return;
             }
         };
     },

     /**
      * Run any post-process modifications to article content as necessary.
      *
      * @param Element
      * @return void
     **/
     postProcessContent: function(articleContent) {
         if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {
             readability.addFootnotes(articleContent);
         }

         readability.fixImageFloats(articleContent);
     },

     /**
      * Some content ends up looking ugly if the image is too large to be floated.
      * If the image is wider than a threshold (currently 55%), no longer float it,
      * center it instead.
      *
      * @param Element
      * @return void
     **/
     fixImageFloats: function (articleContent) {
         var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,
             images              = articleContent.getElementsByTagName('img');

         for(var i=0, il = images.length; i < il; i+=1) {
             var image = images[i];

             if(image.offsetWidth > imageWidthThreshold) {
                 image.className += " blockImage";
             }
         }
     },

     /**
      * Get the article tools Element that has buttons like reload, print.
      *
      * @return Element
      **/
     getArticleTools: function () {
         var articleTools = document.createElement("DIV");

         articleTools.id        = "readTools";
         articleTools.innerHTML =
             "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
             "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
             "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";

         return articleTools;
     },

     /**
      * Returns the suggested text direction for the given string.
      *
      * @return "rtl" || "ltr"
      **/
     getSuggestedDirection: function(text) {
         function sanitizeText() {
             return text.replace(/@\w+/, "");
         }

         function countMatches(match) {
             var matches = text.match(new RegExp(match, "g"));
             return matches !== null ? matches.length : 0;
         }

         function isRTL() {
             var count_heb =  countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
             var count_arb =  countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");

             // if 20% of chars are Hebrew or Arbic then direction is rtl
             return  (count_heb + count_arb) * 100 / text.length > 20;
         }

         text  = sanitizeText(text);
         return isRTL() ? "rtl" : "ltr";
     },

     /**
      * Get the article title as a plain string.
      *
      * @return string
      **/
     getArticleTitle: function () {
         var curTitle = "",
             origTitle = "";

         try {
             curTitle = origTitle = document.title;
             if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
                 curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
             }
         }
         catch(e) {}

         if(curTitle.match(/ [\|\-] /))
         {
             curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');

             if(curTitle.split(' ').length < 3) {
                 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
             }
         }
         else if(curTitle.indexOf(': ') !== -1)
         {
             curTitle = origTitle.replace(/.*:(.*)/gi, '$1');

             if(curTitle.split(' ').length < 3) {
                 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
             }
         }
         else if(curTitle.length > 150 || curTitle.length < 15)
         {
             var hOnes = document.getElementsByTagName('h1');
             if(hOnes.length === 1)
             {
                 curTitle = readability.getInnerText(hOnes[0]);
             }
         }

         curTitle = curTitle.replace( readability.regexps.trim, "" );

         if(curTitle.split(' ').length <= 4) {
             curTitle = origTitle;
         }
         return curTitle;
     },

     /**
      * Prepare the HTML document for readability to scrape it.
      * This includes things like stripping javascript, CSS, and handling terrible markup.
      *
      * @return void
      **/
     prepDocument: function () {
         /**
          * In some cases a body element can't be found (if the HTML is totally hosed for example)
          * so we create a new body node and append it to the document.
          */
         if(document.body === null)
         {
             var body = document.createElement("body");
             try {
                 document.body = body;
             }
             catch(e) {
                 document.documentElement.appendChild(body);
                 dbg(e);
             }
         }

         document.body.id = "readabilityBody";

         var frames = document.getElementsByTagName('frame');
         if(frames.length > 0)
         {
             var bestFrame = null;
             var bestFrameSize = 0;    /* The frame to try to run readability upon. Must be on same domain. */
             var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */
             for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1)
             {
                 var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
                 var canAccessFrame = false;
                 try {
                     var frameBody = frames[frameIndex].contentWindow.document.body;
                     canAccessFrame = true;
                 }
                 catch(eFrames) {
                     dbg(eFrames);
                 }

                 if(frameSize > biggestFrameSize) {
                     biggestFrameSize         = frameSize;
                     readability.biggestFrame = frames[frameIndex];
                 }

                 if(canAccessFrame && frameSize > bestFrameSize)
                 {
                     readability.frameHack = true;

                     bestFrame = frames[frameIndex];
                     bestFrameSize = frameSize;
                 }
             }

             if(bestFrame)
             {
                 var newBody = document.createElement('body');
                 readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody);
                 newBody.style.overflow = 'scroll';
                 document.body = newBody;

                 var frameset = document.getElementsByTagName('frameset')[0];
                 if(frameset) {
                     frameset.parentNode.removeChild(frameset); }
             }
         }

         /* Remove all stylesheets */
         for (var k=0;k < document.styleSheets.length; k+=1) {
             if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) {
                 document.styleSheets[k].disabled = true;
             }
         }

         /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */
         var styleTags = document.getElementsByTagName("style");
         for (var st=0;st < styleTags.length; st+=1) {
             styleTags[st].textContent = "";
         }

         /* Turn all double br's into p's */
         /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
         readability.replaceDoubleBrsWithPs(document.body);
         readability.replaceFontsWithSpans(document.body);
     },


     /**
      * Prepare the article node for display. Clean out any inline styles,
      * iframes, forms, strip extraneous <p> tags, etc.
      *
      * @param Element
      * @return void
      **/
     prepArticle: function (articleContent) {
         readability.cleanStyles(articleContent);
         readability.killBreaks(articleContent);

         /* Clean out junk from the article content */
         readability.cleanConditionally(articleContent, "form");
         readability.clean(articleContent, "object");
         readability.clean(articleContent, "h1");

         /**
          * If there is only one h2, they are probably using it
          * as a header and not a subheader, so remove it since we already have a header.
         ***/
         if(articleContent.getElementsByTagName('h2').length === 1) {
             readability.clean(articleContent, "h2");
         }
         readability.clean(articleContent, "iframe");

         readability.cleanHeaders(articleContent);

         /* Do these last as the previous stuff may have removed junk that will affect these */
         readability.cleanConditionally(articleContent, "table");
         readability.cleanConditionally(articleContent, "ul");
         readability.cleanConditionally(articleContent, "div");

         /* Remove extra paragraphs */
         var articleParagraphs = articleContent.getElementsByTagName('p');
         for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
             var imgCount    = articleParagraphs[i].getElementsByTagName('img').length;
             var embedCount  = articleParagraphs[i].getElementsByTagName('embed').length;
             var objectCount = articleParagraphs[i].getElementsByTagName('object').length;

             if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
                 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
             }
         }

         try {
             readability.replaceBrsWithPs(articleContent);
         }
         catch (e) {
             dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
         }
     },

     /**
      * Initialize a node with the readability object. Also checks the
      * className/id for special names to add to its score.
      *
      * @param Element
      * @return void
     **/
     initializeNode: function (node) {
         node.readability = {"contentScore": 0};

         switch(node.tagName) {
             case 'DIV':
                 node.readability.contentScore += 5;
                 break;

             case 'PRE':
             case 'TD':
             case 'BLOCKQUOTE':
                 node.readability.contentScore += 3;
                 break;

             case 'ADDRESS':
             case 'OL':
             case 'UL':
             case 'DL':
             case 'DD':
             case 'DT':
             case 'LI':
             case 'FORM':
                 node.readability.contentScore -= 3;
                 break;

             case 'H1':
             case 'H2':
             case 'H3':
             case 'H4':
             case 'H5':
             case 'H6':
             case 'TH':
                 node.readability.contentScore -= 5;
                 break;
         }

         node.readability.contentScore += readability.getClassWeight(node);
     },

     /***
      * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
      *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
      *
      * @param page a document to run upon. Needs to be a full document, complete with body.
      * @return Element
     **/
     grabArticle: function (pageToClone) {
         var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
             isPaging = (page !== null) ? true: false;

         var page = null;
         // Never work on the actual page.
         if (isPaging) {
             page = document.body.cloneNode(true);
         } else {
             page = pageToClone.cloneNode(true);
         }

         var allElements = page.getElementsByTagName('*');

         /**
          * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
          * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
          *
          * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
          * TODO: Shouldn't this be a reverse traversal?
         **/
         var node = null;
         var nodesToScore = [];
         for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
             /* Remove unlikely candidates */
             if (stripUnlikelyCandidates) {
                 var unlikelyMatchString = node.className + node.id;
                 if (
                     (
                         unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
                         unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
                         node.tagName !== "BODY"
                     )
                 )
                 {
                     dbg("Removing unlikely candidate - " + unlikelyMatchString);
                     node.parentNode.removeChild(node);
                     nodeIndex-=1;
                     continue;
                 }
             }

             if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
                 nodesToScore[nodesToScore.length] = node;
             }

             /* Turn all divs that don't have children block level elements into p's */
             if (node.tagName === "DIV") {
                 if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
                     var newNode = document.createElement('p');
                     try {
                         readability.moveNodeInnards(node, newNode);
                         node.parentNode.replaceChild(newNode, node);
                         nodeIndex-=1;

                         nodesToScore[nodesToScore.length] = node;
                     }
                     catch(e) {
                         dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
                     }
                 }
                 else
                 {
                     /* EXPERIMENTAL */
                     for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
                         var childNode = node.childNodes[i];
                         if(childNode.nodeType === 3) { // Node.TEXT_NODE
                             var p = document.createElement('p');
                             var t = document.createTextNode(childNode.nodeValue);
                             p.appendChild(t);
                             p.style.display = 'inline';
                             p.className = 'readability-styled';
                             childNode.parentNode.replaceChild(p, childNode);
                         }
                     }
                 }
             }
         }

         /**
          * Loop through all paragraphs, and assign a score to them based on how content-y they look.
          * Then add their score to their parent node.
          *
          * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
         **/
         var candidates = [];
         for (var pt=0; pt < nodesToScore.length; pt+=1) {
             var parentNode      = nodesToScore[pt].parentNode;
             var grandParentNode = parentNode ? parentNode.parentNode : null;
             var innerText       = readability.getInnerText(nodesToScore[pt]);

             if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
                 continue;
             }

             /* If this paragraph is less than 25 characters, don't even count it. */
             if(innerText.length < 25) {
                 continue; }

             /* Initialize readability data for the parent. */
             if(typeof parentNode.readability === 'undefined') {
                 readability.initializeNode(parentNode);
                 candidates.push(parentNode);
             }

             /* Initialize readability data for the grandparent. */
             if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
                 readability.initializeNode(grandParentNode);
                 candidates.push(grandParentNode);
             }

             var contentScore = 0;

             /* Add a point for the paragraph itself as a base. */
             contentScore+=1;

             /* Add points for any commas within this paragraph */
             contentScore += innerText.split(',').length;

             /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
             contentScore += Math.min(Math.floor(innerText.length / 100), 3);

             /* Add the score to the parent. The grandparent gets half. */
             parentNode.readability.contentScore += contentScore;

             if(grandParentNode) {
                 grandParentNode.readability.contentScore += contentScore/2;
             }
         }

         /**
          * After we've calculated scores, loop through all of the possible candidate nodes we found
          * and find the one with the highest score.
         **/
         var topCandidate = null;
         for(var c=0, cl=candidates.length; c < cl; c+=1)
         {
             /**
              * Scale the final candidates score based on link density. Good content should have a
              * relatively small link density (5% or less) and be mostly unaffected by this operation.
             **/
             candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));

             dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);

             if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
                 topCandidate = candidates[c]; }
         }

         /**
          * If we still have no top candidate, just use the body as a last resort.
          * We also have to copy the body node so it is something we can modify.
          **/
         if (topCandidate === null || topCandidate.tagName === "BODY")
         {
             topCandidate = document.createElement("DIV");
             readability.replaceNodeInnards(page, topCandidate);
             page.appendChild(topCandidate);
             readability.initializeNode(topCandidate);
         }

         /**
          * Now that we have the top candidate, look through its siblings for content that might also be related.
          * Things like preambles, content split by ads that we removed, etc.
         **/
         var articleContent        = document.createElement("DIV");
         if (isPaging) {
             articleContent.id     = "readability-content";
         }
         var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
         var siblingNodes          = topCandidate.parentNode.childNodes;


         for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
             var siblingNode = siblingNodes[s];
             var append      = false;

             /**
              * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
              * Example of error visible here: http://www.esquire.com/features/honesty0707
             **/
             if(!siblingNode) {
                 continue;
             }

             dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
             dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));

             if(siblingNode === topCandidate)
             {
                 append = true;
             }

             var contentBonus = 0;
             /* Give a bonus if sibling nodes and top candidates have the example same classname */
             if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
                 contentBonus += topCandidate.readability.contentScore * 0.2;
             }

             if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
             {
                 append = true;
             }

             if(siblingNode.nodeName === "P") {
                 var linkDensity = readability.getLinkDensity(siblingNode);
                 var nodeContent = readability.getInnerText(siblingNode);
                 var nodeLength  = nodeContent.length;

                 if(nodeLength > 80 && linkDensity < 0.25)
                 {
                     append = true;
                 }
                 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
                 {
                     append = true;
                 }
             }

             if(append) {
                 dbg("Appending node: " + siblingNode);

                 var nodeToAppend = null;
                 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
                     /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */

                     dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
                     nodeToAppend = document.createElement("DIV");
                     try {
                         nodeToAppend.id = siblingNode.id;
                         readability.moveNodeInnards(siblingNode, nodeToAppend);
                     }
                     catch(er) {
                         dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
                         nodeToAppend = siblingNode;
                         s-=1;
                         sl-=1;
                     }
                 } else {
                     nodeToAppend = siblingNode;
                     s-=1;
                     sl-=1;
                 }

                 /* To ensure a node does not interfere with readability styles, remove its classnames */
                 nodeToAppend.className = "";

                 /* Append sibling and subtract from our list because it removes the node when you append to another node */
                 articleContent.appendChild(nodeToAppend);
             }
         }

         /**
          * So we have all of the content that we need. Now we clean it up for presentation.
         **/
         readability.distilledArticleContent = articleContent.cloneNode(true);
         //readability.prepArticle(articleContent);

         if (readability.curPageNum === 1) {
             var newNode = document.createElement('div');
             newNode.id = "readability-page-1";
             newNode.setAttribute("class", "page");
             readability.moveNodeInnards(articleContent, newNode);
             articleContent.appendChild(newNode);
         }

         /**
          * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
          * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
          * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
          * finding the -right- content.
         **/
         if(readability.getInnerText(articleContent, false).length < 250) {
             if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
                 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
                 return readability.grabArticle(document.body);
             }
             else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
                 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
                 return readability.grabArticle(document.body);
             }
             else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
                 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
                 return readability.grabArticle(document.body);
             } else {
                 return null;
             }
         }

         return articleContent;
     },

     /**
      * Removes script tags from the document.
      *
      * @param Element
     **/
     removeScripts: function (doc) {
         var scripts = doc.getElementsByTagName('script');
         for(var i = scripts.length-1; i >= 0; i-=1)
         {
             if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
             {
                 scripts[i].nodeValue="";
                 scripts[i].removeAttribute('src');
                 if (scripts[i].parentNode) {
                         scripts[i].parentNode.removeChild(scripts[i]);
                 }
             }
         }
     },

     /**
      * Get the inner text of a node - cross browser compatibly.
      * This also strips out any excess whitespace to be found.
      *
      * @param Element
      * @return string
     **/
     getInnerText: function (e, normalizeSpaces) {
         var textContent    = "";

         if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") {
             return "";
         }

         normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;

         if (navigator.appName === "Microsoft Internet Explorer") {
             textContent = e.innerText.replace( readability.regexps.trim, "" ); }
         else {
             textContent = e.textContent.replace( readability.regexps.trim, "" ); }

         if(normalizeSpaces) {
             return textContent.replace( readability.regexps.normalize, " "); }
         else {
             return textContent; }
     },

     /**
      * Get the number of times a string s appears in the node e.
      *
      * @param Element
      * @param string - what to split on. Default is ","
      * @return number (integer)
     **/
     getCharCount: function (e,s) {
         s = s || ",";
         return readability.getInnerText(e).split(s).length-1;
     },

     /**
      * Remove the style attribute on every e and under.
      * TODO: Test if getElementsByTagName(*) is faster.
      *
      * @param Element
      * @return void
     **/
     cleanStyles: function (e) {
         e = e || document;
         var cur = e.firstChild;

         if(!e) {
             return; }

         // Remove any root styles, if we're able.
         if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {
             e.removeAttribute('style'); }

         // Go until there are no more child nodes
         while ( cur !== null ) {
             if ( cur.nodeType === 1 ) {
                 // Remove style attribute(s) :
                 if(cur.className !== "readability-styled") {
                     cur.removeAttribute("style");
                 }
                 readability.cleanStyles( cur );
             }
             cur = cur.nextSibling;
         }
     },

     /**
      * Get the density of links as a percentage of the content
      * This is the amount of text that is inside a link divided by the total text in the node.
      *
      * @param Element
      * @return number (float)
     **/
     getLinkDensity: function (e) {
         var links      = e.getElementsByTagName("a");
         var textLength = readability.getInnerText(e).length;
         var linkLength = 0;
         for(var i=0, il=links.length; i<il;i+=1)
         {
             linkLength += readability.getInnerText(links[i]).length;
         }

         return linkLength / textLength;
     },

     /**
      * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
      *
      * @author Dan Lacy
      * @return string the base url
     **/
     findBaseUrl: function () {
         var noUrlParams     = window.location.pathname.split("?")[0],
             urlSlashes      = noUrlParams.split("/").reverse(),
             cleanedSegments = [],
             possibleType    = "";

         for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
             var segment = urlSlashes[i];

             // Split off and save anything that looks like a file type.
             if (segment.indexOf(".") !== -1) {
                 possibleType = segment.split(".")[1];

                 /* If the type isn't alpha-only, it's probably not actually a file extension. */
                 if(!possibleType.match(/[^a-zA-Z]/)) {
                     segment = segment.split(".")[0];
                 }
             }

             /**
              * EW-CMS specific segment replacement. Ugly.
              * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
             **/
             if(segment.indexOf(',00') !== -1) {
                 segment = segment.replace(',00', '');
             }

             // If our first or second segment has anything looking like a page number, remove it.
             if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
                 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
             }


             var del = false;

             /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
             if (i < 2 && segment.match(/^\d{1,2}$/)) {
                 del = true;
             }

             /* If this is the first segment and it's just "index", remove it. */
             if(i === 0 && segment.toLowerCase() === "index") {
                 del = true;
             }


             /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
             if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
                 del = true;
             }

             /* If it's not marked for deletion, push it to cleanedSegments. */
             if (!del) {
                 cleanedSegments.push(segment);
             }
         }

         // This is our final, cleaned, base article URL.
         return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");
     },

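     /**
      * Look for any paging links that may occur within the document.
      *
      * @param Element elem - the node whose links are scanned
      * @return string|null the URL of the best next-page candidate, or null if no link scored high enough
     **/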
     findNextPageLink: function (elem) {
         /**
          * Scores every <a> under elem as a "next page" candidate and returns the
          * href of the highest-scoring one (minimum score 50), or null when no
          * link qualifies. The returned href is also recorded in
          * readability.parsedPages.
         **/
         var possiblePages = {},
             allLinks = elem.getElementsByTagName('a'),
             articleBaseUrl = readability.findBaseUrl();

         /**
          * Loop through all links, looking for hints that they may be next-page links.
          * Things like having "page" in their textContent, className or id, or being a child
          * of a node with a page-y className or id.
          *
          * Also possible: levenshtein distance? longest common subsequence?
          *
          * After we do that, assign each page a score, and pick the best-scoring
          * page once the loop has finished.
         **/
         for(var i = 0, il = allLinks.length; i < il; i+=1) {
             /* Compare links with any #fragment and a single trailing slash removed. */
             var link     = allLinks[i],
                 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');

             /* If we've already seen this page, ignore it */
             if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
                 continue;
             }

             /* If it's on a different domain, skip it. Splitting "scheme://host/path" on runs of slashes leaves the host at index 1. */
             if(window.location.host !== linkHref.split(/\/+/g)[1]) {
                 continue;
             }

             var linkText = readability.getInnerText(link);

             /* If the linkText looks like it's not the next page, skip it. */
             if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
                 continue;
             }

             /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
             var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
             if(!linkHrefLeftover.match(/\d/)) {
                 continue;
             }

             /* Several links may share one href: keep a single entry per href and accumulate their texts. */
             if(!(linkHref in possiblePages)) {
                 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
             } else {
                 possiblePages[linkHref].linkText += ' | ' + linkText;
             }

             var linkObj = possiblePages[linkHref];

             /**
              * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
              * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
             **/
             if(linkHref.indexOf(articleBaseUrl) !== 0) {
                 linkObj.score -= 25;
             }

             /* Score on the link's own text, class name and id. */
             var linkData = linkText + ' ' + link.className + ' ' + link.id;
             if(linkData.match(readability.regexps.nextLink)) {
                 linkObj.score += 50;
             }
             if(linkData.match(/pag(e|ing|inat)/i)) {
                 linkObj.score += 25;
             }
             if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
                 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
                 if(!linkObj.linkText.match(readability.regexps.nextLink)) {
                     linkObj.score -= 65;
                 }
             }
             if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {
                 linkObj.score -= 50;
             }
             if(linkData.match(readability.regexps.prevLink)) {
                 linkObj.score -= 200;
             }

             /* If a parentNode contains page or paging or paginat. Each of the bonus and the penalty is applied at most once per link, however many ancestors match. */
             var parentNode = link.parentNode,
                 positiveNodeMatch = false,
                 negativeNodeMatch = false;
             while(parentNode) {
                 var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
                 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
                     positiveNodeMatch = true;
                     linkObj.score += 25;
                 }
                 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {
                     /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
                     if(!parentNodeClassAndId.match(readability.regexps.positive)) {
                         linkObj.score -= 25;
                         negativeNodeMatch = true;
                     }
                 }

                 parentNode = parentNode.parentNode;
             }

             /**
              * If the URL looks like it has paging in it, add to the score.
              * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
             **/
             if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
                 linkObj.score += 25;
             }

             /* If the URL matches the "extraneous" pattern, give a slight decrease. */
             if (linkHref.match(readability.regexps.extraneous)) {
                 linkObj.score -= 15;
             }

             /**
              * Minor punishment to anything that doesn't match our current URL.
              * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
              *       Dan, can you show me a counterexample where this is necessary?
              * if (linkHref.indexOf(window.location.href) !== 0) {
              *    linkObj.score -= 1;
              * }
             **/

             /**
              * If the link text can be parsed as a number, give it a minor bonus, with a slight
              * bias towards lower numbered pages. This is so that pages that might not have 'next'
              * in their text can still get scored, and sorted properly by score.
              * (A parse result of NaN or 0 is falsy and therefore gets no adjustment.)
             **/
             var linkTextAsNumber = parseInt(linkText, 10);
             if(linkTextAsNumber) {
                 // Punish 1 since we're either already there, or it's probably before what we want anyways.
                 if (linkTextAsNumber === 1) {
                     linkObj.score -= 10;
                 }
                 else {
                     // Bonus of (10 - page number), never below 0: page 2 earns 8, page 10 and beyond earn nothing.
                     linkObj.score += Math.max(0, 10 - linkTextAsNumber);
                 }
             }
         }

         /**
          * Loop through all of our possible pages from above and find our top candidate for the next page URL.
          * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
          * On equal scores the candidate encountered first is kept.
         **/
         var topPage = null;
         for(var page in possiblePages) {
             if(possiblePages.hasOwnProperty(page)) {
                 if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) {
                     topPage = possiblePages[page];
                 }
             }
         }

         if(topPage) {
             var nextHref = topPage.href.replace(/\/$/,'');

             dbg('NEXT PAGE IS ' + nextHref);
             /* Remember the winner so later scans skip it (see the parsedPages check at the top of the loop). */
             readability.parsedPages[nextHref] = true;
             return nextHref;
         }
         else {
             return null;
         }
     },

     createLinkDiv: function(link) {
         var divNode = document.createElement('div');
         var aNode = document.createElement('a');
         var tNode = document.createTextNode('View Next Page');
         divNode.setAttribute('style', 'text-align: center');
         aNode.setAttribute('href', link);
         aNode.appendChild(tNode);
         divNode.appendChild(aNode);
         return divNode;
     },

     xhr: function () {
         if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
             return new XMLHttpRequest();
         }
         else {
             try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { }
             try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { }
             try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { }
         }

         return false;
     },

     successfulRequest: function (request) {
         return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText);
     },

     ajax: function (url, options) {
         var request = readability.xhr();

         function respondToReadyState(readyState) {
             if (request.readyState === 4) {
                 if (readability.successfulRequest(request)) {
                     if (options.success) { options.success(request); }
                 }
                 else {
                     if (options.error) { options.error(request); }
                 }
             }
         }

         if (typeof options === 'undefined') { options = {}; }

         request.onreadystatechange = respondToReadyState;

         request.open('get', url, true);
         request.setRequestHeader('Accept', 'text/html');

         try {
             request.send(options.postBody);
         }
         catch (e) {
             if (options.error) { options.error(); }
         }

         return request;
     },

     /**
      * Make an AJAX request for each page and append it to the document.
     **/
     curPageNum: 1, /* Number of the most recently created page; starts at 1 and is incremented by appendNextPage. */

     appendNextPage: function (nextPageLink) {
         /**
          * Creates the container for the next page, appends it to #readability-content,
          * then fetches nextPageLink asynchronously and fills the container with the
          * extracted article. If that page yields another next-page link, this
          * function calls itself again from the success callback.
          *
          * @param string nextPageLink URL of the page to fetch
          * @return void
         **/
         readability.curPageNum+=1;

         var articlePage       = document.createElement("DIV");
         articlePage.id        = 'readability-page-' + readability.curPageNum;
         articlePage.className = 'page';
         articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">&sect;</p>';

         document.getElementById("readability-content").appendChild(articlePage);

         /* Past the page limit: show a plain link to the next page instead of fetching it. */
         if(readability.curPageNum > readability.maxPages) {
             var linkDiv = readability.createLinkDiv(nextPageLink);

             articlePage.appendChild(linkDiv);
             return;
         }

         /**
          * Now that we've built the article page DOM element, get the page content
          * asynchronously and load the cleaned content into the div we created for it.
          * The immediately-invoked function binds pageUrl and thisPage for the callback;
          * thisPage and articlePage refer to the same element.
         **/
         (function(pageUrl, thisPage) {
             readability.ajax(pageUrl, {
                 success: function(r) {

                     /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
                     /* NOTE(review): readability.pageETags is not defined in the part of the file visible here - confirm it is initialised elsewhere. */
                     var eTag = r.getResponseHeader('ETag');
                     if(eTag) {
                         if(eTag in readability.pageETags) {
                             dbg("Exact duplicate page found via ETag. Aborting.");
                             articlePage.style.display = 'none';
                             return;
                         } else {
                             readability.pageETags[eTag] = 1;
                         }
                     }

                     // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
                     var page = document.createElement("DIV");

                     /**
                      * Do some preprocessing to our HTML to make it ready for appending.
                      * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
                      * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.
                      * • Turn all double br's into p's - was handled by prepDocument in the original view.
                      *   Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages.
                      *
                      * NOTE(review): responseXML is normally only populated when the response was parsed
                      * as a document - confirm it is non-null for the text/html responses requested here,
                      * and that appending it to a DIV is valid.
                     **/
                     var pageInnards = r.responseXML;
                     readability.removeScripts(pageInnards);
                     readability.replaceNoscriptsWithPs(pageInnards);
                     readability.replaceDoubleBrsWithPs(pageInnards);
                     readability.replaceFontsWithSpans(pageInnards);
                     page.appendChild(pageInnards);


                     /**
                      * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle.
                     **/
                     readability.flags = 0x1 | 0x2 | 0x4;

                     /* This local nextPageLink shadows the outer parameter: it is the link found in the page just fetched. */
                     var nextPageLink = readability.findNextPageLink(page),
                         content      =  readability.grabArticle(page);

                     if(!content) {
                         dbg("No content found in page to append. Aborting.");
                         return;
                     }

                     /**
                      * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
                      * Compare it against all of the previous documents we've gotten. If a previous
                      * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
                      * Only first paragraphs longer than 100 characters of innerHTML are compared.
                     **/
                     var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
                     if(firstP && firstP.innerHTML.length > 100) {
                         for(var i=1; i <= readability.curPageNum; i+=1) {
                             var rPage = document.getElementById('readability-page-' + i);
                             if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
                                 dbg('Duplicate of page ' + i + ' - skipping.');
                                 articlePage.style.display = 'none';
                                 readability.parsedPages[pageUrl] = true;
                                 return;
                             }
                         }
                     }

                     readability.removeScripts(content);

                     /* Move the extracted article into the container created before the request. */
                     readability.moveNodeInnards(content, thisPage);

                     /**
                      * After the page has rendered, post process the content. This delay is necessary because,
                      * in webkit at least, offsetWidth is not set in time to determine image width. We have to
                      * wait a little bit for reflow to finish before we can fix floating images.
                     **/
                     window.setTimeout(
                         function() { readability.postProcessContent(thisPage); },
                         500
                     );

                     /* Keep going while the fetched page itself links to a further page. */
                     if(nextPageLink) {
                         readability.appendNextPage(nextPageLink);
                     }
                 }
             });
         }(nextPageLink, articlePage));
     },

     /**
      * Get an elements class/id weight. Uses regular expressions to tell if this
      * element looks good or bad.
      *
      * @param Element
      * @return number (Integer)
     **/
     getClassWeight: function (e) {
         if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
             return 0;
         }

         var weight = 0;

         /* Look for a special classname */
         if (typeof(e.className) === 'string' && e.className !== '')
         {
             if(e.className.search(readability.regexps.negative) !== -1) {
                 weight -= 25; }

             if(e.className.search(readability.regexps.positive) !== -1) {
                 weight += 25; }
         }

         /* Look for a special ID */
         if (typeof(e.id) === 'string' && e.id !== '')
         {
             if(e.id.search(readability.regexps.negative) !== -1) {
                 weight -= 25; }

             if(e.id.search(readability.regexps.positive) !== -1) {
                 weight += 25; }
         }

         return weight;
     },

     nodeIsVisible: function (node) {
         return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none';
     },

     /**
      * Remove extraneous break tags from a node.
      *
      * @param Element
      * @return void
      **/
     killBreaks: function (e) {
         var allElements = e.getElementsByTagName('*');
         while (i < allElements.length) {
             readability.deleteExtraBreaks(allElements[i]);
             i++;
         }
     },

     /**
      * Clean a node of all elements of type "tag".
      * (Unless it's a youtube/vimeo video. People love movies.)
      *
      * @param Element
      * @param string tag to clean
      * @return void
      **/
     clean: function (e, tag) {
         var targetList = e.getElementsByTagName( tag );
         var isEmbed    = (tag === 'object' || tag === 'embed');

         for (var y=targetList.length-1; y >= 0; y-=1) {
             /* Allow youtube and vimeo videos through as people usually want to see those. */
             if(isEmbed) {
                 var attributeValues = "";
                 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
                     attributeValues += targetList[y].attributes[i].value + '|';
                 }

                 /* First, check the elements attributes to see if any of them contain youtube or vimeo */
                 if (attributeValues.search(readability.regexps.videos) !== -1) {
                     continue;
                 }

                 /* Then check the elements inside this element for the same. */
                 if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
                     continue;
                 }

             }

             targetList[y].parentNode.removeChild(targetList[y]);
         }
     },

     /**
      * Clean an element of all tags of type "tag" if they look fishy.
      * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
      *
      * @return void
      **/
     cleanConditionally: function (e, tag) {

         if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
             return;
         }

         var tagsList      = e.getElementsByTagName(tag);
         var curTagsLength = tagsList.length;

         /**
          * Gather counts for other typical elements embedded within.
          * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
          *
          * TODO: Consider taking into account original contentScore here.
         **/
         for (var i=curTagsLength-1; i >= 0; i-=1) {
             var weight = readability.getClassWeight(tagsList[i]);
             var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;

             dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));

             if(weight+contentScore < 0)
             {
                 tagsList[i].parentNode.removeChild(tagsList[i]);
             }
             else if ( readability.getCharCount(tagsList[i],',') < 10) {
                 /**
                  * If there are not very many commas, and the number of
                  * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
                 **/
                 var p      = tagsList[i].getElementsByTagName("p").length;
                 var img    = tagsList[i].getElementsByTagName("img").length;
                 var li     = tagsList[i].getElementsByTagName("li").length-100;
                 var input  = tagsList[i].getElementsByTagName("input").length;

                 var embedCount = 0;
                 var embeds     = tagsList[i].getElementsByTagName("embed");
                 for(var ei=0,il=embeds.length; ei < il; ei+=1) {
                     if (embeds[ei].src.search(readability.regexps.videos) === -1) {
                       embedCount+=1;
                     }
                 }

                 var linkDensity   = readability.getLinkDensity(tagsList[i]);
                 var contentLength = readability.getInnerText(tagsList[i]).length;
                 var toRemove      = false;

                 if ( img > p ) {
                     toRemove = true;
                 } else if(li > p && tag !== "ul" && tag !== "ol") {
                     toRemove = true;
                 } else if( input > Math.floor(p/3) ) {
                     toRemove = true;
                 } else if(contentLength < 25 && (img === 0 || img > 2) ) {
                     toRemove = true;
                 } else if(weight < 25 && linkDensity > 0.2) {
                     toRemove = true;
                 } else if(weight >= 25 && linkDensity > 0.5) {
                     toRemove = true;
                 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
                     toRemove = true;
                 }

                 if(toRemove) {
                     tagsList[i].parentNode.removeChild(tagsList[i]);
                 }
             }
         }
     },

     /**
      * Clean out spurious headers from an Element. Checks things like classnames and link density.
      *
      * @param Element
      * @return void
     **/
     cleanHeaders: function (e) {
         for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) {
             var headers = e.getElementsByTagName('h' + headerIndex);
             for (var i=headers.length-1; i >=0; i-=1) {
                 if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
                     headers[i].parentNode.removeChild(headers[i]);
                 }
             }
         }
     },

     flagIsActive: function(flag) {
         return (readability.flags & flag) > 0;
     },

     addFlag: function(flag) {
         readability.flags = readability.flags | flag;
     },

     removeFlag: function(flag) {
         readability.flags = readability.flags & ~flag;
     },

     // Removes the children of |src| and appends them to |dest|.
     moveNodeInnards: function(src, dest) {
         try {
             while (src.firstChild) {
                 dest.appendChild(src.removeChild(src.firstChild));
             }
         } catch (e) {}
     },

     // Returns true if the node is a whitespace text node.
     isWhitespaceNode: function(node) {
         if (node.nodeType == Node.TEXT_NODE) {
             if (node.data.trim().length == 0) {
                return true;
             }
         }
         return false;
     },

     // Returns true if the node is a <BR>.
     isBrNode: function(node) {
         return (node.tagName === 'BR');
     },


     // Returns the last <BR> node in a sequence of <BR> nodes that are only
     // separated by whitespace, or null if there are not at least two <BR> tags
     // in the sibling chain starting with |node|. Returns the second such <BR>
     // node if |restrictToTwo| is true.
     isMultipleBr: function(node, restrictToTwo) {
         var lastBr = null;
         if (!readability.isBrNode(node)) {
             return lastBr;
         }
         var curr = node.nextSibling;
         while (curr) {
             if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) {
                 lastBr = curr;
                 curr = curr.nextSibling;
                 if (restrictToTwo) {
                     if (readability.isBrNode(lastBr)) {
                         return lastBr;
                     }
                 }
                 continue;
             }
             break;
         }
         return lastBr;
     },

     // Removes all <BR> nodes except one and whitespace in between in a series
     // of <BR> nodes.
     deleteExtraBreaks: function(node) {
         var lastBr = readability.isMultipleBr(node, false);
         var ret = false;
         while (lastBr && lastBr != node) {
             var toRemove = lastBr;
             lastBr = lastBr.previousSibling;
             toRemove.parentNode.removeChild(toRemove);
             ret = true;
         }
         return ret;
     },

     // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
     // <P> node, and makes all next siblings of that pair children of <P>, up
     // until the next pair of <BR> nodes is reached.
     replaceDoubleBrWithP: function(node) {
         // Check that we are starting with a BR.
         var second = readability.isMultipleBr(node, true);
         if (!second) {
             return;
         }
         // Make all next siblings of the second BR into children of a P.
         var p = document.createElement('p');
         var curr = second.nextSibling;
         while (curr) {
             if (readability.isMultipleBr(curr, true)) {
                 break;
             }
             var next = curr.nextSibling;
             p.appendChild(curr.parentNode.removeChild(curr));
             curr = next;
         }
         var ret = curr;

         // Remove all nodes between the first and second BR.
         curr = node.nextSibling;
         while (curr && curr != second) {
             var next = curr.nextSibling;
             curr.parentNode.removeChild(curr);
             curr = next;
         }
         // Remove the second BR.
         second.parentNode.removeChild(second);
         // Replace the first BR with the P.
         node.parentNode.replaceChild(p, node);

         return ret;
     },

     // Returns true if the NodeList contains a double <BR>.
     hasDoubleBr: function(nodeList) {
         for (var i = 0; i < nodeList.length; nodeList++) {
             if (readability.isMultipleBr(nodeList[i], true)) {
                 return true;
             }
         }
         return false;
     },

     // Replaces double <BR> tags with <P> tags.
     replaceDoubleBrsWithPs: function(node) {
         var allElements = node.getElementsByTagName('BR');
         var node = null;
         while (allElements && allElements.length > 0 &&
                readability.hasDoubleBr(allElements)) {
             for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
                 var next = node;
                 while (next = readability.replaceDoubleBrWithP(next));
             }
             allElements = document.body.getElementsByTagName('BR');
         }
     },


     // Replaces a BR and the whitespace that follows it with a P.
     replaceBrWithP: function(node) {
         if (!readability.isBrNode(node)) {
             return;
         }
         var p = document.createElement('p');
         var curr = node.nextSibling;
         while (curr && !isBrNode(curr)) {
             var next = curr.nextSibling;
             if (readability.isWhitespaceNode(curr)) {
                 curr.parentNode.removeChild(curr);
             } else {
                 p.appendChild(curr.parentNode.removeChild(curr));
             }
             curr = next;
         }
         node.parentNode.replaceChild(p, node);
         return curr;
     },

     // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag
     // children of the <P>.
     replaceBrsWithPs: function(node) {
         var allElements = node.getElementsByTagName('BR');
         var node = null;
         while (allElements && allElements.length > 0) {
             for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
                 var next = node;
                 while (next = readability.replaceBrWithP(next));
             }
             allElements = document.body.getElementsByTagName('BR');
         }
     },

     // Replaces any tag with any other tag.
     replaceTagsWithTags: function(node, srcTag, destTag) {
         var allElements = node.getElementsByTagName(srcTag);
         for (var i = 0; i < allElements.length; i++) {
             var dest = document.createElement(destTag);
             readability.moveNodeInnards(allElements[i], dest);
             allElements[i].parentNode.replaceChild(dest, allElements[i]);
         }
     },

     // Replaces all <noscript> tags with <p> tags.
     replaceNoscriptsWithPs: function(node) {
         readability.replaceTagsWithTags(node, 'noscript', 'p');
     },

     // Replaces all <font> tags with <span> tags.
     replaceFontsWithSpans: function(node) {
         readability.replaceTagsWithTags(node, 'font', 'span');
     },

     // Returns a list of image URLs in the distilled article.
     getImages : function() {
         var images = document.getElementsByTagName('img');
         var result = new Array(images.length);
         dbg("Number of images: " + images.length);
         for(i = 0; i < images.length; i++) {
             result[i] = images[i].src;
             dbg("Image: " + result[i]);
         }
         return result;
     },

     // Returns the distilled article HTML from the page(s).
     getDistilledArticleHTML : function() {
         return readability.distilledHTML;
     },

     // Returns the next page of this article.
     getNextPageLink : function() {
         return readability.nextPageLink;
     }
 };