| // Copyright 2014 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| // Local modifications to this file are described in the README.chromium |
| // file. |
| |
| var dbg = (typeof console !== 'undefined') ? function(s) { |
| console.log("Readability: " + s); |
| } : function() {}; |
| |
| /* |
| * Readability. An Arc90 Lab Experiment. |
| * Website: http://lab.arc90.com/experiments/readability |
| * Source: http://code.google.com/p/arc90labs-readability |
| * |
| * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission. |
| * |
| * Copyright (c) 2010 Arc90 Inc |
| * Readability is licensed under the Apache License, Version 2.0. |
| **/ |
| var readability = { |
| readStyle: "style-newspaper", |
| readSize: "size-medium", |
| readMargin: "margin-wide", |
| |
| distilledHTML: '', |
| distilledArticleContent: null, |
| nextPageLink: '', |
| |
| version: '1.7.1', |
| iframeLoads: 0, |
| convertLinksToFootnotes: false, |
| reversePageScroll: false, /* True while shift is held down, so that hitting space scrolls up instead of down. */ |
| frameHack: false, /** |
| * The frame hack works around a Firefox bug: if you pull content out of a |
| * frame and stick it into the parent element, the scrollbar won't appear. |
| * So we fake a scrollbar in the wrapping div. |
| **/ |
| biggestFrame: false, |
| flags: 0x1 | 0x2 | 0x4, /* Start with all three FLAG_* bits below set. */ |
| |
| /* Bit-flag constants; each one is a single bit tested against or cleared from `flags`. */ |
| FLAG_STRIP_UNLIKELYS: 0x1, |
| FLAG_WEIGHT_CLASSES: 0x2, |
| FLAG_CLEAN_CONDITIONALLY: 0x4, |
| |
| maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ |
| parsedPages: {}, /* The pages we've parsed in this run of readability, for autopaging. Kept as a key store (URL -> true) for easier searching. */ |
| pageETags: {}, /* The ETag headers of pages we've parsed; if one happens to match, we know it's a duplicate. */ |
| /** |
| * All of the regular expressions in use within readability. |
| * Defined up here so we don't instantiate them repeatedly in loops. |
| * |
| * Several of these carry the g flag (replaceBrs, replaceFonts, trim, normalize, |
| * killBreaks), so they keep lastIndex state if used with test/exec. |
| * `videos` only matches the http:// scheme. |
| * NOTE(review): the " ?" alternative inside killBreaks looks like an HTML entity |
| * (&nbsp;) that was decoded somewhere along the way - confirm against upstream. |
| **/ |
| regexps: { |
| unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i, |
| okMaybeItsACandidate: /and|article|body|column|main|shadow/i, |
| positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i, |
| negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i, |
| extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i, |
| divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, |
| replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi, |
| replaceFonts: /<(\/?)font[^>]*>/gi, |
| trim: /^\s+|\s+$/g, |
| normalize: /\s{2,}/g, |
| killBreaks: /(<br\s*\/?>(\s| ?)*){1,}/g, |
| videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i, |
| skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, |
| nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last. |
| prevLink: /(prev|earl|old|new|<|«)/i |
| }, |
| |
| /** |
| * Runs readability. |
| * |
| * Workflow: |
| * 1. Prep the document by removing script tags, css, etc. |
| * 2. Build readability's DOM tree. |
| * 3. Grab the article content from the current dom tree. |
| * 4. Replace the current DOM tree with the new one. |
| * 5. Read peacefully. |
| * |
| * @return void |
| **/ |
| init: function() { |
| /* Before we do anything, remove all scripts that are not readability. */ |
| window.onload = window.onunload = function() {}; |
| |
| readability.removeScripts(document); |
| |
| /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */ |
| readability.parsedPages[window.location.href.replace(/\/$/, '')] = true; |
| |
| /* Pull out any possible next page link first */ |
| readability.nextPageLink = readability.findNextPageLink(document.body); |
| |
| /* We handle processing of nextPage from C++ set nextPageLink to null */ |
| var nextPageLink = null; |
| |
| readability.prepDocument(); |
| |
| /* Build readability's DOM tree */ |
| var overlay = document.createElement("DIV"); |
| var innerDiv = document.createElement("DIV"); |
| var articleTools = readability.getArticleTools(); |
| var articleTitleText = readability.getArticleTitle(); |
| var articleContent = readability.grabArticle(); |
| |
| if(!articleContent) { |
| articleContent = document.createElement("DIV"); |
| articleContent.id = "readability-content"; |
| articleContent.innerHTML = [ |
| "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>", |
| (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames." : ""), |
| "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>" |
| ].join(''); |
| |
| nextPageLink = null; |
| } |
| |
| overlay.id = "readOverlay"; |
| innerDiv.id = "readInner"; |
| |
| /* Apply user-selected styling */ |
| document.body.className = readability.readStyle; |
| document.dir = readability.getSuggestedDirection(articleTitleText); |
| |
| if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura"){ |
| overlay.className = readability.readStyle + " rdbTypekit"; |
| } else { |
| overlay.className = readability.readStyle; |
| } |
| innerDiv.className = readability.readMargin + " " + readability.readSize; |
| |
| if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) { |
| readability.convertLinksToFootnotes = true; |
| } |
| |
| readability.distilledHTML = articleContent.innerHTML; |
| |
| if(readability.frameHack) { |
| var readOverlay = document.getElementById('readOverlay'); |
| readOverlay.style.height = '100%'; |
| readOverlay.style.overflow = 'auto'; |
| } |
| |
| /** |
| * If someone tries to use Readability on a site's root page, give them a warning about usage. |
| **/ |
| if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) { |
| articleContent.style.display = "none"; |
| var rootWarning = document.createElement('p'); |
| rootWarning.id = "readability-warning"; |
| rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " + |
| "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue."; |
| |
| innerDiv.insertBefore( rootWarning, articleContent ); |
| } |
| |
| readability.postProcessContent(articleContent); |
| |
| window.scrollTo(0, 0); |
| |
| if (nextPageLink) { |
| /** |
| * Append any additional pages after a small timeout so that people |
| * can start reading without having to wait for this to finish processing. |
| **/ |
| window.setTimeout(function() { |
| readability.appendNextPage(nextPageLink); |
| }, 500); |
| } |
| |
| /** Smooth scrolling **/ |
| document.onkeydown = function(e) { |
| var code = (window.event) ? event.keyCode : e.keyCode; |
| if (code === 16) { |
| readability.reversePageScroll = true; |
| return; |
| } |
| |
| if (code === 32) { |
| readability.curScrollStep = 0; |
| var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight); |
| |
| if(readability.reversePageScroll) { |
| readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10); |
| } |
| else { |
| readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10); |
| } |
| |
| return false; |
| } |
| }; |
| |
| document.onkeyup = function(e) { |
| var code = (window.event) ? event.keyCode : e.keyCode; |
| if (code === 16) { |
| readability.reversePageScroll = false; |
| return; |
| } |
| }; |
| }, |
| |
| /** |
| * Run any post-process modifications to article content as necessary. |
| * |
| * @param Element |
| * @return void |
| **/ |
| postProcessContent: function(articleContent) { |
| if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) { |
| readability.addFootnotes(articleContent); |
| } |
| |
| readability.fixImageFloats(articleContent); |
| }, |
| |
| /** |
| * Some content ends up looking ugly if the image is too large to be floated. |
| * If the image is wider than a threshold (currently 55%), no longer float it, |
| * center it instead. |
| * |
| * @param Element |
| * @return void |
| **/ |
| fixImageFloats: function (articleContent) { |
| var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55, |
| images = articleContent.getElementsByTagName('img'); |
| |
| for(var i=0, il = images.length; i < il; i+=1) { |
| var image = images[i]; |
| |
| if(image.offsetWidth > imageWidthThreshold) { |
| image.className += " blockImage"; |
| } |
| } |
| }, |
| |
| /** |
| * Get the article tools Element that has buttons like reload, print. |
| * |
| * @return void |
| **/ |
| getArticleTools: function () { |
| var articleTools = document.createElement("DIV"); |
| |
| articleTools.id = "readTools"; |
| articleTools.innerHTML = |
| "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" + |
| "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" + |
| "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>"; |
| |
| return articleTools; |
| }, |
| |
| /** |
| * retuns the suggested direction of the string |
| * |
| * @return "rtl" || "ltr" |
| **/ |
| getSuggestedDirection: function(text) { |
| function sanitizeText() { |
| return text.replace(/@\w+/, ""); |
| } |
| |
| function countMatches(match) { |
| var matches = text.match(new RegExp(match, "g")); |
| return matches !== null ? matches.length : 0; |
| } |
| |
| function isRTL() { |
| var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]"); |
| var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]"); |
| |
| // if 20% of chars are Hebrew or Arbic then direction is rtl |
| return (count_heb + count_arb) * 100 / text.length > 20; |
| } |
| |
| text = sanitizeText(text); |
| return isRTL() ? "rtl" : "ltr"; |
| }, |
| |
| /** |
| * Get the article title as an H1. |
| * |
| * @return void |
| **/ |
| getArticleTitle: function () { |
| var curTitle = "", |
| origTitle = ""; |
| |
| try { |
| curTitle = origTitle = document.title; |
| if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */ |
| curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]); |
| } |
| } |
| catch(e) {} |
| |
| if(curTitle.match(/ [\|\-] /)) |
| { |
| curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1'); |
| |
| if(curTitle.split(' ').length < 3) { |
| curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1'); |
| } |
| } |
| else if(curTitle.indexOf(': ') !== -1) |
| { |
| curTitle = origTitle.replace(/.*:(.*)/gi, '$1'); |
| |
| if(curTitle.split(' ').length < 3) { |
| curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1'); |
| } |
| } |
| else if(curTitle.length > 150 || curTitle.length < 15) |
| { |
| var hOnes = document.getElementsByTagName('h1'); |
| if(hOnes.length === 1) |
| { |
| curTitle = readability.getInnerText(hOnes[0]); |
| } |
| } |
| |
| curTitle = curTitle.replace( readability.regexps.trim, "" ); |
| |
| if(curTitle.split(' ').length <= 4) { |
| curTitle = origTitle; |
| } |
| return curTitle; |
| }, |
| |
| /** |
| * Prepare the HTML document for readability to scrape it. |
| * This includes things like stripping javascript, CSS, and handling terrible markup. |
| * |
| * @return void |
| **/ |
| prepDocument: function () { |
| /** |
| * In some cases a body element can't be found (if the HTML is totally hosed for example) |
| * so we create a new body node and append it to the document. |
| */ |
| if(document.body === null) |
| { |
| var body = document.createElement("body"); |
| try { |
| document.body = body; |
| } |
| catch(e) { |
| document.documentElement.appendChild(body); |
| dbg(e); |
| } |
| } |
| |
| document.body.id = "readabilityBody"; |
| |
| var frames = document.getElementsByTagName('frame'); |
| if(frames.length > 0) |
| { |
| var bestFrame = null; |
| var bestFrameSize = 0; /* The frame to try to run readability upon. Must be on same domain. */ |
| var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */ |
| for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1) |
| { |
| var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight; |
| var canAccessFrame = false; |
| try { |
| var frameBody = frames[frameIndex].contentWindow.document.body; |
| canAccessFrame = true; |
| } |
| catch(eFrames) { |
| dbg(eFrames); |
| } |
| |
| if(frameSize > biggestFrameSize) { |
| biggestFrameSize = frameSize; |
| readability.biggestFrame = frames[frameIndex]; |
| } |
| |
| if(canAccessFrame && frameSize > bestFrameSize) |
| { |
| readability.frameHack = true; |
| |
| bestFrame = frames[frameIndex]; |
| bestFrameSize = frameSize; |
| } |
| } |
| |
| if(bestFrame) |
| { |
| var newBody = document.createElement('body'); |
| readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody); |
| newBody.style.overflow = 'scroll'; |
| document.body = newBody; |
| |
| var frameset = document.getElementsByTagName('frameset')[0]; |
| if(frameset) { |
| frameset.parentNode.removeChild(frameset); } |
| } |
| } |
| |
| /* Remove all stylesheets */ |
| for (var k=0;k < document.styleSheets.length; k+=1) { |
| if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) { |
| document.styleSheets[k].disabled = true; |
| } |
| } |
| |
| /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */ |
| var styleTags = document.getElementsByTagName("style"); |
| for (var st=0;st < styleTags.length; st+=1) { |
| styleTags[st].textContent = ""; |
| } |
| |
| /* Turn all double br's into p's */ |
| /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */ |
| readability.replaceDoubleBrsWithPs(document.body); |
| readability.replaceFontsWithSpans(document.body); |
| }, |
| |
| |
| /** |
| * Prepare the article node for display. Clean out any inline styles, |
| * iframes, forms, strip extraneous <p> tags, etc. |
| * |
| * @param Element |
| * @return void |
| **/ |
| prepArticle: function (articleContent) { |
| readability.cleanStyles(articleContent); |
| readability.killBreaks(articleContent); |
| |
| /* Clean out junk from the article content */ |
| readability.cleanConditionally(articleContent, "form"); |
| readability.clean(articleContent, "object"); |
| readability.clean(articleContent, "h1"); |
| |
| /** |
| * If there is only one h2, they are probably using it |
| * as a header and not a subheader, so remove it since we already have a header. |
| ***/ |
| if(articleContent.getElementsByTagName('h2').length === 1) { |
| readability.clean(articleContent, "h2"); |
| } |
| readability.clean(articleContent, "iframe"); |
| |
| readability.cleanHeaders(articleContent); |
| |
| /* Do these last as the previous stuff may have removed junk that will affect these */ |
| readability.cleanConditionally(articleContent, "table"); |
| readability.cleanConditionally(articleContent, "ul"); |
| readability.cleanConditionally(articleContent, "div"); |
| |
| /* Remove extra paragraphs */ |
| var articleParagraphs = articleContent.getElementsByTagName('p'); |
| for(var i = articleParagraphs.length-1; i >= 0; i-=1) { |
| var imgCount = articleParagraphs[i].getElementsByTagName('img').length; |
| var embedCount = articleParagraphs[i].getElementsByTagName('embed').length; |
| var objectCount = articleParagraphs[i].getElementsByTagName('object').length; |
| |
| if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') { |
| articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]); |
| } |
| } |
| |
| try { |
| readability.replaceBrsWithPs(articleContent); |
| } |
| catch (e) { |
| dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e); |
| } |
| }, |
| |
| /** |
| * Initialize a node with the readability object. Also checks the |
| * className/id for special names to add to its score. |
| * |
| * @param Element |
| * @return void |
| **/ |
| initializeNode: function (node) { |
| node.readability = {"contentScore": 0}; |
| |
| switch(node.tagName) { |
| case 'DIV': |
| node.readability.contentScore += 5; |
| break; |
| |
| case 'PRE': |
| case 'TD': |
| case 'BLOCKQUOTE': |
| node.readability.contentScore += 3; |
| break; |
| |
| case 'ADDRESS': |
| case 'OL': |
| case 'UL': |
| case 'DL': |
| case 'DD': |
| case 'DT': |
| case 'LI': |
| case 'FORM': |
| node.readability.contentScore -= 3; |
| break; |
| |
| case 'H1': |
| case 'H2': |
| case 'H3': |
| case 'H4': |
| case 'H5': |
| case 'H6': |
| case 'TH': |
| node.readability.contentScore -= 5; |
| break; |
| } |
| |
| node.readability.contentScore += readability.getClassWeight(node); |
| }, |
| |
| /*** |
| * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is |
| * most likely to be the stuff a user wants to read. Then return it wrapped up in a div. |
| * |
| * @param page a document to run upon. Needs to be a full document, complete with body. |
| * @return Element |
| **/ |
| grabArticle: function (pageToClone) { |
| var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS), |
| isPaging = (page !== null) ? true: false; |
| |
| var page = null; |
| // Never work on the actual page. |
| if (isPaging) { |
| page = document.body.cloneNode(true); |
| } else { |
| page = pageToClone.cloneNode(true); |
| } |
| |
| var allElements = page.getElementsByTagName('*'); |
| |
| /** |
| * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs |
| * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.) |
| * |
| * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 |
| * TODO: Shouldn't this be a reverse traversal? |
| **/ |
| var node = null; |
| var nodesToScore = []; |
| for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) { |
| /* Remove unlikely candidates */ |
| if (stripUnlikelyCandidates) { |
| var unlikelyMatchString = node.className + node.id; |
| if ( |
| ( |
| unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 && |
| unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 && |
| node.tagName !== "BODY" |
| ) |
| ) |
| { |
| dbg("Removing unlikely candidate - " + unlikelyMatchString); |
| node.parentNode.removeChild(node); |
| nodeIndex-=1; |
| continue; |
| } |
| } |
| |
| if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") { |
| nodesToScore[nodesToScore.length] = node; |
| } |
| |
| /* Turn all divs that don't have children block level elements into p's */ |
| if (node.tagName === "DIV") { |
| if (node.innerHTML.search(readability.regexps.divToPElements) === -1) { |
| var newNode = document.createElement('p'); |
| try { |
| readability.moveNodeInnards(node, newNode); |
| node.parentNode.replaceChild(newNode, node); |
| nodeIndex-=1; |
| |
| nodesToScore[nodesToScore.length] = node; |
| } |
| catch(e) { |
| dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e); |
| } |
| } |
| else |
| { |
| /* EXPERIMENTAL */ |
| for(var i = 0, il = node.childNodes.length; i < il; i+=1) { |
| var childNode = node.childNodes[i]; |
| if(childNode.nodeType === 3) { // Node.TEXT_NODE |
| var p = document.createElement('p'); |
| var t = document.createTextNode(childNode.nodeValue); |
| p.appendChild(t); |
| p.style.display = 'inline'; |
| p.className = 'readability-styled'; |
| childNode.parentNode.replaceChild(p, childNode); |
| } |
| } |
| } |
| } |
| } |
| |
| /** |
| * Loop through all paragraphs, and assign a score to them based on how content-y they look. |
| * Then add their score to their parent node. |
| * |
| * A score is determined by things like number of commas, class names, etc. Maybe eventually link density. |
| **/ |
| var candidates = []; |
| for (var pt=0; pt < nodesToScore.length; pt+=1) { |
| var parentNode = nodesToScore[pt].parentNode; |
| var grandParentNode = parentNode ? parentNode.parentNode : null; |
| var innerText = readability.getInnerText(nodesToScore[pt]); |
| |
| if(!parentNode || typeof(parentNode.tagName) === 'undefined') { |
| continue; |
| } |
| |
| /* If this paragraph is less than 25 characters, don't even count it. */ |
| if(innerText.length < 25) { |
| continue; } |
| |
| /* Initialize readability data for the parent. */ |
| if(typeof parentNode.readability === 'undefined') { |
| readability.initializeNode(parentNode); |
| candidates.push(parentNode); |
| } |
| |
| /* Initialize readability data for the grandparent. */ |
| if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') { |
| readability.initializeNode(grandParentNode); |
| candidates.push(grandParentNode); |
| } |
| |
| var contentScore = 0; |
| |
| /* Add a point for the paragraph itself as a base. */ |
| contentScore+=1; |
| |
| /* Add points for any commas within this paragraph */ |
| contentScore += innerText.split(',').length; |
| |
| /* For every 100 characters in this paragraph, add another point. Up to 3 points. */ |
| contentScore += Math.min(Math.floor(innerText.length / 100), 3); |
| |
| /* Add the score to the parent. The grandparent gets half. */ |
| parentNode.readability.contentScore += contentScore; |
| |
| if(grandParentNode) { |
| grandParentNode.readability.contentScore += contentScore/2; |
| } |
| } |
| |
| /** |
| * After we've calculated scores, loop through all of the possible candidate nodes we found |
| * and find the one with the highest score. |
| **/ |
| var topCandidate = null; |
| for(var c=0, cl=candidates.length; c < cl; c+=1) |
| { |
| /** |
| * Scale the final candidates score based on link density. Good content should have a |
| * relatively small link density (5% or less) and be mostly unaffected by this operation. |
| **/ |
| candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c])); |
| |
| dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore); |
| |
| if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) { |
| topCandidate = candidates[c]; } |
| } |
| |
| /** |
| * If we still have no top candidate, just use the body as a last resort. |
| * We also have to copy the body node so it is something we can modify. |
| **/ |
| if (topCandidate === null || topCandidate.tagName === "BODY") |
| { |
| topCandidate = document.createElement("DIV"); |
| readability.replaceNodeInnards(page, topCandidate); |
| page.appendChild(topCandidate); |
| readability.initializeNode(topCandidate); |
| } |
| |
| /** |
| * Now that we have the top candidate, look through its siblings for content that might also be related. |
| * Things like preambles, content split by ads that we removed, etc. |
| **/ |
| var articleContent = document.createElement("DIV"); |
| if (isPaging) { |
| articleContent.id = "readability-content"; |
| } |
| var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2); |
| var siblingNodes = topCandidate.parentNode.childNodes; |
| |
| |
| for(var s=0, sl=siblingNodes.length; s < sl; s+=1) { |
| var siblingNode = siblingNodes[s]; |
| var append = false; |
| |
| /** |
| * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList. |
| * Example of error visible here: http://www.esquire.com/features/honesty0707 |
| **/ |
| if(!siblingNode) { |
| continue; |
| } |
| |
| dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); |
| dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); |
| |
| if(siblingNode === topCandidate) |
| { |
| append = true; |
| } |
| |
| var contentBonus = 0; |
| /* Give a bonus if sibling nodes and top candidates have the example same classname */ |
| if(siblingNode.className === topCandidate.className && topCandidate.className !== "") { |
| contentBonus += topCandidate.readability.contentScore * 0.2; |
| } |
| |
| if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold) |
| { |
| append = true; |
| } |
| |
| if(siblingNode.nodeName === "P") { |
| var linkDensity = readability.getLinkDensity(siblingNode); |
| var nodeContent = readability.getInnerText(siblingNode); |
| var nodeLength = nodeContent.length; |
| |
| if(nodeLength > 80 && linkDensity < 0.25) |
| { |
| append = true; |
| } |
| else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) |
| { |
| append = true; |
| } |
| } |
| |
| if(append) { |
| dbg("Appending node: " + siblingNode); |
| |
| var nodeToAppend = null; |
| if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { |
| /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */ |
| |
| dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); |
| nodeToAppend = document.createElement("DIV"); |
| try { |
| nodeToAppend.id = siblingNode.id; |
| readability.moveNodeInnards(siblingNode, nodeToAppend); |
| } |
| catch(er) { |
| dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original."); |
| nodeToAppend = siblingNode; |
| s-=1; |
| sl-=1; |
| } |
| } else { |
| nodeToAppend = siblingNode; |
| s-=1; |
| sl-=1; |
| } |
| |
| /* To ensure a node does not interfere with readability styles, remove its classnames */ |
| nodeToAppend.className = ""; |
| |
| /* Append sibling and subtract from our list because it removes the node when you append to another node */ |
| articleContent.appendChild(nodeToAppend); |
| } |
| } |
| |
| /** |
| * So we have all of the content that we need. Now we clean it up for presentation. |
| **/ |
| readability.distilledArticleContent = articleContent.cloneNode(true); |
| //readability.prepArticle(articleContent); |
| |
| if (readability.curPageNum === 1) { |
| var newNode = document.createElement('div'); |
| newNode.id = "readability-page-1"; |
| newNode.setAttribute("class", "page"); |
| readability.moveNodeInnards(articleContent, newNode); |
| articleContent.appendChild(newNode); |
| } |
| |
| /** |
| * Now that we've gone through the full algorithm, check to see if we got any meaningful content. |
| * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher |
| * likelihood of finding the content, and the sieve approach gives us a higher likelihood of |
| * finding the -right- content. |
| **/ |
| if(readability.getInnerText(articleContent, false).length < 250) { |
| if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) { |
| readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS); |
| return readability.grabArticle(document.body); |
| } |
| else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { |
| readability.removeFlag(readability.FLAG_WEIGHT_CLASSES); |
| return readability.grabArticle(document.body); |
| } |
| else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { |
| readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY); |
| return readability.grabArticle(document.body); |
| } else { |
| return null; |
| } |
| } |
| |
| return articleContent; |
| }, |
| |
| /** |
| * Removes script tags from the document. |
| * |
| * @param Element |
| **/ |
| removeScripts: function (doc) { |
| var scripts = doc.getElementsByTagName('script'); |
| for(var i = scripts.length-1; i >= 0; i-=1) |
| { |
| if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) |
| { |
| scripts[i].nodeValue=""; |
| scripts[i].removeAttribute('src'); |
| if (scripts[i].parentNode) { |
| scripts[i].parentNode.removeChild(scripts[i]); |
| } |
| } |
| } |
| }, |
| |
| /** |
| * Get the inner text of a node - cross browser compatibly. |
| * This also strips out any excess whitespace to be found. |
| * |
| * @param Element |
| * @return string |
| **/ |
| getInnerText: function (e, normalizeSpaces) { |
| var textContent = ""; |
| |
| if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") { |
| return ""; |
| } |
| |
| normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces; |
| |
| if (navigator.appName === "Microsoft Internet Explorer") { |
| textContent = e.innerText.replace( readability.regexps.trim, "" ); } |
| else { |
| textContent = e.textContent.replace( readability.regexps.trim, "" ); } |
| |
| if(normalizeSpaces) { |
| return textContent.replace( readability.regexps.normalize, " "); } |
| else { |
| return textContent; } |
| }, |
| |
| /** |
| * Get the number of times a string s appears in the node e. |
| * |
| * @param Element |
| * @param string - what to split on. Default is "," |
| * @return number (integer) |
| **/ |
| getCharCount: function (e,s) { |
| s = s || ","; |
| return readability.getInnerText(e).split(s).length-1; |
| }, |
| |
| /** |
| * Remove the style attribute on every e and under. |
| * TODO: Test if getElementsByTagName(*) is faster. |
| * |
| * @param Element |
| * @return void |
| **/ |
| cleanStyles: function (e) { |
| e = e || document; |
| var cur = e.firstChild; |
| |
| if(!e) { |
| return; } |
| |
| // Remove any root styles, if we're able. |
| if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') { |
| e.removeAttribute('style'); } |
| |
| // Go until there are no more child nodes |
| while ( cur !== null ) { |
| if ( cur.nodeType === 1 ) { |
| // Remove style attribute(s) : |
| if(cur.className !== "readability-styled") { |
| cur.removeAttribute("style"); |
| } |
| readability.cleanStyles( cur ); |
| } |
| cur = cur.nextSibling; |
| } |
| }, |
| |
| /** |
| * Get the density of links as a percentage of the content |
| * This is the amount of text that is inside a link divided by the total text in the node. |
| * |
| * @param Element |
| * @return number (float) |
| **/ |
| getLinkDensity: function (e) { |
| var links = e.getElementsByTagName("a"); |
| var textLength = readability.getInnerText(e).length; |
| var linkLength = 0; |
| for(var i=0, il=links.length; i<il;i+=1) |
| { |
| linkLength += readability.getInnerText(links[i]).length; |
| } |
| |
| return linkLength / textLength; |
| }, |
| |
| /** |
| * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness. |
| * |
| * @author Dan Lacy |
| * @return string the base url |
| **/ |
| findBaseUrl: function () { |
| var noUrlParams = window.location.pathname.split("?")[0], |
| urlSlashes = noUrlParams.split("/").reverse(), |
| cleanedSegments = [], |
| possibleType = ""; |
| |
| for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) { |
| var segment = urlSlashes[i]; |
| |
| // Split off and save anything that looks like a file type. |
| if (segment.indexOf(".") !== -1) { |
| possibleType = segment.split(".")[1]; |
| |
| /* If the type isn't alpha-only, it's probably not actually a file extension. */ |
| if(!possibleType.match(/[^a-zA-Z]/)) { |
| segment = segment.split(".")[0]; |
| } |
| } |
| |
| /** |
| * EW-CMS specific segment replacement. Ugly. |
| * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html |
| **/ |
| if(segment.indexOf(',00') !== -1) { |
| segment = segment.replace(',00', ''); |
| } |
| |
| // If our first or second segment has anything looking like a page number, remove it. |
| if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) { |
| segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, ""); |
| } |
| |
| |
| var del = false; |
| |
| /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */ |
| if (i < 2 && segment.match(/^\d{1,2}$/)) { |
| del = true; |
| } |
| |
| /* If this is the first segment and it's just "index", remove it. */ |
| if(i === 0 && segment.toLowerCase() === "index") { |
| del = true; |
| } |
| |
| |
| /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */ |
| if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) { |
| del = true; |
| } |
| |
| /* If it's not marked for deletion, push it to cleanedSegments. */ |
| if (!del) { |
| cleanedSegments.push(segment); |
| } |
| } |
| |
| // This is our final, cleaned, base article URL. |
| return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/"); |
| }, |
| |
| /** |
| * Look for any paging links that may occur within the document. |
| * |
| * @param body |
| * @return object (array) |
| **/ |
| findNextPageLink: function (elem) { |
| var possiblePages = {}, |
| allLinks = elem.getElementsByTagName('a'), |
| articleBaseUrl = readability.findBaseUrl(); |
| |
| /** |
| * Loop through all links, looking for hints that they may be next-page links. |
| * Things like having "page" in their textContent, className or id, or being a child |
| * of a node with a page-y className or id. |
| * |
| * Also possible: levenshtein distance? longest common subsequence? |
| * |
| * After we do that, assign each page a score, and |
| **/ |
| for(var i = 0, il = allLinks.length; i < il; i+=1) { |
| var link = allLinks[i], |
| linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, ''); |
| |
| /* If we've already seen this page, ignore it */ |
| if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) { |
| continue; |
| } |
| |
| /* If it's on a different domain, skip it. */ |
| if(window.location.host !== linkHref.split(/\/+/g)[1]) { |
| continue; |
| } |
| |
| var linkText = readability.getInnerText(link); |
| |
| /* If the linkText looks like it's not the next page, skip it. */ |
| if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) { |
| continue; |
| } |
| |
| /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */ |
| var linkHrefLeftover = linkHref.replace(articleBaseUrl, ''); |
| if(!linkHrefLeftover.match(/\d/)) { |
| continue; |
| } |
| |
| if(!(linkHref in possiblePages)) { |
| possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref}; |
| } else { |
| possiblePages[linkHref].linkText += ' | ' + linkText; |
| } |
| |
| var linkObj = possiblePages[linkHref]; |
| |
| /** |
| * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower. |
| * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html |
| **/ |
| if(linkHref.indexOf(articleBaseUrl) !== 0) { |
| linkObj.score -= 25; |
| } |
| |
| var linkData = linkText + ' ' + link.className + ' ' + link.id; |
| if(linkData.match(readability.regexps.nextLink)) { |
| linkObj.score += 50; |
| } |
| if(linkData.match(/pag(e|ing|inat)/i)) { |
| linkObj.score += 25; |
| } |
| if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text, |
| /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */ |
| if(!linkObj.linkText.match(readability.regexps.nextLink)) { |
| linkObj.score -= 65; |
| } |
| } |
| if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) { |
| linkObj.score -= 50; |
| } |
| if(linkData.match(readability.regexps.prevLink)) { |
| linkObj.score -= 200; |
| } |
| |
| /* If a parentNode contains page or paging or paginat */ |
| var parentNode = link.parentNode, |
| positiveNodeMatch = false, |
| negativeNodeMatch = false; |
| while(parentNode) { |
| var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id; |
| if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) { |
| positiveNodeMatch = true; |
| linkObj.score += 25; |
| } |
| if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) { |
| /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */ |
| if(!parentNodeClassAndId.match(readability.regexps.positive)) { |
| linkObj.score -= 25; |
| negativeNodeMatch = true; |
| } |
| } |
| |
| parentNode = parentNode.parentNode; |
| } |
| |
| /** |
| * If the URL looks like it has paging in it, add to the score. |
| * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34 |
| **/ |
| if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) { |
| linkObj.score += 25; |
| } |
| |
| /* If the URL contains negative values, give a slight decrease. */ |
| if (linkHref.match(readability.regexps.extraneous)) { |
| linkObj.score -= 15; |
| } |
| |
| /** |
| * Minor punishment to anything that doesn't match our current URL. |
| * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points. |
| * Dan, can you show me a counterexample where this is necessary? |
| * if (linkHref.indexOf(window.location.href) !== 0) { |
| * linkObj.score -= 1; |
| * } |
| **/ |
| |
| /** |
| * If the link text can be parsed as a number, give it a minor bonus, with a slight |
| * bias towards lower numbered pages. This is so that pages that might not have 'next' |
| * in their text can still get scored, and sorted properly by score. |
| **/ |
| var linkTextAsNumber = parseInt(linkText, 10); |
| if(linkTextAsNumber) { |
| // Punish 1 since we're either already there, or it's probably before what we want anyways. |
| if (linkTextAsNumber === 1) { |
| linkObj.score -= 10; |
| } |
| else { |
| // Todo: Describe this better |
| linkObj.score += Math.max(0, 10 - linkTextAsNumber); |
| } |
| } |
| } |
| |
| /** |
| * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL. |
| * Require at least a score of 50, which is a relatively high confidence that this page is the next link. |
| **/ |
| var topPage = null; |
| for(var page in possiblePages) { |
| if(possiblePages.hasOwnProperty(page)) { |
| if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) { |
| topPage = possiblePages[page]; |
| } |
| } |
| } |
| |
| if(topPage) { |
| var nextHref = topPage.href.replace(/\/$/,''); |
| |
| dbg('NEXT PAGE IS ' + nextHref); |
| readability.parsedPages[nextHref] = true; |
| return nextHref; |
| } |
| else { |
| return null; |
| } |
| }, |
| |
| createLinkDiv: function(link) { |
| var divNode = document.createElement('div'); |
| var aNode = document.createElement('a'); |
| var tNode = document.createTextNode('View Next Page'); |
| divNode.setAttribute('style', 'text-align: center'); |
| aNode.setAttribute('href', link); |
| aNode.appendChild(tNode); |
| divNode.appendChild(aNode); |
| return divNode; |
| }, |
| |
| xhr: function () { |
| if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { |
| return new XMLHttpRequest(); |
| } |
| else { |
| try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { } |
| try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { } |
| try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { } |
| } |
| |
| return false; |
| }, |
| |
| successfulRequest: function (request) { |
| return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText); |
| }, |
| |
| ajax: function (url, options) { |
| var request = readability.xhr(); |
| |
| function respondToReadyState(readyState) { |
| if (request.readyState === 4) { |
| if (readability.successfulRequest(request)) { |
| if (options.success) { options.success(request); } |
| } |
| else { |
| if (options.error) { options.error(request); } |
| } |
| } |
| } |
| |
| if (typeof options === 'undefined') { options = {}; } |
| |
| request.onreadystatechange = respondToReadyState; |
| |
| request.open('get', url, true); |
| request.setRequestHeader('Accept', 'text/html'); |
| |
| try { |
| request.send(options.postBody); |
| } |
| catch (e) { |
| if (options.error) { options.error(); } |
| } |
| |
| return request; |
| }, |
| |
| /** |
| * Make an AJAX request for each page and append it to the document. |
| **/ |
| curPageNum: 1, |
| |
| appendNextPage: function (nextPageLink) { |
| readability.curPageNum+=1; |
| |
| var articlePage = document.createElement("DIV"); |
| articlePage.id = 'readability-page-' + readability.curPageNum; |
| articlePage.className = 'page'; |
| articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">§</p>'; |
| |
| document.getElementById("readability-content").appendChild(articlePage); |
| |
| if(readability.curPageNum > readability.maxPages) { |
| var linkDiv = readability.createLinkDiv(nextPageLink); |
| |
| articlePage.appendChild(linkDiv); |
| return; |
| } |
| |
| /** |
| * Now that we've built the article page DOM element, get the page content |
| * asynchronously and load the cleaned content into the div we created for it. |
| **/ |
| (function(pageUrl, thisPage) { |
| readability.ajax(pageUrl, { |
| success: function(r) { |
| |
| /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ |
| var eTag = r.getResponseHeader('ETag'); |
| if(eTag) { |
| if(eTag in readability.pageETags) { |
| dbg("Exact duplicate page found via ETag. Aborting."); |
| articlePage.style.display = 'none'; |
| return; |
| } else { |
| readability.pageETags[eTag] = 1; |
| } |
| } |
| |
| // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away. |
| var page = document.createElement("DIV"); |
| |
| /** |
| * Do some preprocessing to our HTML to make it ready for appending. |
| * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript. |
| * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript. |
| * • Turn all double br's into p's - was handled by prepDocument in the original view. |
| * Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages. |
| **/ |
| var pageInnards = r.responseXML; |
| readability.removeScripts(pageInnards); |
| readability.replaceNoscriptsWithPs(pageInnards); |
| readability.replaceDoubleBrsWithPs(pageInnards); |
| readability.replaceFontsWithSpans(pageInnards); |
| page.appendChild(pageInnards); |
| |
| |
| /** |
| * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle. |
| **/ |
| readability.flags = 0x1 | 0x2 | 0x4; |
| |
| var nextPageLink = readability.findNextPageLink(page), |
| content = readability.grabArticle(page); |
| |
| if(!content) { |
| dbg("No content found in page to append. Aborting."); |
| return; |
| } |
| |
| /** |
| * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page. |
| * Compare it against all of the the previous document's we've gotten. If the previous |
| * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate. |
| **/ |
| var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null; |
| if(firstP && firstP.innerHTML.length > 100) { |
| for(var i=1; i <= readability.curPageNum; i+=1) { |
| var rPage = document.getElementById('readability-page-' + i); |
| if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) { |
| dbg('Duplicate of page ' + i + ' - skipping.'); |
| articlePage.style.display = 'none'; |
| readability.parsedPages[pageUrl] = true; |
| return; |
| } |
| } |
| } |
| |
| readability.removeScripts(content); |
| |
| readability.moveNodeInnards(content, thisPage); |
| |
| /** |
| * After the page has rendered, post process the content. This delay is necessary because, |
| * in webkit at least, offsetWidth is not set in time to determine image width. We have to |
| * wait a little bit for reflow to finish before we can fix floating images. |
| **/ |
| window.setTimeout( |
| function() { readability.postProcessContent(thisPage); }, |
| 500 |
| ); |
| |
| if(nextPageLink) { |
| readability.appendNextPage(nextPageLink); |
| } |
| } |
| }); |
| }(nextPageLink, articlePage)); |
| }, |
| |
| /** |
| * Get an elements class/id weight. Uses regular expressions to tell if this |
| * element looks good or bad. |
| * |
| * @param Element |
| * @return number (Integer) |
| **/ |
| getClassWeight: function (e) { |
| if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) { |
| return 0; |
| } |
| |
| var weight = 0; |
| |
| /* Look for a special classname */ |
| if (typeof(e.className) === 'string' && e.className !== '') |
| { |
| if(e.className.search(readability.regexps.negative) !== -1) { |
| weight -= 25; } |
| |
| if(e.className.search(readability.regexps.positive) !== -1) { |
| weight += 25; } |
| } |
| |
| /* Look for a special ID */ |
| if (typeof(e.id) === 'string' && e.id !== '') |
| { |
| if(e.id.search(readability.regexps.negative) !== -1) { |
| weight -= 25; } |
| |
| if(e.id.search(readability.regexps.positive) !== -1) { |
| weight += 25; } |
| } |
| |
| return weight; |
| }, |
| |
| nodeIsVisible: function (node) { |
| return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none'; |
| }, |
| |
| /** |
| * Remove extraneous break tags from a node. |
| * |
| * @param Element |
| * @return void |
| **/ |
| killBreaks: function (e) { |
| var allElements = e.getElementsByTagName('*'); |
| while (i < allElements.length) { |
| readability.deleteExtraBreaks(allElements[i]); |
| i++; |
| } |
| }, |
| |
| /** |
| * Clean a node of all elements of type "tag". |
| * (Unless it's a youtube/vimeo video. People love movies.) |
| * |
| * @param Element |
| * @param string tag to clean |
| * @return void |
| **/ |
| clean: function (e, tag) { |
| var targetList = e.getElementsByTagName( tag ); |
| var isEmbed = (tag === 'object' || tag === 'embed'); |
| |
| for (var y=targetList.length-1; y >= 0; y-=1) { |
| /* Allow youtube and vimeo videos through as people usually want to see those. */ |
| if(isEmbed) { |
| var attributeValues = ""; |
| for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) { |
| attributeValues += targetList[y].attributes[i].value + '|'; |
| } |
| |
| /* First, check the elements attributes to see if any of them contain youtube or vimeo */ |
| if (attributeValues.search(readability.regexps.videos) !== -1) { |
| continue; |
| } |
| |
| /* Then check the elements inside this element for the same. */ |
| if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) { |
| continue; |
| } |
| |
| } |
| |
| targetList[y].parentNode.removeChild(targetList[y]); |
| } |
| }, |
| |
| /** |
| * Clean an element of all tags of type "tag" if they look fishy. |
| * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc. |
| * |
| * @return void |
| **/ |
| cleanConditionally: function (e, tag) { |
| |
| if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) { |
| return; |
| } |
| |
| var tagsList = e.getElementsByTagName(tag); |
| var curTagsLength = tagsList.length; |
| |
| /** |
| * Gather counts for other typical elements embedded within. |
| * Traverse backwards so we can remove nodes at the same time without effecting the traversal. |
| * |
| * TODO: Consider taking into account original contentScore here. |
| **/ |
| for (var i=curTagsLength-1; i >= 0; i-=1) { |
| var weight = readability.getClassWeight(tagsList[i]); |
| var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0; |
| |
| dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : '')); |
| |
| if(weight+contentScore < 0) |
| { |
| tagsList[i].parentNode.removeChild(tagsList[i]); |
| } |
| else if ( readability.getCharCount(tagsList[i],',') < 10) { |
| /** |
| * If there are not very many commas, and the number of |
| * non-paragraph elements is more than paragraphs or other ominous signs, remove the element. |
| **/ |
| var p = tagsList[i].getElementsByTagName("p").length; |
| var img = tagsList[i].getElementsByTagName("img").length; |
| var li = tagsList[i].getElementsByTagName("li").length-100; |
| var input = tagsList[i].getElementsByTagName("input").length; |
| |
| var embedCount = 0; |
| var embeds = tagsList[i].getElementsByTagName("embed"); |
| for(var ei=0,il=embeds.length; ei < il; ei+=1) { |
| if (embeds[ei].src.search(readability.regexps.videos) === -1) { |
| embedCount+=1; |
| } |
| } |
| |
| var linkDensity = readability.getLinkDensity(tagsList[i]); |
| var contentLength = readability.getInnerText(tagsList[i]).length; |
| var toRemove = false; |
| |
| if ( img > p ) { |
| toRemove = true; |
| } else if(li > p && tag !== "ul" && tag !== "ol") { |
| toRemove = true; |
| } else if( input > Math.floor(p/3) ) { |
| toRemove = true; |
| } else if(contentLength < 25 && (img === 0 || img > 2) ) { |
| toRemove = true; |
| } else if(weight < 25 && linkDensity > 0.2) { |
| toRemove = true; |
| } else if(weight >= 25 && linkDensity > 0.5) { |
| toRemove = true; |
| } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) { |
| toRemove = true; |
| } |
| |
| if(toRemove) { |
| tagsList[i].parentNode.removeChild(tagsList[i]); |
| } |
| } |
| } |
| }, |
| |
| /** |
| * Clean out spurious headers from an Element. Checks things like classnames and link density. |
| * |
| * @param Element |
| * @return void |
| **/ |
| cleanHeaders: function (e) { |
| for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) { |
| var headers = e.getElementsByTagName('h' + headerIndex); |
| for (var i=headers.length-1; i >=0; i-=1) { |
| if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) { |
| headers[i].parentNode.removeChild(headers[i]); |
| } |
| } |
| } |
| }, |
| |
| flagIsActive: function(flag) { |
| return (readability.flags & flag) > 0; |
| }, |
| |
| addFlag: function(flag) { |
| readability.flags = readability.flags | flag; |
| }, |
| |
| removeFlag: function(flag) { |
| readability.flags = readability.flags & ~flag; |
| }, |
| |
| // Removes the children of |src| and appends them to |dest|. |
| moveNodeInnards: function(src, dest) { |
| try { |
| while (src.firstChild) { |
| dest.appendChild(src.removeChild(src.firstChild)); |
| } |
| } catch (e) {} |
| }, |
| |
| // Returns true if the node is a whitespace text node. |
| isWhitespaceNode: function(node) { |
| if (node.nodeType == Node.TEXT_NODE) { |
| if (node.data.trim().length == 0) { |
| return true; |
| } |
| } |
| return false; |
| }, |
| |
| // Returns true if the node is a <BR>. |
| isBrNode: function(node) { |
| return (node.tagName === 'BR'); |
| }, |
| |
| |
| // Returns the last <BR> node in a sequence of <BR> nodes that are only |
| // separated by whitespace, or null if there are not at least two <BR> tags |
| // in the sibling chain starting with |node|. Returns the second such <BR> |
| // node if |restrictToTwo| is true. |
| isMultipleBr: function(node, restrictToTwo) { |
| var lastBr = null; |
| if (!readability.isBrNode(node)) { |
| return lastBr; |
| } |
| var curr = node.nextSibling; |
| while (curr) { |
| if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) { |
| lastBr = curr; |
| curr = curr.nextSibling; |
| if (restrictToTwo) { |
| if (readability.isBrNode(lastBr)) { |
| return lastBr; |
| } |
| } |
| continue; |
| } |
| break; |
| } |
| return lastBr; |
| }, |
| |
| // Removes all <BR> nodes except one and whitespace in between in a series |
| // of <BR> nodes. |
| deleteExtraBreaks: function(node) { |
| var lastBr = readability.isMultipleBr(node, false); |
| var ret = false; |
| while (lastBr && lastBr != node) { |
| var toRemove = lastBr; |
| lastBr = lastBr.previousSibling; |
| toRemove.parentNode.removeChild(toRemove); |
| ret = true; |
| } |
| return ret; |
| }, |
| |
| // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a |
| // <P> node, and makes all next siblings of that pair children of <P>, up |
| // until the next pair of <BR> nodes is reached. |
| replaceDoubleBrWithP: function(node) { |
| // Check that we are starting with a BR. |
| var second = readability.isMultipleBr(node, true); |
| if (!second) { |
| return; |
| } |
| // Make all next siblings of the second BR into children of a P. |
| var p = document.createElement('p'); |
| var curr = second.nextSibling; |
| while (curr) { |
| if (readability.isMultipleBr(curr, true)) { |
| break; |
| } |
| var next = curr.nextSibling; |
| p.appendChild(curr.parentNode.removeChild(curr)); |
| curr = next; |
| } |
| var ret = curr; |
| |
| // Remove all nodes between the first and second BR. |
| curr = node.nextSibling; |
| while (curr && curr != second) { |
| var next = curr.nextSibling; |
| curr.parentNode.removeChild(curr); |
| curr = next; |
| } |
| // Remove the second BR. |
| second.parentNode.removeChild(second); |
| // Replace the first BR with the P. |
| node.parentNode.replaceChild(p, node); |
| |
| return ret; |
| }, |
| |
| // Returns true if the NodeList contains a double <BR>. |
| hasDoubleBr: function(nodeList) { |
| for (var i = 0; i < nodeList.length; nodeList++) { |
| if (readability.isMultipleBr(nodeList[i], true)) { |
| return true; |
| } |
| } |
| return false; |
| }, |
| |
| // Replaces double <BR> tags with <P> tags. |
| replaceDoubleBrsWithPs: function(node) { |
| var allElements = node.getElementsByTagName('BR'); |
| var node = null; |
| while (allElements && allElements.length > 0 && |
| readability.hasDoubleBr(allElements)) { |
| for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) { |
| var next = node; |
| while (next = readability.replaceDoubleBrWithP(next)); |
| } |
| allElements = document.body.getElementsByTagName('BR'); |
| } |
| }, |
| |
| |
| // Replaces a BR and the whitespace that follows it with a P. |
| replaceBrWithP: function(node) { |
| if (!readability.isBrNode(node)) { |
| return; |
| } |
| var p = document.createElement('p'); |
| var curr = node.nextSibling; |
| while (curr && !isBrNode(curr)) { |
| var next = curr.nextSibling; |
| if (readability.isWhitespaceNode(curr)) { |
| curr.parentNode.removeChild(curr); |
| } else { |
| p.appendChild(curr.parentNode.removeChild(curr)); |
| } |
| curr = next; |
| } |
| node.parentNode.replaceChild(p, node); |
| return curr; |
| }, |
| |
| // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag |
| // children of the <P>. |
| replaceBrsWithPs: function(node) { |
| var allElements = node.getElementsByTagName('BR'); |
| var node = null; |
| while (allElements && allElements.length > 0) { |
| for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) { |
| var next = node; |
| while (next = readability.replaceBrWithP(next)); |
| } |
| allElements = document.body.getElementsByTagName('BR'); |
| } |
| }, |
| |
| // Replaces any tag with any other tag. |
| replaceTagsWithTags: function(node, srcTag, destTag) { |
| var allElements = node.getElementsByTagName(srcTag); |
| for (var i = 0; i < allElements.length; i++) { |
| var dest = document.createElement(destTag); |
| readability.moveNodeInnards(allElements[i], dest); |
| allElements[i].parentNode.replaceChild(dest, allElements[i]); |
| } |
| }, |
| |
| // Replaces all <noscript> tags with <p> tags. |
| replaceNoscriptsWithPs: function(node) { |
| readability.replaceTagsWithTags(node, 'noscript', 'p'); |
| }, |
| |
| // Replaces all <font> tags with <span> tags. |
| replaceFontsWithSpans: function(node) { |
| readability.replaceTagsWithTags(node, 'font', 'span'); |
| }, |
| |
| // Returns a list of image URLs in the distilled article. |
| getImages : function() { |
| var images = document.getElementsByTagName('img'); |
| var result = new Array(images.length); |
| dbg("Number of images: " + images.length); |
| for(i = 0; i < images.length; i++) { |
| result[i] = images[i].src; |
| dbg("Image: " + result[i]); |
| } |
| return result; |
| }, |
| |
| // Returns the distilled article HTML from the page(s). |
| getDistilledArticleHTML : function() { |
| return readability.distilledHTML; |
| }, |
| |
| // Returns the next page of this article. |
| getNextPageLink : function() { |
| return readability.nextPageLink; |
| } |
| }; |