| # Copyright 2015 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| import HTMLParser |
| import json |
| import logging |
| import urllib2 |
| import urlparse |
| |
| |
| class _HRefParser(HTMLParser.HTMLParser): |
| |
| def __init__(self): |
| HTMLParser.HTMLParser.__init__(self) |
| self.hrefs = [] |
| |
| def handle_starttag(self, tag, attrs): |
| if tag == "a": |
| for name, value in attrs: |
| if name == "href": |
| self.hrefs.append(value) |
| |
| |
| def _AbsoluteUrlHasSaneScheme(absolute_url): |
| if len(absolute_url) < 4: |
| return False |
| return absolute_url[0:4] == "http" |
| |
| |
def GenerateSafeUrls():
  """Prints a list of safe urls.

  Generates a safe list of urls from a seed list. Each seed url is fetched,
  every <a href> in the returned HTML is resolved against the seed url, and
  the result is placed into the safe list if it passes
  _AbsoluteUrlHasSaneScheme. The de-duplicated, sorted list is printed to
  stdout as a JSON array. The safe list contains unsanitized urls.

  Raises:
    Exception: Any error raised while fetching, decoding or parsing a seed
      url is logged and then re-raised, which aborts the run.
  """
  # A list of websites whose hrefs are unlikely to link to sites that contain
  # malware.
  seed_urls = [
      "http://www.cnn.com",
      "https://www.youtube.com",
      "https://www.facebook.com",
      "https://www.twitter.com",
      "https://www.yahoo.com",
      "https://www.amazon.com",
      "https://www.wikipedia.com",
      "https://www.bing.com",
      "https://www.dailymotion.com",
      "https://www.stackoverflow.com",
      "https://www.google.com/#q=dumpling",
      "http://www.baidu.com/s?wd=rice",
      "http://www.baidu.com/s?wd=cow",
      "https://www.google.com/#q=fox",
      "http://www.yahoo.co.jp/",
      "http://www.yandex.ru/",
      "https://www.imdb.com/",
      "http://www.huffingtonpost.com/",
      "https://www.deviantart.com/",
      "http://www.wsj.com/",
  ]

  safe_urls = set()

  for url in seed_urls:
    try:
      # Fetch the HTML. The response is closed explicitly so the underlying
      # connection is released even if reading the body fails.
      response = urllib2.urlopen(url)
      try:
        encoding = response.headers.getparam("charset")
        html = response.read()
      finally:
        response.close()

      # Decode only when the server declared a charset; otherwise the raw
      # bytes are handed to the parser unchanged.
      if encoding:
        html = html.decode(encoding)

      parser = _HRefParser()
      parser.feed(html)
    except Exception:
      logging.exception("Error fetching or parsing url: %s", url)
      raise

    # Looks for all hrefs.
    for relative_url in parser.hrefs:
      # Skip anchors whose href is missing a value or is empty.
      if not relative_url:
        continue

      absolute_url = urlparse.urljoin(url, relative_url)
      if not _AbsoluteUrlHasSaneScheme(absolute_url):
        continue
      safe_urls.add(absolute_url)

  # Sort the urls, to make them easier to view in bulk.
  print(json.dumps(sorted(safe_urls), indent=2, separators=(",", ":")))
| |
# Script entry point: print the safe url list only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
  GenerateSafeUrls()