blob: 03557353e18218054199c387f65c7e534e7c3a33 [file] [log] [blame]
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import HTMLParser
import json
import logging
import urllib2
import urlparse
class _HRefParser(HTMLParser.HTMLParser):
def __init__(self):
HTMLParser.HTMLParser.__init__(self)
self.hrefs = []
def handle_starttag(self, tag, attrs):
if tag == "a":
for name, value in attrs:
if name == "href":
self.hrefs.append(value)
def _AbsoluteUrlHasSaneScheme(absolute_url):
if len(absolute_url) < 4:
return False
return absolute_url[0:4] == "http"
def GenerateSafeUrls():
"""Prints a list of safe urls.
Generates a safe list of urls from a seed list. Each href in the HTML
fetched from the url from the seed list is placed into the safe list. The
safe list contains unsanitized urls.
"""
# A list of websites whose hrefs are unlikely to link to sites that contain
# malware.
seed_urls = [
"http://www.cnn.com",
"https://www.youtube.com",
"https://www.facebook.com",
"https://www.twitter.com",
"https://www.yahoo.com",
"https://www.amazon.com",
"https://www.wikipedia.com",
"https://www.bing.com",
"https://www.dailymotion.com",
"https://www.stackoverflow.com",
"https://www.google.com/#q=dumpling",
"http://www.baidu.com/s?wd=rice",
"http://www.baidu.com/s?wd=cow",
"https://www.google.com/#q=fox",
"http://www.yahoo.co.jp/",
"http://www.yandex.ru/",
"https://www.imdb.com/",
"http://www.huffingtonpost.com/",
"https://www.deviantart.com/",
"http://www.wsj.com/",
]
safe_urls = set()
for url in seed_urls:
try:
# Fetch and parse the HTML.
response = urllib2.urlopen(url)
encoding = response.headers.getparam("charset")
html = response.read()
if encoding:
html = html.decode(encoding)
parser = _HRefParser()
parser.feed(html)
except:
logging.exception("Error fetching or parsing url: %s", url)
raise
# Looks for all hrefs.
for relative_url in parser.hrefs:
if not relative_url:
continue
absolute_url = urlparse.urljoin(url, relative_url)
if not _AbsoluteUrlHasSaneScheme(absolute_url):
continue
safe_urls.add(absolute_url)
# Sort the urls, to make them easier to view in bulk.
safe_urls_list = list(safe_urls)
safe_urls_list.sort()
print json.dumps(safe_urls_list, indent=2, separators=(",", ":"))
if __name__ == "__main__":
  # Script entry point: emit the generated url list to stdout.
  GenerateSafeUrls()