blob: 1a3a135dc6e70838b0669daeed96145c387d8898 [file] [log] [blame]
#!/usr/bin/env python2.5
#
# Copyright 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helper classes which help converting a url to a list of SB expressions."""
import array
import logging
import re
import string
import urllib
import urlparse
import util
class UrlParseError(Exception):
  """Raised when a URL cannot be turned into SafeBrowsing expressions."""
def GenerateSafeChars():
  """Return the characters that must not be percent-escaped in a URL.

  The result is every ASCII digit, letter and punctuation character except
  '%' and '#'. Whitespace is never part of the result because none of the
  source character sets contain it.

  Returns:
    A string holding the safe characters, in the order digits, letters,
    punctuation.
  """
  candidates = string.digits + string.ascii_letters + string.punctuation
  # A plain join builds the same string as the former array.array('c', ...)
  # round trip, without depending on the Python 2 only 'c' typecode.
  return ''.join(c for c in candidates if c not in '%#')
class ExpressionGenerator(object):
  """Converts a URL into its list of SafeBrowsing lookup expressions.

  The URL is canonicalized first; the instance then holds a list of host
  suffixes and a list of path prefixes. Expressions() yields one Expression
  for every (host suffix, path prefix) combination.
  """

  # Patterns used to recognise the numeric components of an IP address.
  HEX = re.compile(r'^0x([a-fA-F0-9]+)$')
  OCT = re.compile(r'^0([0-7]+)$')
  DEC = re.compile(r'^(\d+)$')
  IP_WITH_TRAILING_SPACE = re.compile(r'^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) ')
  # The case-insensitive flag is passed as an argument: a global inline
  # "(?i)" placed after "^" is rejected by newer versions of the re module.
  # NOTE: the character class also admits a literal backslash; such a host is
  # still rejected later because the component fails to parse as a number.
  POSSIBLE_IP = re.compile(r'^((?:0x[0-9a-f]+|[0-9\\.])+)$', re.IGNORECASE)
  FIND_BAD_OCTAL_REGEXP = re.compile(r'(^|\.)0\d*[89]')
  # This regular expression parses the host and port from a hostname. Note: any
  # user and password are removed from the hostname.
  HOST_PORT_REGEXP = re.compile(r'^(?:.*@)?(?P<host>[^:]*)(:(?P<port>\d+))?$')
  SAFE_CHARS = GenerateSafeChars()
  # Dict that maps supported schemes to their default port number.
  DEFAULT_PORTS = {'http': '80', 'https': '443', 'ftp': '21'}

  def __init__(self, url):
    """Canonicalize url and precompute its host and path expressions.

    Args:
      url: the URL to generate expressions for.

    Raises:
      UrlParseError: if the URL cannot be canonicalized, or if its host is
        neither an IP address nor a name with at least two components.
    """
    parse_exception = UrlParseError('failed to parse URL "%s"' % (url,))
    canonical_url = ExpressionGenerator.CanonicalizeUrl(url)
    if not canonical_url:
      raise parse_exception

    # Each element is a list of host components used to build expressions.
    self._host_lists = []
    # A list of paths used to build expressions.
    self._path_exprs = []

    url_split = urlparse.urlsplit(canonical_url)
    canonical_host, canonical_path = url_split[1], url_split[2]
    query = url_split[3]
    self._MakeHostLists(canonical_host, parse_exception)

    if query:
      # Include canonicalized path with query arguments.
      self._path_exprs.append(canonical_path + '?' + query)
    self._path_exprs.append(canonical_path)

    # Get the first three directory path components and create the 4 path
    # expressions starting at the root (/) and successively appending directory
    # path components, including the trailing slash. E.g.:
    # /a/b/c/d.html -> [/, /a/, /a/b/, /a/b/c/]
    path_parts = canonical_path.strip('/').split('/')[:3]
    if canonical_path.count('/') < 4:
      # If the last component is not a directory we remove it.
      path_parts.pop()
    while path_parts:
      prefix = '/' + '/'.join(path_parts) + '/'
      # A path made of exactly three directories plus a trailing slash (e.g.
      # /a/b/c/) equals its own longest prefix; do not list it twice.
      if prefix != canonical_path:
        self._path_exprs.append(prefix)
      path_parts.pop()

    if canonical_path != '/':
      self._path_exprs.append('/')

  @staticmethod
  def CanonicalizeUrl(url):
    """Canonicalize the given URL for the SafeBrowsing protocol.

    Args:
      url: URL to canonicalize.

    Returns:
      A canonical URL or None if the URL could not be canonicalized (scheme
      not in DEFAULT_PORTS, unparsable host/port, or invalid host).
    """
    # Start by stripping off the fragment identifier.
    url = url.split('#', 1)[0]
    # Strip leading and trailing white space.
    url = url.strip()
    # Remove any embedded tabs and CR/LF characters which aren't escaped.
    url = url.replace('\t', '').replace('\r', '').replace('\n', '')
    # Un-escape and re-escape the URL just in case there are some encoded
    # characters in the url scheme for example.
    url = ExpressionGenerator._Escape(url)

    url_split = urlparse.urlsplit(url)
    if not url_split[0]:
      # URL had no scheme. In this case we assume it is http://.
      url = 'http://' + url
      url_split = urlparse.urlsplit(url)

    url_scheme = url_split[0].lower()
    if url_scheme not in ExpressionGenerator.DEFAULT_PORTS:
      return None  # Unsupported scheme.

    # Note: applying HOST_PORT_REGEXP also removes any user and password.
    m = ExpressionGenerator.HOST_PORT_REGEXP.match(url_split[1])
    if not m:
      return None
    host, port = m.group('host'), m.group('port')

    canonical_host = ExpressionGenerator.CanonicalizeHost(host)
    if not canonical_host:
      return None

    # Now that the host is canonicalized we add the port back if it's not the
    # default port for that url scheme.
    if port and port != ExpressionGenerator.DEFAULT_PORTS[url_scheme]:
      canonical_host += ':' + port

    canonical_path = ExpressionGenerator.CanonicalizePath(url_split[2])

    canonical_url = url_split[0] + '://' + canonical_host + canonical_path
    # If the URL ends with ? we want to keep the ?.
    if url_split[3] != '' or url.endswith('?'):
      canonical_url += '?' + url_split[3]
    return canonical_url

  @staticmethod
  def CanonicalizePath(path):
    """Canonicalize the given path.

    Percent-escapes are normalized, '.' and empty components are dropped and
    '..' removes the preceding component when there is one.

    Args:
      path: the path portion of a URL; may be empty.

    Returns:
      The canonical path, always starting with '/'. A trailing slash is kept
      when the input had one.
    """
    if not path:
      return '/'

    # There are some cases where the path will not start with '/'. Example:
    # "ftp://host.com?q" -- the hostname is 'host.com' and the path '%3Fq'.
    # Browsers typically do prepend a leading slash to the path in this case,
    # we'll do the same.
    if path[0] != '/':
      path = '/' + path

    path = ExpressionGenerator._Escape(path)

    path_components = []
    for path_component in path.split('/'):
      if path_component == '..':
        # Skip '..' and remove the preceding path component if there is one.
        if path_components:
          path_components.pop()
      elif path_component != '.' and path_component != '':
        # Empty components are skipped to collapse successive slashes (i.e.,
        # // -> /) and '.' is skipped as well (i.e., /./ -> /). Skipping empty
        # components also drops the leading and trailing slash, which are
        # re-added below.
        path_components.append(path_component)

    # Put the path components back together and re-add the leading slash which
    # got stripped by removing empty path components.
    canonical_path = '/' + '/'.join(path_components)
    # If necessary we also re-add the trailing slash.
    if path.endswith('/') and not canonical_path.endswith('/'):
      canonical_path += '/'
    return canonical_path

  @staticmethod
  def CanonicalizeHost(host):
    """Canonicalize the given host.

    Args:
      host: host name or IP address, without user info or port.

    Returns:
      The dotted-decimal IP if the host represents an IP address, otherwise
      the lower-cased, escaped host name with leading, trailing and
      consecutive dots removed. None if the host is empty or is a name with
      fewer than two components.
    """
    if not host:
      return None
    host = ExpressionGenerator._Escape(host.lower())

    ip = ExpressionGenerator.CanonicalizeIp(host)
    if ip:
      # Host is an IP address.
      return ip

    # Host is a normal hostname.
    # Skip trailing, leading and consecutive dots.
    host_split = [part for part in host.split('.') if part]
    if len(host_split) < 2:
      return None
    return '.'.join(host_split)

  @staticmethod
  def CanonicalizeIp(host):
    """Return a canonicalized IP if host can represent an IP, else None.

    Components may be written in hexadecimal (0x prefix), octal (leading 0)
    or decimal, and fewer than four components may be given.

    Args:
      host: candidate host string.

    Returns:
      The address as 'a.b.c.d' in decimal, or None if host is not an IP.
    """
    if len(host) <= 15:
      # The Windows resolver allows a 4-part dotted decimal IP address to have a
      # space followed by any old rubbish, so long as the total length of the
      # string doesn't get above 15 characters. So, "10.192.95.89 xy" is
      # resolved to 10.192.95.89.
      # If the string length is greater than 15 characters,
      # e.g. "10.192.95.89 xy.wildcard.example.com", it will be resolved through
      # DNS.
      m = ExpressionGenerator.IP_WITH_TRAILING_SPACE.match(host)
      if m:
        host = m.group(1)

    if not ExpressionGenerator.POSSIBLE_IP.match(host):
      return None

    # Basically we should parse octal if we can, but if there are illegal octal
    # numbers, i.e. 08 or 09, then we should just look at decimal and hex.
    allow_octal = not ExpressionGenerator.FIND_BAD_OCTAL_REGEXP.search(host)

    # Skip trailing, leading and consecutive dots.
    host_split = [part for part in host.split('.') if part]
    # A host made only of dots has no components at all; without this check it
    # would be zero-padded into 0.0.0.0 below.
    if not host_split or len(host_split) > 4:
      return None

    last_index = len(host_split) - 1
    ip = []
    for i, part in enumerate(host_split):
      # Try hexadecimal, then octal (when permitted), then decimal.
      m = ExpressionGenerator.HEX.match(part)
      base = 16
      if not m and allow_octal:
        m = ExpressionGenerator.OCT.match(part)
        base = 8
      if not m:
        m = ExpressionGenerator.DEC.match(part)
        base = 10
      if not m:
        return None

      n = int(m.group(1), base)
      if n <= 255:
        ip.append(n)
      elif i < last_index:
        # Only the low byte of an oversized non-final component is kept.
        ip.append(n & 0xff)
      else:
        # An oversized final component is split into up to four bytes, most
        # significant first.
        octets = []
        while n > 0 and len(octets) < 4:
          octets.append(n & 0xff)
          n >>= 8
        if len(ip) + len(octets) > 4:
          return None
        octets.reverse()
        ip.extend(octets)

    # Missing trailing components are filled with zeros.
    ip.extend([0] * (4 - len(ip)))
    return '%u.%u.%u.%u' % tuple(ip)

  def Expressions(self):
    """Yield an Expression for every host suffix / path prefix combination.

    Hosts are iterated in the outer loop, paths in the inner loop.
    """
    for host_parts in self._host_lists:
      host = '.'.join(host_parts)
      for p in self._path_exprs:
        yield Expression(host, p)

  @staticmethod
  def _Escape(unescaped_str):
    """Fully unescape the given string, then re-escape once.

    Args:
      unescaped_str: string that should be escaped.

    Returns:
      Escaped string according to the SafeBrowsing protocol.
    """
    # Keep unquoting until the string stops changing, so that multiply
    # escaped input is fully decoded.
    unquoted = urllib.unquote(unescaped_str)
    while unquoted != unescaped_str:
      unescaped_str = unquoted
      unquoted = urllib.unquote(unquoted)
    return urllib.quote(unquoted, ExpressionGenerator.SAFE_CHARS)

  def _MakeHostLists(self, host, parse_exception):
    """Build self._host_lists from an already canonicalized host.

    An IP address yields a single entry. A host name yields the full name
    followed by the suffixes formed from at most its last five components,
    never going below two components.

    Args:
      host: canonical host taken from the canonical URL.
      parse_exception: exception instance to raise for an invalid host.

    Raises:
      parse_exception: if host is not an IP and has fewer than two components.
    """
    ip = ExpressionGenerator.CanonicalizeIp(host)
    if ip is not None:
      # Is an IP.
      self._host_lists.append([ip])
      return

    # Is a hostname.
    # Skip trailing, leading and consecutive dots.
    host_split = [part for part in host.split('.') if part]
    if len(host_split) < 2:
      raise parse_exception

    self._host_lists.append(host_split)
    # Index 0 is the full host, already added above, so suffixes start at 1.
    start = max(1, len(host_split) - 5)
    for i in range(start, len(host_split) - 1):
      self._host_lists.append(host_split[i:])
class Expression(object):
  """A single SafeBrowsing expression: a host suffix plus a path prefix.

  The expression text is the host immediately followed by the path. Its hash
  is computed once, at construction time, with util.GetHash256.
  """

  def __init__(self, host, path):
    """Store host and path and precompute the expression text and hash."""
    expression = host + path
    self._host, self._path = host, path
    self._value = expression
    self._hash_value = util.GetHash256(expression)

  def Value(self):
    """Return the expression text (host followed by path)."""
    return self._value

  def HashValue(self):
    """Return the hash computed for the expression text at construction."""
    return self._hash_value

  def __str__(self):
    """Return the expression text."""
    return self.Value()

  def __repr__(self):
    """
    Return the expression text; not a true repr, meant for debugging.
    """
    return self.Value()