tools/memory_inspector/memory_inspector/classification/native_heap_classifier.py - chromium/src - Git at Google

 # Copyright 2014 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """This module classifies NativeHeap objects filtering their allocations.

 The filters currently available are 'stacktrace' and 'source_path', which
 work as follows:

 {'name': 'rule-1', 'stacktrace': 'foo' }
 {'name': 'rule-2', 'stacktrace': ['foo', r'bar\s+baz']}
 {'name': 'rule-3', 'source_path': 'sk.*allocator'}
 {'name': 'rule-4', 'source_path': 'sk', 'stacktrace': 'SkAllocator'}


 rule-1 will match any allocation that has 'foo' in one of its stack frames.
 rule-2 will match any allocation that has a stack frame matching 'foo' which
 is followed by a stack frame matching 'bar baz'. Note that order matters, so
 rule-2 will not match a stacktrace like ['bar baz', 'foo'].
 rule-3 will match any allocation in which at least one of the source paths in
 its stack frames matches the regex sk.*allocator.
 rule-4 will match any allocation which satisfies both its 'source_path' and
 its 'stacktrace' conditions.

 TODO(primiano): introduce more filters after the first prototype with UI, for
 instance, filter by library file name or by allocation size.
 """

 import collections
 import posixpath
 import re

 from memory_inspector.classification import results
 from memory_inspector.classification import rules
 from memory_inspector.core import exceptions
 from memory_inspector.core import native_heap


 _RESULT_KEYS = ['bytes_allocated', 'bytes_resident']


 def LoadRules(content):
   """Parses |content| (a string) into a tree of native-heap rules.

   Args:
     content: the textual rule definition, handed unchanged to |rules.Load|.

   Returns:
     An instance of |rules.Rule|, whose nodes are |_NHeapRule| instances.
   """
   rule_tree = rules.Load(content, _NHeapRule)
   return rule_tree


 def Classify(nativeheap, rule_tree):
   """Aggregates the allocations of a native heap according to a rule tree.

   Args:
     nativeheap: the heap dump being processed (a |NativeHeap| instance).
     rule_tree: the user-defined rules that define the filtering categories
         (a |rules.Rule| instance).

   Returns:
     An instance of |AggreatedResults| built over |rule_tree|, to which every
     allocation of |nativeheap| has been submitted.
   """
   assert isinstance(nativeheap, native_heap.NativeHeap)
   assert isinstance(rule_tree, rules.Rule)

   aggregated = results.AggreatedResults(rule_tree, _RESULT_KEYS)
   for alloc in nativeheap.allocations:
     # Presumably paired positionally with |_RESULT_KEYS| (allocated, resident).
     values = [alloc.size, alloc.resident_size]
     aggregated.AddToMatchingNodes(alloc, values)
   return aggregated


 def InferHeuristicRulesFromHeap(nheap, max_depth=3, threshold=0.02):
   """Infers the rules tree from a symbolized heap snapshot.

   In lack of a specific set of rules, this method can be invoked to infer a
   meaningful rule tree starting from a heap snapshot. It builds a compact
   radix tree from the source paths of the stack traces, whose height is at
   most |max_depth|, selecting only those directories which contribute for at
   least |threshold| (1.0 = 100%) w.r.t. the total bytes of the allocations
   for which a source directory could be determined.

   Args:
     nheap: the symbolized heap snapshot (a |NativeHeap| instance).
     max_depth: maximum number of levels of the generated rule tree.
     threshold: minimum fraction of the attributed bytes that a directory must
         account for in order to be turned into a rule.

   Returns:
     The rule tree obtained by passing the inferred rules to |LoadRules|.
   """
   assert(isinstance(nheap, native_heap.NativeHeap))

   def RadixTreeInsert(node, path):
     """Inserts a string (path) into a radix tree (a python recursive dict).

     e.g.: [/a/b/c, /a/b/d, /z/h] -> {'/a/b/': {'c': {}, 'd': {}}, '/z/h': {}}
     """

     def GetCommonPrefix(args):
       """Returns the common prefix between two paths (no partial paths).

       e.g.: /tmp/bar, /tmp/baz will return /tmp/ (and not /tmp/ba as the dumb
       posixpath.commonprefix implementation would do)
       """
       parts = posixpath.commonprefix(args).rpartition(posixpath.sep)[0]
       return parts + posixpath.sep if parts else ''

     # Iterate over a snapshot of the keys, since |node| is restructured below.
     for node_path in list(node):
       pfx = GetCommonPrefix([node_path, path])
       if not pfx:
         continue
       if len(pfx) < len(node_path):
         # Split the existing edge: the shared prefix becomes the new edge and
         # the remainder of the old edge is pushed one level down.
         node[pfx] = {node_path[len(pfx):]: node[node_path]}
         del node[node_path]
       if len(path) > len(pfx):
         RadixTreeInsert(node[pfx], path[len(pfx):])
       return
     node[path] = {}  # No common prefix, create new child in current node.

   # Given an allocation of size N and its stack trace, heuristically determines
   # the source directory to be blamed for the N bytes.
   # The blamed_dir is the one which appears more times in the top 8 stack
   # frames (excluding the first 2, which usually are just the (m|c)alloc call
   # sites). At the end, this will generate a *leaderboard* (|blamed_dirs|)
   # which associates, to each source path directory, the number of bytes
   # allocated.

   blamed_dirs = collections.Counter()  # '/s/path' : bytes_from_this_path (int)
   total_allocated = 0
   for alloc in nheap.allocations:
     # Compute a histogram (for each allocation) of the top source dirs.
     dir_histogram = collections.Counter()
     for frame in alloc.stack_trace.frames[2:10]:
       if not frame.symbol or not frame.symbol.source_info:
         continue
       src_file = frame.symbol.source_info[0].source_file_path
       src_dir = posixpath.dirname(src_file.replace('\\', '/')) + '/'
       dir_histogram[src_dir] += 1
     if not dir_histogram:
       continue  # No symbolized frame: these bytes are not attributed at all.
     # Add the blamed dir to the leaderboard.
     blamed_dir = dir_histogram.most_common(1)[0][0]
     blamed_dirs[blamed_dir] += alloc.size
     total_allocated += alloc.size

   # Select only the top paths from the leaderboard which contribute for more
   # than |threshold| and make a radix tree out of them.
   radix_tree = {}
   # With no attributed bytes the ratio is undefined (it would divide by zero),
   # hence no directory is selected.
   if total_allocated:
     for blamed_dir, alloc_size in blamed_dirs.most_common():
       if (1.0 * alloc_size / total_allocated) < threshold:
         break  # The leaderboard is sorted: all the remaining ones are smaller.
       RadixTreeInsert(radix_tree, blamed_dir)

   # The final step consists in generating a rule tree from the radix tree. This
   # is a pretty straightforward tree-clone operation, they have the same shape.
   def GenRulesFromRadixTree(radix_tree_node, max_depth, parent_path=''):
     """Returns the list of rule dicts for the children of |radix_tree_node|."""
     if max_depth <= 0:
       return []
     children = []
     for node_path, node_children in radix_tree_node.items():
       full_path = parent_path + node_path
       children.append({
           'name': node_path[-16:],
           'source_path': '^' + re.escape(full_path),
           'children': GenRulesFromRadixTree(
               node_children, max_depth - 1, full_path)})
     return children

   rules_tree = GenRulesFromRadixTree(radix_tree, max_depth)
   return LoadRules(str(rules_tree))


 class _NHeapRule(rules.Rule):
   """A rule which matches allocations by stack trace symbols and source paths.

   Supported filters (both optional; an allocation has to satisfy all the ones
   which are specified):
     stacktrace: a regex, or a list of regexes which must be matched, in order,
         by the symbol names of the stack frames.
     source_path: a regex which must be matched by the source file path of at
         least one stack frame.
   """

   def __init__(self, name, filters):
     """Compiles the regexes of the filters.

     Args:
       name: the rule name, forwarded to the |rules.Rule| constructor.
       filters: a dict which can contain the 'stacktrace' and 'source_path'
           keys described in the class docstring.

     Raises:
       exceptions.MemoryInspectorException: if a regex cannot be compiled.
     """
     super(_NHeapRule, self).__init__(name)

     def Compile(regex, error_format):
       # |as| is accepted by both Python 2.6+ and Python 3.
       try:
         return re.compile(regex)
       except re.error as descr:
         raise exceptions.MemoryInspectorException(
             error_format % (regex, descr))

     try:
       string_types = basestring  # Python 2: covers both str and unicode.
     except NameError:
       string_types = str  # Python 3 has no basestring.

     # The 'stacktrace' filter can be either a string (simple case, one regex)
     # or a list of strings (complex case, see doc in the header of this file).
     stacktrace_regexs = filters.get('stacktrace', [])
     if isinstance(stacktrace_regexs, string_types):
       stacktrace_regexs = [stacktrace_regexs]
     self._stacktrace_regexs = [
         Compile(regex, 'Stacktrace regex error "%s" : %s')
         for regex in stacktrace_regexs]

     # The 'source_path' regex, instead, simply matches the source file path.
     self._path_regex = None
     path_regex = filters.get('source_path')
     if path_regex:
       self._path_regex = Compile(path_regex, 'Path regex error "%s" : %s')

   def Match(self, allocation):
     """Tells whether |allocation| satisfies all the filters of this rule.

     Args:
       allocation: the object whose |stack_trace.frames| are inspected.

     Returns:
       True if every specified filter is satisfied (hence also when no filter
       has been specified at all), False otherwise.
     """
     # Match the source file path, if the 'source_path' filter is specified.
     if self._path_regex:
       path_matches = any(
           self._path_regex.search(frame.symbol.source_info[0].source_file_path)
           for frame in allocation.stack_trace.frames
           if frame.symbol and frame.symbol.source_info)
       if not path_matches:
         return False

     # Match the stack traces symbols, if the 'stacktrace' filter is specified.
     if not self._stacktrace_regexs:
       return True
     regex_idx = 0
     last_idx = len(self._stacktrace_regexs) - 1
     for frame in allocation.stack_trace.frames:
       if not frame.symbol:
         continue
       if not self._stacktrace_regexs[regex_idx].search(frame.symbol.name):
         continue
       # The current regex has been matched.
       if regex_idx == last_idx:
         return True  # All the provided regexs have been matched, we're happy.
       # The next regex is looked up starting from the next frame.
       regex_idx += 1

     return False  # Not all the provided regexs have been matched.
	# Copyright 2014 The Chromium Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""This module classifies NativeHeap objects filtering their allocations.

	The filters currently available are 'stacktrace' and 'source_path', which
	work as follows:

	{'name': 'rule-1', 'stacktrace': 'foo' }
	{'name': 'rule-2', 'stacktrace': ['foo', r'bar\s+baz']}
	{'name': 'rule-3', 'source_path': 'sk.*allocator'}
	{'name': 'rule-4', 'source_path': 'sk', 'stacktrace': 'SkAllocator'}


	rule-1 will match any allocation that has 'foo' in one of its stack frames.
	rule-2 will match any allocation that has a stack frame matching 'foo' which
	is followed by a stack frame matching 'bar baz'. Note that order matters, so
	rule-2 will not match a stacktrace like ['bar baz', 'foo'].
	rule-3 will match any allocation in which at least one of the source paths in
	its stack frames matches the regex sk.*allocator.
	rule-4 will match any allocation which satisfies both its 'source_path' and
	its 'stacktrace' conditions.

	TODO(primiano): introduce more filters after the first prototype with UI, for
	instance, filter by library file name or by allocation size.
	"""

	import collections
	import posixpath
	import re

	from memory_inspector.classification import results
	from memory_inspector.classification import rules
	from memory_inspector.core import exceptions
	from memory_inspector.core import native_heap


	_RESULT_KEYS = ['bytes_allocated', 'bytes_resident']


	def LoadRules(content):
	  """Loads and parses a native-heap rule tree from a content (string).

	  Args:
	    content: the textual rule definition, handed unchanged to |rules.Load|.

	  Returns:
	    An instance of |rules.Rule|, whose nodes are |_NHeapRule| instances.
	  """
	  return rules.Load(content, _NHeapRule)


	def Classify(nativeheap, rule_tree):
	  """Creates aggregated results of native heaps using the provided rules.

	  Args:
	    nativeheap: the heap dump being processed (a |NativeHeap| instance).
	    rule_tree: the user-defined rules that define the filtering categories
	        (a |rules.Rule| instance).

	  Returns:
	    An instance of |AggreatedResults| built over |rule_tree|, to which every
	    allocation of |nativeheap| has been submitted.
	  """
	  assert(isinstance(nativeheap, native_heap.NativeHeap))
	  assert(isinstance(rule_tree, rules.Rule))

	  res = results.AggreatedResults(rule_tree, _RESULT_KEYS)
	  for allocation in nativeheap.allocations:
	    # Presumably paired positionally with |_RESULT_KEYS| (allocated, resident).
	    res.AddToMatchingNodes(allocation,
	                           [allocation.size, allocation.resident_size])
	  return res


	def InferHeuristicRulesFromHeap(nheap, max_depth=3, threshold=0.02):
	  """Infers the rules tree from a symbolized heap snapshot.

	  In lack of a specific set of rules, this method can be invoked to infer a
	  meaningful rule tree starting from a heap snapshot. It builds a compact
	  radix tree from the source paths of the stack traces, whose height is at
	  most |max_depth|, selecting only those directories which contribute for at
	  least |threshold| (1.0 = 100%) w.r.t. the total bytes of the allocations
	  for which a source directory could be determined.

	  Args:
	    nheap: the symbolized heap snapshot (a |NativeHeap| instance).
	    max_depth: maximum number of levels of the generated rule tree.
	    threshold: minimum fraction of the attributed bytes that a directory must
	        account for in order to be turned into a rule.

	  Returns:
	    The rule tree obtained by passing the inferred rules to |LoadRules|.
	  """
	  assert(isinstance(nheap, native_heap.NativeHeap))

	  def RadixTreeInsert(node, path):
	    """Inserts a string (path) into a radix tree (a python recursive dict).

	    e.g.: [/a/b/c, /a/b/d, /z/h] -> {'/a/b/': {'c': {}, 'd': {}}, '/z/h': {}}
	    """

	    def GetCommonPrefix(args):
	      """Returns the common prefix between two paths (no partial paths).

	      e.g.: /tmp/bar, /tmp/baz will return /tmp/ (and not /tmp/ba as the dumb
	      posixpath.commonprefix implementation would do)
	      """
	      parts = posixpath.commonprefix(args).rpartition(posixpath.sep)[0]
	      return parts + posixpath.sep if parts else ''

	    # Iterate over a snapshot of the keys, since |node| is restructured below.
	    for node_path in list(node):
	      pfx = GetCommonPrefix([node_path, path])
	      if not pfx:
	        continue
	      if len(pfx) < len(node_path):
	        # Split the existing edge: the shared prefix becomes the new edge and
	        # the remainder of the old edge is pushed one level down.
	        node[pfx] = {node_path[len(pfx):]: node[node_path]}
	        del node[node_path]
	      if len(path) > len(pfx):
	        RadixTreeInsert(node[pfx], path[len(pfx):])
	      return
	    node[path] = {}  # No common prefix, create new child in current node.

	  # Given an allocation of size N and its stack trace, heuristically determines
	  # the source directory to be blamed for the N bytes.
	  # The blamed_dir is the one which appears more times in the top 8 stack
	  # frames (excluding the first 2, which usually are just the (m|c)alloc call
	  # sites). At the end, this will generate a leaderboard (|blamed_dirs|)
	  # which associates, to each source path directory, the number of bytes
	  # allocated.

	  blamed_dirs = collections.Counter()  # '/s/path' : bytes_from_this_path (int)
	  total_allocated = 0
	  for alloc in nheap.allocations:
	    # Compute a histogram (for each allocation) of the top source dirs.
	    dir_histogram = collections.Counter()
	    for frame in alloc.stack_trace.frames[2:10]:
	      if not frame.symbol or not frame.symbol.source_info:
	        continue
	      src_file = frame.symbol.source_info[0].source_file_path
	      src_dir = posixpath.dirname(src_file.replace('\\', '/')) + '/'
	      dir_histogram[src_dir] += 1
	    if not dir_histogram:
	      continue  # No symbolized frame: these bytes are not attributed at all.
	    # Add the blamed dir to the leaderboard.
	    blamed_dir = dir_histogram.most_common(1)[0][0]
	    blamed_dirs[blamed_dir] += alloc.size
	    total_allocated += alloc.size

	  # Select only the top paths from the leaderboard which contribute for more
	  # than |threshold| and make a radix tree out of them.
	  radix_tree = {}
	  # With no attributed bytes the ratio is undefined (it would divide by zero),
	  # hence no directory is selected.
	  if total_allocated:
	    for blamed_dir, alloc_size in blamed_dirs.most_common():
	      if (1.0 * alloc_size / total_allocated) < threshold:
	        break  # The leaderboard is sorted: all the remaining ones are smaller.
	      RadixTreeInsert(radix_tree, blamed_dir)

	  # The final step consists in generating a rule tree from the radix tree. This
	  # is a pretty straightforward tree-clone operation, they have the same shape.
	  def GenRulesFromRadixTree(radix_tree_node, max_depth, parent_path=''):
	    """Returns the list of rule dicts for the children of |radix_tree_node|."""
	    if max_depth <= 0:
	      return []
	    children = []
	    for node_path, node_children in radix_tree_node.items():
	      full_path = parent_path + node_path
	      children.append({
	          'name': node_path[-16:],
	          'source_path': '^' + re.escape(full_path),
	          'children': GenRulesFromRadixTree(
	              node_children, max_depth - 1, full_path)})
	    return children

	  rules_tree = GenRulesFromRadixTree(radix_tree, max_depth)
	  return LoadRules(str(rules_tree))


	class _NHeapRule(rules.Rule):
	  """A rule which matches allocations by stack trace symbols and source paths.

	  Supported filters (both optional; an allocation has to satisfy all the ones
	  which are specified):
	    stacktrace: a regex, or a list of regexes which must be matched, in order,
	        by the symbol names of the stack frames.
	    source_path: a regex which must be matched by the source file path of at
	        least one stack frame.
	  """

	  def __init__(self, name, filters):
	    """Compiles the regexes of the filters.

	    Args:
	      name: the rule name, forwarded to the |rules.Rule| constructor.
	      filters: a dict which can contain the 'stacktrace' and 'source_path'
	          keys described in the class docstring.

	    Raises:
	      exceptions.MemoryInspectorException: if a regex cannot be compiled.
	    """
	    super(_NHeapRule, self).__init__(name)

	    def Compile(regex, error_format):
	      # |as| is accepted by both Python 2.6+ and Python 3.
	      try:
	        return re.compile(regex)
	      except re.error as descr:
	        raise exceptions.MemoryInspectorException(
	            error_format % (regex, descr))

	    try:
	      string_types = basestring  # Python 2: covers both str and unicode.
	    except NameError:
	      string_types = str  # Python 3 has no basestring.

	    # The 'stacktrace' filter can be either a string (simple case, one regex)
	    # or a list of strings (complex case, see doc in the header of this file).
	    stacktrace_regexs = filters.get('stacktrace', [])
	    if isinstance(stacktrace_regexs, string_types):
	      stacktrace_regexs = [stacktrace_regexs]
	    self._stacktrace_regexs = [
	        Compile(regex, 'Stacktrace regex error "%s" : %s')
	        for regex in stacktrace_regexs]

	    # The 'source_path' regex, instead, simply matches the source file path.
	    self._path_regex = None
	    path_regex = filters.get('source_path')
	    if path_regex:
	      self._path_regex = Compile(path_regex, 'Path regex error "%s" : %s')

	  def Match(self, allocation):
	    """Tells whether |allocation| satisfies all the filters of this rule.

	    Args:
	      allocation: the object whose |stack_trace.frames| are inspected.

	    Returns:
	      True if every specified filter is satisfied (hence also when no filter
	      has been specified at all), False otherwise.
	    """
	    # Match the source file path, if the 'source_path' filter is specified.
	    if self._path_regex:
	      path_matches = any(
	          self._path_regex.search(frame.symbol.source_info[0].source_file_path)
	          for frame in allocation.stack_trace.frames
	          if frame.symbol and frame.symbol.source_info)
	      if not path_matches:
	        return False

	    # Match the stack traces symbols, if the 'stacktrace' filter is specified.
	    if not self._stacktrace_regexs:
	      return True
	    regex_idx = 0
	    last_idx = len(self._stacktrace_regexs) - 1
	    for frame in allocation.stack_trace.frames:
	      if not frame.symbol:
	        continue
	      if not self._stacktrace_regexs[regex_idx].search(frame.symbol.name):
	        continue
	      # The current regex has been matched.
	      if regex_idx == last_idx:
	        return True  # All the provided regexs have been matched, we're happy.
	      # The next regex is looked up starting from the next frame.
	      regex_idx += 1

	    return False  # Not all the provided regexs have been matched.