| # Copyright 2016 The Chromium Authors. All rights reserved. |
| # Use of this source code is governed by a BSD-style license that can be |
| # found in the LICENSE file. |
| |
| # TODO(http://crbug.com/669639): there are lots of ways to make the code |
| # in this file better. We avoid having separate todos per task; instead |
| # see that meta-issue ticket. |
| |
| import logging |
| import math |
| import numpy as np |
| # N.B., ``np.array`` can't take generators; you must pass explicit lists. |
| |
| from libs.math.functions import MemoizedFunction |
| from libs.math.logarithms import logsumexp |
| from libs.math.vectors import vsum |
| |
| EPSILON = 0.00001 |
| _FEATURE_TO_REASON_PRIORITY = {'MinDistance': 0, |
| 'TopFrameIndex': -1, |
| 'FilePathIdf': -2, |
| 'TouchCrashedDirectory': -3, |
| 'TouchCrashedFile': -4} |
| |
| |
| class UnnormalizedLogLinearModel(object): |
| """An unnormalized loglinear model. |
| |
| We use this model for combining a bunch of different features in order |
| to decide who to blame. In particular, the nice thing about loglinear |
| models is that (a) they allow separating the amount of weight we |
| give to each feature from the feature itself, and (b) they allow us |
| to keep adding features at will without needing to worry about the |
| fiddly details needed for it to be a probability model. |
| |
| Throughout the documentation we use the following conventional |
| terminology. The independent variable (aka: the given data which |
| sets up the classification problem) is called ``X``, and particular |
| values of that variable are called ``x``. The dependent variable (aka: |
| the answers/labels returned by classification) is called ``Y``, and |
| particular values of that random variable are called ``y``. The |
| partition function is called ``Z``. And (somewhat non-conventionally) |
| we will call the log of the partition function ``zeta``. |
| |
| This class is distinct from ``LogLinearModel`` in that we do not require |
| a specification of ``Y``. This class is sufficient for determining |
| which ``y`` is the most likely, though it can only return a "score" |
| rather than returning a probability per se. |
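| |
| For illustration, a minimal sketch of how this class might be used |
| (``meta_feature``, ``meta_weight``, ``x``, and ``candidate_ys`` are |
| hypothetical stand-ins, not names defined by this module): |
| |
|   model = UnnormalizedLogLinearModel(meta_feature, meta_weight) |
|   score_given_x = model.Score(x) |
|   # ``Score(x)`` is monotonic w.r.t. the probability, so the |
|   # highest-scoring ``y`` is also the most probable one. |
|   best_y = max(candidate_ys, key=score_given_x) |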
| """ |
| |
| def __init__(self, meta_feature, meta_weight, epsilon=None): |
| """Construct a new model with the meta_feature and meta_weight. |
| |
| Args: |
| meta_feature (MetaFeature): A function ``X -> Y -> MetaFeatureValue`` |
| providing all the features. N.B., for all ``x`` and ``y`` the length |
| of ``meta_feature(x)(y)`` must be the same as the length of |
| ``meta_weight``. |
| meta_weight (MetaWeight): The weights for the features. The keys are |
| the names of the features each weight is for. We take this argument |
| as a dict rather than as a list so that callers needn't worry about |
| what order to provide the weights in. |
| epsilon (float): The absolute-error threshold for considering a |
| weight to be "equal to zero". N.B., this should be a positive |
| number, as we will compare it against the absolute value of |
| each weight. |
| """ |
| self._epsilon = EPSILON if epsilon is None else epsilon |
| |
| self._meta_weight = meta_weight |
| self._meta_weight.DropZeroWeights(self._epsilon) |
| |
| self._quadrance = None |
| # TODO(crbug.com/674752): we need better names for ``self._features``. |
| def _Features(x): |
| """Wrap ``meta_feature`` to memoize things. |
| |
| This outer wrapping takes each ``x`` to a memoized instance of |
| ``_FeaturesGivenX``. That is, for each ``x`` we return a |
| ``MemoizedFunction`` from ``Y`` to ``dict(str to FeatureValue)``. |
| """ |
| # Memoize on ``Y``, to ensure we don't need to recompute |
| # ``FeatureValue``s nor recheck the lengths. |
| return MemoizedFunction(meta_feature(x)) |
| |
| # Memoize on ``X``, to ensure we share the memo tables on ``Y``. |
| self._features = MemoizedFunction(_Features) |
| |
| # TODO(crbug.com/674752): we need better names for ``self._scores``. |
| # N.B., this is just the inner product of ``self._meta_weight`` |
| # against ``self._features(x)``. If we can compute this in some |
| # more efficient way, we should. In particular, we will want to |
| # make the weights sparse, in which case we need to use a sparse |
| # variant of the dot product. |
| self._scores = MemoizedFunction(lambda x: self._features(x).map( |
| lambda fxy: self._meta_weight * fxy)) |
| |
| def ClearWeightBasedMemos(self): |
| """Clear all the memos that depend on the weight covector.""" |
| self._quadrance = None |
| self._scores.ClearMemos() |
| |
| def ClearAllMemos(self): |
| """Clear all memos, even those independent of the weight covector.""" |
| self.ClearWeightBasedMemos() |
| self._features.ClearMemos() |
| |
| @property |
| def meta_weight(self): |
| """The weight covector. |
| |
| At present we return the weights as a dict mapping each feature name to its |
| weight, but in the future that may be replaced by a more general type which |
| specifies the semantics rather than the implementation details. |
| """ |
| return self._meta_weight |
| |
| @meta_weight.setter |
| def meta_weight(self, new_meta_weight): |
| self._meta_weight = new_meta_weight |
| self._meta_weight.DropZeroWeights(self._epsilon) |
| |
| @property |
| def l0(self): # pragma: no cover |
| """The l0-norm of the weight covector. |
| |
| N.B., despite being popularly called the "l0-norm", this isn't |
| actually a norm in the mathematical sense. |
| """ |
| return self._meta_weight.l0 |
| |
| @property |
| def l1(self): # pragma: no cover |
| """The l1 (aka: Manhattan) norm of the weight covector.""" |
| return self._meta_weight.l1 |
| |
| @property |
| def quadrance(self): |
| """The square of the l2 norm of the weight covector. |
| |
| This value is often more helpful to have direct access to, as it |
| avoids the need for non-rational functions (e.g., sqrt) and shows up |
| as its own quantity in many places. Also, computing it directly avoids |
| the error introduced by squaring the square-root of an IEEE-754 float. |
| """ |
| return self._meta_weight.quadrance |
| |
| @property |
| def l2(self): |
| """The l2 (aka Euclidean) norm of the weight covector. |
| |
| If you need the square of the l2-norm, do not use this property. |
| Instead, use the ``quadrance`` property which is more accurate than |
| squaring this one. |
| """ |
| return math.sqrt(self.quadrance) |
| |
| # TODO(crbug.com/673964): something better for detecting "close to log(0)". |
| def LogZeroish(self, x): |
| """Determine whether a float is close enough to log(0). |
| |
| If a ``FeatureValue`` has a (log-domain) score of -inf for a given |
| ``Suspect``, then that suspect has zero probability of being the |
| culprit. We want to filter these suspects out, to clean up the |
| output of classification; so this method encapsulates the logic of |
| that check. |
| |
| Args: |
| x (float): the float to check |
| |
| Returns: |
| ``True`` if ``x`` is close enough to log(0); else ``False``. |
| """ |
| return x < 0 and math.isinf(x) |
| |
| def Features(self, x): |
| """Returns a function mapping ``y`` to its feature vector given ``x``. |
| |
| Args: |
| x (X): the value of the independent variable. |
| |
| Returns: |
| A ``MemoizedFunction`` of type ``Y -> MetaFeatureValue``. |
| """ |
| return self._features(x) |
| |
| def Score(self, x): |
| """Returns a function mapping ``y`` to its "score" given ``x``. |
| |
| Semantically, the "score" of ``y`` given ``x`` is the |
| unnormalized log-domain conditional probability of ``y`` given |
| ``x``. Operationally, we compute this by taking the inner product |
| of the weight covector with the ``Features(x)(y)`` vector. The |
| conditional probability itself can be computed by exponentiating |
| the "score" and normalizing with the partition function. However, |
| for determining the "best" ``y`` we don't need to bother computing |
| the probability, because ``Score(x)`` is monotonic with respect to |
| ``Probability(x)``. |
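| |
| Schematically (a sketch of the relationship, not literal code): |
| |
|   Score(x)(y) = meta_weight * Features(x)(y) |
|   Probability(x)(y) = exp(Score(x)(y)) / Z(x) |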
| |
| Args: |
| x (X): the value of the independent variable. |
| |
| Returns: |
| A ``MemoizedFunction`` of type ``Y -> float``. |
| """ |
| return self._scores(x) |
| |
| def FilterReasonWithWeight(self, reason): |
| """Filters reasons with zero weights. |
| |
| Args: |
| reason (dict): Dict mapping feature name to reason list. |
| |
| Returns: |
| A list of reason strings. |
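| |
| For illustration ('SomeZeroWeightFeature' is a hypothetical feature |
| whose weight was dropped as zero): |
| |
|   model.FilterReasonWithWeight( |
|       {'MinDistance': ['modified lines near the crash'], |
|        'SomeZeroWeightFeature': ['this reason is filtered out']}) |
|   # => ['modified lines near the crash'] |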
| """ |
| flat_weight = self._meta_weight.leaves |
| |
| filtered_reasons = {} |
| for non_zero_feature in flat_weight: |
| if non_zero_feature in reason: |
| filtered_reasons[non_zero_feature] = reason[non_zero_feature] |
| |
| sorted_reasons = sorted( |
| filtered_reasons.items(), |
| key=lambda item: -_FEATURE_TO_REASON_PRIORITY.get( |
| item[0], -float('inf'))) |
| |
| reasons = [] |
| for _, reason_list in sorted_reasons: |
| reasons.extend(reason_list) |
| |
| return reasons |
| |
| |
| class LogLinearModel(UnnormalizedLogLinearModel): |
| """A loglinear probability model. |
| |
| This class is distinct from ``UnnormalizedLogLinearModel`` in that |
| we can provide probabilities (not just scores). However, to do so we |
| require a specification of the subsets of ``Y`` for each ``x``. |
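| |
| For illustration, a minimal sketch (``Y_given_X``, ``meta_feature``, |
| ``meta_weight``, and ``x`` are hypothetical stand-ins): |
| |
|   model = LogLinearModel(Y_given_X, meta_feature, meta_weight) |
|   prob_given_x = model.Probability(x) |
|   # The probabilities over ``Y_given_X(x)`` sum to 1 (up to rounding). |
|   total = sum(prob_given_x(y) for y in Y_given_X(x)) |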
| """ |
| |
| def __init__(self, Y_given_X, meta_feature, meta_weight, epsilon=None): |
| """Construct a new probabilistic model. |
| |
| Args: |
| Y_given_X: a function from ``X`` to an iterable object giving the |
| subset of ``Y`` which has non-zero probability given the |
| ``x``. When in doubt about whether some ``y`` has zero probability |
| or not, it is always safe/correct to return a larger subset of |
| ``Y`` (it will just take more computation time). This is needed |
| for computing the partition function and expectations. N.B., we do |
| not actually need to know/enumerate *all* of ``Y``, only the |
| subsets for each ``x``. |
| meta_feature (MetaFeature): A function ``X -> Y -> MetaFeatureValue`` |
| providing all the features. N.B., for all ``x`` and ``y`` the length |
| of ``meta_feature(x)(y)`` must be the same as the length of |
| ``meta_weight``. |
| meta_weight (MetaWeight): The weights for the features. The keys are |
| the names of the features each weight is for. We take this argument |
| as a dict rather than as a list so that callers needn't worry about |
| what order to provide the weights in. |
| epsilon (float): The absolute-error threshold for considering a |
| weight to be "equal to zero". N.B., this should be a positive |
| number, as we will compare it against the absolute value of |
| each weight. |
| """ |
| super(LogLinearModel, self).__init__(meta_feature, meta_weight, epsilon) |
| |
| self._Y = Y_given_X |
| |
| def _LogZ(x): |
| score_given_x = self._scores(x) |
| return logsumexp([score_given_x(y) for y in self._Y(x)]) |
| self._zetas = MemoizedFunction(_LogZ) |
| |
| def ClearWeightBasedMemos(self): |
| """Clear all the memos that depend on the weight covector.""" |
| super(LogLinearModel, self).ClearWeightBasedMemos() |
| self._zetas.ClearMemos() |
| |
| def Z(self, x): |
| """The normal-domain conditional partition-function given ``x``. |
| |
| If you need the log of the partition function, do not use this |
| method. Instead, use the ``LogZ`` method which computes it directly |
| and thus is more accurate than taking the log of this one. |
| |
| Args: |
| x (X): the value of the independent variable. |
| |
| Returns: |
| The normalizing constant of ``Probability(x)``. |
| """ |
| return math.exp(self.LogZ(x)) |
| |
| def LogZ(self, x): |
| """The log-domain conditional partition-function given ``x``. |
| |
| Args: |
| x (X): the value of the independent variable. |
| |
| Returns: |
| The normalizing constant of ``LogProbability(x)``. |
| """ |
| return self._zetas(x) |
| |
| def Probability(self, x): |
| """The normal-domain distribution over ``y`` given ``x``. |
| |
| That is, ``self.Probability(x)(y)`` returns ``p(y | x; self._meta_weight)`` |
| which is the model's estimation of ``Pr(y|x)``. |
| |
| If you need the log-probability, don't use this method. Instead, |
| use the ``LogProbability`` method which computes it directly and |
| thus is more accurate than taking the log of this one. |
| """ |
| logprob_given_x = self.LogProbability(x) |
| return lambda y: math.exp(logprob_given_x(y)) |
| |
| def LogProbability(self, x): |
| """The log-domain distribution over ``y`` given ``x``. |
| |
| That is, ``self.LogProbability(x)(y)`` returns the log of |
| ``self.Probability(x)(y)``. However, this method computes the |
| log-probabilities directly and so is more accurate and efficient |
| than taking the log of ``Probability``. |
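| |
| Concretely: |
| |
|   LogProbability(x)(y) = Score(x)(y) - LogZ(x) |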
| """ |
| zeta_x = self.LogZ(x) |
| score_given_x = self.Score(x) |
| return lambda y: score_given_x(y) - zeta_x |
| |
| def Expectation(self, x, f): |
| """Compute the expectation of a function with respect to ``Probability(x)``. |
| |
| Args: |
| x (X): the value of the independent variable. |
| f: a function of type ``Y -> np.ndarray(float)``. |
| |
| Returns: |
| An ``np.ndarray`` of ``float`` called the "expected value". N.B., |
| the returned array need not be one that ``f`` itself ever returns; |
| rather, it is the probability-weighted average of all the results. |
| For more information, see Wikipedia |
| <https://en.wikipedia.org/wiki/Expected_value>. |
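| |
| Concretely: |
| |
|   Expectation(x, f) = vsum([Probability(x)(y) * f(y) for y in Y_given_X(x)]) |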
| """ |
| prob_given_x = self.Probability(x) |
| # N.B., the ``*`` below is vector scaling! If we want to make this |
| # method polymorphic in the return type of ``f`` then we'll need an |
| # API that provides both scaling and ``vsum``. |
| return vsum([prob_given_x(y) * f(y) for y in self._Y(x)]) |