| // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| // |
| // This proto represents a machine learning model which is used to compute |
| // the probability that a particular page visited by Chrome is phishing. |
| // |
// Note: since the machine learning model is trained on the server-side and then
| // downloaded onto the client it is important that this proto file stays in |
| // sync with the server-side copy. Otherwise, the client may not be able to |
| // parse the server generated model anymore. If you want to change this |
| // protocol definition or you have questions regarding its format please contact |
| // chrome-anti-phishing@googlegroups.com. |
| |
| syntax = "proto2"; |
| |
| option optimize_for = LITE_RUNTIME; |
| |
| package safe_browsing; |
| |
| // This protocol buffer represents a machine learning model that is used in |
| // client-side phishing detection (in Chrome). The client extracts a set |
| // of features from every website the user visits. Extracted features map |
| // feature names to floating point values (e.g., PageSecureLinksFreq -> 0.9). |
| // |
| // To compute the phishing score (i.e., the probability that the website is |
| // phishing) a scorer will simply compute the sum of all rule scores for a |
| // given set of extracted features. The score of a particular rule corresponds |
| // to the product of all feature values that are part of the rule times the |
| // rule weight. If a feature has no value (i.e., is not part of the extracted |
| // features) its value will be set to zero. The overall score is computed |
| // by summing up all the rule scores. This overall score is a logodds and can |
| // be converted to a probability like this: |
| // p = exp(logodds) / (exp(logodds) + 1). |
| // |
| // To make it harder for phishers to reverse engineer our machine learning model |
| // all the features in the model are hashed with a sha256 hash function. The |
| // feature extractors also hash the extracted features before scoring happens. |
message ClientSideModel {
  // In order to save some space we store all the hashed strings in a
  // single repeated field and then the rules as well as page terms
  // and page words refer to an index in that repeated field.  All
  // hashes are SHA-256 hashes stored in binary format.
  repeated bytes hashes = 1;

  // A rule is a conjunction of hashed features.  Its score is the product
  // of the values of all features in the rule times the rule weight; the
  // overall model score is the sum of all rule scores (see the comment on
  // this message's leading documentation for the logodds conversion).
  message Rule {
    // List of indexes into hashes above which are basically hashed
    // features that form the current rule.
    repeated int32 feature = 1;

    // The weight for this particular rule.
    required float weight = 2;
  }

  // List of rules which make up the model.
  repeated Rule rule = 2;

  // List of indexes that point to the hashed page terms that appear in
  // the model.  The hashes are computed over page terms that are encoded
  // as lowercase UTF-8 strings.
  repeated int32 page_term = 3;

  // List of hashed page words.  The page words correspond to all words that
  // appear in page terms.  If the term "one two" is in the list of page terms
  // then "one" and "two" will be in the list of page words.  For page words
  // we don't use SHA-256 because it is too expensive.  We use MurmurHash3
  // instead.  See: https://github.com/aappleby/smhasher.
  repeated fixed32 page_word = 4;

  // Page terms in page_term contain at most this many page words.
  required int32 max_words_per_term = 5;

  // Model version number.  Every model that we train should have a different
  // version number and it should always be larger than the previous model
  // version.
  optional int32 version = 6;

  // List of known bad IP subnets.
  message IPSubnet {
    // The subnet prefix is a valid 16-byte IPv6 address (in network order)
    // that is hashed using SHA-256.
    required bytes prefix = 1;

    // Network prefix size in bits.  Default is an exact-host match.
    optional int32 size = 2 [default = 128];
  }
  repeated IPSubnet bad_subnet = 7;

  // Murmur hash seed that was used to hash the page words.
  optional fixed32 murmur_hash_seed = 8;

  // Maximum number of unique shingle hashes per page.
  optional int32 max_shingles_per_page = 9 [default = 200];

  // The number of words in a shingle.
  optional int32 shingle_size = 10 [default = 4];
}