# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import cPickle
import logging

from future import Future
from gcloud import datastore

# N.B.: In order to use this module you should have a working cloud development
# environment configured with the gcloud client library (which provides
# gcloud.datastore) installed.
#
# Please see https://cloud.google.com/datastore/docs/getstarted/start_python/

_PROJECT_NAME = 'chrome-apps-doc'
_PERSISTENT_OBJECT_KIND = 'PersistentObjectStoreItem'
_VALUE_PROPERTY_NAME = 'pickled_value'

# The max number of entities to include in a single request. This is capped at
# 500 by the service. In practice we may send fewer due to _MAX_REQUEST_SIZE.
_MAX_BATCH_SIZE = 500

# The maximum entity size (in bytes) allowed by Datastore.
_MAX_ENTITY_SIZE = 1024*1024

# The maximum request size (in bytes) to send to Datastore. This is an
# approximate limit based on the sum of entity blob_value sizes.
_MAX_REQUEST_SIZE = 5*1024*1024


def _CreateEntity(client, name, value):
  '''Creates a PersistentObjectStoreItem entity keyed by |name|, with the
  already-pickled |value| stored, unindexed, in the |pickled_value| property.
  '''
  key = client.key(_PERSISTENT_OBJECT_KIND, name)
  entity = datastore.Entity(
      key=key, exclude_from_indexes=[_VALUE_PROPERTY_NAME])
  entity[_VALUE_PROPERTY_NAME] = value
  return entity


def _CreateBatches(client, data):
  '''Constructs batches of at most _MAX_BATCH_SIZE entities to cover all
  entities defined in |data| without exceeding the request size limit.

  This is a generator yielding (entities, progress, total) tuples, where
  |entities| is the list of entities to put in one request, |progress| is the
  number of entities covered so far, and |total| is the total entity count.
  '''
  def get_size(entity):
    return len(entity[_VALUE_PROPERTY_NAME])

  entities = [_CreateEntity(client, name, value)
              for name, value in data.iteritems()]
  if not entities:
    return

  batch_start = 0
  batch_end = 1
  batch_size = get_size(entities[0])
  while batch_end < len(entities):
    next_size = get_size(entities[batch_end])
    if (batch_size + next_size > _MAX_REQUEST_SIZE or
        batch_end - batch_start >= _MAX_BATCH_SIZE):
      yield entities[batch_start:batch_end], batch_end, len(entities)
      batch_start = batch_end
      batch_size = 0
    else:
      batch_size += next_size
      batch_end = batch_end + 1

  if batch_end > batch_start and batch_start < len(entities):
    yield entities[batch_start:batch_end], batch_end, len(entities)
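

# For illustration only (assumed numbers, not part of the original module):
# with the default limits, 1200 small entries in |data| would be yielded as
# (entities, progress, total) tuples roughly like:
#
#   (entities[0:500],     500,  1200)
#   (entities[500:1000],  1000, 1200)
#   (entities[1000:1200], 1200, 1200)
#
# i.e. a batch is closed either when it reaches _MAX_BATCH_SIZE entities or
# when adding the next entity would push the summed blob sizes past
# _MAX_REQUEST_SIZE.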


def PushData(data, original_data={}):
  '''Pushes a bunch of data into the datastore. The data should be a dict. Each
  key is treated as a namespace, and each value is also a dict. A new datastore
  entry is upserted for every inner key, with the value pickled into the
  |pickled_value| field.

  For example, if given the dictionary:

    {
      'fruit': {
        'apple': 1234,
        'banana': 'yellow',
        'trolling carrot': { 'arbitrarily complex': ['value', 'goes', 'here'] }
      },
      'animal': {
        'sheep': 'baaah',
        'dog': 'woof',
        'trolling cat': 'moo'
      }
    }

  this would result in a push of 6 keys in total, with the following IDs:

    Key('PersistentObjectStoreItem', 'fruit/apple')
    Key('PersistentObjectStoreItem', 'fruit/banana')
    Key('PersistentObjectStoreItem', 'fruit/trolling carrot')
    Key('PersistentObjectStoreItem', 'animal/sheep')
    Key('PersistentObjectStoreItem', 'animal/dog')
    Key('PersistentObjectStoreItem', 'animal/trolling cat')

  If given |original_data|, this will only push key-value pairs for entries that
  are either new or have changed from their original (pickled) value.

  Caveat: Pickling and unpickling a dictionary can (but does not always) change
  its key order. This means that objects will often be seen as changed even when
  they haven't changed.
  '''
  client = datastore.Client(_PROJECT_NAME)

  def flatten(dataset):
    '''Flattens a two-level dict into {'namespace/key': pickled value}.'''
    flat = {}
    for namespace, items in dataset.iteritems():
      for k, v in items.iteritems():
        flat['%s/%s' % (namespace, k)] = cPickle.dumps(v)
    return flat

  logging.info('Flattening data sets...')
  data = flatten(data)
  original_data = flatten(original_data)

  logging.info('Culling new data...')
  # Drop values that are unchanged from |original_data| as well as values too
  # large to fit in a single Datastore entity.
  for k in data.keys():
    if ((k in original_data and original_data[k] == data[k]) or
        (len(data[k]) > _MAX_ENTITY_SIZE)):
      del data[k]

  # Upsert in batches that respect the request size and batch count limits.
  for entities, n, total in _CreateBatches(client, data):
    batch = client.batch()
    for e in entities:
      batch.put(e)
    logging.info('Committing %s/%s entities...' % (n, total))
    batch.commit()
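

if __name__ == '__main__':
  # Illustrative usage sketch only (not part of the original module). This
  # assumes a working Cloud Datastore environment with credentials for the
  # 'chrome-apps-doc' project; the data values below are made up.
  logging.basicConfig(level=logging.INFO)

  sample_data = {
      'fruit': {
          'apple': 1234,
          'banana': 'yellow',
      },
      'animal': {
          'sheep': 'baaah',
          'dog': 'woof',
      },
  }
  # A previous snapshot passed as |original_data| means only new or changed
  # entries (everything except fruit/apple here) get upserted.
  previous_data = {
      'fruit': {
          'apple': 1234,
      },
  }
  PushData(sample_data, original_data=previous_data)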