| // Copyright 2018 The LUCI Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package dsmapper |
| |
| import ( |
| "context" |
| "fmt" |
| "math" |
| "sync" |
| "time" |
| |
| "golang.org/x/sync/errgroup" |
| "google.golang.org/protobuf/proto" |
| |
| "go.chromium.org/luci/gae/service/datastore" |
| |
| "go.chromium.org/luci/common/clock" |
| "go.chromium.org/luci/common/errors" |
| "go.chromium.org/luci/common/logging" |
| "go.chromium.org/luci/common/retry/transient" |
| "go.chromium.org/luci/common/sync/parallel" |
| |
| "go.chromium.org/luci/server/tq" |
| |
| "go.chromium.org/luci/server/dsmapper/dsmapperpb" |
| "go.chromium.org/luci/server/dsmapper/internal/splitter" |
| "go.chromium.org/luci/server/dsmapper/internal/tasks" |
| |
| // Need this to enqueue tasks inside Datastore transactions. |
| _ "go.chromium.org/luci/server/tq/txn/datastore" |
| ) |
| |
| // ID identifies a mapper registered in the controller. |
| // |
// It will be passed across processes, so all processes that execute mapper
// jobs should register the same mappers under the same IDs.
| // |
// The safest approach is to keep mapper IDs unique across the app's history,
// i.e. do NOT reuse them when adding new mappers or significantly changing
// existing ones.
| type ID string |
| |
// Mapper applies some function to a slice of entities, identified by their
// keys.
| // |
// May be called multiple times for the same key (thus it should be idempotent).
| // |
| // Returning a transient error indicates that the processing of this batch of |
| // keys should be retried (even if some keys were processed successfully). |
| // |
| // Returning a fatal error causes the entire shard (and eventually the entire |
| // job) to be marked as failed. The processing of the failed shard stops right |
| // away, but other shards are kept running until completion (or their own |
| // failure). |
| // |
| // The function is called outside of any transactions, so it can start its own |
| // if needed. |
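//
// A minimal conforming sketch (hypothetical, for illustration only): a
// stateless mapper that merely logs the size of each batch it receives.
//
//	func logKeys(ctx context.Context, keys []*datastore.Key) error {
//		logging.Infof(ctx, "Visiting %d keys", len(keys))
//		return nil
//	}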
| type Mapper func(ctx context.Context, keys []*datastore.Key) error |
| |
| // Factory knows how to construct instances of Mapper. |
| // |
| // Factory is supplied by the users of the library and registered in the |
| // controller via RegisterFactory call. |
| // |
// It is used to get a mapper to process a set of pages within a shard. It
// takes a Job (including its Config and Params) and a shard index, so it can
// prepare the mapper for processing this specific shard.
| // |
| // Returning a transient error triggers an eventual retry. Returning a fatal |
| // error causes the shard (eventually the entire job) to be marked as failed. |
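//
// A matching sketch (hypothetical): a factory that ignores the job config and
// the shard index, always returning the stateless logKeys mapper from the
// example above.
//
//	func logKeysFactory(ctx context.Context, j *Job, shardIdx int) (Mapper, error) {
//		return logKeys, nil
//	}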
| type Factory func(ctx context.Context, j *Job, shardIdx int) (Mapper, error) |
| |
| // Controller is responsible for starting, progressing and finishing mapping |
| // jobs. |
| // |
// It should be treated as a global singleton object. Having more than one
// controller in a production application is a bad idea (they'll collide with
// each other since they use the global datastore namespace). It is still
// useful to instantiate multiple controllers in unit tests.
| type Controller struct { |
// MapperQueue is the name of the Cloud Tasks queue to use for mapping jobs.
//
// This queue will perform all "heavy" tasks. It should be configured
// appropriately to allow the desired number of shards to run in parallel.
//
// For example, if the largest submitted job is expected to have 128 shards,
// the max_concurrent_requests setting of the mapper queue should be at least
// 128, otherwise some shards will be stalled waiting for others to finish
// (defeating the purpose of having a large number of shards).
| // |
| // If empty, "default" is used. |
| MapperQueue string |
| |
// ControlQueue is the name of the Cloud Tasks queue to use for control
// signals.
//
// This queue is used very lightly when starting and stopping jobs (roughly
// 2*Shards tasks overall per job). The default queue.yaml settings for such
// a queue should be sufficient (unless you run a lot of different jobs at
// once).
| // |
| // If empty, "default" is used. |
| ControlQueue string |
| |
| m sync.RWMutex |
| mappers map[ID]Factory |
| disp *tq.Dispatcher |
| } |
| |
// Install registers the controller's task queue task handlers in the given
// dispatcher.
| // |
| // This must be done before Controller is used. |
| // |
| // There can be at most one Controller installed into an instance of TQ |
| // dispatcher. Installing more will cause panics. |
| // |
| // If you need multiple different controllers for some reason, create multiple |
| // tq.Dispatchers (with different base URLs, so they don't conflict with each |
| // other) and install them all into the router. |
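//
// A typical setup sketch (hypothetical queue names, assuming the tq.Default
// dispatcher provided by the server/tq module):
//
//	var ctl = &Controller{
//		MapperQueue:  "dsmapper-jobs",
//		ControlQueue: "dsmapper-control",
//	}
//
//	func init() {
//		ctl.Install(&tq.Default)
//	}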
| func (ctl *Controller) Install(disp *tq.Dispatcher) { |
| ctl.m.Lock() |
| defer ctl.m.Unlock() |
| |
| if ctl.disp != nil { |
| panic("mapper.Controller is already installed into a tq.Dispatcher") |
| } |
| ctl.disp = disp |
| |
| controlQueue := ctl.ControlQueue |
| if controlQueue == "" { |
| controlQueue = "default" |
| } |
| mapperQueue := ctl.MapperQueue |
| if mapperQueue == "" { |
| mapperQueue = "default" |
| } |
| |
| disp.RegisterTaskClass(tq.TaskClass{ |
| ID: "dsmapper-split-and-launch", |
| Prototype: &tasks.SplitAndLaunch{}, |
| Kind: tq.Transactional, |
| Queue: controlQueue, |
| Handler: ctl.splitAndLaunchHandler, |
| Quiet: true, |
| }) |
| disp.RegisterTaskClass(tq.TaskClass{ |
| ID: "dsmapper-fan-out-shards", |
| Prototype: &tasks.FanOutShards{}, |
| Kind: tq.Transactional, |
| Queue: controlQueue, |
| Handler: ctl.fanOutShardsHandler, |
| Quiet: true, |
| }) |
| disp.RegisterTaskClass(tq.TaskClass{ |
| ID: "dsmapper-process-shard", |
| Prototype: &tasks.ProcessShard{}, |
| Kind: tq.FollowsContext, |
| Queue: mapperQueue, |
| Handler: ctl.processShardHandler, |
| Quiet: true, |
| }) |
| disp.RegisterTaskClass(tq.TaskClass{ |
| ID: "dsmapper-request-job-state-update", |
| Prototype: &tasks.RequestJobStateUpdate{}, |
| Kind: tq.Transactional, |
| Queue: controlQueue, |
| Handler: ctl.requestJobStateUpdateHandler, |
| Quiet: true, |
| }) |
| disp.RegisterTaskClass(tq.TaskClass{ |
| ID: "dsmapper-update-job-state", |
| Prototype: &tasks.UpdateJobState{}, |
| Kind: tq.NonTransactional, |
| Queue: controlQueue, |
| Handler: ctl.updateJobStateHandler, |
| Quiet: true, |
| }) |
| } |
| |
// tq returns the dispatcher set in Install or panics if it's not set yet.
| // |
| // Grabs the reader lock inside. |
| func (ctl *Controller) tq() *tq.Dispatcher { |
| ctl.m.RLock() |
| defer ctl.m.RUnlock() |
| if ctl.disp == nil { |
| panic("mapper.Controller wasn't installed into tq.Dispatcher yet") |
| } |
| return ctl.disp |
| } |
| |
| // RegisterFactory adds the given mapper factory to the internal registry. |
| // |
// Intended to be used during init() or early in the process initialization.
// Panics if a factory with such an ID has already been registered.
//
// The mapper ID will be used internally to identify which mapper a job should
// be using. If a factory disappears while the job is running (e.g. if the
// service binary is updated and the new binary doesn't have the mapper
// registered anymore), the job ends with a failure.
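//
// A registration sketch (hypothetical ID, reusing logKeysFactory from the
// Factory example above):
//
//	func init() {
//		ctl.RegisterFactory(ID("log-keys/v1"), logKeysFactory)
//	}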
| func (ctl *Controller) RegisterFactory(id ID, m Factory) { |
| ctl.m.Lock() |
| defer ctl.m.Unlock() |
| |
| if _, ok := ctl.mappers[id]; ok { |
| panic(fmt.Sprintf("mapper %q is already registered", id)) |
| } |
| |
| if ctl.mappers == nil { |
| ctl.mappers = make(map[ID]Factory, 1) |
| } |
| ctl.mappers[id] = m |
| } |
| |
| // getFactory returns a registered mapper factory or an error. |
| // |
| // Grabs the reader lock inside. Can return only fatal errors. |
| func (ctl *Controller) getFactory(id ID) (Factory, error) { |
| ctl.m.RLock() |
| defer ctl.m.RUnlock() |
| if m, ok := ctl.mappers[id]; ok { |
| return m, nil |
| } |
| return nil, errors.Reason("no mapper factory with ID %q registered", id).Err() |
| } |
| |
| // initMapper instantiates a Mapper through a registered factory. |
| // |
| // May return fatal and transient errors. |
| func (ctl *Controller) initMapper(ctx context.Context, j *Job, shardIdx int) (Mapper, error) { |
| f, err := ctl.getFactory(j.Config.Mapper) |
| if err != nil { |
| return nil, errors.Annotate(err, "when initializing mapper").Err() |
| } |
| m, err := f(ctx, j, shardIdx) |
| if err != nil { |
| return nil, errors.Annotate(err, "error from mapper factory %q", j.Config.Mapper).Err() |
| } |
| return m, nil |
| } |
| |
| // LaunchJob launches a new mapping job, returning its ID (that can be used to |
| // control it or query its status). |
| // |
| // Launches a datastore transaction inside. |
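//
// A launch sketch (hypothetical values, assuming the "log-keys/v1" mapper from
// the RegisterFactory example and the JobConfig/Query fields defined elsewhere
// in this package):
//
//	jobID, err := ctl.LaunchJob(ctx, &JobConfig{
//		Query:      Query{Kind: "Entity"},
//		Mapper:     ID("log-keys/v1"),
//		ShardCount: 8,
//		PageSize:   256,
//	})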
| func (ctl *Controller) LaunchJob(ctx context.Context, j *JobConfig) (JobID, error) { |
| disp := ctl.tq() |
| |
| if err := j.Validate(); err != nil { |
| return 0, errors.Annotate(err, "bad job config").Err() |
| } |
| if _, err := ctl.getFactory(j.Mapper); err != nil { |
| return 0, errors.Annotate(err, "bad job config").Err() |
| } |
| |
// Prepare and store the job entity, generate its key. Launch a tq task that
// subdivides the key space and launches individual shards. We do this
// asynchronously since it can potentially be slow (for a large number of
// shards).
| var job Job |
| err := runTxn(ctx, func(ctx context.Context) error { |
| now := clock.Now(ctx).UTC() |
| job = Job{ |
| Config: *j, |
| State: dsmapperpb.State_STARTING, |
| Created: now, |
| Updated: now, |
| } |
| if err := datastore.Put(ctx, &job); err != nil { |
| return errors.Annotate(err, "failed to store Job entity").Tag(transient.Tag).Err() |
| } |
| return disp.AddTask(ctx, &tq.Task{ |
| Title: fmt.Sprintf("split:job-%d", job.ID), |
| Payload: &tasks.SplitAndLaunch{ |
| JobId: int64(job.ID), |
| }, |
| }) |
| }) |
| if err != nil { |
| return 0, err |
| } |
| return job.ID, nil |
| } |
| |
| // GetJob fetches a previously launched job given its ID. |
| // |
// Returns ErrNoSuchJob if not found. All other possible errors are transient
// and are marked as such.
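//
// A polling sketch (hypothetical; Job.State and the dsmapperpb states are
// used as elsewhere in this package):
//
//	job, err := ctl.GetJob(ctx, jobID)
//	if err == nil && job.State == dsmapperpb.State_SUCCESS {
//		// The job has finished successfully.
//	}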
| func (ctl *Controller) GetJob(ctx context.Context, id JobID) (*Job, error) { |
| // Even though we could have made getJob public, we want to force API users |
| // to use Controller as a single facade. |
| return getJob(ctx, id) |
| } |
| |
| // AbortJob aborts a job and returns its most recent state. |
| // |
| // Silently does nothing if the job is finished or already aborted. |
| // |
// Returns ErrNoSuchJob if there's no such job at all. All other possible
// errors are transient and are marked as such.
| func (ctl *Controller) AbortJob(ctx context.Context, id JobID) (job *Job, err error) { |
| err = runTxn(ctx, func(ctx context.Context) error { |
| var err error |
| switch job, err = getJob(ctx, id); { |
| case err != nil: |
| return err |
| case isFinalState(job.State) || job.State == dsmapperpb.State_ABORTING: |
| return nil // nothing to abort, already done |
| case job.State == dsmapperpb.State_STARTING: |
| // Shards haven't been launched yet. Kill the job right away. |
| job.State = dsmapperpb.State_ABORTED |
| case job.State == dsmapperpb.State_RUNNING: |
| // Running shards will discover that the job is aborting and will |
| // eventually move into ABORTED state (notifying the job about it). Once |
| // all shards report they are done, the job itself will switch into |
| // ABORTED state. |
| job.State = dsmapperpb.State_ABORTING |
| } |
| job.Updated = clock.Now(ctx).UTC() |
| return errors.Annotate(datastore.Put(ctx, job), "failed to store Job entity").Tag(transient.Tag).Err() |
| }) |
| if err != nil { |
| job = nil // don't return bogus data in case txn failed to land |
| } |
| return |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // Task queue tasks handlers. |
| |
| // errJobAborted is used internally as shard failure status when the job is |
| // being aborted. |
| // |
| // It causes the shard to switch into ABORTED state instead of FAIL. |
| var errJobAborted = errors.New("the job has been aborted") |
| |
| // splitAndLaunchHandler splits the job into shards and enqueues tasks that |
| // process shards. |
| func (ctl *Controller) splitAndLaunchHandler(ctx context.Context, payload proto.Message) error { |
| msg := payload.(*tasks.SplitAndLaunch) |
| now := clock.Now(ctx).UTC() |
| |
| // Fetch job details. Make sure it isn't canceled and isn't running already. |
| job, err := getJobInState(ctx, JobID(msg.JobId), dsmapperpb.State_STARTING) |
| if err != nil || job == nil { |
| return errors.Annotate(err, "in SplitAndLaunch").Err() |
| } |
| |
| // Figure out key ranges for shards. There may be fewer shards than requested |
| // if there are too few entities. |
| dq := job.Config.Query.ToDatastoreQuery() |
| ranges, err := splitter.SplitIntoRanges(ctx, dq, splitter.Params{ |
| Shards: job.Config.ShardCount, |
| Samples: 512, // should be enough for everyone... |
| }) |
| if err != nil { |
| return errors.Annotate(err, "failed to split the query into shards").Tag(transient.Tag).Err() |
| } |
| |
// Create entities that hold the shards' state. Each one is in its own entity
// group, since the combined write rate to them is O(ShardCount), which can
// exceed the write limits of a single entity group.
| shards := make([]*shard, len(ranges)) |
| for idx, rng := range ranges { |
| shards[idx] = &shard{ |
| JobID: job.ID, |
| Index: idx, |
| State: dsmapperpb.State_STARTING, |
| Range: rng, |
| ExpectedCount: -1, |
| Created: now, |
| Updated: now, |
| } |
| } |
| |
// Calculate the number of entities in each shard to track shard processing
// progress. Note that this can be very slow if there are many entities.
| if job.Config.TrackProgress { |
| logging.Infof(ctx, "Estimating the size of each shard...") |
| if err := fetchShardSizes(ctx, dq, shards); err != nil { |
| return errors.Annotate(err, "when estimating shard sizes").Err() |
| } |
| } |
| |
// We use auto-generated keys for shards to make sure a crashed SplitAndLaunch
// task retries cleanly, even if the underlying key space we are mapping over
// changes between the retries (which would make a naive put using
// a "<job-id>:<index>" key non-idempotent!).
| logging.Infof(ctx, "Instantiating shards...") |
| if err := datastore.Put(ctx, shards); err != nil { |
| return errors.Annotate(err, "failed to store shards").Tag(transient.Tag).Err() |
| } |
| |
// Prepare shardList, which is essentially a manual fully-consistent index for
// the Job -> [Shard] relation. We can't use a regular index, since the shards
// are all in different entity groups (see the O(ShardCount) argument above).
| // |
| // Log the resulting shards along the way. |
| shardsEnt := shardList{ |
| Parent: datastore.KeyForObj(ctx, job), |
| Shards: make([]int64, len(shards)), |
| } |
| for idx, s := range shards { |
| shardsEnt.Shards[idx] = s.ID |
| |
| l, r := "-inf", "+inf" |
| if s.Range.Start != nil { |
| l = s.Range.Start.String() |
| } |
| if s.Range.End != nil { |
| r = s.Range.End.String() |
| } |
| count := "" |
if s.ExpectedCount != -1 { // -1 means the size wasn't estimated
| count = fmt.Sprintf(" (%d entities)", s.ExpectedCount) |
| } |
| logging.Infof(ctx, "Shard #%d is %d: %s - %s%s", idx, s.ID, l, r, count) |
| } |
| |
| // Transactionally associate shards with the job and launch the TQ task that |
| // kicks off the processing of each individual shard. We use an intermediary |
| // task for this since transactionally launching O(ShardCount) tasks hits TQ |
| // transaction limits. |
| // |
| // If SplitAndLaunch crashes before this transaction lands, there'll be some |
| // orphaned Shard entities, no big deal. |
| logging.Infof(ctx, "Updating the job and launching the fan out task...") |
| return runTxn(ctx, func(ctx context.Context) error { |
| job, err := getJobInState(ctx, JobID(msg.JobId), dsmapperpb.State_STARTING) |
| if err != nil || job == nil { |
| return errors.Annotate(err, "in SplitAndLaunch txn").Err() |
| } |
| |
| job.State = dsmapperpb.State_RUNNING |
| job.Updated = now |
| if err := datastore.Put(ctx, job, &shardsEnt); err != nil { |
| return errors.Annotate(err, |
| "when storing Job %d and ShardList with %d shards", job.ID, len(shards), |
| ).Tag(transient.Tag).Err() |
| } |
| |
| return ctl.tq().AddTask(ctx, &tq.Task{ |
| Title: fmt.Sprintf("fanout:job-%d", job.ID), |
| Payload: &tasks.FanOutShards{ |
| JobId: int64(job.ID), |
| }, |
| }) |
| }) |
| } |
| |
// fetchShardSizes makes a bunch of Count() queries to figure out the size of
// each shard.
| // |
| // Updates ExpectedCount in-place. |
| func fetchShardSizes(ctx context.Context, baseQ *datastore.Query, shards []*shard) error { |
| ctx, cancel := clock.WithTimeout(ctx, 10*time.Minute) |
| defer cancel() |
| |
| err := parallel.WorkPool(32, func(tasks chan<- func() error) { |
| for _, sh := range shards { |
| sh := sh |
| tasks <- func() error { |
| n, err := datastore.CountBatch(ctx, 1024, sh.Range.Apply(baseQ)) |
| if err == nil { |
| sh.ExpectedCount = n |
| } |
| return errors.Annotate(err, "for shard #%d", sh.Index).Err() |
| } |
| } |
| }) |
| |
| return transient.Tag.Apply(err) |
| } |
| |
| // fanOutShardsHandler fetches a list of shards from the job and launches |
| // named ProcessShard tasks, one per shard. |
| func (ctl *Controller) fanOutShardsHandler(ctx context.Context, payload proto.Message) error { |
| msg := payload.(*tasks.FanOutShards) |
| |
// Make sure the job is still present. If it is aborted, we still need to
// launch the shards, so they notice they are being aborted. We could try
// to abort all shards right here and now, but that basically means
// implementing an alternative shard abort flow. It seems simpler to just let
// the regular flow proceed.
| job, err := getJobInState(ctx, JobID(msg.JobId), dsmapperpb.State_RUNNING, dsmapperpb.State_ABORTING) |
| if err != nil || job == nil { |
| return errors.Annotate(err, "in FanOutShards").Err() |
| } |
| |
// Grab the list of shards created in SplitAndLaunch. It must exist at this
// point, since the job has already moved past the STARTING state.
| shardIDs, err := job.fetchShardIDs(ctx) |
| if err != nil { |
| return errors.Annotate(err, "in FanOutShards").Err() |
| } |
| |
// Enqueue a bunch of named ProcessShard tasks (one per shard) to actually
// launch shard processing. This is an idempotent operation, so if FanOutShards
// crashes midway and is retried later, nothing bad happens.
| eg, ctx := errgroup.WithContext(ctx) |
| tq := ctl.tq() |
| for _, sid := range shardIDs { |
| task := makeProcessShardTask(job.ID, sid, 0, true) |
| eg.Go(func() error { return tq.AddTask(ctx, task) }) |
| } |
| return eg.Wait() |
| } |
| |
// processShardHandler reads a bunch of entities (up to PageSize) and hands
// them to the mapper.
//
// After doing this in a loop for some duration (1 min by default), it
// checkpoints the state and reenqueues itself to resume mapping in another
// instance of the task. This keeps each processing TQ task relatively small,
// so it doesn't eat a lot of memory or produce gigantic unreadable logs. It
// also makes TQ's "Pause queue" button handier.
| func (ctl *Controller) processShardHandler(ctx context.Context, payload proto.Message) error { |
| msg := payload.(*tasks.ProcessShard) |
| |
| // Grab the shard. This returns (nil, nil) if this Task Queue task is stale |
| // (based on taskNum) and should be silently skipped. |
| sh, err := getActiveShard(ctx, msg.ShardId, msg.TaskNum) |
| if err != nil || sh == nil { |
| return errors.Annotate(err, "when fetching shard state").Err() |
| } |
| ctx = logging.SetField(ctx, "shardIdx", sh.Index) |
| |
| logging.Infof(ctx, |
| "Resuming processing of the shard (launched %s ago)", |
| clock.Now(ctx).Sub(sh.Created)) |
| |
| // Grab the job config, make sure the job is still active. |
| job, err := getJobInState(ctx, JobID(msg.JobId), dsmapperpb.State_RUNNING, dsmapperpb.State_ABORTING) |
| if err != nil || job == nil { |
| return errors.Annotate(err, "in ProcessShard").Err() |
| } |
| |
| // If the job is being killed, kill the shard as well. This will eventually |
| // notify the job about shard's completion. Once all shards are done, the |
| // job will switch into ABORTED state. |
| if job.State == dsmapperpb.State_ABORTING { |
| return ctl.finishShard(ctx, sh.ID, 0, errJobAborted) |
| } |
| |
// Prepare the mapper by giving the factory the job's parameters.
| mapper, err := ctl.initMapper(ctx, job, sh.Index) |
| switch { |
| case transient.Tag.In(err): |
| return errors.Annotate(err, "transient error when instantiating a mapper").Err() |
| case err != nil: |
| // Kill the shard if the factory returns a fatal error. |
| return ctl.finishShard(ctx, sh.ID, 0, err) |
| } |
| |
| baseQ := job.Config.Query.ToDatastoreQuery() |
| lastKey := sh.ResumeFrom |
| keys := make([]*datastore.Key, 0, job.Config.PageSize) |
| |
| shardDone := false // true when finished processing the shard |
| pageCount := 0 // how many pages processed successfully |
| itemCount := int64(0) // how many entities processed successfully |
| |
// A soft deadline for when to checkpoint the progress and reenqueue the
// processing task. We never abort processing of a page midway (it causes too
// many complications), so if the mapper is extremely slow, it may end up
// running longer than this deadline.
| dur := time.Minute |
| if job.Config.TaskDuration > 0 { |
| dur = job.Config.TaskDuration |
| } |
| deadline := clock.Now(ctx).Add(dur) |
| |
// Optionally also put a limit on the number of processed pages. Useful if the
// mapper is somehow leaking resources (not sure it is possible in Go, but
// it was definitely possible in Python).
| pageCountLimit := math.MaxInt32 |
| if job.Config.PagesPerTask > 0 { |
| pageCountLimit = job.Config.PagesPerTask |
| } |
| |
| for clock.Now(ctx).Before(deadline) && pageCount < pageCountLimit { |
| rng := sh.Range |
| if lastKey != nil { |
| rng.Start = lastKey |
| } |
| if rng.IsEmpty() { |
| shardDone = true |
| break |
| } |
| |
// Fetch the next batch of keys. Return an error to the outer scope where it
// will eventually bubble up to TQ (so the task is retried with exponential
// backoff).
| logging.Infof(ctx, "Fetching the next batch...") |
| q := rng.Apply(baseQ).Limit(int32(job.Config.PageSize)).KeysOnly(true) |
| keys = keys[:0] |
| if err = datastore.GetAll(ctx, q, &keys); err != nil { |
| err = errors.Annotate(err, "when querying for keys").Tag(transient.Tag).Err() |
| break |
| } |
| |
| // No results within the range? Processing of the shard is complete! |
| if len(keys) == 0 { |
| shardDone = true |
| break |
| } |
| |
| // Let the mapper do its thing. Remember where to resume from. |
| logging.Infof(ctx, |
| "Processing %d entities: %s - %s", |
| len(keys), |
| keys[0].String(), |
| keys[len(keys)-1].String()) |
| if err = mapper(ctx, keys); err != nil { |
| err = errors.Annotate(err, "while mapping %d keys", len(keys)).Err() |
| break |
| } |
| lastKey = keys[len(keys)-1] |
| pageCount++ |
| itemCount += int64(len(keys)) |
| |
// Note: at this point we might try to checkpoint the progress, but we must
// be careful not to exceed the 1 transaction per second limit. Considering we
// also MUST checkpoint the progress at the end of the task, it is a bit
// tricky to guarantee no two checkpoints are closer than 1 sec. We could do
// silly things like sleep 1 sec before the last checkpoint, but that provides
// no guarantees.
| // |
| // So instead we store the progress after the deadline is up. If the task |
| // crashes midway, up to 1 min of work will be retried. No big deal. |
| } |
| |
// We are done with the shard when we have either processed its entire range
// or failed with a fatal error. finishShard takes care of notifying the
// parent job about the shard's completion.
| if shardDone || (err != nil && !transient.Tag.In(err)) { |
| return ctl.finishShard(ctx, sh.ID, itemCount, err) |
| } |
| |
| if lastKey != nil { |
| logging.Infof(ctx, "The shard processing will resume from %s", lastKey) |
| } else { |
| logging.Infof(ctx, "The shard processing will resume from scratch") |
| } |
| |
| // If the shard isn't done and we made no progress at all, then we hit |
| // a transient error. Ask TQ to retry. |
| if pageCount == 0 { |
| return err |
| } |
| |
// Otherwise we need to checkpoint the progress and either retry this task
// (on transient errors, to get an exponential backoff from TQ) or start
// a new task.
| txnErr := shardTxn(ctx, sh.ID, func(ctx context.Context, sh *shard) (bool, error) { |
| switch { |
| case sh.ProcessTaskNum != msg.TaskNum: |
| logging.Warningf(ctx, "Unexpected shard state: its ProcessTaskNum is %d != %d", sh.ProcessTaskNum, msg.TaskNum) |
| return false, nil // some other task is already running |
| case sh.ResumeFrom != nil && lastKey.Less(sh.ResumeFrom): |
| logging.Warningf(ctx, "Unexpected shard state: its ResumeFrom is %s >= %s", sh.ResumeFrom, lastKey) |
| return false, nil // someone already claimed to process further, let them proceed |
| } |
| |
| sh.State = dsmapperpb.State_RUNNING |
| sh.ResumeFrom = lastKey |
| sh.ProcessedCount += itemCount |
| |
| // If the processing failed, just store the progress, but do not start a |
| // new TQ task. Retry the current task instead (to get exponential backoff). |
| if err != nil { |
| return true, nil |
| } |
| |
| // Otherwise launch a new task in the chain. This essentially "resets" |
| // the exponential backoff counter. |
| sh.ProcessTaskNum++ |
| return true, ctl.tq().AddTask(ctx, |
| makeProcessShardTask(sh.JobID, sh.ID, sh.ProcessTaskNum, false)) |
| }) |
| |
| switch { |
| case err != nil && txnErr == nil: |
| return err |
| case err == nil && txnErr != nil: |
| return errors.Annotate(txnErr, "when storing shard progress").Err() |
| case err != nil && txnErr != nil: |
| return errors.Annotate(txnErr, "when storing shard progress after a transient error (%s)", err).Err() |
| default: // (nil, nil) |
| return nil |
| } |
| } |
| |
| // finishShard marks the shard as finished (with status based on shardErr) and |
| // emits a task to update the parent job's status. |
| func (ctl *Controller) finishShard(ctx context.Context, shardID, processedCount int64, shardErr error) error { |
| err := shardTxn(ctx, shardID, func(ctx context.Context, sh *shard) (save bool, err error) { |
| runtime := clock.Now(ctx).Sub(sh.Created) |
| switch { |
| case shardErr == errJobAborted: |
| logging.Warningf(ctx, "The job has been aborted, aborting the shard after it has been running %s", runtime) |
| sh.State = dsmapperpb.State_ABORTED |
| sh.Error = errJobAborted.Error() |
| case shardErr != nil: |
| logging.Errorf(ctx, "The shard processing failed in %s with error: %s", runtime, shardErr) |
| sh.State = dsmapperpb.State_FAIL |
| sh.Error = shardErr.Error() |
| default: |
| logging.Infof(ctx, "The shard processing finished successfully in %s", runtime) |
| sh.State = dsmapperpb.State_SUCCESS |
| } |
| sh.ProcessedCount += processedCount |
| return true, ctl.requestJobStateUpdate(ctx, sh.JobID, sh.ID) |
| }) |
| return errors.Annotate(err, "when marking the shard as finished").Err() |
| } |
| |
| // makeProcessShardTask creates a ProcessShard tq.Task. |
| // |
| // If 'named' is true, assigns it a name. Tasks are named based on their shard |
| // IDs and an index in the chain of ProcessShard tasks (task number), so that |
| // on retries we don't rekick already finished tasks. |
| func makeProcessShardTask(job JobID, shardID, taskNum int64, named bool) *tq.Task { |
// Note: strictly speaking, including the job ID in the task name is
// redundant, since shardID is already globally unique, but it doesn't hurt.
// It is useful for debugging and when looking at logs and pending TQ tasks.
| t := &tq.Task{ |
| Title: fmt.Sprintf("map:job-%d-shard-%d-task-%d", job, shardID, taskNum), |
| Payload: &tasks.ProcessShard{ |
| JobId: int64(job), |
| ShardId: shardID, |
| TaskNum: taskNum, |
| }, |
| } |
| if named { |
| t.DeduplicationKey = fmt.Sprintf("v1-%d-%d-%d", job, shardID, taskNum) |
| } |
| return t |
| } |
| |
// requestJobStateUpdate submits a RequestJobStateUpdate task, which eventually
// causes updateJobStateHandler to execute.
| func (ctl *Controller) requestJobStateUpdate(ctx context.Context, jobID JobID, shardID int64) error { |
| return ctl.tq().AddTask(ctx, &tq.Task{ |
| Title: fmt.Sprintf("notify:job-%d-shard-%d", jobID, shardID), |
| Payload: &tasks.RequestJobStateUpdate{ |
| JobId: int64(jobID), |
| ShardId: shardID, |
| }, |
| }) |
| } |
| |
// requestJobStateUpdateHandler is called whenever the state of some shard
// changes.
//
// It forwards this notification to the job (specifically to
// updateJobStateHandler), throttling the rate to ~0.5 QPS to avoid
// overwhelming the job's entity group with a high write rate.
| func (ctl *Controller) requestJobStateUpdateHandler(ctx context.Context, payload proto.Message) error { |
| msg := payload.(*tasks.RequestJobStateUpdate) |
| |
| // Throttle to once per 2 sec (and make sure it is always in the future). We |
| // rely here on a pretty good (< .5s maximum skew) clock sync on servers. |
| eta := clock.Now(ctx).Unix() |
| eta = (eta/2 + 1) * 2 |
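// E.g. now=1001 gives eta=1002, and now=1002 gives eta=1004: we always pick
// the next even second strictly after 'now', so all notifications within
// a ~2 sec window dedup into a single UpdateJobState task.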
| dedupKey := fmt.Sprintf("update-job-state-v1:%d:%d", msg.JobId, eta) |
| |
| err := ctl.tq().AddTask(ctx, &tq.Task{ |
| DeduplicationKey: dedupKey, |
| Title: fmt.Sprintf("update:job-%d", msg.JobId), |
| ETA: time.Unix(eta, 0), |
| Payload: &tasks.UpdateJobState{JobId: msg.JobId}, |
| }) |
| return errors.Annotate(err, "when adding UpdateJobState task").Err() |
| } |
| |
// updateJobStateHandler is called some time after one or more shards have
// changed state.
| // |
| // It calculates overall job state based on the state of its shards. |
| func (ctl *Controller) updateJobStateHandler(ctx context.Context, payload proto.Message) error { |
| msg := payload.(*tasks.UpdateJobState) |
| |
| // Get the job and all its shards in their most recent state. |
| job, err := getJobInState(ctx, JobID(msg.JobId), dsmapperpb.State_RUNNING, dsmapperpb.State_ABORTING) |
| if err != nil || job == nil { |
| return errors.Annotate(err, "in UpdateJobState").Err() |
| } |
| shards, err := job.fetchShards(ctx) |
| if err != nil { |
| return errors.Annotate(err, "failed to fetch shards").Err() |
| } |
| |
| // Switch the job into a final state only when all shards are done running. |
| perState := make(map[dsmapperpb.State]int, len(dsmapperpb.State_name)) |
| finished := 0 |
| for _, sh := range shards { |
| logging.Infof(ctx, "Shard #%d (%d) is in state %s", sh.Index, sh.ID, sh.State) |
| perState[sh.State]++ |
| if isFinalState(sh.State) { |
| finished++ |
| } |
| } |
| if finished != len(shards) { |
| return nil |
| } |
| |
| jobState := dsmapperpb.State_SUCCESS |
| switch { |
| case perState[dsmapperpb.State_ABORTED] != 0: |
| jobState = dsmapperpb.State_ABORTED |
| case perState[dsmapperpb.State_FAIL] != 0: |
| jobState = dsmapperpb.State_FAIL |
| } |
| |
| return runTxn(ctx, func(ctx context.Context) error { |
| job, err := getJobInState(ctx, JobID(msg.JobId), dsmapperpb.State_RUNNING, dsmapperpb.State_ABORTING) |
| if err != nil || job == nil { |
| return errors.Annotate(err, "in UpdateJobState txn").Err() |
| } |
| |
// Make sure an aborting job ends up in the ABORTED state, even if all its
// shards managed to finish. It looks weird when an ABORTING job moves
// into e.g. SUCCESS state.
| if job.State == dsmapperpb.State_ABORTING { |
| job.State = dsmapperpb.State_ABORTED |
| } else { |
| job.State = jobState |
| } |
| job.Updated = clock.Now(ctx).UTC() |
| |
| runtime := job.Updated.Sub(job.Created) |
| switch job.State { |
| case dsmapperpb.State_SUCCESS: |
| logging.Infof(ctx, "The job finished successfully in %s", runtime) |
| case dsmapperpb.State_FAIL: |
| logging.Errorf(ctx, "The job finished with %d shards failing in %s", perState[dsmapperpb.State_FAIL], runtime) |
| for _, sh := range shards { |
| if sh.State == dsmapperpb.State_FAIL { |
| logging.Errorf(ctx, "Shard #%d (%d) error - %s", sh.Index, sh.ID, sh.Error) |
| } |
| } |
| case dsmapperpb.State_ABORTED: |
| logging.Warningf(ctx, "The job has been aborted after %s: %d shards succeeded, %d shards failed, %d shards aborted", |
| runtime, perState[dsmapperpb.State_SUCCESS], perState[dsmapperpb.State_FAIL], perState[dsmapperpb.State_ABORTED]) |
| } |
| |
| return transient.Tag.Apply(datastore.Put(ctx, job)) |
| }) |
| } |