/*
* Vim:noexpandtab:shiftwidth=8:tabstop=8:
*
* Copyright (C) 2010, The Linux Box Corporation
* Contributor : Matt Benjamin <matt@linuxbox.com>
*
* Some portions Copyright CEA/DAM/DIF (2008)
* contributeur : Philippe DENIEL philippe.deniel@cea.fr
* Thomas LEIBOVICI thomas.leibovici@cea.fr
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301 USA
*
* -------------
*/
/**
* @addtogroup cache_inode
* @{
*/
#include "config.h"
#include <sys/types.h>
#include <sys/param.h>
#include <time.h>
#include <pthread.h>
#include <assert.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <misc/timespec.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include "fsal.h"
#include "nfs_core.h"
#include "log.h"
#include "cache_inode.h"
#include "cache_inode_lru.h"
#include "abstract_atomic.h"
#include "cache_inode_hash.h"
#include "gsh_intrinsic.h"
#include "sal_functions.h"
#include "nfs_exports.h"
/**
*
* @file cache_inode_lru.c
* @author Matt Benjamin <matt@linuxbox.com>
* @brief Constant-time cache inode cache management implementation
*/
/**
* @page LRUOverview LRU Overview
*
* This module implements a constant-time cache management strategy
* based on LRU. Some ideas are taken from 2Q [Johnson and Shasha 1994]
* and MQ [Zhou, Chen, Li 2004]. In this system, cache management does
* interact with cache entry lifecycle, but the lru queue is not a garbage
* collector. Most importantly, cache management operations execute in constant
* time, as expected with LRU (and MQ).
*
* Cache entries in use by a currently-active protocol request (or other
* operation) have a positive refcount, and therefore should not be present
* at the cold end of an lru queue if the cache is well-sized.
*
* Cache entries with lock and open state are not eligible for collection
* under ordinary circumstances, so are kept on a separate lru_pinned
* list to retain constant time.
*
* As noted below, initial references to cache entries may only be granted
* under the cache inode hash table latch. Likewise, entries must first be
* made unreachable from the cache inode hash table, then independently reach
* a refcnt of 0, before they may be disposed or recycled.
*/
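/*
 * Illustrative lifecycle (a sketch of the rules above, not a verbatim
 * call sequence from this module):
 *
 *	1. a lookup or create takes the hash table latch and grants an
 *	   initial reference, raising the refcount above the sentinel;
 *	2. the protocol request uses the entry, then drops its reference
 *	   with cache_inode_lru_unref();
 *	3. eviction first removes the entry from the hash table (making
 *	   it unreachable), and only once the refcount independently
 *	   reaches 0 may the entry be disposed of or recycled.
 */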
struct lru_state lru_state;
/**
* A single queue structure.
*/
struct lru_q {
struct glist_head q; /* LRU is at HEAD, MRU at tail */
enum lru_q_id id;
uint64_t size;
};
/**
* A single queue lane, holding both movable and pinned entries.
*/
struct lru_q_lane {
struct lru_q L1;
struct lru_q L2;
struct lru_q pinned; /* uncollectable, due to state */
struct lru_q cleanup; /* deferred cleanup */
pthread_mutex_t mtx;
/* LRU thread scan position */
struct {
bool active;
struct glist_head *glist;
struct glist_head *glistn;
} iter;
struct {
char *func;
uint32_t line;
} locktrace;
CACHE_PAD(0);
};
#define QLOCK(qlane) \
do { \
PTHREAD_MUTEX_lock(&(qlane)->mtx); \
(qlane)->locktrace.func = (char *) __func__; \
(qlane)->locktrace.line = __LINE__; \
} while (0)
#define QUNLOCK(qlane) \
PTHREAD_MUTEX_unlock(&(qlane)->mtx)
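/*
 * Typical use of the lane lock (an illustrative sketch): all queue
 * manipulation for a lane is bracketed by QLOCK/QUNLOCK, and QLOCK
 * records the locking call site, which helps diagnose a stuck lane
 * mutex:
 *
 *	struct lru_q_lane *qlane = &LRU[lane];
 *
 *	QLOCK(qlane);
 *	... move entries between qlane->L1, qlane->L2, qlane->pinned ...
 *	QUNLOCK(qlane);
 */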
/**
* A multi-level LRU algorithm inspired by MQ [Zhou]. Transition from
* L1 to L2 implies various checks (open files, etc.) have been
* performed, which ensures they are performed only once. A
* correspondence to the "scan resistance" property of 2Q and MQ is
* accomplished by recycling/clean loads onto the LRU of L1. Async
* processing onto L2 constrains oscillation in this algorithm.
*/
static struct lru_q_lane LRU[LRU_N_Q_LANES];
/**
* This is a global counter of files opened by cache_inode. This is
* preliminary and expected to go away. The problems with this method are
* that it overcounts file descriptors for FSALs that don't use them
* for open files and that, under the Lieb Rearchitecture, FSALs will be
* responsible for caching their own file descriptors, with
* interfaces for Cache_Inode to interrogate them as to usage or to
* instruct them to close them.
*/
size_t open_fd_count = 0;
/**
* The refcount mechanism distinguishes 3 key object states:
*
* 1. unreferenced (unreachable)
* 2. unincremented, but reachable
* 3. incremented
*
* It seems most convenient to make unreferenced correspond to refcount==0.
* Then refcount==1 is a SENTINEL_REFCOUNT in which the only reference to
* the entry is the set of functions which can grant new references. An
* object with refcount > 1 has been referenced by some thread, which must
* release its reference at some point.
*
* More specifically, in the current implementation, reachability is
* serialized by the cache lookup table latch.
*
* Currently, I propose to distinguish between objects with positive refcount
* and objects with state. The latter could be evicted, in the normal case,
* only with loss of protocol correctness, but may have only the sentinel
* refcount. To preserve constant time operation, they are stored in an
* independent partition of the LRU queue.
*/
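/*
 * Worked example of the states above (illustrative): a freshly created
 * entry starts with refcount == 2, the sentinel plus the creating
 * thread's reference.  When that thread drops its reference the count
 * falls to LRU_SENTINEL_REFCOUNT (1): the entry is reachable but
 * unreferenced, and only the hash table (under its latch) may grant
 * new references.  Once the entry has been made unreachable and the
 * sentinel reference is returned, the count reaches 0 and the entry
 * may be disposed of or recycled.
 */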
static struct fridgethr *lru_fridge;
enum lru_edge {
LRU_HEAD, /* LRU */
LRU_TAIL /* MRU */
};
static const uint32_t FD_FALLBACK_LIMIT = 0x400;
/* Some helper macros */
#define LRU_NEXT(n) \
(atomic_inc_uint32_t(&(n)) % LRU_N_Q_LANES)
/* Dequeue the node lru from the queue q it is on. Safe to use even if
* the current thread is not the LRU thread: if the LRU thread's L1
* iterator points at the node being removed, the iterator is advanced
* first. The node's lane (qlane) is derived inside the macro. */
#define LRU_DQ_SAFE(lru, q) \
do { \
if ((lru)->qid == LRU_ENTRY_L1) { \
struct lru_q_lane *qlane = &LRU[(lru)->lane]; \
if (unlikely((qlane->iter.active) && \
((&(lru)->q) == qlane->iter.glistn))) { \
qlane->iter.glistn = (lru)->q.next; \
} \
} \
glist_del(&(lru)->q); \
--((q)->size); \
} while (0)
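/* Sketch of the interaction with the LRU thread (illustrative): while
 * lru_run() walks L1 with glist_for_each_safe(iter.glist, iter.glistn,
 * ...), another thread deleting the node that iter.glistn points at
 * would leave the iterator dangling; LRU_DQ_SAFE therefore advances
 * iter.glistn past the node being removed before unlinking it, so the
 * scan can continue safely. */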
#define LRU_ENTRY_L1_OR_L2(e) \
(((e)->lru.qid == LRU_ENTRY_L2) || \
((e)->lru.qid == LRU_ENTRY_L1))
#define LRU_ENTRY_RECLAIMABLE(e, n) \
(LRU_ENTRY_L1_OR_L2(e) && \
((n) == LRU_SENTINEL_REFCOUNT+1) && \
((e)->fh_hk.inavl))
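/* Note on the "+1" above: a thread testing a reclaim candidate holds
 * one reference of its own on it, so a reclaimable entry shows exactly
 * the sentinel reference plus that single reference and nothing else. */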
/**
* @brief Initialize a single base queue.
*
* This function initializes a single queue partition (L1, L2, pinned,
* cleanup).
*/
static inline void
lru_init_queue(struct lru_q *q, enum lru_q_id qid)
{
glist_init(&q->q);
q->id = qid;
q->size = 0;
}
static inline void
lru_init_queues(void)
{
int ix;
for (ix = 0; ix < LRU_N_Q_LANES; ++ix) {
struct lru_q_lane *qlane = &LRU[ix];
/* one mutex per lane */
PTHREAD_MUTEX_init(&qlane->mtx, NULL);
/* init iterator */
qlane->iter.active = false;
/* init lane queues */
lru_init_queue(&LRU[ix].L1, LRU_ENTRY_L1);
lru_init_queue(&LRU[ix].L2, LRU_ENTRY_L2);
lru_init_queue(&LRU[ix].pinned, LRU_ENTRY_PINNED);
lru_init_queue(&LRU[ix].cleanup, LRU_ENTRY_CLEANUP);
}
}
/**
* @brief Return a pointer to an entry's current queue
*
* This function returns a pointer to the queue on which entry is linked,
* or NULL if entry is not on any queue.
*
* The lane lock corresponding to entry is LOCKED.
*
* @param[in] entry The entry.
*
* @return A pointer to entry's current queue, NULL if none.
*/
static inline struct lru_q *
lru_queue_of(cache_entry_t *entry)
{
struct lru_q *q;
switch (entry->lru.qid) {
case LRU_ENTRY_PINNED:
q = &LRU[(entry->lru.lane)].pinned;
break;
case LRU_ENTRY_L1:
q = &LRU[(entry->lru.lane)].L1;
break;
case LRU_ENTRY_L2:
q = &LRU[(entry->lru.lane)].L2;
break;
case LRU_ENTRY_CLEANUP:
q = &LRU[(entry->lru.lane)].cleanup;
break;
default:
/* LRU_NO_LANE */
q = NULL;
break;
} /* switch */
return q;
}
/**
* @brief Get the appropriate lane for a cache_entry
*
* This function gets the LRU lane by taking the modulus of the
* supplied pointer.
*
* @param[in] entry A pointer to a cache entry
*
* @return The LRU lane in which that entry should be stored.
*/
static inline uint32_t
lru_lane_of_entry(cache_entry_t *entry)
{
return (uint32_t) (((uintptr_t) entry) % LRU_N_Q_LANES);
}
/**
* @brief Insert an entry into the specified queue and lane
*
* This function inserts the entry into the supplied queue on the given
* lane, and updates the entry to hold the queue id and lane.
*
* The caller MUST NOT hold a lock on the queue lane.
*
* @param[in] entry The entry to insert
* @param[in] q The queue to insert on
* @param[in] lane The lane corresponding to entry address
* @param[in] edge One of LRU_HEAD (LRU) or LRU_TAIL (MRU)
*/
static inline void
lru_insert_entry(cache_entry_t *entry, struct lru_q *q,
uint32_t lane, enum lru_edge edge)
{
cache_inode_lru_t *lru = &entry->lru;
struct lru_q_lane *qlane = &LRU[lane];
lru->lane = lane; /* permanently fix lane */
lru->qid = q->id; /* initial */
QLOCK(qlane);
switch (edge) {
case LRU_HEAD:
glist_add(&q->q, &lru->q);
break;
case LRU_TAIL:
default:
glist_add_tail(&q->q, &lru->q);
break;
}
++(q->size);
QUNLOCK(qlane);
}
/**
* @brief Pin an entry
*
* Pins an entry if it is not already pinned. The corresponding queue
* lane is LOCKED. The entry is NOT on the CLEANUP queue.
*
* @param[in] entry The entry to pin
* @param[in] flags Flags (TBD, currently unused)
*/
static inline void
cond_pin_entry(cache_entry_t *entry, uint32_t flags)
{
cache_inode_lru_t *lru = &entry->lru;
if (!(lru->qid == LRU_ENTRY_PINNED)) {
struct lru_q *q;
/* out with the old queue */
q = lru_queue_of(entry);
LRU_DQ_SAFE(lru, q);
/* in with the new */
lru->qid = LRU_ENTRY_PINNED;
q = &LRU[(lru->lane)].pinned;
glist_add(&q->q, &lru->q);
++(q->size);
} /* ! PINNED (&& !CLEANUP) */
}
/**
* @brief Clean an entry for recycling.
*
* This function cleans an entry up before it's recycled or freed.
*
* @param[in] entry The entry to clean
*/
static inline void
cache_inode_lru_clean(cache_entry_t *entry)
{
cache_inode_status_t cache_status = CACHE_INODE_SUCCESS;
if (is_open(entry)) {
cache_status =
cache_inode_close(entry,
CACHE_INODE_FLAG_REALLYCLOSE |
CACHE_INODE_FLAG_NOT_PINNED |
CACHE_INODE_FLAG_CLEANUP |
CACHE_INODE_DONT_KILL);
if (cache_status != CACHE_INODE_SUCCESS) {
LogCrit(COMPONENT_CACHE_INODE_LRU,
"Error closing file in cleanup: %d.",
cache_status);
}
}
if (entry->type == DIRECTORY)
cache_inode_release_dirents(entry, CACHE_INODE_AVL_BOTH);
/* Free FSAL resources */
if (entry->obj_handle) {
entry->obj_handle->obj_ops.release(entry->obj_handle);
entry->obj_handle = NULL;
}
/* Clean out the export mapping before deconstruction */
clean_mapping(entry);
/* Finalize last bits of the cache entry */
cache_inode_key_delete(&entry->fh_hk.key);
PTHREAD_RWLOCK_destroy(&entry->content_lock);
PTHREAD_RWLOCK_destroy(&entry->state_lock);
PTHREAD_RWLOCK_destroy(&entry->attr_lock);
}
/**
* @brief Try to pull an entry off the queue
*
* This function examines the LRU end of the specified queue and, if
* the entry found there can be re-used, returns that entry.
* Otherwise, it returns NULL. The caller MUST NOT hold a lock on the
* queue when this function is called.
*
* This function follows the locking discipline detailed above. It
* returns an LRU entry removed from the queue system, which we are
* permitted to dispose of or recycle.
*/
static uint32_t reap_lane;
static inline cache_inode_lru_t *
lru_reap_impl(enum lru_q_id qid)
{
uint32_t lane;
struct lru_q_lane *qlane;
struct lru_q *lq;
cache_inode_lru_t *lru;
cache_entry_t *entry;
uint32_t refcnt;
cih_latch_t latch;
int ix;
lane = LRU_NEXT(reap_lane);
for (ix = 0; ix < LRU_N_Q_LANES; ++ix, lane = LRU_NEXT(reap_lane)) {
qlane = &LRU[lane];
lq = (qid == LRU_ENTRY_L1) ? &qlane->L1 : &qlane->L2;
QLOCK(qlane);
lru = glist_first_entry(&lq->q, cache_inode_lru_t, q);
if (!lru)
goto next_lane;
refcnt = atomic_inc_int32_t(&lru->refcnt);
entry = container_of(lru, cache_entry_t, lru);
if (unlikely(refcnt != (LRU_SENTINEL_REFCOUNT + 1))) {
/* can't use it. */
cache_inode_lru_unref(entry, LRU_UNREF_QLOCKED);
goto next_lane;
}
/* potentially reclaimable */
QUNLOCK(qlane);
/* entry must be unreachable from CIH when recycled */
if (cih_latch_entry
(entry, &latch, CIH_GET_WLOCK, __func__, __LINE__)) {
QLOCK(qlane);
refcnt = atomic_fetch_int32_t(&entry->lru.refcnt);
/* there are two cases which permit reclaim,
* entry is:
* 1. reachable but unref'd (refcnt==2)
* 2. unreachable, being removed (plus refcnt==0)
* for safety, take only the former
*/
if (LRU_ENTRY_RECLAIMABLE(entry, refcnt)) {
/* it worked */
struct lru_q *q = lru_queue_of(entry);
cih_remove_latched(entry, &latch,
CIH_REMOVE_QLOCKED);
LRU_DQ_SAFE(lru, q);
entry->lru.qid = LRU_ENTRY_NONE;
QUNLOCK(qlane);
cih_latch_rele(&latch);
goto out;
}
cih_latch_rele(&latch);
/* return the ref we took above--unref deals
* correctly with reclaim case */
cache_inode_lru_unref(entry, LRU_UNREF_QLOCKED);
} else {
/* ! QLOCKED */
continue;
}
next_lane:
QUNLOCK(qlane);
} /* foreach lane */
/* ! reclaimable */
lru = NULL;
out:
return lru;
}
static inline cache_inode_lru_t *
lru_try_reap_entry(void)
{
cache_inode_lru_t *lru;
if (lru_state.entries_used < lru_state.entries_hiwat)
return NULL;
lru = lru_reap_impl(LRU_ENTRY_L2);
if (!lru)
lru = lru_reap_impl(LRU_ENTRY_L1);
return lru;
}
/**
* @brief Push a killed cache entry to the cleanup queue
* for out-of-line cleanup
*
* This function appends entry to the appropriate lane of the
* global cleanup queue, and marks the entry.
*
* @param[in] entry The entry to clean
*/
void
cache_inode_lru_cleanup_push(cache_entry_t *entry)
{
cache_inode_lru_t *lru = &entry->lru;
struct lru_q_lane *qlane = &LRU[lru->lane];
QLOCK(qlane);
if (!(lru->qid == LRU_ENTRY_CLEANUP)) {
struct lru_q *q;
/* out with the old queue */
q = lru_queue_of(entry);
LRU_DQ_SAFE(lru, q);
/* in with the new */
lru->qid = LRU_ENTRY_CLEANUP;
q = &qlane->cleanup;
glist_add(&q->q, &lru->q);
++(q->size);
}
QUNLOCK(qlane);
}
/**
* @brief Push an entry to the cleanup queue that may be unexported
* for out-of-line cleanup
*
* This routine is used to try pushing a cache inode into the cleanup
* queue. If the entry ends up with another LRU reference before this
* is accomplished, then don't push it to cleanup.
*
* This will be used when unexporting an export. Any cache inode entry
* that only belonged to that export is a candidate for cleanup.
* However, it is possible the entry is still accessible via another
* export, and an LRU reference might be gained before we can lock the
* AVL tree. In that case, the entry must be left alone (thus
* cache_inode_kill_entry is NOT suitable for this purpose).
*
* @param[in] entry The entry to clean
*/
void cache_inode_lru_cleanup_try_push(cache_entry_t *entry)
{
cache_inode_lru_t *lru = &entry->lru;
struct lru_q_lane *qlane = &LRU[lru->lane];
cih_latch_t latch;
if (cih_latch_entry(entry, &latch, CIH_GET_WLOCK,
__func__, __LINE__)) {
uint32_t refcnt;
QLOCK(qlane);
refcnt = atomic_fetch_int32_t(&entry->lru.refcnt);
/* there are two cases which permit reclaim,
* entry is:
* 1. reachable but unref'd (refcnt==2)
* 2. unreachable, being removed (plus refcnt==0)
* for safety, take only the former
*/
if (LRU_ENTRY_RECLAIMABLE(entry, refcnt)) {
/* it worked */
struct lru_q *q = lru_queue_of(entry);
cih_remove_latched(entry, &latch,
CIH_REMOVE_QLOCKED);
LRU_DQ_SAFE(lru, q);
entry->lru.qid = LRU_ENTRY_CLEANUP;
}
QUNLOCK(qlane);
cih_latch_rele(&latch);
}
}
/**
* @brief Function that executes in the lru thread
*
* This function performs long-term reorganization, compaction, and
* other operations that are not performed in-line with referencing
* and dereferencing.
*
* This function is responsible for deferred cleanup of cache entries
* killed in request or upcall (or most other) contexts.
*
* This function is responsible for cleaning the FD cache. It works
* by the following rules:
*
* - If the number of open FDs is below the low water mark, do
* nothing.
*
* - If the number of open FDs is between the low and high water
* mark, make one pass through the queues, and exit. Each pass
* consists of taking an entry from L1, examining it to see if it is
* a regular file with an open FD and not bearing state, closing the
* open FD if so, and then moving it to L2. The advantage of the two
* level system is twofold: First, seldom used entries congregate
* in L2 and the promotion behaviour provides some scan
* resistance. Second, once an entry is examined, it is moved to
* L2, so we won't examine the same cache entry repeatedly.
*
* - If the number of open FDs is greater than the high water mark,
* we consider ourselves to be in extremis. In this case we make a
* number of passes through the queue not to exceed the number of
* passes that would be required to process the number of entries
* equal to biggest_window percent of the system-specified
* maximum.
*
* - If we are in extremis, and performing the maximum amount of work
* allowed has not moved the open FD count required_progress%
* toward the high water mark, increment lru_state.futility. If
* lru_state.futility reaches futility_count, temporarily disable
* FD caching.
*
* - Every time we wake through timeout, reset the futility counter to 0.
*
* - If we fall below the low water mark and FD caching has been
* temporarily disabled, re-enable it.
*
* This function uses the lock discipline for functions accessing LRU
* entries through a queue partition.
*
* @param[in] ctx Fridge context
*/
#define CL_FLAGS \
(CACHE_INODE_FLAG_REALLYCLOSE| \
CACHE_INODE_FLAG_NOT_PINNED| \
CACHE_INODE_FLAG_CONTENT_HAVE| \
CACHE_INODE_FLAG_CONTENT_HOLD)
static void
lru_run(struct fridgethr_context *ctx)
{
/* Index */
size_t lane = 0;
/* True if we were explicitly awakened. */
bool woke = ctx->woke;
/* Finalized */
uint32_t fdratepersec = 1, fds_avg, fddelta;
float fdnorm, fdwait_ratio, fdmulti;
time_t threadwait = fridgethr_getwait(ctx);
/* True if we are taking extreme measures to reclaim FDs */
bool extremis = false;
/* Total work done in all passes so far. If this exceeds the
* window, stop.
*/
size_t totalwork = 0;
uint64_t totalclosed = 0;
/* The current count (after reaping) of open FDs */
size_t currentopen = 0;
struct lru_q *q;
time_t new_thread_wait;
SetNameFunction("cache_lru");
fds_avg = (lru_state.fds_hiwat - lru_state.fds_lowat) / 2;
if (cache_param.use_fd_cache)
extremis = (atomic_fetch_size_t(&open_fd_count) >
lru_state.fds_hiwat);
LogFullDebug(COMPONENT_CACHE_INODE_LRU, "LRU awakes.");
if (!woke) {
/* If we make it all the way through a timed sleep
without being woken, we assume we aren't racing
against the impossible. */
lru_state.futility = 0;
}
LogFullDebug(COMPONENT_CACHE_INODE_LRU, "lru entries: %zu",
lru_state.entries_used);
/* Reap file descriptors. This is a preliminary example of the
L2 functionality rather than something we expect to be
permanent. (It will have to adapt heavily to the new FSAL
API, for example.) */
if ((atomic_fetch_size_t(&open_fd_count) < lru_state.fds_lowat)
&& cache_param.use_fd_cache) {
LogDebug(COMPONENT_CACHE_INODE_LRU,
"FD count is %zd and low water mark is %d: not reaping.",
atomic_fetch_size_t(&open_fd_count),
lru_state.fds_lowat);
if (cache_param.use_fd_cache
&& !lru_state.caching_fds) {
lru_state.caching_fds = true;
LogEvent(COMPONENT_CACHE_INODE_LRU,
"Re-enabling FD cache.");
}
} else {
/* The count of open file descriptors before this run
of the reaper. */
size_t formeropen = atomic_fetch_size_t(&open_fd_count);
/* Work done in the most recent pass of all queues. If
the value is less than the work to do in a single queue,
don't spin through more passes. */
size_t workpass = 0;
time_t curr_time = time(NULL);
fdratepersec = (curr_time <= lru_state.prev_time)
? 1 : (formeropen - lru_state.prev_fd_count) /
(curr_time - lru_state.prev_time);
LogFullDebug(COMPONENT_CACHE_INODE_LRU,
"fdrate:%u fdcount:%zd slept for %" PRIu64 " sec",
fdratepersec, formeropen,
curr_time - lru_state.prev_time);
if (extremis) {
LogDebug(COMPONENT_CACHE_INODE_LRU,
"Open FDs over high water mark, reaping aggressively.");
}
/* Total fds closed between all lanes and all current runs. */
do {
workpass = 0;
for (lane = 0; lane < LRU_N_Q_LANES; ++lane) {
/* The amount of work done on this lane on
this pass. */
size_t workdone = 0;
/* The entry being examined */
cache_inode_lru_t *lru = NULL;
/* Number of entries closed in this run. */
size_t closed = 0;
/* a cache_status */
cache_inode_status_t cache_status =
CACHE_INODE_SUCCESS;
/* a cache entry */
cache_entry_t *entry;
/* Current queue lane */
struct lru_q_lane *qlane = &LRU[lane];
/* entry refcnt */
uint32_t refcnt;
q = &qlane->L1;
LogDebug(COMPONENT_CACHE_INODE_LRU,
"Reaping up to %d entries from lane %zd",
lru_state.per_lane_work, lane);
LogFullDebug(COMPONENT_CACHE_INODE_LRU,
"formeropen=%zd totalwork=%zd workpass=%zd closed:%zd totalclosed:%"
PRIu64,
formeropen, totalwork, workpass,
closed, totalclosed);
QLOCK(qlane);
qlane->iter.active = true; /* ACTIVE */
/* While for_each_safe per se is NOT MT-safe,
* the iteration can be made so by the
* convention that any competing thread which
* would invalidate the iteration also adjusts
* glist and (in particular) glistn */
glist_for_each_safe(qlane->iter.glist,
qlane->iter.glistn, &q->q) {
struct lru_q *q;
/* check per-lane work */
if (workdone >= lru_state.per_lane_work)
goto next_lane;
lru =
glist_entry(qlane->iter.glist,
cache_inode_lru_t, q);
refcnt =
atomic_inc_int32_t(&lru->refcnt);
/* get entry early */
entry =
container_of(lru, cache_entry_t,
lru);
/* check refcnt in range */
if (unlikely(refcnt > 2)) {
cache_inode_lru_unref(
entry,
LRU_UNREF_QLOCKED);
workdone++; /* but count it */
/* qlane LOCKED, lru refcnt is
* restored */
continue;
}
/* Move entry to MRU of L2 */
q = &qlane->L1;
LRU_DQ_SAFE(lru, q);
lru->qid = LRU_ENTRY_L2;
q = &qlane->L2;
glist_add(&q->q, &lru->q);
++(q->size);
/* Drop the lane lock while performing
* (slow) operations on entry */
QUNLOCK(qlane);
/* Acquire the content lock first; we
* may need to look at fds and close
* it. */
PTHREAD_RWLOCK_wrlock(&entry->
content_lock);
if (is_open(entry)) {
cache_status =
cache_inode_close(
entry, CL_FLAGS);
if (cache_status !=
CACHE_INODE_SUCCESS) {
LogCrit(
COMPONENT_CACHE_INODE_LRU,
"Error closing file in LRU thread.");
} else {
++totalclosed;
++closed;
}
}
PTHREAD_RWLOCK_unlock(&entry->
content_lock);
QLOCK(qlane); /* QLOCKED */
cache_inode_lru_unref(
entry,
LRU_UNREF_QLOCKED);
++workdone;
} /* for_each_safe lru */
next_lane:
qlane->iter.active = false; /* !ACTIVE */
QUNLOCK(qlane);
LogDebug(COMPONENT_CACHE_INODE_LRU,
"Actually processed %zd entries on lane %zd closing %zd descriptors",
workdone, lane, closed);
workpass += workdone;
} /* foreach lane */
totalwork += workpass;
} while (extremis && (workpass >= lru_state.per_lane_work)
&& (totalwork < lru_state.biggest_window));
currentopen = atomic_fetch_size_t(&open_fd_count);
if (extremis
&& ((currentopen > formeropen)
|| (formeropen - currentopen <
(((formeropen -
lru_state.fds_hiwat) *
cache_param.required_progress) /
100)))) {
if (++lru_state.futility >
cache_param.futility_count) {
LogCrit(COMPONENT_CACHE_INODE_LRU,
"Futility count exceeded. The LRU thread is unable to make progress in reclaiming FDs. Disabling FD cache.");
lru_state.caching_fds = false;
}
}
}
/* The following calculation will progressively garbage collect
* more frequently as these two factors increase:
* 1. current number of open file descriptors
* 2. rate at which file descriptors are being used.
*
* When there is little activity, this thread will sleep at the
* "LRU_Run_Interval" from the config.
*
* When there is a lot of activity, the thread will sleep for a
* much shorter time.
*/
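/* Illustrative numbers (hypothetical): with fds_lowat = 2000 and
 * fds_hiwat = 9000 (so fds_avg = 3500), fdratepersec = 100 and
 * currentopen = 6000, the calculation below gives fdnorm = (100 +
 * 3500) / 3500 = 1 (integer division), fddelta = 4000, fdmulti =
 * (4000 * 10) / 3500 = 11 and fdwait_ratio = 9000 / ((9000 + 11 *
 * 4000) * 1) ~= 0.17, so the thread sleeps for roughly a sixth of its
 * previous wait (but never less than a tenth of lru_run_interval). */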
lru_state.prev_fd_count = currentopen;
lru_state.prev_time = time(NULL);
fdnorm = (fdratepersec + fds_avg) / fds_avg;
fddelta = (currentopen > lru_state.fds_lowat)
? (currentopen - lru_state.fds_lowat) : 0;
fdmulti = (fddelta * 10) / fds_avg;
fdmulti = fdmulti ? fdmulti : 1;
fdwait_ratio = lru_state.fds_hiwat /
((lru_state.fds_hiwat + fdmulti * fddelta) * fdnorm);
new_thread_wait = threadwait * fdwait_ratio;
if (new_thread_wait < cache_param.lru_run_interval / 10)
new_thread_wait = cache_param.lru_run_interval / 10;
fridgethr_setwait(ctx, new_thread_wait);
LogDebug(COMPONENT_CACHE_INODE_LRU,
"After work, open_fd_count:%zd count:%" PRIu64
" fdrate:%u threadwait=%" PRIu64,
atomic_fetch_size_t(&open_fd_count),
lru_state.entries_used, fdratepersec, threadwait);
LogFullDebug(COMPONENT_CACHE_INODE_LRU,
"currentopen=%zd futility=%d totalwork=%zd biggest_window=%d extremis=%d lanes=%d fds_lowat=%d ",
currentopen, lru_state.futility, totalwork,
lru_state.biggest_window, extremis, LRU_N_Q_LANES,
lru_state.fds_lowat);
}
/* Public functions */
/**
* Initialize subsystem
*/
int
cache_inode_lru_pkginit(void)
{
/* Return code from system calls */
int code = 0;
/* Rlimit for open file descriptors */
struct rlimit rlim = {
.rlim_cur = RLIM_INFINITY,
.rlim_max = RLIM_INFINITY
};
struct fridgethr_params frp;
memset(&frp, 0, sizeof(struct fridgethr_params));
frp.thr_max = 1;
frp.thr_min = 1;
frp.thread_delay = cache_param.lru_run_interval;
frp.flavor = fridgethr_flavor_looper;
atomic_store_size_t(&open_fd_count, 0);
/* Set high and low watermark for cache entries. This seems a
bit fishy, so come back and revisit this. */
lru_state.entries_hiwat = cache_param.entries_hwmark;
lru_state.entries_used = 0;
/* Find out the system-imposed file descriptor limit */
if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) {
code = errno;
LogCrit(COMPONENT_CACHE_INODE_LRU,
"Call to getrlimit failed with error %d. This should not happen. Assigning default of %d.",
code, FD_FALLBACK_LIMIT);
lru_state.fds_system_imposed = FD_FALLBACK_LIMIT;
} else {
if (rlim.rlim_cur < rlim.rlim_max) {
/* Save the old soft value so we can fall back to it
if setrlimit fails. */
rlim_t old_soft = rlim.rlim_cur;
LogInfo(COMPONENT_CACHE_INODE_LRU,
"Attempting to increase soft limit from %"
PRIu64 " to hard limit of %" PRIu64,
(uint64_t) rlim.rlim_cur,
(uint64_t) rlim.rlim_max);
rlim.rlim_cur = rlim.rlim_max;
if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
code = errno;
LogWarn(COMPONENT_CACHE_INODE_LRU,
"Attempt to raise soft FD limit to hard FD limit failed with error %d. Sticking to soft limit.",
code);
rlim.rlim_cur = old_soft;
}
}
if (rlim.rlim_cur == RLIM_INFINITY) {
FILE *nr_open;
nr_open = fopen("/proc/sys/fs/nr_open", "r");
if (nr_open == NULL) {
code = errno;
LogWarn(COMPONENT_CACHE_INODE_LRU,
"Attempt to open /proc/sys/fs/nr_open failed (%d)",
code);
goto err_open;
}
code = fscanf(nr_open, "%" SCNu32 "\n",
&lru_state.fds_system_imposed);
if (code != 1) {
code = errno;
LogMajor(COMPONENT_CACHE_INODE_LRU,
"The rlimit on open file descriptors is infinite and the attempt to find the system maximum failed with error %d.",
code);
LogMajor(COMPONENT_CACHE_INODE_LRU,
"Assigning the default fallback of %d which is almost certainly too small.",
FD_FALLBACK_LIMIT);
LogMajor(COMPONENT_CACHE_INODE_LRU,
"If you are on a Linux system, this should never happen.");
LogMajor(COMPONENT_CACHE_INODE_LRU,
"If you are running some other system, please set an rlimit on file descriptors (for example, with ulimit) for this process and consider editing "
__FILE__
" to add support for finding your system's maximum.");
lru_state.fds_system_imposed =
FD_FALLBACK_LIMIT;
}
fclose(nr_open);
err_open:
;
} else {
lru_state.fds_system_imposed = rlim.rlim_cur;
}
LogInfo(COMPONENT_CACHE_INODE_LRU,
"Setting the system-imposed limit on FDs to %d.",
lru_state.fds_system_imposed);
}
lru_state.fds_hard_limit =
(cache_param.fd_limit_percent *
lru_state.fds_system_imposed) / 100;
lru_state.fds_hiwat =
(cache_param.fd_hwmark_percent *
lru_state.fds_system_imposed) / 100;
lru_state.fds_lowat =
(cache_param.fd_lwmark_percent *
lru_state.fds_system_imposed) / 100;
lru_state.futility = 0;
lru_state.per_lane_work =
(cache_param.reaper_work / LRU_N_Q_LANES);
lru_state.biggest_window =
(cache_param.biggest_window *
lru_state.fds_system_imposed) / 100;
lru_state.prev_fd_count = 0;
lru_state.caching_fds = cache_param.use_fd_cache;
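/* Illustrative derivation (hypothetical numbers): with a
 * system-imposed limit of 4096 FDs, fd_limit_percent = 99,
 * fd_hwmark_percent = 90, fd_lwmark_percent = 50 and biggest_window =
 * 40, the calculations above yield fds_hard_limit = 4055, fds_hiwat =
 * 3686, fds_lowat = 2048 and a biggest_window of 1638 entries of work
 * per extremis run. */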
/* init queue complex */
lru_init_queues();
/* spawn LRU background thread */
code = fridgethr_init(&lru_fridge, "LRU_fridge", &frp);
if (code != 0) {
LogMajor(COMPONENT_CACHE_INODE_LRU,
"Unable to initialize LRU fridge, error code %d.",
code);
return code;
}
code = fridgethr_submit(lru_fridge, lru_run, NULL);
if (code != 0) {
LogMajor(COMPONENT_CACHE_INODE_LRU,
"Unable to start LRU thread, error code %d.", code);
return code;
}
return 0;
}
/**
* Shutdown subsystem
*
* @return 0 on success, POSIX errors on failure.
*/
int
cache_inode_lru_pkgshutdown(void)
{
int rc = fridgethr_sync_command(lru_fridge,
fridgethr_comm_stop,
120);
if (rc == ETIMEDOUT) {
LogMajor(COMPONENT_CACHE_INODE_LRU,
"Shutdown timed out, cancelling threads.");
fridgethr_cancel(lru_fridge);
} else if (rc != 0) {
LogMajor(COMPONENT_CACHE_INODE_LRU,
"Failed shutting down LRU thread: %d", rc);
}
return rc;
}
static inline bool init_rw_locks(cache_entry_t *entry)
{
int rc;
bool attr_lock_init = false;
bool content_lock_init = false;
/* Initialize the entry locks */
rc = pthread_rwlock_init(&entry->attr_lock, NULL);
if (rc != 0)
goto fail;
attr_lock_init = true;
rc = pthread_rwlock_init(&entry->content_lock, NULL);
if (rc != 0)
goto fail;
content_lock_init = true;
rc = pthread_rwlock_init(&entry->state_lock, NULL);
if (rc == 0)
return true;
fail:
LogCrit(COMPONENT_CACHE_INODE,
"pthread_rwlock_init returned %d (%s)",
rc, strerror(rc));
if (attr_lock_init)
PTHREAD_RWLOCK_destroy(&entry->attr_lock);
if (content_lock_init)
PTHREAD_RWLOCK_destroy(&entry->content_lock);
return false;
}
static cache_inode_status_t
alloc_cache_entry(cache_entry_t **entry)
{
cache_inode_status_t status;
cache_entry_t *nentry;
nentry = pool_alloc(cache_inode_entry_pool, NULL);
if (!nentry) {
LogCrit(COMPONENT_CACHE_INODE_LRU,
"can't allocate a new entry from cache pool");
status = CACHE_INODE_MALLOC_ERROR;
goto out;
}
/* Initialize the entry locks */
if (!init_rw_locks(nentry)) {
/* Recycle */
status = CACHE_INODE_INIT_ENTRY_FAILED;
pool_free(cache_inode_entry_pool, nentry);
nentry = NULL;
goto out;
}
status = CACHE_INODE_SUCCESS;
atomic_inc_int64_t(&lru_state.entries_used);
out:
*entry = nentry;
return status;
}
/**
* @brief Re-use or allocate an entry
*
* This function repurposes a resident entry in the LRU system if the
* system is at or above the entries high-water mark, and allocates a
* new one otherwise.
* On success, this function always returns an entry with two
* references (one for the sentinel, one to allow the caller's use.)
*
* @param[out] entry The returned entry
*
* @return CACHE_INODE_SUCCESS or error.
*/
cache_inode_status_t
cache_inode_lru_get(cache_entry_t **entry)
{
cache_inode_lru_t *lru;
cache_inode_status_t status = CACHE_INODE_SUCCESS;
cache_entry_t *nentry = NULL;
uint32_t lane;
lru = lru_try_reap_entry();
if (lru) {
/* we uniquely hold entry */
nentry = container_of(lru, cache_entry_t, lru);
LogFullDebug(COMPONENT_CACHE_INODE_LRU,
"Recycling entry at %p.", nentry);
cache_inode_lru_clean(nentry);
if (!init_rw_locks(nentry)) {
/* Recycle */
status = CACHE_INODE_INIT_ENTRY_FAILED;
pool_free(cache_inode_entry_pool, nentry);
nentry = NULL;
goto out;
}
} else {
/* alloc entry */
status = alloc_cache_entry(&nentry);
if (!nentry)
goto out;
}
/* Since the entry isn't in a queue, nobody can bump refcnt. */
nentry->lru.refcnt = 2;
nentry->lru.pin_refcnt = 0;
nentry->lru.cf = 0;
/* Enqueue. */
lane = lru_lane_of_entry(nentry);
lru_insert_entry(nentry, &LRU[lane].L1, lane, LRU_HEAD);
out:
*entry = nentry;
return status;
}
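/*
 * Illustrative caller pattern (a sketch, not a real call site): the
 * entry returned above carries two references, the sentinel and one
 * for the caller.  A caller that loses an insertion race and never
 * publishes the entry hands it back with cache_inode_lru_putback();
 * otherwise it simply drops its own reference when finished:
 *
 *	cache_entry_t *entry = NULL;
 *
 *	if (cache_inode_lru_get(&entry) != CACHE_INODE_SUCCESS)
 *		return ...;
 *	if (lost_insertion_race)	// hypothetical condition
 *		cache_inode_lru_putback(entry, LRU_FLAG_NONE);
 *	else
 *		cache_inode_lru_unref(entry, LRU_FLAG_NONE);
 */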
/**
* @brief Function to let the state layer pin an entry
*
* This function moves the given entry to the pinned queue partition
* for its lane. If the entry is already pinned, it is a no-op.
*
* @param[in] entry The entry to be moved
*
* @retval CACHE_INODE_SUCCESS if the entry was moved.
* @retval CACHE_INODE_ESTALE if the entry is in the process of disposal
*/
cache_inode_status_t
cache_inode_inc_pin_ref(cache_entry_t *entry)
{
uint32_t lane = entry->lru.lane;
struct lru_q_lane *qlane = &LRU[lane];
/* Pin ref is infrequent, and never concurrent because SAL invariantly
* holds the state lock exclusive whenever it is called. */
QLOCK(qlane);
if (entry->lru.qid == LRU_ENTRY_CLEANUP) {
QUNLOCK(qlane);
return CACHE_INODE_ESTALE;
}
/* Pin if not pinned already */
cond_pin_entry(entry, LRU_FLAG_NONE /* future */);
/* take pin ref count */
entry->lru.pin_refcnt++;
QUNLOCK(qlane); /* !LOCKED (lane) */
return CACHE_INODE_SUCCESS;
}
/**
* @brief Function to let the state layer release a pin
*
* This function moves the given entry out of the pinned queue
* partition for its lane. If the entry is not pinned, it is a
* no-op.
*
* If closefile is true, caller MUST hold the content_lock.
*
* @param[in] entry The entry to be moved
* @param[in] closefile Indicates if the file should be closed
*
*/
void cache_inode_dec_pin_ref(cache_entry_t *entry, bool closefile)
{
uint32_t lane = entry->lru.lane;
cache_inode_lru_t *lru = &entry->lru;
struct lru_q_lane *qlane = &LRU[lane];
/* Pin ref is infrequent, and never concurrent because SAL invariantly
* holds the state lock exclusive whenever it is called. */
QLOCK(qlane);
entry->lru.pin_refcnt--;
if (unlikely(entry->lru.pin_refcnt == 0)) {
/* entry could infrequently be on the cleanup queue */
if (lru->qid == LRU_ENTRY_PINNED) {
/* remove from pinned */
struct lru_q *q = &qlane->pinned;
/* XXX skip L1 iteration fixups */
glist_del(&lru->q);
--(q->size);
/* add to MRU of L1 */
lru->qid = LRU_ENTRY_L1;
q = &qlane->L1;
glist_add_tail(&q->q, &lru->q);
++(q->size);
}
if (closefile == true) {
cache_inode_close(entry,
CACHE_INODE_FLAG_REALLYCLOSE |
CACHE_INODE_FLAG_NOT_PINNED |
CACHE_INODE_FLAG_CONTENT_HAVE |
CACHE_INODE_FLAG_CONTENT_HOLD |
CACHE_INODE_DONT_KILL);
}
}
QUNLOCK(qlane);
}
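/*
 * Illustrative pairing (a sketch): the state layer pins an entry when
 * it attaches lock or open state to it and unpins it when that state
 * is destroyed; passing closefile == true additionally closes the file
 * on the last unpin (and then the caller must hold the content_lock):
 *
 *	cache_inode_inc_pin_ref(entry);
 *	... create lock/open state ...
 *	cache_inode_dec_pin_ref(entry, false);
 */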
/**
* @brief Return true if a file is pinned.
*
* This function returns true if a file is pinned.
*
* @param[in] entry The file to be checked
*
* @return true if pinned, false otherwise.
*/
bool
cache_inode_is_pinned(cache_entry_t *entry)
{
uint32_t lane = entry->lru.lane;
struct lru_q_lane *qlane = &LRU[lane];
int rc;
QLOCK(qlane);
rc = (entry->lru.pin_refcnt > 0);
QUNLOCK(qlane);
return rc;
}
/**
* @brief Get a reference
*
* This function acquires a reference on the given cache entry.
*
* @param[in] entry The entry on which to get a reference
* @param[in] flags One of LRU_REQ_INITIAL, LRU_REQ_SCAN, else LRU_FLAG_NONE
*
* A flags value of LRU_REQ_INITIAL or LRU_REQ_SCAN indicates an initial
* reference. A non-initial reference is an "extra" reference in some call
* path, hence does not influence LRU, and is lockless.
*
* A flags value of LRU_REQ_INITIAL indicates an ordinary initial reference,
* and strongly influences LRU. LRU_REQ_SCAN indicates a scan reference
* (currently, READDIR) and weakly influences LRU. A scan reference should not
* be taken by call paths which may open a file descriptor. In both cases, the
* L1->L2 boundary is sticky (scan resistance).
*
* @retval CACHE_INODE_SUCCESS if the reference was acquired
*/
cache_inode_status_t cache_inode_lru_ref(cache_entry_t *entry, uint32_t flags)
{
cache_inode_lru_t *lru = &entry->lru;
struct lru_q_lane *qlane = &LRU[lru->lane];
struct lru_q *q;
if ((flags & (LRU_REQ_INITIAL | LRU_REQ_STALE_OK)) == 0) {
QLOCK(qlane);
if (lru->qid == LRU_ENTRY_CLEANUP) {
QUNLOCK(qlane);
return CACHE_INODE_ESTALE;
}
QUNLOCK(qlane);
}
atomic_inc_int32_t(&entry->lru.refcnt);
/* adjust LRU on initial refs */
if (flags & LRU_REQ_INITIAL) {
/* do it less */
if ((atomic_inc_int32_t(&entry->lru.cf) % 3) != 0)
goto out;
QLOCK(qlane);
switch (lru->qid) {
case LRU_ENTRY_L1:
q = lru_queue_of(entry);
if (flags & LRU_REQ_INITIAL) {
/* advance entry to MRU (of L1) */
LRU_DQ_SAFE(lru, q);
glist_add_tail(&q->q, &lru->q);
++(q->size);
} else {
/* do not advance entry in L1 on LRU_REQ_SCAN
* (scan resistance) */
}
break;
case LRU_ENTRY_L2:
q = lru_queue_of(entry);
if (flags & LRU_REQ_INITIAL) {
/* move entry to LRU of L1 */
glist_del(&lru->q); /* skip L1 fixups */
--(q->size);
lru->qid = LRU_ENTRY_L1;
q = &qlane->L1;
glist_add(&q->q, &lru->q);
++(q->size);
} else {
/* advance entry to MRU of L2 */
glist_del(&lru->q); /* skip L1 fixups */
glist_add_tail(&q->q, &lru->q);
}
break;
default:
/* do nothing */
break;
} /* switch qid */
QUNLOCK(qlane);
} /* initial ref */
out:
return CACHE_INODE_SUCCESS;
}
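/*
 * Illustrative use of the flags (a sketch): a call path that already
 * holds a reference takes an extra one with LRU_FLAG_NONE (which does
 * not adjust the entry's LRU position), while a fresh reference handed
 * out through the hash table is initial and does adjust it:
 *
 *	cache_inode_lru_ref(entry, LRU_FLAG_NONE);	// extra reference
 *	...
 *	cache_inode_lru_ref(entry, LRU_REQ_INITIAL);	// initial reference
 *	...
 *	cache_inode_lru_unref(entry, LRU_FLAG_NONE);	// one unref per ref
 */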
/**
* @brief Relinquish a reference
*
* This function relinquishes a reference on the given cache entry.
* It follows the disposal/recycling lock discipline given at the
* beginning of the file.
*
* The supplied entry is always either unlocked or destroyed by the
* time this function returns.
*
* @param[in] entry The entry on which to release a reference
* @param[in] flags Currently significant are LRU_UNREF_QLOCKED
* (indicating that the caller holds the queue-lane lock
* for this entry) and LRU_UNREF_STATE_LOCK_HELD.
*/
void
cache_inode_lru_unref(cache_entry_t *entry, uint32_t flags)
{
int32_t refcnt;
bool do_cleanup = false;
uint32_t lane = entry->lru.lane;
struct lru_q_lane *qlane = &LRU[lane];
bool qlocked = flags & LRU_UNREF_QLOCKED;
bool other_lock_held = flags & LRU_UNREF_STATE_LOCK_HELD;
if (!qlocked && !other_lock_held) {
QLOCK(qlane);
if (((entry->lru.flags & LRU_CLEANED) == 0) &&
(entry->lru.qid == LRU_ENTRY_CLEANUP)) {
do_cleanup = true;
entry->lru.flags |= LRU_CLEANED;
}
QUNLOCK(qlane);
if (do_cleanup) {
LogDebug(COMPONENT_CACHE_INODE,
"LRU_ENTRY_CLEANUP of entry %p",
entry);
state_wipe_file(entry);
kill_export_root_entry(entry);
kill_export_junction_entry(entry);
}
}
refcnt = atomic_dec_int32_t(&entry->lru.refcnt);
if (unlikely(refcnt == 0)) {
struct lru_q *q;
/* we MUST recheck that refcount is still 0 */
if (!qlocked)
QLOCK(qlane);
refcnt = atomic_fetch_int32_t(&entry->lru.refcnt);
if (unlikely(refcnt > 0)) {
if (!qlocked)
QUNLOCK(qlane);
goto out;
}
/* Really zero. Remove entry and mark it as dead. */
q = lru_queue_of(entry);
if (q) {
/* as of now, entries leaving the cleanup queue
* are LRU_ENTRY_NONE */
LRU_DQ_SAFE(&entry->lru, q);
}
if (!qlocked)
QUNLOCK(qlane);
cache_inode_lru_clean(entry);
pool_free(cache_inode_entry_pool, entry);
atomic_dec_int64_t(&lru_state.entries_used);
} /* refcnt == 0 */
out:
return;
}
/**
* @brief Put back a raced initial reference
*
* This function returns an entry previously returned from
* cache_inode_lru_get, in the uncommon circumstance that it will not
* be used.
*
* @param[in] entry The entry on which to release a reference
* @param[in] flags Currently significant is LRU_UNREF_QLOCKED
* (indicating that the caller holds the queue-lane lock
* for this entry.)
*/
void
cache_inode_lru_putback(cache_entry_t *entry, uint32_t flags)
{
bool qlocked = flags & LRU_UNREF_QLOCKED;
uint32_t lane = entry->lru.lane;
struct lru_q_lane *qlane = &LRU[lane];
struct lru_q *q;
if (!qlocked)
QLOCK(qlane);
q = lru_queue_of(entry);
if (q) {
/* as of now, entries leaving the cleanup queue
* are LRU_ENTRY_NONE */
LRU_DQ_SAFE(&entry->lru, q);
}
/* We do NOT call cache_inode_lru_clean(), since the entry was never initialized. */
pool_free(cache_inode_entry_pool, entry);
atomic_dec_int64_t(&lru_state.entries_used);
if (!qlocked)
QUNLOCK(qlane);
}
/**
*
* @brief Wake the LRU thread to free FDs.
*
* This function wakes the LRU reaper thread to free FDs and should be
* called when we are over the high water mark.
*/
void
lru_wake_thread(void)
{
fridgethr_wake(lru_fridge);
}
/** @} */