diff -r e1d135a8f666 sys/kern/vfs_cache.c
--- a/sys/kern/vfs_cache.c      Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/kern/vfs_cache.c      Wed Mar 29 10:23:26 2006 +0200
@@ -108,6 +108,10 @@
 #define NCHHASH(hash)   (&nchashtbl[(hash) & nchash])
 #define MINNEG          1024
 
+/* Modes for shadow group traversal */
+#define SG_ALL          0       /* traverse whole group */
+#define SG_SUBTREE      1       /* traverse only subtree */
+
 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;     /* Hash Table */
@@ -170,6 +174,15 @@ static u_long numneghits; STATNODE(CTLFL
 static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);
 
 struct nchstats nchstats[SMP_MAXCPU];
+
+static STAILQ_HEAD(, shadowinfo) shadowinfo_freeq;
+static u_long numshadowinfo = 0;
+STATNODE(CTLFLAG_RD, numshadowinfo, &numshadowinfo);
+static long maxnumshadowinfo = -1;
+SYSCTL_LONG(_vfs_cache, OID_AUTO, maxnumshadowinfo, CTLFLAG_RW,
+            &maxnumshadowinfo, 0, "");
+MALLOC_DEFINE(M_SHADOWINFO, "shadowinfo", "VFS name cache shadowinfo");
+
 /*
  * Export VFS cache effectiveness statistics to user-land.
  *
@@ -196,6 +209,62 @@ SYSCTL_PROC(_vfs_cache, OID_AUTO, nchsta
 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
         0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 
+/* XXX stubs for later MPSAFE work */
+#define shadowinfo_freeq_lock()
+#define shadowinfo_freeq_unlock()
+
+static struct shadowinfo *
+shadowinfo_fetch(void)
+{
+        struct shadowinfo *shinf = STAILQ_FIRST(&shadowinfo_freeq);
+
+        if (! shinf)
+                goto alloc;
+
+        shadowinfo_freeq_lock();
+        if ((shinf = STAILQ_FIRST(&shadowinfo_freeq)))
+                STAILQ_REMOVE_HEAD(&shadowinfo_freeq, sh_entry);
+        shadowinfo_freeq_unlock();
+
+        if (shinf)
+                return (shinf);
+
+alloc:
+        shinf = malloc(sizeof(*shinf), M_SHADOWINFO, M_WAITOK|M_ZERO);
+        numshadowinfo++;
+
+        return (shinf);
+}
+
+static __inline
+struct shadowinfo *
+shadowinfo_ref(struct shadowinfo *shinf)
+{
+        shinf->sh_refs++;
+
+        return (shinf);
+}
+
+static void
+shadowinfo_put(struct shadowinfo *shinf)
+{
+        if (--shinf->sh_refs > 0)
+                return;
+
+        if (maxnumshadowinfo >= 0 && numshadowinfo > maxnumshadowinfo) {
+                free(shinf, M_SHADOWINFO);
+                numshadowinfo--;
+                return;
+        }
+
+        shinf->sh_exlocks = 0;
+        shinf->sh_locktd = NULL;
+
+        shadowinfo_freeq_lock();
+        STAILQ_INSERT_TAIL(&shadowinfo_freeq, shinf, sh_entry);
+        shadowinfo_freeq_unlock();
+}
+
 static void cache_zap(struct namecache *ncp);
 
 /*
@@ -225,7 +294,7 @@ _cache_drop(struct namecache *ncp)
             (ncp->nc_flag & NCF_UNRESOLVED) && 
             TAILQ_EMPTY(&ncp->nc_list)
         ) {
-                KKASSERT(ncp->nc_exlocks == 0);
+                KKASSERT(ncp->nc_shadowinfo->sh_exlocks == 0);
                 cache_lock(ncp);
                 cache_zap(ncp);
         } else {
@@ -295,6 +364,10 @@ cache_alloc(int nlen)
         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
         ncp->nc_refs = 1;
         ncp->nc_fsmid = 1;
+        ncp->nc_shadowinfo = &ncp->nc_shadowinfo_internal;
+        ncp->nc_shadowinfo_internal.sh_refs = 2;
+        ncp->nc_shadow_prev = NULL;
+        ncp->nc_shadow_next = NULL;
         TAILQ_INIT(&ncp->nc_list);
         cache_lock(ncp);
         return(ncp);
@@ -303,7 +376,7 @@ static void
 static void
 cache_free(struct namecache *ncp)
 {
-        KKASSERT(ncp->nc_refs == 1 && ncp->nc_exlocks == 1);
+        KKASSERT(ncp->nc_refs == 1 && ncp->nc_shadowinfo->sh_exlocks == 1);
         if (ncp->nc_name)
                 free(ncp->nc_name, M_VFSCACHE);
         free(ncp, M_VFSCACHE);
@@ -322,6 +395,188 @@ cache_drop(struct namecache *ncp)
 cache_drop(struct namecache *ncp)
 {
         _cache_drop(ncp);
+}
+
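A note on the lifetime rules encoded above: shadowinfo_fetch() hands out a
zero-referenced structure (freshly zeroed, or recycled off the free queue),
each group member then takes its own reference through shadowinfo_ref(), and
the final shadowinfo_put() either recycles the structure to the free queue
or, once numshadowinfo exceeds maxnumshadowinfo, frees it outright. The
embedded per-entry structure set up in cache_alloc() starts out at
sh_refs == 2: the extra reference on top of the one held through
nc_shadowinfo guarantees that shadowinfo_put() can never push the embedded
structure (which was never allocated on its own) onto the free queue. An
illustrative assertion, not part of the patch:

        /*
         * Illustration only: a standalone entry points at its embedded
         * shadowinfo, which keeps the +1 bias on top of the reference
         * held through nc_shadowinfo.
         */
        KKASSERT(ncp->nc_shadowinfo != &ncp->nc_shadowinfo_internal ||
                 ncp->nc_shadowinfo_internal.sh_refs == 2);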
+/*
+ * Iterate an "updater" function over a shadow group.
+ * All-group and subtree-only traversals are supported.
+ */
+static struct namecache *
+cache_group_walk(struct namecache *ncp,
+    int (*updater)(struct namecache *xncp, void *param),
+    int flags, void *param)
+{
+        struct namecache *xncp = ncp, *yncp;
+
+        for (;;) {
+                yncp = xncp->nc_shadow_next;
+                if (updater(xncp, param))
+                        break;
+                if (! yncp || yncp == ncp ||
+                    (flags & SG_SUBTREE &&
+                     yncp->nc_shadowheight <= ncp->nc_shadowheight))
+                        break;
+                xncp = yncp;
+        }
+
+        return(xncp);
+}
+
+struct migrate_param {
+        int heightdelta;
+        int exlocks;
+        struct shadowinfo *shadowinfo;
+};
+
+static int
+migrate_updater(struct namecache *ncp, void *param)
+{
+        struct migrate_param *mpm = param;
+        struct shadowinfo *shinf = mpm->shadowinfo;
+        struct shadowinfo *oldshinf = ncp->nc_shadowinfo;
+
+        if (! shinf)
+                shinf = &ncp->nc_shadowinfo_internal;
+
+        if (shinf == oldshinf)
+                goto out;
+
+        shinf->sh_locktd = oldshinf->sh_locktd;
+
+        ncp->nc_shadowinfo = shadowinfo_ref(shinf);
+        shadowinfo_put(oldshinf);
+
+out:
+        ncp->nc_shadowheight += mpm->heightdelta;
+        if (mpm->exlocks >= 0)
+                shinf->sh_exlocks = mpm->exlocks;
+
+        return (0);
+}
+
+static __inline
+void
+cache_shadow_link(struct namecache *sncp, struct namecache *ncp)
+{
+        struct namecache *pncp;
+        struct namecache *nsncp;
+
+        pncp = ncp->nc_shadow_prev ?: ncp;
+        nsncp = sncp->nc_shadow_next ?: sncp;
+
+        pncp->nc_shadow_next = nsncp;
+        nsncp->nc_shadow_prev = pncp;
+
+        sncp->nc_shadow_next = ncp;
+        ncp->nc_shadow_prev = sncp;
+}
+
+static __inline
+void
+cache_shadow_unlink(struct namecache *ncp)
+{
+        if (! ncp->nc_shadow_next)
+                return;
+
+        KKASSERT(ncp->nc_shadow_prev);
+
+        if (ncp->nc_shadow_prev == ncp->nc_shadow_next) {
+                ncp->nc_shadow_prev->nc_shadow_next = NULL;
+                ncp->nc_shadow_next->nc_shadow_prev = NULL;
+        } else {
+                ncp->nc_shadow_prev->nc_shadow_next = ncp->nc_shadow_next;
+                ncp->nc_shadow_next->nc_shadow_prev = ncp->nc_shadow_prev;
+        }
+
+        ncp->nc_shadow_prev = ncp->nc_shadow_next = NULL;
+}
+
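To make the traversal contract concrete, here is a minimal, purely
illustrative updater (hypothetical, not part of the patch). An updater
returning 0 keeps the walk going; a nonzero return stops the walk, and
cache_group_walk() returns the entry it stopped at:

        /* Illustration only: count the members of ncp's shadow group. */
        static int
        count_updater(struct namecache *ncp, void *param)
        {
                (*(int *)param)++;
                return (0);             /* 0 == keep walking */
        }

        /* ... then, at some call site: */
        int cnt = 0;

        cache_group_walk(ncp, &count_updater, SG_ALL, &cnt);
        /* cnt is 1 for a standalone entry, the group size otherwise */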
+ */ +int +cache_shadow_attach(struct namecache *ncp, struct namecache *sncp) +{ + struct migrate_param mpm; + + if (ncp == sncp) + return(ELOOP); + + KKASSERT(ncp->nc_shadowinfo->sh_locktd != curthread); + KKASSERT(sncp->nc_shadowinfo->sh_locktd == curthread); + + cache_lock_two(ncp, sncp); + + if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 || ncp->nc_shadowheight != 0) + return(EEXIST); + + if (sncp->nc_shadowinfo == &sncp->nc_shadowinfo_internal) { + mpm.heightdelta = 0; + mpm.shadowinfo = shadowinfo_fetch(); + mpm.exlocks = sncp->nc_shadowinfo->sh_exlocks; + migrate_updater(sncp, &mpm); + } + + mpm.heightdelta = sncp->nc_shadowheight + 1; + mpm.shadowinfo = sncp->nc_shadowinfo; + mpm.exlocks = -1; + + cache_group_walk(ncp, &migrate_updater, SG_ALL, &mpm); + cache_shadow_link(sncp, ncp); + + return(0); +} + +/* + * Take out namecache entry from its shadow group. + * + * The shadow group must be locked upon entry. + * + * On return both the entry and its former group remain locked. + */ +void +cache_shadow_detach(struct namecache *ncp) +{ + struct namecache *pncp, *nncp; + struct migrate_param mpm; + + mpm.shadowinfo = NULL; +again: + mpm.heightdelta = -ncp->nc_shadowheight; + mpm.exlocks = ncp->nc_shadowinfo->sh_exlocks; + pncp = ncp->nc_shadow_prev; + nncp = ncp->nc_shadow_next; + + migrate_updater(ncp, &mpm); + cache_shadow_unlink(ncp); + + if (nncp && nncp == pncp) { + ncp = nncp; + goto again; + } +} + +static int +vref_updater(struct namecache *ncp, void *param) +{ + if (ncp->nc_vp) + *(int *)param > 0 ? vhold(ncp->nc_vp) : vdrop(ncp->nc_vp); + + return(0); } /* @@ -349,15 +604,21 @@ cache_lock(struct namecache *ncp) { thread_t td; int didwarn; + struct shadowinfo *shinf; KKASSERT(ncp->nc_refs != 0); didwarn = 0; td = curthread; for (;;) { - if (ncp->nc_exlocks == 0) { - ncp->nc_exlocks = 1; - ncp->nc_locktd = td; + shinf = ncp->nc_shadowinfo; + KKASSERT(shinf); + KKASSERT(shinf->sh_refs != 0); + if (shinf->sh_exlocks == 0) { + int ref = 1; + + shinf->sh_exlocks = 1; + shinf->sh_locktd = td; /* * The vp associated with a locked ncp must be held * to prevent it from being recycled (which would @@ -365,16 +626,15 @@ cache_lock(struct namecache *ncp) * * XXX loop on race for later MPSAFE work. */ - if (ncp->nc_vp) - vhold(ncp->nc_vp); + cache_group_walk(ncp, &vref_updater, SG_ALL, &ref); break; } - if (ncp->nc_locktd == td) { - ++ncp->nc_exlocks; + if (shinf->sh_locktd == td) { + ++shinf->sh_exlocks; break; } - ncp->nc_flag |= NCF_LOCKREQ; - if (tsleep(ncp, 0, "clock", nclockwarn) == EWOULDBLOCK) { + shinf->sh_lockreq = 1; + if (tsleep(shinf, 0, "clock", nclockwarn) == EWOULDBLOCK) { if (didwarn) continue; didwarn = 1; @@ -398,12 +658,17 @@ cache_lock_nonblock(struct namecache *nc cache_lock_nonblock(struct namecache *ncp) { thread_t td; + struct shadowinfo *shinf = ncp->nc_shadowinfo; KKASSERT(ncp->nc_refs != 0); + KKASSERT(shinf); + KKASSERT(shinf->sh_refs != 0); td = curthread; - if (ncp->nc_exlocks == 0) { - ncp->nc_exlocks = 1; - ncp->nc_locktd = td; + if (shinf->sh_exlocks == 0) { + int ref = 1; + + shinf->sh_exlocks = 1; + shinf->sh_locktd = td; /* * The vp associated with a locked ncp must be held * to prevent it from being recycled (which would @@ -411,8 +676,7 @@ cache_lock_nonblock(struct namecache *nc * * XXX loop on race for later MPSAFE work. 
@@ -398,12 +658,17 @@ cache_lock_nonblock(struct namecache *nc
 cache_lock_nonblock(struct namecache *ncp)
 {
         thread_t td;
+        struct shadowinfo *shinf = ncp->nc_shadowinfo;
 
         KKASSERT(ncp->nc_refs != 0);
+        KKASSERT(shinf);
+        KKASSERT(shinf->sh_refs != 0);
         td = curthread;
-        if (ncp->nc_exlocks == 0) {
-                ncp->nc_exlocks = 1;
-                ncp->nc_locktd = td;
+        if (shinf->sh_exlocks == 0) {
+                int ref = 1;
+
+                shinf->sh_exlocks = 1;
+                shinf->sh_locktd = td;
                 /*
                  * The vp associated with a locked ncp must be held
                  * to prevent it from being recycled (which would
@@ -411,8 +676,7 @@ cache_lock_nonblock(struct namecache *nc
                  *
                  * XXX loop on race for later MPSAFE work.
                  */
-                if (ncp->nc_vp)
-                        vhold(ncp->nc_vp);
+                cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
                 return(0);
         } else {
                 return(EWOULDBLOCK);
@@ -423,17 +687,45 @@ cache_unlock(struct namecache *ncp)
 cache_unlock(struct namecache *ncp)
 {
         thread_t td = curthread;
+        struct shadowinfo *shinf = ncp->nc_shadowinfo;
 
         KKASSERT(ncp->nc_refs > 0);
-        KKASSERT(ncp->nc_exlocks > 0);
-        KKASSERT(ncp->nc_locktd == td);
-        if (--ncp->nc_exlocks == 0) {
-                if (ncp->nc_vp)
-                        vdrop(ncp->nc_vp);
-                ncp->nc_locktd = NULL;
-                if (ncp->nc_flag & NCF_LOCKREQ) {
-                        ncp->nc_flag &= ~NCF_LOCKREQ;
-                        wakeup(ncp);
+        KKASSERT(shinf);
+        KKASSERT(shinf->sh_refs > 0);
+        KKASSERT(shinf->sh_exlocks > 0);
+        KKASSERT(shinf->sh_locktd == td);
+        if (shinf->sh_exlocks == 1) {
+                int ref = -1;
+                cache_group_walk(ncp, &vref_updater, SG_ALL, &ref);
+        }
+        if (--shinf->sh_exlocks == 0) {
+                shinf->sh_locktd = NULL;
+                if (shinf->sh_lockreq) {
+                        shinf->sh_lockreq = 0;
+                        wakeup(shinf);
+                }
+        }
+}
+
+/*
+ * Obtain locks on both uncp and lncp.
+ *
+ * On entry, uncp is assumed to be unlocked, and lncp is assumed to be
+ * locked.
+ *
+ * After this function returns, the caller is responsible for re-checking
+ * the state of lncp, which might have been unlocked temporarily.
+ */
+void
+cache_lock_two(struct namecache *uncp, struct namecache *lncp)
+{
+        if (cache_lock_nonblock(uncp) != 0) {
+                if (uncp > lncp)
+                        cache_lock(uncp);
+                else {
+                        cache_unlock(lncp);
+                        cache_lock(uncp);
+                        cache_lock(lncp);
                 }
         }
 }
@@ -453,7 +745,8 @@ cache_get_nonblock(struct namecache *ncp
 cache_get_nonblock(struct namecache *ncp)
 {
         /* XXX MP */
-        if (ncp->nc_exlocks == 0 || ncp->nc_locktd == curthread) {
+        if (ncp->nc_shadowinfo->sh_exlocks == 0 ||
+            ncp->nc_shadowinfo->sh_locktd == curthread) {
                 _cache_hold(ncp);
                 cache_lock(ncp);
                 return(0);
@@ -487,7 +780,7 @@ cache_setvp(struct namecache *ncp, struc
                 if (!TAILQ_EMPTY(&ncp->nc_list))
                         vhold(vp);
                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
-                if (ncp->nc_exlocks)
+                if (ncp->nc_shadowinfo->sh_exlocks)
                         vhold(vp);
 
                 /*
@@ -521,6 +814,8 @@ cache_settimeout(struct namecache *ncp,
                 ncp->nc_timeout = 1;
 }
 
+static int unresolver_updater(struct namecache *ncp, void *param);
+
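A usage note on cache_lock_two() above: when the nonblocking attempt on
uncp fails, the routine blocks in a fixed address-based order, dropping and
reacquiring lncp first if uncp sorts below it; two threads locking the same
pair from opposite ends therefore cannot deadlock. The price is that lncp
may be unlocked for a moment, so any state observed earlier must be
revalidated, which is exactly what cache_shadow_attach() does with its
NCF_UNRESOLVED check. A hypothetical call site (illustration only):

        cache_lock(lncp);
        /* ... lncp found in the desired, e.g. unresolved, state ... */
        cache_lock_two(uncp, lncp);     /* may cycle lncp's lock */
        if ((lncp->nc_flag & NCF_UNRESOLVED) == 0) {
                /* lncp changed while it was unlocked: back out or retry */
        }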
 /*
  * Disassociate the vnode or negative-cache association and mark a
  * namecache entry as unresolved again.  Note that the ncp is still
@@ -541,7 +836,25 @@ void
 void
 cache_setunresolved(struct namecache *ncp)
 {
+        struct namecache *nncp;
+
+        cache_group_walk(ncp, &unresolver_updater, SG_SUBTREE, ncp);
+
+        nncp = ncp->nc_shadow_next;
+        if (nncp)
+                cache_hold(nncp);
+        unresolver_updater(ncp, NULL);
+        if (nncp)
+                cache_put(nncp);
+}
+
+static int
+unresolver_updater(struct namecache *ncp, void *param)
+{
         struct vnode *vp;
+
+        if (ncp == param)
+                return(0);
 
         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
                 ncp->nc_flag |= NCF_UNRESOLVED;
@@ -563,13 +876,23 @@ cache_setunresolved(struct namecache *nc
                          */
                         if (!TAILQ_EMPTY(&ncp->nc_list))
                                 vdrop(vp);
-                        if (ncp->nc_exlocks)
+                        if (ncp->nc_shadowinfo->sh_exlocks)
                                 vdrop(vp);
                 } else {
                         TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
                         --numneg;
                 }
-        }
+
+                cache_shadow_detach(ncp);
+        }
+
+        if (ncp->nc_refs == 0) {
+                cache_hold(ncp);
+                cache_put(ncp);
+        }
+
+
+        return(0);
 }
 
 /*
@@ -619,7 +942,7 @@ cache_inval(struct namecache *ncp, int f
         struct namecache *nextkid;
         int rcnt = 0;
 
-        KKASSERT(ncp->nc_exlocks);
+        KKASSERT(ncp->nc_shadowinfo->sh_exlocks);
 
         cache_setunresolved(ncp);
         if (flags & CINV_DESTROY)
@@ -715,6 +1038,7 @@ restart:
  * XXX the disconnection could pose a problem, check code paths to make
  * sure any code that blocks can handle the parent being changed out from
  * under it.  Maybe we should lock the children (watch out for deadlocks) ?
+ * [UPDATE: attempt made to lock children, see in situ explanation]
  *
  * After we return the caller has the option of calling cache_setvp() if
  * the vnode of the new target ncp is known.
@@ -726,26 +1050,62 @@ cache_rename(struct namecache *fncp, str
 cache_rename(struct namecache *fncp, struct namecache *tncp)
 {
         struct namecache *scan;
-        int didwarn = 0;
-
+        int didwarn[] = { 0, 0 };
+
+        /* XXX should we rather assert that fncp != tncp here? */
+        if (fncp == tncp)
+                return;
+
+again:
         cache_setunresolved(fncp);
         cache_setunresolved(tncp);
+
+        /*
+         * It seems we need to unlock fncp before calling cache_inval():
+         * cache_inval() does a lot of lock/unlock/relock-ing (with tncp
+         * and its children), therefore keeping fncp locked might well
+         * lead to deadlock...
+         */
+        cache_unlock(fncp);
+
         while (cache_inval(tncp, CINV_CHILDREN) != 0) {
-                if (didwarn++ % 10 == 0) {
-                        printf("Warning: cache_rename: race during "
+                if (didwarn[0]++ % 10 == 0) {
+                        printf("Warning: cache_rename: race #1 during "
                                "rename %s->%s\n",
                                fncp->nc_name, tncp->nc_name);
                 }
                 tsleep(tncp, 0, "mvrace", hz / 10);
                 cache_setunresolved(tncp);
         }
+
+        cache_unlock(tncp);
+        cache_lock(fncp);
+
         while ((scan = TAILQ_FIRST(&fncp->nc_list)) != NULL) {
-                cache_hold(scan);
+                cache_unlock(fncp);
+                /*
+                 * We have to lock fncp's kids in order to unresolve
+                 * their shadow kids...
+                 */
+                cache_get(scan);
                 cache_unlink_parent(scan);
+                cache_group_walk(scan, &unresolver_updater, SG_SUBTREE, scan);
                 cache_link_parent(scan, tncp);
                 if (scan->nc_flag & NCF_HASHED)
                         cache_rehash(scan);
-                cache_drop(scan);
+                cache_put(scan);
+                cache_lock(fncp);
+        }
+
+        cache_lock_two(tncp, fncp);
+
+        if ((fncp->nc_flag & tncp->nc_flag & NCF_UNRESOLVED) == 0) {
+                if (didwarn[1]++ % 10 == 0) {
+                        printf("Warning: cache_rename: race #2 during "
+                               "rename %s->%s\n",
+                               fncp->nc_name, tncp->nc_name);
+                }
+                goto again;
         }
 }
 
@@ -1321,7 +1681,7 @@ cache_zap(struct namecache *ncp)
                         cache_drop(ncp);
                         return;
                 }
-                KKASSERT(par->nc_exlocks == 0);
+                KKASSERT(par->nc_shadowinfo->sh_exlocks == 0);
                 cache_lock(ncp);
         }
 done:
@@ -1417,7 +1777,7 @@ restart:
                 if (ncp->nc_timeout && 
                     (int)(ncp->nc_timeout - ticks) < 0 &&
                     (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
-                    ncp->nc_exlocks == 0
+                    ncp->nc_shadowinfo->sh_exlocks == 0
                 ) {
                         cache_zap(cache_get(ncp));
                         goto restart;
@@ -1738,6 +2098,7 @@ nchinit(void)
                 gd->gd_nchstats = &nchstats[i];
         }
         TAILQ_INIT(&ncneglist);
+        STAILQ_INIT(&shadowinfo_freeq);
         nchashtbl = hashinit(desiredvnodes*2, M_VFSCACHE, &nchash);
         nclockwarn = 1 * hz;
 }
diff -r e1d135a8f666 sys/sys/namecache.h
--- a/sys/sys/namecache.h       Sun Mar 26 07:56:54 2006 +0000
+++ b/sys/sys/namecache.h       Wed Mar 29 10:23:26 2006 +0200
@@ -70,7 +70,20 @@
 
 struct vnode;
 
+/*
+ * Auxiliary structure for locking namecache entries,
+ * either on their own or grouped into "shadow groups".
+ */
+struct shadowinfo {
+        STAILQ_ENTRY(shadowinfo) sh_entry;      /* entry for free list */
+        int     sh_exlocks;             /* namespace locking */
+        struct  thread *sh_locktd;      /* namespace locking */
+        int     sh_refs;                /* reference count */
+        uint8_t sh_lockreq :1;          /* lock request flag */
+};
+
 TAILQ_HEAD(namecache_list, namecache);
+LIST_HEAD(namecache_shadow_list, namecache);
 
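For orientation, the relations these fields settle into after a successful
cache_shadow_attach(ncp, sncp) can be written down as assertions; an
illustrative fragment with hypothetical entries, not part of the patch:

        KKASSERT(ncp->nc_shadowinfo == sncp->nc_shadowinfo);
        KKASSERT(ncp->nc_shadowinfo != &ncp->nc_shadowinfo_internal);
        KKASSERT(ncp->nc_shadowinfo->sh_refs >= 2);
        KKASSERT(ncp->nc_shadowheight == sncp->nc_shadowheight + 1);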
 /*
  * The namecache structure is used to manage the filesystem namespace.  Most
@@ -110,8 +123,12 @@ struct namecache {
         char    *nc_name;               /* Separately allocated seg name */
         int     nc_error;
         int     nc_timeout;             /* compared against ticks, or 0 */
-        int     nc_exlocks;             /* namespace locking */
-        struct  thread *nc_locktd;      /* namespace locking */
+        struct  shadowinfo *nc_shadowinfo;      /* namespace locking */
+        struct  shadowinfo nc_shadowinfo_internal; /* private locking information */
+        struct  namecache *nc_shadow_prev;      /* previous entry in shadow group */
+        struct  namecache *nc_shadow_next;      /* next entry in shadow group */
+        int     nc_shadowheight;        /* depth within shadow group */
+        struct  namecache *nc_shadowed; /* lower layer entry in layered fs */
         struct  mount *nc_mount;        /* associated mount for vopops */
         int64_t nc_fsmid;               /* filesystem modified id */
 };
@@ -127,7 +144,7 @@ typedef struct namecache *namecache_t;
 #define NCF_MOUNTPT     0x0008  /* mount point */
 #define NCF_ROOT        0x0010  /* namecache root (static) */
 #define NCF_HASHED      0x0020  /* namecache entry in hash table */
-#define NCF_LOCKREQ     0x0040
+#define NCF_UNUSED040   0x0040
 #define NCF_UNUSED080   0x0080
 #define NCF_ISSYMLINK   0x0100  /* represents a symlink */
 #define NCF_ISDIR       0x0200  /* represents a directory */
@@ -150,6 +167,9 @@ void cache_lock(struct namecache *ncp);
 void    cache_lock(struct namecache *ncp);
 int     cache_lock_nonblock(struct namecache *ncp);
 void    cache_unlock(struct namecache *ncp);
+void    cache_lock_two(struct namecache *uncp, struct namecache *lncp);
+int     cache_shadow_attach(struct namecache *ncp, struct namecache *sncp);
+void    cache_shadow_detach(struct namecache *ncp);
 void    cache_setvp(struct namecache *ncp, struct vnode *vp);
 void    cache_settimeout(struct namecache *ncp, int nticks);
 void    cache_setunresolved(struct namecache *ncp);
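Finally, a sketch of how the pieces are meant to compose in a layered
filesystem's resolve path. Everything below is hypothetical (the function
name, the orientation of upper versus lower entry, and the assumption that
the layers share vnodes); no such consumer is part of this patch:

/*
 * Hypothetical consumer, illustration only: resolve an upper-layer
 * entry by shadowing the already-resolved lower-layer one.
 */
static int
layerfs_resolve_sketch(struct namecache *ncp, struct namecache *lncp)
{
        int error;

        /* ncp unlocked, lncp locked: cache_shadow_attach()'s contract */
        error = cache_shadow_attach(ncp, lncp);
        if (error)
                return (error); /* ELOOP or EEXIST; both stay locked */

        ncp->nc_shadowed = lncp;        /* remember the lower-layer entry */
        cache_setvp(ncp, lncp->nc_vp);  /* assumes shared vnodes */

        /*
         * From here on the group locks and unlocks as a unit, and
         * cache_setunresolved() on lncp unresolves ncp as well,
         * detaching the entries from the group again.
         */
        return (0);
}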