Commit 86d71014 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge git://

* git:// (87 commits)
  NFSv4: Disallow 'mount -t nfs4 -overs=2' and 'mount -t nfs4 -overs=3'
  NFS: Allow the "nfs" file system type to support NFSv4
  NFS: Move details of nfs4_get_sb() to a helper
  NFS: Refactor NFSv4 text-based mount option validation
  NFS: Mount option parser should detect missing "port="
  NFS: out of date comment regarding O_EXCL above nfs3_proc_create()
  NFS: Handle a zero-length auth flavor list
  SUNRPC: Ensure that sunrpc gets initialised before nfs, lockd, etc...
  nfs: fix compile error in rpc_pipefs.h
  nfs: Remove reference to generic_osync_inode from a comment
  SUNRPC: cache must take a reference to the cache detail's module on open()
  NFS: Use the DNS resolver in the mount code.
  NFS: Add a dns resolver for use with NFSv4 referrals and migration
  SUNRPC: Fix a typo in cache_pipefs_files
  nfs: nfs4xdr: optimize low level decoding
  nfs: nfs4xdr: get rid of READ_BUF
  nfs: nfs4xdr: simplify decode_exchange_id by reusing decode_opaque_inline
  nfs: nfs4xdr: get rid of COPYMEM
  nfs: nfs4xdr: introduce decode_sessionid helper
  nfs: nfs4xdr: introduce decode_verifier helper
parents 86373435 ab3bbaa8
The NFS client
The NFS version 2 protocol was first documented in RFC1094 (March 1989).
Since then two more major releases of NFS have been published, with NFSv3
being documented in RFC1813 (June 1995), and NFSv4 in RFC3530 (April
The Linux NFS client currently supports all the above published versions,
and work is in progress on adding support for minor version 1 of the NFSv4
The purpose of this document is to provide information on some of the
upcall interfaces that are used in order to provide the NFS client with
some of the information that it requires in order to fully comply with
the NFS spec.
The DNS resolver
NFSv4 allows for one server to refer the NFS client to data that has been
migrated onto another server by means of the special "fs_locations"
attribute. See
The fs_locations information can take the form of either an ip address and
a path, or a DNS hostname and a path. The latter requires the NFS client to
do a DNS lookup in order to mount the new volume, and hence the need for an
upcall to allow userland to provide this service.
Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual
/var/lib/nfs/rpc_pipefs, the upcall consists of the following steps:
(1) The process checks the dns_resolve cache to see if it contains a
valid entry. If so, it returns that entry and exits.
(2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent'
(may be changed using the 'nfs.cache_getent' kernel boot parameter)
is run, with two arguments:
- the cache name, "dns_resolve"
- the hostname to resolve
(3) After looking up the corresponding ip address, the helper script
writes the result into the rpc_pipefs pseudo-file
in the following (text) format:
"<ip address> <hostname> <ttl>\n"
Where <ip address> is in the usual IPv4 (123.456.78.90) or IPv6
(ffee:ddcc:bbaa:9988:7766:5544:3322:1100, ffee::1100, ...) format.
<hostname> is identical to the second argument of the helper
script, and <ttl> is the 'time to live' of this cache entry (in
units of seconds).
Note: If <ip address> is invalid, say the string "0", then a negative
entry is created, which will cause the kernel to treat the hostname
as having no valid DNS translation.
A basic sample /sbin/nfs_cache_getent
echo "Usage: $0 cache_name entry_name"
exit 1
[ $# -lt 2 ] && die
case "${cachename}" in
result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )"
[ -z "${result}" ] && result="0"
echo "${result} ${name} ${ttl}" >${cache_path}
......@@ -1503,6 +1503,14 @@ and is between 256 and 4096 characters. It is defined in the file
[NFS] set the TCP port on which the NFSv4 callback
channel should listen.
[NFS] sets the pathname to the program which is used
to update the NFS client cache entries.
[NFS] sets the timeout after which an attempt to
update a cache entry is deemed to have failed.
[NFS] set the maximum lifetime for idmapper cache
......@@ -2395,6 +2403,18 @@ and is between 256 and 4096 characters. It is defined in the file
stifb= [HW]
Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
SunRPC servers often require that client requests
originate from a privileged port (i.e. a port in the
range 0 < portnr < 1024).
An administrator who wishes to reserve some of these
ports for other uses may adjust the range that the
kernel's sunrpc client considers to be privileged
using these two parameters to set the minimum and
maximum port values.
Control how the NFS server code allocates CPUs to
......@@ -2411,6 +2431,15 @@ and is between 256 and 4096 characters. It is defined in the file
pernode one pool for each NUMA node (equivalent
to global on non-NUMA machines)
Sets the upper limit on the number of simultaneous
RPC calls that can be sent from the client to a
server. Increasing these values may allow you to
improve throughput, but will also increase the
amount of memory reserved for use by the client.
swiotlb= [IA-64] Number of I/O TLB slabs
switches= [HW,M68k]
......@@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
return hash & (NLM_HOST_NRHASH - 1);
static void nlm_clear_port(struct sockaddr *sap)
switch (sap->sa_family) {
case AF_INET:
((struct sockaddr_in *)sap)->sin_port = 0;
case AF_INET6:
((struct sockaddr_in6 *)sap)->sin6_port = 0;
* Common host lookup routine for server & client
......@@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
host->h_addrbuf = nsm->sm_addrbuf;
memcpy(nlm_addr(host), ni->sap, ni->salen);
host->h_addrlen = ni->salen;
rpc_set_port(nlm_addr(host), 0);
memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
host->h_version = ni->version;
host->h_proto = ni->protocol;
......@@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
return (struct sockaddr *)&nsm->sm_addr;
static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
const size_t len)
const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
const size_t len)
const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
if (ipv6_addr_v4mapped(&sin6->sin6_addr))
snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
else if (sin6->sin6_scope_id != 0)
snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
snprintf(buf, len, "%pI6", &sin6->sin6_addr);
static void nsm_display_address(const struct sockaddr *sap,
char *buf, const size_t len)
switch (sap->sa_family) {
case AF_INET:
nsm_display_ipv4_address(sap, buf, len);
case AF_INET6:
nsm_display_ipv6_address(sap, buf, len);
snprintf(buf, len, "unsupported address family");
static struct rpc_clnt *nsm_create(void)
struct sockaddr_in sin = {
......@@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
memcpy(nsm_addr(new), sap, salen);
new->sm_addrlen = salen;
nsm_display_address((const struct sockaddr *)&new->sm_addr,
new->sm_addrbuf, sizeof(new->sm_addrbuf));
if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
sizeof(new->sm_addrbuf)) == 0)
(void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
"unsupported address family");
memcpy(new->sm_name, hostname, hostname_len);
new->sm_name[hostname_len] = '\0';
......@@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
direct.o pagelist.o proc.o read.o symlink.o unlink.o \
write.o namespace.o mount_clnt.o
write.o namespace.o mount_clnt.o \
dns_resolve.o cache_lib.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
* linux/fs/nfs/cache_lib.c
* Helper routines for the NFS client caches
* Copyright (c) 2009 Trond Myklebust <>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include "cache_lib.h"
static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
module_param_string(cache_getent, nfs_cache_getent_prog,
sizeof(nfs_cache_getent_prog), 0600);
MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
"the cache upcall is assumed to have failed");
int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
static char *envp[] = { "HOME=/",
char *argv[] = {
int ret = -EACCES;
if (nfs_cache_getent_prog[0] == '\0')
goto out;
ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
* Disable the upcall mechanism if we're getting an ENOENT or
* EACCES error. The admin can re-enable it on the fly by using
* sysfs to set the 'cache_getent' parameter once the problem
* has been fixed.
if (ret == -ENOENT || ret == -EACCES)
nfs_cache_getent_prog[0] = '\0';
return ret > 0 ? 0 : ret;
* Deferred request handling
void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
if (atomic_dec_and_test(&dreq->count))
static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
struct nfs_cache_defer_req *dreq;
dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
struct nfs_cache_defer_req *dreq;
dreq = container_of(req, struct nfs_cache_defer_req, req);
dreq->deferred_req.revisit = nfs_dns_cache_revisit;
return &dreq->deferred_req;
struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
struct nfs_cache_defer_req *dreq;
dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
if (dreq) {
atomic_set(&dreq->count, 1);
dreq->req.defer = nfs_dns_cache_defer;
return dreq;
int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
if (wait_for_completion_timeout(&dreq->completion,
nfs_cache_getent_timeout * HZ) == 0)
return -ETIMEDOUT;
return 0;
int nfs_cache_register(struct cache_detail *cd)
struct nameidata nd;
struct vfsmount *mnt;
int ret;
mnt = rpc_get_mount();
if (IS_ERR(mnt))
return PTR_ERR(mnt);
ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd);
if (ret)
goto err;
ret = sunrpc_cache_register_pipefs(nd.path.dentry,
cd->name, 0600, cd);
if (!ret)
return ret;
return ret;
void nfs_cache_unregister(struct cache_detail *cd)
* Helper routines for the NFS client caches
* Copyright (c) 2009 Trond Myklebust <>
#include <linux/completion.h>
#include <linux/sunrpc/cache.h>
#include <asm/atomic.h>
* Deferred request handling
struct nfs_cache_defer_req {
struct cache_req req;
struct cache_deferred_req deferred_req;
struct completion completion;
atomic_t count;
extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
extern int nfs_cache_register(struct cache_detail *cd);
extern void nfs_cache_unregister(struct cache_detail *cd);
......@@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program;
unsigned int nfs_callback_set_tcpport;
unsigned short nfs_callback_tcpport;
unsigned short nfs_callback_tcpport6;
static const int nfs_set_port_min = 0;
static const int nfs_set_port_max = 65535;
static int param_set_port(const char *val, struct kernel_param *kp)
static int param_set_portnr(const char *val, struct kernel_param *kp)
char *endp;
int num = simple_strtol(val, &endp, 0);
if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max)
unsigned long num;
int ret;
if (!val)
return -EINVAL;
ret = strict_strtoul(val, 0, &num);
return -EINVAL;
*((int *)kp->arg) = num;
*((unsigned int *)kp->arg) = num;
return 0;
module_param_call(callback_tcpport, param_set_port, param_get_int,
&nfs_callback_set_tcpport, 0644);
static int param_get_portnr(char *buffer, struct kernel_param *kp)
return param_get_uint(buffer, kp);
#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
* This is the NFSv4 callback kernel thread.
......@@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
server->options = data->options;
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
......@@ -1075,10 +1078,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
(unsigned long long) server->fsid.major,
(unsigned long long) server->fsid.minor);
list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
list_add_tail(&server->master_link, &nfs_volume_list);
......@@ -1275,7 +1274,7 @@ static int nfs4_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
server->caps |= NFS_CAP_ATOMIC_OPEN;
server->options = data->options;
/* Get a client record */
......@@ -1360,10 +1359,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
server->namelen = NFS4_MAXNAMLEN;
list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
list_add_tail(&server->master_link, &nfs_volume_list);
......@@ -1401,7 +1396,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
/* Initialise the client representation from the parent server */
nfs_server_copy_userdata(server, parent_server);
server->caps |= NFS_CAP_ATOMIC_OPEN;
/* Get a client representation.
* Note: NFSv4 always uses TCP, */
......@@ -934,9 +934,6 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
* back into its cache. We let the server do generic write
* parameter checking and report problems.
* We also avoid an unnecessary invocation of generic_osync_inode(),
* as it is fairly meaningless to sync the metadata of an NFS file.
* We eliminate local atime updates, see direct read above.
* We avoid unnecessary page cache invalidations for normal cached
* linux/fs/nfs/dns_resolve.c
* Copyright (c) 2009 Trond Myklebust <>
* Resolves DNS hostnames into valid ip addresses
#include <linux/hash.h>
#include <linux/string.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/seq_file.h>
#include <linux/inet.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/svcauth.h>
#include "dns_resolve.h"
#include "cache_lib.h"
static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
struct nfs_dns_ent {
struct cache_head h;
char *hostname;
size_t namelen;
struct sockaddr_storage addr;
size_t addrlen;
static void nfs_dns_ent_init(struct cache_head *cnew,
struct cache_head *ckey)
struct nfs_dns_ent *new;
struct nfs_dns_ent *key;
new = container_of(cnew, struct nfs_dns_ent, h);
key = container_of(ckey, struct nfs_dns_ent, h);
new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
if (new->hostname) {
new->namelen = key->namelen;
memcpy(&new->addr, &key->addr, key->addrlen);
new->addrlen = key->addrlen;
} else {
new->namelen = 0;
new->addrlen = 0;
static void nfs_dns_ent_put(struct kref *ref)
struct nfs_dns_ent *item;
item = container_of(ref, struct nfs_dns_ent, h.ref);
static struct cache_head *nfs_dns_ent_alloc(void)
struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
if (item != NULL) {
item->hostname = NULL;
item->namelen = 0;
item->addrlen = 0;
return &item->h;
return NULL;
static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
return hash_str(key->hostname, NFS_DNS_HASHBITS);
static void nfs_dns_request(struct cache_detail *cd,
struct cache_head *ch,
char **bpp, int *blen)
struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
qword_add(bpp, blen, key->hostname);
(*bpp)[-1] = '\n';
static int nfs_dns_upcall(struct cache_detail *cd,
struct cache_head *ch)
struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
int ret;
ret = nfs_cache_upcall(cd, key->hostname);
if (ret)
ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
return ret;
static int nfs_dns_match(struct cache_head *ca,
struct cache_head *cb)