fs/ceph/super.c at v3.2-rc6 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / ceph / super.c
at v3.2-rc6 937 lines 23 kB view raw
  1
  2#include <linux/ceph/ceph_debug.h>
  3
  4#include <linux/backing-dev.h>
  5#include <linux/ctype.h>
  6#include <linux/fs.h>
  7#include <linux/inet.h>
  8#include <linux/in6.h>
  9#include <linux/module.h>
 10#include <linux/mount.h>
 11#include <linux/parser.h>
 12#include <linux/sched.h>
 13#include <linux/seq_file.h>
 14#include <linux/slab.h>
 15#include <linux/statfs.h>
 16#include <linux/string.h>
 17
 18#include "super.h"
 19#include "mds_client.h"
 20
 21#include <linux/ceph/decode.h>
 22#include <linux/ceph/mon_client.h>
 23#include <linux/ceph/auth.h>
 24#include <linux/ceph/debugfs.h>
 25
 26/*
 27 * Ceph superblock operations
 28 *
 29 * Handle the basics of mounting, unmounting.
 30 */
 31
 32/*
 33 * super ops
 34 */
 35static void ceph_put_super(struct super_block *s)
 36{
 37	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
 38
 39	dout("put_super\n");
 40	ceph_mdsc_close_sessions(fsc->mdsc);
 41
 42	/*
 43	 * ensure we release the bdi before put_anon_super releases
 44	 * the device name.
 45	 */
 46	if (s->s_bdi == &fsc->backing_dev_info) {
 47		bdi_unregister(&fsc->backing_dev_info);
 48		s->s_bdi = NULL;
 49	}
 50
 51	return;
 52}
 53
 54static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 55{
 56	struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
 57	struct ceph_monmap *monmap = fsc->client->monc.monmap;
 58	struct ceph_statfs st;
 59	u64 fsid;
 60	int err;
 61
 62	dout("statfs\n");
 63	err = ceph_monc_do_statfs(&fsc->client->monc, &st);
 64	if (err < 0)
 65		return err;
 66
 67	/* fill in kstatfs */
 68	buf->f_type = CEPH_SUPER_MAGIC;  /* ?? */
 69
 70	/*
 71	 * express utilization in terms of large blocks to avoid
 72	 * overflow on 32-bit machines.
 73	 */
 74	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
 75	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
 76	buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
 77	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
 78
 79	buf->f_files = le64_to_cpu(st.num_objects);
 80	buf->f_ffree = -1;
 81	buf->f_namelen = NAME_MAX;
 82	buf->f_frsize = PAGE_CACHE_SIZE;
 83
 84	/* leave fsid little-endian, regardless of host endianness */
 85	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
 86	buf->f_fsid.val[0] = fsid & 0xffffffff;
 87	buf->f_fsid.val[1] = fsid >> 32;
 88
 89	return 0;
 90}
 91
 92
 93static int ceph_sync_fs(struct super_block *sb, int wait)
 94{
 95	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 96
 97	if (!wait) {
 98		dout("sync_fs (non-blocking)\n");
 99		ceph_flush_dirty_caps(fsc->mdsc);
100		dout("sync_fs (non-blocking) done\n");
101		return 0;
102	}
103
104	dout("sync_fs (blocking)\n");
105	ceph_osdc_sync(&fsc->client->osdc);
106	ceph_mdsc_sync(fsc->mdsc);
107	dout("sync_fs (blocking) done\n");
108	return 0;
109}
110
111/*
112 * mount options
113 */
/*
 * Mount option tokens.  The ordering matters: parse_fsopt_token()
 * classifies a token as int / string / flag by comparing it against
 * the Opt_last_int and Opt_last_string markers.
 */
enum {
	/* options taking an integer argument */
	Opt_wsize,
	Opt_rsize,
	Opt_rasize,
	Opt_caps_wanted_delay_min,
	Opt_caps_wanted_delay_max,
	Opt_cap_release_safety,
	Opt_readdir_max_entries,
	Opt_readdir_max_bytes,
	Opt_congestion_kb,
	Opt_last_int,		/* marker: int args above */

	/* options taking a string argument */
	Opt_snapdirname,
	Opt_last_string,	/* marker: string args above */

	/* options without an argument */
	Opt_dirstat,
	Opt_nodirstat,
	Opt_rbytes,
	Opt_norbytes,
	Opt_noasyncreaddir,
	Opt_ino32,
};
136
137static match_table_t fsopt_tokens = {
138	{Opt_wsize, "wsize=%d"},
139	{Opt_rsize, "rsize=%d"},
140	{Opt_rasize, "rasize=%d"},
141	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
142	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
143	{Opt_cap_release_safety, "cap_release_safety=%d"},
144	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
145	{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
146	{Opt_congestion_kb, "write_congestion_kb=%d"},
147	/* int args above */
148	{Opt_snapdirname, "snapdirname=%s"},
149	/* string args above */
150	{Opt_dirstat, "dirstat"},
151	{Opt_nodirstat, "nodirstat"},
152	{Opt_rbytes, "rbytes"},
153	{Opt_norbytes, "norbytes"},
154	{Opt_noasyncreaddir, "noasyncreaddir"},
155	{Opt_ino32, "ino32"},
156	{-1, NULL}
157};
158
159static int parse_fsopt_token(char *c, void *private)
160{
161	struct ceph_mount_options *fsopt = private;
162	substring_t argstr[MAX_OPT_ARGS];
163	int token, intval, ret;
164
165	token = match_token((char *)c, fsopt_tokens, argstr);
166	if (token < 0)
167		return -EINVAL;
168
169	if (token < Opt_last_int) {
170		ret = match_int(&argstr[0], &intval);
171		if (ret < 0) {
172			pr_err("bad mount option arg (not int) "
173			       "at '%s'\n", c);
174			return ret;
175		}
176		dout("got int token %d val %d\n", token, intval);
177	} else if (token > Opt_last_int && token < Opt_last_string) {
178		dout("got string token %d val %s\n", token,
179		     argstr[0].from);
180	} else {
181		dout("got token %d\n", token);
182	}
183
184	switch (token) {
185	case Opt_snapdirname:
186		kfree(fsopt->snapdir_name);
187		fsopt->snapdir_name = kstrndup(argstr[0].from,
188					       argstr[0].to-argstr[0].from,
189					       GFP_KERNEL);
190		if (!fsopt->snapdir_name)
191			return -ENOMEM;
192		break;
193
194		/* misc */
195	case Opt_wsize:
196		fsopt->wsize = intval;
197		break;
198	case Opt_rsize:
199		fsopt->rsize = intval;
200		break;
201	case Opt_rasize:
202		fsopt->rasize = intval;
203		break;
204	case Opt_caps_wanted_delay_min:
205		fsopt->caps_wanted_delay_min = intval;
206		break;
207	case Opt_caps_wanted_delay_max:
208		fsopt->caps_wanted_delay_max = intval;
209		break;
210	case Opt_readdir_max_entries:
211		fsopt->max_readdir = intval;
212		break;
213	case Opt_readdir_max_bytes:
214		fsopt->max_readdir_bytes = intval;
215		break;
216	case Opt_congestion_kb:
217		fsopt->congestion_kb = intval;
218		break;
219	case Opt_dirstat:
220		fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
221		break;
222	case Opt_nodirstat:
223		fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
224		break;
225	case Opt_rbytes:
226		fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
227		break;
228	case Opt_norbytes:
229		fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
230		break;
231	case Opt_noasyncreaddir:
232		fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
233		break;
234	case Opt_ino32:
235		fsopt->flags |= CEPH_MOUNT_OPT_INO32;
236		break;
237	default:
238		BUG_ON(token);
239	}
240	return 0;
241}
242
243static void destroy_mount_options(struct ceph_mount_options *args)
244{
245	dout("destroy_mount_options %p\n", args);
246	kfree(args->snapdir_name);
247	kfree(args);
248}
249
/*
 * strcmp() variant that tolerates NULL arguments.
 *
 * Two NULLs compare equal.  If only @s2 is NULL the result is -1, if
 * only @s1 is NULL the result is 1.  Otherwise this is plain strcmp().
 */
static int strcmp_null(const char *s1, const char *s2)
{
	if (s1 && s2)
		return strcmp(s1, s2);

	/* at least one side is NULL from here on */
	if (s1)
		return -1;
	return s2 ? 1 : 0;
}
260
261static int compare_mount_options(struct ceph_mount_options *new_fsopt,
262				 struct ceph_options *new_opt,
263				 struct ceph_fs_client *fsc)
264{
265	struct ceph_mount_options *fsopt1 = new_fsopt;
266	struct ceph_mount_options *fsopt2 = fsc->mount_options;
267	int ofs = offsetof(struct ceph_mount_options, snapdir_name);
268	int ret;
269
270	ret = memcmp(fsopt1, fsopt2, ofs);
271	if (ret)
272		return ret;
273
274	ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
275	if (ret)
276		return ret;
277
278	return ceph_compare_options(new_opt, fsc->client);
279}
280
281static int parse_mount_options(struct ceph_mount_options **pfsopt,
282			       struct ceph_options **popt,
283			       int flags, char *options,
284			       const char *dev_name,
285			       const char **path)
286{
287	struct ceph_mount_options *fsopt;
288	const char *dev_name_end;
289	int err = -ENOMEM;
290
291	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
292	if (!fsopt)
293		return -ENOMEM;
294
295	dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
296
297	fsopt->sb_flags = flags;
298	fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
299
300	fsopt->rsize = CEPH_RSIZE_DEFAULT;
301	fsopt->rasize = CEPH_RASIZE_DEFAULT;
302	fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
303	fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
304	fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
305	fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
306	fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
307	fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
308	fsopt->congestion_kb = default_congestion_kb();
309
310	/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
311	err = -EINVAL;
312	if (!dev_name)
313		goto out;
314	*path = strstr(dev_name, ":/");
315	if (*path == NULL) {
316		pr_err("device name is missing path (no :/ in %s)\n",
317				dev_name);
318		goto out;
319	}
320	dev_name_end = *path;
321	dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
322
323	/* path on server */
324	*path += 2;
325	dout("server path '%s'\n", *path);
326
327	err = ceph_parse_options(popt, options, dev_name, dev_name_end,
328				 parse_fsopt_token, (void *)fsopt);
329	if (err)
330		goto out;
331
332	/* success */
333	*pfsopt = fsopt;
334	return 0;
335
336out:
337	destroy_mount_options(fsopt);
338	return err;
339}
340
341/**
342 * ceph_show_options - Show mount options in /proc/mounts
343 * @m: seq_file to write to
344 * @mnt: mount descriptor
345 */
346static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
347{
348	struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
349	struct ceph_mount_options *fsopt = fsc->mount_options;
350	struct ceph_options *opt = fsc->client->options;
351
352	if (opt->flags & CEPH_OPT_FSID)
353		seq_printf(m, ",fsid=%pU", &opt->fsid);
354	if (opt->flags & CEPH_OPT_NOSHARE)
355		seq_puts(m, ",noshare");
356	if (opt->flags & CEPH_OPT_NOCRC)
357		seq_puts(m, ",nocrc");
358
359	if (opt->name)
360		seq_printf(m, ",name=%s", opt->name);
361	if (opt->key)
362		seq_puts(m, ",secret=<hidden>");
363
364	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
365		seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
366	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
367		seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
368	if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
369		seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
370	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
371		seq_printf(m, ",osdkeepalivetimeout=%d",
372			   opt->osd_keepalive_timeout);
373
374	if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
375		seq_puts(m, ",dirstat");
376	if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
377		seq_puts(m, ",norbytes");
378	if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
379		seq_puts(m, ",noasyncreaddir");
380
381	if (fsopt->wsize)
382		seq_printf(m, ",wsize=%d", fsopt->wsize);
383	if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
384		seq_printf(m, ",rsize=%d", fsopt->rsize);
385	if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
386		seq_printf(m, ",rasize=%d", fsopt->rasize);
387	if (fsopt->congestion_kb != default_congestion_kb())
388		seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
389	if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
390		seq_printf(m, ",caps_wanted_delay_min=%d",
391			 fsopt->caps_wanted_delay_min);
392	if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
393		seq_printf(m, ",caps_wanted_delay_max=%d",
394			   fsopt->caps_wanted_delay_max);
395	if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
396		seq_printf(m, ",cap_release_safety=%d",
397			   fsopt->cap_release_safety);
398	if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
399		seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
400	if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
401		seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
402	if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
403		seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
404	return 0;
405}
406
407/*
408 * handle any mon messages the standard library doesn't understand.
409 * return error if we don't either.
410 */
411static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
412{
413	struct ceph_fs_client *fsc = client->private;
414	int type = le16_to_cpu(msg->hdr.type);
415
416	switch (type) {
417	case CEPH_MSG_MDS_MAP:
418		ceph_mdsc_handle_map(fsc->mdsc, msg);
419		return 0;
420
421	default:
422		return -1;
423	}
424}
425
426/*
427 * create a new fs client
428 */
429static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
430					struct ceph_options *opt)
431{
432	struct ceph_fs_client *fsc;
433	const unsigned supported_features =
434		CEPH_FEATURE_FLOCK |
435		CEPH_FEATURE_DIRLAYOUTHASH;
436	const unsigned required_features = 0;
437	int err = -ENOMEM;
438
439	fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
440	if (!fsc)
441		return ERR_PTR(-ENOMEM);
442
443	fsc->client = ceph_create_client(opt, fsc, supported_features,
444					 required_features);
445	if (IS_ERR(fsc->client)) {
446		err = PTR_ERR(fsc->client);
447		goto fail;
448	}
449	fsc->client->extra_mon_dispatch = extra_mon_dispatch;
450	fsc->client->monc.want_mdsmap = 1;
451
452	fsc->mount_options = fsopt;
453
454	fsc->sb = NULL;
455	fsc->mount_state = CEPH_MOUNT_MOUNTING;
456
457	atomic_long_set(&fsc->writeback_count, 0);
458
459	err = bdi_init(&fsc->backing_dev_info);
460	if (err < 0)
461		goto fail_client;
462
463	err = -ENOMEM;
464	/*
465	 * The number of concurrent works can be high but they don't need
466	 * to be processed in parallel, limit concurrency.
467	 */
468	fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
469	if (fsc->wb_wq == NULL)
470		goto fail_bdi;
471	fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
472	if (fsc->pg_inv_wq == NULL)
473		goto fail_wb_wq;
474	fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
475	if (fsc->trunc_wq == NULL)
476		goto fail_pg_inv_wq;
477
478	/* set up mempools */
479	err = -ENOMEM;
480	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
481			      fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
482	if (!fsc->wb_pagevec_pool)
483		goto fail_trunc_wq;
484
485	/* caps */
486	fsc->min_caps = fsopt->max_readdir;
487
488	return fsc;
489
490fail_trunc_wq:
491	destroy_workqueue(fsc->trunc_wq);
492fail_pg_inv_wq:
493	destroy_workqueue(fsc->pg_inv_wq);
494fail_wb_wq:
495	destroy_workqueue(fsc->wb_wq);
496fail_bdi:
497	bdi_destroy(&fsc->backing_dev_info);
498fail_client:
499	ceph_destroy_client(fsc->client);
500fail:
501	kfree(fsc);
502	return ERR_PTR(err);
503}
504
505static void destroy_fs_client(struct ceph_fs_client *fsc)
506{
507	dout("destroy_fs_client %p\n", fsc);
508
509	destroy_workqueue(fsc->wb_wq);
510	destroy_workqueue(fsc->pg_inv_wq);
511	destroy_workqueue(fsc->trunc_wq);
512
513	bdi_destroy(&fsc->backing_dev_info);
514
515	mempool_destroy(fsc->wb_pagevec_pool);
516
517	destroy_mount_options(fsc->mount_options);
518
519	ceph_fs_debugfs_cleanup(fsc);
520
521	ceph_destroy_client(fsc->client);
522
523	kfree(fsc);
524	dout("destroy_fs_client %p done\n", fsc);
525}
526
527/*
528 * caches
529 */
/* slab caches, created by init_caches() and freed by destroy_caches() */
struct kmem_cache *ceph_inode_cachep;	/* struct ceph_inode_info */
struct kmem_cache *ceph_cap_cachep;	/* struct ceph_cap */
struct kmem_cache *ceph_dentry_cachep;	/* struct ceph_dentry_info */
struct kmem_cache *ceph_file_cachep;	/* struct ceph_file_info */
534
535static void ceph_inode_init_once(void *foo)
536{
537	struct ceph_inode_info *ci = foo;
538	inode_init_once(&ci->vfs_inode);
539}
540
541static int __init init_caches(void)
542{
543	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
544				      sizeof(struct ceph_inode_info),
545				      __alignof__(struct ceph_inode_info),
546				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
547				      ceph_inode_init_once);
548	if (ceph_inode_cachep == NULL)
549		return -ENOMEM;
550
551	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
552				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
553	if (ceph_cap_cachep == NULL)
554		goto bad_cap;
555
556	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
557					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
558	if (ceph_dentry_cachep == NULL)
559		goto bad_dentry;
560
561	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
562				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
563	if (ceph_file_cachep == NULL)
564		goto bad_file;
565
566	return 0;
567
568bad_file:
569	kmem_cache_destroy(ceph_dentry_cachep);
570bad_dentry:
571	kmem_cache_destroy(ceph_cap_cachep);
572bad_cap:
573	kmem_cache_destroy(ceph_inode_cachep);
574	return -ENOMEM;
575}
576
577static void destroy_caches(void)
578{
579	kmem_cache_destroy(ceph_inode_cachep);
580	kmem_cache_destroy(ceph_cap_cachep);
581	kmem_cache_destroy(ceph_dentry_cachep);
582	kmem_cache_destroy(ceph_file_cachep);
583}
584
585
586/*
587 * ceph_umount_begin - initiate forced umount.  Tear down down the
588 * mount, skipping steps that may hang while waiting for server(s).
589 */
590static void ceph_umount_begin(struct super_block *sb)
591{
592	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
593
594	dout("ceph_umount_begin - starting forced umount\n");
595	if (!fsc)
596		return;
597	fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
598	return;
599}
600
601static const struct super_operations ceph_super_ops = {
602	.alloc_inode	= ceph_alloc_inode,
603	.destroy_inode	= ceph_destroy_inode,
604	.write_inode    = ceph_write_inode,
605	.sync_fs        = ceph_sync_fs,
606	.put_super	= ceph_put_super,
607	.show_options   = ceph_show_options,
608	.statfs		= ceph_statfs,
609	.umount_begin   = ceph_umount_begin,
610};
611
612/*
613 * Bootstrap mount by opening the root directory.  Note the mount
614 * @started time from caller, and time out if this takes too long.
615 */
616static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
617				       const char *path,
618				       unsigned long started)
619{
620	struct ceph_mds_client *mdsc = fsc->mdsc;
621	struct ceph_mds_request *req = NULL;
622	int err;
623	struct dentry *root;
624
625	/* open dir */
626	dout("open_root_inode opening '%s'\n", path);
627	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
628	if (IS_ERR(req))
629		return ERR_CAST(req);
630	req->r_path1 = kstrdup(path, GFP_NOFS);
631	req->r_ino1.ino = CEPH_INO_ROOT;
632	req->r_ino1.snap = CEPH_NOSNAP;
633	req->r_started = started;
634	req->r_timeout = fsc->client->options->mount_timeout * HZ;
635	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
636	req->r_num_caps = 2;
637	err = ceph_mdsc_do_request(mdsc, NULL, req);
638	if (err == 0) {
639		dout("open_root_inode success\n");
640		if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
641		    fsc->sb->s_root == NULL) {
642			root = d_alloc_root(req->r_target_inode);
643			ceph_init_dentry(root);
644		} else {
645			root = d_obtain_alias(req->r_target_inode);
646		}
647		req->r_target_inode = NULL;
648		dout("open_root_inode success, root dentry is %p\n", root);
649	} else {
650		root = ERR_PTR(err);
651	}
652	ceph_mdsc_put_request(req);
653	return root;
654}
655
656
657
658
659/*
660 * mount: join the ceph cluster, and open root directory.
661 */
662static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
663		      const char *path)
664{
665	int err;
666	unsigned long started = jiffies;  /* note the start time */
667	struct dentry *root;
668	int first = 0;   /* first vfsmount for this super_block */
669
670	dout("mount start\n");
671	mutex_lock(&fsc->client->mount_mutex);
672
673	err = __ceph_open_session(fsc->client, started);
674	if (err < 0)
675		goto out;
676
677	dout("mount opening root\n");
678	root = open_root_dentry(fsc, "", started);
679	if (IS_ERR(root)) {
680		err = PTR_ERR(root);
681		goto out;
682	}
683	if (fsc->sb->s_root) {
684		dput(root);
685	} else {
686		fsc->sb->s_root = root;
687		first = 1;
688
689		err = ceph_fs_debugfs_init(fsc);
690		if (err < 0)
691			goto fail;
692	}
693
694	if (path[0] == 0) {
695		dget(root);
696	} else {
697		dout("mount opening base mountpoint\n");
698		root = open_root_dentry(fsc, path, started);
699		if (IS_ERR(root)) {
700			err = PTR_ERR(root);
701			goto fail;
702		}
703	}
704
705	fsc->mount_state = CEPH_MOUNT_MOUNTED;
706	dout("mount success\n");
707	mutex_unlock(&fsc->client->mount_mutex);
708	return root;
709
710out:
711	mutex_unlock(&fsc->client->mount_mutex);
712	return ERR_PTR(err);
713
714fail:
715	if (first) {
716		dput(fsc->sb->s_root);
717		fsc->sb->s_root = NULL;
718	}
719	goto out;
720}
721
722static int ceph_set_super(struct super_block *s, void *data)
723{
724	struct ceph_fs_client *fsc = data;
725	int ret;
726
727	dout("set_super %p data %p\n", s, data);
728
729	s->s_flags = fsc->mount_options->sb_flags;
730	s->s_maxbytes = 1ULL << 40;  /* temp value until we get mdsmap */
731
732	s->s_fs_info = fsc;
733	fsc->sb = s;
734
735	s->s_op = &ceph_super_ops;
736	s->s_export_op = &ceph_export_ops;
737
738	s->s_time_gran = 1000;  /* 1000 ns == 1 us */
739
740	ret = set_anon_super(s, NULL);  /* what is that second arg for? */
741	if (ret != 0)
742		goto fail;
743
744	return ret;
745
746fail:
747	s->s_fs_info = NULL;
748	fsc->sb = NULL;
749	return ret;
750}
751
752/*
753 * share superblock if same fs AND options
754 */
755static int ceph_compare_super(struct super_block *sb, void *data)
756{
757	struct ceph_fs_client *new = data;
758	struct ceph_mount_options *fsopt = new->mount_options;
759	struct ceph_options *opt = new->client->options;
760	struct ceph_fs_client *other = ceph_sb_to_client(sb);
761
762	dout("ceph_compare_super %p\n", sb);
763
764	if (compare_mount_options(fsopt, opt, other)) {
765		dout("monitor(s)/mount options don't match\n");
766		return 0;
767	}
768	if ((opt->flags & CEPH_OPT_FSID) &&
769	    ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
770		dout("fsid doesn't match\n");
771		return 0;
772	}
773	if (fsopt->sb_flags != other->mount_options->sb_flags) {
774		dout("flags differ\n");
775		return 0;
776	}
777	return 1;
778}
779
780/*
781 * construct our own bdi so we can control readahead, etc.
782 */
783static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
784
785static int ceph_register_bdi(struct super_block *sb,
786			     struct ceph_fs_client *fsc)
787{
788	int err;
789
790	/* set ra_pages based on rasize mount option? */
791	if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
792		fsc->backing_dev_info.ra_pages =
793			(fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
794			>> PAGE_SHIFT;
795	else
796		fsc->backing_dev_info.ra_pages =
797			default_backing_dev_info.ra_pages;
798
799	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
800			   atomic_long_inc_return(&bdi_seq));
801	if (!err)
802		sb->s_bdi = &fsc->backing_dev_info;
803	return err;
804}
805
806static struct dentry *ceph_mount(struct file_system_type *fs_type,
807		       int flags, const char *dev_name, void *data)
808{
809	struct super_block *sb;
810	struct ceph_fs_client *fsc;
811	struct dentry *res;
812	int err;
813	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
814	const char *path = NULL;
815	struct ceph_mount_options *fsopt = NULL;
816	struct ceph_options *opt = NULL;
817
818	dout("ceph_mount\n");
819	err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
820	if (err < 0) {
821		res = ERR_PTR(err);
822		goto out_final;
823	}
824
825	/* create client (which we may/may not use) */
826	fsc = create_fs_client(fsopt, opt);
827	if (IS_ERR(fsc)) {
828		res = ERR_CAST(fsc);
829		destroy_mount_options(fsopt);
830		ceph_destroy_options(opt);
831		goto out_final;
832	}
833
834	err = ceph_mdsc_init(fsc);
835	if (err < 0) {
836		res = ERR_PTR(err);
837		goto out;
838	}
839
840	if (ceph_test_opt(fsc->client, NOSHARE))
841		compare_super = NULL;
842	sb = sget(fs_type, compare_super, ceph_set_super, fsc);
843	if (IS_ERR(sb)) {
844		res = ERR_CAST(sb);
845		goto out;
846	}
847
848	if (ceph_sb_to_client(sb) != fsc) {
849		ceph_mdsc_destroy(fsc);
850		destroy_fs_client(fsc);
851		fsc = ceph_sb_to_client(sb);
852		dout("get_sb got existing client %p\n", fsc);
853	} else {
854		dout("get_sb using new client %p\n", fsc);
855		err = ceph_register_bdi(sb, fsc);
856		if (err < 0) {
857			res = ERR_PTR(err);
858			goto out_splat;
859		}
860	}
861
862	res = ceph_real_mount(fsc, path);
863	if (IS_ERR(res))
864		goto out_splat;
865	dout("root %p inode %p ino %llx.%llx\n", res,
866	     res->d_inode, ceph_vinop(res->d_inode));
867	return res;
868
869out_splat:
870	ceph_mdsc_close_sessions(fsc->mdsc);
871	deactivate_locked_super(sb);
872	goto out_final;
873
874out:
875	ceph_mdsc_destroy(fsc);
876	destroy_fs_client(fsc);
877out_final:
878	dout("ceph_mount fail %ld\n", PTR_ERR(res));
879	return res;
880}
881
882static void ceph_kill_sb(struct super_block *s)
883{
884	struct ceph_fs_client *fsc = ceph_sb_to_client(s);
885	dout("kill_sb %p\n", s);
886	ceph_mdsc_pre_umount(fsc->mdsc);
887	kill_anon_super(s);    /* will call put_super after sb is r/o */
888	ceph_mdsc_destroy(fsc);
889	destroy_fs_client(fsc);
890}
891
892static struct file_system_type ceph_fs_type = {
893	.owner		= THIS_MODULE,
894	.name		= "ceph",
895	.mount		= ceph_mount,
896	.kill_sb	= ceph_kill_sb,
897	.fs_flags	= FS_RENAME_DOES_D_MOVE,
898};
899
/* two-level stringification so that macro arguments are expanded first */
#define _STRINGIFY(s) #s
#define STRINGIFY(s) _STRINGIFY(s)
902
903static int __init init_ceph(void)
904{
905	int ret = init_caches();
906	if (ret)
907		goto out;
908
909	ret = register_filesystem(&ceph_fs_type);
910	if (ret)
911		goto out_icache;
912
913	pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
914
915	return 0;
916
917out_icache:
918	destroy_caches();
919out:
920	return ret;
921}
922
923static void __exit exit_ceph(void)
924{
925	dout("exit_ceph\n");
926	unregister_filesystem(&ceph_fs_type);
927	destroy_caches();
928}
929
930module_init(init_ceph);
931module_exit(exit_ceph);
932
933MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
934MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
935MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
936MODULE_DESCRIPTION("Ceph filesystem for Linux");
937MODULE_LICENSE("GPL");