fs/exofs/super.c at v2.6.39 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / exofs / super.c
at v2.6.39 1001 lines 26 kB view raw
   1/*
   2 * Copyright (C) 2005, 2006
   3 * Avishay Traeger (avishay@gmail.com)
   4 * Copyright (C) 2008, 2009
   5 * Boaz Harrosh <bharrosh@panasas.com>
   6 *
   7 * Copyrights for code taken from ext2:
   8 *     Copyright (C) 1992, 1993, 1994, 1995
   9 *     Remy Card (card@masi.ibp.fr)
  10 *     Laboratoire MASI - Institut Blaise Pascal
  11 *     Universite Pierre et Marie Curie (Paris VI)
  12 *     from
  13 *     linux/fs/minix/inode.c
  14 *     Copyright (C) 1991, 1992  Linus Torvalds
  15 *
  16 * This file is part of exofs.
  17 *
  18 * exofs is free software; you can redistribute it and/or modify
  19 * it under the terms of the GNU General Public License as published by
  20 * the Free Software Foundation.  Since it is based on ext2, and the only
  21 * valid version of GPL for the Linux kernel is version 2, the only valid
  22 * version of GPL for exofs is version 2.
  23 *
  24 * exofs is distributed in the hope that it will be useful,
  25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  27 * GNU General Public License for more details.
  28 *
  29 * You should have received a copy of the GNU General Public License
  30 * along with exofs; if not, write to the Free Software
  31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  32 */
  33
  34#include <linux/string.h>
  35#include <linux/parser.h>
  36#include <linux/vfs.h>
  37#include <linux/random.h>
  38#include <linux/exportfs.h>
  39#include <linux/slab.h>
  40
  41#include "exofs.h"
  42
  43/******************************************************************************
  44 * MOUNT OPTIONS
  45 *****************************************************************************/
  46
/*
 * Mount options as parsed by parse_options() and consumed by
 * exofs_fill_super().
 */
struct exofs_mountopt {
	/* true when dev_name is the allocated copy of the "osdname=" value */
	bool is_osdname;
	/* "osdname=" value, or the mount's dev_name when none was given */
	const char *dev_name;
	/* partition ID from "pid=" */
	uint64_t pid;
	/* "to=" value multiplied by HZ, or BLK_DEFAULT_SG_TIMEOUT */
	int timeout;
};
  56
/*
 * exofs-specific mount-time options.
 *
 * These are the token values match_token() hands back to parse_options();
 * Opt_err is the value attached to the terminating table entry.
 */
enum { Opt_name, Opt_pid, Opt_to, Opt_err };
  61
/*
 * Our mount-time options.  These should ideally be 64-bit unsigned, but the
 * kernel's parsing functions do not currently support that.  32-bit should be
 * sufficient for most applications now.
 *
 * Each entry maps an option pattern to its Opt_* token; the NULL pattern
 * terminates the table.
 */
static match_table_t tokens = {
	{Opt_name, "osdname=%s"},
	{Opt_pid, "pid=%u"},
	{Opt_to, "to=%u"},
	{Opt_err, NULL}
};
  73
  74/*
  75 * The main option parsing method.  Also makes sure that all of the mandatory
  76 * mount options were set.
  77 */
  78static int parse_options(char *options, struct exofs_mountopt *opts)
  79{
  80	char *p;
  81	substring_t args[MAX_OPT_ARGS];
  82	int option;
  83	bool s_pid = false;
  84
  85	EXOFS_DBGMSG("parse_options %s\n", options);
  86	/* defaults */
  87	memset(opts, 0, sizeof(*opts));
  88	opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
  89
  90	while ((p = strsep(&options, ",")) != NULL) {
  91		int token;
  92		char str[32];
  93
  94		if (!*p)
  95			continue;
  96
  97		token = match_token(p, tokens, args);
  98		switch (token) {
  99		case Opt_name:
 100			opts->dev_name = match_strdup(&args[0]);
 101			if (unlikely(!opts->dev_name)) {
 102				EXOFS_ERR("Error allocating dev_name");
 103				return -ENOMEM;
 104			}
 105			opts->is_osdname = true;
 106			break;
 107		case Opt_pid:
 108			if (0 == match_strlcpy(str, &args[0], sizeof(str)))
 109				return -EINVAL;
 110			opts->pid = simple_strtoull(str, NULL, 0);
 111			if (opts->pid < EXOFS_MIN_PID) {
 112				EXOFS_ERR("Partition ID must be >= %u",
 113					  EXOFS_MIN_PID);
 114				return -EINVAL;
 115			}
 116			s_pid = 1;
 117			break;
 118		case Opt_to:
 119			if (match_int(&args[0], &option))
 120				return -EINVAL;
 121			if (option <= 0) {
 122				EXOFS_ERR("Timout must be > 0");
 123				return -EINVAL;
 124			}
 125			opts->timeout = option * HZ;
 126			break;
 127		}
 128	}
 129
 130	if (!s_pid) {
 131		EXOFS_ERR("Need to specify the following options:\n");
 132		EXOFS_ERR("    -o pid=pid_no_to_use\n");
 133		return -EINVAL;
 134	}
 135
 136	return 0;
 137}
 138
 139/******************************************************************************
 140 * INODE CACHE
 141 *****************************************************************************/
 142
/*
 * Our inode cache.  Isn't it pretty?
 *
 * Slab cache of struct exofs_i_info objects; created by init_inodecache()
 * and destroyed by destroy_inodecache().
 */
static struct kmem_cache *exofs_inode_cachep;
 147
 148/*
 149 * Allocate an inode in the cache
 150 */
 151static struct inode *exofs_alloc_inode(struct super_block *sb)
 152{
 153	struct exofs_i_info *oi;
 154
 155	oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
 156	if (!oi)
 157		return NULL;
 158
 159	oi->vfs_inode.i_version = 1;
 160	return &oi->vfs_inode;
 161}
 162
 163static void exofs_i_callback(struct rcu_head *head)
 164{
 165	struct inode *inode = container_of(head, struct inode, i_rcu);
 166	INIT_LIST_HEAD(&inode->i_dentry);
 167	kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
 168}
 169
/*
 * Remove an inode from the cache
 *
 * The memory is not released here: exofs_i_callback() is queued through
 * call_rcu() and does the actual kmem_cache_free().
 */
static void exofs_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, exofs_i_callback);
}
 177
 178/*
 179 * Initialize the inode
 180 */
 181static void exofs_init_once(void *foo)
 182{
 183	struct exofs_i_info *oi = foo;
 184
 185	inode_init_once(&oi->vfs_inode);
 186}
 187
 188/*
 189 * Create and initialize the inode cache
 190 */
 191static int init_inodecache(void)
 192{
 193	exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
 194				sizeof(struct exofs_i_info), 0,
 195				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
 196				exofs_init_once);
 197	if (exofs_inode_cachep == NULL)
 198		return -ENOMEM;
 199	return 0;
 200}
 201
/*
 * Destroy the inode cache
 *
 * Counterpart of init_inodecache(); called from exit_exofs() and from the
 * error path of init_exofs().
 */
static void destroy_inodecache(void)
{
	kmem_cache_destroy(exofs_inode_cachep);
}
 209
 210/******************************************************************************
 211 * SUPERBLOCK FUNCTIONS
 212 *****************************************************************************/
/* Defined further down; declared here because exofs_fill_super() uses them */
static const struct super_operations exofs_sops;
static const struct export_operations exofs_export_ops;

/*
 * Attribute descriptor for the superblock statistics (struct exofs_sb_stats).
 * Used as the template for both reading (__sbi_read_stats) and writing
 * (exofs_sbi_write_stats) that attribute.
 */
static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
	EXOFS_APAGE_SB_DATA,
	EXOFS_ATTR_SB_STATS,
	sizeof(struct exofs_sb_stats));
 220
 221static int __sbi_read_stats(struct exofs_sb_info *sbi)
 222{
 223	struct osd_attr attrs[] = {
 224		[0] = g_attr_sb_stats,
 225	};
 226	struct exofs_io_state *ios;
 227	int ret;
 228
 229	ret = exofs_get_io_state(&sbi->layout, &ios);
 230	if (unlikely(ret)) {
 231		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 232		return ret;
 233	}
 234
 235	ios->cred = sbi->s_cred;
 236
 237	ios->in_attr = attrs;
 238	ios->in_attr_len = ARRAY_SIZE(attrs);
 239
 240	ret = exofs_sbi_read(ios);
 241	if (unlikely(ret)) {
 242		EXOFS_ERR("Error reading super_block stats => %d\n", ret);
 243		goto out;
 244	}
 245
 246	ret = extract_attr_from_ios(ios, &attrs[0]);
 247	if (ret) {
 248		EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
 249		goto out;
 250	}
 251	if (attrs[0].len) {
 252		struct exofs_sb_stats *ess;
 253
 254		if (unlikely(attrs[0].len != sizeof(*ess))) {
 255			EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
 256				  "size(%d) != expected(%zd)\n",
 257				  __func__, attrs[0].len, sizeof(*ess));
 258			goto out;
 259		}
 260
 261		ess = attrs[0].val_ptr;
 262		sbi->s_nextid = le64_to_cpu(ess->s_nextid);
 263		sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
 264	}
 265
 266out:
 267	exofs_put_io_state(ios);
 268	return ret;
 269}
 270
/*
 * Completion callback installed as ios->done by exofs_sbi_write_stats().
 * All it does is release the io state; @p is not used.
 */
static void stats_done(struct exofs_io_state *ios, void *p)
{
	exofs_put_io_state(ios);
	/* Good thanks nothing to do anymore */
}
 276
 277/* Asynchronously write the stats attribute */
 278int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
 279{
 280	struct osd_attr attrs[] = {
 281		[0] = g_attr_sb_stats,
 282	};
 283	struct exofs_io_state *ios;
 284	int ret;
 285
 286	ret = exofs_get_io_state(&sbi->layout, &ios);
 287	if (unlikely(ret)) {
 288		EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
 289		return ret;
 290	}
 291
 292	sbi->s_ess.s_nextid   = cpu_to_le64(sbi->s_nextid);
 293	sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
 294	attrs[0].val_ptr = &sbi->s_ess;
 295
 296	ios->cred = sbi->s_cred;
 297	ios->done = stats_done;
 298	ios->private = sbi;
 299	ios->out_attr = attrs;
 300	ios->out_attr_len = ARRAY_SIZE(attrs);
 301
 302	ret = exofs_sbi_write(ios);
 303	if (unlikely(ret)) {
 304		EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
 305		exofs_put_io_state(ios);
 306	}
 307
 308	return ret;
 309}
 310
 311/*
 312 * Write the superblock to the OSD
 313 */
 314int exofs_sync_fs(struct super_block *sb, int wait)
 315{
 316	struct exofs_sb_info *sbi;
 317	struct exofs_fscb *fscb;
 318	struct exofs_io_state *ios;
 319	int ret = -ENOMEM;
 320
 321	fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
 322	if (unlikely(!fscb))
 323		return -ENOMEM;
 324
 325	sbi = sb->s_fs_info;
 326
 327	/* NOTE: We no longer dirty the super_block anywhere in exofs. The
 328	 * reason we write the fscb here on unmount is so we can stay backwards
 329	 * compatible with fscb->s_version == 1. (What we are not compatible
 330	 * with is if a new version FS crashed and then we try to mount an old
 331	 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
 332	 * the writeable info is set in exofs_sbi_write_stats() above.
 333	 */
 334	ret = exofs_get_io_state(&sbi->layout, &ios);
 335	if (unlikely(ret))
 336		goto out;
 337
 338	lock_super(sb);
 339
 340	ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
 341	memset(fscb, 0, ios->length);
 342	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
 343	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
 344	fscb->s_magic = cpu_to_le16(sb->s_magic);
 345	fscb->s_newfs = 0;
 346	fscb->s_version = EXOFS_FSCB_VER;
 347
 348	ios->obj.id = EXOFS_SUPER_ID;
 349	ios->offset = 0;
 350	ios->kern_buff = fscb;
 351	ios->cred = sbi->s_cred;
 352
 353	ret = exofs_sbi_write(ios);
 354	if (unlikely(ret))
 355		EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
 356	else
 357		sb->s_dirt = 0;
 358
 359
 360	unlock_super(sb);
 361out:
 362	EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
 363	exofs_put_io_state(ios);
 364	kfree(fscb);
 365	return ret;
 366}
 367
 368static void exofs_write_super(struct super_block *sb)
 369{
 370	if (!(sb->s_flags & MS_RDONLY))
 371		exofs_sync_fs(sb, 1);
 372	else
 373		sb->s_dirt = 0;
 374}
 375
 376static void _exofs_print_device(const char *msg, const char *dev_path,
 377				struct osd_dev *od, u64 pid)
 378{
 379	const struct osd_dev_info *odi = osduld_device_info(od);
 380
 381	printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
 382		msg, dev_path ?: "", odi->osdname, _LLU(pid));
 383}
 384
 385void exofs_free_sbi(struct exofs_sb_info *sbi)
 386{
 387	while (sbi->layout.s_numdevs) {
 388		int i = --sbi->layout.s_numdevs;
 389		struct osd_dev *od = sbi->layout.s_ods[i];
 390
 391		if (od) {
 392			sbi->layout.s_ods[i] = NULL;
 393			osduld_put_device(od);
 394		}
 395	}
 396	kfree(sbi);
 397}
 398
 399/*
 400 * This function is called when the vfs is freeing the superblock.  We just
 401 * need to free our own part.
 402 */
 403static void exofs_put_super(struct super_block *sb)
 404{
 405	int num_pend;
 406	struct exofs_sb_info *sbi = sb->s_fs_info;
 407
 408	/* make sure there are no pending commands */
 409	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
 410	     num_pend = atomic_read(&sbi->s_curr_pending)) {
 411		wait_queue_head_t wq;
 412
 413		printk(KERN_NOTICE "%s: !!Pending operations in flight. "
 414		       "This is a BUG. please report to osd-dev@open-osd.org\n",
 415		       __func__);
 416		init_waitqueue_head(&wq);
 417		wait_event_timeout(wq,
 418				  (atomic_read(&sbi->s_curr_pending) == 0),
 419				  msecs_to_jiffies(100));
 420	}
 421
 422	_exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
 423			    sbi->layout.s_pid);
 424
 425	bdi_destroy(&sbi->bdi);
 426	exofs_free_sbi(sbi);
 427	sb->s_fs_info = NULL;
 428}
 429
 430static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
 431				    struct exofs_device_table *dt)
 432{
 433	u64 stripe_length;
 434
 435	sbi->data_map.odm_num_comps   =
 436				le32_to_cpu(dt->dt_data_map.cb_num_comps);
 437	sbi->data_map.odm_stripe_unit =
 438				le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
 439	sbi->data_map.odm_group_width =
 440				le32_to_cpu(dt->dt_data_map.cb_group_width);
 441	sbi->data_map.odm_group_depth =
 442				le32_to_cpu(dt->dt_data_map.cb_group_depth);
 443	sbi->data_map.odm_mirror_cnt  =
 444				le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
 445	sbi->data_map.odm_raid_algorithm  =
 446				le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
 447
 448/* FIXME: Only raid0 for now. if not so, do not mount */
 449	if (sbi->data_map.odm_num_comps != numdevs) {
 450		EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n",
 451			  sbi->data_map.odm_num_comps, numdevs);
 452		return -EINVAL;
 453	}
 454	if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) {
 455		EXOFS_ERR("Only RAID_0 for now\n");
 456		return -EINVAL;
 457	}
 458	if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) {
 459		EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n",
 460			  numdevs, sbi->data_map.odm_mirror_cnt);
 461		return -EINVAL;
 462	}
 463
 464	if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) {
 465		EXOFS_ERR("Stripe Unit(0x%llx)"
 466			  " must be Multples of PAGE_SIZE(0x%lx)\n",
 467			  _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE);
 468		return -EINVAL;
 469	}
 470
 471	sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit;
 472	sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1;
 473
 474	if (sbi->data_map.odm_group_width) {
 475		sbi->layout.group_width = sbi->data_map.odm_group_width;
 476		sbi->layout.group_depth = sbi->data_map.odm_group_depth;
 477		if (!sbi->layout.group_depth) {
 478			EXOFS_ERR("group_depth == 0 && group_width != 0\n");
 479			return -EINVAL;
 480		}
 481		sbi->layout.group_count = sbi->data_map.odm_num_comps /
 482						sbi->layout.mirrors_p1 /
 483						sbi->data_map.odm_group_width;
 484	} else {
 485		if (sbi->data_map.odm_group_depth) {
 486			printk(KERN_NOTICE "Warning: group_depth ignored "
 487				"group_width == 0 && group_depth == %d\n",
 488				sbi->data_map.odm_group_depth);
 489			sbi->data_map.odm_group_depth = 0;
 490		}
 491		sbi->layout.group_width = sbi->data_map.odm_num_comps /
 492							sbi->layout.mirrors_p1;
 493		sbi->layout.group_depth = -1;
 494		sbi->layout.group_count = 1;
 495	}
 496
 497	stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit;
 498	if (stripe_length >= (1ULL << 32)) {
 499		EXOFS_ERR("Total Stripe length(0x%llx)"
 500			  " >= 32bit is not supported\n", _LLU(stripe_length));
 501		return -EINVAL;
 502	}
 503
 504	return 0;
 505}
 506
 507static unsigned __ra_pages(struct exofs_layout *layout)
 508{
 509	const unsigned _MIN_RA = 32; /* min 128K read-ahead */
 510	unsigned ra_pages = layout->group_width * layout->stripe_unit /
 511				PAGE_SIZE;
 512	unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
 513
 514	ra_pages *= 2; /* two stripes */
 515	if (ra_pages < _MIN_RA)
 516		ra_pages = roundup(_MIN_RA, ra_pages / 2);
 517
 518	if (ra_pages > max_io_pages)
 519		ra_pages = max_io_pages;
 520
 521	return ra_pages;
 522}
 523
 524/* @odi is valid only as long as @fscb_dev is valid */
 525static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
 526			     struct osd_dev_info *odi)
 527{
 528	odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
 529	memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
 530
 531	odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
 532	odi->osdname = dt_dev->osdname;
 533
 534	/* FIXME support long names. Will need a _put function */
 535	if (dt_dev->long_name_offset)
 536		return -EINVAL;
 537
 538	/* Make sure osdname is printable!
 539	 * mkexofs should give us space for a null-terminator else the
 540	 * device-table is invalid.
 541	 */
 542	if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
 543		odi->osdname_len = sizeof(dt_dev->osdname) - 1;
 544	dt_dev->osdname[odi->osdname_len] = 0;
 545
 546	/* If it's all zeros something is bad we read past end-of-obj */
 547	return !(odi->systemid_len || odi->osdname_len);
 548}
 549
 550static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
 551				       unsigned table_count)
 552{
 553	struct exofs_sb_info *sbi = *psbi;
 554	struct osd_dev *fscb_od;
 555	struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
 556				 .id = EXOFS_DEVTABLE_ID};
 557	struct exofs_device_table *dt;
 558	unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
 559					     sizeof(*dt);
 560	unsigned numdevs, i;
 561	int ret;
 562
 563	dt = kmalloc(table_bytes, GFP_KERNEL);
 564	if (unlikely(!dt)) {
 565		EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
 566			  table_bytes);
 567		return -ENOMEM;
 568	}
 569
 570	fscb_od = sbi->layout.s_ods[0];
 571	sbi->layout.s_ods[0] = NULL;
 572	sbi->layout.s_numdevs = 0;
 573	ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
 574	if (unlikely(ret)) {
 575		EXOFS_ERR("ERROR: reading device table\n");
 576		goto out;
 577	}
 578
 579	numdevs = le64_to_cpu(dt->dt_num_devices);
 580	if (unlikely(!numdevs)) {
 581		ret = -EINVAL;
 582		goto out;
 583	}
 584	WARN_ON(table_count != numdevs);
 585
 586	ret = _read_and_match_data_map(sbi, numdevs, dt);
 587	if (unlikely(ret))
 588		goto out;
 589
 590	if (likely(numdevs > 1)) {
 591		unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]);
 592
 593		sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
 594		if (unlikely(!sbi)) {
 595			ret = -ENOMEM;
 596			goto out;
 597		}
 598		memset(&sbi->layout.s_ods[1], 0,
 599		       size - sizeof(sbi->layout.s_ods[0]));
 600		*psbi = sbi;
 601	}
 602
 603	for (i = 0; i < numdevs; i++) {
 604		struct exofs_fscb fscb;
 605		struct osd_dev_info odi;
 606		struct osd_dev *od;
 607
 608		if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
 609			EXOFS_ERR("ERROR: Read all-zeros device entry\n");
 610			ret = -EINVAL;
 611			goto out;
 612		}
 613
 614		printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
 615		       i, odi.osdname);
 616
 617		/* On all devices the device table is identical. The user can
 618		 * specify any one of the participating devices on the command
 619		 * line. We always keep them in device-table order.
 620		 */
 621		if (fscb_od && osduld_device_same(fscb_od, &odi)) {
 622			sbi->layout.s_ods[i] = fscb_od;
 623			++sbi->layout.s_numdevs;
 624			fscb_od = NULL;
 625			continue;
 626		}
 627
 628		od = osduld_info_lookup(&odi);
 629		if (IS_ERR(od)) {
 630			ret = PTR_ERR(od);
 631			EXOFS_ERR("ERROR: device requested is not found "
 632				  "osd_name-%s =>%d\n", odi.osdname, ret);
 633			goto out;
 634		}
 635
 636		sbi->layout.s_ods[i] = od;
 637		++sbi->layout.s_numdevs;
 638
 639		/* Read the fscb of the other devices to make sure the FS
 640		 * partition is there.
 641		 */
 642		ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
 643				      sizeof(fscb));
 644		if (unlikely(ret)) {
 645			EXOFS_ERR("ERROR: Malformed participating device "
 646				  "error reading fscb osd_name-%s\n",
 647				  odi.osdname);
 648			goto out;
 649		}
 650
 651		/* TODO: verify other information is correct and FS-uuid
 652		 *	 matches. Benny what did you say about device table
 653		 *	 generation and old devices?
 654		 */
 655	}
 656
 657out:
 658	kfree(dt);
 659	if (unlikely(!ret && fscb_od)) {
 660		EXOFS_ERR(
 661		      "ERROR: Bad device-table container device not present\n");
 662		osduld_put_device(fscb_od);
 663		ret = -EINVAL;
 664	}
 665
 666	return ret;
 667}
 668
 669/*
 670 * Read the superblock from the OSD and fill in the fields
 671 */
 672static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 673{
 674	struct inode *root;
 675	struct exofs_mountopt *opts = data;
 676	struct exofs_sb_info *sbi;	/*extended info                  */
 677	struct osd_dev *od;		/* Master device                 */
 678	struct exofs_fscb fscb;		/*on-disk superblock info        */
 679	struct osd_obj_id obj;
 680	unsigned table_count;
 681	int ret;
 682
 683	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 684	if (!sbi)
 685		return -ENOMEM;
 686
 687	ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
 688	if (ret)
 689		goto free_bdi;
 690
 691	/* use mount options to fill superblock */
 692	if (opts->is_osdname) {
 693		struct osd_dev_info odi = {.systemid_len = 0};
 694
 695		odi.osdname_len = strlen(opts->dev_name);
 696		odi.osdname = (u8 *)opts->dev_name;
 697		od = osduld_info_lookup(&odi);
 698	} else {
 699		od = osduld_path_lookup(opts->dev_name);
 700	}
 701	if (IS_ERR(od)) {
 702		ret = -EINVAL;
 703		goto free_sbi;
 704	}
 705
 706	/* Default layout in case we do not have a device-table */
 707	sbi->layout.stripe_unit = PAGE_SIZE;
 708	sbi->layout.mirrors_p1 = 1;
 709	sbi->layout.group_width = 1;
 710	sbi->layout.group_depth = -1;
 711	sbi->layout.group_count = 1;
 712	sbi->layout.s_ods[0] = od;
 713	sbi->layout.s_numdevs = 1;
 714	sbi->layout.s_pid = opts->pid;
 715	sbi->s_timeout = opts->timeout;
 716
 717	/* fill in some other data by hand */
 718	memset(sb->s_id, 0, sizeof(sb->s_id));
 719	strcpy(sb->s_id, "exofs");
 720	sb->s_blocksize = EXOFS_BLKSIZE;
 721	sb->s_blocksize_bits = EXOFS_BLKSHIFT;
 722	sb->s_maxbytes = MAX_LFS_FILESIZE;
 723	atomic_set(&sbi->s_curr_pending, 0);
 724	sb->s_bdev = NULL;
 725	sb->s_dev = 0;
 726
 727	obj.partition = sbi->layout.s_pid;
 728	obj.id = EXOFS_SUPER_ID;
 729	exofs_make_credential(sbi->s_cred, &obj);
 730
 731	ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
 732	if (unlikely(ret))
 733		goto free_sbi;
 734
 735	sb->s_magic = le16_to_cpu(fscb.s_magic);
 736	/* NOTE: we read below to be backward compatible with old versions */
 737	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
 738	sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
 739
 740	/* make sure what we read from the object store is correct */
 741	if (sb->s_magic != EXOFS_SUPER_MAGIC) {
 742		if (!silent)
 743			EXOFS_ERR("ERROR: Bad magic value\n");
 744		ret = -EINVAL;
 745		goto free_sbi;
 746	}
 747	if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
 748		EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
 749			  EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
 750		ret = -EINVAL;
 751		goto free_sbi;
 752	}
 753
 754	/* start generation numbers from a random point */
 755	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 756	spin_lock_init(&sbi->s_next_gen_lock);
 757
 758	table_count = le64_to_cpu(fscb.s_dev_table_count);
 759	if (table_count) {
 760		ret = exofs_read_lookup_dev_table(&sbi, table_count);
 761		if (unlikely(ret))
 762			goto free_sbi;
 763	}
 764
 765	__sbi_read_stats(sbi);
 766
 767	/* set up operation vectors */
 768	sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
 769	sb->s_bdi = &sbi->bdi;
 770	sb->s_fs_info = sbi;
 771	sb->s_op = &exofs_sops;
 772	sb->s_export_op = &exofs_export_ops;
 773	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
 774	if (IS_ERR(root)) {
 775		EXOFS_ERR("ERROR: exofs_iget failed\n");
 776		ret = PTR_ERR(root);
 777		goto free_sbi;
 778	}
 779	sb->s_root = d_alloc_root(root);
 780	if (!sb->s_root) {
 781		iput(root);
 782		EXOFS_ERR("ERROR: get root inode failed\n");
 783		ret = -ENOMEM;
 784		goto free_sbi;
 785	}
 786
 787	if (!S_ISDIR(root->i_mode)) {
 788		dput(sb->s_root);
 789		sb->s_root = NULL;
 790		EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
 791		       root->i_mode);
 792		ret = -EINVAL;
 793		goto free_sbi;
 794	}
 795
 796	_exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
 797			    sbi->layout.s_pid);
 798	if (opts->is_osdname)
 799		kfree(opts->dev_name);
 800	return 0;
 801
 802free_sbi:
 803	bdi_destroy(&sbi->bdi);
 804free_bdi:
 805	EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
 806		  opts->dev_name, sbi->layout.s_pid, ret);
 807	exofs_free_sbi(sbi);
 808	if (opts->is_osdname)
 809		kfree(opts->dev_name);
 810	return ret;
 811}
 812
 813/*
 814 * Set up the superblock (calls exofs_fill_super eventually)
 815 */
 816static struct dentry *exofs_mount(struct file_system_type *type,
 817			  int flags, const char *dev_name,
 818			  void *data)
 819{
 820	struct exofs_mountopt opts;
 821	int ret;
 822
 823	ret = parse_options(data, &opts);
 824	if (ret)
 825		return ERR_PTR(ret);
 826
 827	if (!opts.dev_name)
 828		opts.dev_name = dev_name;
 829	return mount_nodev(type, flags, &opts, exofs_fill_super);
 830}
 831
 832/*
 833 * Return information about the file system state in the buffer.  This is used
 834 * by the 'df' command, for example.
 835 */
 836static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
 837{
 838	struct super_block *sb = dentry->d_sb;
 839	struct exofs_sb_info *sbi = sb->s_fs_info;
 840	struct exofs_io_state *ios;
 841	struct osd_attr attrs[] = {
 842		ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
 843			OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
 844		ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
 845			OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
 846	};
 847	uint64_t capacity = ULLONG_MAX;
 848	uint64_t used = ULLONG_MAX;
 849	uint8_t cred_a[OSD_CAP_LEN];
 850	int ret;
 851
 852	ret = exofs_get_io_state(&sbi->layout, &ios);
 853	if (ret) {
 854		EXOFS_DBGMSG("exofs_get_io_state failed.\n");
 855		return ret;
 856	}
 857
 858	exofs_make_credential(cred_a, &ios->obj);
 859	ios->cred = sbi->s_cred;
 860	ios->in_attr = attrs;
 861	ios->in_attr_len = ARRAY_SIZE(attrs);
 862
 863	ret = exofs_sbi_read(ios);
 864	if (unlikely(ret))
 865		goto out;
 866
 867	ret = extract_attr_from_ios(ios, &attrs[0]);
 868	if (likely(!ret)) {
 869		capacity = get_unaligned_be64(attrs[0].val_ptr);
 870		if (unlikely(!capacity))
 871			capacity = ULLONG_MAX;
 872	} else
 873		EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
 874
 875	ret = extract_attr_from_ios(ios, &attrs[1]);
 876	if (likely(!ret))
 877		used = get_unaligned_be64(attrs[1].val_ptr);
 878	else
 879		EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
 880
 881	/* fill in the stats buffer */
 882	buf->f_type = EXOFS_SUPER_MAGIC;
 883	buf->f_bsize = EXOFS_BLKSIZE;
 884	buf->f_blocks = capacity >> 9;
 885	buf->f_bfree = (capacity - used) >> 9;
 886	buf->f_bavail = buf->f_bfree;
 887	buf->f_files = sbi->s_numfiles;
 888	buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
 889	buf->f_namelen = EXOFS_NAME_LEN;
 890
 891out:
 892	exofs_put_io_state(ios);
 893	return ret;
 894}
 895
/*
 * Superblock operations, installed as sb->s_op by exofs_fill_super().
 * exofs_write_inode and exofs_evict_inode are not defined in this file.
 */
static const struct super_operations exofs_sops = {
	.alloc_inode    = exofs_alloc_inode,
	.destroy_inode  = exofs_destroy_inode,
	.write_inode    = exofs_write_inode,
	.evict_inode    = exofs_evict_inode,
	.put_super      = exofs_put_super,
	.write_super    = exofs_write_super,
	.sync_fs	= exofs_sync_fs,
	.statfs         = exofs_statfs,
};
 906
 907/******************************************************************************
 908 * EXPORT OPERATIONS
 909 *****************************************************************************/
 910
 911struct dentry *exofs_get_parent(struct dentry *child)
 912{
 913	unsigned long ino = exofs_parent_ino(child);
 914
 915	if (!ino)
 916		return NULL;
 917
 918	return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
 919}
 920
 921static struct inode *exofs_nfs_get_inode(struct super_block *sb,
 922		u64 ino, u32 generation)
 923{
 924	struct inode *inode;
 925
 926	inode = exofs_iget(sb, ino);
 927	if (IS_ERR(inode))
 928		return ERR_CAST(inode);
 929	if (generation && inode->i_generation != generation) {
 930		/* we didn't find the right inode.. */
 931		iput(inode);
 932		return ERR_PTR(-ESTALE);
 933	}
 934	return inode;
 935}
 936
/*
 * ->fh_to_dentry: hand the file handle to generic_fh_to_dentry(), with
 * exofs_nfs_get_inode() doing the inode lookup.
 */
static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
				struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    exofs_nfs_get_inode);
}
 943
/*
 * ->fh_to_parent: hand the file handle to generic_fh_to_parent(), with
 * exofs_nfs_get_inode() doing the inode lookup.
 */
static struct dentry *exofs_fh_to_parent(struct super_block *sb,
				struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
				    exofs_nfs_get_inode);
}
 950
/* Export operations, installed as sb->s_export_op by exofs_fill_super() */
static const struct export_operations exofs_export_ops = {
	.fh_to_dentry = exofs_fh_to_dentry,
	.fh_to_parent = exofs_fh_to_parent,
	.get_parent = exofs_get_parent,
};
 956
 957/******************************************************************************
 958 * INSMOD/RMMOD
 959 *****************************************************************************/
 960
/*
 * struct that describes this file system
 *
 * Registered in init_exofs() and unregistered in exit_exofs(); mounting
 * goes through exofs_mount().
 */
static struct file_system_type exofs_type = {
	.owner          = THIS_MODULE,
	.name           = "exofs",
	.mount          = exofs_mount,
	.kill_sb        = generic_shutdown_super,
};
 970
 971static int __init init_exofs(void)
 972{
 973	int err;
 974
 975	err = init_inodecache();
 976	if (err)
 977		goto out;
 978
 979	err = register_filesystem(&exofs_type);
 980	if (err)
 981		goto out_d;
 982
 983	return 0;
 984out_d:
 985	destroy_inodecache();
 986out:
 987	return err;
 988}
 989
/*
 * Module exit: undo init_exofs() in reverse order - unregister the
 * filesystem first, then destroy the inode cache.
 */
static void __exit exit_exofs(void)
{
	unregister_filesystem(&exofs_type);
	destroy_inodecache();
}
 995
/* Module metadata */
MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
MODULE_DESCRIPTION("exofs");
MODULE_LICENSE("GPL");

/* Module entry and exit points */
module_init(init_exofs)
module_exit(exit_exofs)