ocfs2: Add a mount option "coherency=*" to handle cluster coherency for O_DIRECT writes.

Currently, the default behavior of O_DIRECT writes is to allow
concurrent writing among nodes to the same file, with no cluster
coherency guaranteed (no EX lock held). This can leave stale data in
the cache for buffered reads on other nodes.

The new mount option introduces a choice between two different
behaviors for O_DIRECT writes:

* coherency=full, as the default value, will disallow
concurrent O_DIRECT writes by taking
EX locks.

* coherency=buffered, allows concurrent O_DIRECT writes
without EX lock among nodes, which
gains high performance at the risk of
getting stale data on other nodes.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

authored by Tristan Ye and committed by Joel Becker 7bdb0d18 75d9bbc7

+52 -2
+7
Documentation/filesystems/ocfs2.txt
··· 87 reservations - users should rarely need to change this 88 value. If allocation reservations are turned off, this 89 option will have no effect.
··· 87 reservations - users should rarely need to change this 88 value. If allocation reservations are turned off, this 89 option will have no effect. 90 + coherency=full (*) Disallow concurrent O_DIRECT writes, cluster inode 91 + lock will be taken to force other nodes drop cache, 92 + therefore full cluster coherency is guaranteed even 93 + for O_DIRECT writes. 94 + coherency=buffered Allow concurrent O_DIRECT writes without EX lock among 95 + nodes, which gains high performance at risk of getting 96 + stale data on other nodes.
+27 -2
fs/ocfs2/file.c
··· 2225 struct file *file = iocb->ki_filp; 2226 struct inode *inode = file->f_path.dentry->d_inode; 2227 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2228 2229 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2230 (unsigned int)nr_segs, ··· 2250 have_alloc_sem = 1; 2251 } 2252 2253 - /* concurrent O_DIRECT writes are allowed */ 2254 - rw_level = !direct_io; 2255 ret = ocfs2_rw_lock(inode, rw_level); 2256 if (ret < 0) { 2257 mlog_errno(ret); 2258 goto out_sems; 2259 } 2260 2261 can_do_direct = direct_io;
··· 2225 struct file *file = iocb->ki_filp; 2226 struct inode *inode = file->f_path.dentry->d_inode; 2227 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2228 + int full_coherency = !(osb->s_mount_opt & 2229 + OCFS2_MOUNT_COHERENCY_BUFFERED); 2230 2231 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2232 (unsigned int)nr_segs, ··· 2248 have_alloc_sem = 1; 2249 } 2250 2251 + /* 2252 + * Concurrent O_DIRECT writes are allowed with 2253 + * mount_option "coherency=buffered". 2254 + */ 2255 + rw_level = (!direct_io || full_coherency); 2256 + 2257 ret = ocfs2_rw_lock(inode, rw_level); 2258 if (ret < 0) { 2259 mlog_errno(ret); 2260 goto out_sems; 2261 + } 2262 + 2263 + /* 2264 + * O_DIRECT writes with "coherency=full" need to take EX cluster 2265 + * inode_lock to guarantee coherency. 2266 + */ 2267 + if (direct_io && full_coherency) { 2268 + /* 2269 + * We need to take and drop the inode lock to force 2270 + * other nodes to drop their caches. Buffered I/O 2271 + * already does this in write_begin(). 2272 + */ 2273 + ret = ocfs2_inode_lock(inode, NULL, 1); 2274 + if (ret < 0) { 2275 + mlog_errno(ret); 2276 + goto out_sems; 2277 + } 2278 + 2279 + ocfs2_inode_unlock(inode, 1); 2280 } 2281 2282 can_do_direct = direct_io;
+3
fs/ocfs2/ocfs2.h
··· 263 control lists */ 264 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ 265 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ 266 }; 267 268 #define OCFS2_OSB_SOFT_RO 0x0001
··· 263 control lists */ 264 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ 265 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ 266 + 267 + OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12 /* Allow concurrent O_DIRECT 268 + writes */ 269 }; 270 271 #define OCFS2_OSB_SOFT_RO 0x0001
+15
fs/ocfs2/super.c
··· 177 Opt_noacl, 178 Opt_usrquota, 179 Opt_grpquota, 180 Opt_resv_level, 181 Opt_dir_resv_level, 182 Opt_err, ··· 207 {Opt_noacl, "noacl"}, 208 {Opt_usrquota, "usrquota"}, 209 {Opt_grpquota, "grpquota"}, 210 {Opt_resv_level, "resv_level=%u"}, 211 {Opt_dir_resv_level, "dir_resv_level=%u"}, 212 {Opt_err, NULL} ··· 1456 case Opt_grpquota: 1457 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1458 break; 1459 case Opt_acl: 1460 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1461 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; ··· 1559 seq_printf(s, ",usrquota"); 1560 if (opts & OCFS2_MOUNT_GRPQUOTA) 1561 seq_printf(s, ",grpquota"); 1562 1563 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1564 seq_printf(s, ",nouser_xattr");
··· 177 Opt_noacl, 178 Opt_usrquota, 179 Opt_grpquota, 180 + Opt_coherency_buffered, 181 + Opt_coherency_full, 182 Opt_resv_level, 183 Opt_dir_resv_level, 184 Opt_err, ··· 205 {Opt_noacl, "noacl"}, 206 {Opt_usrquota, "usrquota"}, 207 {Opt_grpquota, "grpquota"}, 208 + {Opt_coherency_buffered, "coherency=buffered"}, 209 + {Opt_coherency_full, "coherency=full"}, 210 {Opt_resv_level, "resv_level=%u"}, 211 {Opt_dir_resv_level, "dir_resv_level=%u"}, 212 {Opt_err, NULL} ··· 1452 case Opt_grpquota: 1453 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1454 break; 1455 + case Opt_coherency_buffered: 1456 + mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; 1457 + break; 1458 + case Opt_coherency_full: 1459 + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; 1460 + break; 1461 case Opt_acl: 1462 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1463 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; ··· 1549 seq_printf(s, ",usrquota"); 1550 if (opts & OCFS2_MOUNT_GRPQUOTA) 1551 seq_printf(s, ",grpquota"); 1552 + 1553 + if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) 1554 + seq_printf(s, ",coherency=buffered"); 1555 + else 1556 + seq_printf(s, ",coherency=full"); 1557 1558 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1559 seq_printf(s, ",nouser_xattr");