ocfs2: Add a mount option "coherency=*" to handle cluster coherency for O_DIRECT writes.

Currently, the default behavior of O_DIRECT writes is to allow
concurrent writing among nodes to the same file, with no cluster
coherency guaranteed (no EX lock held). This can leave stale data in
the cache for buffered reads on other nodes.

The new mount option makes it possible to choose between two different
behaviors for O_DIRECT writes:

* coherency=full, the default value, disallows concurrent O_DIRECT
  writes by taking EX locks.

* coherency=buffered allows concurrent O_DIRECT writes without EX locks
  among nodes, which gains high performance at the risk of getting
  stale data on other nodes.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

authored by Tristan Ye and committed by Joel Becker 7bdb0d18 75d9bbc7

+52 -2
+7
Documentation/filesystems/ocfs2.txt
··· 87 87 reservations - users should rarely need to change this 88 88 value. If allocation reservations are turned off, this 89 89 option will have no effect. 90 + coherency=full (*) Disallow concurrent O_DIRECT writes, cluster inode 91 + lock will be taken to force other nodes drop cache, 92 + therefore full cluster coherency is guaranteed even 93 + for O_DIRECT writes. 94 + coherency=buffered Allow concurrent O_DIRECT writes without EX lock among 95 + nodes, which gains high performance at risk of getting 96 + stale data on other nodes.
+27 -2
fs/ocfs2/file.c
··· 2225 2225 struct file *file = iocb->ki_filp; 2226 2226 struct inode *inode = file->f_path.dentry->d_inode; 2227 2227 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2228 + int full_coherency = !(osb->s_mount_opt & 2229 + OCFS2_MOUNT_COHERENCY_BUFFERED); 2228 2230 2229 2231 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2230 2232 (unsigned int)nr_segs, ··· 2250 2248 have_alloc_sem = 1; 2251 2249 } 2252 2250 2253 - /* concurrent O_DIRECT writes are allowed */ 2254 - rw_level = !direct_io; 2251 + /* 2252 + * Concurrent O_DIRECT writes are allowed with 2253 + * mount_option "coherency=buffered". 2254 + */ 2255 + rw_level = (!direct_io || full_coherency); 2256 + 2255 2257 ret = ocfs2_rw_lock(inode, rw_level); 2256 2258 if (ret < 0) { 2257 2259 mlog_errno(ret); 2258 2260 goto out_sems; 2261 + } 2262 + 2263 + /* 2264 + * O_DIRECT writes with "coherency=full" need to take EX cluster 2265 + * inode_lock to guarantee coherency. 2266 + */ 2267 + if (direct_io && full_coherency) { 2268 + /* 2269 + * We need to take and drop the inode lock to force 2270 + * other nodes to drop their caches. Buffered I/O 2271 + * already does this in write_begin(). 2272 + */ 2273 + ret = ocfs2_inode_lock(inode, NULL, 1); 2274 + if (ret < 0) { 2275 + mlog_errno(ret); 2276 + goto out_sems; 2277 + } 2278 + 2279 + ocfs2_inode_unlock(inode, 1); 2259 2280 } 2260 2281 2261 2282 can_do_direct = direct_io;
+3
fs/ocfs2/ocfs2.h
··· 263 263 control lists */ 264 264 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ 265 265 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ 266 + 267 + OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12 /* Allow concurrent O_DIRECT 268 + writes */ 266 269 }; 267 270 268 271 #define OCFS2_OSB_SOFT_RO 0x0001
+15
fs/ocfs2/super.c
··· 177 177 Opt_noacl, 178 178 Opt_usrquota, 179 179 Opt_grpquota, 180 + Opt_coherency_buffered, 181 + Opt_coherency_full, 180 182 Opt_resv_level, 181 183 Opt_dir_resv_level, 182 184 Opt_err, ··· 207 205 {Opt_noacl, "noacl"}, 208 206 {Opt_usrquota, "usrquota"}, 209 207 {Opt_grpquota, "grpquota"}, 208 + {Opt_coherency_buffered, "coherency=buffered"}, 209 + {Opt_coherency_full, "coherency=full"}, 210 210 {Opt_resv_level, "resv_level=%u"}, 211 211 {Opt_dir_resv_level, "dir_resv_level=%u"}, 212 212 {Opt_err, NULL} ··· 1456 1452 case Opt_grpquota: 1457 1453 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; 1458 1454 break; 1455 + case Opt_coherency_buffered: 1456 + mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED; 1457 + break; 1458 + case Opt_coherency_full: 1459 + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED; 1460 + break; 1459 1461 case Opt_acl: 1460 1462 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; 1461 1463 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; ··· 1559 1549 seq_printf(s, ",usrquota"); 1560 1550 if (opts & OCFS2_MOUNT_GRPQUOTA) 1561 1551 seq_printf(s, ",grpquota"); 1552 + 1553 + if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED) 1554 + seq_printf(s, ",coherency=buffered"); 1555 + else 1556 + seq_printf(s, ",coherency=full"); 1562 1557 1563 1558 if (opts & OCFS2_MOUNT_NOUSERXATTR) 1564 1559 seq_printf(s, ",nouser_xattr");