Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fs: add fcntl() interface for setting/getting write life time hints

Define a set of write life time hints:

RWH_WRITE_LIFE_NOT_SET No hint information set
RWH_WRITE_LIFE_NONE No hints about write life time
RWH_WRITE_LIFE_SHORT Data written has a short life time
RWH_WRITE_LIFE_MEDIUM Data written has a medium life time
RWH_WRITE_LIFE_LONG Data written has a long life time
RWH_WRITE_LIFE_EXTREME Data written has an extremely long life time

The intent is for these values to be relative to each other; no
absolute meaning should be attached to these flag names.

Add an fcntl interface for querying these flags and for
setting them:

F_GET_RW_HINT Returns the read/write hint set on the
underlying inode.

F_SET_RW_HINT Set one of the above write hints on the
underlying inode.

F_GET_FILE_RW_HINT Returns the read/write hint set on the
file descriptor.

F_SET_FILE_RW_HINT Set one of the above write hints on the
file descriptor.

The user passes in a pointer to a 64-bit value to get/set these hints, and
the interface returns 0/-1 on success/error.

Sample program testing/implementing basic setting/getting of write
hints is below.

Add support for storing the write life time hint in the inode flags
and in struct file as well, and pass them to the kiocb flags. If
both a file and its corresponding inode have a write hint, then we
use the one in the file, if available. The file hint can be used
for sync/direct IO; for buffered writeback, only the inode hint
is available.

This is in preparation for utilizing these hints in the block layer,
to guide on-media data placement.

/*
* writehint.c: get or set an inode write hint
*/
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdbool.h>
#include <inttypes.h>

/*
 * Define the write-hint fcntl commands locally when the included headers
 * do not already provide F_GET_RW_HINT. These act on the inode-level hint;
 * the per-file variants (F_{GET,SET}_FILE_RW_HINT) are not used here.
 */
#ifndef F_GET_RW_HINT
#define F_LINUX_SPECIFIC_BASE 1024
#define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11)
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#endif

/*
 * Printable names for the write life time hints, indexed by hint value
 * (0 = not set ... 5 = extreme). The entries are string literals, so both
 * the pointers and the characters they point to are read-only.
 */
static const char *const str[] = {
	"RWF_WRITE_LIFE_NOT_SET",
	"RWH_WRITE_LIFE_NONE",
	"RWH_WRITE_LIFE_SHORT",
	"RWH_WRITE_LIFE_MEDIUM",
	"RWH_WRITE_LIFE_LONG",
	"RWH_WRITE_LIFE_EXTREME",
};

/*
 * Usage: writehint file [hint]
 *
 * With a hint argument, set that write life time hint on the file's inode
 * via F_SET_RW_HINT. In all cases, read the inode hint back with
 * F_GET_RW_HINT and print it.
 *
 * Exit status: 0 on success, 1 on bad usage or an unparsable hint,
 * 2 if the file cannot be opened, 3 if F_GET_RW_HINT fails,
 * 4 if F_SET_RW_HINT fails.
 */
int main(int argc, char *argv[])
{
	const uint64_t nr_hints = sizeof(str) / sizeof(str[0]);
	uint64_t hint = 0;
	int fd, ret;

	if (argc < 2) {
		fprintf(stderr, "%s: file <hint>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 2;
	}

	if (argc > 2) {
		char *end;

		/*
		 * strtoull() lets us reject non-numeric input instead of
		 * silently treating it as 0; range checking of the value
		 * itself is left to the fcntl call.
		 */
		hint = strtoull(argv[2], &end, 10);
		if (end == argv[2] || *end != '\0') {
			fprintf(stderr, "%s: invalid hint '%s'\n",
				argv[0], argv[2]);
			close(fd);
			return 1;
		}

		ret = fcntl(fd, F_SET_RW_HINT, &hint);
		if (ret < 0) {
			/* Report before close() so errno is not clobbered. */
			perror("fcntl: F_SET_RW_HINT");
			close(fd);
			return 4;
		}
	}

	ret = fcntl(fd, F_GET_RW_HINT, &hint);
	if (ret < 0) {
		perror("fcntl: F_GET_RW_HINT");
		close(fd);
		return 3;
	}

	/* Never index the name table with a value it has no entry for. */
	if (hint < nr_hints)
		printf("%s: hint %s\n", argv[1], str[hint]);
	else
		printf("%s: hint unknown (%" PRIu64 ")\n", argv[1], hint);

	close(fd);
	return 0;
}

Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

+120 -12
+62
fs/fcntl.c
··· 243 243 } 244 244 #endif 245 245 246 + static bool rw_hint_valid(enum rw_hint hint) 247 + { 248 + switch (hint) { 249 + case RWF_WRITE_LIFE_NOT_SET: 250 + case RWH_WRITE_LIFE_NONE: 251 + case RWH_WRITE_LIFE_SHORT: 252 + case RWH_WRITE_LIFE_MEDIUM: 253 + case RWH_WRITE_LIFE_LONG: 254 + case RWH_WRITE_LIFE_EXTREME: 255 + return true; 256 + default: 257 + return false; 258 + } 259 + } 260 + 261 + static long fcntl_rw_hint(struct file *file, unsigned int cmd, 262 + unsigned long arg) 263 + { 264 + struct inode *inode = file_inode(file); 265 + u64 *argp = (u64 __user *)arg; 266 + enum rw_hint hint; 267 + 268 + switch (cmd) { 269 + case F_GET_FILE_RW_HINT: 270 + if (put_user(file_write_hint(file), argp)) 271 + return -EFAULT; 272 + return 0; 273 + case F_SET_FILE_RW_HINT: 274 + if (get_user(hint, argp)) 275 + return -EFAULT; 276 + if (!rw_hint_valid(hint)) 277 + return -EINVAL; 278 + 279 + spin_lock(&file->f_lock); 280 + file->f_write_hint = hint; 281 + spin_unlock(&file->f_lock); 282 + return 0; 283 + case F_GET_RW_HINT: 284 + if (put_user(inode->i_write_hint, argp)) 285 + return -EFAULT; 286 + return 0; 287 + case F_SET_RW_HINT: 288 + if (get_user(hint, argp)) 289 + return -EFAULT; 290 + if (!rw_hint_valid(hint)) 291 + return -EINVAL; 292 + 293 + inode_lock(inode); 294 + inode->i_write_hint = hint; 295 + inode_unlock(inode); 296 + return 0; 297 + default: 298 + return -EINVAL; 299 + } 300 + } 301 + 246 302 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, 247 303 struct file *filp) 248 304 { ··· 392 336 case F_ADD_SEALS: 393 337 case F_GET_SEALS: 394 338 err = shmem_fcntl(filp, cmd, arg); 339 + break; 340 + case F_GET_RW_HINT: 341 + case F_SET_RW_HINT: 342 + case F_GET_FILE_RW_HINT: 343 + case F_SET_FILE_RW_HINT: 344 + err = fcntl_rw_hint(filp, cmd, arg); 395 345 break; 396 346 default: 397 347 break;
+1
fs/inode.c
··· 146 146 i_gid_write(inode, 0); 147 147 atomic_set(&inode->i_writecount, 0); 148 148 inode->i_size = 0; 149 + inode->i_write_hint = WRITE_LIFE_NOT_SET; 149 150 inode->i_blocks = 0; 150 151 inode->i_bytes = 0; 151 152 inode->i_generation = 0;
+1
fs/open.c
··· 759 759 likely(f->f_op->write || f->f_op->write_iter)) 760 760 f->f_mode |= FMODE_CAN_WRITE; 761 761 762 + f->f_write_hint = WRITE_LIFE_NOT_SET; 762 763 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 763 764 764 765 file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
+35 -12
include/linux/fs.h
··· 20 20 #include <linux/rwsem.h> 21 21 #include <linux/capability.h> 22 22 #include <linux/semaphore.h> 23 + #include <linux/fcntl.h> 23 24 #include <linux/fiemap.h> 24 25 #include <linux/rculist_bl.h> 25 26 #include <linux/atomic.h> ··· 266 265 struct address_space; 267 266 struct writeback_control; 268 267 268 + /* 269 + * Write life time hint values. 270 + */ 271 + enum rw_hint { 272 + WRITE_LIFE_NOT_SET = 0, 273 + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, 274 + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, 275 + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, 276 + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, 277 + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, 278 + }; 279 + 269 280 #define IOCB_EVENTFD (1 << 0) 270 281 #define IOCB_APPEND (1 << 1) 271 282 #define IOCB_DIRECT (1 << 2) ··· 293 280 void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); 294 281 void *private; 295 282 int ki_flags; 283 + enum rw_hint ki_hint; 296 284 }; 297 285 298 286 static inline bool is_sync_kiocb(struct kiocb *kiocb) 299 287 { 300 288 return kiocb->ki_complete == NULL; 301 - } 302 - 303 - static inline int iocb_flags(struct file *file); 304 - 305 - static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) 306 - { 307 - *kiocb = (struct kiocb) { 308 - .ki_filp = filp, 309 - .ki_flags = iocb_flags(filp), 310 - }; 311 289 } 312 290 313 291 /* ··· 601 597 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ 602 598 unsigned short i_bytes; 603 599 unsigned int i_blkbits; 600 + enum rw_hint i_write_hint; 604 601 blkcnt_t i_blocks; 605 602 606 603 #ifdef __NEED_I_SIZE_ORDERED ··· 856 851 * Must not be taken from IRQ context. 
857 852 */ 858 853 spinlock_t f_lock; 854 + enum rw_hint f_write_hint; 859 855 atomic_long_t f_count; 860 856 unsigned int f_flags; 861 857 fmode_t f_mode; ··· 1031 1025 #define OFFSET_MAX INT_LIMIT(loff_t) 1032 1026 #define OFFT_OFFSET_MAX INT_LIMIT(off_t) 1033 1027 #endif 1034 - 1035 - #include <linux/fcntl.h> 1036 1028 1037 1029 extern void send_sigio(struct fown_struct *fown, int fd, int band); 1038 1030 ··· 1880 1876 static inline bool HAS_UNMAPPED_ID(struct inode *inode) 1881 1877 { 1882 1878 return !uid_valid(inode->i_uid) || !gid_valid(inode->i_gid); 1879 + } 1880 + 1881 + static inline enum rw_hint file_write_hint(struct file *file) 1882 + { 1883 + if (file->f_write_hint != WRITE_LIFE_NOT_SET) 1884 + return file->f_write_hint; 1885 + 1886 + return file_inode(file)->i_write_hint; 1887 + } 1888 + 1889 + static inline int iocb_flags(struct file *file); 1890 + 1891 + static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) 1892 + { 1893 + *kiocb = (struct kiocb) { 1894 + .ki_filp = filp, 1895 + .ki_flags = iocb_flags(filp), 1896 + .ki_hint = file_write_hint(filp), 1897 + }; 1883 1898 } 1884 1899 1885 1900 /*
+21
include/uapi/linux/fcntl.h
··· 43 43 /* (1U << 31) is reserved for signed error codes */ 44 44 45 45 /* 46 + * Set/Get write life time hints. {GET,SET}_RW_HINT operate on the 47 + * underlying inode, while {GET,SET}_FILE_RW_HINT operate only on 48 + * the specific file. 49 + */ 50 + #define F_GET_RW_HINT (F_LINUX_SPECIFIC_BASE + 11) 51 + #define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) 52 + #define F_GET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 13) 53 + #define F_SET_FILE_RW_HINT (F_LINUX_SPECIFIC_BASE + 14) 54 + 55 + /* 56 + * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be 57 + * used to clear any hints previously set. 58 + */ 59 + #define RWF_WRITE_LIFE_NOT_SET 0 60 + #define RWH_WRITE_LIFE_NONE 1 61 + #define RWH_WRITE_LIFE_SHORT 2 62 + #define RWH_WRITE_LIFE_MEDIUM 3 63 + #define RWH_WRITE_LIFE_LONG 4 64 + #define RWH_WRITE_LIFE_EXTREME 5 65 + 66 + /* 46 67 * Types of directory notifications that may be requested. 47 68 */ 48 69 #define DN_ACCESS 0x00000001 /* File accessed */