Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ptp: fix the race between the release of ptp_clock and cdev

When a ptp chardev (like /dev/ptp0) is open but the underlying device
is removed, closing this file leads to a race. This is easily reproduced
in a KVM virtual machine:

ts# cat openptp0.c
int main() { ... fp = fopen("/dev/ptp0", "r"); ... sleep(10); }
ts# uname -r
5.5.0-rc3-46cf053e
ts# cat /proc/cmdline
... slub_debug=FZP
ts# modprobe ptp_kvm
ts# ./openptp0 &
[1] 670
opened /dev/ptp0, sleeping 10s...
ts# rmmod ptp_kvm
ts# ls /dev/ptp*
ls: cannot access '/dev/ptp*': No such file or directory
ts# ...woken up
[ 48.010809] general protection fault: 0000 [#1] SMP
[ 48.012502] CPU: 6 PID: 658 Comm: openptp0 Not tainted 5.5.0-rc3-46cf053e #25
[ 48.014624] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...
[ 48.016270] RIP: 0010:module_put.part.0+0x7/0x80
[ 48.017939] RSP: 0018:ffffb3850073be00 EFLAGS: 00010202
[ 48.018339] RAX: 000000006b6b6b6b RBX: 6b6b6b6b6b6b6b6b RCX: ffff89a476c00ad0
[ 48.018936] RDX: fffff65a08d3ea08 RSI: 0000000000000247 RDI: 6b6b6b6b6b6b6b6b
[ 48.019470] ... ^^^ a slub poison
[ 48.023854] Call Trace:
[ 48.024050] __fput+0x21f/0x240
[ 48.024288] task_work_run+0x79/0x90
[ 48.024555] do_exit+0x2af/0xab0
[ 48.024799] ? vfs_write+0x16a/0x190
[ 48.025082] do_group_exit+0x35/0x90
[ 48.025387] __x64_sys_exit_group+0xf/0x10
[ 48.025737] do_syscall_64+0x3d/0x130
[ 48.026056] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 48.026479] RIP: 0033:0x7f53b12082f6
[ 48.026792] ...
[ 48.030945] Modules linked in: ptp i6300esb watchdog [last unloaded: ptp_kvm]
[ 48.045001] Fixing recursive fault but reboot is needed!

This happens in:

static void __fput(struct file *file)
{ ...
if (file->f_op->release)
file->f_op->release(inode, file); <<< cdev is kfree'd here
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
!(mode & FMODE_PATH))) {
cdev_put(inode->i_cdev); <<< cdev fields are accessed here

Namely:

__fput()
posix_clock_release()
kref_put(&clk->kref, delete_clock) <<< the last reference
delete_clock()
delete_ptp_clock()
kfree(ptp) <<< cdev is embedded in ptp
cdev_put
module_put(p->owner) <<< *p is kfree'd, bang!

Here cdev is embedded in posix_clock, which in turn is embedded in
ptp_clock. The race happens because ptp_clock's lifetime is controlled by
two independent refcounts in posix_clock: kref and cdev.kobj. This is wrong.

Make ptp_clock's sysfs device a parent of cdev by using cdev_device_add(),
a helper created especially for such cases. This way the parent device with
its ptp_clock is not released until all references to the cdev are released.
This adds a requirement that the caller must provide an initialized but not
yet exposed struct device to posix_clock_register() instead of a simple
dev_t.

This approach was adopted from commit 72139dfa2464 ("watchdog: Fix
the race between the release of watchdog_core_data and cdev"). See
commit 233ed09d7fda ("chardev: add helper function to register char
devs with a struct device") for details of the implementation.

Link: https://lore.kernel.org/linux-fsdevel/20191125125342.6189-1-vdronov@redhat.com/T/#u
Analyzed-by: Stephen Johnston <sjohnsto@redhat.com>
Analyzed-by: Vern Lovejoy <vlovejoy@redhat.com>
Signed-off-by: Vladis Dronov <vdronov@redhat.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Vladis Dronov and committed by
David S. Miller
a33121e5 54fa49ee

+39 -44
+14 -17
drivers/ptp/ptp_clock.c
··· 166 166 .read = ptp_read, 167 167 }; 168 168 169 - static void delete_ptp_clock(struct posix_clock *pc) 169 + static void ptp_clock_release(struct device *dev) 170 170 { 171 - struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); 171 + struct ptp_clock *ptp = container_of(dev, struct ptp_clock, dev); 172 172 173 173 mutex_destroy(&ptp->tsevq_mux); 174 174 mutex_destroy(&ptp->pincfg_mux); ··· 213 213 } 214 214 215 215 ptp->clock.ops = ptp_clock_ops; 216 - ptp->clock.release = delete_ptp_clock; 217 216 ptp->info = info; 218 217 ptp->devid = MKDEV(major, index); 219 218 ptp->index = index; ··· 235 236 if (err) 236 237 goto no_pin_groups; 237 238 238 - /* Create a new device in our class. */ 239 - ptp->dev = device_create_with_groups(ptp_class, parent, ptp->devid, 240 - ptp, ptp->pin_attr_groups, 241 - "ptp%d", ptp->index); 242 - if (IS_ERR(ptp->dev)) { 243 - err = PTR_ERR(ptp->dev); 244 - goto no_device; 245 - } 246 - 247 239 /* Register a new PPS source. */ 248 240 if (info->pps) { 249 241 struct pps_source_info pps; ··· 250 260 } 251 261 } 252 262 253 - /* Create a posix clock. */ 254 - err = posix_clock_register(&ptp->clock, ptp->devid); 263 + /* Initialize a new device of our class in our clock structure. */ 264 + device_initialize(&ptp->dev); 265 + ptp->dev.devt = ptp->devid; 266 + ptp->dev.class = ptp_class; 267 + ptp->dev.parent = parent; 268 + ptp->dev.groups = ptp->pin_attr_groups; 269 + ptp->dev.release = ptp_clock_release; 270 + dev_set_drvdata(&ptp->dev, ptp); 271 + dev_set_name(&ptp->dev, "ptp%d", ptp->index); 272 + 273 + /* Create a posix clock and link it to the device. 
*/ 274 + err = posix_clock_register(&ptp->clock, &ptp->dev); 255 275 if (err) { 256 276 pr_err("failed to create posix clock\n"); 257 277 goto no_clock; ··· 273 273 if (ptp->pps_source) 274 274 pps_unregister_source(ptp->pps_source); 275 275 no_pps: 276 - device_destroy(ptp_class, ptp->devid); 277 - no_device: 278 276 ptp_cleanup_pin_groups(ptp); 279 277 no_pin_groups: 280 278 if (ptp->kworker) ··· 302 304 if (ptp->pps_source) 303 305 pps_unregister_source(ptp->pps_source); 304 306 305 - device_destroy(ptp_class, ptp->devid); 306 307 ptp_cleanup_pin_groups(ptp); 307 308 308 309 posix_clock_unregister(&ptp->clock);
+1 -1
drivers/ptp/ptp_private.h
··· 28 28 29 29 struct ptp_clock { 30 30 struct posix_clock clock; 31 - struct device *dev; 31 + struct device dev; 32 32 struct ptp_clock_info *info; 33 33 dev_t devid; 34 34 int index; /* index into clocks.map */
+11 -8
include/linux/posix-clock.h
··· 69 69 * 70 70 * @ops: Functional interface to the clock 71 71 * @cdev: Character device instance for this clock 72 - * @kref: Reference count. 72 + * @dev: Pointer to the clock's device. 73 73 * @rwsem: Protects the 'zombie' field from concurrent access. 74 74 * @zombie: If 'zombie' is true, then the hardware has disappeared. 75 - * @release: A function to free the structure when the reference count reaches 76 - * zero. May be NULL if structure is statically allocated. 77 75 * 78 76 * Drivers should embed their struct posix_clock within a private 79 77 * structure, obtaining a reference to it during callbacks using 80 78 * container_of(). 79 + * 80 + * Drivers should supply an initialized but not exposed struct device 81 + * to posix_clock_register(). It is used to manage lifetime of the 82 + * driver's private structure. It's 'release' field should be set to 83 + * a release function for this private structure. 81 84 */ 82 85 struct posix_clock { 83 86 struct posix_clock_operations ops; 84 87 struct cdev cdev; 85 - struct kref kref; 88 + struct device *dev; 86 89 struct rw_semaphore rwsem; 87 90 bool zombie; 88 - void (*release)(struct posix_clock *clk); 89 91 }; 90 92 91 93 /** 92 94 * posix_clock_register() - register a new clock 93 - * @clk: Pointer to the clock. Caller must provide 'ops' and 'release' 94 - * @devid: Allocated device id 95 + * @clk: Pointer to the clock. Caller must provide 'ops' field 96 + * @dev: Pointer to the initialized device. Caller must provide 97 + * 'release' field 95 98 * 96 99 * A clock driver calls this function to register itself with the 97 100 * clock device subsystem. If 'clk' points to dynamically allocated ··· 103 100 * 104 101 * Returns zero on success, non-zero otherwise. 105 102 */ 106 - int posix_clock_register(struct posix_clock *clk, dev_t devid); 103 + int posix_clock_register(struct posix_clock *clk, struct device *dev); 107 104 108 105 /** 109 106 * posix_clock_unregister() - unregister a clock
+13 -18
kernel/time/posix-clock.c
··· 14 14 15 15 #include "posix-timers.h" 16 16 17 - static void delete_clock(struct kref *kref); 18 - 19 17 /* 20 18 * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. 21 19 */ ··· 123 125 err = 0; 124 126 125 127 if (!err) { 126 - kref_get(&clk->kref); 128 + get_device(clk->dev); 127 129 fp->private_data = clk; 128 130 } 129 131 out: ··· 139 141 if (clk->ops.release) 140 142 err = clk->ops.release(clk); 141 143 142 - kref_put(&clk->kref, delete_clock); 144 + put_device(clk->dev); 143 145 144 146 fp->private_data = NULL; 145 147 ··· 159 161 #endif 160 162 }; 161 163 162 - int posix_clock_register(struct posix_clock *clk, dev_t devid) 164 + int posix_clock_register(struct posix_clock *clk, struct device *dev) 163 165 { 164 166 int err; 165 167 166 - kref_init(&clk->kref); 167 168 init_rwsem(&clk->rwsem); 168 169 169 170 cdev_init(&clk->cdev, &posix_clock_file_operations); 171 + err = cdev_device_add(&clk->cdev, dev); 172 + if (err) { 173 + pr_err("%s unable to add device %d:%d\n", 174 + dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); 175 + return err; 176 + } 170 177 clk->cdev.owner = clk->ops.owner; 171 - err = cdev_add(&clk->cdev, devid, 1); 178 + clk->dev = dev; 172 179 173 - return err; 180 + return 0; 174 181 } 175 182 EXPORT_SYMBOL_GPL(posix_clock_register); 176 183 177 - static void delete_clock(struct kref *kref) 178 - { 179 - struct posix_clock *clk = container_of(kref, struct posix_clock, kref); 180 - 181 - if (clk->release) 182 - clk->release(clk); 183 - } 184 - 185 184 void posix_clock_unregister(struct posix_clock *clk) 186 185 { 187 - cdev_del(&clk->cdev); 186 + cdev_device_del(&clk->cdev, clk->dev); 188 187 189 188 down_write(&clk->rwsem); 190 189 clk->zombie = true; 191 190 up_write(&clk->rwsem); 192 191 193 - kref_put(&clk->kref, delete_clock); 192 + put_device(clk->dev); 194 193 } 195 194 EXPORT_SYMBOL_GPL(posix_clock_unregister); 196 195