Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

dax: Call ->iomap_begin without entry lock during dax fault

Currently the ->iomap_begin() handler is called with the entry lock held. If the
filesystem holds any locks between ->iomap_begin() and ->iomap_end()
(such as ext4, which will want to hold a transaction open), this causes a
lock inversion with iomap_apply() from the standard IO path, which first
calls ->iomap_begin() and only then calls the ->actor() callback, which grabs
the entry locks for DAX (if it faults when copying from/to user-provided
buffers).

Fix the problem by nesting the grabbing of the entry lock inside the
->iomap_begin() - ->iomap_end() pair.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

authored by

Jan Kara and committed by
Dan Williams
9f141d6e f449b936

+67 -56
+67 -56
fs/dax.c
··· 1078 1078 } 1079 1079 EXPORT_SYMBOL_GPL(dax_iomap_rw); 1080 1080 1081 + static int dax_fault_return(int error) 1082 + { 1083 + if (error == 0) 1084 + return VM_FAULT_NOPAGE; 1085 + if (error == -ENOMEM) 1086 + return VM_FAULT_OOM; 1087 + return VM_FAULT_SIGBUS; 1088 + } 1089 + 1081 1090 /** 1082 1091 * dax_iomap_fault - handle a page fault on a DAX file 1083 1092 * @vma: The virtual memory area where the fault occurred ··· 1119 1110 if (pos >= i_size_read(inode)) 1120 1111 return VM_FAULT_SIGBUS; 1121 1112 1122 - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); 1123 - if (IS_ERR(entry)) { 1124 - error = PTR_ERR(entry); 1125 - goto out; 1126 - } 1127 - 1128 1113 if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) 1129 1114 flags |= IOMAP_WRITE; 1130 1115 ··· 1129 1126 */ 1130 1127 error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); 1131 1128 if (error) 1132 - goto unlock_entry; 1129 + return dax_fault_return(error); 1133 1130 if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { 1134 - error = -EIO; /* fs corruption? */ 1131 + vmf_ret = dax_fault_return(-EIO); /* fs corruption? 
*/ 1132 + goto finish_iomap; 1133 + } 1134 + 1135 + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); 1136 + if (IS_ERR(entry)) { 1137 + vmf_ret = dax_fault_return(PTR_ERR(entry)); 1135 1138 goto finish_iomap; 1136 1139 } 1137 1140 ··· 1160 1151 } 1161 1152 1162 1153 if (error) 1163 - goto finish_iomap; 1154 + goto error_unlock_entry; 1164 1155 1165 1156 __SetPageUptodate(vmf->cow_page); 1166 1157 vmf_ret = finish_fault(vmf); 1167 1158 if (!vmf_ret) 1168 1159 vmf_ret = VM_FAULT_DONE_COW; 1169 - goto finish_iomap; 1160 + goto unlock_entry; 1170 1161 } 1171 1162 1172 1163 switch (iomap.type) { ··· 1178 1169 } 1179 1170 error = dax_insert_mapping(mapping, iomap.bdev, sector, 1180 1171 PAGE_SIZE, &entry, vma, vmf); 1172 + /* -EBUSY is fine, somebody else faulted on the same PTE */ 1173 + if (error == -EBUSY) 1174 + error = 0; 1181 1175 break; 1182 1176 case IOMAP_UNWRITTEN: 1183 1177 case IOMAP_HOLE: 1184 1178 if (!(vmf->flags & FAULT_FLAG_WRITE)) { 1185 1179 vmf_ret = dax_load_hole(mapping, &entry, vmf); 1186 - goto finish_iomap; 1180 + goto unlock_entry; 1187 1181 } 1188 1182 /*FALLTHRU*/ 1189 1183 default: ··· 1195 1183 break; 1196 1184 } 1197 1185 1198 - finish_iomap: 1199 - if (ops->iomap_end) { 1200 - if (error || (vmf_ret & VM_FAULT_ERROR)) { 1201 - /* keep previous error */ 1202 - ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, 1203 - &iomap); 1204 - } else { 1205 - error = ops->iomap_end(inode, pos, PAGE_SIZE, 1206 - PAGE_SIZE, flags, &iomap); 1207 - } 1208 - } 1186 + error_unlock_entry: 1187 + vmf_ret = dax_fault_return(error) | major; 1209 1188 unlock_entry: 1210 1189 put_locked_mapping_entry(mapping, vmf->pgoff, entry); 1211 - out: 1212 - if (error == -ENOMEM) 1213 - return VM_FAULT_OOM | major; 1214 - /* -EBUSY is fine, somebody else faulted on the same PTE */ 1215 - if (error < 0 && error != -EBUSY) 1216 - return VM_FAULT_SIGBUS | major; 1217 - if (vmf_ret) { 1218 - WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? 
*/ 1219 - return vmf_ret; 1190 + finish_iomap: 1191 + if (ops->iomap_end) { 1192 + int copied = PAGE_SIZE; 1193 + 1194 + if (vmf_ret & VM_FAULT_ERROR) 1195 + copied = 0; 1196 + /* 1197 + * The fault is done by now and there's no way back (other 1198 + * thread may be already happily using PTE we have installed). 1199 + * Just ignore error from ->iomap_end since we cannot do much 1200 + * with it. 1201 + */ 1202 + ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); 1220 1203 } 1221 - return VM_FAULT_NOPAGE | major; 1204 + return vmf_ret; 1222 1205 } 1223 1206 EXPORT_SYMBOL_GPL(dax_iomap_fault); 1224 1207 ··· 1338 1331 goto fallback; 1339 1332 1340 1333 /* 1341 - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX 1342 - * PMD or a HZP entry. If it can't (because a 4k page is already in 1343 - * the tree, for instance), it will return -EEXIST and we just fall 1344 - * back to 4k entries. 1345 - */ 1346 - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); 1347 - if (IS_ERR(entry)) 1348 - goto fallback; 1349 - 1350 - /* 1351 1334 * Note that we don't use iomap_apply here. We aren't doing I/O, only 1352 1335 * setting up a mapping, so really we're using iomap_begin() as a way 1353 1336 * to look up our filesystem block. ··· 1345 1348 pos = (loff_t)pgoff << PAGE_SHIFT; 1346 1349 error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); 1347 1350 if (error) 1348 - goto unlock_entry; 1351 + goto fallback; 1352 + 1349 1353 if (iomap.offset + iomap.length < pos + PMD_SIZE) 1354 + goto finish_iomap; 1355 + 1356 + /* 1357 + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX 1358 + * PMD or a HZP entry. If it can't (because a 4k page is already in 1359 + * the tree, for instance), it will return -EEXIST and we just fall 1360 + * back to 4k entries. 
1361 + */ 1362 + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); 1363 + if (IS_ERR(entry)) 1350 1364 goto finish_iomap; 1351 1365 1352 1366 vmf.pgoff = pgoff; ··· 1372 1364 case IOMAP_UNWRITTEN: 1373 1365 case IOMAP_HOLE: 1374 1366 if (WARN_ON_ONCE(write)) 1375 - goto finish_iomap; 1367 + goto unlock_entry; 1376 1368 result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, 1377 1369 &entry); 1378 1370 break; ··· 1381 1373 break; 1382 1374 } 1383 1375 1384 - finish_iomap: 1385 - if (ops->iomap_end) { 1386 - if (result == VM_FAULT_FALLBACK) { 1387 - ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags, 1388 - &iomap); 1389 - } else { 1390 - error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE, 1391 - iomap_flags, &iomap); 1392 - if (error) 1393 - result = VM_FAULT_FALLBACK; 1394 - } 1395 - } 1396 1376 unlock_entry: 1397 1377 put_locked_mapping_entry(mapping, pgoff, entry); 1378 + finish_iomap: 1379 + if (ops->iomap_end) { 1380 + int copied = PMD_SIZE; 1381 + 1382 + if (result == VM_FAULT_FALLBACK) 1383 + copied = 0; 1384 + /* 1385 + * The fault is done by now and there's no way back (other 1386 + * thread may be already happily using PMD we have installed). 1387 + * Just ignore error from ->iomap_end since we cannot do much 1388 + * with it. 1389 + */ 1390 + ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, 1391 + &iomap); 1392 + } 1398 1393 fallback: 1399 1394 if (result == VM_FAULT_FALLBACK) { 1400 1395 split_huge_pmd(vma, pmd, address);