Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vfio/type1: handle DMA map/unmap up to the addressable limit

Before this commit, it was possible to create end-of-address-space
mappings, but unmapping them via VFIO_IOMMU_UNMAP_DMA, replaying them
for newly added iommu domains, and querying their dirty pages via
VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP were broken due to bugs caused by
comparisons against (iova + size) expressions, which overflow to zero.
Additionally, there appears to be a page pinning leak in the
vfio_iommu_type1_release() path, since vfio_unmap_unpin()'s loop body
where unmap_unpin_*() are called will never be entered due to overflow
of (iova + size) to zero.

This commit handles DMA map/unmap operations up to the addressable
limit by comparing against inclusive end-of-range limits, and by
changing iteration to perform relative traversals across range sizes
rather than absolute traversals across addresses.

vfio_link_dma() inserts a zero-sized vfio_dma into the rb-tree, and is
only used for that purpose, so discard the size from consideration for
the insertion point.

Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Fixes: 73fa0d10d077 ("vfio: Type1 IOMMU implementation")
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Signed-off-by: Alex Mastro <amastro@fb.com>
Link: https://lore.kernel.org/r/20251028-fix-unmap-v6-3-2542b96bcc8e@fb.com
Signed-off-by: Alex Williamson <alex@shazbot.org>

authored by

Alex Mastro and committed by
Alex Williamson
ef270ec4 1196f1f8

+42 -35
+42 -35
drivers/vfio/vfio_iommu_type1.c
··· 168 168 { 169 169 struct rb_node *node = iommu->dma_list.rb_node; 170 170 171 + WARN_ON(!size); 172 + 171 173 while (node) { 172 174 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); 173 175 174 - if (start + size <= dma->iova) 176 + if (start + size - 1 < dma->iova) 175 177 node = node->rb_left; 176 - else if (start >= dma->iova + dma->size) 178 + else if (start > dma->iova + dma->size - 1) 177 179 node = node->rb_right; 178 180 else 179 181 return dma; ··· 185 183 } 186 184 187 185 static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu, 188 - dma_addr_t start, size_t size) 186 + dma_addr_t start, 187 + dma_addr_t end) 189 188 { 190 189 struct rb_node *res = NULL; 191 190 struct rb_node *node = iommu->dma_list.rb_node; 192 191 struct vfio_dma *dma_res = NULL; 193 192 193 + WARN_ON(end < start); 194 + 194 195 while (node) { 195 196 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); 196 197 197 - if (start < dma->iova + dma->size) { 198 + if (start <= dma->iova + dma->size - 1) { 198 199 res = node; 199 200 dma_res = dma; 200 201 if (start >= dma->iova) ··· 207 202 node = node->rb_right; 208 203 } 209 204 } 210 - if (res && size && dma_res->iova >= start + size) 205 + if (res && dma_res->iova > end) 211 206 res = NULL; 212 207 return res; 213 208 } ··· 217 212 struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; 218 213 struct vfio_dma *dma; 219 214 215 + WARN_ON(new->size != 0); 216 + 220 217 while (*link) { 221 218 parent = *link; 222 219 dma = rb_entry(parent, struct vfio_dma, node); 223 220 224 - if (new->iova + new->size <= dma->iova) 221 + if (new->iova <= dma->iova) 225 222 link = &(*link)->rb_left; 226 223 else 227 224 link = &(*link)->rb_right; ··· 1148 1141 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, 1149 1142 bool do_accounting) 1150 1143 { 1151 - dma_addr_t iova = dma->iova, end = dma->iova + dma->size; 1152 1144 struct vfio_domain *domain, *d; 1153 1145 
LIST_HEAD(unmapped_region_list); 1154 1146 struct iommu_iotlb_gather iotlb_gather; 1155 1147 int unmapped_region_cnt = 0; 1156 1148 long unlocked = 0; 1149 + size_t pos = 0; 1157 1150 1158 1151 if (!dma->size) 1159 1152 return 0; ··· 1177 1170 } 1178 1171 1179 1172 iommu_iotlb_gather_init(&iotlb_gather); 1180 - while (iova < end) { 1173 + while (pos < dma->size) { 1181 1174 size_t unmapped, len; 1182 1175 phys_addr_t phys, next; 1176 + dma_addr_t iova = dma->iova + pos; 1183 1177 1184 1178 phys = iommu_iova_to_phys(domain->domain, iova); 1185 1179 if (WARN_ON(!phys)) { 1186 - iova += PAGE_SIZE; 1180 + pos += PAGE_SIZE; 1187 1181 continue; 1188 1182 } 1189 1183 ··· 1193 1185 * may require hardware cache flushing, try to find the 1194 1186 * largest contiguous physical memory chunk to unmap. 1195 1187 */ 1196 - for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) { 1188 + for (len = PAGE_SIZE; pos + len < dma->size; len += PAGE_SIZE) { 1197 1189 next = iommu_iova_to_phys(domain->domain, iova + len); 1198 1190 if (next != phys + len) 1199 1191 break; ··· 1214 1206 break; 1215 1207 } 1216 1208 1217 - iova += unmapped; 1209 + pos += unmapped; 1218 1210 } 1219 1211 1220 1212 dma->iommu_mapped = false; ··· 1306 1298 } 1307 1299 1308 1300 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu, 1309 - dma_addr_t iova, size_t size, size_t pgsize) 1301 + dma_addr_t iova, dma_addr_t iova_end, size_t pgsize) 1310 1302 { 1311 1303 struct vfio_dma *dma; 1312 1304 struct rb_node *n; ··· 1323 1315 if (dma && dma->iova != iova) 1324 1316 return -EINVAL; 1325 1317 1326 - dma = vfio_find_dma(iommu, iova + size - 1, 0); 1327 - if (dma && dma->iova + dma->size != iova + size) 1318 + dma = vfio_find_dma(iommu, iova_end, 1); 1319 + if (dma && dma->iova + dma->size - 1 != iova_end) 1328 1320 return -EINVAL; 1329 1321 1330 1322 for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) { ··· 1333 1325 if (dma->iova < iova) 1334 1326 continue; 1335 1327 1336 - if 
(dma->iova > iova + size - 1) 1328 + if (dma->iova > iova_end) 1337 1329 break; 1338 1330 1339 1331 ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize); ··· 1426 1418 if (unmap_all) { 1427 1419 if (iova || size) 1428 1420 goto unlock; 1429 - size = SIZE_MAX; 1421 + iova_end = ~(dma_addr_t)0; 1430 1422 } else { 1431 1423 if (!size || size & (pgsize - 1)) 1432 1424 goto unlock; ··· 1481 1473 if (dma && dma->iova != iova) 1482 1474 goto unlock; 1483 1475 1484 - dma = vfio_find_dma(iommu, iova_end, 0); 1485 - if (dma && dma->iova + dma->size != iova + size) 1476 + dma = vfio_find_dma(iommu, iova_end, 1); 1477 + if (dma && dma->iova + dma->size - 1 != iova_end) 1486 1478 goto unlock; 1487 1479 } 1488 1480 1489 1481 ret = 0; 1490 - n = first_n = vfio_find_dma_first_node(iommu, iova, size); 1482 + n = first_n = vfio_find_dma_first_node(iommu, iova, iova_end); 1491 1483 1492 1484 while (n) { 1493 1485 dma = rb_entry(n, struct vfio_dma, node); 1494 - if (dma->iova >= iova + size) 1486 + if (dma->iova > iova_end) 1495 1487 break; 1496 1488 1497 1489 if (!iommu->v2 && iova > dma->iova) ··· 1821 1813 1822 1814 for (; n; n = rb_next(n)) { 1823 1815 struct vfio_dma *dma; 1824 - dma_addr_t iova; 1816 + size_t pos = 0; 1825 1817 1826 1818 dma = rb_entry(n, struct vfio_dma, node); 1827 - iova = dma->iova; 1828 1819 1829 - while (iova < dma->iova + dma->size) { 1820 + while (pos < dma->size) { 1821 + dma_addr_t iova = dma->iova + pos; 1830 1822 phys_addr_t phys; 1831 1823 size_t size; 1832 1824 ··· 1842 1834 phys = iommu_iova_to_phys(d->domain, iova); 1843 1835 1844 1836 if (WARN_ON(!phys)) { 1845 - iova += PAGE_SIZE; 1837 + pos += PAGE_SIZE; 1846 1838 continue; 1847 1839 } 1848 1840 1849 1841 size = PAGE_SIZE; 1850 1842 p = phys + size; 1851 1843 i = iova + size; 1852 - while (i < dma->iova + dma->size && 1844 + while (pos + size < dma->size && 1853 1845 p == iommu_iova_to_phys(d->domain, i)) { 1854 1846 size += PAGE_SIZE; 1855 1847 p += PAGE_SIZE; ··· 1857 1849 } 1858 1850 
} else { 1859 1851 unsigned long pfn; 1860 - unsigned long vaddr = dma->vaddr + 1861 - (iova - dma->iova); 1862 - size_t n = dma->iova + dma->size - iova; 1852 + unsigned long vaddr = dma->vaddr + pos; 1853 + size_t n = dma->size - pos; 1863 1854 long npage; 1864 1855 1865 1856 npage = vfio_pin_pages_remote(dma, vaddr, ··· 1889 1882 goto unwind; 1890 1883 } 1891 1884 1892 - iova += size; 1885 + pos += size; 1893 1886 } 1894 1887 } 1895 1888 ··· 1906 1899 unwind: 1907 1900 for (; n; n = rb_prev(n)) { 1908 1901 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node); 1909 - dma_addr_t iova; 1902 + size_t pos = 0; 1910 1903 1911 1904 if (dma->iommu_mapped) { 1912 1905 iommu_unmap(domain->domain, dma->iova, dma->size); 1913 1906 continue; 1914 1907 } 1915 1908 1916 - iova = dma->iova; 1917 - while (iova < dma->iova + dma->size) { 1909 + while (pos < dma->size) { 1910 + dma_addr_t iova = dma->iova + pos; 1918 1911 phys_addr_t phys, p; 1919 1912 size_t size; 1920 1913 dma_addr_t i; 1921 1914 1922 1915 phys = iommu_iova_to_phys(domain->domain, iova); 1923 1916 if (!phys) { 1924 - iova += PAGE_SIZE; 1917 + pos += PAGE_SIZE; 1925 1918 continue; 1926 1919 } 1927 1920 1928 1921 size = PAGE_SIZE; 1929 1922 p = phys + size; 1930 1923 i = iova + size; 1931 - while (i < dma->iova + dma->size && 1924 + while (pos + size < dma->size && 1932 1925 p == iommu_iova_to_phys(domain->domain, i)) { 1933 1926 size += PAGE_SIZE; 1934 1927 p += PAGE_SIZE; ··· 3066 3059 3067 3060 if (iommu->dirty_page_tracking) 3068 3061 ret = vfio_iova_dirty_bitmap(range.bitmap.data, 3069 - iommu, iova, size, 3062 + iommu, iova, iova_end, 3070 3063 range.bitmap.pgsize); 3071 3064 else 3072 3065 ret = -EINVAL;