iommupt: Only cache flush memory changed by unmap

The cache flush was happening at every level across the whole iteration
range, even if no leaf or table entries were cleared. Instead, flush only
the sub-range that was actually written.

Overflushing isn't a correctness problem but it does impact the
performance of unmap.
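
To illustrate the idea outside of the page table code, here is a minimal
stand-alone sketch of the same pattern: remember the lowest and highest
entry index actually cleared while walking a table, then issue one flush
over just that span. Everything in it (example_flush_range(), NUM_ENTRIES,
the plain uint64_t table) is a hypothetical stand-in, not the kernel
implementation.

/*
 * Hypothetical sketch: clear the populated entries in a range and track
 * the smallest/largest index that was actually written, so a single
 * flush can cover just that sub-range.
 */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_ENTRIES 512			/* hypothetical table size */

static void example_flush_range(uint64_t *table, unsigned int start,
				unsigned int end)
{
	/* Stand-in for flushing the CPU cache over table[start..end) */
	printf("flush entries [%u, %u)\n", start, end);
}

static void clear_and_flush(uint64_t *table, unsigned int start,
			    unsigned int end)
{
	unsigned int flush_start = UINT_MAX;
	unsigned int flush_end = UINT_MAX;
	unsigned int i;

	for (i = start; i < end; i++) {
		if (!table[i])
			continue;	/* nothing written here, skip */
		table[i] = 0;
		if (i < flush_start)
			flush_start = i;
		flush_end = i + 1;
	}

	/* If nothing was cleared the two indexes are still equal: no flush */
	if (flush_start != flush_end)
		example_flush_range(table, flush_start, flush_end);
}

int main(void)
{
	uint64_t table[NUM_ENTRIES] = { 0 };

	table[7] = 1;
	table[12] = 1;
	clear_and_flush(table, 0, NUM_ENTRIES);	/* flushes only [7, 13) */
	clear_and_flush(table, 0, NUM_ENTRIES);	/* nothing left: no flush */
	return 0;
}

Starting both indexes at UINT_MAX means a walk that clears nothing leaves
them equal, so the flush is skipped entirely.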

After this series, the performance compared to the original VT-d
implementation with cache flushing turned on is:

map_pages
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 253,266 , 213,227 , 6.06
2^21, 246,244 , 221,219 , 0.00
2^30, 231,240 , 209,217 , 3.03
256*2^12, 2604,2668 , 2415,2540 , 4.04
256*2^21, 2495,2824 , 2390,2734 , 12.12
256*2^30, 2542,2845 , 2380,2718 , 12.12

unmap_pages
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 259,292 , 222,251 , 11.11
2^21, 255,259 , 227,236 , 3.03
2^30, 238,254 , 217,230 , 5.05
256*2^12, 2751,2620 , 2417,2437 , 0.00
256*2^21, 2461,2526 , 2377,2423 , 1.01
256*2^30, 2498,2543 , 2370,2404 , 1.01

Fixes: efa03dab7ce4 ("iommupt: Flush the CPU cache after any writes to the page table")
Reported-by: Francois Dugast <francois.dugast@intel.com>
Closes: https://lore.kernel.org/all/20260121130233.257428-1-francois.dugast@intel.com/
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Tested-by: Francois Dugast <francois.dugast@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

+10 -1
drivers/iommu/generic_pt/iommu_pt.h
···
				  struct pt_table_p *table)
{
	struct pt_state pts = pt_init(range, level, table);
+	unsigned int flush_start_index = UINT_MAX;
+	unsigned int flush_end_index = UINT_MAX;
	struct pt_unmap_args *unmap = arg;
	unsigned int num_oas = 0;
	unsigned int start_index;
···
			iommu_pages_list_add(&unmap->free_list,
					     pts.table_lower);
			pt_clear_entries(&pts, ilog2(1));
+			if (pts.index < flush_start_index)
+				flush_start_index = pts.index;
+			flush_end_index = pts.index + 1;
		}
		pts.index++;
	} else {
···
		num_contig_lg2 = pt_entry_num_contig_lg2(&pts);
		pt_clear_entries(&pts, num_contig_lg2);
		num_oas += log2_to_int(num_contig_lg2);
+		if (pts.index < flush_start_index)
+			flush_start_index = pts.index;
		pts.index += log2_to_int(num_contig_lg2);
+		flush_end_index = pts.index;
	}
	if (pts.index >= pts.end_index)
		break;
···
	} while (true);

	unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts));
-	flush_writes_range(&pts, start_index, pts.index);
+	if (flush_start_index != flush_end_index)
+		flush_writes_range(&pts, flush_start_index, flush_end_index);

	return ret;
}