Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

libceph: multiple workspaces for CRUSH computations

Replace a global map->crush_workspace (protected by a global mutex)
with a list of workspaces, up to the number of CPUs + 1.

This is based on a patch from Robin Geuze <robing@nl.team.blue>.
Robin and his team have observed a 10-20% increase in IOPS at all
queue depths, as well as lower CPU usage, on a high-end all-NVMe
100GbE cluster.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

+166 -17
+12 -2
include/linux/ceph/osdmap.h
··· 137 137 const char *fmt, ...); 138 138 void ceph_oid_destroy(struct ceph_object_id *oid); 139 139 140 + struct workspace_manager { 141 + struct list_head idle_ws; 142 + spinlock_t ws_lock; 143 + /* Number of free workspaces */ 144 + int free_ws; 145 + /* Total number of allocated workspaces */ 146 + atomic_t total_ws; 147 + /* Waiters for a free workspace */ 148 + wait_queue_head_t ws_wait; 149 + }; 150 + 140 151 struct ceph_pg_mapping { 141 152 struct rb_node node; 142 153 struct ceph_pg pgid; ··· 195 184 * the list of osds that store+replicate them. */ 196 185 struct crush_map *crush; 197 186 198 - struct mutex crush_workspace_mutex; 199 - void *crush_workspace; 187 + struct workspace_manager crush_wsm; 200 188 }; 201 189 202 190 static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
+3
include/linux/crush/crush.h
··· 346 346 347 347 struct crush_work { 348 348 struct crush_work_bucket **work; /* Per-bucket working store */ 349 + #ifdef __KERNEL__ 350 + struct list_head item; 351 + #endif 349 352 }; 350 353 351 354 #ifdef __KERNEL__
+151 -15
net/ceph/osdmap.c
··· 965 965 } 966 966 967 967 /* 968 + * CRUSH workspaces 969 + * 970 + * workspace_manager framework borrowed from fs/btrfs/compression.c. 971 + * Two simplifications: there is only one type of workspace and there 972 + * is always at least one workspace. 973 + */ 974 + static struct crush_work *alloc_workspace(const struct crush_map *c) 975 + { 976 + struct crush_work *work; 977 + size_t work_size; 978 + 979 + WARN_ON(!c->working_size); 980 + work_size = crush_work_size(c, CEPH_PG_MAX_SIZE); 981 + dout("%s work_size %zu bytes\n", __func__, work_size); 982 + 983 + work = ceph_kvmalloc(work_size, GFP_NOIO); 984 + if (!work) 985 + return NULL; 986 + 987 + INIT_LIST_HEAD(&work->item); 988 + crush_init_workspace(c, work); 989 + return work; 990 + } 991 + 992 + static void free_workspace(struct crush_work *work) 993 + { 994 + WARN_ON(!list_empty(&work->item)); 995 + kvfree(work); 996 + } 997 + 998 + static void init_workspace_manager(struct workspace_manager *wsm) 999 + { 1000 + INIT_LIST_HEAD(&wsm->idle_ws); 1001 + spin_lock_init(&wsm->ws_lock); 1002 + atomic_set(&wsm->total_ws, 0); 1003 + wsm->free_ws = 0; 1004 + init_waitqueue_head(&wsm->ws_wait); 1005 + } 1006 + 1007 + static void add_initial_workspace(struct workspace_manager *wsm, 1008 + struct crush_work *work) 1009 + { 1010 + WARN_ON(!list_empty(&wsm->idle_ws)); 1011 + 1012 + list_add(&work->item, &wsm->idle_ws); 1013 + atomic_set(&wsm->total_ws, 1); 1014 + wsm->free_ws = 1; 1015 + } 1016 + 1017 + static void cleanup_workspace_manager(struct workspace_manager *wsm) 1018 + { 1019 + struct crush_work *work; 1020 + 1021 + while (!list_empty(&wsm->idle_ws)) { 1022 + work = list_first_entry(&wsm->idle_ws, struct crush_work, 1023 + item); 1024 + list_del_init(&work->item); 1025 + free_workspace(work); 1026 + } 1027 + atomic_set(&wsm->total_ws, 0); 1028 + wsm->free_ws = 0; 1029 + } 1030 + 1031 + /* 1032 + * Finds an available workspace or allocates a new one. 
If it's not 1033 + * possible to allocate a new one, waits until there is one. 1034 + */ 1035 + static struct crush_work *get_workspace(struct workspace_manager *wsm, 1036 + const struct crush_map *c) 1037 + { 1038 + struct crush_work *work; 1039 + int cpus = num_online_cpus(); 1040 + 1041 + again: 1042 + spin_lock(&wsm->ws_lock); 1043 + if (!list_empty(&wsm->idle_ws)) { 1044 + work = list_first_entry(&wsm->idle_ws, struct crush_work, 1045 + item); 1046 + list_del_init(&work->item); 1047 + wsm->free_ws--; 1048 + spin_unlock(&wsm->ws_lock); 1049 + return work; 1050 + 1051 + } 1052 + if (atomic_read(&wsm->total_ws) > cpus) { 1053 + DEFINE_WAIT(wait); 1054 + 1055 + spin_unlock(&wsm->ws_lock); 1056 + prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE); 1057 + if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws) 1058 + schedule(); 1059 + finish_wait(&wsm->ws_wait, &wait); 1060 + goto again; 1061 + } 1062 + atomic_inc(&wsm->total_ws); 1063 + spin_unlock(&wsm->ws_lock); 1064 + 1065 + work = alloc_workspace(c); 1066 + if (!work) { 1067 + atomic_dec(&wsm->total_ws); 1068 + wake_up(&wsm->ws_wait); 1069 + 1070 + /* 1071 + * Do not return the error but go back to waiting. We 1072 + * have the initial workspace and the CRUSH computation 1073 + * time is bounded so we will get it eventually. 1074 + */ 1075 + WARN_ON(atomic_read(&wsm->total_ws) < 1); 1076 + goto again; 1077 + } 1078 + return work; 1079 + } 1080 + 1081 + /* 1082 + * Puts a workspace back on the list or frees it if we have enough 1083 + * idle ones sitting around. 
1084 + */ 1085 + static void put_workspace(struct workspace_manager *wsm, 1086 + struct crush_work *work) 1087 + { 1088 + spin_lock(&wsm->ws_lock); 1089 + if (wsm->free_ws <= num_online_cpus()) { 1090 + list_add(&work->item, &wsm->idle_ws); 1091 + wsm->free_ws++; 1092 + spin_unlock(&wsm->ws_lock); 1093 + goto wake; 1094 + } 1095 + spin_unlock(&wsm->ws_lock); 1096 + 1097 + free_workspace(work); 1098 + atomic_dec(&wsm->total_ws); 1099 + wake: 1100 + if (wq_has_sleeper(&wsm->ws_wait)) 1101 + wake_up(&wsm->ws_wait); 1102 + } 1103 + 1104 + /* 968 1105 * osd map 969 1106 */ 970 1107 struct ceph_osdmap *ceph_osdmap_alloc(void) ··· 1118 981 map->primary_temp = RB_ROOT; 1119 982 map->pg_upmap = RB_ROOT; 1120 983 map->pg_upmap_items = RB_ROOT; 1121 - mutex_init(&map->crush_workspace_mutex); 984 + 985 + init_workspace_manager(&map->crush_wsm); 1122 986 1123 987 return map; 1124 988 } ··· 1127 989 void ceph_osdmap_destroy(struct ceph_osdmap *map) 1128 990 { 1129 991 dout("osdmap_destroy %p\n", map); 992 + 1130 993 if (map->crush) 1131 994 crush_destroy(map->crush); 995 + cleanup_workspace_manager(&map->crush_wsm); 996 + 1132 997 while (!RB_EMPTY_ROOT(&map->pg_temp)) { 1133 998 struct ceph_pg_mapping *pg = 1134 999 rb_entry(rb_first(&map->pg_temp), ··· 1170 1029 kvfree(map->osd_weight); 1171 1030 kvfree(map->osd_addr); 1172 1031 kvfree(map->osd_primary_affinity); 1173 - kvfree(map->crush_workspace); 1174 1032 kfree(map); 1175 1033 } 1176 1034 ··· 1244 1104 1245 1105 static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) 1246 1106 { 1247 - void *workspace; 1248 - size_t work_size; 1107 + struct crush_work *work; 1249 1108 1250 1109 if (IS_ERR(crush)) 1251 1110 return PTR_ERR(crush); 1252 1111 1253 - work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); 1254 - dout("%s work_size %zu bytes\n", __func__, work_size); 1255 - workspace = ceph_kvmalloc(work_size, GFP_NOIO); 1256 - if (!workspace) { 1112 + work = alloc_workspace(crush); 1113 + if (!work) { 1257 
1114 crush_destroy(crush); 1258 1115 return -ENOMEM; 1259 1116 } 1260 - crush_init_workspace(crush, workspace); 1261 1117 1262 1118 if (map->crush) 1263 1119 crush_destroy(map->crush); 1264 - kvfree(map->crush_workspace); 1120 + cleanup_workspace_manager(&map->crush_wsm); 1265 1121 map->crush = crush; 1266 - map->crush_workspace = workspace; 1122 + add_initial_workspace(&map->crush_wsm, work); 1267 1123 return 0; 1268 1124 } 1269 1125 ··· 2458 2322 s64 choose_args_index) 2459 2323 { 2460 2324 struct crush_choose_arg_map *arg_map; 2325 + struct crush_work *work; 2461 2326 int r; 2462 2327 2463 2328 BUG_ON(result_max > CEPH_PG_MAX_SIZE); ··· 2469 2332 arg_map = lookup_choose_arg_map(&map->crush->choose_args, 2470 2333 CEPH_DEFAULT_CHOOSE_ARGS); 2471 2334 2472 - mutex_lock(&map->crush_workspace_mutex); 2335 + work = get_workspace(&map->crush_wsm, map->crush); 2473 2336 r = crush_do_rule(map->crush, ruleno, x, result, result_max, 2474 - weight, weight_max, map->crush_workspace, 2337 + weight, weight_max, work, 2475 2338 arg_map ? arg_map->args : NULL); 2476 - mutex_unlock(&map->crush_workspace_mutex); 2477 - 2339 + put_workspace(&map->crush_wsm, work); 2478 2340 return r; 2479 2341 } 2480 2342