fs/resctrl/rdtgroup.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / fs / resctrl / rdtgroup.c
at master 114 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * User interface for Resource Allocation in Resource Director Technology(RDT)
   4 *
   5 * Copyright (C) 2016 Intel Corporation
   6 *
   7 * Author: Fenghua Yu <fenghua.yu@intel.com>
   8 *
   9 * More information about RDT can be found in the Intel (R) x86 Architecture
  10 * Software Developer Manual.
  11 */
  12
  13#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
  14
  15#include <linux/cpu.h>
  16#include <linux/debugfs.h>
  17#include <linux/fs.h>
  18#include <linux/fs_parser.h>
  19#include <linux/sysfs.h>
  20#include <linux/kernfs.h>
  21#include <linux/resctrl.h>
  22#include <linux/seq_buf.h>
  23#include <linux/seq_file.h>
  24#include <linux/sched/task.h>
  25#include <linux/slab.h>
  26#include <linux/user_namespace.h>
  27
  28#include <uapi/linux/magic.h>
  29
  30#include "internal.h"
  31
  32/* Mutex to protect rdtgroup access. */
  33DEFINE_MUTEX(rdtgroup_mutex);
  34
  35static struct kernfs_root *rdt_root;
  36
  37struct rdtgroup rdtgroup_default;
  38
  39LIST_HEAD(rdt_all_groups);
  40
  41/* list of entries for the schemata file */
  42LIST_HEAD(resctrl_schema_all);
  43
  44/*
  45 * List of struct mon_data containing private data of event files for use by
  46 * rdtgroup_mondata_show(). Protected by rdtgroup_mutex.
  47 */
  48static LIST_HEAD(mon_data_kn_priv_list);
  49
  50/* The filesystem can only be mounted once. */
  51bool resctrl_mounted;
  52
  53/* Kernel fs node for "info" directory under root */
  54static struct kernfs_node *kn_info;
  55
  56/* Kernel fs node for "mon_groups" directory under root */
  57static struct kernfs_node *kn_mongrp;
  58
  59/* Kernel fs node for "mon_data" directory under root */
  60static struct kernfs_node *kn_mondata;
  61
  62/*
  63 * Used to store the max resource name width to display the schemata names in
  64 * a tabular format.
  65 */
  66int max_name_width;
  67
  68static struct seq_buf last_cmd_status;
  69
  70static char last_cmd_status_buf[512];
  71
  72static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
  73
  74static void rdtgroup_destroy_root(void);
  75
  76struct dentry *debugfs_resctrl;
  77
  78/*
  79 * Memory bandwidth monitoring event to use for the default CTRL_MON group
  80 * and each new CTRL_MON group created by the user.  Only relevant when
  81 * the filesystem is mounted with the "mba_MBps" option so it does not
  82 * matter that it remains uninitialized on systems that do not support
  83 * the "mba_MBps" option.
  84 */
  85enum resctrl_event_id mba_mbps_default_event;
  86
  87static bool resctrl_debug;
  88
  89void rdt_last_cmd_clear(void)
  90{
  91	lockdep_assert_held(&rdtgroup_mutex);
  92	seq_buf_clear(&last_cmd_status);
  93}
  94
  95void rdt_last_cmd_puts(const char *s)
  96{
  97	lockdep_assert_held(&rdtgroup_mutex);
  98	seq_buf_puts(&last_cmd_status, s);
  99}
 100
 101void rdt_last_cmd_printf(const char *fmt, ...)
 102{
 103	va_list ap;
 104
 105	va_start(ap, fmt);
 106	lockdep_assert_held(&rdtgroup_mutex);
 107	seq_buf_vprintf(&last_cmd_status, fmt, ap);
 108	va_end(ap);
 109}
 110
 111void rdt_staged_configs_clear(void)
 112{
 113	struct rdt_ctrl_domain *dom;
 114	struct rdt_resource *r;
 115
 116	lockdep_assert_held(&rdtgroup_mutex);
 117
 118	for_each_alloc_capable_rdt_resource(r) {
 119		list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
 120			memset(dom->staged_config, 0, sizeof(dom->staged_config));
 121	}
 122}
 123
 124static bool resctrl_is_mbm_enabled(void)
 125{
 126	return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) ||
 127		resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID));
 128}
 129
 130/*
 131 * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap
 132 * of free CLOSIDs.
 133 *
 134 * Using a global CLOSID across all resources has some advantages and
 135 * some drawbacks:
 136 * + We can simply set current's closid to assign a task to a resource
 137 *   group.
 138 * + Context switch code can avoid extra memory references deciding which
 139 *   CLOSID to load into the PQR_ASSOC MSR
 140 * - We give up some options in configuring resource groups across multi-socket
 141 *   systems.
 142 * - Our choices on how to configure each resource become progressively more
 143 *   limited as the number of resources grows.
 144 */
 145static unsigned long *closid_free_map;
 146
 147static int closid_free_map_len;
 148
 149int closids_supported(void)
 150{
 151	return closid_free_map_len;
 152}
 153
 154static int closid_init(void)
 155{
 156	struct resctrl_schema *s;
 157	u32 rdt_min_closid = ~0;
 158
 159	/* Monitor only platforms still call closid_init() */
 160	if (list_empty(&resctrl_schema_all))
 161		return 0;
 162
 163	/* Compute rdt_min_closid across all resources */
 164	list_for_each_entry(s, &resctrl_schema_all, list)
 165		rdt_min_closid = min(rdt_min_closid, s->num_closid);
 166
 167	closid_free_map = bitmap_alloc(rdt_min_closid, GFP_KERNEL);
 168	if (!closid_free_map)
 169		return -ENOMEM;
 170	bitmap_fill(closid_free_map, rdt_min_closid);
 171
 172	/* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
 173	__clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map);
 174	closid_free_map_len = rdt_min_closid;
 175
 176	return 0;
 177}
 178
 179static void closid_exit(void)
 180{
 181	bitmap_free(closid_free_map);
 182	closid_free_map = NULL;
 183}
 184
 185static int closid_alloc(void)
 186{
 187	int cleanest_closid;
 188	u32 closid;
 189
 190	lockdep_assert_held(&rdtgroup_mutex);
 191
 192	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
 193	    resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) {
 194		cleanest_closid = resctrl_find_cleanest_closid();
 195		if (cleanest_closid < 0)
 196			return cleanest_closid;
 197		closid = cleanest_closid;
 198	} else {
 199		closid = find_first_bit(closid_free_map, closid_free_map_len);
 200		if (closid == closid_free_map_len)
 201			return -ENOSPC;
 202	}
 203	__clear_bit(closid, closid_free_map);
 204
 205	return closid;
 206}
 207
 208void closid_free(int closid)
 209{
 210	lockdep_assert_held(&rdtgroup_mutex);
 211
 212	__set_bit(closid, closid_free_map);
 213}
 214
 215/**
 216 * closid_allocated - test if provided closid is in use
 217 * @closid: closid to be tested
 218 *
 219 * Return: true if @closid is currently associated with a resource group,
 220 * false if @closid is free
 221 */
 222bool closid_allocated(unsigned int closid)
 223{
 224	lockdep_assert_held(&rdtgroup_mutex);
 225
 226	return !test_bit(closid, closid_free_map);
 227}
 228
 229bool closid_alloc_fixed(u32 closid)
 230{
 231	return __test_and_clear_bit(closid, closid_free_map);
 232}
 233
 234/**
 235 * rdtgroup_mode_by_closid - Return mode of resource group with closid
 236 * @closid: closid if the resource group
 237 *
 238 * Each resource group is associated with a @closid. Here the mode
 239 * of a resource group can be queried by searching for it using its closid.
 240 *
 241 * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
 242 */
 243enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
 244{
 245	struct rdtgroup *rdtgrp;
 246
 247	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
 248		if (rdtgrp->closid == closid)
 249			return rdtgrp->mode;
 250	}
 251
 252	return RDT_NUM_MODES;
 253}
 254
 255static const char * const rdt_mode_str[] = {
 256	[RDT_MODE_SHAREABLE]		= "shareable",
 257	[RDT_MODE_EXCLUSIVE]		= "exclusive",
 258	[RDT_MODE_PSEUDO_LOCKSETUP]	= "pseudo-locksetup",
 259	[RDT_MODE_PSEUDO_LOCKED]	= "pseudo-locked",
 260};
 261
 262/**
 263 * rdtgroup_mode_str - Return the string representation of mode
 264 * @mode: the resource group mode as &enum rdtgroup_mode
 265 *
 266 * Return: string representation of valid mode, "unknown" otherwise
 267 */
 268static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
 269{
 270	if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
 271		return "unknown";
 272
 273	return rdt_mode_str[mode];
 274}
 275
 276/* set uid and gid of rdtgroup dirs and files to that of the creator */
 277static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
 278{
 279	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
 280				.ia_uid = current_fsuid(),
 281				.ia_gid = current_fsgid(), };
 282
 283	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
 284	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
 285		return 0;
 286
 287	return kernfs_setattr(kn, &iattr);
 288}
 289
 290static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
 291{
 292	struct kernfs_node *kn;
 293	int ret;
 294
 295	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
 296				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
 297				  0, rft->kf_ops, rft, NULL, NULL);
 298	if (IS_ERR(kn))
 299		return PTR_ERR(kn);
 300
 301	ret = rdtgroup_kn_set_ugid(kn);
 302	if (ret) {
 303		kernfs_remove(kn);
 304		return ret;
 305	}
 306
 307	return 0;
 308}
 309
 310static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
 311{
 312	struct kernfs_open_file *of = m->private;
 313	struct rftype *rft = of->kn->priv;
 314
 315	if (rft->seq_show)
 316		return rft->seq_show(of, m, arg);
 317	return 0;
 318}
 319
 320static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
 321				   size_t nbytes, loff_t off)
 322{
 323	struct rftype *rft = of->kn->priv;
 324
 325	if (rft->write)
 326		return rft->write(of, buf, nbytes, off);
 327
 328	return -EINVAL;
 329}
 330
 331static const struct kernfs_ops rdtgroup_kf_single_ops = {
 332	.atomic_write_len	= PAGE_SIZE,
 333	.write			= rdtgroup_file_write,
 334	.seq_show		= rdtgroup_seqfile_show,
 335};
 336
 337static const struct kernfs_ops kf_mondata_ops = {
 338	.atomic_write_len	= PAGE_SIZE,
 339	.seq_show		= rdtgroup_mondata_show,
 340};
 341
 342static bool is_cpu_list(struct kernfs_open_file *of)
 343{
 344	struct rftype *rft = of->kn->priv;
 345
 346	return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
 347}
 348
 349static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 350			      struct seq_file *s, void *v)
 351{
 352	struct rdtgroup *rdtgrp;
 353	struct cpumask *mask;
 354	int ret = 0;
 355
 356	rdtgrp = rdtgroup_kn_lock_live(of->kn);
 357
 358	if (rdtgrp) {
 359		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
 360			if (!rdtgrp->plr->d) {
 361				rdt_last_cmd_clear();
 362				rdt_last_cmd_puts("Cache domain offline\n");
 363				ret = -ENODEV;
 364			} else {
 365				mask = &rdtgrp->plr->d->hdr.cpu_mask;
 366				seq_printf(s, is_cpu_list(of) ?
 367					   "%*pbl\n" : "%*pb\n",
 368					   cpumask_pr_args(mask));
 369			}
 370		} else {
 371			seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
 372				   cpumask_pr_args(&rdtgrp->cpu_mask));
 373		}
 374	} else {
 375		ret = -ENOENT;
 376	}
 377	rdtgroup_kn_unlock(of->kn);
 378
 379	return ret;
 380}
 381
 382/*
 383 * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
 384 *
 385 * Per task closids/rmids must have been set up before calling this function.
 386 * @r may be NULL.
 387 */
 388static void
 389update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
 390{
 391	struct resctrl_cpu_defaults defaults, *p = NULL;
 392
 393	if (r) {
 394		defaults.closid = r->closid;
 395		defaults.rmid = r->mon.rmid;
 396		p = &defaults;
 397	}
 398
 399	on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
 400}
 401
 402static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 403			  cpumask_var_t tmpmask)
 404{
 405	struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
 406	struct list_head *head;
 407
 408	/* Check whether cpus belong to parent ctrl group */
 409	cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
 410	if (!cpumask_empty(tmpmask)) {
 411		rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
 412		return -EINVAL;
 413	}
 414
 415	/* Check whether cpus are dropped from this group */
 416	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 417	if (!cpumask_empty(tmpmask)) {
 418		/* Give any dropped cpus to parent rdtgroup */
 419		cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
 420		update_closid_rmid(tmpmask, prgrp);
 421	}
 422
 423	/*
 424	 * If we added cpus, remove them from previous group that owned them
 425	 * and update per-cpu rmid
 426	 */
 427	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 428	if (!cpumask_empty(tmpmask)) {
 429		head = &prgrp->mon.crdtgrp_list;
 430		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 431			if (crgrp == rdtgrp)
 432				continue;
 433			cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
 434				       tmpmask);
 435		}
 436		update_closid_rmid(tmpmask, rdtgrp);
 437	}
 438
 439	/* Done pushing/pulling - update this group with new mask */
 440	cpumask_copy(&rdtgrp->cpu_mask, newmask);
 441
 442	return 0;
 443}
 444
 445static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
 446{
 447	struct rdtgroup *crgrp;
 448
 449	cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
 450	/* update the child mon group masks as well*/
 451	list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
 452		cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
 453}
 454
 455static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
 456			   cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
 457{
 458	struct rdtgroup *r, *crgrp;
 459	struct list_head *head;
 460
 461	/* Check whether cpus are dropped from this group */
 462	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
 463	if (!cpumask_empty(tmpmask)) {
 464		/* Can't drop from default group */
 465		if (rdtgrp == &rdtgroup_default) {
 466			rdt_last_cmd_puts("Can't drop CPUs from default group\n");
 467			return -EINVAL;
 468		}
 469
 470		/* Give any dropped cpus to rdtgroup_default */
 471		cpumask_or(&rdtgroup_default.cpu_mask,
 472			   &rdtgroup_default.cpu_mask, tmpmask);
 473		update_closid_rmid(tmpmask, &rdtgroup_default);
 474	}
 475
 476	/*
 477	 * If we added cpus, remove them from previous group and
 478	 * the prev group's child groups that owned them
 479	 * and update per-cpu closid/rmid.
 480	 */
 481	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
 482	if (!cpumask_empty(tmpmask)) {
 483		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
 484			if (r == rdtgrp)
 485				continue;
 486			cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
 487			if (!cpumask_empty(tmpmask1))
 488				cpumask_rdtgrp_clear(r, tmpmask1);
 489		}
 490		update_closid_rmid(tmpmask, rdtgrp);
 491	}
 492
 493	/* Done pushing/pulling - update this group with new mask */
 494	cpumask_copy(&rdtgrp->cpu_mask, newmask);
 495
 496	/*
 497	 * Clear child mon group masks since there is a new parent mask
 498	 * now and update the rmid for the cpus the child lost.
 499	 */
 500	head = &rdtgrp->mon.crdtgrp_list;
 501	list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
 502		cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
 503		update_closid_rmid(tmpmask, rdtgrp);
 504		cpumask_clear(&crgrp->cpu_mask);
 505	}
 506
 507	return 0;
 508}
 509
 510static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 511				   char *buf, size_t nbytes, loff_t off)
 512{
 513	cpumask_var_t tmpmask, newmask, tmpmask1;
 514	struct rdtgroup *rdtgrp;
 515	int ret;
 516
 517	if (!buf)
 518		return -EINVAL;
 519
 520	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
 521		return -ENOMEM;
 522	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
 523		free_cpumask_var(tmpmask);
 524		return -ENOMEM;
 525	}
 526	if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
 527		free_cpumask_var(tmpmask);
 528		free_cpumask_var(newmask);
 529		return -ENOMEM;
 530	}
 531
 532	rdtgrp = rdtgroup_kn_lock_live(of->kn);
 533	if (!rdtgrp) {
 534		ret = -ENOENT;
 535		goto unlock;
 536	}
 537
 538	rdt_last_cmd_clear();
 539
 540	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 541	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 542		ret = -EINVAL;
 543		rdt_last_cmd_puts("Pseudo-locking in progress\n");
 544		goto unlock;
 545	}
 546
 547	if (is_cpu_list(of))
 548		ret = cpulist_parse(buf, newmask);
 549	else
 550		ret = cpumask_parse(buf, newmask);
 551
 552	if (ret) {
 553		rdt_last_cmd_puts("Bad CPU list/mask\n");
 554		goto unlock;
 555	}
 556
 557	/* check that user didn't specify any offline cpus */
 558	cpumask_andnot(tmpmask, newmask, cpu_online_mask);
 559	if (!cpumask_empty(tmpmask)) {
 560		ret = -EINVAL;
 561		rdt_last_cmd_puts("Can only assign online CPUs\n");
 562		goto unlock;
 563	}
 564
 565	if (rdtgrp->type == RDTCTRL_GROUP)
 566		ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
 567	else if (rdtgrp->type == RDTMON_GROUP)
 568		ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
 569	else
 570		ret = -EINVAL;
 571
 572unlock:
 573	rdtgroup_kn_unlock(of->kn);
 574	free_cpumask_var(tmpmask);
 575	free_cpumask_var(newmask);
 576	free_cpumask_var(tmpmask1);
 577
 578	return ret ?: nbytes;
 579}
 580
 581/**
 582 * rdtgroup_remove - the helper to remove resource group safely
 583 * @rdtgrp: resource group to remove
 584 *
 585 * On resource group creation via a mkdir, an extra kernfs_node reference is
 586 * taken to ensure that the rdtgroup structure remains accessible for the
 587 * rdtgroup_kn_unlock() calls where it is removed.
 588 *
 589 * Drop the extra reference here, then free the rdtgroup structure.
 590 *
 591 * Return: void
 592 */
 593static void rdtgroup_remove(struct rdtgroup *rdtgrp)
 594{
 595	kernfs_put(rdtgrp->kn);
 596	kfree(rdtgrp);
 597}
 598
 599static void _update_task_closid_rmid(void *task)
 600{
 601	/*
 602	 * If the task is still current on this CPU, update PQR_ASSOC MSR.
 603	 * Otherwise, the MSR is updated when the task is scheduled in.
 604	 */
 605	if (task == current)
 606		resctrl_arch_sched_in(task);
 607}
 608
 609static void update_task_closid_rmid(struct task_struct *t)
 610{
 611	if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
 612		smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
 613	else
 614		_update_task_closid_rmid(t);
 615}
 616
 617static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
 618{
 619	u32 closid, rmid = rdtgrp->mon.rmid;
 620
 621	if (rdtgrp->type == RDTCTRL_GROUP)
 622		closid = rdtgrp->closid;
 623	else if (rdtgrp->type == RDTMON_GROUP)
 624		closid = rdtgrp->mon.parent->closid;
 625	else
 626		return false;
 627
 628	return resctrl_arch_match_closid(tsk, closid) &&
 629	       resctrl_arch_match_rmid(tsk, closid, rmid);
 630}
 631
 632static int __rdtgroup_move_task(struct task_struct *tsk,
 633				struct rdtgroup *rdtgrp)
 634{
 635	/* If the task is already in rdtgrp, no need to move the task. */
 636	if (task_in_rdtgroup(tsk, rdtgrp))
 637		return 0;
 638
 639	/*
 640	 * Set the task's closid/rmid before the PQR_ASSOC MSR can be
 641	 * updated by them.
 642	 *
 643	 * For ctrl_mon groups, move both closid and rmid.
 644	 * For monitor groups, can move the tasks only from
 645	 * their parent CTRL group.
 646	 */
 647	if (rdtgrp->type == RDTMON_GROUP &&
 648	    !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
 649		rdt_last_cmd_puts("Can't move task to different control group\n");
 650		return -EINVAL;
 651	}
 652
 653	if (rdtgrp->type == RDTMON_GROUP)
 654		resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
 655					     rdtgrp->mon.rmid);
 656	else
 657		resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
 658					     rdtgrp->mon.rmid);
 659
 660	/*
 661	 * Ensure the task's closid and rmid are written before determining if
 662	 * the task is current that will decide if it will be interrupted.
 663	 * This pairs with the full barrier between the rq->curr update and
 664	 * resctrl_arch_sched_in() during context switch.
 665	 */
 666	smp_mb();
 667
 668	/*
 669	 * By now, the task's closid and rmid are set. If the task is current
 670	 * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
 671	 * group go into effect. If the task is not current, the MSR will be
 672	 * updated when the task is scheduled in.
 673	 */
 674	update_task_closid_rmid(tsk);
 675
 676	return 0;
 677}
 678
 679static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
 680{
 681	return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
 682		resctrl_arch_match_closid(t, r->closid));
 683}
 684
 685static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
 686{
 687	return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
 688		resctrl_arch_match_rmid(t, r->mon.parent->closid,
 689					r->mon.rmid));
 690}
 691
 692/**
 693 * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
 694 * @r: Resource group
 695 *
 696 * Return: 1 if tasks have been assigned to @r, 0 otherwise
 697 */
 698int rdtgroup_tasks_assigned(struct rdtgroup *r)
 699{
 700	struct task_struct *p, *t;
 701	int ret = 0;
 702
 703	lockdep_assert_held(&rdtgroup_mutex);
 704
 705	rcu_read_lock();
 706	for_each_process_thread(p, t) {
 707		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
 708			ret = 1;
 709			break;
 710		}
 711	}
 712	rcu_read_unlock();
 713
 714	return ret;
 715}
 716
 717static int rdtgroup_task_write_permission(struct task_struct *task,
 718					  struct kernfs_open_file *of)
 719{
 720	const struct cred *tcred = get_task_cred(task);
 721	const struct cred *cred = current_cred();
 722	int ret = 0;
 723
 724	/*
 725	 * Even if we're attaching all tasks in the thread group, we only
 726	 * need to check permissions on one of them.
 727	 */
 728	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 729	    !uid_eq(cred->euid, tcred->uid) &&
 730	    !uid_eq(cred->euid, tcred->suid)) {
 731		rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
 732		ret = -EPERM;
 733	}
 734
 735	put_cred(tcred);
 736	return ret;
 737}
 738
 739static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
 740			      struct kernfs_open_file *of)
 741{
 742	struct task_struct *tsk;
 743	int ret;
 744
 745	rcu_read_lock();
 746	if (pid) {
 747		tsk = find_task_by_vpid(pid);
 748		if (!tsk) {
 749			rcu_read_unlock();
 750			rdt_last_cmd_printf("No task %d\n", pid);
 751			return -ESRCH;
 752		}
 753	} else {
 754		tsk = current;
 755	}
 756
 757	get_task_struct(tsk);
 758	rcu_read_unlock();
 759
 760	ret = rdtgroup_task_write_permission(tsk, of);
 761	if (!ret)
 762		ret = __rdtgroup_move_task(tsk, rdtgrp);
 763
 764	put_task_struct(tsk);
 765	return ret;
 766}
 767
 768static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
 769				    char *buf, size_t nbytes, loff_t off)
 770{
 771	struct rdtgroup *rdtgrp;
 772	char *pid_str;
 773	int ret = 0;
 774	pid_t pid;
 775
 776	rdtgrp = rdtgroup_kn_lock_live(of->kn);
 777	if (!rdtgrp) {
 778		rdtgroup_kn_unlock(of->kn);
 779		return -ENOENT;
 780	}
 781	rdt_last_cmd_clear();
 782
 783	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
 784	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 785		ret = -EINVAL;
 786		rdt_last_cmd_puts("Pseudo-locking in progress\n");
 787		goto unlock;
 788	}
 789
 790	while (buf && buf[0] != '\0' && buf[0] != '\n') {
 791		pid_str = strim(strsep(&buf, ","));
 792
 793		if (kstrtoint(pid_str, 0, &pid)) {
 794			rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
 795			ret = -EINVAL;
 796			break;
 797		}
 798
 799		if (pid < 0) {
 800			rdt_last_cmd_printf("Invalid pid %d\n", pid);
 801			ret = -EINVAL;
 802			break;
 803		}
 804
 805		ret = rdtgroup_move_task(pid, rdtgrp, of);
 806		if (ret) {
 807			rdt_last_cmd_printf("Error while processing task %d\n", pid);
 808			break;
 809		}
 810	}
 811
 812unlock:
 813	rdtgroup_kn_unlock(of->kn);
 814
 815	return ret ?: nbytes;
 816}
 817
 818static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
 819{
 820	struct task_struct *p, *t;
 821	pid_t pid;
 822
 823	rcu_read_lock();
 824	for_each_process_thread(p, t) {
 825		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
 826			pid = task_pid_vnr(t);
 827			if (pid)
 828				seq_printf(s, "%d\n", pid);
 829		}
 830	}
 831	rcu_read_unlock();
 832}
 833
 834static int rdtgroup_tasks_show(struct kernfs_open_file *of,
 835			       struct seq_file *s, void *v)
 836{
 837	struct rdtgroup *rdtgrp;
 838	int ret = 0;
 839
 840	rdtgrp = rdtgroup_kn_lock_live(of->kn);
 841	if (rdtgrp)
 842		show_rdt_tasks(rdtgrp, s);
 843	else
 844		ret = -ENOENT;
 845	rdtgroup_kn_unlock(of->kn);
 846
 847	return ret;
 848}
 849
 850static int rdtgroup_closid_show(struct kernfs_open_file *of,
 851				struct seq_file *s, void *v)
 852{
 853	struct rdtgroup *rdtgrp;
 854	int ret = 0;
 855
 856	rdtgrp = rdtgroup_kn_lock_live(of->kn);
 857	if (rdtgrp)
 858		seq_printf(s, "%u\n", rdtgrp->closid);
 859	else
 860		ret = -ENOENT;
 861	rdtgroup_kn_unlock(of->kn);
 862
 863	return ret;
 864}
 865
 866static int rdtgroup_rmid_show(struct kernfs_open_file *of,
 867			      struct seq_file *s, void *v)
 868{
 869	struct rdtgroup *rdtgrp;
 870	int ret = 0;
 871
 872	rdtgrp = rdtgroup_kn_lock_live(of->kn);
 873	if (rdtgrp)
 874		seq_printf(s, "%u\n", rdtgrp->mon.rmid);
 875	else
 876		ret = -ENOENT;
 877	rdtgroup_kn_unlock(of->kn);
 878
 879	return ret;
 880}
 881
 882#ifdef CONFIG_PROC_CPU_RESCTRL
 883/*
 884 * A task can only be part of one resctrl control group and of one monitor
 885 * group which is associated to that control group.
 886 *
 887 * 1)   res:
 888 *      mon:
 889 *
 890 *    resctrl is not available.
 891 *
 892 * 2)   res:/
 893 *      mon:
 894 *
 895 *    Task is part of the root resctrl control group, and it is not associated
 896 *    to any monitor group.
 897 *
 898 * 3)  res:/
 899 *     mon:mon0
 900 *
 901 *    Task is part of the root resctrl control group and monitor group mon0.
 902 *
 903 * 4)  res:group0
 904 *     mon:
 905 *
 906 *    Task is part of resctrl control group group0, and it is not associated
 907 *    to any monitor group.
 908 *
 909 * 5) res:group0
 910 *    mon:mon1
 911 *
 912 *    Task is part of resctrl control group group0 and monitor group mon1.
 913 */
 914int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
 915		      struct pid *pid, struct task_struct *tsk)
 916{
 917	struct rdtgroup *rdtg;
 918	int ret = 0;
 919
 920	mutex_lock(&rdtgroup_mutex);
 921
 922	/* Return empty if resctrl has not been mounted. */
 923	if (!resctrl_mounted) {
 924		seq_puts(s, "res:\nmon:\n");
 925		goto unlock;
 926	}
 927
 928	list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
 929		struct rdtgroup *crg;
 930
 931		/*
 932		 * Task information is only relevant for shareable
 933		 * and exclusive groups.
 934		 */
 935		if (rdtg->mode != RDT_MODE_SHAREABLE &&
 936		    rdtg->mode != RDT_MODE_EXCLUSIVE)
 937			continue;
 938
 939		if (!resctrl_arch_match_closid(tsk, rdtg->closid))
 940			continue;
 941
 942		seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
 943			   rdt_kn_name(rdtg->kn));
 944		seq_puts(s, "mon:");
 945		list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
 946				    mon.crdtgrp_list) {
 947			if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
 948						     crg->mon.rmid))
 949				continue;
 950			seq_printf(s, "%s", rdt_kn_name(crg->kn));
 951			break;
 952		}
 953		seq_putc(s, '\n');
 954		goto unlock;
 955	}
 956	/*
 957	 * The above search should succeed. Otherwise return
 958	 * with an error.
 959	 */
 960	ret = -ENOENT;
 961unlock:
 962	mutex_unlock(&rdtgroup_mutex);
 963
 964	return ret;
 965}
 966#endif
 967
 968static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
 969				    struct seq_file *seq, void *v)
 970{
 971	int len;
 972
 973	mutex_lock(&rdtgroup_mutex);
 974	len = seq_buf_used(&last_cmd_status);
 975	if (len)
 976		seq_printf(seq, "%.*s", len, last_cmd_status_buf);
 977	else
 978		seq_puts(seq, "ok\n");
 979	mutex_unlock(&rdtgroup_mutex);
 980	return 0;
 981}
 982
 983void *rdt_kn_parent_priv(struct kernfs_node *kn)
 984{
 985	/*
 986	 * The parent pointer is only valid within RCU section since it can be
 987	 * replaced.
 988	 */
 989	guard(rcu)();
 990	return rcu_dereference(kn->__parent)->priv;
 991}
 992
 993static int rdt_num_closids_show(struct kernfs_open_file *of,
 994				struct seq_file *seq, void *v)
 995{
 996	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
 997
 998	seq_printf(seq, "%u\n", s->num_closid);
 999	return 0;
1000}
1001
1002static int rdt_default_ctrl_show(struct kernfs_open_file *of,
1003				 struct seq_file *seq, void *v)
1004{
1005	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1006	struct rdt_resource *r = s->res;
1007
1008	seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r));
1009	return 0;
1010}
1011
1012static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
1013				 struct seq_file *seq, void *v)
1014{
1015	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1016	struct rdt_resource *r = s->res;
1017
1018	seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
1019	return 0;
1020}
1021
1022static int rdt_shareable_bits_show(struct kernfs_open_file *of,
1023				   struct seq_file *seq, void *v)
1024{
1025	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1026	struct rdt_resource *r = s->res;
1027
1028	seq_printf(seq, "%x\n", r->cache.shareable_bits);
1029	return 0;
1030}
1031
1032/*
1033 * rdt_bit_usage_show - Display current usage of resources
1034 *
1035 * A domain is a shared resource that can now be allocated differently. Here
1036 * we display the current regions of the domain as an annotated bitmask.
1037 * For each domain of this resource its allocation bitmask
1038 * is annotated as below to indicate the current usage of the corresponding bit:
1039 *   0 - currently unused
1040 *   X - currently available for sharing and used by software and hardware
1041 *   H - currently used by hardware only but available for software use
1042 *   S - currently used and shareable by software only
1043 *   E - currently used exclusively by one resource group
1044 *   P - currently pseudo-locked by one resource group
1045 */
1046static int rdt_bit_usage_show(struct kernfs_open_file *of,
1047			      struct seq_file *seq, void *v)
1048{
1049	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1050	/*
1051	 * Use unsigned long even though only 32 bits are used to ensure
1052	 * test_bit() is used safely.
1053	 */
1054	unsigned long sw_shareable = 0, hw_shareable = 0;
1055	unsigned long exclusive = 0, pseudo_locked = 0;
1056	struct rdt_resource *r = s->res;
1057	struct rdt_ctrl_domain *dom;
1058	int i, hwb, swb, excl, psl;
1059	enum rdtgrp_mode mode;
1060	bool sep = false;
1061	u32 ctrl_val;
1062
1063	cpus_read_lock();
1064	mutex_lock(&rdtgroup_mutex);
1065	list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
1066		if (sep)
1067			seq_putc(seq, ';');
1068		hw_shareable = r->cache.shareable_bits;
1069		sw_shareable = 0;
1070		exclusive = 0;
1071		seq_printf(seq, "%d=", dom->hdr.id);
1072		for (i = 0; i < closids_supported(); i++) {
1073			if (!closid_allocated(i) ||
1074			    (resctrl_arch_get_io_alloc_enabled(r) &&
1075			     i == resctrl_io_alloc_closid(r)))
1076				continue;
1077			ctrl_val = resctrl_arch_get_config(r, dom, i,
1078							   s->conf_type);
1079			mode = rdtgroup_mode_by_closid(i);
1080			switch (mode) {
1081			case RDT_MODE_SHAREABLE:
1082				sw_shareable |= ctrl_val;
1083				break;
1084			case RDT_MODE_EXCLUSIVE:
1085				exclusive |= ctrl_val;
1086				break;
1087			case RDT_MODE_PSEUDO_LOCKSETUP:
1088			/*
1089			 * RDT_MODE_PSEUDO_LOCKSETUP is possible
1090			 * here but not included since the CBM
1091			 * associated with this CLOSID in this mode
1092			 * is not initialized and no task or cpu can be
1093			 * assigned this CLOSID.
1094			 */
1095				break;
1096			case RDT_MODE_PSEUDO_LOCKED:
1097			case RDT_NUM_MODES:
1098				WARN(1,
1099				     "invalid mode for closid %d\n", i);
1100				break;
1101			}
1102		}
1103
1104		/*
1105		 * When the "io_alloc" feature is enabled, a portion of the cache
1106		 * is configured for shared use between hardware and software.
1107		 * Also, when CDP is enabled the CBMs of CDP_CODE and CDP_DATA
1108		 * resources are kept in sync. So, the CBMs for "io_alloc" can
1109		 * be accessed through either resource.
1110		 */
1111		if (resctrl_arch_get_io_alloc_enabled(r)) {
1112			ctrl_val = resctrl_arch_get_config(r, dom,
1113							   resctrl_io_alloc_closid(r),
1114							   s->conf_type);
1115			hw_shareable |= ctrl_val;
1116		}
1117
1118		for (i = r->cache.cbm_len - 1; i >= 0; i--) {
1119			pseudo_locked = dom->plr ? dom->plr->cbm : 0;
1120			hwb = test_bit(i, &hw_shareable);
1121			swb = test_bit(i, &sw_shareable);
1122			excl = test_bit(i, &exclusive);
1123			psl = test_bit(i, &pseudo_locked);
1124			if (hwb && swb)
1125				seq_putc(seq, 'X');
1126			else if (hwb && !swb)
1127				seq_putc(seq, 'H');
1128			else if (!hwb && swb)
1129				seq_putc(seq, 'S');
1130			else if (excl)
1131				seq_putc(seq, 'E');
1132			else if (psl)
1133				seq_putc(seq, 'P');
1134			else /* Unused bits remain */
1135				seq_putc(seq, '0');
1136		}
1137		sep = true;
1138	}
1139	seq_putc(seq, '\n');
1140	mutex_unlock(&rdtgroup_mutex);
1141	cpus_read_unlock();
1142	return 0;
1143}
1144
1145static int rdt_min_bw_show(struct kernfs_open_file *of,
1146			   struct seq_file *seq, void *v)
1147{
1148	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1149	struct rdt_resource *r = s->res;
1150
1151	seq_printf(seq, "%u\n", r->membw.min_bw);
1152	return 0;
1153}
1154
1155static int rdt_num_rmids_show(struct kernfs_open_file *of,
1156			      struct seq_file *seq, void *v)
1157{
1158	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1159
1160	seq_printf(seq, "%d\n", r->mon.num_rmid);
1161
1162	return 0;
1163}
1164
1165static int rdt_mon_features_show(struct kernfs_open_file *of,
1166				 struct seq_file *seq, void *v)
1167{
1168	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1169	struct mon_evt *mevt;
1170
1171	for_each_mon_event(mevt) {
1172		if (mevt->rid != r->rid || !mevt->enabled)
1173			continue;
1174		seq_printf(seq, "%s\n", mevt->name);
1175		if (mevt->configurable &&
1176		    !resctrl_arch_mbm_cntr_assign_enabled(r))
1177			seq_printf(seq, "%s_config\n", mevt->name);
1178	}
1179
1180	return 0;
1181}
1182
1183static int rdt_bw_gran_show(struct kernfs_open_file *of,
1184			    struct seq_file *seq, void *v)
1185{
1186	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1187	struct rdt_resource *r = s->res;
1188
1189	seq_printf(seq, "%u\n", r->membw.bw_gran);
1190	return 0;
1191}
1192
1193static int rdt_delay_linear_show(struct kernfs_open_file *of,
1194				 struct seq_file *seq, void *v)
1195{
1196	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1197	struct rdt_resource *r = s->res;
1198
1199	seq_printf(seq, "%u\n", r->membw.delay_linear);
1200	return 0;
1201}
1202
1203static int max_threshold_occ_show(struct kernfs_open_file *of,
1204				  struct seq_file *seq, void *v)
1205{
1206	seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
1207
1208	return 0;
1209}
1210
1211static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
1212					 struct seq_file *seq, void *v)
1213{
1214	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1215	struct rdt_resource *r = s->res;
1216
1217	switch (r->membw.throttle_mode) {
1218	case THREAD_THROTTLE_PER_THREAD:
1219		seq_puts(seq, "per-thread\n");
1220		return 0;
1221	case THREAD_THROTTLE_MAX:
1222		seq_puts(seq, "max\n");
1223		return 0;
1224	case THREAD_THROTTLE_UNDEFINED:
1225		seq_puts(seq, "undefined\n");
1226		return 0;
1227	}
1228
1229	WARN_ON_ONCE(1);
1230
1231	return 0;
1232}
1233
1234static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
1235				       char *buf, size_t nbytes, loff_t off)
1236{
1237	unsigned int bytes;
1238	int ret;
1239
1240	ret = kstrtouint(buf, 0, &bytes);
1241	if (ret)
1242		return ret;
1243
1244	if (bytes > resctrl_rmid_realloc_limit)
1245		return -EINVAL;
1246
1247	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
1248
1249	return nbytes;
1250}
1251
1252/*
1253 * rdtgroup_mode_show - Display mode of this resource group
1254 */
1255static int rdtgroup_mode_show(struct kernfs_open_file *of,
1256			      struct seq_file *s, void *v)
1257{
1258	struct rdtgroup *rdtgrp;
1259
1260	rdtgrp = rdtgroup_kn_lock_live(of->kn);
1261	if (!rdtgrp) {
1262		rdtgroup_kn_unlock(of->kn);
1263		return -ENOENT;
1264	}
1265
1266	seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
1267
1268	rdtgroup_kn_unlock(of->kn);
1269	return 0;
1270}
1271
1272enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
1273{
1274	switch (my_type) {
1275	case CDP_CODE:
1276		return CDP_DATA;
1277	case CDP_DATA:
1278		return CDP_CODE;
1279	default:
1280	case CDP_NONE:
1281		return CDP_NONE;
1282	}
1283}
1284
1285static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
1286					struct seq_file *seq, void *v)
1287{
1288	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1289	struct rdt_resource *r = s->res;
1290
1291	seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
1292
1293	return 0;
1294}
1295
1296/**
1297 * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
1298 * @r: Resource to which domain instance @d belongs.
1299 * @d: The domain instance for which @closid is being tested.
1300 * @cbm: Capacity bitmask being tested.
1301 * @closid: Intended closid for @cbm.
1302 * @type: CDP type of @r.
1303 * @exclusive: Only check if overlaps with exclusive resource groups
1304 *
1305 * Checks if provided @cbm intended to be used for @closid on domain
1306 * @d overlaps with any other closids or other hardware usage associated
1307 * with this domain. If @exclusive is true then only overlaps with
1308 * resource groups in exclusive mode will be considered. If @exclusive
1309 * is false then overlaps with any resource group or hardware entities
1310 * will be considered.
1311 *
1312 * @cbm is unsigned long, even if only 32 bits are used, to make the
1313 * bitmap functions work correctly.
1314 *
1315 * Return: false if CBM does not overlap, true if it does.
1316 */
1317static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
1318				    unsigned long cbm, int closid,
1319				    enum resctrl_conf_type type, bool exclusive)
1320{
1321	enum rdtgrp_mode mode;
1322	unsigned long ctrl_b;
1323	int i;
1324
1325	/* Check for any overlap with regions used by hardware directly */
1326	if (!exclusive) {
1327		ctrl_b = r->cache.shareable_bits;
1328		if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
1329			return true;
1330	}
1331
1332	/* Check for overlap with other resource groups */
1333	for (i = 0; i < closids_supported(); i++) {
1334		ctrl_b = resctrl_arch_get_config(r, d, i, type);
1335		mode = rdtgroup_mode_by_closid(i);
1336		if (closid_allocated(i) && i != closid &&
1337		    mode != RDT_MODE_PSEUDO_LOCKSETUP) {
1338			if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
1339				if (exclusive) {
1340					if (mode == RDT_MODE_EXCLUSIVE)
1341						return true;
1342					continue;
1343				}
1344				return true;
1345			}
1346		}
1347	}
1348
1349	return false;
1350}
1351
1352/**
1353 * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
1354 * @s: Schema for the resource to which domain instance @d belongs.
1355 * @d: The domain instance for which @closid is being tested.
1356 * @cbm: Capacity bitmask being tested.
1357 * @closid: Intended closid for @cbm.
1358 * @exclusive: Only check if overlaps with exclusive resource groups
1359 *
1360 * Resources that can be allocated using a CBM can use the CBM to control
1361 * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test
1362 * for overlap. Overlap test is not limited to the specific resource for
1363 * which the CBM is intended though - when dealing with CDP resources that
1364 * share the underlying hardware the overlap check should be performed on
1365 * the CDP resource sharing the hardware also.
1366 *
1367 * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
1368 * overlap test.
1369 *
1370 * Return: true if CBM overlap detected, false if there is no overlap
1371 */
1372bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
1373			   unsigned long cbm, int closid, bool exclusive)
1374{
1375	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
1376	struct rdt_resource *r = s->res;
1377
1378	if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
1379				    exclusive))
1380		return true;
1381
1382	if (!resctrl_arch_get_cdp_enabled(r->rid))
1383		return false;
1384	return  __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
1385}
1386
1387/**
1388 * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
1389 * @rdtgrp: Resource group identified through its closid.
1390 *
1391 * An exclusive resource group implies that there should be no sharing of
1392 * its allocated resources. At the time this group is considered to be
1393 * exclusive this test can determine if its current schemata supports this
1394 * setting by testing for overlap with all other resource groups.
1395 *
1396 * Return: true if resource group can be exclusive, false if there is overlap
1397 * with allocations of other resource groups and thus this resource group
1398 * cannot be exclusive.
1399 */
1400static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
1401{
1402	int closid = rdtgrp->closid;
1403	struct rdt_ctrl_domain *d;
1404	struct resctrl_schema *s;
1405	struct rdt_resource *r;
1406	bool has_cache = false;
1407	u32 ctrl;
1408
1409	/* Walking r->domains, ensure it can't race with cpuhp */
1410	lockdep_assert_cpus_held();
1411
1412	list_for_each_entry(s, &resctrl_schema_all, list) {
1413		r = s->res;
1414		if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
1415			continue;
1416		has_cache = true;
1417		list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
1418			ctrl = resctrl_arch_get_config(r, d, closid,
1419						       s->conf_type);
1420			if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
1421				rdt_last_cmd_puts("Schemata overlaps\n");
1422				return false;
1423			}
1424		}
1425	}
1426
1427	if (!has_cache) {
1428		rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
1429		return false;
1430	}
1431
1432	return true;
1433}
1434
/*
 * rdtgroup_mode_write - Modify the resource group's mode
 *
 * Accepts "shareable", "exclusive" and, when CONFIG_RESCTRL_FS_PSEUDO_LOCK
 * is enabled, "pseudo-locksetup", each terminated by a newline. Writing the
 * name of the mode the group is already in succeeds without doing anything.
 *
 * Returns @nbytes on success, -EINVAL for malformed or unsupported input or
 * for a refused transition, -ENOENT when the group cannot be found, or the
 * error returned by rdtgroup_locksetup_enter()/rdtgroup_locksetup_exit().
 */
static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
				   char *buf, size_t nbytes, loff_t off)
{
	struct rdtgroup *rdtgrp;
	enum rdtgrp_mode mode;
	int ret = 0;

	/* Valid input requires a trailing newline */
	if (nbytes == 0 || buf[nbytes - 1] != '\n')
		return -EINVAL;
	/* Strip the newline so the strcmp() calls below see the bare name */
	buf[nbytes - 1] = '\0';

	rdtgrp = rdtgroup_kn_lock_live(of->kn);
	if (!rdtgrp) {
		rdtgroup_kn_unlock(of->kn);
		return -ENOENT;
	}

	rdt_last_cmd_clear();

	mode = rdtgrp->mode;

	/* Requesting the mode the group is already in: nothing to do */
	if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
	    (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
	    (!strcmp(buf, "pseudo-locksetup") &&
	     mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
	    (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
		goto out;

	/* A pseudo-locked group cannot be moved to any other mode */
	if (mode == RDT_MODE_PSEUDO_LOCKED) {
		rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
		ret = -EINVAL;
		goto out;
	}

	if (!strcmp(buf, "shareable")) {
		/* Leaving pseudo-locksetup has to undo its setup first */
		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
			ret = rdtgroup_locksetup_exit(rdtgrp);
			if (ret)
				goto out;
		}
		rdtgrp->mode = RDT_MODE_SHAREABLE;
	} else if (!strcmp(buf, "exclusive")) {
		/* The overlap test records its own reason on failure */
		if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
			ret = -EINVAL;
			goto out;
		}
		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
			ret = rdtgroup_locksetup_exit(rdtgrp);
			if (ret)
				goto out;
		}
		rdtgrp->mode = RDT_MODE_EXCLUSIVE;
	} else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
		   !strcmp(buf, "pseudo-locksetup")) {
		ret = rdtgroup_locksetup_enter(rdtgrp);
		if (ret)
			goto out;
		rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
	} else {
		/*
		 * Also reached for "pseudo-locked" when the group is not in
		 * that mode: it is not a mode that can be requested here.
		 */
		rdt_last_cmd_puts("Unknown or unsupported mode\n");
		ret = -EINVAL;
	}

out:
	rdtgroup_kn_unlock(of->kn);
	/* ret is 0 on success, in which case the whole write was consumed */
	return ret ?: nbytes;
}
1506
1507/**
1508 * rdtgroup_cbm_to_size - Translate CBM to size in bytes
1509 * @r: RDT resource to which @d belongs.
1510 * @d: RDT domain instance.
1511 * @cbm: bitmask for which the size should be computed.
1512 *
1513 * The bitmask provided associated with the RDT domain instance @d will be
1514 * translated into how many bytes it represents. The size in bytes is
1515 * computed by first dividing the total cache size by the CBM length to
1516 * determine how many bytes each bit in the bitmask represents. The result
1517 * is multiplied with the number of bits set in the bitmask.
1518 *
1519 * @cbm is unsigned long, even if only 32 bits are used to make the
1520 * bitmap functions work correctly.
1521 */
1522unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
1523				  struct rdt_ctrl_domain *d, unsigned long cbm)
1524{
1525	unsigned int size = 0;
1526	struct cacheinfo *ci;
1527	int num_b;
1528
1529	if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
1530		return size;
1531
1532	num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1533	ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
1534	if (ci)
1535		size = ci->size / r->cache.cbm_len * num_b;
1536
1537	return size;
1538}
1539
1540bool is_mba_sc(struct rdt_resource *r)
1541{
1542	if (!r)
1543		r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
1544
1545	/*
1546	 * The software controller support is only applicable to MBA resource.
1547	 * Make sure to check for resource type.
1548	 */
1549	if (r->rid != RDT_RESOURCE_MBA)
1550		return false;
1551
1552	return r->membw.mba_sc;
1553}
1554
/*
 * rdtgroup_size_show - Display size in bytes of allocated regions
 *
 * The "size" file mirrors the layout of the "schemata" file, printing the
 * size in bytes of each region instead of the capacity bitmask.
 *
 * For MBA and SMBA schemas the control value itself is printed rather than
 * a size derived from a bitmask.
 *
 * Returns 0 on success, -ENOENT when the group cannot be found, or -ENODEV
 * for a pseudo-locked group whose region has no domain (plr->d is NULL).
 */
static int rdtgroup_size_show(struct kernfs_open_file *of,
			      struct seq_file *s, void *v)
{
	struct resctrl_schema *schema;
	enum resctrl_conf_type type;
	struct rdt_ctrl_domain *d;
	struct rdtgroup *rdtgrp;
	struct rdt_resource *r;
	unsigned int size;
	int ret = 0;
	u32 closid;
	bool sep;
	u32 ctrl;

	rdtgrp = rdtgroup_kn_lock_live(of->kn);
	if (!rdtgrp) {
		rdtgroup_kn_unlock(of->kn);
		return -ENOENT;
	}

	/*
	 * A pseudo-locked group prints a single line: the one schema and
	 * domain recorded in its pseudo-lock region.
	 */
	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
		if (!rdtgrp->plr->d) {
			rdt_last_cmd_clear();
			rdt_last_cmd_puts("Cache domain offline\n");
			ret = -ENODEV;
		} else {
			seq_printf(s, "%*s:", max_name_width,
				   rdtgrp->plr->s->name);
			size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
						    rdtgrp->plr->d,
						    rdtgrp->plr->cbm);
			seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
		}
		goto out;
	}

	closid = rdtgrp->closid;

	/* One output line per schema: "<name>:<id>=<size>[;<id>=<size>...]" */
	list_for_each_entry(schema, &resctrl_schema_all, list) {
		r = schema->res;
		type = schema->conf_type;
		sep = false;
		seq_printf(s, "%*s:", max_name_width, schema->name);
		list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
			if (sep)
				seq_putc(s, ';');
			/* Groups in pseudo-locksetup mode always report 0 */
			if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
				size = 0;
			} else {
				/*
				 * With the MBA software controller the value
				 * comes from the domain's mbps_val[] array
				 * instead of the arch configuration.
				 */
				if (is_mba_sc(r))
					ctrl = d->mbps_val[closid];
				else
					ctrl = resctrl_arch_get_config(r, d,
								       closid,
								       type);
				/* MBA/SMBA: print the control value as is */
				if (r->rid == RDT_RESOURCE_MBA ||
				    r->rid == RDT_RESOURCE_SMBA)
					size = ctrl;
				else
					size = rdtgroup_cbm_to_size(r, d, ctrl);
			}
			seq_printf(s, "%d=%u", d->hdr.id, size);
			sep = true;
		}
		seq_putc(s, '\n');
	}

out:
	rdtgroup_kn_unlock(of->kn);

	return ret;
}
1633
1634static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
1635{
1636	smp_call_function_any(&mon_info->d->hdr.cpu_mask,
1637			      resctrl_arch_mon_event_config_read, mon_info, 1);
1638}
1639
1640static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
1641{
1642	struct resctrl_mon_config_info mon_info;
1643	struct rdt_mon_domain *dom;
1644	bool sep = false;
1645
1646	cpus_read_lock();
1647	mutex_lock(&rdtgroup_mutex);
1648
1649	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
1650		if (sep)
1651			seq_puts(s, ";");
1652
1653		memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
1654		mon_info.r = r;
1655		mon_info.d = dom;
1656		mon_info.evtid = evtid;
1657		mondata_config_read(&mon_info);
1658
1659		seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
1660		sep = true;
1661	}
1662	seq_puts(s, "\n");
1663
1664	mutex_unlock(&rdtgroup_mutex);
1665	cpus_read_unlock();
1666
1667	return 0;
1668}
1669
1670static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
1671				       struct seq_file *seq, void *v)
1672{
1673	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1674
1675	mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
1676
1677	return 0;
1678}
1679
1680static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
1681				       struct seq_file *seq, void *v)
1682{
1683	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1684
1685	mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
1686
1687	return 0;
1688}
1689
1690static void mbm_config_write_domain(struct rdt_resource *r,
1691				    struct rdt_mon_domain *d, u32 evtid, u32 val)
1692{
1693	struct resctrl_mon_config_info mon_info = {0};
1694
1695	/*
1696	 * Read the current config value first. If both are the same then
1697	 * no need to write it again.
1698	 */
1699	mon_info.r = r;
1700	mon_info.d = d;
1701	mon_info.evtid = evtid;
1702	mondata_config_read(&mon_info);
1703	if (mon_info.mon_config == val)
1704		return;
1705
1706	mon_info.mon_config = val;
1707
1708	/*
1709	 * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
1710	 * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
1711	 * are scoped at the domain level. Writing any of these MSRs
1712	 * on one CPU is observed by all the CPUs in the domain.
1713	 */
1714	smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
1715			      &mon_info, 1);
1716
1717	/*
1718	 * When an Event Configuration is changed, the bandwidth counters
1719	 * for all RMIDs and Events will be cleared by the hardware. The
1720	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
1721	 * every RMID on the next read to any event for every RMID.
1722	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
1723	 * cleared while it is tracked by the hardware. Clear the
1724	 * mbm_local and mbm_total counts for all the RMIDs.
1725	 */
1726	resctrl_arch_reset_rmid_all(r, d);
1727}
1728
1729static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
1730{
1731	char *dom_str = NULL, *id_str;
1732	unsigned long dom_id, val;
1733	struct rdt_mon_domain *d;
1734
1735	/* Walking r->domains, ensure it can't race with cpuhp */
1736	lockdep_assert_cpus_held();
1737
1738next:
1739	if (!tok || tok[0] == '\0')
1740		return 0;
1741
1742	/* Start processing the strings for each domain */
1743	dom_str = strim(strsep(&tok, ";"));
1744	id_str = strsep(&dom_str, "=");
1745
1746	if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
1747		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
1748		return -EINVAL;
1749	}
1750
1751	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
1752		rdt_last_cmd_puts("Non-numeric event configuration value\n");
1753		return -EINVAL;
1754	}
1755
1756	/* Value from user cannot be more than the supported set of events */
1757	if ((val & r->mon.mbm_cfg_mask) != val) {
1758		rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
1759				    r->mon.mbm_cfg_mask);
1760		return -EINVAL;
1761	}
1762
1763	list_for_each_entry(d, &r->mon_domains, hdr.list) {
1764		if (d->hdr.id == dom_id) {
1765			mbm_config_write_domain(r, d, evtid, val);
1766			goto next;
1767		}
1768	}
1769
1770	return -EINVAL;
1771}
1772
1773static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
1774					    char *buf, size_t nbytes,
1775					    loff_t off)
1776{
1777	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1778	int ret;
1779
1780	/* Valid input requires a trailing newline */
1781	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1782		return -EINVAL;
1783
1784	cpus_read_lock();
1785	mutex_lock(&rdtgroup_mutex);
1786
1787	rdt_last_cmd_clear();
1788
1789	buf[nbytes - 1] = '\0';
1790
1791	ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
1792
1793	mutex_unlock(&rdtgroup_mutex);
1794	cpus_read_unlock();
1795
1796	return ret ?: nbytes;
1797}
1798
1799static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
1800					    char *buf, size_t nbytes,
1801					    loff_t off)
1802{
1803	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1804	int ret;
1805
1806	/* Valid input requires a trailing newline */
1807	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1808		return -EINVAL;
1809
1810	cpus_read_lock();
1811	mutex_lock(&rdtgroup_mutex);
1812
1813	rdt_last_cmd_clear();
1814
1815	buf[nbytes - 1] = '\0';
1816
1817	ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
1818
1819	mutex_unlock(&rdtgroup_mutex);
1820	cpus_read_unlock();
1821
1822	return ret ?: nbytes;
1823}
1824
1825/*
1826 * resctrl_bmec_files_show() — Controls the visibility of BMEC-related resctrl
1827 * files. When @show is true, the files are displayed; when false, the files
1828 * are hidden.
1829 * Don't treat kernfs_find_and_get failure as an error, since this function may
1830 * be called regardless of whether BMEC is supported or the event is enabled.
1831 */
1832void resctrl_bmec_files_show(struct rdt_resource *r, struct kernfs_node *l3_mon_kn,
1833			     bool show)
1834{
1835	struct kernfs_node *kn_config, *mon_kn = NULL;
1836	char name[32];
1837
1838	if (!l3_mon_kn) {
1839		sprintf(name, "%s_MON", r->name);
1840		mon_kn = kernfs_find_and_get(kn_info, name);
1841		if (!mon_kn)
1842			return;
1843		l3_mon_kn = mon_kn;
1844	}
1845
1846	kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_total_bytes_config");
1847	if (kn_config) {
1848		kernfs_show(kn_config, show);
1849		kernfs_put(kn_config);
1850	}
1851
1852	kn_config = kernfs_find_and_get(l3_mon_kn, "mbm_local_bytes_config");
1853	if (kn_config) {
1854		kernfs_show(kn_config, show);
1855		kernfs_put(kn_config);
1856	}
1857
1858	/* Release the reference only if it was acquired */
1859	if (mon_kn)
1860		kernfs_put(mon_kn);
1861}
1862
1863const char *rdtgroup_name_by_closid(u32 closid)
1864{
1865	struct rdtgroup *rdtgrp;
1866
1867	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
1868		if (rdtgrp->closid == closid)
1869			return rdt_kn_name(rdtgrp->kn);
1870	}
1871
1872	return NULL;
1873}
1874
/*
 * Table describing the resctrl files that rdtgroup_add_files() can create.
 *
 * rdtgroup_add_files() creates an entry in a directory only when the
 * entry's .fflags is non-zero and every bit of it is contained in the flags
 * requested for that directory. An entry that leaves .fflags unset here is
 * therefore not created until resctrl_file_fflags_init() assigns it flags;
 * thread_throttle_mode_init() and io_alloc_init() do so for
 * "thread_throttle_mode", "io_alloc" and "io_alloc_cbm".
 * NOTE(review): the remaining entries without .fflags are presumably given
 * flags by code outside this part of the file - confirm.
 */
static struct rftype res_common_files[] = {
	{
		.name		= "last_cmd_status",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_last_cmd_status_show,
		.fflags		= RFTYPE_TOP_INFO,
	},
	{
		.name		= "mbm_assign_on_mkdir",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= resctrl_mbm_assign_on_mkdir_show,
		.write		= resctrl_mbm_assign_on_mkdir_write,
	},
	{
		.name		= "num_closids",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_num_closids_show,
		.fflags		= RFTYPE_CTRL_INFO,
	},
	{
		.name		= "mon_features",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_mon_features_show,
		.fflags		= RFTYPE_MON_INFO,
	},
	{
		.name		= "available_mbm_cntrs",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= resctrl_available_mbm_cntrs_show,
	},
	{
		.name		= "num_rmids",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_num_rmids_show,
		.fflags		= RFTYPE_MON_INFO,
	},
	{
		.name		= "cbm_mask",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_default_ctrl_show,
		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "num_mbm_cntrs",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= resctrl_num_mbm_cntrs_show,
	},
	{
		.name		= "min_cbm_bits",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_min_cbm_bits_show,
		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "shareable_bits",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_shareable_bits_show,
		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "bit_usage",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_bit_usage_show,
		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "min_bandwidth",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_min_bw_show,
		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
	},
	{
		.name		= "bandwidth_gran",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_bw_gran_show,
		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
	},
	{
		.name		= "delay_linear",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_delay_linear_show,
		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
	},
	/*
	 * Platform specific which (if any) capabilities are provided by
	 * thread_throttle_mode. Defer "fflags" initialization to platform
	 * discovery.
	 */
	{
		.name		= "thread_throttle_mode",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_thread_throttle_mode_show,
	},
	/* "io_alloc" and "io_alloc_cbm" get their flags in io_alloc_init() */
	{
		.name		= "io_alloc",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= resctrl_io_alloc_show,
		.write          = resctrl_io_alloc_write,
	},
	{
		.name		= "io_alloc_cbm",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= resctrl_io_alloc_cbm_show,
		.write		= resctrl_io_alloc_cbm_write,
	},
	{
		.name		= "max_threshold_occupancy",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= max_threshold_occ_write,
		.seq_show	= max_threshold_occ_show,
		.fflags		= RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
	},
	/* Visibility of the two *_config files: see resctrl_bmec_files_show() */
	{
		.name		= "mbm_total_bytes_config",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= mbm_total_bytes_config_show,
		.write		= mbm_total_bytes_config_write,
	},
	{
		.name		= "mbm_local_bytes_config",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= mbm_local_bytes_config_show,
		.write		= mbm_local_bytes_config_write,
	},
	{
		.name		= "event_filter",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= event_filter_show,
		.write		= event_filter_write,
	},
	{
		.name		= "mbm_L3_assignments",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= mbm_L3_assignments_show,
		.write		= mbm_L3_assignments_write,
	},
	{
		.name		= "mbm_assign_mode",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= resctrl_mbm_assign_mode_show,
		.write		= resctrl_mbm_assign_mode_write,
		.fflags		= RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
	},
	/* "cpus" and "cpus_list" share handlers; .flags selects list format */
	{
		.name		= "cpus",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_cpus_write,
		.seq_show	= rdtgroup_cpus_show,
		.fflags		= RFTYPE_BASE,
	},
	{
		.name		= "cpus_list",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_cpus_write,
		.seq_show	= rdtgroup_cpus_show,
		.flags		= RFTYPE_FLAGS_CPUS_LIST,
		.fflags		= RFTYPE_BASE,
	},
	{
		.name		= "tasks",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_tasks_write,
		.seq_show	= rdtgroup_tasks_show,
		.fflags		= RFTYPE_BASE,
	},
	/* RFTYPE_DEBUG entries require rdtgroup_add_files() to add that bit */
	{
		.name		= "mon_hw_id",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdtgroup_rmid_show,
		.fflags		= RFTYPE_MON_BASE | RFTYPE_DEBUG,
	},
	{
		.name		= "schemata",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_schemata_write,
		.seq_show	= rdtgroup_schemata_show,
		.fflags		= RFTYPE_CTRL_BASE,
	},
	{
		.name		= "mba_MBps_event",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_mba_mbps_event_write,
		.seq_show	= rdtgroup_mba_mbps_event_show,
	},
	{
		.name		= "mode",
		.mode		= 0644,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.write		= rdtgroup_mode_write,
		.seq_show	= rdtgroup_mode_show,
		.fflags		= RFTYPE_CTRL_BASE,
	},
	{
		.name		= "size",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdtgroup_size_show,
		.fflags		= RFTYPE_CTRL_BASE,
	},
	{
		.name		= "sparse_masks",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdt_has_sparse_bitmasks_show,
		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
	},
	{
		.name		= "ctrl_hw_id",
		.mode		= 0444,
		.kf_ops		= &rdtgroup_kf_single_ops,
		.seq_show	= rdtgroup_closid_show,
		.fflags		= RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
	},
};
2119
2120static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
2121{
2122	struct rftype *rfts, *rft;
2123	int ret, len;
2124
2125	rfts = res_common_files;
2126	len = ARRAY_SIZE(res_common_files);
2127
2128	lockdep_assert_held(&rdtgroup_mutex);
2129
2130	if (resctrl_debug)
2131		fflags |= RFTYPE_DEBUG;
2132
2133	for (rft = rfts; rft < rfts + len; rft++) {
2134		if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
2135			ret = rdtgroup_add_file(kn, rft);
2136			if (ret)
2137				goto error;
2138		}
2139	}
2140
2141	return 0;
2142error:
2143	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
2144	while (--rft >= rfts) {
2145		if ((fflags & rft->fflags) == rft->fflags)
2146			kernfs_remove_by_name(kn, rft->name);
2147	}
2148	return ret;
2149}
2150
2151static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
2152{
2153	struct rftype *rfts, *rft;
2154	int len;
2155
2156	rfts = res_common_files;
2157	len = ARRAY_SIZE(res_common_files);
2158
2159	for (rft = rfts; rft < rfts + len; rft++) {
2160		if (!strcmp(rft->name, name))
2161			return rft;
2162	}
2163
2164	return NULL;
2165}
2166
2167static void thread_throttle_mode_init(void)
2168{
2169	enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED;
2170	struct rdt_resource *r_mba, *r_smba;
2171
2172	r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2173	if (r_mba->alloc_capable &&
2174	    r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
2175		throttle_mode = r_mba->membw.throttle_mode;
2176
2177	r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
2178	if (r_smba->alloc_capable &&
2179	    r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
2180		throttle_mode = r_smba->membw.throttle_mode;
2181
2182	if (throttle_mode == THREAD_THROTTLE_UNDEFINED)
2183		return;
2184
2185	resctrl_file_fflags_init("thread_throttle_mode",
2186				 RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
2187}
2188
2189/*
2190 * The resctrl file "io_alloc" is added using L3 resource. However, it results
2191 * in this file being visible for *all* cache resources (eg. L2 cache),
2192 * whether it supports "io_alloc" or not.
2193 */
2194static void io_alloc_init(void)
2195{
2196	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
2197
2198	if (r->cache.io_alloc_capable) {
2199		resctrl_file_fflags_init("io_alloc", RFTYPE_CTRL_INFO |
2200					 RFTYPE_RES_CACHE);
2201		resctrl_file_fflags_init("io_alloc_cbm",
2202					 RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE);
2203	}
2204}
2205
2206void resctrl_file_fflags_init(const char *config, unsigned long fflags)
2207{
2208	struct rftype *rft;
2209
2210	rft = rdtgroup_get_rftype_by_name(config);
2211	if (rft)
2212		rft->fflags = fflags;
2213}
2214
2215/**
2216 * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
2217 * @r: The resource group with which the file is associated.
2218 * @name: Name of the file
2219 *
2220 * The permissions of named resctrl file, directory, or link are modified
2221 * to not allow read, write, or execute by any user.
2222 *
2223 * WARNING: This function is intended to communicate to the user that the
2224 * resctrl file has been locked down - that it is not relevant to the
2225 * particular state the system finds itself in. It should not be relied
2226 * on to protect from user access because after the file's permissions
2227 * are restricted the user can still change the permissions using chmod
2228 * from the command line.
2229 *
2230 * Return: 0 on success, <0 on failure.
2231 */
2232int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
2233{
2234	struct iattr iattr = {.ia_valid = ATTR_MODE,};
2235	struct kernfs_node *kn;
2236	int ret = 0;
2237
2238	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
2239	if (!kn)
2240		return -ENOENT;
2241
2242	switch (kernfs_type(kn)) {
2243	case KERNFS_DIR:
2244		iattr.ia_mode = S_IFDIR;
2245		break;
2246	case KERNFS_FILE:
2247		iattr.ia_mode = S_IFREG;
2248		break;
2249	case KERNFS_LINK:
2250		iattr.ia_mode = S_IFLNK;
2251		break;
2252	}
2253
2254	ret = kernfs_setattr(kn, &iattr);
2255	kernfs_put(kn);
2256	return ret;
2257}
2258
2259/**
2260 * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
2261 * @r: The resource group with which the file is associated.
2262 * @name: Name of the file
2263 * @mask: Mask of permissions that should be restored
2264 *
2265 * Restore the permissions of the named file. If @name is a directory the
2266 * permissions of its parent will be used.
2267 *
2268 * Return: 0 on success, <0 on failure.
2269 */
2270int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
2271			     umode_t mask)
2272{
2273	struct iattr iattr = {.ia_valid = ATTR_MODE,};
2274	struct kernfs_node *kn, *parent;
2275	struct rftype *rfts, *rft;
2276	int ret, len;
2277
2278	rfts = res_common_files;
2279	len = ARRAY_SIZE(res_common_files);
2280
2281	for (rft = rfts; rft < rfts + len; rft++) {
2282		if (!strcmp(rft->name, name))
2283			iattr.ia_mode = rft->mode & mask;
2284	}
2285
2286	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
2287	if (!kn)
2288		return -ENOENT;
2289
2290	switch (kernfs_type(kn)) {
2291	case KERNFS_DIR:
2292		parent = kernfs_get_parent(kn);
2293		if (parent) {
2294			iattr.ia_mode |= parent->mode;
2295			kernfs_put(parent);
2296		}
2297		iattr.ia_mode |= S_IFDIR;
2298		break;
2299	case KERNFS_FILE:
2300		iattr.ia_mode |= S_IFREG;
2301		break;
2302	case KERNFS_LINK:
2303		iattr.ia_mode |= S_IFLNK;
2304		break;
2305	}
2306
2307	ret = kernfs_setattr(kn, &iattr);
2308	kernfs_put(kn);
2309	return ret;
2310}
2311
2312static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn)
2313{
2314	struct kernfs_node *kn_subdir, *kn_subdir2;
2315	struct mon_evt *mevt;
2316	int ret;
2317
2318	kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL);
2319	if (IS_ERR(kn_subdir))
2320		return PTR_ERR(kn_subdir);
2321
2322	ret = rdtgroup_kn_set_ugid(kn_subdir);
2323	if (ret)
2324		return ret;
2325
2326	for_each_mon_event(mevt) {
2327		if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid))
2328			continue;
2329
2330		kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt);
2331		if (IS_ERR(kn_subdir2)) {
2332			ret = PTR_ERR(kn_subdir2);
2333			goto out;
2334		}
2335
2336		ret = rdtgroup_kn_set_ugid(kn_subdir2);
2337		if (ret)
2338			goto out;
2339
2340		ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG);
2341		if (ret)
2342			break;
2343	}
2344
2345out:
2346	return ret;
2347}
2348
2349static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
2350				      unsigned long fflags)
2351{
2352	struct kernfs_node *kn_subdir;
2353	struct rdt_resource *r;
2354	int ret;
2355
2356	kn_subdir = kernfs_create_dir(kn_info, name,
2357				      kn_info->mode, priv);
2358	if (IS_ERR(kn_subdir))
2359		return PTR_ERR(kn_subdir);
2360
2361	ret = rdtgroup_kn_set_ugid(kn_subdir);
2362	if (ret)
2363		return ret;
2364
2365	ret = rdtgroup_add_files(kn_subdir, fflags);
2366	if (ret)
2367		return ret;
2368
2369	if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) {
2370		r = priv;
2371		if (r->mon.mbm_cntr_assignable) {
2372			ret = resctrl_mkdir_event_configs(r, kn_subdir);
2373			if (ret)
2374				return ret;
2375			/*
2376			 * Hide BMEC related files if mbm_event mode
2377			 * is enabled.
2378			 */
2379			if (resctrl_arch_mbm_cntr_assign_enabled(r))
2380				resctrl_bmec_files_show(r, kn_subdir, false);
2381		}
2382	}
2383
2384	kernfs_activate(kn_subdir);
2385
2386	return ret;
2387}
2388
2389static unsigned long fflags_from_resource(struct rdt_resource *r)
2390{
2391	switch (r->rid) {
2392	case RDT_RESOURCE_L3:
2393	case RDT_RESOURCE_L2:
2394		return RFTYPE_RES_CACHE;
2395	case RDT_RESOURCE_MBA:
2396	case RDT_RESOURCE_SMBA:
2397		return RFTYPE_RES_MB;
2398	}
2399
2400	return WARN_ON_ONCE(1);
2401}
2402
2403static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
2404{
2405	struct resctrl_schema *s;
2406	struct rdt_resource *r;
2407	unsigned long fflags;
2408	char name[32];
2409	int ret;
2410
2411	/* create the directory */
2412	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
2413	if (IS_ERR(kn_info))
2414		return PTR_ERR(kn_info);
2415
2416	ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
2417	if (ret)
2418		goto out_destroy;
2419
2420	/* loop over enabled controls, these are all alloc_capable */
2421	list_for_each_entry(s, &resctrl_schema_all, list) {
2422		r = s->res;
2423		fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
2424		ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
2425		if (ret)
2426			goto out_destroy;
2427	}
2428
2429	for_each_mon_capable_rdt_resource(r) {
2430		fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
2431		sprintf(name, "%s_MON", r->name);
2432		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
2433		if (ret)
2434			goto out_destroy;
2435	}
2436
2437	ret = rdtgroup_kn_set_ugid(kn_info);
2438	if (ret)
2439		goto out_destroy;
2440
2441	kernfs_activate(kn_info);
2442
2443	return 0;
2444
2445out_destroy:
2446	kernfs_remove(kn_info);
2447	return ret;
2448}
2449
2450static int
2451mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
2452		    char *name, struct kernfs_node **dest_kn)
2453{
2454	struct kernfs_node *kn;
2455	int ret;
2456
2457	/* create the directory */
2458	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
2459	if (IS_ERR(kn))
2460		return PTR_ERR(kn);
2461
2462	if (dest_kn)
2463		*dest_kn = kn;
2464
2465	ret = rdtgroup_kn_set_ugid(kn);
2466	if (ret)
2467		goto out_destroy;
2468
2469	kernfs_activate(kn);
2470
2471	return 0;
2472
2473out_destroy:
2474	kernfs_remove(kn);
2475	return ret;
2476}
2477
2478static inline bool is_mba_linear(void)
2479{
2480	return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
2481}
2482
2483static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
2484{
2485	u32 num_closid = resctrl_arch_get_num_closid(r);
2486	int cpu = cpumask_any(&d->hdr.cpu_mask);
2487	int i;
2488
2489	d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
2490				   GFP_KERNEL, cpu_to_node(cpu));
2491	if (!d->mbps_val)
2492		return -ENOMEM;
2493
2494	for (i = 0; i < num_closid; i++)
2495		d->mbps_val[i] = MBA_MAX_MBPS;
2496
2497	return 0;
2498}
2499
2500static void mba_sc_domain_destroy(struct rdt_resource *r,
2501				  struct rdt_ctrl_domain *d)
2502{
2503	kfree(d->mbps_val);
2504	d->mbps_val = NULL;
2505}
2506
2507/*
2508 * MBA software controller is supported only if
2509 * MBM is supported and MBA is in linear scale,
2510 * and the MBM monitor scope is the same as MBA
2511 * control scope.
2512 */
2513static bool supports_mba_mbps(void)
2514{
2515	struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3);
2516	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2517
2518	return (resctrl_is_mbm_enabled() &&
2519		r->alloc_capable && is_mba_linear() &&
2520		r->ctrl_scope == rmbm->mon_scope);
2521}
2522
2523/*
2524 * Enable or disable the MBA software controller
2525 * which helps user specify bandwidth in MBps.
2526 */
2527static int set_mba_sc(bool mba_sc)
2528{
2529	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2530	u32 num_closid = resctrl_arch_get_num_closid(r);
2531	struct rdt_ctrl_domain *d;
2532	unsigned long fflags;
2533	int i;
2534
2535	if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
2536		return -EINVAL;
2537
2538	r->membw.mba_sc = mba_sc;
2539
2540	rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
2541
2542	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
2543		for (i = 0; i < num_closid; i++)
2544			d->mbps_val[i] = MBA_MAX_MBPS;
2545	}
2546
2547	fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
2548	resctrl_file_fflags_init("mba_MBps_event", fflags);
2549
2550	return 0;
2551}
2552
2553/*
2554 * We don't allow rdtgroup directories to be created anywhere
2555 * except the root directory. Thus when looking for the rdtgroup
2556 * structure for a kernfs node we are either looking at a directory,
2557 * in which case the rdtgroup structure is pointed at by the "priv"
2558 * field, otherwise we have a file, and need only look to the parent
2559 * to find the rdtgroup.
2560 */
2561static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
2562{
2563	if (kernfs_type(kn) == KERNFS_DIR) {
2564		/*
2565		 * All the resource directories use "kn->priv"
2566		 * to point to the "struct rdtgroup" for the
2567		 * resource. "info" and its subdirectories don't
2568		 * have rdtgroup structures, so return NULL here.
2569		 */
2570		if (kn == kn_info ||
2571		    rcu_access_pointer(kn->__parent) == kn_info)
2572			return NULL;
2573		else
2574			return kn->priv;
2575	} else {
2576		return rdt_kn_parent_priv(kn);
2577	}
2578}
2579
2580static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
2581{
2582	atomic_inc(&rdtgrp->waitcount);
2583	kernfs_break_active_protection(kn);
2584}
2585
2586static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
2587{
2588	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
2589	    (rdtgrp->flags & RDT_DELETED)) {
2590		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2591		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2592			rdtgroup_pseudo_lock_remove(rdtgrp);
2593		kernfs_unbreak_active_protection(kn);
2594		rdtgroup_remove(rdtgrp);
2595	} else {
2596		kernfs_unbreak_active_protection(kn);
2597	}
2598}
2599
2600struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
2601{
2602	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2603
2604	if (!rdtgrp)
2605		return NULL;
2606
2607	rdtgroup_kn_get(rdtgrp, kn);
2608
2609	cpus_read_lock();
2610	mutex_lock(&rdtgroup_mutex);
2611
2612	/* Was this group deleted while we waited? */
2613	if (rdtgrp->flags & RDT_DELETED)
2614		return NULL;
2615
2616	return rdtgrp;
2617}
2618
2619void rdtgroup_kn_unlock(struct kernfs_node *kn)
2620{
2621	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2622
2623	if (!rdtgrp)
2624		return;
2625
2626	mutex_unlock(&rdtgroup_mutex);
2627	cpus_read_unlock();
2628
2629	rdtgroup_kn_put(rdtgrp, kn);
2630}
2631
/* Forward declaration: used by rdt_get_tree() before its definition below. */
static int mkdir_mondata_all(struct kernfs_node *parent_kn,
			     struct rdtgroup *prgrp,
			     struct kernfs_node **mon_data_kn);
2635
/*
 * Turn off everything rdt_enable_ctx() may have turned on: CDP on L3 and
 * L2, the MBA software controller, and the debug flag. All return values
 * are ignored.
 */
static void rdt_disable_ctx(void)
{
	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
	/* Returns -EINVAL when already off or unsupported; nothing to do then. */
	set_mba_sc(false);

	resctrl_debug = false;
}
2644
2645static int rdt_enable_ctx(struct rdt_fs_context *ctx)
2646{
2647	int ret = 0;
2648
2649	if (ctx->enable_cdpl2) {
2650		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
2651		if (ret)
2652			goto out_done;
2653	}
2654
2655	if (ctx->enable_cdpl3) {
2656		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
2657		if (ret)
2658			goto out_cdpl2;
2659	}
2660
2661	if (ctx->enable_mba_mbps) {
2662		ret = set_mba_sc(true);
2663		if (ret)
2664			goto out_cdpl3;
2665	}
2666
2667	if (ctx->enable_debug)
2668		resctrl_debug = true;
2669
2670	return 0;
2671
2672out_cdpl3:
2673	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
2674out_cdpl2:
2675	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
2676out_done:
2677	return ret;
2678}
2679
2680static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
2681{
2682	struct resctrl_schema *s;
2683	const char *suffix = "";
2684	int ret, cl;
2685
2686	s = kzalloc(sizeof(*s), GFP_KERNEL);
2687	if (!s)
2688		return -ENOMEM;
2689
2690	s->res = r;
2691	s->num_closid = resctrl_arch_get_num_closid(r);
2692	if (resctrl_arch_get_cdp_enabled(r->rid))
2693		s->num_closid /= 2;
2694
2695	s->conf_type = type;
2696	switch (type) {
2697	case CDP_CODE:
2698		suffix = "CODE";
2699		break;
2700	case CDP_DATA:
2701		suffix = "DATA";
2702		break;
2703	case CDP_NONE:
2704		suffix = "";
2705		break;
2706	}
2707
2708	ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
2709	if (ret >= sizeof(s->name)) {
2710		kfree(s);
2711		return -EINVAL;
2712	}
2713
2714	cl = strlen(s->name);
2715
2716	/*
2717	 * If CDP is supported by this resource, but not enabled,
2718	 * include the suffix. This ensures the tabular format of the
2719	 * schemata file does not change between mounts of the filesystem.
2720	 */
2721	if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
2722		cl += 4;
2723
2724	if (cl > max_name_width)
2725		max_name_width = cl;
2726
2727	switch (r->schema_fmt) {
2728	case RESCTRL_SCHEMA_BITMAP:
2729		s->fmt_str = "%d=%x";
2730		break;
2731	case RESCTRL_SCHEMA_RANGE:
2732		s->fmt_str = "%d=%u";
2733		break;
2734	}
2735
2736	if (WARN_ON_ONCE(!s->fmt_str)) {
2737		kfree(s);
2738		return -EINVAL;
2739	}
2740
2741	INIT_LIST_HEAD(&s->list);
2742	list_add(&s->list, &resctrl_schema_all);
2743
2744	return 0;
2745}
2746
2747static int schemata_list_create(void)
2748{
2749	struct rdt_resource *r;
2750	int ret = 0;
2751
2752	for_each_alloc_capable_rdt_resource(r) {
2753		if (resctrl_arch_get_cdp_enabled(r->rid)) {
2754			ret = schemata_list_add(r, CDP_CODE);
2755			if (ret)
2756				break;
2757
2758			ret = schemata_list_add(r, CDP_DATA);
2759		} else {
2760			ret = schemata_list_add(r, CDP_NONE);
2761		}
2762
2763		if (ret)
2764			break;
2765	}
2766
2767	return ret;
2768}
2769
2770static void schemata_list_destroy(void)
2771{
2772	struct resctrl_schema *s, *tmp;
2773
2774	list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
2775		list_del(&s->list);
2776		kfree(s);
2777	}
2778}
2779
/*
 * ->get_tree() handler: build the resctrl hierarchy for a mount.
 *
 * Runs with the CPU hotplug read lock and rdtgroup_mutex held for its whole
 * duration. The setup steps are undone in reverse order by the label ladder
 * at the end when a later step fails. rdt_last_cmd_clear() is called on
 * every exit path.
 *
 * Return: 0 on success, -EBUSY if resctrl is already mounted, otherwise the
 * error of the failing setup step.
 */
static int rdt_get_tree(struct fs_context *fc)
{
	struct rdt_fs_context *ctx = rdt_fc2context(fc);
	unsigned long flags = RFTYPE_CTRL_BASE;
	struct rdt_mon_domain *dom;
	struct rdt_resource *r;
	int ret;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);
	/*
	 * resctrl file system can only be mounted once.
	 */
	if (resctrl_mounted) {
		ret = -EBUSY;
		goto out;
	}

	ret = rdtgroup_setup_root(ctx);
	if (ret)
		goto out;

	/* Apply the mount options (CDP, mba_MBps, debug). */
	ret = rdt_enable_ctx(ctx);
	if (ret)
		goto out_root;

	ret = schemata_list_create();
	if (ret)
		goto out_schemata_free;

	ret = closid_init();
	if (ret)
		goto out_schemata_free;

	if (resctrl_arch_mon_capable())
		flags |= RFTYPE_MON;

	/* Files of the default (root) group. */
	ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
	if (ret)
		goto out_closid_exit;

	kernfs_activate(rdtgroup_default.kn);

	/* rdtgroup_create_info_dir() removes "info" itself on failure. */
	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
	if (ret < 0)
		goto out_closid_exit;

	if (resctrl_arch_mon_capable()) {
		ret = mongroup_create_dir(rdtgroup_default.kn,
					  &rdtgroup_default, "mon_groups",
					  &kn_mongrp);
		if (ret < 0)
			goto out_info;

		rdtgroup_assign_cntrs(&rdtgroup_default);

		ret = mkdir_mondata_all(rdtgroup_default.kn,
					&rdtgroup_default, &kn_mondata);
		if (ret < 0)
			goto out_mongrp;
		rdtgroup_default.mon.mon_data_kn = kn_mondata;
	}

	ret = rdt_pseudo_lock_init();
	if (ret)
		goto out_mondata;

	ret = kernfs_get_tree(fc);
	if (ret < 0)
		goto out_psl;

	if (resctrl_arch_alloc_capable())
		resctrl_arch_enable_alloc();
	if (resctrl_arch_mon_capable())
		resctrl_arch_enable_mon();

	if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
		resctrl_mounted = true;

	/* Set up the MBM overflow handler on every L3 monitoring domain. */
	if (resctrl_is_mbm_enabled()) {
		r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
		list_for_each_entry(dom, &r->mon_domains, hdr.list)
			mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
						   RESCTRL_PICK_ANY_CPU);
	}

	/* Success: skip the unwind ladder, ret is >= 0 here. */
	goto out;

out_psl:
	rdt_pseudo_lock_release();
out_mondata:
	if (resctrl_arch_mon_capable())
		kernfs_remove(kn_mondata);
out_mongrp:
	if (resctrl_arch_mon_capable()) {
		rdtgroup_unassign_cntrs(&rdtgroup_default);
		kernfs_remove(kn_mongrp);
	}
out_info:
	kernfs_remove(kn_info);
out_closid_exit:
	closid_exit();
out_schemata_free:
	schemata_list_destroy();
	rdt_disable_ctx();
out_root:
	rdtgroup_destroy_root();
out:
	rdt_last_cmd_clear();
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
	return ret;
}
2893
/*
 * Identifiers of the resctrl mount options. rdt_fs_parameters[] maps the
 * option names to these values and rdt_parse_param() acts on them.
 */
enum rdt_param {
	Opt_cdp,	/* "cdp": sets enable_cdpl3 */
	Opt_cdpl2,	/* "cdpl2": sets enable_cdpl2 */
	Opt_mba_mbps,	/* "mba_MBps": sets enable_mba_mbps */
	Opt_debug,	/* "debug": sets enable_debug */
	nr__rdt_params	/* number of options; keep last */
};
2901
/*
 * Mount option table handed to fs_parse(). All options are plain flags.
 * The empty entry terminates the table.
 */
static const struct fs_parameter_spec rdt_fs_parameters[] = {
	fsparam_flag("cdp",		Opt_cdp),
	fsparam_flag("cdpl2",		Opt_cdpl2),
	fsparam_flag("mba_MBps",	Opt_mba_mbps),
	fsparam_flag("debug",		Opt_debug),
	{}
};
2909
2910static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
2911{
2912	struct rdt_fs_context *ctx = rdt_fc2context(fc);
2913	struct fs_parse_result result;
2914	const char *msg;
2915	int opt;
2916
2917	opt = fs_parse(fc, rdt_fs_parameters, param, &result);
2918	if (opt < 0)
2919		return opt;
2920
2921	switch (opt) {
2922	case Opt_cdp:
2923		ctx->enable_cdpl3 = true;
2924		return 0;
2925	case Opt_cdpl2:
2926		ctx->enable_cdpl2 = true;
2927		return 0;
2928	case Opt_mba_mbps:
2929		msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
2930		if (!supports_mba_mbps())
2931			return invalfc(fc, msg);
2932		ctx->enable_mba_mbps = true;
2933		return 0;
2934	case Opt_debug:
2935		ctx->enable_debug = true;
2936		return 0;
2937	}
2938
2939	return -EINVAL;
2940}
2941
/*
 * ->free() handler: release the kernfs part of the filesystem context,
 * then the resctrl context allocated by rdt_init_fs_context().
 */
static void rdt_fs_context_free(struct fs_context *fc)
{
	struct rdt_fs_context *rdt_ctx;

	/* Fetch the context before the kernfs part is released. */
	rdt_ctx = rdt_fc2context(fc);

	kernfs_free_fs_context(fc);
	kfree(rdt_ctx);
}
2949
/* Filesystem context operations, installed by rdt_init_fs_context(). */
static const struct fs_context_operations rdt_fs_context_ops = {
	.free		= rdt_fs_context_free,
	.parse_param	= rdt_parse_param,
	.get_tree	= rdt_get_tree,
};
2955
2956static int rdt_init_fs_context(struct fs_context *fc)
2957{
2958	struct rdt_fs_context *ctx;
2959
2960	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
2961	if (!ctx)
2962		return -ENOMEM;
2963
2964	ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
2965	fc->fs_private = &ctx->kfc;
2966	fc->ops = &rdt_fs_context_ops;
2967	put_user_ns(fc->user_ns);
2968	fc->user_ns = get_user_ns(&init_user_ns);
2969	fc->global = true;
2970	return 0;
2971}
2972
2973/*
2974 * Move tasks from one to the other group. If @from is NULL, then all tasks
2975 * in the systems are moved unconditionally (used for teardown).
2976 *
2977 * If @mask is not NULL the cpus on which moved tasks are running are set
2978 * in that mask so the update smp function call is restricted to affected
2979 * cpus.
2980 */
2981static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
2982				 struct cpumask *mask)
2983{
2984	struct task_struct *p, *t;
2985
2986	read_lock(&tasklist_lock);
2987	for_each_process_thread(p, t) {
2988		if (!from || is_closid_match(t, from) ||
2989		    is_rmid_match(t, from)) {
2990			resctrl_arch_set_closid_rmid(t, to->closid,
2991						     to->mon.rmid);
2992
2993			/*
2994			 * Order the closid/rmid stores above before the loads
2995			 * in task_curr(). This pairs with the full barrier
2996			 * between the rq->curr update and
2997			 * resctrl_arch_sched_in() during context switch.
2998			 */
2999			smp_mb();
3000
3001			/*
3002			 * If the task is on a CPU, set the CPU in the mask.
3003			 * The detection is inaccurate as tasks might move or
3004			 * schedule before the smp function call takes place.
3005			 * In such a case the function call is pointless, but
3006			 * there is no other side effect.
3007			 */
3008			if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
3009				cpumask_set_cpu(task_cpu(t), mask);
3010		}
3011	}
3012	read_unlock(&tasklist_lock);
3013}
3014
3015static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
3016{
3017	struct rdtgroup *sentry, *stmp;
3018	struct list_head *head;
3019
3020	head = &rdtgrp->mon.crdtgrp_list;
3021	list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
3022		rdtgroup_unassign_cntrs(sentry);
3023		free_rmid(sentry->closid, sentry->mon.rmid);
3024		list_del(&sentry->mon.crdtgrp_list);
3025
3026		if (atomic_read(&sentry->waitcount) != 0)
3027			sentry->flags = RDT_DELETED;
3028		else
3029			rdtgroup_remove(sentry);
3030	}
3031}
3032
/*
 * Forcibly remove all of subdirectories under root.
 *
 * All tasks are moved to the default group first. Then the monitor groups
 * of every group, the default group included, are freed, and every group
 * other than the default group is torn down. Finally the "info",
 * "mon_groups" and "mon_data" directories of the root are removed.
 */
static void rmdir_all_sub(void)
{
	struct rdtgroup *rdtgrp, *tmp;

	/* Move all tasks to the default resource group */
	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);

	list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
		/* Free any child rmids */
		free_all_child_rdtgrp(rdtgrp);

		/* Remove each rdtgroup other than root */
		if (rdtgrp == &rdtgroup_default)
			continue;

		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
			rdtgroup_pseudo_lock_remove(rdtgrp);

		/*
		 * Give any CPUs back to the default group. We cannot copy
		 * cpu_online_mask because a CPU might have executed the
		 * offline callback already, but is still marked online.
		 */
		cpumask_or(&rdtgroup_default.cpu_mask,
			   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);

		rdtgroup_unassign_cntrs(rdtgrp);

		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);

		kernfs_remove(rdtgrp->kn);
		list_del(&rdtgrp->rdtgroup_list);

		/* A group with waiters is only flagged; otherwise it is removed now. */
		if (atomic_read(&rdtgrp->waitcount) != 0)
			rdtgrp->flags = RDT_DELETED;
		else
			rdtgroup_remove(rdtgrp);
	}
	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
	update_closid_rmid(cpu_online_mask, &rdtgroup_default);

	kernfs_remove(kn_info);
	kernfs_remove(kn_mongrp);
	kernfs_remove(kn_mondata);
}
3082
3083/**
3084 * mon_get_kn_priv() - Get the mon_data priv data for this event.
3085 *
3086 * The same values are used across the mon_data directories of all control and
3087 * monitor groups for the same event in the same domain. Keep a list of
3088 * allocated structures and re-use an existing one with the same values for
3089 * @rid, @domid, etc.
3090 *
3091 * @rid:    The resource id for the event file being created.
3092 * @domid:  The domain id for the event file being created.
3093 * @mevt:   The type of event file being created.
3094 * @do_sum: Whether SNC summing monitors are being created.
3095 */
3096static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid,
3097					struct mon_evt *mevt,
3098					bool do_sum)
3099{
3100	struct mon_data *priv;
3101
3102	lockdep_assert_held(&rdtgroup_mutex);
3103
3104	list_for_each_entry(priv, &mon_data_kn_priv_list, list) {
3105		if (priv->rid == rid && priv->domid == domid &&
3106		    priv->sum == do_sum && priv->evtid == mevt->evtid)
3107			return priv;
3108	}
3109
3110	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
3111	if (!priv)
3112		return NULL;
3113
3114	priv->rid = rid;
3115	priv->domid = domid;
3116	priv->sum = do_sum;
3117	priv->evtid = mevt->evtid;
3118	list_add_tail(&priv->list, &mon_data_kn_priv_list);
3119
3120	return priv;
3121}
3122
3123/**
3124 * mon_put_kn_priv() - Free all allocated mon_data structures.
3125 *
3126 * Called when resctrl file system is unmounted.
3127 */
3128static void mon_put_kn_priv(void)
3129{
3130	struct mon_data *priv, *tmp;
3131
3132	lockdep_assert_held(&rdtgroup_mutex);
3133
3134	list_for_each_entry_safe(priv, tmp, &mon_data_kn_priv_list, list) {
3135		list_del(&priv->list);
3136		kfree(priv);
3137	}
3138}
3139
/*
 * Tear down the resctrl hierarchy: groups, counters, mon_data private
 * data, pseudo-lock state, CLOSIDs, schemata and finally the root.
 *
 * Must be called with rdtgroup_mutex held. Does nothing when the root has
 * already been destroyed.
 */
static void resctrl_fs_teardown(void)
{
	lockdep_assert_held(&rdtgroup_mutex);

	/* Cleared by rdtgroup_destroy_root() */
	if (!rdtgroup_default.kn)
		return;

	rmdir_all_sub();
	rdtgroup_unassign_cntrs(&rdtgroup_default);
	mon_put_kn_priv();
	rdt_pseudo_lock_release();
	/* Put the default group back into shareable mode. */
	rdtgroup_default.mode = RDT_MODE_SHAREABLE;
	closid_exit();
	schemata_list_destroy();
	rdtgroup_destroy_root();
}
3157
/*
 * ->kill_sb() handler: undo the mount options, reset all controls, tear
 * down the resctrl hierarchy and release the superblock.
 *
 * Everything, kernfs_kill_sb() included, runs with the CPU hotplug read
 * lock and rdtgroup_mutex held.
 */
static void rdt_kill_sb(struct super_block *sb)
{
	struct rdt_resource *r;

	cpus_read_lock();
	mutex_lock(&rdtgroup_mutex);

	rdt_disable_ctx();

	/* Put everything back to default values. */
	for_each_alloc_capable_rdt_resource(r)
		resctrl_arch_reset_all_ctrls(r);

	resctrl_fs_teardown();
	if (resctrl_arch_alloc_capable())
		resctrl_arch_disable_alloc();
	if (resctrl_arch_mon_capable())
		resctrl_arch_disable_mon();
	/* Allow the filesystem to be mounted again. */
	resctrl_mounted = false;
	kernfs_kill_sb(sb);
	mutex_unlock(&rdtgroup_mutex);
	cpus_read_unlock();
}
3181
/* The "resctrl" filesystem type: context setup, mount options and unmount. */
static struct file_system_type rdt_fs_type = {
	.name			= "resctrl",
	.init_fs_context	= rdt_init_fs_context,
	.parameters		= rdt_fs_parameters,
	.kill_sb		= rdt_kill_sb,
};
3188
3189static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
3190		       void *priv)
3191{
3192	struct kernfs_node *kn;
3193	int ret = 0;
3194
3195	kn = __kernfs_create_file(parent_kn, name, 0444,
3196				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
3197				  &kf_mondata_ops, priv, NULL, NULL);
3198	if (IS_ERR(kn))
3199		return PTR_ERR(kn);
3200
3201	ret = rdtgroup_kn_set_ugid(kn);
3202	if (ret) {
3203		kernfs_remove(kn);
3204		return ret;
3205	}
3206
3207	return ret;
3208}
3209
3210static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
3211{
3212	struct kernfs_node *kn;
3213
3214	kn = kernfs_find_and_get(pkn, name);
3215	if (!kn)
3216		return;
3217	kernfs_put(kn);
3218
3219	if (kn->dir.subdirs <= 1)
3220		kernfs_remove(kn);
3221	else
3222		kernfs_remove_by_name(kn, subname);
3223}
3224
3225/*
3226 * Remove all subdirectories of mon_data of ctrl_mon groups
3227 * and monitor groups for the given domain.
3228 * Remove files and directories containing "sum" of domain data
3229 * when last domain being summed is removed.
3230 */
3231static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
3232					   struct rdt_mon_domain *d)
3233{
3234	struct rdtgroup *prgrp, *crgrp;
3235	char subname[32];
3236	bool snc_mode;
3237	char name[32];
3238
3239	snc_mode = r->mon_scope == RESCTRL_L3_NODE;
3240	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
3241	if (snc_mode)
3242		sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
3243
3244	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
3245		mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
3246
3247		list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
3248			mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
3249	}
3250}
3251
3252static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
3253			     struct rdt_resource *r, struct rdtgroup *prgrp,
3254			     bool do_sum)
3255{
3256	struct rmid_read rr = {0};
3257	struct mon_data *priv;
3258	struct mon_evt *mevt;
3259	int ret, domid;
3260
3261	for_each_mon_event(mevt) {
3262		if (mevt->rid != r->rid || !mevt->enabled)
3263			continue;
3264		domid = do_sum ? d->ci_id : d->hdr.id;
3265		priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum);
3266		if (WARN_ON_ONCE(!priv))
3267			return -EINVAL;
3268
3269		ret = mon_addfile(kn, mevt->name, priv);
3270		if (ret)
3271			return ret;
3272
3273		if (!do_sum && resctrl_is_mbm_event(mevt->evtid))
3274			mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
3275	}
3276
3277	return 0;
3278}
3279
3280static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
3281				struct rdt_mon_domain *d,
3282				struct rdt_resource *r, struct rdtgroup *prgrp)
3283{
3284	struct kernfs_node *kn, *ckn;
3285	char name[32];
3286	bool snc_mode;
3287	int ret = 0;
3288
3289	lockdep_assert_held(&rdtgroup_mutex);
3290
3291	snc_mode = r->mon_scope == RESCTRL_L3_NODE;
3292	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
3293	kn = kernfs_find_and_get(parent_kn, name);
3294	if (kn) {
3295		/*
3296		 * rdtgroup_mutex will prevent this directory from being
3297		 * removed. No need to keep this hold.
3298		 */
3299		kernfs_put(kn);
3300	} else {
3301		kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
3302		if (IS_ERR(kn))
3303			return PTR_ERR(kn);
3304
3305		ret = rdtgroup_kn_set_ugid(kn);
3306		if (ret)
3307			goto out_destroy;
3308		ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
3309		if (ret)
3310			goto out_destroy;
3311	}
3312
3313	if (snc_mode) {
3314		sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
3315		ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
3316		if (IS_ERR(ckn)) {
3317			ret = -EINVAL;
3318			goto out_destroy;
3319		}
3320
3321		ret = rdtgroup_kn_set_ugid(ckn);
3322		if (ret)
3323			goto out_destroy;
3324
3325		ret = mon_add_all_files(ckn, d, r, prgrp, false);
3326		if (ret)
3327			goto out_destroy;
3328	}
3329
3330	kernfs_activate(kn);
3331	return 0;
3332
3333out_destroy:
3334	kernfs_remove(kn);
3335	return ret;
3336}
3337
3338/*
3339 * Add all subdirectories of mon_data for "ctrl_mon" groups
3340 * and "monitor" groups with given domain id.
3341 */
3342static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
3343					   struct rdt_mon_domain *d)
3344{
3345	struct kernfs_node *parent_kn;
3346	struct rdtgroup *prgrp, *crgrp;
3347	struct list_head *head;
3348
3349	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
3350		parent_kn = prgrp->mon.mon_data_kn;
3351		mkdir_mondata_subdir(parent_kn, d, r, prgrp);
3352
3353		head = &prgrp->mon.crdtgrp_list;
3354		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
3355			parent_kn = crgrp->mon.mon_data_kn;
3356			mkdir_mondata_subdir(parent_kn, d, r, crgrp);
3357		}
3358	}
3359}
3360
3361static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
3362				       struct rdt_resource *r,
3363				       struct rdtgroup *prgrp)
3364{
3365	struct rdt_mon_domain *dom;
3366	int ret;
3367
3368	/* Walking r->domains, ensure it can't race with cpuhp */
3369	lockdep_assert_cpus_held();
3370
3371	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
3372		ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
3373		if (ret)
3374			return ret;
3375	}
3376
3377	return 0;
3378}
3379
3380/*
3381 * This creates a directory mon_data which contains the monitored data.
3382 *
3383 * mon_data has one directory for each domain which are named
3384 * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
3385 * with L3 domain looks as below:
3386 * ./mon_data:
3387 * mon_L3_00
3388 * mon_L3_01
3389 * mon_L3_02
3390 * ...
3391 *
3392 * Each domain directory has one file per event:
3393 * ./mon_L3_00/:
3394 * llc_occupancy
3395 *
3396 */
3397static int mkdir_mondata_all(struct kernfs_node *parent_kn,
3398			     struct rdtgroup *prgrp,
3399			     struct kernfs_node **dest_kn)
3400{
3401	struct rdt_resource *r;
3402	struct kernfs_node *kn;
3403	int ret;
3404
3405	/*
3406	 * Create the mon_data directory first.
3407	 */
3408	ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
3409	if (ret)
3410		return ret;
3411
3412	if (dest_kn)
3413		*dest_kn = kn;
3414
3415	/*
3416	 * Create the subdirectories for each domain. Note that all events
3417	 * in a domain like L3 are grouped into a resource whose domain is L3
3418	 */
3419	for_each_mon_capable_rdt_resource(r) {
3420		ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
3421		if (ret)
3422			goto out_destroy;
3423	}
3424
3425	return 0;
3426
3427out_destroy:
3428	kernfs_remove(kn);
3429	return ret;
3430}
3431
3432/**
3433 * cbm_ensure_valid - Enforce validity on provided CBM
3434 * @_val:	Candidate CBM
3435 * @r:		RDT resource to which the CBM belongs
3436 *
3437 * The provided CBM represents all cache portions available for use. This
3438 * may be represented by a bitmap that does not consist of contiguous ones
3439 * and thus be an invalid CBM.
3440 * Here the provided CBM is forced to be a valid CBM by only considering
3441 * the first set of contiguous bits as valid and clearing all bits.
3442 * The intention here is to provide a valid default CBM with which a new
3443 * resource group is initialized. The user can follow this with a
3444 * modification to the CBM if the default does not satisfy the
3445 * requirements.
3446 */
3447static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
3448{
3449	unsigned int cbm_len = r->cache.cbm_len;
3450	unsigned long first_bit, zero_bit;
3451	unsigned long val;
3452
3453	if (!_val || r->cache.arch_has_sparse_bitmasks)
3454		return _val;
3455
3456	val = _val;
3457	first_bit = find_first_bit(&val, cbm_len);
3458	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
3459
3460	/* Clear any remaining bits to ensure contiguous region */
3461	bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
3462	return (u32)val;
3463}
3464
3465/*
3466 * Initialize cache resources per RDT domain
3467 *
3468 * Set the RDT domain up to start off with all usable allocations. That is,
3469 * all shareable and unused bits. All-zero CBM is invalid.
3470 */
3471static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
3472				 u32 closid)
3473{
3474	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
3475	enum resctrl_conf_type t = s->conf_type;
3476	struct resctrl_staged_config *cfg;
3477	struct rdt_resource *r = s->res;
3478	u32 used_b = 0, unused_b = 0;
3479	unsigned long tmp_cbm;
3480	enum rdtgrp_mode mode;
3481	u32 peer_ctl, ctrl_val;
3482	int i;
3483
3484	cfg = &d->staged_config[t];
3485	cfg->have_new_ctrl = false;
3486	cfg->new_ctrl = r->cache.shareable_bits;
3487	used_b = r->cache.shareable_bits;
3488	for (i = 0; i < closids_supported(); i++) {
3489		if (closid_allocated(i) && i != closid) {
3490			mode = rdtgroup_mode_by_closid(i);
3491			if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
3492				/*
3493				 * ctrl values for locksetup aren't relevant
3494				 * until the schemata is written, and the mode
3495				 * becomes RDT_MODE_PSEUDO_LOCKED.
3496				 */
3497				continue;
3498			/*
3499			 * If CDP is active include peer domain's
3500			 * usage to ensure there is no overlap
3501			 * with an exclusive group.
3502			 */
3503			if (resctrl_arch_get_cdp_enabled(r->rid))
3504				peer_ctl = resctrl_arch_get_config(r, d, i,
3505								   peer_type);
3506			else
3507				peer_ctl = 0;
3508			ctrl_val = resctrl_arch_get_config(r, d, i,
3509							   s->conf_type);
3510			used_b |= ctrl_val | peer_ctl;
3511			if (mode == RDT_MODE_SHAREABLE)
3512				cfg->new_ctrl |= ctrl_val | peer_ctl;
3513		}
3514	}
3515	if (d->plr && d->plr->cbm > 0)
3516		used_b |= d->plr->cbm;
3517	unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
3518	unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
3519	cfg->new_ctrl |= unused_b;
3520	/*
3521	 * Force the initial CBM to be valid, user can
3522	 * modify the CBM based on system availability.
3523	 */
3524	cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
3525	/*
3526	 * Assign the u32 CBM to an unsigned long to ensure that
3527	 * bitmap_weight() does not access out-of-bound memory.
3528	 */
3529	tmp_cbm = cfg->new_ctrl;
3530	if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
3531		rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
3532		return -ENOSPC;
3533	}
3534	cfg->have_new_ctrl = true;
3535
3536	return 0;
3537}
3538
3539/*
3540 * Initialize cache resources with default values.
3541 *
3542 * A new RDT group is being created on an allocation capable (CAT)
3543 * supporting system. Set this group up to start off with all usable
3544 * allocations.
3545 *
3546 * If there are no more shareable bits available on any domain then
3547 * the entire allocation will fail.
3548 */
3549int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
3550{
3551	struct rdt_ctrl_domain *d;
3552	int ret;
3553
3554	list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
3555		ret = __init_one_rdt_domain(d, s, closid);
3556		if (ret < 0)
3557			return ret;
3558	}
3559
3560	return 0;
3561}
3562
3563/* Initialize MBA resource with default values. */
3564static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
3565{
3566	struct resctrl_staged_config *cfg;
3567	struct rdt_ctrl_domain *d;
3568
3569	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
3570		if (is_mba_sc(r)) {
3571			d->mbps_val[closid] = MBA_MAX_MBPS;
3572			continue;
3573		}
3574
3575		cfg = &d->staged_config[CDP_NONE];
3576		cfg->new_ctrl = resctrl_get_default_ctrl(r);
3577		cfg->have_new_ctrl = true;
3578	}
3579}
3580
3581/* Initialize the RDT group's allocations. */
3582static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
3583{
3584	struct resctrl_schema *s;
3585	struct rdt_resource *r;
3586	int ret = 0;
3587
3588	rdt_staged_configs_clear();
3589
3590	list_for_each_entry(s, &resctrl_schema_all, list) {
3591		r = s->res;
3592		if (r->rid == RDT_RESOURCE_MBA ||
3593		    r->rid == RDT_RESOURCE_SMBA) {
3594			rdtgroup_init_mba(r, rdtgrp->closid);
3595			if (is_mba_sc(r))
3596				continue;
3597		} else {
3598			ret = rdtgroup_init_cat(s, rdtgrp->closid);
3599			if (ret < 0)
3600				goto out;
3601		}
3602
3603		ret = resctrl_arch_update_domains(r, rdtgrp->closid);
3604		if (ret < 0) {
3605			rdt_last_cmd_puts("Failed to initialize allocations\n");
3606			goto out;
3607		}
3608	}
3609
3610	rdtgrp->mode = RDT_MODE_SHAREABLE;
3611
3612out:
3613	rdt_staged_configs_clear();
3614	return ret;
3615}
3616
3617static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
3618{
3619	int ret;
3620
3621	if (!resctrl_arch_mon_capable())
3622		return 0;
3623
3624	ret = alloc_rmid(rdtgrp->closid);
3625	if (ret < 0) {
3626		rdt_last_cmd_puts("Out of RMIDs\n");
3627		return ret;
3628	}
3629	rdtgrp->mon.rmid = ret;
3630
3631	rdtgroup_assign_cntrs(rdtgrp);
3632
3633	ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
3634	if (ret) {
3635		rdt_last_cmd_puts("kernfs subdir error\n");
3636		rdtgroup_unassign_cntrs(rdtgrp);
3637		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3638		return ret;
3639	}
3640
3641	return 0;
3642}
3643
3644static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
3645{
3646	if (resctrl_arch_mon_capable()) {
3647		rdtgroup_unassign_cntrs(rgrp);
3648		free_rmid(rgrp->closid, rgrp->mon.rmid);
3649	}
3650}
3651
3652/*
3653 * We allow creating mon groups only with in a directory called "mon_groups"
3654 * which is present in every ctrl_mon group. Check if this is a valid
3655 * "mon_groups" directory.
3656 *
3657 * 1. The directory should be named "mon_groups".
3658 * 2. The mon group itself should "not" be named "mon_groups".
3659 *   This makes sure "mon_groups" directory always has a ctrl_mon group
3660 *   as parent.
3661 */
3662static bool is_mon_groups(struct kernfs_node *kn, const char *name)
3663{
3664	return (!strcmp(rdt_kn_name(kn), "mon_groups") &&
3665		strcmp(name, "mon_groups"));
3666}
3667
3668static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
3669			     const char *name, umode_t mode,
3670			     enum rdt_group_type rtype, struct rdtgroup **r)
3671{
3672	struct rdtgroup *prdtgrp, *rdtgrp;
3673	unsigned long files = 0;
3674	struct kernfs_node *kn;
3675	int ret;
3676
3677	prdtgrp = rdtgroup_kn_lock_live(parent_kn);
3678	if (!prdtgrp) {
3679		ret = -ENODEV;
3680		goto out_unlock;
3681	}
3682
3683	rdt_last_cmd_clear();
3684
3685	/*
3686	 * Check that the parent directory for a monitor group is a "mon_groups"
3687	 * directory.
3688	 */
3689	if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) {
3690		ret = -EPERM;
3691		goto out_unlock;
3692	}
3693
3694	if (rtype == RDTMON_GROUP &&
3695	    (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3696	     prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
3697		ret = -EINVAL;
3698		rdt_last_cmd_puts("Pseudo-locking in progress\n");
3699		goto out_unlock;
3700	}
3701
3702	/* allocate the rdtgroup. */
3703	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
3704	if (!rdtgrp) {
3705		ret = -ENOSPC;
3706		rdt_last_cmd_puts("Kernel out of memory\n");
3707		goto out_unlock;
3708	}
3709	*r = rdtgrp;
3710	rdtgrp->mon.parent = prdtgrp;
3711	rdtgrp->type = rtype;
3712	INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
3713
3714	/* kernfs creates the directory for rdtgrp */
3715	kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
3716	if (IS_ERR(kn)) {
3717		ret = PTR_ERR(kn);
3718		rdt_last_cmd_puts("kernfs create error\n");
3719		goto out_free_rgrp;
3720	}
3721	rdtgrp->kn = kn;
3722
3723	/*
3724	 * kernfs_remove() will drop the reference count on "kn" which
3725	 * will free it. But we still need it to stick around for the
3726	 * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
3727	 * which will be dropped by kernfs_put() in rdtgroup_remove().
3728	 */
3729	kernfs_get(kn);
3730
3731	ret = rdtgroup_kn_set_ugid(kn);
3732	if (ret) {
3733		rdt_last_cmd_puts("kernfs perm error\n");
3734		goto out_destroy;
3735	}
3736
3737	if (rtype == RDTCTRL_GROUP) {
3738		files = RFTYPE_BASE | RFTYPE_CTRL;
3739		if (resctrl_arch_mon_capable())
3740			files |= RFTYPE_MON;
3741	} else {
3742		files = RFTYPE_BASE | RFTYPE_MON;
3743	}
3744
3745	ret = rdtgroup_add_files(kn, files);
3746	if (ret) {
3747		rdt_last_cmd_puts("kernfs fill error\n");
3748		goto out_destroy;
3749	}
3750
3751	/*
3752	 * The caller unlocks the parent_kn upon success.
3753	 */
3754	return 0;
3755
3756out_destroy:
3757	kernfs_put(rdtgrp->kn);
3758	kernfs_remove(rdtgrp->kn);
3759out_free_rgrp:
3760	kfree(rdtgrp);
3761out_unlock:
3762	rdtgroup_kn_unlock(parent_kn);
3763	return ret;
3764}
3765
3766static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
3767{
3768	kernfs_remove(rgrp->kn);
3769	rdtgroup_remove(rgrp);
3770}
3771
3772/*
3773 * Create a monitor group under "mon_groups" directory of a control
3774 * and monitor group(ctrl_mon). This is a resource group
3775 * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
3776 */
3777static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
3778			      const char *name, umode_t mode)
3779{
3780	struct rdtgroup *rdtgrp, *prgrp;
3781	int ret;
3782
3783	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
3784	if (ret)
3785		return ret;
3786
3787	prgrp = rdtgrp->mon.parent;
3788	rdtgrp->closid = prgrp->closid;
3789
3790	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
3791	if (ret) {
3792		mkdir_rdt_prepare_clean(rdtgrp);
3793		goto out_unlock;
3794	}
3795
3796	kernfs_activate(rdtgrp->kn);
3797
3798	/*
3799	 * Add the rdtgrp to the list of rdtgrps the parent
3800	 * ctrl_mon group has to track.
3801	 */
3802	list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
3803
3804out_unlock:
3805	rdtgroup_kn_unlock(parent_kn);
3806	return ret;
3807}
3808
3809/*
3810 * These are rdtgroups created under the root directory. Can be used
3811 * to allocate and monitor resources.
3812 */
3813static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
3814				   const char *name, umode_t mode)
3815{
3816	struct rdtgroup *rdtgrp;
3817	struct kernfs_node *kn;
3818	u32 closid;
3819	int ret;
3820
3821	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
3822	if (ret)
3823		return ret;
3824
3825	kn = rdtgrp->kn;
3826	ret = closid_alloc();
3827	if (ret < 0) {
3828		rdt_last_cmd_puts("Out of CLOSIDs\n");
3829		goto out_common_fail;
3830	}
3831	closid = ret;
3832	ret = 0;
3833
3834	rdtgrp->closid = closid;
3835
3836	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
3837	if (ret)
3838		goto out_closid_free;
3839
3840	kernfs_activate(rdtgrp->kn);
3841
3842	ret = rdtgroup_init_alloc(rdtgrp);
3843	if (ret < 0)
3844		goto out_rmid_free;
3845
3846	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
3847
3848	if (resctrl_arch_mon_capable()) {
3849		/*
3850		 * Create an empty mon_groups directory to hold the subset
3851		 * of tasks and cpus to monitor.
3852		 */
3853		ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
3854		if (ret) {
3855			rdt_last_cmd_puts("kernfs subdir error\n");
3856			goto out_del_list;
3857		}
3858		if (is_mba_sc(NULL))
3859			rdtgrp->mba_mbps_event = mba_mbps_default_event;
3860	}
3861
3862	goto out_unlock;
3863
3864out_del_list:
3865	list_del(&rdtgrp->rdtgroup_list);
3866out_rmid_free:
3867	mkdir_rdt_prepare_rmid_free(rdtgrp);
3868out_closid_free:
3869	closid_free(closid);
3870out_common_fail:
3871	mkdir_rdt_prepare_clean(rdtgrp);
3872out_unlock:
3873	rdtgroup_kn_unlock(parent_kn);
3874	return ret;
3875}
3876
3877static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3878			  umode_t mode)
3879{
3880	/* Do not accept '\n' to avoid unparsable situation. */
3881	if (strchr(name, '\n'))
3882		return -EINVAL;
3883
3884	/*
3885	 * If the parent directory is the root directory and RDT
3886	 * allocation is supported, add a control and monitoring
3887	 * subdirectory
3888	 */
3889	if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
3890		return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
3891
3892	/* Else, attempt to add a monitoring subdirectory. */
3893	if (resctrl_arch_mon_capable())
3894		return rdtgroup_mkdir_mon(parent_kn, name, mode);
3895
3896	return -EPERM;
3897}
3898
3899static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
3900{
3901	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
3902	u32 closid, rmid;
3903	int cpu;
3904
3905	/* Give any tasks back to the parent group */
3906	rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
3907
3908	/*
3909	 * Update per cpu closid/rmid of the moved CPUs first.
3910	 * Note: the closid will not change, but the arch code still needs it.
3911	 */
3912	closid = prdtgrp->closid;
3913	rmid = prdtgrp->mon.rmid;
3914	for_each_cpu(cpu, &rdtgrp->cpu_mask)
3915		resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
3916
3917	/*
3918	 * Update the MSR on moved CPUs and CPUs which have moved
3919	 * task running on them.
3920	 */
3921	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3922	update_closid_rmid(tmpmask, NULL);
3923
3924	rdtgrp->flags = RDT_DELETED;
3925
3926	rdtgroup_unassign_cntrs(rdtgrp);
3927
3928	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3929
3930	/*
3931	 * Remove the rdtgrp from the parent ctrl_mon group's list
3932	 */
3933	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
3934	list_del(&rdtgrp->mon.crdtgrp_list);
3935
3936	kernfs_remove(rdtgrp->kn);
3937
3938	return 0;
3939}
3940
3941static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
3942{
3943	rdtgrp->flags = RDT_DELETED;
3944	list_del(&rdtgrp->rdtgroup_list);
3945
3946	kernfs_remove(rdtgrp->kn);
3947	return 0;
3948}
3949
3950static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
3951{
3952	u32 closid, rmid;
3953	int cpu;
3954
3955	/* Give any tasks back to the default group */
3956	rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
3957
3958	/* Give any CPUs back to the default group */
3959	cpumask_or(&rdtgroup_default.cpu_mask,
3960		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
3961
3962	/* Update per cpu closid and rmid of the moved CPUs first */
3963	closid = rdtgroup_default.closid;
3964	rmid = rdtgroup_default.mon.rmid;
3965	for_each_cpu(cpu, &rdtgrp->cpu_mask)
3966		resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
3967
3968	/*
3969	 * Update the MSR on moved CPUs and CPUs which have moved
3970	 * task running on them.
3971	 */
3972	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3973	update_closid_rmid(tmpmask, NULL);
3974
3975	rdtgroup_unassign_cntrs(rdtgrp);
3976
3977	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3978	closid_free(rdtgrp->closid);
3979
3980	rdtgroup_ctrl_remove(rdtgrp);
3981
3982	/*
3983	 * Free all the child monitor group rmids.
3984	 */
3985	free_all_child_rdtgrp(rdtgrp);
3986
3987	return 0;
3988}
3989
3990static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn)
3991{
3992	/*
3993	 * Valid within the RCU section it was obtained or while rdtgroup_mutex
3994	 * is held.
3995	 */
3996	return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
3997}
3998
3999static int rdtgroup_rmdir(struct kernfs_node *kn)
4000{
4001	struct kernfs_node *parent_kn;
4002	struct rdtgroup *rdtgrp;
4003	cpumask_var_t tmpmask;
4004	int ret = 0;
4005
4006	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
4007		return -ENOMEM;
4008
4009	rdtgrp = rdtgroup_kn_lock_live(kn);
4010	if (!rdtgrp) {
4011		ret = -EPERM;
4012		goto out;
4013	}
4014	parent_kn = rdt_kn_parent(kn);
4015
4016	/*
4017	 * If the rdtgroup is a ctrl_mon group and parent directory
4018	 * is the root directory, remove the ctrl_mon group.
4019	 *
4020	 * If the rdtgroup is a mon group and parent directory
4021	 * is a valid "mon_groups" directory, remove the mon group.
4022	 */
4023	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
4024	    rdtgrp != &rdtgroup_default) {
4025		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
4026		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
4027			ret = rdtgroup_ctrl_remove(rdtgrp);
4028		} else {
4029			ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
4030		}
4031	} else if (rdtgrp->type == RDTMON_GROUP &&
4032		 is_mon_groups(parent_kn, rdt_kn_name(kn))) {
4033		ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
4034	} else {
4035		ret = -EPERM;
4036	}
4037
4038out:
4039	rdtgroup_kn_unlock(kn);
4040	free_cpumask_var(tmpmask);
4041	return ret;
4042}
4043
4044/**
4045 * mongrp_reparent() - replace parent CTRL_MON group of a MON group
4046 * @rdtgrp:		the MON group whose parent should be replaced
4047 * @new_prdtgrp:	replacement parent CTRL_MON group for @rdtgrp
4048 * @cpus:		cpumask provided by the caller for use during this call
4049 *
4050 * Replaces the parent CTRL_MON group for a MON group, resulting in all member
4051 * tasks' CLOSID immediately changing to that of the new parent group.
4052 * Monitoring data for the group is unaffected by this operation.
4053 */
4054static void mongrp_reparent(struct rdtgroup *rdtgrp,
4055			    struct rdtgroup *new_prdtgrp,
4056			    cpumask_var_t cpus)
4057{
4058	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
4059
4060	WARN_ON(rdtgrp->type != RDTMON_GROUP);
4061	WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
4062
4063	/* Nothing to do when simply renaming a MON group. */
4064	if (prdtgrp == new_prdtgrp)
4065		return;
4066
4067	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
4068	list_move_tail(&rdtgrp->mon.crdtgrp_list,
4069		       &new_prdtgrp->mon.crdtgrp_list);
4070
4071	rdtgrp->mon.parent = new_prdtgrp;
4072	rdtgrp->closid = new_prdtgrp->closid;
4073
4074	/* Propagate updated closid to all tasks in this group. */
4075	rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
4076
4077	update_closid_rmid(cpus, NULL);
4078}
4079
4080static int rdtgroup_rename(struct kernfs_node *kn,
4081			   struct kernfs_node *new_parent, const char *new_name)
4082{
4083	struct kernfs_node *kn_parent;
4084	struct rdtgroup *new_prdtgrp;
4085	struct rdtgroup *rdtgrp;
4086	cpumask_var_t tmpmask;
4087	int ret;
4088
4089	rdtgrp = kernfs_to_rdtgroup(kn);
4090	new_prdtgrp = kernfs_to_rdtgroup(new_parent);
4091	if (!rdtgrp || !new_prdtgrp)
4092		return -ENOENT;
4093
4094	/* Release both kernfs active_refs before obtaining rdtgroup mutex. */
4095	rdtgroup_kn_get(rdtgrp, kn);
4096	rdtgroup_kn_get(new_prdtgrp, new_parent);
4097
4098	mutex_lock(&rdtgroup_mutex);
4099
4100	rdt_last_cmd_clear();
4101
4102	/*
4103	 * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
4104	 * either kernfs_node is a file.
4105	 */
4106	if (kernfs_type(kn) != KERNFS_DIR ||
4107	    kernfs_type(new_parent) != KERNFS_DIR) {
4108		rdt_last_cmd_puts("Source and destination must be directories");
4109		ret = -EPERM;
4110		goto out;
4111	}
4112
4113	if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
4114		ret = -ENOENT;
4115		goto out;
4116	}
4117
4118	kn_parent = rdt_kn_parent(kn);
4119	if (rdtgrp->type != RDTMON_GROUP || !kn_parent ||
4120	    !is_mon_groups(kn_parent, rdt_kn_name(kn))) {
4121		rdt_last_cmd_puts("Source must be a MON group\n");
4122		ret = -EPERM;
4123		goto out;
4124	}
4125
4126	if (!is_mon_groups(new_parent, new_name)) {
4127		rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
4128		ret = -EPERM;
4129		goto out;
4130	}
4131
4132	/*
4133	 * If the MON group is monitoring CPUs, the CPUs must be assigned to the
4134	 * current parent CTRL_MON group and therefore cannot be assigned to
4135	 * the new parent, making the move illegal.
4136	 */
4137	if (!cpumask_empty(&rdtgrp->cpu_mask) &&
4138	    rdtgrp->mon.parent != new_prdtgrp) {
4139		rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
4140		ret = -EPERM;
4141		goto out;
4142	}
4143
4144	/*
4145	 * Allocate the cpumask for use in mongrp_reparent() to avoid the
4146	 * possibility of failing to allocate it after kernfs_rename() has
4147	 * succeeded.
4148	 */
4149	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
4150		ret = -ENOMEM;
4151		goto out;
4152	}
4153
4154	/*
4155	 * Perform all input validation and allocations needed to ensure
4156	 * mongrp_reparent() will succeed before calling kernfs_rename(),
4157	 * otherwise it would be necessary to revert this call if
4158	 * mongrp_reparent() failed.
4159	 */
4160	ret = kernfs_rename(kn, new_parent, new_name);
4161	if (!ret)
4162		mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
4163
4164	free_cpumask_var(tmpmask);
4165
4166out:
4167	mutex_unlock(&rdtgroup_mutex);
4168	rdtgroup_kn_put(rdtgrp, kn);
4169	rdtgroup_kn_put(new_prdtgrp, new_parent);
4170	return ret;
4171}
4172
4173static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
4174{
4175	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
4176		seq_puts(seq, ",cdp");
4177
4178	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
4179		seq_puts(seq, ",cdpl2");
4180
4181	if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
4182		seq_puts(seq, ",mba_MBps");
4183
4184	if (resctrl_debug)
4185		seq_puts(seq, ",debug");
4186
4187	return 0;
4188}
4189
4190static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
4191	.mkdir		= rdtgroup_mkdir,
4192	.rmdir		= rdtgroup_rmdir,
4193	.rename		= rdtgroup_rename,
4194	.show_options	= rdtgroup_show_options,
4195};
4196
4197static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
4198{
4199	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
4200				      KERNFS_ROOT_CREATE_DEACTIVATED |
4201				      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
4202				      &rdtgroup_default);
4203	if (IS_ERR(rdt_root))
4204		return PTR_ERR(rdt_root);
4205
4206	ctx->kfc.root = rdt_root;
4207	rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
4208
4209	return 0;
4210}
4211
4212static void rdtgroup_destroy_root(void)
4213{
4214	lockdep_assert_held(&rdtgroup_mutex);
4215
4216	kernfs_destroy_root(rdt_root);
4217	rdtgroup_default.kn = NULL;
4218}
4219
4220static void rdtgroup_setup_default(void)
4221{
4222	mutex_lock(&rdtgroup_mutex);
4223
4224	rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
4225	rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
4226	rdtgroup_default.type = RDTCTRL_GROUP;
4227	INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
4228
4229	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
4230
4231	mutex_unlock(&rdtgroup_mutex);
4232}
4233
4234static void domain_destroy_mon_state(struct rdt_mon_domain *d)
4235{
4236	int idx;
4237
4238	kfree(d->cntr_cfg);
4239	bitmap_free(d->rmid_busy_llc);
4240	for_each_mbm_idx(idx) {
4241		kfree(d->mbm_states[idx]);
4242		d->mbm_states[idx] = NULL;
4243	}
4244}
4245
4246void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
4247{
4248	mutex_lock(&rdtgroup_mutex);
4249
4250	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
4251		mba_sc_domain_destroy(r, d);
4252
4253	mutex_unlock(&rdtgroup_mutex);
4254}
4255
4256void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
4257{
4258	mutex_lock(&rdtgroup_mutex);
4259
4260	/*
4261	 * If resctrl is mounted, remove all the
4262	 * per domain monitor data directories.
4263	 */
4264	if (resctrl_mounted && resctrl_arch_mon_capable())
4265		rmdir_mondata_subdir_allrdtgrp(r, d);
4266
4267	if (resctrl_is_mbm_enabled())
4268		cancel_delayed_work(&d->mbm_over);
4269	if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) {
4270		/*
4271		 * When a package is going down, forcefully
4272		 * decrement rmid->ebusy. There is no way to know
4273		 * that the L3 was flushed and hence may lead to
4274		 * incorrect counts in rare scenarios, but leaving
4275		 * the RMID as busy creates RMID leaks if the
4276		 * package never comes back.
4277		 */
4278		__check_limbo(d, true);
4279		cancel_delayed_work(&d->cqm_limbo);
4280	}
4281
4282	domain_destroy_mon_state(d);
4283
4284	mutex_unlock(&rdtgroup_mutex);
4285}
4286
4287/**
4288 * domain_setup_mon_state() -  Initialise domain monitoring structures.
4289 * @r:	The resource for the newly online domain.
4290 * @d:	The newly online domain.
4291 *
4292 * Allocate monitor resources that belong to this domain.
4293 * Called when the first CPU of a domain comes online, regardless of whether
4294 * the filesystem is mounted.
4295 * During boot this may be called before global allocations have been made by
4296 * resctrl_mon_resource_init().
4297 *
4298 * Returns 0 for success, or -ENOMEM.
4299 */
4300static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
4301{
4302	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
4303	size_t tsize = sizeof(*d->mbm_states[0]);
4304	enum resctrl_event_id eventid;
4305	int idx;
4306
4307	if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) {
4308		d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
4309		if (!d->rmid_busy_llc)
4310			return -ENOMEM;
4311	}
4312
4313	for_each_mbm_event_id(eventid) {
4314		if (!resctrl_is_mon_event_enabled(eventid))
4315			continue;
4316		idx = MBM_STATE_IDX(eventid);
4317		d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL);
4318		if (!d->mbm_states[idx])
4319			goto cleanup;
4320	}
4321
4322	if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) {
4323		tsize = sizeof(*d->cntr_cfg);
4324		d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL);
4325		if (!d->cntr_cfg)
4326			goto cleanup;
4327	}
4328
4329	return 0;
4330cleanup:
4331	bitmap_free(d->rmid_busy_llc);
4332	for_each_mbm_idx(idx) {
4333		kfree(d->mbm_states[idx]);
4334		d->mbm_states[idx] = NULL;
4335	}
4336
4337	return -ENOMEM;
4338}
4339
4340int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
4341{
4342	int err = 0;
4343
4344	mutex_lock(&rdtgroup_mutex);
4345
4346	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
4347		/* RDT_RESOURCE_MBA is never mon_capable */
4348		err = mba_sc_domain_allocate(r, d);
4349	}
4350
4351	mutex_unlock(&rdtgroup_mutex);
4352
4353	return err;
4354}
4355
4356int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
4357{
4358	int err;
4359
4360	mutex_lock(&rdtgroup_mutex);
4361
4362	err = domain_setup_mon_state(r, d);
4363	if (err)
4364		goto out_unlock;
4365
4366	if (resctrl_is_mbm_enabled()) {
4367		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
4368		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
4369					   RESCTRL_PICK_ANY_CPU);
4370	}
4371
4372	if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID))
4373		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
4374
4375	/*
4376	 * If the filesystem is not mounted then only the default resource group
4377	 * exists. Creation of its directories is deferred until mount time
4378	 * by rdt_get_tree() calling mkdir_mondata_all().
4379	 * If resctrl is mounted, add per domain monitor data directories.
4380	 */
4381	if (resctrl_mounted && resctrl_arch_mon_capable())
4382		mkdir_mondata_subdir_allrdtgrp(r, d);
4383
4384out_unlock:
4385	mutex_unlock(&rdtgroup_mutex);
4386
4387	return err;
4388}
4389
4390void resctrl_online_cpu(unsigned int cpu)
4391{
4392	mutex_lock(&rdtgroup_mutex);
4393	/* The CPU is set in default rdtgroup after online. */
4394	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
4395	mutex_unlock(&rdtgroup_mutex);
4396}
4397
4398static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
4399{
4400	struct rdtgroup *cr;
4401
4402	list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
4403		if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
4404			break;
4405	}
4406}
4407
4408static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
4409						      struct rdt_resource *r)
4410{
4411	struct rdt_mon_domain *d;
4412
4413	lockdep_assert_cpus_held();
4414
4415	list_for_each_entry(d, &r->mon_domains, hdr.list) {
4416		/* Find the domain that contains this CPU */
4417		if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
4418			return d;
4419	}
4420
4421	return NULL;
4422}
4423
4424void resctrl_offline_cpu(unsigned int cpu)
4425{
4426	struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
4427	struct rdt_mon_domain *d;
4428	struct rdtgroup *rdtgrp;
4429
4430	mutex_lock(&rdtgroup_mutex);
4431	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
4432		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
4433			clear_childcpus(rdtgrp, cpu);
4434			break;
4435		}
4436	}
4437
4438	if (!l3->mon_capable)
4439		goto out_unlock;
4440
4441	d = get_mon_domain_from_cpu(cpu, l3);
4442	if (d) {
4443		if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
4444			cancel_delayed_work(&d->mbm_over);
4445			mbm_setup_overflow_handler(d, 0, cpu);
4446		}
4447		if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) &&
4448		    cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
4449			cancel_delayed_work(&d->cqm_limbo);
4450			cqm_setup_limbo_handler(d, 0, cpu);
4451		}
4452	}
4453
4454out_unlock:
4455	mutex_unlock(&rdtgroup_mutex);
4456}
4457
4458/*
4459 * resctrl_init - resctrl filesystem initialization
4460 *
4461 * Setup resctrl file system including set up root, create mount point,
4462 * register resctrl filesystem, and initialize files under root directory.
4463 *
4464 * Return: 0 on success or -errno
4465 */
4466int resctrl_init(void)
4467{
4468	int ret = 0;
4469
4470	seq_buf_init(&last_cmd_status, last_cmd_status_buf,
4471		     sizeof(last_cmd_status_buf));
4472
4473	rdtgroup_setup_default();
4474
4475	thread_throttle_mode_init();
4476
4477	io_alloc_init();
4478
4479	ret = resctrl_mon_resource_init();
4480	if (ret)
4481		return ret;
4482
4483	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
4484	if (ret) {
4485		resctrl_mon_resource_exit();
4486		return ret;
4487	}
4488
4489	ret = register_filesystem(&rdt_fs_type);
4490	if (ret)
4491		goto cleanup_mountpoint;
4492
4493	/*
4494	 * Adding the resctrl debugfs directory here may not be ideal since
4495	 * it would let the resctrl debugfs directory appear on the debugfs
4496	 * filesystem before the resctrl filesystem is mounted.
4497	 * It may also be ok since that would enable debugging of RDT before
4498	 * resctrl is mounted.
4499	 * The reason why the debugfs directory is created here and not in
4500	 * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
4501	 * during the debugfs directory creation also &sb->s_type->i_mutex_key
4502	 * (the lockdep class of inode->i_rwsem). Other filesystem
4503	 * interactions (eg. SyS_getdents) have the lock ordering:
4504	 * &sb->s_type->i_mutex_key --> &mm->mmap_lock
4505	 * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
4506	 * is taken, thus creating dependency:
4507	 * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
4508	 * issues considering the other two lock dependencies.
4509	 * By creating the debugfs directory here we avoid a dependency
4510	 * that may cause deadlock (even though file operations cannot
4511	 * occur until the filesystem is mounted, but I do not know how to
4512	 * tell lockdep that).
4513	 */
4514	debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
4515
4516	return 0;
4517
4518cleanup_mountpoint:
4519	sysfs_remove_mount_point(fs_kobj, "resctrl");
4520	resctrl_mon_resource_exit();
4521
4522	return ret;
4523}
4524
4525static bool resctrl_online_domains_exist(void)
4526{
4527	struct rdt_resource *r;
4528
4529	/*
4530	 * Only walk capable resources to allow resctrl_arch_get_resource()
4531	 * to return dummy 'not capable' resources.
4532	 */
4533	for_each_alloc_capable_rdt_resource(r) {
4534		if (!list_empty(&r->ctrl_domains))
4535			return true;
4536	}
4537
4538	for_each_mon_capable_rdt_resource(r) {
4539		if (!list_empty(&r->mon_domains))
4540			return true;
4541	}
4542
4543	return false;
4544}
4545
4546/**
4547 * resctrl_exit() - Remove the resctrl filesystem and free resources.
4548 *
4549 * Called by the architecture code in response to a fatal error.
4550 * Removes resctrl files and structures from kernfs to prevent further
4551 * configuration.
4552 *
4553 * When called by the architecture code, all CPUs and resctrl domains must be
4554 * offline. This ensures the limbo and overflow handlers are not scheduled to
4555 * run, meaning the data structures they access can be freed by
4556 * resctrl_mon_resource_exit().
4557 *
4558 * After resctrl_exit() returns, the architecture code should return an
4559 * error from all resctrl_arch_ functions that can do this.
4560 * resctrl_arch_get_resource() must continue to return struct rdt_resources
4561 * with the correct rid field to ensure the filesystem can be unmounted.
4562 */
4563void resctrl_exit(void)
4564{
4565	cpus_read_lock();
4566	WARN_ON_ONCE(resctrl_online_domains_exist());
4567
4568	mutex_lock(&rdtgroup_mutex);
4569	resctrl_fs_teardown();
4570	mutex_unlock(&rdtgroup_mutex);
4571
4572	cpus_read_unlock();
4573
4574	debugfs_remove_recursive(debugfs_resctrl);
4575	debugfs_resctrl = NULL;
4576	unregister_filesystem(&rdt_fs_type);
4577
4578	/*
4579	 * Do not remove the sysfs mount point added by resctrl_init() so that
4580	 * it can be used to umount resctrl.
4581	 */
4582
4583	resctrl_mon_resource_exit();
4584}