From: Fenghua Yu <fenghua.yu(a)intel.com>
Currently when moving a task to a resource group the PQR_ASSOC MSR
is updated with the new closid and rmid in an added task callback.
If the task is running the work is run as soon as possible. If the
task is not running the work is executed later in the kernel exit path
when the kernel returns to the task again.
Updating the PQR_ASSOC MSR as soon as possible on the CPU a moved task
is running is the right thing to do. Queueing work for a task that is
not running is unnecessary (the PQR_ASSOC MSR is already updated when the
task is scheduled in) and causing system resource waste with the way in
which it is implemented: Work to update the PQR_ASSOC register is queued
every time the user writes a task id to the "tasks" file, even if the task
already belongs to the resource group. This could result in multiple pending
work items associated with a single task even if they are all identical and
even though only a single update with most recent values is needed.
Specifically, even if a task is moved between different resource groups
while it is sleeping then it is only the last move that is relevant but
yet a work item is queued during each move.
This unnecessary queueing of work items could result in significant system
resource waste, especially on tasks sleeping for a long time. For example,
as demonstrated by Shakeel Butt in [1] writing the same task id to the
"tasks" file can quickly consume significant memory. The same problem
(wasted system resources) occurs when moving a task between different
resource groups.
As pointed out by Valentin Schneider in [2] there is an additional issue with
the way in which the queueing of work is done in that the task_struct update
is currently done after the work is queued, resulting in a race with the
register update possibly done before the data needed by the update is available.
To solve these issues, the PQR_ASSOC MSR is updated in a synchronous way
right after the new closid and rmid are ready during the task movement,
only if the task is running. If a moved task is not running nothing is
done since the PQR_ASSOC MSR will be updated next time the task is scheduled.
This is the same way used to update the register when tasks are moved as
part of resource group removal.
[1] https://lore.kernel.org/lkml/CALvZod7E9zzHwenzf7objzGKsdBmVwTgEJ0nPgs0LUFU3…
[2] https://lore.kernel.org/lkml/20201123022433.17905-1-valentin.schneider@arm.…
Fixes: e02737d5b826 ("x86/intel_rdt: Add tasks files")
Reported-by: Shakeel Butt <shakeelb(a)google.com>
Reported-by: Valentin Schneider <valentin.schneider(a)arm.com>
Signed-off-by: Fenghua Yu <fenghua.yu(a)intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre(a)intel.com>
Reviewed-by: Tony Luck <tony.luck(a)intel.com>
Cc: stable(a)vger.kernel.org
---
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 123 ++++++++++---------------
1 file changed, 50 insertions(+), 73 deletions(-)
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 68db7d2dec8f..9d62f1fadcc3 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -525,6 +525,16 @@ static void rdtgroup_remove(struct rdtgroup *rdtgrp)
kfree(rdtgrp);
}
+static void _update_task_closid_rmid(void *task)
+{
+ /*
+ * If the task is still current on this CPU, update PQR_ASSOC MSR.
+ * Otherwise, the MSR is updated when the task is scheduled in.
+ */
+ if (task == current)
+ resctrl_sched_in();
+}
+
#ifdef CONFIG_SMP
/* Get the CPU if the task is on it. */
static bool task_on_cpu(struct task_struct *t, int *cpu)
@@ -552,94 +562,61 @@ static void set_task_cpumask(struct task_struct *t, struct cpumask *mask)
if (mask && task_on_cpu(t, &cpu))
cpumask_set_cpu(cpu, mask);
}
-#else
-static inline void
-set_task_cpumask(struct task_struct *t, struct cpumask *mask) { }
-#endif
-
-struct task_move_callback {
- struct callback_head work;
- struct rdtgroup *rdtgrp;
-};
-static void move_myself(struct callback_head *head)
+static void update_task_closid_rmid(struct task_struct *t)
{
- struct task_move_callback *callback;
- struct rdtgroup *rdtgrp;
-
- callback = container_of(head, struct task_move_callback, work);
- rdtgrp = callback->rdtgrp;
-
- /*
- * If resource group was deleted before this task work callback
- * was invoked, then assign the task to root group and free the
- * resource group.
- */
- if (atomic_dec_and_test(&rdtgrp->waitcount) &&
- (rdtgrp->flags & RDT_DELETED)) {
- current->closid = 0;
- current->rmid = 0;
- rdtgroup_remove(rdtgrp);
- }
+ int cpu;
- if (unlikely(current->flags & PF_EXITING))
- goto out;
+ if (task_on_cpu(t, &cpu))
+ smp_call_function_single(cpu, _update_task_closid_rmid, t, 1);
+}
- preempt_disable();
- /* update PQR_ASSOC MSR to make resource group go into effect */
- resctrl_sched_in();
- preempt_enable();
+#else
+static inline void
+set_task_cpumask(struct task_struct *t, struct cpumask *mask) { }
-out:
- kfree(callback);
+static void update_task_closid_rmid(struct task_struct *t)
+{
+ _update_task_closid_rmid(t);
}
+#endif
static int __rdtgroup_move_task(struct task_struct *tsk,
struct rdtgroup *rdtgrp)
{
- struct task_move_callback *callback;
- int ret;
-
- callback = kzalloc(sizeof(*callback), GFP_KERNEL);
- if (!callback)
- return -ENOMEM;
- callback->work.func = move_myself;
- callback->rdtgrp = rdtgrp;
-
/*
- * Take a refcount, so rdtgrp cannot be freed before the
- * callback has been invoked.
+ * Set the task's closid/rmid before the PQR_ASSOC MSR can be
+ * updated by them.
+ *
+ * For ctrl_mon groups, move both closid and rmid.
+ * For monitor groups, can move the tasks only from
+ * their parent CTRL group.
*/
- atomic_inc(&rdtgrp->waitcount);
- ret = task_work_add(tsk, &callback->work, TWA_RESUME);
- if (ret) {
- /*
- * Task is exiting. Drop the refcount and free the callback.
- * No need to check the refcount as the group cannot be
- * deleted before the write function unlocks rdtgroup_mutex.
- */
- atomic_dec(&rdtgrp->waitcount);
- kfree(callback);
- rdt_last_cmd_puts("Task exited\n");
- } else {
- /*
- * For ctrl_mon groups move both closid and rmid.
- * For monitor groups, can move the tasks only from
- * their parent CTRL group.
- */
- if (rdtgrp->type == RDTCTRL_GROUP) {
- tsk->closid = rdtgrp->closid;
+
+ if (rdtgrp->type == RDTCTRL_GROUP) {
+ tsk->closid = rdtgrp->closid;
+ tsk->rmid = rdtgrp->mon.rmid;
+ } else if (rdtgrp->type == RDTMON_GROUP) {
+ if (rdtgrp->mon.parent->closid == tsk->closid) {
tsk->rmid = rdtgrp->mon.rmid;
- } else if (rdtgrp->type == RDTMON_GROUP) {
- if (rdtgrp->mon.parent->closid == tsk->closid) {
- tsk->rmid = rdtgrp->mon.rmid;
- } else {
- rdt_last_cmd_puts("Can't move task to different control group\n");
- ret = -EINVAL;
- }
+ } else {
+ rdt_last_cmd_puts("Can't move task to different control group\n");
+ return -EINVAL;
}
+ } else {
+ rdt_last_cmd_puts("Invalid resource group type\n");
+ return -EINVAL;
}
- return ret;
+
+ /*
+ * By now, the task's closid and rmid are set. If the task is current
+ * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
+ * group go into effect. If the task is not current, the MSR will be
+ * updated when the task is scheduled in.
+ */
+ update_task_closid_rmid(tsk);
+
+ return 0;
}
static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
--
2.26.2
When cpu is going offline, set q->offline as true
and interrupt happened. The interrupt may call the
quarantine_put. But quarantine_put do not free the
the object. The object will cause memory leak.
Add qlink_free() to free the object.
Kuan-Ying Lee (1):
kasan: fix memory leak of kasan quarantine
mm/kasan/quarantine.c | 1 +
1 file changed, 1 insertion(+)
--
2.18.0
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
It was believed that metag was the only architecture that required the ring
buffer to keep 8 byte words aligned on 8 byte architectures, and with its
removal, it was assumed that the ring buffer code did not need to handle
this case. It appears that sparc64 also requires this.
The following was reported on a sparc64 boot up:
kernel: futex hash table entries: 65536 (order: 9, 4194304 bytes, linear)
kernel: Running postponed tracer tests:
kernel: Testing tracer function:
kernel: Kernel unaligned access at TPC[552a20] trace_function+0x40/0x140
kernel: Kernel unaligned access at TPC[552a24] trace_function+0x44/0x140
kernel: Kernel unaligned access at TPC[552a20] trace_function+0x40/0x140
kernel: Kernel unaligned access at TPC[552a24] trace_function+0x44/0x140
kernel: Kernel unaligned access at TPC[552a20] trace_function+0x40/0x140
kernel: PASSED
Need to put back the 64BIT aligned code for the ring buffer.
Link: https://lore.kernel.org/r/CADxRZqzXQRYgKc=y-KV=S_yHL+Y8Ay2mh5ezeZUnpRvg+syW…
Cc: stable(a)vger.kernel.org
Fixes: 86b3de60a0b6 ("ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS")
Reported-by: Anatoly Pugachev <matorola(a)gmail.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
arch/Kconfig | 16 ++++++++++++++++
kernel/trace/ring_buffer.c | 17 +++++++++++++----
2 files changed, 29 insertions(+), 4 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig
index 56b6ccc0e32d..fa716994f77e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -143,6 +143,22 @@ config UPROBES
managed by the kernel and kept transparent to the probed
application. )
+config HAVE_64BIT_ALIGNED_ACCESS
+ def_bool 64BIT && !HAVE_EFFICIENT_UNALIGNED_ACCESS
+ help
+ Some architectures require 64 bit accesses to be 64 bit
+ aligned, which also requires structs containing 64 bit values
+ to be 64 bit aligned too. This includes some 32 bit
+ architectures which can do 64 bit accesses, as well as 64 bit
+ architectures without unaligned access.
+
+ This symbol should be selected by an architecture if 64 bit
+ accesses are required to be 64 bit aligned in this way even
+ though it is not a 64 bit architecture.
+
+ See Documentation/unaligned-memory-access.txt for more
+ information on the topic of unaligned memory accesses.
+
config HAVE_EFFICIENT_UNALIGNED_ACCESS
bool
help
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e03bc4e5d482..926845eb5ab5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -130,7 +130,16 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT)
+
+#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
+# define RB_FORCE_8BYTE_ALIGNMENT 0
+# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
+#else
+# define RB_FORCE_8BYTE_ALIGNMENT 1
+# define RB_ARCH_ALIGNMENT 8U
+#endif
+
+#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -2718,7 +2727,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA) {
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
event->type_len = 0;
event->array[0] = length;
} else
@@ -2733,11 +2742,11 @@ static unsigned rb_calculate_event_length(unsigned length)
if (!length)
length++;
- if (length > RB_MAX_SMALL_DATA)
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
length += sizeof(event.array[0]);
length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ALIGNMENT);
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
/*
* In case the time delta is larger than the 27 bits for it
--
2.29.2
The vfio_ap device driver registers a group notifier with VFIO when the
file descriptor for a VFIO mediated device for a KVM guest is opened to
receive notification that the KVM pointer is set (VFIO_GROUP_NOTIFY_SET_KVM
event). When the KVM pointer is set, the vfio_ap driver takes the
following actions:
1. Stashes the KVM pointer in the vfio_ap_mdev struct that holds the state
of the mediated device.
2. Calls the kvm_get_kvm() function to increment its reference counter.
3. Sets the function pointer to the function that handles interception of
the instruction that enables/disables interrupt processing.
4. Sets the masks in the KVM guest's CRYCB to pass AP resources through to
the guest.
In order to avoid memory leaks, when the notifier is called to receive
notification that the KVM pointer has been set to NULL, the vfio_ap device
driver should reverse the actions taken when the KVM pointer was set.
Fixes: 258287c994de ("s390: vfio-ap: implement mediated device open callback")
Signed-off-by: Tony Krowiak <akrowiak(a)linux.ibm.com>
---
drivers/s390/crypto/vfio_ap_ops.c | 29 ++++++++++++++++++++---------
1 file changed, 20 insertions(+), 9 deletions(-)
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index e0bde8518745..cd22e85588e1 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -1037,8 +1037,6 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
{
struct ap_matrix_mdev *m;
- mutex_lock(&matrix_dev->lock);
-
list_for_each_entry(m, &matrix_dev->mdev_list, node) {
if ((m != matrix_mdev) && (m->kvm == kvm)) {
mutex_unlock(&matrix_dev->lock);
@@ -1049,7 +1047,6 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
matrix_mdev->kvm = kvm;
kvm_get_kvm(kvm);
kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook;
- mutex_unlock(&matrix_dev->lock);
return 0;
}
@@ -1083,35 +1080,49 @@ static int vfio_ap_mdev_iommu_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
}
+static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev)
+{
+ kvm_arch_crypto_clear_masks(matrix_mdev->kvm);
+ matrix_mdev->kvm->arch.crypto.pqap_hook = NULL;
+ vfio_ap_mdev_reset_queues(matrix_mdev->mdev);
+ kvm_put_kvm(matrix_mdev->kvm);
+ matrix_mdev->kvm = NULL;
+}
+
static int vfio_ap_mdev_group_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
- int ret;
+ int ret, notify_rc = NOTIFY_DONE;
struct ap_matrix_mdev *matrix_mdev;
if (action != VFIO_GROUP_NOTIFY_SET_KVM)
return NOTIFY_OK;
matrix_mdev = container_of(nb, struct ap_matrix_mdev, group_notifier);
+ mutex_lock(&matrix_dev->lock);
if (!data) {
- matrix_mdev->kvm = NULL;
- return NOTIFY_OK;
+ if (matrix_mdev->kvm)
+ vfio_ap_mdev_unset_kvm(matrix_mdev);
+ notify_rc = NOTIFY_OK;
+ goto notify_done;
}
ret = vfio_ap_mdev_set_kvm(matrix_mdev, data);
if (ret)
- return NOTIFY_DONE;
+ goto notify_done;
/* If there is no CRYCB pointer, then we can't copy the masks */
if (!matrix_mdev->kvm->arch.crypto.crycbd)
- return NOTIFY_DONE;
+ goto notify_done;
kvm_arch_crypto_set_masks(matrix_mdev->kvm, matrix_mdev->matrix.apm,
matrix_mdev->matrix.aqm,
matrix_mdev->matrix.adm);
- return NOTIFY_OK;
+notify_done:
+ mutex_unlock(&matrix_dev->lock);
+ return notify_rc;
}
static void vfio_ap_irq_disable_apqn(int apqn)
--
2.21.1