Hi Sasha,
my patch should not go into the stable branches. Under certain
circumstances it triggered kernel crashes.
Consequentially this patch has been reverted in the main
development branch:
8eef5c3cea65 Revert "igc: fix a log entry using uninitialized netdev"
So I suggest to remove my patch 86167183a17e from the stable branches or
apply 8eef5c3cea65 as well.
Sorry and thanks,
Corinna
On Jul 5 15:29, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> igc: fix a log entry using uninitialized netdev
>
> to the 6.9-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> igc-fix-a-log-entry-using-uninitialized-netdev.patch
> and it can be found in the queue-6.9 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
>
>
> commit ee112b3c8929ec718b444134db87e1c585eb7d70
> Author: Corinna Vinschen <vinschen(a)redhat.com>
> Date: Tue Apr 23 12:24:54 2024 +0200
>
> igc: fix a log entry using uninitialized netdev
>
> [ Upstream commit 86167183a17e03ec77198897975e9fdfbd53cb0b ]
>
> During successful probe, igc logs this:
>
> [ 5.133667] igc 0000:01:00.0 (unnamed net_device) (uninitialized): PHC added
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> The reason is that igc_ptp_init() is called very early, even before
> register_netdev() has been called. So the netdev_info() call works
> on a partially uninitialized netdev.
>
> Fix this by calling igc_ptp_init() after register_netdev(), right
> after the media autosense check, just as in igb. Add a comment,
> just as in igb.
>
> Now the log message is fine:
>
> [ 5.200987] igc 0000:01:00.0 eth0: PHC added
>
> Signed-off-by: Corinna Vinschen <vinschen(a)redhat.com>
> Reviewed-by: Hariprasad Kelam <hkelam(a)marvell.com>
> Acked-by: Vinicius Costa Gomes <vinicius.gomes(a)intel.com>
> Tested-by: Naama Meir <naamax.meir(a)linux.intel.com>
> Signed-off-by: Tony Nguyen <anthony.l.nguyen(a)intel.com>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
> index 58bc96021bb4c..07feb951be749 100644
> --- a/drivers/net/ethernet/intel/igc/igc_main.c
> +++ b/drivers/net/ethernet/intel/igc/igc_main.c
> @@ -6932,8 +6932,6 @@ static int igc_probe(struct pci_dev *pdev,
> device_set_wakeup_enable(&adapter->pdev->dev,
> adapter->flags & IGC_FLAG_WOL_SUPPORTED);
>
> - igc_ptp_init(adapter);
> -
> igc_tsn_clear_schedule(adapter);
>
> /* reset the hardware with the new settings */
> @@ -6955,6 +6953,9 @@ static int igc_probe(struct pci_dev *pdev,
> /* Check if Media Autosense is enabled */
> adapter->ei = *ei;
>
> + /* do hw tstamp init after resetting */
> + igc_ptp_init(adapter);
> +
> /* print pcie link status and MAC address */
> pcie_print_link_status(pdev);
> netdev_info(netdev, "MAC: %pM\n", netdev->dev_addr);
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 68cbd415dd4b9c5b9df69f0f091879e56bf5907a
Gitweb: https://git.kernel.org/tip/68cbd415dd4b9c5b9df69f0f091879e56bf5907a
Author: Frederic Weisbecker <frederic(a)kernel.org>
AuthorDate: Fri, 21 Jun 2024 11:15:58 +02:00
Committer: Peter Zijlstra <peterz(a)infradead.org>
CommitterDate: Tue, 09 Jul 2024 13:26:31 +02:00
task_work: s/task_work_cancel()/task_work_cancel_func()/
A proper task_work_cancel() API that actually cancels a callback and not
*any* callback pointing to a given function is going to be needed for
perf events event freeing. Do the appropriate rename to prepare for
that.
Signed-off-by: Frederic Weisbecker <frederic(a)kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240621091601.18227-2-frederic@kernel.org
---
include/linux/task_work.h | 2 +-
kernel/irq/manage.c | 2 +-
kernel/task_work.c | 10 +++++-----
security/keys/keyctl.c | 2 +-
4 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index 795ef5a..23ab01a 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -30,7 +30,7 @@ int task_work_add(struct task_struct *task, struct callback_head *twork,
struct callback_head *task_work_cancel_match(struct task_struct *task,
bool (*match)(struct callback_head *, void *data), void *data);
-struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
+struct callback_head *task_work_cancel_func(struct task_struct *, task_work_func_t);
void task_work_run(void);
static inline void exit_task_work(struct task_struct *task)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 71b0fc2..dd53298 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1337,7 +1337,7 @@ static int irq_thread(void *data)
* synchronize_hardirq(). So neither IRQTF_RUNTHREAD nor the
* oneshot mask bit can be set.
*/
- task_work_cancel(current, irq_thread_dtor);
+ task_work_cancel_func(current, irq_thread_dtor);
return 0;
}
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 95a7e1b..54ac240 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -120,9 +120,9 @@ static bool task_work_func_match(struct callback_head *cb, void *data)
}
/**
- * task_work_cancel - cancel a pending work added by task_work_add()
- * @task: the task which should execute the work
- * @func: identifies the work to remove
+ * task_work_cancel_func - cancel a pending work matching a function added by task_work_add()
+ * @task: the task which should execute the func's work
+ * @func: identifies the func to match with a work to remove
*
* Find the last queued pending work with ->func == @func and remove
* it from queue.
@@ -131,7 +131,7 @@ static bool task_work_func_match(struct callback_head *cb, void *data)
* The found work or NULL if not found.
*/
struct callback_head *
-task_work_cancel(struct task_struct *task, task_work_func_t func)
+task_work_cancel_func(struct task_struct *task, task_work_func_t func)
{
return task_work_cancel_match(task, task_work_func_match, func);
}
@@ -168,7 +168,7 @@ void task_work_run(void)
if (!work)
break;
/*
- * Synchronize with task_work_cancel(). It can not remove
+ * Synchronize with task_work_cancel_match(). It can not remove
* the first entry == work, cmpxchg(task_works) must fail.
* But it can remove another entry from the ->next list.
*/
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 4bc3e93..ab927a1 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1694,7 +1694,7 @@ long keyctl_session_to_parent(void)
goto unlock;
/* cancel an already pending keyring replacement */
- oldwork = task_work_cancel(parent, key_change_session_keyring);
+ oldwork = task_work_cancel_func(parent, key_change_session_keyring);
/* the replacement session keyring is applied just prior to userspace
* restarting */
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 2fd5ad3f310de22836cdacae919dd99d758a1f1b
Gitweb: https://git.kernel.org/tip/2fd5ad3f310de22836cdacae919dd99d758a1f1b
Author: Frederic Weisbecker <frederic(a)kernel.org>
AuthorDate: Fri, 21 Jun 2024 11:16:00 +02:00
Committer: Peter Zijlstra <peterz(a)infradead.org>
CommitterDate: Tue, 09 Jul 2024 13:26:33 +02:00
perf: Fix event leak upon exit
When a task is scheduled out, pending sigtrap deliveries are deferred
to the target task upon resume to userspace via task_work.
However failures while adding an event's callback to the task_work
engine are ignored. And since the last call for events exit happen
after task work is eventually closed, there is a small window during
which pending sigtrap can be queued though ignored, leaking the event
refcount addition such as in the following scenario:
TASK A
-----
do_exit()
exit_task_work(tsk);
<IRQ>
perf_event_overflow()
event->pending_sigtrap = pending_id;
irq_work_queue(&event->pending_irq);
</IRQ>
=========> PREEMPTION: TASK A -> TASK B
event_sched_out()
event->pending_sigtrap = 0;
atomic_long_inc_not_zero(&event->refcount)
// FAILS: task work has exited
task_work_add(&event->pending_task)
[...]
<IRQ WORK>
perf_pending_irq()
// early return: event->oncpu = -1
</IRQ WORK>
[...]
=========> TASK B -> TASK A
perf_event_exit_task(tsk)
perf_event_exit_event()
free_event()
WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1)
// leak event due to unexpected refcount == 2
As a result the event is never released while the task exits.
Fix this with appropriate task_work_add()'s error handling.
Fixes: 517e6a301f34 ("perf: Fix perf_pending_task() UaF")
Signed-off-by: Frederic Weisbecker <frederic(a)kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240621091601.18227-4-frederic@kernel.org
---
kernel/events/core.c | 13 +++++--------
1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 51ce436..576400d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2284,18 +2284,15 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
}
if (event->pending_sigtrap) {
- bool dec = true;
-
event->pending_sigtrap = 0;
if (state != PERF_EVENT_STATE_OFF &&
- !event->pending_work) {
- event->pending_work = 1;
- dec = false;
+ !event->pending_work &&
+ !task_work_add(current, &event->pending_task, TWA_RESUME)) {
WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
- task_work_add(current, &event->pending_task, TWA_RESUME);
- }
- if (dec)
+ event->pending_work = 1;
+ } else {
local_dec(&event->ctx->nr_pending);
+ }
}
perf_event_set_state(event, state);
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 3a5465418f5fd970e86a86c7f4075be262682840
Gitweb: https://git.kernel.org/tip/3a5465418f5fd970e86a86c7f4075be262682840
Author: Frederic Weisbecker <frederic(a)kernel.org>
AuthorDate: Fri, 21 Jun 2024 11:16:01 +02:00
Committer: Peter Zijlstra <peterz(a)infradead.org>
CommitterDate: Tue, 09 Jul 2024 13:26:33 +02:00
perf: Fix event leak upon exec and file release
The perf pending task work is never waited upon the matching event
release. In the case of a child event, released via free_event()
directly, this can potentially result in a leaked event, such as in the
following scenario that doesn't even require a weak IRQ work
implementation to trigger:
schedule()
prepare_task_switch()
=======> <NMI>
perf_event_overflow()
event->pending_sigtrap = ...
irq_work_queue(&event->pending_irq)
<======= </NMI>
perf_event_task_sched_out()
event_sched_out()
event->pending_sigtrap = 0;
atomic_long_inc_not_zero(&event->refcount)
task_work_add(&event->pending_task)
finish_lock_switch()
=======> <IRQ>
perf_pending_irq()
//do nothing, rely on pending task work
<======= </IRQ>
begin_new_exec()
perf_event_exit_task()
perf_event_exit_event()
// If is child event
free_event()
WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1)
// event is leaked
Similar scenarios can also happen with perf_event_remove_on_exec() or
simply against concurrent perf_event_release().
Fix this with synchonizing against the possibly remaining pending task
work while freeing the event, just like is done with remaining pending
IRQ work. This means that the pending task callback neither need nor
should hold a reference to the event, preventing it from ever beeing
freed.
Fixes: 517e6a301f34 ("perf: Fix perf_pending_task() UaF")
Signed-off-by: Frederic Weisbecker <frederic(a)kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20240621091601.18227-5-frederic@kernel.org
---
include/linux/perf_event.h | 1 +-
kernel/events/core.c | 38 +++++++++++++++++++++++++++++++++----
2 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a5304ae..393fb13 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -786,6 +786,7 @@ struct perf_event {
struct irq_work pending_irq;
struct callback_head pending_task;
unsigned int pending_work;
+ struct rcuwait pending_work_wait;
atomic_t event_limit;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 576400d..32c7996 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2288,7 +2288,6 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
if (state != PERF_EVENT_STATE_OFF &&
!event->pending_work &&
!task_work_add(current, &event->pending_task, TWA_RESUME)) {
- WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
event->pending_work = 1;
} else {
local_dec(&event->ctx->nr_pending);
@@ -5203,9 +5202,35 @@ static bool exclusive_event_installable(struct perf_event *event,
static void perf_addr_filters_splice(struct perf_event *event,
struct list_head *head);
+static void perf_pending_task_sync(struct perf_event *event)
+{
+ struct callback_head *head = &event->pending_task;
+
+ if (!event->pending_work)
+ return;
+ /*
+ * If the task is queued to the current task's queue, we
+ * obviously can't wait for it to complete. Simply cancel it.
+ */
+ if (task_work_cancel(current, head)) {
+ event->pending_work = 0;
+ local_dec(&event->ctx->nr_pending);
+ return;
+ }
+
+ /*
+ * All accesses related to the event are within the same
+ * non-preemptible section in perf_pending_task(). The RCU
+ * grace period before the event is freed will make sure all
+ * those accesses are complete by then.
+ */
+ rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
+}
+
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending_irq);
+ perf_pending_task_sync(event);
unaccount_event(event);
@@ -6818,23 +6843,27 @@ static void perf_pending_task(struct callback_head *head)
int rctx;
/*
+ * All accesses to the event must belong to the same implicit RCU read-side
+ * critical section as the ->pending_work reset. See comment in
+ * perf_pending_task_sync().
+ */
+ preempt_disable_notrace();
+ /*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
- preempt_disable_notrace();
rctx = perf_swevent_get_recursion_context();
if (event->pending_work) {
event->pending_work = 0;
perf_sigtrap(event);
local_dec(&event->ctx->nr_pending);
+ rcuwait_wake_up(&event->pending_work_wait);
}
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
preempt_enable_notrace();
-
- put_event(event);
}
#ifdef CONFIG_GUEST_PERF_EVENTS
@@ -11948,6 +11977,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending_irq, perf_pending_irq);
init_task_work(&event->pending_task, perf_pending_task);
+ rcuwait_init(&event->pending_work_wait);
mutex_init(&event->mmap_mutex);
raw_spin_lock_init(&event->addr_filters.lock);
From: Kan Liang <kan.liang(a)linux.intel.com>
A non-0 retire latency can be observed on a Raptorlake which doesn't
support the retire latency feature.
By design, the retire latency shares the PERF_SAMPLE_WEIGHT_STRUCT
sample type with other types of latency. That could avoid adding too
many different sample types to support all kinds of latency. For the
machine which doesn't support some kind of latency, 0 should be
returned.
Perf doesn’t clear/init all the fields of a sample data for the sake
of performance. It expects the later perf_{prepare,output}_sample() to
update the uninitialized field. However, the current implementation
doesn't touch the field of the retire latency if the feature is not
supported. The memory garbage is dumped into the perf data.
Clear the retire latency if the feature is not supported.
Fixes: c87a31093c70 ("perf/x86: Support Retire Latency")
Reported-by: "Bayduraev, Alexey V" <alexey.v.bayduraev(a)intel.com>
Tested-by: "Bayduraev, Alexey V" <alexey.v.bayduraev(a)intel.com>
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
---
arch/x86/events/intel/ds.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index b9cc520b2942..fa5ea65de0d0 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1944,8 +1944,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
set_linear_ip(regs, basic->ip);
regs->flags = PERF_EFLAGS_EXACT;
- if ((sample_type & PERF_SAMPLE_WEIGHT_STRUCT) && (x86_pmu.flags & PMU_FL_RETIRE_LATENCY))
- data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
+ if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
+ if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
+ data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
+ else
+ data->weight.var3_w = 0;
+ }
/*
* The record for MEMINFO is in front of GP
--
2.38.1
In the cases where the power domain connected to logics is allowed to
transition from a level(L)-->power collapse(0)-->retention(1) or
vice versa retention(1)-->power collapse(0)-->level(L) will cause the
logic to lose the configurations. The ARC does not support retention
to collapse transition on MxC rails.
The targets from SM8450 onwards the PLL logics of clock controllers are
connected to MxC rails and the recommended configurations are carried
out during the clock controller probes. The MxC transition as mentioned
above should be skipped to ensure the PLL settings are intact across
clock controller power on & off.
On older targets that do not split MX into MxA and MxC does not collapse
the logic and it is parked always at RETENTION, thus this issue is never
observed on those targets.
Cc: stable(a)vger.kernel.org # v5.17
Reviewed-by: Bjorn Andersson <andersson(a)kernel.org>
Signed-off-by: Taniya Das <quic_tdas(a)quicinc.com>
---
[Changes in v2]: Incorporate the comments in the commit text.
---
drivers/pmdomain/qcom/rpmhpd.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/pmdomain/qcom/rpmhpd.c b/drivers/pmdomain/qcom/rpmhpd.c
index de9121ef4216..d2cb4271a1ca 100644
--- a/drivers/pmdomain/qcom/rpmhpd.c
+++ b/drivers/pmdomain/qcom/rpmhpd.c
@@ -40,6 +40,7 @@
* @addr: Resource address as looped up using resource name from
* cmd-db
* @state_synced: Indicator that sync_state has been invoked for the rpmhpd resource
+ * @skip_retention_level: Indicate that retention level should not be used for the power domain
*/
struct rpmhpd {
struct device *dev;
@@ -56,6 +57,7 @@ struct rpmhpd {
const char *res_name;
u32 addr;
bool state_synced;
+ bool skip_retention_level;
};
struct rpmhpd_desc {
@@ -173,6 +175,7 @@ static struct rpmhpd mxc = {
.pd = { .name = "mxc", },
.peer = &mxc_ao,
.res_name = "mxc.lvl",
+ .skip_retention_level = true,
};
static struct rpmhpd mxc_ao = {
@@ -180,6 +183,7 @@ static struct rpmhpd mxc_ao = {
.active_only = true,
.peer = &mxc,
.res_name = "mxc.lvl",
+ .skip_retention_level = true,
};
static struct rpmhpd nsp = {
@@ -819,6 +823,9 @@ static int rpmhpd_update_level_mapping(struct rpmhpd *rpmhpd)
return -EINVAL;
for (i = 0; i < rpmhpd->level_count; i++) {
+ if (rpmhpd->skip_retention_level && buf[i] == RPMH_REGULATOR_LEVEL_RETENTION)
+ continue;
+
rpmhpd->level[i] = buf[i];
/* Remember the first corner with non-zero level */
---
base-commit: 62c97045b8f720c2eac807a5f38e26c9ed512371
change-id: 20240625-avoid_mxc_retention-b095a761d981
Best regards,
--
Taniya Das <quic_tdas(a)quicinc.com>
commit 93aef9eda1cea9e84ab2453fcceb8addad0e46f1 upstream.
If the bitmap block that manages the inode allocation status is corrupted,
nilfs_ifile_create_inode() may allocate a new inode from the reserved
inode area where it should not be allocated.
Previous fix commit d325dc6eb763 ("nilfs2: fix use-after-free bug of
struct nilfs_root"), fixed the problem that reserved inodes with inode
numbers less than NILFS_USER_INO (=11) were incorrectly reallocated due to
bitmap corruption, but since the start number of non-reserved inodes is
read from the super block and may change, in which case inode allocation
may occur from the extended reserved inode area.
If that happens, access to that inode will cause an IO error, causing the
file system to degrade to an error state.
Fix this potential issue by adding a wraparound option to the common
metadata object allocation routine and by modifying
nilfs_ifile_create_inode() to disable the option so that it only allocates
inodes with inode numbers greater than or equal to the inode number read
in "nilfs->ns_first_ino", regardless of the bitmap status of reserved
inodes.
Link: https://lkml.kernel.org/r/20240623051135.4180-4-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Jan Kara <jack(a)suse.cz>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
Please apply this patch to the stable trees indicated by the subject
prefix instead of the patch that failed.
This patch is tailored to avoid conflicts with a series involving
extensive conversions and can be applied from v4.8 to v6.8.
Also, all the builds and tests I did on each stable tree passed.
Thanks,
Ryusuke Konishi
fs/nilfs2/alloc.c | 18 ++++++++++++++----
fs/nilfs2/alloc.h | 4 ++--
fs/nilfs2/dat.c | 2 +-
fs/nilfs2/ifile.c | 7 ++-----
4 files changed, 19 insertions(+), 12 deletions(-)
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 7342de296ec3..25881bdd212b 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -377,11 +377,12 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
* @target: offset number of an entry in the group (start point)
* @bsize: size in bits
* @lock: spin lock protecting @bitmap
+ * @wrap: whether to wrap around
*/
static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
unsigned int bsize,
- spinlock_t *lock)
+ spinlock_t *lock, bool wrap)
{
int pos, end = bsize;
@@ -397,6 +398,8 @@ static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
end = target;
}
+ if (!wrap)
+ return -ENOSPC;
/* wrap around */
for (pos = 0; pos < end; pos++) {
@@ -495,9 +498,10 @@ int nilfs_palloc_count_max_entries(struct inode *inode, u64 nused, u64 *nmaxp)
* nilfs_palloc_prepare_alloc_entry - prepare to allocate a persistent object
* @inode: inode of metadata file using this allocator
* @req: nilfs_palloc_req structure exchanged for the allocation
+ * @wrap: whether to wrap around
*/
int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
- struct nilfs_palloc_req *req)
+ struct nilfs_palloc_req *req, bool wrap)
{
struct buffer_head *desc_bh, *bitmap_bh;
struct nilfs_palloc_group_desc *desc;
@@ -516,7 +520,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
entries_per_group = nilfs_palloc_entries_per_group(inode);
for (i = 0; i < ngroups; i += n) {
- if (group >= ngroups) {
+ if (group >= ngroups && wrap) {
/* wrap around */
group = 0;
maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
@@ -541,7 +545,13 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
bitmap, group_offset,
- entries_per_group, lock);
+ entries_per_group, lock, wrap);
+ /*
+ * Since the search for a free slot in the
+ * second and subsequent bitmap blocks always
+ * starts from the beginning, the wrap flag
+ * only has an effect on the first search.
+ */
if (pos >= 0) {
/* found a free entry */
nilfs_palloc_group_desc_add_entries(
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index b667e869ac07..d825a9faca6d 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -50,8 +50,8 @@ struct nilfs_palloc_req {
struct buffer_head *pr_entry_bh;
};
-int nilfs_palloc_prepare_alloc_entry(struct inode *,
- struct nilfs_palloc_req *);
+int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
+ struct nilfs_palloc_req *req, bool wrap);
void nilfs_palloc_commit_alloc_entry(struct inode *,
struct nilfs_palloc_req *);
void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 9cf6ba58f585..351010828d88 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -75,7 +75,7 @@ int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
{
int ret;
- ret = nilfs_palloc_prepare_alloc_entry(dat, req);
+ ret = nilfs_palloc_prepare_alloc_entry(dat, req, true);
if (ret < 0)
return ret;
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index a8a4bc8490b4..ac10a62a41e9 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -55,13 +55,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
struct nilfs_palloc_req req;
int ret;
- req.pr_entry_nr = 0; /*
- * 0 says find free inode from beginning
- * of a group. dull code!!
- */
+ req.pr_entry_nr = NILFS_FIRST_INO(ifile->i_sb);
req.pr_entry_bh = NULL;
- ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
+ ret = nilfs_palloc_prepare_alloc_entry(ifile, &req, false);
if (!ret) {
ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
&req.pr_entry_bh);
--
2.43.5