Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has half updated the state. This results in a bad time value being observed.
This patch fixes that problem in a similar manner to Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC").
Note that ripping out the seqcount lock from sched_clock_register() and replacing it with a large comment is not nearly as bad as it looks! The locking here is actually pretty useless since most of the variables modified within the write lock are not covered by the read lock. As a result a big comment and the sequence bump implicit in the call to update_epoch() should work pretty much the same.
Suggested-by: Stephen Boyd sboyd@codeaurora.org Signed-off-by: Daniel Thompson daniel.thompson@linaro.org ---
Notes: This patch has only had fairly light testing at this point. However, it survives basic tests. In particular, I am running perf from FIQ/NMI and have instrumented it with some monotonicity tests, none of which have reported any problem.
kernel/time/sched_clock.c | 63 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 13 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..485d5070259c 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -27,6 +27,10 @@ struct clock_data { u32 mult; u32 shift; bool suspended; + + /* Used only temporarily whilst we are updating the primary copy */ + u64 old_epoch_ns; + u64 old_epoch_cyc; };
static struct hrtimer sched_clock_timer; @@ -67,9 +71,14 @@ unsigned long long notrace sched_clock(void) return cd.epoch_ns;
do { - seq = raw_read_seqcount_begin(&cd.seq); - epoch_cyc = cd.epoch_cyc; - epoch_ns = cd.epoch_ns; + seq = raw_read_seqcount(&cd.seq); + if (likely(0 == (seq & 1))) { + epoch_cyc = cd.epoch_cyc; + epoch_ns = cd.epoch_ns; + } else { + epoch_cyc = cd.old_epoch_cyc; + epoch_ns = cd.old_epoch_ns; + } } while (read_seqcount_retry(&cd.seq, seq));
cyc = read_sched_clock(); @@ -78,6 +87,35 @@ unsigned long long notrace sched_clock(void) }
/* + * Update the epoch without allowing sched_clock to observe + * a mismatched epoch pair even if called from NMI. + * + * We do this by maintaining an odd/even copy of the epoch data and + * steering sched_clock to one or the other using a sequence counter. + * In order to preserve the (average case) data cache profile of + * sched_clock the system reverts back to the even copy as soon as + * possible; the odd copy is used *only* during an update. + * + * The caller is responsible for avoiding simultaneous updates. + */ +static void notrace update_epoch(u64 cyc, u64 ns) +{ + /* Update the backup copy */ + cd.old_epoch_cyc = cd.epoch_cyc; + cd.old_epoch_ns = cd.epoch_ns; + + /* Force readers to use the backup (odd) copy */ + raw_write_seqcount_latch(&cd.seq); + + /* Update the primary copy */ + cd.epoch_cyc = cyc; + cd.epoch_ns = ns; + + /* Steer readers back to the primary (even) copy */ + raw_write_seqcount_latch(&cd.seq); +} + +/* * Atomically update the sched_clock epoch. */ static void notrace update_sched_clock(void) @@ -91,12 +129,7 @@ static void notrace update_sched_clock(void) cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, cd.mult, cd.shift);
- raw_local_irq_save(flags); - raw_write_seqcount_begin(&cd.seq); - cd.epoch_ns = ns; - cd.epoch_cyc = cyc; - raw_write_seqcount_end(&cd.seq); - raw_local_irq_restore(flags); + update_epoch(cyc, ns); }
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) @@ -135,16 +168,20 @@ void __init sched_clock_register(u64 (*read)(void), int bits, ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, cd.mult, cd.shift);
- raw_write_seqcount_begin(&cd.seq); + /* + * sched_clock will report a bad value if it executes + * concurrently with the following code. No locking exists to + * prevent this; we rely mostly on this function being called + * early during kernel boot up before we have lots of other + * stuff going on. + */ read_sched_clock = read; sched_clock_mask = new_mask; cd.rate = rate; cd.wrap_kt = new_wrap_kt; cd.mult = new_mult; cd.shift = new_shift; - cd.epoch_cyc = new_epoch; - cd.epoch_ns = ns; - raw_write_seqcount_end(&cd.seq); + update_epoch(new_epoch, ns);
r = rate; if (r >= 4000000) { -- 1.9.3
On Wed, Jan 21, 2015 at 8:53 AM, Daniel Thompson daniel.thompson@linaro.org wrote:
Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has half updated the state. This results in a bad time value being observed.
This patch fixes that problem in a similar manner to Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC").
Note that ripping out the seqcount lock from sched_clock_register() and replacing it with a large comment is not nearly as bad as it looks! The locking here is actually pretty useless since most of the variables modified within the write lock are not covered by the read lock. As a result a big comment and the sequence bump implicit in the call to update_epoch() should work pretty much the same.
It still looks pretty bad, even with the current explanation.
raw_write_seqcount_begin(&cd.seq);
/*
* sched_clock will report a bad value if it executes
* concurrently with the following code. No locking exists to
* prevent this; we rely mostly on this function being called
* early during kernel boot up before we have lots of other
* stuff going on.
*/ read_sched_clock = read; sched_clock_mask = new_mask; cd.rate = rate; cd.wrap_kt = new_wrap_kt; cd.mult = new_mult; cd.shift = new_shift;
cd.epoch_cyc = new_epoch;
cd.epoch_ns = ns;
raw_write_seqcount_end(&cd.seq);
update_epoch(new_epoch, ns);
So looking at this, the sched_clock_register() function may not be called super early, so I was looking to see what prevented bad reads prior to registration. And from quick inspection, it's nothing. I suspect the undocumented trick that makes this work is that the mult value is initialized to zero, so sched_clock returns 0 until things have been registered.
So it does seem like it would be worth while to do the initialization under the lock, or possibly use the suspend flag to make the first initialization safe.
thanks -john
On 21/01/15 17:29, John Stultz wrote:
On Wed, Jan 21, 2015 at 8:53 AM, Daniel Thompson daniel.thompson@linaro.org wrote:
Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has half updated the state. This results in a bad time value being observed.
This patch fixes that problem in a similar manner to Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC").
Note that ripping out the seqcount lock from sched_clock_register() and replacing it with a large comment is not nearly as bad as it looks! The locking here is actually pretty useless since most of the variables modified within the write lock are not covered by the read lock. As a result a big comment and the sequence bump implicit in the call to update_epoch() should work pretty much the same.
It still looks pretty bad, even with the current explanation.
I'm inclined to agree. Although to be clear, the code I proposed should be no more broken than the code we have today (and is arguably more honest).
raw_write_seqcount_begin(&cd.seq);
/*
* sched_clock will report a bad value if it executes
* concurrently with the following code. No locking exists to
* prevent this; we rely mostly on this function being called
* early during kernel boot up before we have lots of other
* stuff going on.
*/ read_sched_clock = read; sched_clock_mask = new_mask; cd.rate = rate; cd.wrap_kt = new_wrap_kt; cd.mult = new_mult; cd.shift = new_shift;
cd.epoch_cyc = new_epoch;
cd.epoch_ns = ns;
raw_write_seqcount_end(&cd.seq);
update_epoch(new_epoch, ns);
So looking at this, the sched_clock_register() function may not be called super early, so I was looking to see what prevented bad reads prior to registration.
Certainly not super early, but from the WARN_ON() at the top of the function I thought it was intended to be called before start_kernel() unmasks interrupts...
And from quick inspection, it's nothing. I suspect the undocumented trick that makes this work is that the mult value is initialized to zero, so sched_clock returns 0 until things have been registered.
So it does seem like it would be worth while to do the initialization under the lock, or possibly use the suspend flag to make the first initialization safe.
As mentioned the existing write lock doesn't really do very much at the moment.
The simple and (I think) strictly correct approach is to duplicate the whole of the clock_data (minus the seqcount) and make the read lock in sched_clock cover all accesses to the structure.
This would substantially enlarge the critical section in sched_clock() meaning we might loop round the seqcount fractionally more often. However if that causes any real problems it would be a sign the epoch was being updated too frequently.
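In sched_clock() that would look roughly like the sketch below (a sketch only, using today's field names; the actual patch may end up slightly different):

	do {
		seq = raw_read_seqcount_begin(&cd.seq);

		/* everything consumed by the calculation is read under the seqcount */
		res = cd.epoch_ns;
		if (!cd.suspended) {
			cyc = read_sched_clock();
			cyc = (cyc - cd.epoch_cyc) & sched_clock_mask;
			res += cyc_to_ns(cyc, cd.mult, cd.shift);
		}
	} while (read_seqcount_retry(&cd.seq, seq));

	return res;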
Unless I get any objections (or you really want me to look closely at using suspend) then I'll try this approach in the next day or two.
Daniel.
On 01/21/2015 09:29 AM, John Stultz wrote:
On Wed, Jan 21, 2015 at 8:53 AM, Daniel Thompson daniel.thompson@linaro.org wrote:
Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has half updated the state. This results in a bad time value being observed.
This patch fixes that problem in a similar manner to Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC").
Note that ripping out the seqcount lock from sched_clock_register() and replacing it with a large comment is not nearly as bad as it looks! The locking here is actually pretty useless since most of the variables modified within the write lock are not covered by the read lock. As a result a big comment and the sequence bump implicit in the call to update_epoch() should work pretty much the same.
It still looks pretty bad, even with the current explanation.
raw_write_seqcount_begin(&cd.seq);
/*
* sched_clock will report a bad value if it executes
* concurrently with the following code. No locking exists to
* prevent this; we rely mostly on this function being called
* early during kernel boot up before we have lots of other
* stuff going on.
*/ read_sched_clock = read; sched_clock_mask = new_mask; cd.rate = rate; cd.wrap_kt = new_wrap_kt; cd.mult = new_mult; cd.shift = new_shift;
cd.epoch_cyc = new_epoch;
cd.epoch_ns = ns;
raw_write_seqcount_end(&cd.seq);
update_epoch(new_epoch, ns);
So looking at this, the sched_clock_register() function may not be called super early, so I was looking to see what prevented bad reads prior to registration. And from quick inspection, it's nothing. I suspect the undocumented trick that makes this work is that the mult value is initialized to zero, so sched_clock returns 0 until things have been registered.
mult is never zero. It's NSEC_PER_SEC / HZ by default. The thing that's zero is the sched_clock_mask, so that's making us return 0 until sched_clock_postinit() gets called or a non-jiffy clock is registered.
Where does the bad read happen? By default we're using jiffy_sched_clock_read() and that doesn't move until interrupts are enabled. We also rely on any clocks being registered before sched_clock_postinit() is called (which is called before interrupts are enabled on the boot CPU).
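To spell it out (assuming cyc_to_ns() is the usual (cyc * mult) >> shift), before registration sched_clock() in effect does:

	cyc = read_sched_clock();                    /* jiffy_sched_clock_read() */
	cyc = (cyc - epoch_cyc) & sched_clock_mask;  /* mask is 0, so cyc becomes 0 */
	return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);  /* 0 + ((0 * mult) >> shift) == 0 */

and epoch_ns is still zero at that point.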
So it does seem like it would be worth while to do the initialization under the lock, or possibly use the suspend flag to make the first initialization safe.
Looking back at the code now I'm not sure why we did all that under the write lock but we don't protect it with the read lock in sched_clock() itself. I guess we didn't really care because the registration phase is entirely single-threaded. I don't see any problem making this more robust so that clocks can be registered at any time. If we did that I would hope that sched_clock_postinit() becomes largely unnecessary.
Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has locked the seqcount for writing. This results in the NMI handler locking up when it calls raw_read_seqcount_begin().
This patch fixes that problem by providing banked clock data in a similar manner to Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC").
Changing the mode of operation of the seqcount away from the traditional LSB-means-interrupted-write to a banked approach also revealed a good deal of "fake" write locking within sched_clock_register(). This is likely to be a latent issue because sched_clock_register() is typically called before we enable interrupts. Nevertheless the issue has been eliminated by increasing the scope of the read locking performed by sched_clock().
Suggested-by: Stephen Boyd sboyd@codeaurora.org Signed-off-by: Daniel Thompson daniel.thompson@linaro.org ---
Notes: Tested on i.MX6 with perf drowning the system in FIQs and using the perf handler to check that sched_clock() returns monotonic values. At the same time I forcefully reduced kt_wrap so that update_sched_clock() is being called at >1000Hz.
Without the patch the above system is grossly unstable, surviving [9K,115K,25K] perf event cycles during three separate runs. With the patch I ran for over 11M perf event cycles before getting bored.
v2:
* Extended the scope of the read lock in sched_clock() so we can bank all data consumed there (John Stultz)
kernel/time/sched_clock.c | 157 +++++++++++++++++++++++++++++----------------- 1 file changed, 100 insertions(+), 57 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..a2ea66944bc1 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -18,28 +18,28 @@ #include <linux/seqlock.h> #include <linux/bitops.h>
-struct clock_data { - ktime_t wrap_kt; +struct clock_data_banked { u64 epoch_ns; u64 epoch_cyc; - seqcount_t seq; - unsigned long rate; + u64 (*read_sched_clock)(void); + u64 sched_clock_mask; u32 mult; u32 shift; bool suspended; };
+struct clock_data { + ktime_t wrap_kt; + seqcount_t seq; + unsigned long rate; + struct clock_data_banked bank[2]; +}; + static struct hrtimer sched_clock_timer; static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
-static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u64 __read_mostly sched_clock_mask; - static u64 notrace jiffy_sched_clock_read(void) { /* @@ -49,7 +49,14 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); }
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static struct clock_data cd = { + .bank = { + [0] = { + .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, + }, + }, +};
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { @@ -58,50 +65,82 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void) { - u64 epoch_ns; - u64 epoch_cyc; u64 cyc; unsigned long seq; - - if (cd.suspended) - return cd.epoch_ns; + struct clock_data_banked *b; + u64 res;
do { - seq = raw_read_seqcount_begin(&cd.seq); - epoch_cyc = cd.epoch_cyc; - epoch_ns = cd.epoch_ns; + seq = raw_read_seqcount(&cd.seq); + b = cd.bank + (seq & 1); + if (b->suspended) { + res = b->epoch_ns; + } else { + cyc = b->read_sched_clock(); + cyc = (cyc - b->epoch_cyc) & b->sched_clock_mask; + res = b->epoch_ns + cyc_to_ns(cyc, b->mult, b->shift); + } } while (read_seqcount_retry(&cd.seq, seq));
- cyc = read_sched_clock(); - cyc = (cyc - epoch_cyc) & sched_clock_mask; - return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); + return res; +} + +/* + * Start updating the banked clock data. + * + * sched_clock will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + * + * The caller is responsible for avoiding simultaneous updates. + */ +static struct clock_data_banked *update_bank_begin(void) +{ + /* update the backup (odd) bank and steer readers towards it */ + memcpy(cd.bank + 1, cd.bank, sizeof(struct clock_data_banked)); + raw_write_seqcount_latch(&cd.seq); + + return cd.bank; +} + +/* + * Finalize update of banked clock data. + * + * This is just a trivial switch back to the primary (even) copy. + */ +static void update_bank_end(void) +{ + raw_write_seqcount_latch(&cd.seq); }
/* * Atomically update the sched_clock epoch. */ -static void notrace update_sched_clock(void) +static void notrace update_sched_clock(bool suspended) { - unsigned long flags; + struct clock_data_banked *b; u64 cyc; u64 ns;
- cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); - - raw_local_irq_save(flags); - raw_write_seqcount_begin(&cd.seq); - cd.epoch_ns = ns; - cd.epoch_cyc = cyc; - raw_write_seqcount_end(&cd.seq); - raw_local_irq_restore(flags); + b = update_bank_begin(); + + cyc = b->read_sched_clock(); + ns = b->epoch_ns + cyc_to_ns((cyc - b->epoch_cyc) & b->sched_clock_mask, + b->mult, b->shift); + + b->epoch_ns = ns; + b->epoch_cyc = cyc; + b->suspended = suspended; + + update_bank_end(); }
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) { - update_sched_clock(); + update_sched_clock(false); hrtimer_forward_now(hrt, cd.wrap_kt); return HRTIMER_RESTART; } @@ -111,9 +150,9 @@ void __init sched_clock_register(u64 (*read)(void), int bits, { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - ktime_t new_wrap_kt; unsigned long r; char r_unit; + struct clock_data_banked *b;
if (cd.rate > rate) return; @@ -122,29 +161,30 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
/* calculate the mult/shift to convert counter ticks to ns. */ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + cd.rate = rate;
new_mask = CLOCKSOURCE_MASK(bits);
/* calculate how many ns until we wrap */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); - new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + + b = update_bank_begin();
/* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = read_sched_clock(); - ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = b->read_sched_clock(); + ns = b->epoch_ns + cyc_to_ns((cyc - b->epoch_cyc) & b->sched_clock_mask, + b->mult, b->shift);
- raw_write_seqcount_begin(&cd.seq); - read_sched_clock = read; - sched_clock_mask = new_mask; - cd.rate = rate; - cd.wrap_kt = new_wrap_kt; - cd.mult = new_mult; - cd.shift = new_shift; - cd.epoch_cyc = new_epoch; - cd.epoch_ns = ns; - raw_write_seqcount_end(&cd.seq); + b->read_sched_clock = read; + b->sched_clock_mask = new_mask; + b->mult = new_mult; + b->shift = new_shift; + b->epoch_cyc = new_epoch; + b->epoch_ns = ns; + + update_bank_end();
r = rate; if (r >= 4000000) { @@ -175,10 +215,10 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (read_sched_clock == jiffy_sched_clock_read) + if (cd.bank[0].read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
- update_sched_clock(); + update_sched_clock(false);
/* * Start the timer to keep sched_clock() properly updated and @@ -191,17 +231,20 @@ void __init sched_clock_postinit(void)
static int sched_clock_suspend(void) { - update_sched_clock(); + update_sched_clock(true); hrtimer_cancel(&sched_clock_timer); - cd.suspended = true; return 0; }
static void sched_clock_resume(void) { - cd.epoch_cyc = read_sched_clock(); + struct clock_data_banked *b; + + b = update_bank_begin(); + b->epoch_cyc = b->read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - cd.suspended = false; + b->suspended = false; + update_bank_end(); }
static struct syscore_ops sched_clock_ops = { -- 1.9.3
This patchset optimizes the generic sched_clock implementation to significantly reduce the data cache profile. It also makes it safe to call sched_clock() from NMI (or FIQ on ARM).
The data cache profile of sched_clock() in both the original code and my previous patch was somewhere between 2 and 3 (64-byte) cache lines, depending on alignment of struct clock_data. After patching, the cache profile for the normal case should be a single cacheline.
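For reference, the arithmetic behind the single-cacheline claim (sizes assume 64-bit pointers and a non-debug seqcount_t; 32-bit builds are smaller still):

	clock_read_data: epoch_ns (8) + epoch_cyc (8) + sched_clock_mask (8)
	                 + read_sched_clock (8) + mult (4) + shift (4)
	                 + suspended (1, padded)                     <= 48 bytes
	clock_data:      seq (4, padded) + read_data (<= 48)         <= 56 bytes

so seq and read_data (combined) fit within a single 64-byte line once cd is ____cacheline_aligned, and that is all the hot path touches.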
NMI safety was tested on i.MX6 with perf drowning the system in FIQs and using the perf handler to check that sched_clock() returned monotonic values. At the same time I forcefully reduced kt_wrap so that update_sched_clock() is being called at >1000Hz.
Without the patches the above system is grossly unstable, surviving [9K,115K,25K] perf event cycles during three separate runs. With the patch I ran for over 9M perf event cycles before getting bored.
v3: * Optimized to minimise cache profile, including elimination of the suspended flag (Thomas Gleixner). * Replaced the update_bank_begin/end with a single update function (Thomas Gleixner). * Split into multiple patches to aid review.
v2:
* Extended the scope of the read lock in sched_clock() so we can bank all data consumed there (John Stultz)
Daniel Thompson (4): sched_clock: Match scope of read and write seqcounts sched_clock: Optimize cache line usage sched_clock: Remove suspend from clock_read_data sched_clock: Avoid deadlock during read from NMI
kernel/time/sched_clock.c | 163 ++++++++++++++++++++++++++++++---------------- 1 file changed, 107 insertions(+), 56 deletions(-)
-- 1.9.3
Currently the scope of the raw_write_seqcount_begin/end in sched_clock_register far exceeds the scope of the read section in sched_clock. This gives the impression of safety during cursory review but achieves little.
Note that this is likely to be a latent issue at present because sched_clock_register() is typically called before we enable interrupts, however the issue does risk bugs being needlessly introduced as the code evolves.
This patch fixes the problem by increasing the scope of the read locking performed by sched_clock() to cover all data modified by sched_clock_register.
We also improve clarity by moving writes to struct clock_data that do not impact sched_clock() outside of the critical section.
Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com --- kernel/time/sched_clock.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..3d21a8719444 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -58,23 +58,21 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void) { - u64 epoch_ns; - u64 epoch_cyc; - u64 cyc; + u64 cyc, res; unsigned long seq;
- if (cd.suspended) - return cd.epoch_ns; - do { seq = raw_read_seqcount_begin(&cd.seq); - epoch_cyc = cd.epoch_cyc; - epoch_ns = cd.epoch_ns; + + res = cd.epoch_ns; + if (!cd.suspended) { + cyc = read_sched_clock(); + cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; + res += cyc_to_ns(cyc, cd.mult, cd.shift); + } } while (read_seqcount_retry(&cd.seq, seq));
- cyc = read_sched_clock(); - cyc = (cyc - epoch_cyc) & sched_clock_mask; - return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); + return res; }
/* @@ -124,10 +122,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits, clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate;
/* calculate how many ns until we wrap */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); - new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
/* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); @@ -138,8 +137,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, raw_write_seqcount_begin(&cd.seq); read_sched_clock = read; sched_clock_mask = new_mask; - cd.rate = rate; - cd.wrap_kt = new_wrap_kt; cd.mult = new_mult; cd.shift = new_shift; cd.epoch_cyc = new_epoch;
Currently sched_clock(), a very hot code path, is not optimized to minimise its cache profile. In particular:
1. cd is not ____cacheline_aligned,
2. struct clock_data does not distinguish between hotpath and coldpath data, reducing locality of reference in the hotpath,
3. Some hotpath data is missing from struct clock_data and is marked __read_mostly (which more or less guarantees it will not share a cache line with cd).
This patch corrects these problems by extracting all hotpath data into a separate structure and using ____cacheline_aligned to ensure the hotpath uses a single (64 byte) cache line.
Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com --- kernel/time/sched_clock.c | 98 ++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 36 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 3d21a8719444..cb69a47dfee4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -18,28 +18,44 @@ #include <linux/seqlock.h> #include <linux/bitops.h>
-struct clock_data { - ktime_t wrap_kt; +/** + * struct clock_read_data - data required to read from sched_clock + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=48 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { u64 epoch_ns; u64 epoch_cyc; - seqcount_t seq; - unsigned long rate; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); u32 mult; u32 shift; bool suspended; };
+/** + * struct clock_data - all data needed for sched_clock (including + * registration of a new clock source) + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular seq and read_data (combined) should fit + * into a single 64 byte cache line. + */ +struct clock_data { + seqcount_t seq; + struct clock_read_data read_data; + ktime_t wrap_kt; + unsigned long rate; +}; + static struct hrtimer sched_clock_timer; static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
-static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u64 __read_mostly sched_clock_mask; - static u64 notrace jiffy_sched_clock_read(void) { /* @@ -49,7 +65,10 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); }
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static struct clock_data cd ____cacheline_aligned = { + .read_data = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, +};
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { @@ -60,15 +79,16 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; + struct clock_read_data *rd = &cd.read_data;
do { seq = raw_read_seqcount_begin(&cd.seq);
- res = cd.epoch_ns; - if (!cd.suspended) { - cyc = read_sched_clock(); - cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; - res += cyc_to_ns(cyc, cd.mult, cd.shift); + res = rd->epoch_ns; + if (!rd->suspended) { + cyc = rd->read_sched_clock(); + cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; + res += cyc_to_ns(cyc, rd->mult, rd->shift); } } while (read_seqcount_retry(&cd.seq, seq));
@@ -83,16 +103,17 @@ static void notrace update_sched_clock(void) unsigned long flags; u64 cyc; u64 ns; + struct clock_read_data *rd = &cd.read_data;
- cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift);
raw_local_irq_save(flags); raw_write_seqcount_begin(&cd.seq); - cd.epoch_ns = ns; - cd.epoch_cyc = cyc; + rd->epoch_ns = ns; + rd->epoch_cyc = cyc; raw_write_seqcount_end(&cd.seq); raw_local_irq_restore(flags); } @@ -109,9 +130,9 @@ void __init sched_clock_register(u64 (*read)(void), int bits, { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - ktime_t new_wrap_kt; unsigned long r; char r_unit; + struct clock_read_data *rd = &cd.read_data;
if (cd.rate > rate) return; @@ -130,17 +151,18 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
/* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = read_sched_clock(); - ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift);
raw_write_seqcount_begin(&cd.seq); - read_sched_clock = read; - sched_clock_mask = new_mask; - cd.mult = new_mult; - cd.shift = new_shift; - cd.epoch_cyc = new_epoch; - cd.epoch_ns = ns; + rd->read_sched_clock = read; + rd->sched_clock_mask = new_mask; + rd->mult = new_mult; + rd->shift = new_shift; + rd->epoch_cyc = new_epoch; + rd->epoch_ns = ns; raw_write_seqcount_end(&cd.seq);
r = rate; @@ -172,7 +194,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (read_sched_clock == jiffy_sched_clock_read) + if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock(); @@ -188,17 +210,21 @@ void __init sched_clock_postinit(void)
static int sched_clock_suspend(void) { + struct clock_read_data *rd = &cd.read_data; + update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - cd.suspended = true; + rd->suspended = true; return 0; }
static void sched_clock_resume(void) { - cd.epoch_cyc = read_sched_clock(); + struct clock_read_data *rd = &cd.read_data; + + rd->epoch_cyc = rd->read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - cd.suspended = false; + rd->suspended = false; }
static struct syscore_ops sched_clock_ops = {
On 01/30, Daniel Thompson wrote:
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 3d21a8719444..cb69a47dfee4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -18,28 +18,44 @@ #include <linux/seqlock.h> #include <linux/bitops.h> -struct clock_data {
- ktime_t wrap_kt;
+/**
- struct clock_read_data - data required to read from sched_clock
Nitpick: Won't kernel-doc complain that members aren't documented?
- Care must be taken when updating this structure; it is read by
- some very hot code paths. It occupies <=48 bytes and, when combined
- with the seqcount used to synchronize access, comfortably fits into
- a 64 byte cache line.
- */
+struct clock_read_data { u64 epoch_ns; u64 epoch_cyc;
- seqcount_t seq;
- unsigned long rate;
- u64 sched_clock_mask;
- u64 (*read_sched_clock)(void); u32 mult; u32 shift; bool suspended;
}; +/**
- struct clock_data - all data needed for sched_clock (including
registration of a new clock source)
Same comment.
- The ordering of this structure has been chosen to optimize cache
- performance. In particular seq and read_data (combined) should fit
- into a single 64 byte cache line.
- */
+struct clock_data {
- seqcount_t seq;
- struct clock_read_data read_data;
- ktime_t wrap_kt;
- unsigned long rate;
+}; @@ -60,15 +79,16 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq;
- struct clock_read_data *rd = &cd.read_data;
do { seq = raw_read_seqcount_begin(&cd.seq);
res = cd.epoch_ns;
if (!cd.suspended) {
cyc = read_sched_clock();
cyc = (cyc - cd.epoch_cyc) & sched_clock_mask;
res += cyc_to_ns(cyc, cd.mult, cd.shift);
res = rd->epoch_ns;
if (!rd->suspended) {
Should this have likely() treatment? It would be really nice if we could use static branches here to avoid any branch penalty at all. I guess that would need some sort of special cased stop_machine() though. Or I wonder if we could replace rd->read_sched_clock() with a dumb function that returns cd.epoch_cyc so that the math turns out to be 0?
cyc = rd->read_sched_clock();
cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask;
} } while (read_seqcount_retry(&cd.seq, seq));res += cyc_to_ns(cyc, rd->mult, rd->shift);
On 05/02/15 01:14, Stephen Boyd wrote:
On 01/30, Daniel Thompson wrote:
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 3d21a8719444..cb69a47dfee4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -18,28 +18,44 @@ #include <linux/seqlock.h> #include <linux/bitops.h> -struct clock_data {
- ktime_t wrap_kt;
+/**
- struct clock_read_data - data required to read from sched_clock
Nitpick: Won't kernel-doc complain that members aren't documented?
It does indeed. I'll add descriptions here...
- Care must be taken when updating this structure; it is read by
- some very hot code paths. It occupies <=48 bytes and, when combined
- with the seqcount used to synchronize access, comfortably fits into
- a 64 byte cache line.
- */
+struct clock_read_data { u64 epoch_ns; u64 epoch_cyc;
- seqcount_t seq;
- unsigned long rate;
- u64 sched_clock_mask;
- u64 (*read_sched_clock)(void); u32 mult; u32 shift; bool suspended;
}; +/**
- struct clock_data - all data needed for sched_clock (including
registration of a new clock source)
Same comment.
... and here.
- The ordering of this structure has been chosen to optimize cache
- performance. In particular seq and read_data (combined) should fit
- into a single 64 byte cache line.
- */
+struct clock_data {
- seqcount_t seq;
- struct clock_read_data read_data;
- ktime_t wrap_kt;
- unsigned long rate;
+}; @@ -60,15 +79,16 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq;
- struct clock_read_data *rd = &cd.read_data;
do { seq = raw_read_seqcount_begin(&cd.seq);
res = cd.epoch_ns;
if (!cd.suspended) {
cyc = read_sched_clock();
cyc = (cyc - cd.epoch_cyc) & sched_clock_mask;
res += cyc_to_ns(cyc, cd.mult, cd.shift);
res = rd->epoch_ns;
if (!rd->suspended) {
Should this have likely() treatment? It would be really nice if we could use static branches here to avoid any branch penalty at all. I guess that would need some sort of special cased stop_machine() though. Or I wonder if we could replace rd->read_sched_clock() with a dumb function that returns cd.epoch_cyc so that the math turns out to be 0?
Great idea.
Making this code branchless with a special function sounds very much better than using likely().
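Something along these lines, perhaps (purely illustrative; the helper name and exact hook-up are not final):

	/* sketch only: not the final patch */
	static u64 notrace suspended_sched_clock_read(void)
	{
		/*
		 * Returning the epoch captured at suspend makes
		 * (cyc - epoch_cyc) zero, so sched_clock() keeps
		 * reporting epoch_ns without testing a flag.
		 */
		return cd.read_data.epoch_cyc;
	}

	static int sched_clock_suspend(void)
	{
		struct clock_read_data *rd = &cd.read_data;

		update_sched_clock();
		hrtimer_cancel(&sched_clock_timer);
		rd->read_sched_clock = suspended_sched_clock_read;
		return 0;
	}

Resume would then just restore rd->read_sched_clock from the master copy of the function pointer before restarting the timer.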
Currently cd.read_data.suspended is read by the hotpath function sched_clock(). This variable need not be accessed on the hotpath; we can instead check whether the timer is suspended by checking the validity of the read_sched_clock function pointer.
The new master copy of the function pointer (actual_read_sched_clock) is introduced and is used for all reads of the clock hardware except those within sched_clock itself.
Suggested-by: Thomas Gleixner tglx@linutronix.de Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com --- kernel/time/sched_clock.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index cb69a47dfee4..638c765131fa 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -22,7 +22,7 @@ * struct clock_read_data - data required to read from sched_clock * * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=48 bytes and, when combined + * some very hot code paths. It occupies <=40 bytes and, when combined * with the seqcount used to synchronize access, comfortably fits into * a 64 byte cache line. */ @@ -33,7 +33,6 @@ struct clock_read_data { u64 (*read_sched_clock)(void); u32 mult; u32 shift; - bool suspended; };
/** @@ -49,6 +48,7 @@ struct clock_data { struct clock_read_data read_data; ktime_t wrap_kt; unsigned long rate; + u64 (*actual_read_sched_clock)(void); };
static struct hrtimer sched_clock_timer; @@ -68,6 +68,8 @@ static u64 notrace jiffy_sched_clock_read(void) static struct clock_data cd ____cacheline_aligned = { .read_data = { .mult = NSEC_PER_SEC / HZ, .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, + };
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -85,7 +87,7 @@ unsigned long long notrace sched_clock(void) seq = raw_read_seqcount_begin(&cd.seq);
res = rd->epoch_ns; - if (!rd->suspended) { + if (rd->read_sched_clock) { cyc = rd->read_sched_clock(); cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; res += cyc_to_ns(cyc, rd->mult, rd->shift); @@ -105,7 +107,7 @@ static void notrace update_sched_clock(void) u64 ns; struct clock_read_data *rd = &cd.read_data;
- cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); @@ -151,10 +153,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
/* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); + cd.actual_read_sched_clock = read;
raw_write_seqcount_begin(&cd.seq); rd->read_sched_clock = read; @@ -194,7 +197,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock(); @@ -214,7 +217,7 @@ static int sched_clock_suspend(void)
update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - rd->suspended = true; + rd->read_sched_clock = NULL; return 0; }
@@ -222,9 +225,9 @@ static void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data;
- rd->epoch_cyc = rd->read_sched_clock(); + rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - rd->suspended = false; + rd->read_sched_clock = cd.actual_read_sched_clock; }
static struct syscore_ops sched_clock_ops = {
Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has locked the seqcount for writing. This results in the NMI handler locking up when it calls raw_read_seqcount_begin().
This patch fixes the NMI safety issues by providing banked clock data. This is a similar approach to the one used in Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC").
Suggested-by: Stephen Boyd sboyd@codeaurora.org Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com --- kernel/time/sched_clock.c | 91 ++++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 33 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 638c765131fa..2b1a465f1c00 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -40,12 +40,12 @@ struct clock_read_data { * registration of a new clock source) * * The ordering of this structure has been chosen to optimize cache - * performance. In particular seq and read_data (combined) should fit + * performance. In particular seq and read_data[0] (combined) should fit * into a single 64 byte cache line. */ struct clock_data { seqcount_t seq; - struct clock_read_data read_data; + struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; u64 (*actual_read_sched_clock)(void); @@ -66,10 +66,9 @@ static u64 notrace jiffy_sched_clock_read(void) }
static struct clock_data cd ____cacheline_aligned = { - .read_data = { .mult = NSEC_PER_SEC / HZ, - .read_sched_clock = jiffy_sched_clock_read, }, + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, .actual_read_sched_clock = jiffy_sched_clock_read, - };
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -81,10 +80,11 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd;
do { - seq = raw_read_seqcount_begin(&cd.seq); + seq = raw_read_seqcount(&cd.seq); + rd = cd.read_data + (seq & 1);
res = rd->epoch_ns; if (rd->read_sched_clock) { @@ -98,26 +98,50 @@ unsigned long long notrace sched_clock(void) }
/* + * Updating the data required to read the clock. + * + * sched_clock will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the new data */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); +} + +/* * Atomically update the sched_clock epoch. */ static void notrace update_sched_clock(void) { - unsigned long flags; u64 cyc; u64 ns; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd; + + rd = cd.read_data[0];
cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); - - raw_local_irq_save(flags); - raw_write_seqcount_begin(&cd.seq); - rd->epoch_ns = ns; - rd->epoch_cyc = cyc; - raw_write_seqcount_end(&cd.seq); - raw_local_irq_restore(flags); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); + + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; + + update_clock_read_data(&rd); }
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) @@ -134,7 +158,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, u32 new_mult, new_shift; unsigned long r; char r_unit; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd;
if (cd.rate > rate) return; @@ -151,22 +175,23 @@ void __init sched_clock_register(u64 (*read)(void), int bits, wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+ rd = cd.read_data[0]; + /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); cd.actual_read_sched_clock = read;
- raw_write_seqcount_begin(&cd.seq); - rd->read_sched_clock = read; - rd->sched_clock_mask = new_mask; - rd->mult = new_mult; - rd->shift = new_shift; - rd->epoch_cyc = new_epoch; - rd->epoch_ns = ns; - raw_write_seqcount_end(&cd.seq); + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + update_clock_read_data(&rd);
r = rate; if (r >= 4000000) { @@ -213,7 +238,7 @@ void __init sched_clock_postinit(void)
static int sched_clock_suspend(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0];
update_sched_clock(); hrtimer_cancel(&sched_clock_timer); @@ -223,7 +248,7 @@ static int sched_clock_suspend(void)
static void sched_clock_resume(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0];
rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
On 01/30, Daniel Thompson wrote:
@@ -98,26 +98,50 @@ unsigned long long notrace sched_clock(void) } /*
- Updating the data required to read the clock.
- sched_clock will never observe mis-matched data even if called from
- an NMI. We do this by maintaining an odd/even copy of the data and
- steering sched_clock to one or the other using a sequence counter.
- In order to preserve the data cache profile of sched_clock as much
- as possible the system reverts back to the even copy when the update
- completes; the odd copy is used *only* during an update.
- */
+static void update_clock_read_data(struct clock_read_data *rd)
notrace?
+{
- /* update the backup (odd) copy with the new data */
- cd.read_data[1] = *rd;
- /* steer readers towards the new data */
s/new data/odd copy/?
- raw_write_seqcount_latch(&cd.seq);
- /* now its safe for us to update the normal (even) copy */
- cd.read_data[0] = *rd;
- /* switch readers back to the even copy */
- raw_write_seqcount_latch(&cd.seq);
+}
+/*
On Wed, 4 Feb 2015 17:23:51 -0800 Stephen Boyd sboyd@codeaurora.org wrote:
On 01/30, Daniel Thompson wrote:
@@ -98,26 +98,50 @@ unsigned long long notrace sched_clock(void) } /*
- Updating the data required to read the clock.
- sched_clock will never observe mis-matched data even if called from
- an NMI. We do this by maintaining an odd/even copy of the data and
- steering sched_clock to one or the other using a sequence counter.
- In order to preserve the data cache profile of sched_clock as much
- as possible the system reverts back to the even copy when the update
- completes; the odd copy is used *only* during an update.
- */
+static void update_clock_read_data(struct clock_read_data *rd)
notrace?
Why? Isn't this for update, not for readers?
-- Steve
+{
- /* update the backup (odd) copy with the new data */
- cd.read_data[1] = *rd;
- /* steer readers towards the new data */
s/new data/odd copy/?
- raw_write_seqcount_latch(&cd.seq);
- /* now its safe for us to update the normal (even) copy */
- cd.read_data[0] = *rd;
- /* switch readers back to the even copy */
- raw_write_seqcount_latch(&cd.seq);
+}
+/*
On 02/04, Steven Rostedt wrote:
On Wed, 4 Feb 2015 17:23:51 -0800 Stephen Boyd sboyd@codeaurora.org wrote:
On 01/30, Daniel Thompson wrote:
@@ -98,26 +98,50 @@ unsigned long long notrace sched_clock(void) } /*
- Updating the data required to read the clock.
- sched_clock will never observe mis-matched data even if called from
- an NMI. We do this by maintaining an odd/even copy of the data and
- steering sched_clock to one or the other using a sequence counter.
- In order to preserve the data cache profile of sched_clock as much
- as possible the system reverts back to the even copy when the update
- completes; the odd copy is used *only* during an update.
- */
+static void update_clock_read_data(struct clock_read_data *rd)
notrace?
Why? Isn't this for update, not for readers?
Good point. I was basing it on the fact that the caller, update_sched_clock() is marked notrace. It looks like it's always been that way (see commit 2f0778afac79 "ARM: 7205/2: sched_clock: allow sched_clock to be selected at runtime" from 2011-12-15 where it was introduced). So the correct thing would be to drop notrace from update_sched_clock().
On 01/30, Daniel Thompson wrote:
This patchset optimizes the generic sched_clock implementation to significantly reduce the data cache profile. It also makes it safe to call sched_clock() from NMI (or FIQ on ARM).
The data cache profile of sched_clock() in both the original code and my previous patch was somewhere between 2 and 3 (64-byte) cache lines, depending on alignment of struct clock_data. After patching, the cache profile for the normal case should be a single cacheline.
NMI safety was tested on i.MX6 with perf drowning the system in FIQs and using the perf handler to check that sched_clock() returned monotonic values. At the same time I forcefully reduced kt_wrap so that update_sched_clock() is being called at >1000Hz.
Without the patches the above system is grossly unstable, surviving [9K,115K,25K] perf event cycles during three separate runs. With the patch I ran for over 9M perf event cycles before getting bored.
I wanted to see if there was any speedup from these changes so I made a tight loop around sched_clock() that ran for 10 seconds and I ran it 10 times before and after this patch series:
unsigned long long clock, start_clock;
int count = 0;

clock = start_clock = sched_clock();
while ((clock - start_clock) < 10ULL * NSEC_PER_SEC) {
	clock = sched_clock();
	count++;
}

pr_info("Made %d calls in %llu ns\n", count, clock - start_clock);
Before ------ Made 19218953 calls in 10000000439 ns Made 19212790 calls in 10000000438 ns Made 19217121 calls in 10000000142 ns Made 19227304 calls in 10000000142 ns Made 19217559 calls in 10000000142 ns Made 19230193 calls in 10000000290 ns Made 19212715 calls in 10000000290 ns Made 19234446 calls in 10000000438 ns Made 19226274 calls in 10000000439 ns Made 19236118 calls in 10000000143 ns
After ----- Made 19434797 calls in 10000000438 ns Made 19435733 calls in 10000000439 ns Made 19434499 calls in 10000000438 ns Made 19438482 calls in 10000000438 ns Made 19435604 calls in 10000000142 ns Made 19438551 calls in 10000000438 ns Made 19444550 calls in 10000000290 ns Made 19437580 calls in 10000000290 ns Made 19439429 calls in 10000048142 ns Made 19439493 calls in 10000000438 ns
So it seems to be a small improvement.
On 05/02/15 00:50, Stephen Boyd wrote:
On 01/30, Daniel Thompson wrote:
This patchset optimizes the generic sched_clock implementation to significantly reduce the data cache profile. It also makes it safe to call sched_clock() from NMI (or FIQ on ARM).
The data cache profile of sched_clock() in both the original code and my previous patch was somewhere between 2 and 3 (64-byte) cache lines, depending on alignment of struct clock_data. After patching, the cache profile for the normal case should be a single cacheline.
NMI safety was tested on i.MX6 with perf drowning the system in FIQs and using the perf handler to check that sched_clock() returned monotonic values. At the same time I forcefully reduced kt_wrap so that update_sched_clock() is being called at >1000Hz.
Without the patches the above system is grossly unstable, surviving [9K,115K,25K] perf event cycles during three separate runs. With the patch I ran for over 9M perf event cycles before getting bored.
I wanted to see if there was any speedup from these changes so I made a tight loop around sched_clock() that ran for 10 seconds and I ran it 10 times before and after this patch series:
unsigned long long clock, start_clock; int count = 0; clock = start_clock = sched_clock(); while ((clock - start_clock) < 10ULL * NSEC_PER_SEC) { clock = sched_clock(); count++; } pr_info("Made %d calls in %llu ns\n", count, clock - start_clock);
Before
Made 19218953 calls in 10000000439 ns Made 19212790 calls in 10000000438 ns Made 19217121 calls in 10000000142 ns Made 19227304 calls in 10000000142 ns Made 19217559 calls in 10000000142 ns Made 19230193 calls in 10000000290 ns Made 19212715 calls in 10000000290 ns Made 19234446 calls in 10000000438 ns Made 19226274 calls in 10000000439 ns Made 19236118 calls in 10000000143 ns After
Made 19434797 calls in 10000000438 ns Made 19435733 calls in 10000000439 ns Made 19434499 calls in 10000000438 ns Made 19438482 calls in 10000000438 ns Made 19435604 calls in 10000000142 ns Made 19438551 calls in 10000000438 ns Made 19444550 calls in 10000000290 ns Made 19437580 calls in 10000000290 ns Made 19439429 calls in 10000048142 ns Made 19439493 calls in 10000000438 ns
So it seems to be a small improvement.
Awesome!
I guess this is mostly the effect of simplifying the suspend logic since the changes to the cache profile probably wouldn't reveal much in such a tight loop.
I will re-run this after acting on your other review comments. BTW what device did you run on?
Daniel.
On 05/02/15 17:05, Daniel Thompson wrote:
On 05/02/15 00:50, Stephen Boyd wrote:
On 01/30, Daniel Thompson wrote:
This patchset optimizes the generic sched_clock implementation to significantly reduce the data cache profile. It also makes it safe to call sched_clock() from NMI (or FIQ on ARM).
The data cache profile of sched_clock() in both the original code and my previous patch was somewhere between 2 and 3 (64-byte) cache lines, depending on alignment of struct clock_data. After patching, the cache profile for the normal case should be a single cacheline.
NMI safety was tested on i.MX6 with perf drowning the system in FIQs and using the perf handler to check that sched_clock() returned monotonic values. At the same time I forcefully reduced kt_wrap so that update_sched_clock() is being called at >1000Hz.
Without the patches the above system is grossly unstable, surviving [9K,115K,25K] perf event cycles during three separate runs. With the patch I ran for over 9M perf event cycles before getting bored.
I wanted to see if there was any speedup from these changes so I made a tight loop around sched_clock() that ran for 10 seconds and I ran it 10 times before and after this patch series:
unsigned long long clock, start_clock; int count = 0; clock = start_clock = sched_clock(); while ((clock - start_clock) < 10ULL * NSEC_PER_SEC) { clock = sched_clock(); count++; } pr_info("Made %d calls in %llu ns\n", count, clock - start_clock);
Before
Made 19218953 calls in 10000000439 ns Made 19212790 calls in 10000000438 ns Made 19217121 calls in 10000000142 ns Made 19227304 calls in 10000000142 ns Made 19217559 calls in 10000000142 ns Made 19230193 calls in 10000000290 ns Made 19212715 calls in 10000000290 ns Made 19234446 calls in 10000000438 ns Made 19226274 calls in 10000000439 ns Made 19236118 calls in 10000000143 ns After
Made 19434797 calls in 10000000438 ns Made 19435733 calls in 10000000439 ns Made 19434499 calls in 10000000438 ns Made 19438482 calls in 10000000438 ns Made 19435604 calls in 10000000142 ns Made 19438551 calls in 10000000438 ns Made 19444550 calls in 10000000290 ns Made 19437580 calls in 10000000290 ns Made 19439429 calls in 10000048142 ns Made 19439493 calls in 10000000438 ns
So it seems to be a small improvement.
Awesome!
I guess this is mostly the effect of simplifying the suspend logic since the changes to the cache profile probably wouldn't reveal much in such a tight loop.
I will re-run this after acting on your other review comments. BTW what device did you run on?
I ran the same test on my Snapdragon 600 board. The results are a little odd. There is an odd quantization effect that I cannot easily explain and the results of the v4 patch seem almost too good to be true.
My results are below but I'd be very interested to see what results you get with the v4 patch!
Latest (branchless approach):
Made 18736519 calls in 10000000439 ns Made 19958774 calls in 10000000439 ns Made 18736500 calls in 10000000587 ns Made 21703993 calls in 10000000439 ns Made 18734458 calls in 10000000142 ns Made 18736175 calls in 10000000439 ns Made 19961406 calls in 10000000291 ns Made 19953920 calls in 10000000143 ns Made 21709619 calls in 10000000290 ns Made 18734077 calls in 10000000142 ns
v3:
Made 15971788 calls in 10000000438 ns Made 14594357 calls in 10000000734 ns Made 14590951 calls in 10000000735 ns Made 14595048 calls in 10000000290 ns Made 14595157 calls in 10000000143 ns Made 14594117 calls in 10000000142 ns Made 14597277 calls in 10000000142 ns Made 14594472 calls in 10000000586 ns Made 14601292 calls in 10000000587 ns Made 15968630 calls in 10000000587 ns
Current:
Made 14274871 calls in 10000000587 ns Made 15634136 calls in 10000000587 ns Made 16453528 calls in 10000000142 ns Made 14275854 calls in 10000000586 ns Made 15634128 calls in 10000000438 ns Made 14277672 calls in 10000000143 ns Made 14282904 calls in 10000000290 ns Made 14278673 calls in 10000000142 ns Made 14276096 calls in 10000000290 ns Made 14275336 calls in 10000000143 ns
On 02/08/15 04:09, Daniel Thompson wrote:
On 05/02/15 17:05, Daniel Thompson wrote:
On 05/02/15 00:50, Stephen Boyd wrote:
On 01/30, Daniel Thompson wrote:
This patchset optimizes the generic sched_clock implementation to significantly reduce the data cache profile. It also makes it safe to call sched_clock() from NMI (or FIQ on ARM).
The data cache profile of sched_clock() in both the original code and my previous patch was somewhere between 2 and 3 (64-byte) cache lines, depending on alignment of struct clock_data. After patching, the cache profile for the normal case should be a single cacheline.
NMI safety was tested on i.MX6 with perf drowning the system in FIQs and using the perf handler to check that sched_clock() returned monotonic values. At the same time I forcefully reduced kt_wrap so that update_sched_clock() is being called at >1000Hz.
Without the patches the above system is grossly unstable, surviving [9K,115K,25K] perf event cycles during three separate runs. With the patch I ran for over 9M perf event cycles before getting bored.
I wanted to see if there was any speedup from these changes so I made a tight loop around sched_clock() that ran for 10 seconds and I ran it 10 times before and after this patch series:
unsigned long long clock, start_clock;
int count = 0;

clock = start_clock = sched_clock();
while ((clock - start_clock) < 10ULL * NSEC_PER_SEC) {
	clock = sched_clock();
	count++;
}

pr_info("Made %d calls in %llu ns\n", count, clock - start_clock);
Before
Made 19218953 calls in 10000000439 ns
Made 19212790 calls in 10000000438 ns
Made 19217121 calls in 10000000142 ns
Made 19227304 calls in 10000000142 ns
Made 19217559 calls in 10000000142 ns
Made 19230193 calls in 10000000290 ns
Made 19212715 calls in 10000000290 ns
Made 19234446 calls in 10000000438 ns
Made 19226274 calls in 10000000439 ns
Made 19236118 calls in 10000000143 ns

After

Made 19434797 calls in 10000000438 ns
Made 19435733 calls in 10000000439 ns
Made 19434499 calls in 10000000438 ns
Made 19438482 calls in 10000000438 ns
Made 19435604 calls in 10000000142 ns
Made 19438551 calls in 10000000438 ns
Made 19444550 calls in 10000000290 ns
Made 19437580 calls in 10000000290 ns
Made 19439429 calls in 10000048142 ns
Made 19439493 calls in 10000000438 ns
So it seems to be a small improvement.
Awesome!
I guess this is mostly the effect of simplifying the suspend logic since the changes to the cache profile probably wouldn't reveal much in such a tight loop.
I will re-run this after acting on your other review comments. BTW what device did you run on?
I ran the same test on my Snapdragon 600 board. The results are a little odd. There is an odd quantization effect that I cannot easily explain and the results of the v4 patch seem almost too good to be true.
My results are below but I'd be very interested to see what results you get with the v4 patch!
I ran my test on this msm8960 board I got lying around. In marketing terms (which I honestly don't know off the top of my head) I think it's called a Snapdragon S4 Play. I had to look up Snapdragon 600 to figure out what it is. The 600 is slightly newer but should be an APQ8064 of some kind. Maybe you can try it out on an ARM Ltd. CPU if you have one?
Latest (branchless approach):
Made 18736519 calls in 10000000439 ns
Made 19958774 calls in 10000000439 ns
Made 18736500 calls in 10000000587 ns
Made 21703993 calls in 10000000439 ns
Made 18734458 calls in 10000000142 ns
Made 18736175 calls in 10000000439 ns
Made 19961406 calls in 10000000291 ns
Made 19953920 calls in 10000000143 ns
Made 21709619 calls in 10000000290 ns
Made 18734077 calls in 10000000142 ns
It's certainly more, but it also seems more variable. I'll reply with raw data from my msm8960 and msm8974 on v4.
v3:
Made 15971788 calls in 10000000438 ns
Made 14594357 calls in 10000000734 ns
Made 14590951 calls in 10000000735 ns
Made 14595048 calls in 10000000290 ns
Made 14595157 calls in 10000000143 ns
Made 14594117 calls in 10000000142 ns
Made 14597277 calls in 10000000142 ns
Made 14594472 calls in 10000000586 ns
Made 14601292 calls in 10000000587 ns
Made 15968630 calls in 10000000587 ns
Current:
Made 14274871 calls in 10000000587 ns
Made 15634136 calls in 10000000587 ns
Made 16453528 calls in 10000000142 ns
Made 14275854 calls in 10000000586 ns
Made 15634128 calls in 10000000438 ns
Made 14277672 calls in 10000000143 ns
Made 14282904 calls in 10000000290 ns
Made 14278673 calls in 10000000142 ns
Made 14276096 calls in 10000000290 ns
Made 14275336 calls in 10000000143 ns
This patchset optimizes the generic sched_clock implementation by removing branches and significantly reducing the data cache profile. It also makes it safe to call sched_clock() from NMI (or FIQ on ARM).
The data cache profile of sched_clock() in the original code is somewhere between 2 and 3 (64-byte) cache lines, depending on alignment of struct clock_data. After patching, the cache profile for the normal case should be a single cacheline.
NMI safety was tested on i.MX6 with perf drowning the system in FIQs and using the perf handler to check that sched_clock() returned monotonic values. At the same time I forcefully reduced kt_wrap so that update_sched_clock() is being called at >1000Hz.
Without the patches the above system is grossly unstable, surviving [9K,115K,25K] perf event cycles during three separate runs. With the patch I ran for over 9M perf event cycles before getting bored.
v4:
* Optimized sched_clock() to be branchless by introducing a dummy function
  to provide clock values while the clock is suspended (Stephen Boyd).
* Improved commenting, including the kerneldoc comments (Stephen Boyd).
* Removed a redundant notrace from the update logic (Steven Rostedt).

v3:
* Optimized to minimise cache profile, including elimination of the
  suspended flag (Thomas Gleixner).
* Replaced the update_bank_begin/end with a single update function
  (Thomas Gleixner).
* Split into multiple patches to aid review.

v2:
* Extended the scope of the read lock in sched_clock() so we can bank
  all data consumed there (John Stultz)

Daniel Thompson (5):
  sched_clock: Match scope of read and write seqcounts
  sched_clock: Optimize cache line usage
  sched_clock: Remove suspend from clock_read_data
  sched_clock: Remove redundant notrace from update function
  sched_clock: Avoid deadlock during read from NMI
kernel/time/sched_clock.c | 195 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 138 insertions(+), 57 deletions(-)
-- 2.1.0
Currently the scope of the raw_write_seqcount_begin/end in sched_clock_register far exceeds the scope of the read section in sched_clock. This gives the impression of safety during cursory review but achieves little.
Note that this is likely to be a latent issue at present because sched_clock_register() is typically called before we enable interrupts, however the issue does risk bugs being needlessly introduced as the code evolves.
This patch fixes the problem by increasing the scope of the read locking performed by sched_clock() to cover all data modified by sched_clock_register.
We also improve clarity by moving writes to struct clock_data that do not impact sched_clock() outside of the critical section.
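For reference, the idiom we are aligning with here is the standard seqcount pattern, sketched below. This is a simplification rather than the exact code in the patch: the point is that everything the reader consumes must be read inside the retry loop and written inside the write section; anything else need not be covered at all.

	/* writer */
	raw_write_seqcount_begin(&cd.seq);
	/* ... update all data the readers consume ... */
	raw_write_seqcount_end(&cd.seq);

	/* reader: retry if a writer ran concurrently */
	do {
		seq = raw_read_seqcount_begin(&cd.seq);
		/* ... read every value the result depends on ... */
	} while (read_seqcount_retry(&cd.seq, seq));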
Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com --- kernel/time/sched_clock.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..3d21a8719444 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -58,23 +58,21 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void) { - u64 epoch_ns; - u64 epoch_cyc; - u64 cyc; + u64 cyc, res; unsigned long seq;
- if (cd.suspended) - return cd.epoch_ns; - do { seq = raw_read_seqcount_begin(&cd.seq); - epoch_cyc = cd.epoch_cyc; - epoch_ns = cd.epoch_ns; + + res = cd.epoch_ns; + if (!cd.suspended) { + cyc = read_sched_clock(); + cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; + res += cyc_to_ns(cyc, cd.mult, cd.shift); + } } while (read_seqcount_retry(&cd.seq, seq));
- cyc = read_sched_clock(); - cyc = (cyc - epoch_cyc) & sched_clock_mask; - return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); + return res; }
/* @@ -124,10 +122,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits, clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate;
/* calculate how many ns until we wrap */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); - new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
/* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); @@ -138,8 +137,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, raw_write_seqcount_begin(&cd.seq); read_sched_clock = read; sched_clock_mask = new_mask; - cd.rate = rate; - cd.wrap_kt = new_wrap_kt; cd.mult = new_mult; cd.shift = new_shift; cd.epoch_cyc = new_epoch;
Currently sched_clock(), a very hot code path, is not optimized to minimise its cache profile. In particular:
1. cd is not ____cacheline_aligned,
2. struct clock_data does not distinguish between hotpath and coldpath data, reducing locality of reference in the hotpath,
3. Some hotpath data is missing from struct clock_data and is marked __read_mostly (which more or less guarantees it will not share a cache line with cd).
This patch corrects these problems by extracting all hotpath data into a separate structure and using ____cacheline_aligned to ensure the hotpath uses a single (64 byte) cache line.
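As an aside, if we ever wanted a compile-time check that the hot data really does stay within a single line, something like the following could be dropped into sched_clock_register(). This is an illustration only and not part of the patch; it simply asserts that seq plus read_data (i.e. everything before wrap_kt) fit within 64 bytes:

	/* illustration only: catch accidental growth of the hot data */
	BUILD_BUG_ON(offsetof(struct clock_data, wrap_kt) > 64);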
Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com --- kernel/time/sched_clock.c | 113 +++++++++++++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 36 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 3d21a8719444..695b2ac2e8b4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -18,28 +18,59 @@ #include <linux/seqlock.h> #include <linux/bitops.h>
-struct clock_data { - ktime_t wrap_kt; +/** + * struct clock_read_data - data required to read from sched_clock + * + * @epoch_ns: sched_clock value at last update + * @epoch_cyc: Clock cycle value at last update + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks + * @read_sched_clock: Current clock source (or dummy source when suspended) + * @mult: Multipler for scaled math conversion + * @shift: Shift value for scaled math conversion + * @suspended: Flag to indicate if the clock is suspended (stopped) + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=48 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { u64 epoch_ns; u64 epoch_cyc; - seqcount_t seq; - unsigned long rate; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); u32 mult; u32 shift; bool suspended; };
+/** + * struct clock_data - all data needed for sched_clock (including + * registration of a new clock source) + * + * @seq: Sequence counter for protecting updates. + * @read_data: Data required to read from sched_clock. + * @wrap_kt: Duration for which clock can run before wrapping + * @rate: Tick rate of the registered clock + * @actual_read_sched_clock: Registered clock read function + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular seq and read_data (combined) should fit + * into a single 64 byte cache line. + */ +struct clock_data { + seqcount_t seq; + struct clock_read_data read_data; + ktime_t wrap_kt; + unsigned long rate; +}; + static struct hrtimer sched_clock_timer; static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
-static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u64 __read_mostly sched_clock_mask; - static u64 notrace jiffy_sched_clock_read(void) { /* @@ -49,7 +80,10 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); }
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static struct clock_data cd ____cacheline_aligned = { + .read_data = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, +};
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { @@ -60,15 +94,16 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; + struct clock_read_data *rd = &cd.read_data;
do { seq = raw_read_seqcount_begin(&cd.seq);
- res = cd.epoch_ns; - if (!cd.suspended) { - cyc = read_sched_clock(); - cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; - res += cyc_to_ns(cyc, cd.mult, cd.shift); + res = rd->epoch_ns; + if (!rd->suspended) { + cyc = rd->read_sched_clock(); + cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; + res += cyc_to_ns(cyc, rd->mult, rd->shift); } } while (read_seqcount_retry(&cd.seq, seq));
@@ -83,16 +118,17 @@ static void notrace update_sched_clock(void) unsigned long flags; u64 cyc; u64 ns; + struct clock_read_data *rd = &cd.read_data;
- cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift);
raw_local_irq_save(flags); raw_write_seqcount_begin(&cd.seq); - cd.epoch_ns = ns; - cd.epoch_cyc = cyc; + rd->epoch_ns = ns; + rd->epoch_cyc = cyc; raw_write_seqcount_end(&cd.seq); raw_local_irq_restore(flags); } @@ -109,9 +145,9 @@ void __init sched_clock_register(u64 (*read)(void), int bits, { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - ktime_t new_wrap_kt; unsigned long r; char r_unit; + struct clock_read_data *rd = &cd.read_data;
if (cd.rate > rate) return; @@ -130,17 +166,18 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
/* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = read_sched_clock(); - ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift);
raw_write_seqcount_begin(&cd.seq); - read_sched_clock = read; - sched_clock_mask = new_mask; - cd.mult = new_mult; - cd.shift = new_shift; - cd.epoch_cyc = new_epoch; - cd.epoch_ns = ns; + rd->read_sched_clock = read; + rd->sched_clock_mask = new_mask; + rd->mult = new_mult; + rd->shift = new_shift; + rd->epoch_cyc = new_epoch; + rd->epoch_ns = ns; raw_write_seqcount_end(&cd.seq);
r = rate; @@ -172,7 +209,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (read_sched_clock == jiffy_sched_clock_read) + if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock(); @@ -188,17 +225,21 @@ void __init sched_clock_postinit(void)
static int sched_clock_suspend(void) { + struct clock_read_data *rd = &cd.read_data; + update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - cd.suspended = true; + rd->suspended = true; return 0; }
static void sched_clock_resume(void) { - cd.epoch_cyc = read_sched_clock(); + struct clock_read_data *rd = &cd.read_data; + + rd->epoch_cyc = rd->read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - cd.suspended = false; + rd->suspended = false; }
static struct syscore_ops sched_clock_ops = {
On Sun, Feb 08, 2015 at 12:02:37PM +0000, Daniel Thompson wrote:
Currently sched_clock(), a very hot code path, is not optimized to minimise its cache profile. In particular:
cd is not ____cacheline_aligned,
struct clock_data does not distinguish between hotpath and coldpath data, reducing locality of reference in the hotpath,
Some hotpath data is missing from struct clock_data and is marked __read_mostly (which more or less guarantees it will not share a cache line with cd).
This patch corrects these problems by extracting all hotpath data into a separate structure and using ____cacheline_aligned to ensure the hotpath uses a single (64 byte) cache line.
Have you got any performance figures for this change, or is this just a theoretical optimisation? It would be interesting to see what effect this has on systems with 32-byte cachelines and also scenarios where there's contention on the sequence counter.
Will
On 09/02/15 09:28, Will Deacon wrote:
On Sun, Feb 08, 2015 at 12:02:37PM +0000, Daniel Thompson wrote:
Currently sched_clock(), a very hot code path, is not optimized to minimise its cache profile. In particular:
cd is not ____cacheline_aligned,
struct clock_data does not distinguish between hotpath and coldpath data, reducing locality of reference in the hotpath,
Some hotpath data is missing from struct clock_data and is marked __read_mostly (which more or less guarantees it will not share a cache line with cd).
This patch corrects these problems by extracting all hotpath data into a separate structure and using ____cacheline_aligned to ensure the hotpath uses a single (64 byte) cache line.
Have you got any performance figures for this change, or is this just a theoretical optimisation? It would be interesting to see what effect this has on systems with 32-byte cachelines and also scenarios where there's contention on the sequence counter.
Most of my testing has focused on proving that the NMI safety parts of the patch work as advertised, so it's mostly theoretical.
However there are some numbers from simple tight loop calls to sched_clock (Stephen Boyd's results are more interesting than mine because I observe pretty wild quantization effects that render the results hard to trust): http://thread.gmane.org/gmane.linux.kernel/1871157/focus=1879265
I'm not sure what figures would be useful for a contended sequence counter. Firstly, the counter is taken for write at 7/8 of the wrap time, so even for the fastest timers the interval between updates is likely to be >3s and each update is of very short duration. Additionally, the NMI safety changes make it possible to read the timer whilst it is being updated, so it is only during the very short struct-copy/write/struct-copy/write update sequence that we will observe the extra cache line used for a read. Benchmarks that show the effect of an update are therefore non-trivial to construct.
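To put rough numbers on that first point (the counter width and rate below are purely illustrative, not a real platform): a 32-bit counter at 24MHz gives

	wrap    = 2^32 cycles / 24000000 Hz ~= 179s
	wrap_kt = wrap - wrap/8             ~= 157s

so even a fairly quick counter only forces an update every couple of minutes, and each update is just two structure copies and two sequence bumps.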
On 02/09/15 01:47, Daniel Thompson wrote:
On 09/02/15 09:28, Will Deacon wrote:
On Sun, Feb 08, 2015 at 12:02:37PM +0000, Daniel Thompson wrote:
Currently sched_clock(), a very hot code path, is not optimized to minimise its cache profile. In particular:
cd is not ____cacheline_aligned,
struct clock_data does not distinguish between hotpath and coldpath data, reducing locality of reference in the hotpath,
Some hotpath data is missing from struct clock_data and is marked __read_mostly (which more or less guarantees it will not share a cache line with cd).
This patch corrects these problems by extracting all hotpath data into a separate structure and using ____cacheline_aligned to ensure the hotpath uses a single (64 byte) cache line.
Have you got any performance figures for this change, or is this just a theoretical optimisation? It would be interesting to see what effect this has on systems with 32-byte cachelines and also scenarios where there's contention on the sequence counter.
Most of my testing has focused on proving that the NMI safety parts of the patch work as advertised, so it's mostly theoretical.
However there are some numbers from simple tight loop calls to sched_clock (Stephen Boyd's results are more interesting than mine because I observe pretty wild quantization effects that render the results hard to trust): http://thread.gmane.org/gmane.linux.kernel/1871157/focus=1879265
I'm not sure what figures would be useful for a contended sequence counter. Firstly, the counter is taken for write at 7/8 of the wrap time, so even for the fastest timers the interval between updates is likely to be >3s and each update is of very short duration. Additionally, the NMI safety changes make it possible to read the timer whilst it is being updated, so it is only during the very short struct-copy/write/struct-copy/write update sequence that we will observe the extra cache line used for a read. Benchmarks that show the effect of an update are therefore non-trivial to construct.
Here are the raw numbers for the tight loop. I noticed that if I don't use perf I get a larger number of calls per 10s, most likely because we aren't doing anything else. These are very lightly loaded systems, i.e. a busybox ramdisk with nothing going on. The kernel version is v3.19-rc4. The CPU is Krait on msm8960 and msm8974, except that msm8974 has the ARM architected timers backing sched_clock() vs. our own custom timer IP on msm8960. The cache line size is 64 bytes. I also ran it on msm8660, which is a Scorpion CPU with the same timer as msm8960 (custom timer IP) and a cache line size of 32 bytes. Unfortunately nobody has ported Scorpion over to perf events, so we don't have hardware events there.
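For anyone wanting to reproduce this: sched_clock_test is essentially Daniel's loop wrapped in module boilerplate. A minimal sketch is below; the module name comes from the modprobe line and the boilerplate is a reconstruction rather than the actual test source:

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/sched.h>
	#include <linux/time.h>

	static int __init sched_clock_test_init(void)
	{
		unsigned long long clock, start_clock;
		int count = 0;

		/* spin on sched_clock() for 10 seconds and count the calls */
		clock = start_clock = sched_clock();
		while ((clock - start_clock) < 10ULL * NSEC_PER_SEC) {
			clock = sched_clock();
			count++;
		}

		pr_info("Made %d calls in %llu ns\n", count, clock - start_clock);
		return 0;
	}

	static void __exit sched_clock_test_exit(void)
	{
	}

	module_init(sched_clock_test_init);
	module_exit(sched_clock_test_exit);
	MODULE_LICENSE("GPL");

Each modprobe prints one line to the kernel log, and the rmmod in perf's --post lets the next run reload the module.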
msm8960 (before patch)
----------------------
# perf stat -r 10 --post "rmmod sched_clock_test" modprobe sched_clock_test
Made 14528449 calls in 10000000290 ns
Made 14528925 calls in 10000000142 ns
Made 14524549 calls in 10000000587 ns
Made 14528164 calls in 10000000734 ns
Made 14524468 calls in 10000000290 ns
Made 14527198 calls in 10000000438 ns
Made 14523508 calls in 10000000734 ns
Made 14527894 calls in 10000000290 ns
Made 14529609 calls in 10000000734 ns
Made 14523114 calls in 10000000142 ns
Performance counter stats for 'modprobe sched_clock_test' (10 runs):
   10009.635016 task-clock (msec)         # 1.000 CPUs utilized          ( +- 0.00% )
              7 context-switches          # 0.001 K/sec                  ( +- 16.16% )
              0 cpu-migrations             # 0.000 K/sec
             58 page-faults                # 0.006 K/sec
     4003806350 cycles                     # 0.400 GHz                    ( +- 0.00% )
              0 stalled-cycles-frontend    # 0.00% frontend cycles idle
              0 stalled-cycles-backend     # 0.00% backend cycles idle
      921924235 instructions               # 0.23 insns per cycle         ( +- 0.01% )
              0 branches                   # 0.000 K/sec
       58521151 branch-misses              # 5.846 M/sec                  ( +- 0.01% )
10.011767657 seconds time elapsed ( +- 0.00% )
msm8960 (after patch)
---------------------
# perf stat -r 10 --post "rmmod sched_clock_test" modprobe sched_clock_test
Made 19626366 calls in 10000000587 ns
Made 19623708 calls in 10000000142 ns
Made 19623282 calls in 10000000290 ns
Made 19625304 calls in 10000000290 ns
Made 19625151 calls in 10000000291 ns
Made 19624906 calls in 10000000290 ns
Made 19625383 calls in 10000000143 ns
Made 19625235 calls in 10000000290 ns
Made 19624969 calls in 10000000290 ns
Made 19625209 calls in 10000000438 ns
Performance counter stats for 'modprobe sched_clock_test' (10 runs):
   10009.883401 task-clock (msec)         # 1.000 CPUs utilized          ( +- 0.00% )
              7 context-switches          # 0.001 K/sec                  ( +- 15.88% )
              0 cpu-migrations             # 0.000 K/sec
             58 page-faults                # 0.006 K/sec
     4003901511 cycles                     # 0.400 GHz                    ( +- 0.00% )
              0 stalled-cycles-frontend    # 0.00% frontend cycles idle
              0 stalled-cycles-backend     # 0.00% backend cycles idle
     1164635790 instructions               # 0.29 insns per cycle         ( +- 0.00% )
              0 branches                   # 0.000 K/sec
       20039814 branch-misses              # 2.002 M/sec                  ( +- 0.00% )
10.012092383 seconds time elapsed ( +- 0.00% )
msm8974 (before patch)
----------------------
# perf stat -r 10 --post "rmmod sched_clock_test" modprobe sched_clock_test
Made 21289694 calls in 10000000083 ns
Made 21289072 calls in 10000000082 ns
Made 21289550 calls in 10000000395 ns
Made 21288892 calls in 10000000291 ns
Made 21288987 calls in 10000000135 ns
Made 21289140 calls in 10000000395 ns
Made 21289161 calls in 10000000395 ns
Made 21288911 calls in 10000000239 ns
Made 21289204 calls in 10000000135 ns
Made 21288738 calls in 10000000135 ns
Performance counter stats for 'modprobe sched_clock_test' (10 runs):
   10003.839348 task-clock (msec)         # 1.000 CPUs utilized          ( +- 0.00% )
              4 context-switches          # 0.000 K/sec                  ( +- 3.70% )
              0 cpu-migrations             # 0.000 K/sec
             58 page-faults                # 0.006 K/sec
     6146323757 cycles                     # 0.614 GHz                    ( +- 0.00% )
              0 stalled-cycles-frontend    # 0.00% frontend cycles idle
              0 stalled-cycles-backend     # 0.00% backend cycles idle
     1155527762 instructions               # 0.19 insns per cycle         ( +- 0.00% )
      107186099 branches                   # 10.714 M/sec                 ( +- 0.00% )
       35548359 branch-misses              # 33.17% of all branches       ( +- 0.00% )
10.004769053 seconds time elapsed ( +- 0.00% )
msm8974 (after patch)
---------------------
# perf stat -r 10 --post "rmmod sched_clock_test" modprobe sched_clock_test
Made 21289357 calls in 10000000239 ns
Made 21384961 calls in 10000000396 ns
Made 22105925 calls in 10000000238 ns
Made 27384126 calls in 10000000239 ns
Made 22107737 calls in 10000000134 ns
Made 21368867 calls in 10000000239 ns
Made 22106065 calls in 10000000395 ns
Made 27384196 calls in 10000000083 ns
Made 22107334 calls in 10000000291 ns
Made 21365426 calls in 10000000291 ns
Performance counter stats for 'modprobe sched_clock_test' (10 runs):
   10003.753333 task-clock (msec)         # 1.000 CPUs utilized          ( +- 0.00% )
              7 context-switches          # 0.001 K/sec                  ( +- 18.18% )
              0 cpu-migrations             # 0.000 K/sec
             58 page-faults                # 0.006 K/sec
     6837664600 cycles                     # 0.684 GHz                    ( +- 6.74% )
              0 stalled-cycles-frontend    # 0.00% frontend cycles idle
              0 stalled-cycles-backend     # 0.00% backend cycles idle
     1148993903 instructions               # 0.17 insns per cycle         ( +- 3.32% )
      115049358 branches                   # 11.501 M/sec                 ( +- 3.31% )
       42520513 branch-misses              # 36.96% of all branches       ( +- 5.00% )
10.004769533 seconds time elapsed ( +- 0.00% )
msm8660 (before patch)
----------------------
# perf stat -r 10 --post "rmmod sched_clock_test" modprobe sched_clock_test
Made 14099029 calls in 10000000586 ns
Made 14099227 calls in 10000000735 ns
Made 14098763 calls in 10000000439 ns
Made 14099042 calls in 10000000291 ns
Made 14099273 calls in 10000000290 ns
Made 14100377 calls in 10000000586 ns
Made 14100183 calls in 10000000586 ns
Made 14099220 calls in 10000000586 ns
Made 14098853 calls in 10000000587 ns
Made 14099368 calls in 10000000142 ns
Performance counter stats for 'modprobe sched_clock_test' (10 runs):
   10006.700528 task-clock (msec)         # 1.000 CPUs utilized          ( +- 0.00% )
             11 context-switches          # 0.001 K/sec                  ( +- 10.38% )
              0 cpu-migrations             # 0.000 K/sec
             56 page-faults                # 0.006 K/sec
              0 cycles                     # 0.000 GHz
              0 stalled-cycles-frontend    # 0.00% frontend cycles idle
              0 stalled-cycles-backend     # 0.00% backend cycles idle
              0 instructions
              0 branches                   # 0.000 K/sec
              0 branch-misses              # 0.000 K/sec
10.008796161 seconds time elapsed ( +- 0.00% )
msm8660 (after patch)
---------------------
# perf stat -r 10 --post "rmmod sched_clock_test" modprobe sched_clock_test
Made 20555901 calls in 10000000438 ns
Made 15510019 calls in 10000000142 ns
Made 15510371 calls in 10000000587 ns
Made 15509184 calls in 10000000439 ns
Made 15509068 calls in 10000000291 ns
Made 15510719 calls in 10000000439 ns
Made 15508899 calls in 10000000291 ns
Made 15509206 calls in 10000000587 ns
Made 15509057 calls in 10000000290 ns
Made 15509178 calls in 10000000735 ns
Performance counter stats for 'modprobe sched_clock_test' (10 runs):
   10009.491416 task-clock (msec)         # 1.000 CPUs utilized          ( +- 0.00% )
             13 context-switches          # 0.001 K/sec                  ( +- 10.82% )
              0 cpu-migrations             # 0.000 K/sec
             58 page-faults                # 0.006 K/sec
              0 cycles                     # 0.000 GHz
              0 stalled-cycles-frontend    # 0.00% frontend cycles idle
              0 stalled-cycles-backend     # 0.00% backend cycles idle
              0 instructions
              0 branches                   # 0.000 K/sec
              0 branch-misses              # 0.000 K/sec
10.011834087 seconds time elapsed ( +- 0.00% )
Currently cd.read_data.suspended is read by the hotpath function sched_clock(). This variable need not be accessed on the hotpath. In fact, once it is removed, we can remove the conditional branches from sched_clock() and install a dummy read_sched_clock function to suspend the clock.
A new master copy of the function pointer (actual_read_sched_clock) is introduced and is used for all reads of the clock hardware except those within sched_clock() itself.
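Put another way, the hot path no longer tests anything; suspend and resume simply swap the function pointer. A stripped-down sketch of the idea (the real code is in the diff below):

	/* suspend: make the clock appear to stop at the last epoch */
	rd->read_sched_clock = suspended_sched_clock_read;

	/* resume: switch back to the real hardware counter */
	rd->read_sched_clock = cd.actual_read_sched_clock;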
Suggested-by: Thomas Gleixner tglx@linutronix.de Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com --- kernel/time/sched_clock.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 695b2ac2e8b4..5d6407951fb8 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -28,10 +28,9 @@ * @read_sched_clock: Current clock source (or dummy source when suspended) * @mult: Multipler for scaled math conversion * @shift: Shift value for scaled math conversion - * @suspended: Flag to indicate if the clock is suspended (stopped) * * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=48 bytes and, when combined + * some very hot code paths. It occupies <=40 bytes and, when combined * with the seqcount used to synchronize access, comfortably fits into * a 64 byte cache line. */ @@ -42,7 +41,6 @@ struct clock_read_data { u64 (*read_sched_clock)(void); u32 mult; u32 shift; - bool suspended; };
/** @@ -64,6 +62,7 @@ struct clock_data { struct clock_read_data read_data; ktime_t wrap_kt; unsigned long rate; + u64 (*actual_read_sched_clock)(void); };
static struct hrtimer sched_clock_timer; @@ -83,6 +82,8 @@ static u64 notrace jiffy_sched_clock_read(void) static struct clock_data cd ____cacheline_aligned = { .read_data = { .mult = NSEC_PER_SEC / HZ, .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, + };
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -99,12 +100,9 @@ unsigned long long notrace sched_clock(void) do { seq = raw_read_seqcount_begin(&cd.seq);
- res = rd->epoch_ns; - if (!rd->suspended) { - cyc = rd->read_sched_clock(); - cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; - res += cyc_to_ns(cyc, rd->mult, rd->shift); - } + cyc = (rd->read_sched_clock() - rd->epoch_cyc) & + rd->sched_clock_mask; + res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); } while (read_seqcount_retry(&cd.seq, seq));
return res; @@ -120,7 +118,7 @@ static void notrace update_sched_clock(void) u64 ns; struct clock_read_data *rd = &cd.read_data;
- cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); @@ -166,10 +164,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
/* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); + cd.actual_read_sched_clock = read;
raw_write_seqcount_begin(&cd.seq); rd->read_sched_clock = read; @@ -209,7 +208,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock(); @@ -223,13 +222,24 @@ void __init sched_clock_postinit(void) hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); }
+/* + * Clock read function for use when the clock is suspended. + * + * This function makes it appear to sched_clock() as if the clock + * stopped counting at its last update. + */ +static u64 notrace suspended_sched_clock_read(void) +{ + return cd.read_data.epoch_cyc; +} + static int sched_clock_suspend(void) { struct clock_read_data *rd = &cd.read_data;
update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - rd->suspended = true; + rd->read_sched_clock = suspended_sched_clock_read; return 0; }
@@ -237,9 +247,9 @@ static void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data;
- rd->epoch_cyc = rd->read_sched_clock(); + rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - rd->suspended = false; + rd->read_sched_clock = cd.actual_read_sched_clock; }
static struct syscore_ops sched_clock_ops = {
Currently update_sched_clock() is marked as notrace but this function is not called by ftrace. This is trivially fixed by removing the mark up.
Signed-off-by: Daniel Thompson daniel.thompson@linaro.org --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 5d6407951fb8..9280327676dc 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -111,7 +111,7 @@ unsigned long long notrace sched_clock(void) /* * Atomically update the sched_clock epoch. */ -static void notrace update_sched_clock(void) +static void update_sched_clock(void) { unsigned long flags; u64 cyc;
Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has locked the seqcount for writing. This results in the NMI handler locking up when it calls raw_read_seqcount_begin().
This patch fixes the NMI safety issues by providing banked clock data. This is a similar approach to the one used in Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC").
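Stripped of the surrounding detail, the banked scheme boils down to the sketch below. This is a simplification of the code in the diff, not a replacement for it; the memory ordering comes from raw_write_seqcount_latch() and read_seqcount_retry():

	/* update: publish via the odd copy, then refresh the even copy */
	cd.read_data[1] = *rd;
	raw_write_seqcount_latch(&cd.seq);	/* seq becomes odd */
	cd.read_data[0] = *rd;
	raw_write_seqcount_latch(&cd.seq);	/* seq becomes even */

	/* read: use whichever copy the sequence count selects, retry if it moved */
	do {
		seq = raw_read_seqcount(&cd.seq);
		rd = cd.read_data + (seq & 1);
		/* ... compute the result from *rd ... */
	} while (read_seqcount_retry(&cd.seq, seq));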
Suggested-by: Stephen Boyd sboyd@codeaurora.org Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com --- kernel/time/sched_clock.c | 103 ++++++++++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 35 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 9280327676dc..a23d98c33dab 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -47,19 +47,20 @@ struct clock_read_data { * struct clock_data - all data needed for sched_clock (including * registration of a new clock source) * - * @seq: Sequence counter for protecting updates. + * @seq: Sequence counter for protecting updates. The lowest + * bit is the index for @read_data. * @read_data: Data required to read from sched_clock. * @wrap_kt: Duration for which clock can run before wrapping * @rate: Tick rate of the registered clock * @actual_read_sched_clock: Registered clock read function * * The ordering of this structure has been chosen to optimize cache - * performance. In particular seq and read_data (combined) should fit + * performance. In particular seq and read_data[0] (combined) should fit * into a single 64 byte cache line. */ struct clock_data { seqcount_t seq; - struct clock_read_data read_data; + struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; u64 (*actual_read_sched_clock)(void); @@ -80,10 +81,9 @@ static u64 notrace jiffy_sched_clock_read(void) }
static struct clock_data cd ____cacheline_aligned = { - .read_data = { .mult = NSEC_PER_SEC / HZ, - .read_sched_clock = jiffy_sched_clock_read, }, + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, .actual_read_sched_clock = jiffy_sched_clock_read, - };
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -95,10 +95,11 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd;
do { - seq = raw_read_seqcount_begin(&cd.seq); + seq = raw_read_seqcount(&cd.seq); + rd = cd.read_data + (seq & 1);
cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; @@ -109,26 +110,50 @@ unsigned long long notrace sched_clock(void) }
/* + * Updating the data required to read the clock. + * + * sched_clock will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the odd copy */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); +} + +/* * Atomically update the sched_clock epoch. */ static void update_sched_clock(void) { - unsigned long flags; u64 cyc; u64 ns; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd; + + rd = cd.read_data[0];
cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); - - raw_local_irq_save(flags); - raw_write_seqcount_begin(&cd.seq); - rd->epoch_ns = ns; - rd->epoch_cyc = cyc; - raw_write_seqcount_end(&cd.seq); - raw_local_irq_restore(flags); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); + + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; + + update_clock_read_data(&rd); }
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) @@ -145,7 +170,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, u32 new_mult, new_shift; unsigned long r; char r_unit; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd;
if (cd.rate > rate) return; @@ -162,22 +187,23 @@ void __init sched_clock_register(u64 (*read)(void), int bits, wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+ rd = cd.read_data[0]; + /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); cd.actual_read_sched_clock = read;
- raw_write_seqcount_begin(&cd.seq); - rd->read_sched_clock = read; - rd->sched_clock_mask = new_mask; - rd->mult = new_mult; - rd->shift = new_shift; - rd->epoch_cyc = new_epoch; - rd->epoch_ns = ns; - raw_write_seqcount_end(&cd.seq); + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + update_clock_read_data(&rd);
r = rate; if (r >= 4000000) { @@ -227,15 +253,22 @@ void __init sched_clock_postinit(void) * * This function makes it appear to sched_clock() as if the clock * stopped counting at its last update. + * + * This function must only be called from the critical + * section in sched_clock(). It relies on the read_seqcount_retry() + * at the end of the critical section to be sure we observe the + * correct copy of epoch_cyc. */ static u64 notrace suspended_sched_clock_read(void) { - return cd.read_data.epoch_cyc; + unsigned long seq = raw_read_seqcount(&cd.seq); + + return cd.read_data[seq & 1].epoch_cyc; }
static int sched_clock_suspend(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0];
update_sched_clock(); hrtimer_cancel(&sched_clock_timer); @@ -245,7 +278,7 @@ static int sched_clock_suspend(void)
static void sched_clock_resume(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0];
rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
On 02/08/15 04:02, Daniel Thompson wrote:
This patchset optimizes the generic sched_clock implementation by removing branches and significantly reducing the data cache profile. It also makes it safe to call sched_clock() from NMI (or FIQ on ARM).
The data cache profile of sched_clock() in the original code is somewhere between 2 and 3 (64-byte) cache lines, depending on alignment of struct clock_data. After patching, the cache profile for the normal case should be a single cacheline.
NMI safety was tested on i.MX6 with perf drowning the system in FIQs and using the perf handler to check that sched_clock() returned monotonic values. At the same time I forcefully reduced kt_wrap so that update_sched_clock() is being called at >1000Hz.
Without the patches the above system is grossly unstable, surviving [9K,115K,25K] perf event cycles during three separate runs. With the patch I ran for over 9M perf event cycles before getting bored.
Looks good to me. For the series:
Reviewed-by: Stephen Boyd sboyd@codeaurora.org
This patchset optimizes the generic sched_clock implementation by removing branches and significantly reducing the data cache profile. It also makes it safe to call sched_clock() from NMI (or FIQ on ARM).
The data cache profile of sched_clock() in the original code is somewhere between 2 and 3 (64-byte) cache lines, depending on alignment of struct clock_data. After patching, the cache profile for the normal case should be a single cacheline.
NMI safety was tested on i.MX6 with perf drowning the system in FIQs and using the perf handler to check that sched_clock() returned monotonic values. At the same time I forcefully reduced kt_wrap so that update_sched_clock() is being called at >1000Hz.
Without the patches the above system is grossly unstable, surviving [9K,115K,25K] perf event cycles during three separate runs. With the patch I ran for over 9M perf event cycles before getting bored.
Performance testing has primarily been performed using a simple tight loop test (i.e. one that is unlikely to benefit from the cache profile improvements). Summary results show benefit on all CPUs although magnitude varies significantly:
Cortex A9 @ 792MHz    4.1% speedup
Cortex A9 @ 1GHz      0.4% speedup (different SoC to above)
Scorpion             13.6% speedup
Krait                35.1% speedup
Cortex A53 @ 1GHz     1.6% speedup
Cortex A57 @ 1GHz     5.0% speedup
Benchmarking was done by Stephen Boyd and myself, full data for the above summaries can be found here: https://docs.google.com/spreadsheets/d/1Zd2xN42U4oAVZcArqAYdAWgFI5oDFRysURCS...
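The percentages are simply the change in the mean call count of the ten runs; the Krait figure, for example, can be recovered from the msm8960 logs Stephen posted earlier in the thread:

	before: ~14.53M calls in 10s (mean of 10 runs)
	after:  ~19.62M calls in 10s (mean of 10 runs)
	19.62 / 14.53 ~= 1.351, i.e. ~35.1% more calls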
v5:
* Summarized benchmark results in the patchset cover letter and added
  some Reviewed-by:s.
* Rebased on 4.0-rc1.

v4:
* Optimized sched_clock() to be branchless by introducing a dummy function
  to provide clock values while the clock is suspended (Stephen Boyd).
* Improved commenting, including the kerneldoc comments (Stephen Boyd).
* Removed a redundant notrace from the update logic (Steven Rostedt).

v3:
* Optimized to minimise cache profile, including elimination of the
  suspended flag (Thomas Gleixner).
* Replaced the update_bank_begin/end with a single update function
  (Thomas Gleixner).
* Split into multiple patches to aid review.

v2:
* Extended the scope of the read lock in sched_clock() so we can bank
  all data consumed there (John Stultz)

Daniel Thompson (5):
  sched_clock: Match scope of read and write seqcounts
  sched_clock: Optimize cache line usage
  sched_clock: Remove suspend from clock_read_data
  sched_clock: Remove redundant notrace from update function
  sched_clock: Avoid deadlock during read from NMI
kernel/time/sched_clock.c | 195 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 138 insertions(+), 57 deletions(-)
-- 2.1.0
Currently the scope of the raw_write_seqcount_begin/end in sched_clock_register far exceeds the scope of the read section in sched_clock. This gives the impression of safety during cursory review but achieves little.
Note that this is likely to be a latent issue at present because sched_clock_register() is typically called before we enable interrupts, however the issue does risk bugs being needlessly introduced as the code evolves.
This patch fixes the problem by increasing the scope of the read locking performed by sched_clock() to cover all data modified by sched_clock_register.
We also improve clarity by moving writes to struct clock_data that do not impact sched_clock() outside of the critical section.
Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Reviewed-by: Stephen Boyd sboyd@codeaurora.org --- kernel/time/sched_clock.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..3d21a8719444 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -58,23 +58,21 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
unsigned long long notrace sched_clock(void) { - u64 epoch_ns; - u64 epoch_cyc; - u64 cyc; + u64 cyc, res; unsigned long seq;
- if (cd.suspended) - return cd.epoch_ns; - do { seq = raw_read_seqcount_begin(&cd.seq); - epoch_cyc = cd.epoch_cyc; - epoch_ns = cd.epoch_ns; + + res = cd.epoch_ns; + if (!cd.suspended) { + cyc = read_sched_clock(); + cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; + res += cyc_to_ns(cyc, cd.mult, cd.shift); + } } while (read_seqcount_retry(&cd.seq, seq));
- cyc = read_sched_clock(); - cyc = (cyc - epoch_cyc) & sched_clock_mask; - return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); + return res; }
/* @@ -124,10 +122,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits, clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate;
/* calculate how many ns until we wrap */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); - new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
/* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); @@ -138,8 +137,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, raw_write_seqcount_begin(&cd.seq); read_sched_clock = read; sched_clock_mask = new_mask; - cd.rate = rate; - cd.wrap_kt = new_wrap_kt; cd.mult = new_mult; cd.shift = new_shift; cd.epoch_cyc = new_epoch;
Currently sched_clock(), a very hot code path, is not optimized to minimise its cache profile. In particular:
1. cd is not ____cacheline_aligned,
2. struct clock_data does not distinguish between hotpath and coldpath data, reducing locality of reference in the hotpath,
3. Some hotpath data is missing from struct clock_data and is marked __read_mostly (which more or less guarantees it will not share a cache line with cd).
This patch corrects these problems by extracting all hotpath data into a separate structure and using ____cacheline_aligned to ensure the hotpath uses a single (64 byte) cache line.
Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Reviewed-by: Stephen Boyd sboyd@codeaurora.org --- kernel/time/sched_clock.c | 113 +++++++++++++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 36 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 3d21a8719444..695b2ac2e8b4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -18,28 +18,59 @@ #include <linux/seqlock.h> #include <linux/bitops.h>
-struct clock_data { - ktime_t wrap_kt; +/** + * struct clock_read_data - data required to read from sched_clock + * + * @epoch_ns: sched_clock value at last update + * @epoch_cyc: Clock cycle value at last update + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks + * @read_sched_clock: Current clock source (or dummy source when suspended) + * @mult: Multipler for scaled math conversion + * @shift: Shift value for scaled math conversion + * @suspended: Flag to indicate if the clock is suspended (stopped) + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=48 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { u64 epoch_ns; u64 epoch_cyc; - seqcount_t seq; - unsigned long rate; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); u32 mult; u32 shift; bool suspended; };
+/** + * struct clock_data - all data needed for sched_clock (including + * registration of a new clock source) + * + * @seq: Sequence counter for protecting updates. + * @read_data: Data required to read from sched_clock. + * @wrap_kt: Duration for which clock can run before wrapping + * @rate: Tick rate of the registered clock + * @actual_read_sched_clock: Registered clock read function + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular seq and read_data (combined) should fit + * into a single 64 byte cache line. + */ +struct clock_data { + seqcount_t seq; + struct clock_read_data read_data; + ktime_t wrap_kt; + unsigned long rate; +}; + static struct hrtimer sched_clock_timer; static int irqtime = -1;
core_param(irqtime, irqtime, int, 0400);
-static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u64 __read_mostly sched_clock_mask; - static u64 notrace jiffy_sched_clock_read(void) { /* @@ -49,7 +80,10 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); }
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static struct clock_data cd ____cacheline_aligned = { + .read_data = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, +};
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { @@ -60,15 +94,16 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; + struct clock_read_data *rd = &cd.read_data;
do { seq = raw_read_seqcount_begin(&cd.seq);
- res = cd.epoch_ns; - if (!cd.suspended) { - cyc = read_sched_clock(); - cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; - res += cyc_to_ns(cyc, cd.mult, cd.shift); + res = rd->epoch_ns; + if (!rd->suspended) { + cyc = rd->read_sched_clock(); + cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; + res += cyc_to_ns(cyc, rd->mult, rd->shift); } } while (read_seqcount_retry(&cd.seq, seq));
@@ -83,16 +118,17 @@ static void notrace update_sched_clock(void) unsigned long flags; u64 cyc; u64 ns; + struct clock_read_data *rd = &cd.read_data;
- cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift);
raw_local_irq_save(flags); raw_write_seqcount_begin(&cd.seq); - cd.epoch_ns = ns; - cd.epoch_cyc = cyc; + rd->epoch_ns = ns; + rd->epoch_cyc = cyc; raw_write_seqcount_end(&cd.seq); raw_local_irq_restore(flags); } @@ -109,9 +145,9 @@ void __init sched_clock_register(u64 (*read)(void), int bits, { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - ktime_t new_wrap_kt; unsigned long r; char r_unit; + struct clock_read_data *rd = &cd.read_data;
if (cd.rate > rate) return; @@ -130,17 +166,18 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
/* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = read_sched_clock(); - ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift);
raw_write_seqcount_begin(&cd.seq); - read_sched_clock = read; - sched_clock_mask = new_mask; - cd.mult = new_mult; - cd.shift = new_shift; - cd.epoch_cyc = new_epoch; - cd.epoch_ns = ns; + rd->read_sched_clock = read; + rd->sched_clock_mask = new_mask; + rd->mult = new_mult; + rd->shift = new_shift; + rd->epoch_cyc = new_epoch; + rd->epoch_ns = ns; raw_write_seqcount_end(&cd.seq);
r = rate; @@ -172,7 +209,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (read_sched_clock == jiffy_sched_clock_read) + if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock(); @@ -188,17 +225,21 @@ void __init sched_clock_postinit(void)
static int sched_clock_suspend(void) { + struct clock_read_data *rd = &cd.read_data; + update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - cd.suspended = true; + rd->suspended = true; return 0; }
static void sched_clock_resume(void) { - cd.epoch_cyc = read_sched_clock(); + struct clock_read_data *rd = &cd.read_data; + + rd->epoch_cyc = rd->read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - cd.suspended = false; + rd->suspended = false; }
static struct syscore_ops sched_clock_ops = {
Currently cd.read_data.suspended is read by the hotpath function sched_clock(). This variable need not be accessed on the hotpath. In fact, once it is removed, we can remove the conditional branches from sched_clock() and install a dummy read_sched_clock function to suspend the clock.
The new master copy of the function pointer (actual_read_sched_clock) is introduced and is used for all reads of the clock hardware except those within sched_clock itself.
Suggested-by: Thomas Gleixner tglx@linutronix.de Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Reviewed-by: Stephen Boyd sboyd@codeaurora.org --- kernel/time/sched_clock.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 695b2ac2e8b4..5d6407951fb8 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -28,10 +28,9 @@ * @read_sched_clock: Current clock source (or dummy source when suspended) * @mult: Multipler for scaled math conversion * @shift: Shift value for scaled math conversion - * @suspended: Flag to indicate if the clock is suspended (stopped) * * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=48 bytes and, when combined + * some very hot code paths. It occupies <=40 bytes and, when combined * with the seqcount used to synchronize access, comfortably fits into * a 64 byte cache line. */ @@ -42,7 +41,6 @@ struct clock_read_data { u64 (*read_sched_clock)(void); u32 mult; u32 shift; - bool suspended; };
/** @@ -64,6 +62,7 @@ struct clock_data { struct clock_read_data read_data; ktime_t wrap_kt; unsigned long rate; + u64 (*actual_read_sched_clock)(void); };
static struct hrtimer sched_clock_timer; @@ -83,6 +82,8 @@ static u64 notrace jiffy_sched_clock_read(void) static struct clock_data cd ____cacheline_aligned = { .read_data = { .mult = NSEC_PER_SEC / HZ, .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, + };
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -99,12 +100,9 @@ unsigned long long notrace sched_clock(void) do { seq = raw_read_seqcount_begin(&cd.seq);
- res = rd->epoch_ns; - if (!rd->suspended) { - cyc = rd->read_sched_clock(); - cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; - res += cyc_to_ns(cyc, rd->mult, rd->shift); - } + cyc = (rd->read_sched_clock() - rd->epoch_cyc) & + rd->sched_clock_mask; + res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); } while (read_seqcount_retry(&cd.seq, seq));
return res; @@ -120,7 +118,7 @@ static void notrace update_sched_clock(void) u64 ns; struct clock_read_data *rd = &cd.read_data;
- cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); @@ -166,10 +164,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
/* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); + cd.actual_read_sched_clock = read;
raw_write_seqcount_begin(&cd.seq); rd->read_sched_clock = read; @@ -209,7 +208,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
update_sched_clock(); @@ -223,13 +222,24 @@ void __init sched_clock_postinit(void) hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); }
+/* + * Clock read function for use when the clock is suspended. + * + * This function makes it appear to sched_clock() as if the clock + * stopped counting at its last update. + */ +static u64 notrace suspended_sched_clock_read(void) +{ + return cd.read_data.epoch_cyc; +} + static int sched_clock_suspend(void) { struct clock_read_data *rd = &cd.read_data;
update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - rd->suspended = true; + rd->read_sched_clock = suspended_sched_clock_read; return 0; }
@@ -237,9 +247,9 @@ static void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data;
- rd->epoch_cyc = rd->read_sched_clock(); + rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - rd->suspended = false; + rd->read_sched_clock = cd.actual_read_sched_clock; }
static struct syscore_ops sched_clock_ops = {
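The essence of the change above is that the hot path dereferences a function pointer unconditionally, and suspend/resume is implemented by swapping which function that pointer targets; the branch on a suspended flag disappears from the reader entirely. Below is a minimal userspace sketch of just that idea, with hypothetical names (hw_clock_read, read_clock, actual_read_clock) and no locking; it is an illustration of the technique, not the kernel code:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's clock state (illustrative only). */
static uint64_t fake_counter;
static uint64_t epoch_cyc;

static uint64_t hw_clock_read(void)
{
	return fake_counter;		/* stand-in for reading the counter hardware */
}

/* Whilst suspended, report the cycle count frozen at the last update. */
static uint64_t suspended_clock_read(void)
{
	return epoch_cyc;
}

/* Hot-path pointer: the reader needs no 'if (suspended)' branch. */
static uint64_t (*read_clock)(void) = hw_clock_read;

/* Master copy, used by everything except the hot path itself. */
static uint64_t (*actual_read_clock)(void) = hw_clock_read;

static void clock_suspend(void)
{
	epoch_cyc = actual_read_clock();	/* freeze the epoch */
	read_clock = suspended_clock_read;	/* readers now see a stopped clock */
}

static void clock_resume(void)
{
	epoch_cyc = actual_read_clock();	/* re-sync before exposing the hardware */
	read_clock = actual_read_clock;
}

int main(void)
{
	fake_counter = 100;
	printf("running:   %llu\n", (unsigned long long)read_clock());
	clock_suspend();
	fake_counter = 500;			/* the hardware keeps counting... */
	printf("suspended: %llu\n", (unsigned long long)read_clock());
	clock_resume();
	printf("resumed:   %llu\n", (unsigned long long)read_clock());
	return 0;
}

Running the sketch shows the reader reporting the frozen epoch whilst "suspended" even though the underlying counter keeps advancing, which is the behaviour suspended_sched_clock_read() provides in the patch.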
Currently update_sched_clock() is marked as notrace, but this function is not called by ftrace, so the annotation is redundant. This is trivially fixed by removing the markup.
Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Reviewed-by: Stephen Boyd sboyd@codeaurora.org --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 5d6407951fb8..9280327676dc 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -111,7 +111,7 @@ unsigned long long notrace sched_clock(void) /* * Atomically update the sched_clock epoch. */ -static void notrace update_sched_clock(void) +static void update_sched_clock(void) { unsigned long flags; u64 cyc;
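For context (my gloss, not part of the patch): notrace usually expands to the compiler attribute that keeps tracing instrumentation out of a function, and it only matters for code that can run while ftrace is taking a timestamp, i.e. sched_clock() and anything it calls. update_sched_clock() runs from timer and registration context only, so marking it buys nothing. A small illustrative fragment, with hypothetical function names:

/* Simplified; the kernel's definition in include/linux/compiler.h is
 * config-dependent, but this is the common expansion. */
#define notrace __attribute__((no_instrument_function))

/* Must stay untraced: a stand-in for the sched_clock() read path that
 * ftrace itself uses for timestamps (instrumenting it would recurse). */
static notrace unsigned long long read_path(void)
{
	return 0;	/* stand-in for the epoch/cyc arithmetic */
}

/* No notrace needed: only ever runs from timer or registration
 * context, never from inside the tracer. */
static void slow_path_update(void)
{
	(void)read_path();
}

int main(void)
{
	slow_path_update();
	return 0;
}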
Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has locked the seqcount for writing. This results in the NMI handler locking up when it calls raw_read_seqcount_begin().
This patch fixes the NMI safety issues by providing banked clock data. This is a similar approach to the one used in Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC").
Suggested-by: Stephen Boyd sboyd@codeaurora.org Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Cc: Russell King linux@arm.linux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Catalin Marinas catalin.marinas@arm.com Reviewed-by: Stephen Boyd sboyd@codeaurora.org --- kernel/time/sched_clock.c | 103 ++++++++++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 35 deletions(-)
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 9280327676dc..a23d98c33dab 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -47,19 +47,20 @@ struct clock_read_data { * struct clock_data - all data needed for sched_clock (including * registration of a new clock source) * - * @seq: Sequence counter for protecting updates. + * @seq: Sequence counter for protecting updates. The lowest + * bit is the index for @read_data. * @read_data: Data required to read from sched_clock. * @wrap_kt: Duration for which clock can run before wrapping * @rate: Tick rate of the registered clock * @actual_read_sched_clock: Registered clock read function * * The ordering of this structure has been chosen to optimize cache - * performance. In particular seq and read_data (combined) should fit + * performance. In particular seq and read_data[0] (combined) should fit * into a single 64 byte cache line. */ struct clock_data { seqcount_t seq; - struct clock_read_data read_data; + struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; u64 (*actual_read_sched_clock)(void); @@ -80,10 +81,9 @@ static u64 notrace jiffy_sched_clock_read(void) }
static struct clock_data cd ____cacheline_aligned = { - .read_data = { .mult = NSEC_PER_SEC / HZ, - .read_sched_clock = jiffy_sched_clock_read, }, + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, .actual_read_sched_clock = jiffy_sched_clock_read, - };
static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -95,10 +95,11 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd;
do { - seq = raw_read_seqcount_begin(&cd.seq); + seq = raw_read_seqcount(&cd.seq); + rd = cd.read_data + (seq & 1);
cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; @@ -109,26 +110,50 @@ unsigned long long notrace sched_clock(void) }
/* + * Updating the data required to read the clock. + * + * sched_clock will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the odd copy */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); +} + +/* * Atomically update the sched_clock epoch. */ static void update_sched_clock(void) { - unsigned long flags; u64 cyc; u64 ns; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd; + + rd = cd.read_data[0];
cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); - - raw_local_irq_save(flags); - raw_write_seqcount_begin(&cd.seq); - rd->epoch_ns = ns; - rd->epoch_cyc = cyc; - raw_write_seqcount_end(&cd.seq); - raw_local_irq_restore(flags); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); + + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; + + update_clock_read_data(&rd); }
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) @@ -145,7 +170,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, u32 new_mult, new_shift; unsigned long r; char r_unit; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd;
if (cd.rate > rate) return; @@ -162,22 +187,23 @@ void __init sched_clock_register(u64 (*read)(void), int bits, wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+ rd = cd.read_data[0]; + /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); cd.actual_read_sched_clock = read;
- raw_write_seqcount_begin(&cd.seq); - rd->read_sched_clock = read; - rd->sched_clock_mask = new_mask; - rd->mult = new_mult; - rd->shift = new_shift; - rd->epoch_cyc = new_epoch; - rd->epoch_ns = ns; - raw_write_seqcount_end(&cd.seq); + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + update_clock_read_data(&rd);
r = rate; if (r >= 4000000) { @@ -227,15 +253,22 @@ void __init sched_clock_postinit(void) * * This function makes it appear to sched_clock() as if the clock * stopped counting at its last update. + * + * This function must only be called from the critical + * section in sched_clock(). It relies on the read_seqcount_retry() + * at the end of the critical section to be sure we observe the + * correct copy of epoch_cyc. */ static u64 notrace suspended_sched_clock_read(void) { - return cd.read_data.epoch_cyc; + unsigned long seq = raw_read_seqcount(&cd.seq); + + return cd.read_data[seq & 1].epoch_cyc; }
static int sched_clock_suspend(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0];
update_sched_clock(); hrtimer_cancel(&sched_clock_timer); @@ -245,7 +278,7 @@ static int sched_clock_suspend(void)
static void sched_clock_resume(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0];
rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
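To make the odd/even banking easier to experiment with outside the kernel, here is a minimal userspace model of the same technique. The names (snap, bank, read_snap, write_snap) and the use of C11 atomics are illustrative assumptions; the kernel relies on raw_read_seqcount(), raw_write_seqcount_latch() and its own barriers rather than anything shown here, and this sketch assumes a single writer:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for struct clock_read_data. */
struct snap {
	uint64_t epoch_ns;
	uint64_t epoch_cyc;
};

static _Atomic unsigned int seq;	/* lowest bit selects the bank */
static struct snap bank[2];		/* [0] normal copy, [1] update-time copy */

/*
 * Reader: never spins on the writer. If it interrupts an update it is
 * steered to the bank the writer is not touching; the final re-check
 * only discards the rare case where a full update happened mid-read.
 */
static struct snap read_snap(void)
{
	unsigned int s;
	struct snap v;

	do {
		s = atomic_load_explicit(&seq, memory_order_acquire);
		v = bank[s & 1];
		atomic_thread_fence(memory_order_acquire);
	} while (atomic_load_explicit(&seq, memory_order_relaxed) != s);

	return v;
}

/*
 * Single writer. seq_cst increments stand in for the kernel's
 * raw_write_seqcount_latch(), which uses explicit barriers instead.
 */
static void write_snap(const struct snap *v)
{
	bank[1] = *v;						/* backup (odd) copy  */
	atomic_fetch_add_explicit(&seq, 1, memory_order_seq_cst);
	bank[0] = *v;						/* normal (even) copy */
	atomic_fetch_add_explicit(&seq, 1, memory_order_seq_cst);
}

int main(void)
{
	struct snap s = { .epoch_ns = 1000, .epoch_cyc = 42 };

	write_snap(&s);
	s = read_snap();
	printf("%llu ns @ cycle %llu\n",
	       (unsigned long long)s.epoch_ns,
	       (unsigned long long)s.epoch_cyc);
	return 0;
}

The point of the design is that a reader, even one that fires as an NMI in the middle of write_snap(), is always directed at the bank the writer is not currently modifying, so it never blocks on the writer and never observes a mismatched epoch pair.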