Hi,
> Hi,
>
> the cpu frequency scaling never worked right (except on the 4.4 kernel
> from Marvell). If you use the 1000 MHz firmware you are running at just
> 800 MHz (this is the case on my board with the current firmware).
>
> Just have a look at what frequency the kernel thinks it is running at.
Ok, probably my bad here. By 'worked fine' I meant that this didn't lead to
any freezes or panics. I know the actual frequency wasn't set properly.
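For reference, the cpufreq sysfs nodes show what the kernel thinks it is
running at. A minimal C sketch that reads them, assuming only the standard
cpufreq sysfs layout (nothing board-specific):

	/* Print the frequency cpufreq reports for cpu0. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq", "r");
		unsigned long khz;

		if (!f) {
			perror("scaling_cur_freq");
			return 1;
		}
		if (fscanf(f, "%lu", &khz) == 1)
			printf("cpu0: %lu kHz\n", khz);
		fclose(f);
		return 0;
	}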
Regards
/Ilias
>
> Regards,
> Christian
>
> Ilias Apalodimas <ilias.apalodimas(a)linaro.org> schrieb am Do., 14. März
> 2019, 14:44:
>
> > Hello Christian,
> > > Hi,
> > >
> > > I assume you use the 1000 MHz firmware. This does also not work on my
> > > Rev 7 board. But I'm pretty sure this is not a problem of the patches,
> > > because if I take a newer kernel (4.19.20/27) without the patches it
> > > also does not work. A kernel 4.19.17 does work for me. My opinion is
> > > that this is another problem which only occurs now because the cpu
> > > frequency scaling is now working with the right frequencies.
> > I am not sure which firmware I am running; I did all my tests on 5.0.0,
> > and changing between governors worked fine without the patches.
> >
> > Regards
> > /Ilias
> > >
> > > Ilias Apalodimas <ilias.apalodimas(a)linaro.org> schrieb am Do., 14. März
> > > 2019, 13:15:
> > >
> > > > Hi Gregory,
> > > > > The clock parenting was not set up properly when DVFS was enabled.
> > > > > It was expected that the same clock source was used with and
> > > > > without DVFS, which was not the case.
> > > > >
> > > > > This patch fixes this issue, allowing the cpufreq support to work
> > > > > when the CPU clock source is not the default one.
> > > > >
> > > > > Fixes: 92ce45fb875d ("cpufreq: Add DVFS support for Armada 37xx")
> > > > > Cc: <stable(a)vger.kernel.org>
> > > > > Reported-by: Christian Neubert <christian.neubert.86(a)gmail.com>
> > > > > Reported-by: Ilias Apalodimas <ilias.apalodimas(a)linaro.org>
> > > > > Signed-off-by: Gregory CLEMENT <gregory.clement(a)bootlin.com>
> > > > > ---
> > > > >  drivers/clk/mvebu/armada-37xx-periph.c | 11 +++++++++++
> > > > >  1 file changed, 11 insertions(+)
> > > > >
> > > > > diff --git a/drivers/clk/mvebu/armada-37xx-periph.c b/drivers/clk/mvebu/armada-37xx-periph.c
> > > > > index 1f1cff428d78..26ed3c18a239 100644
> > > > > --- a/drivers/clk/mvebu/armada-37xx-periph.c
> > > > > +++ b/drivers/clk/mvebu/armada-37xx-periph.c
> > > > > @@ -671,6 +671,17 @@ static int armada_3700_add_composite_clk(const struct clk_periph_data *data,
> > > > >  		map = syscon_regmap_lookup_by_compatible(
> > > > >  					"marvell,armada-3700-nb-pm");
> > > > >  		pmcpu_clk->nb_pm_base = map;
> > > > > +
> > > > > +		/*
> > > > > +		 * Use the same parent when DVFS is enabled that the
> > > > > +		 * default parent received at boot time. When this
> > > > > +		 * function is called, DVFS is not enabled yet, so we
> > > > > +		 * get the default parent and we can set the parent
> > > > > +		 * for DVFS.
> > > > > +		 */
> > > > > +		if (clk_pm_cpu_set_parent(muxrate_hw,
> > > > > +					  clk_pm_cpu_get_parent(muxrate_hw)))
> > > > > +			dev_warn(dev, "Failed to setup default parent clock for DVFS\n");
> > > > >  	}
> > > > >
> > > > >  	*hw = clk_hw_register_composite(dev, data->name, data->parent_names,
> > > > > --
> > > > > 2.20.1
> > > > >
> > > > Applied this and selected only
> > > >
> > > > CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
> > > > CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
> > > > CONFIG_CPU_FREQ_GOV_POWERSAVE=y
> > > >
> > > > After changing the governor from 'powersave' to 'performance', the
> > > > board completely froze (I even lost access to the serial port).
> > > >
> > > > Cheers
> > > > /Ilias
> > > >
> >
The clock parenting was not set up properly when DVFS was enabled. It was
expected that the same clock source was used with and without DVFS, which
was not the case.

This patch fixes this issue, allowing the cpufreq support to work when the
CPU clock source is not the default one.
Fixes: 92ce45fb875d ("cpufreq: Add DVFS support for Armada 37xx")
Cc: <stable(a)vger.kernel.org>
Reported-by: Christian Neubert <christian.neubert.86(a)gmail.com>
Reported-by: Ilias Apalodimas <ilias.apalodimas(a)linaro.org>
Signed-off-by: Gregory CLEMENT <gregory.clement(a)bootlin.com>
---
drivers/clk/mvebu/armada-37xx-periph.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/drivers/clk/mvebu/armada-37xx-periph.c b/drivers/clk/mvebu/armada-37xx-periph.c
index 1f1cff428d78..26ed3c18a239 100644
--- a/drivers/clk/mvebu/armada-37xx-periph.c
+++ b/drivers/clk/mvebu/armada-37xx-periph.c
@@ -671,6 +671,17 @@ static int armada_3700_add_composite_clk(const struct clk_periph_data *data,
map = syscon_regmap_lookup_by_compatible(
"marvell,armada-3700-nb-pm");
pmcpu_clk->nb_pm_base = map;
+
+ /*
+ * Use the same parent when DVFS is enabled that the
+ * default parent received at boot time. When this
+ * function is called, DVFS is not enabled yet, so we
+ * get the default parent and we can set the parent
+ * for DVFS.
+ */
+ if (clk_pm_cpu_set_parent(muxrate_hw,
+ clk_pm_cpu_get_parent(muxrate_hw)))
+ dev_warn(dev, "Failed to setup default parent clock for DVFS\n");
}
*hw = clk_hw_register_composite(dev, data->name, data->parent_names,
--
2.20.1
From: Christian Neubert <christian.neubert.86(a)gmail.com>
The clock parenting was not set up properly when DVFS was enabled. It was
expected that the same clock source was used with and without DVFS, which
was not the case.

This patch fixes this issue, allowing the cpufreq support to work when the
CPU clock source is not the default one.
Fixes: 92ce45fb875d ("cpufreq: Add DVFS support for Armada 37xx")
Cc: <stable(a)vger.kernel.org>
[gregory: extract from a larger patch, modify comments and commit log]
Signed-off-by: Christian Neubert <christian.neubert.86(a)gmail.com>
Signed-off-by: Gregory CLEMENT <gregory.clement(a)bootlin.com>
---
drivers/cpufreq/armada-37xx-cpufreq.c | 20 +++++++++++++++++---
1 file changed, 17 insertions(+), 3 deletions(-)
diff --git a/drivers/cpufreq/armada-37xx-cpufreq.c b/drivers/cpufreq/armada-37xx-cpufreq.c
index 75491fc841a6..ad4463e4266e 100644
--- a/drivers/cpufreq/armada-37xx-cpufreq.c
+++ b/drivers/cpufreq/armada-37xx-cpufreq.c
@@ -162,11 +162,25 @@ static void __init armada37xx_cpufreq_dvfs_setup(struct regmap *base,
}
/*
- * Set cpu clock source, for all the level we keep the same
- * clock source that the one already configured. For this one
- * we need to use the clock framework
+ * Set CPU clock source, for all the level we keep the same
+ * clock source that the one already configured with DVS
+ * disabled. For this one we need to use the clock framework
*/
parent = clk_get_parent(clk);
+
+ /*
+ * Unset parent clock to force the clock framework setting again
+ * the clock parent
+ */
+ clk_set_parent(clk, NULL);
+
+ /*
+ * For the Armada 37xx CPU clocks, setting the parent will
+ * actually configure the parent when DVFS is enabled. At
+ * hardware level it will be a different register from the one
+ * read when doing clk_get_parent that will be set with
+ * clk_set_parent.
+ */
clk_set_parent(clk, parent);
}
--
2.20.1
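A note on the trick in the patch above: the common clock framework
short-circuits clk_set_parent() when the requested parent matches its
cached parent, which is why the parent must be unset first for the mux to
actually be reprogrammed. A hedged sketch of the same dance as a
standalone helper (the helper name is invented; clk_get_parent() and
clk_set_parent() are the real CCF calls):

	#include <linux/clk.h>

	/*
	 * Force the clock framework to reprogram the CPU mux with the
	 * boot-time parent. Once DVFS is enabled, setting the parent
	 * writes a different (DVFS) register than the one read here.
	 */
	static int reparent_for_dvfs_sketch(struct clk *clk)
	{
		struct clk *parent = clk_get_parent(clk);

		clk_set_parent(clk, NULL);	/* drop cached parent so the next call is not a no-op */
		return clk_set_parent(clk, parent);	/* actually reprogram the mux */
	}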
The FUSE filesystem server and the kernel client negotiate, during the
initialization phase, the maximum write size the client will ever issue.
Correspondingly, the filesystem server then queues sys_read calls to read
requests with a buffer capacity large enough to carry the request header
plus max_write bytes. A filesystem server is free to set its max_write
anywhere in the range [1·page, fc->max_pages·page]. In particular,
go-fuse[2] sets max_write to 64K by default, whereas the default
fc->max_pages corresponds to 128K. Libfuse also allows users to configure
max_write, but by default presets it to the possible maximum.
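For illustration, here is roughly how a libfuse (v3) filesystem lowers
max_write during the INIT handshake. This is a hedged sketch, not part of
the patch; fuse_conn_info and its max_write field are libfuse's, the
function name is made up:

	#define FUSE_USE_VERSION 31
	#include <fuse.h>

	/* Sketch: advertise a 64K max_write, as go-fuse does by default. */
	static void *myfs_init(struct fuse_conn_info *conn,
			       struct fuse_config *cfg)
	{
		(void)cfg;
		if (conn->max_write > 64 * 1024)
			conn->max_write = 64 * 1024;
		return NULL;
	}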
If max_write is < fc->max_pages·page, and in the NOTIFY_RETRIEVE handler we
allow retrieving more than max_write bytes, the corresponding prepared
NOTIFY_REPLY will be thrown away by fuse_dev_do_read, because the
filesystem server, in full correspondence with the server/client contract,
will only be queuing sys_read with ~max_write buffer capacity, and
fuse_dev_do_read throws away requests that cannot fit into the server
request buffer. For example, with go-fuse's 64K max_write and a default
128K fc->max_pages, a 128K NOTIFY_REPLY can never fit into the server's
read buffer. In turn, the filesystem server could get stuck waiting
indefinitely for the NOTIFY_REPLY, since the NOTIFY_RETRIEVE handler
returned OK, which clients understand to mean that the NOTIFY_REPLY was
queued and will be sent back.

Cap the requested size to the negotiated max_write to avoid the problem.
This aligns with the way the NOTIFY_RETRIEVE handler works, which already
unconditionally caps the requested retrieve size to fuse_conn->max_pages.
This way it should not hurt NOTIFY_RETRIEVE semantics if we return less
data than was originally requested.

Please see [1] for the context where the problem of a stuck filesystem was
hit for real, for how the situation was traced, and for a more involved
patch that did not make it into the tree.
[1] https://marc.info/?l=linux-fsdevel&m=155057023600853&w=2
[2] https://github.com/hanwen/go-fuse
Signed-off-by: Kirill Smelkov <kirr(a)nexedi.com>
Cc: Han-Wen Nienhuys <hanwen(a)google.com>
Cc: Jakob Unterwurzacher <jakobunt(a)gmail.com>
Cc: <stable(a)vger.kernel.org> # v2.6.36+
---
fs/fuse/dev.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8a63e52785e9..38e94bc43053 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1749,7 +1749,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
offset = outarg->offset & ~PAGE_MASK;
file_size = i_size_read(inode);
- num = outarg->size;
+ num = min(outarg->size, fc->max_write);
if (outarg->offset > file_size)
num = 0;
else if (outarg->offset + num > file_size)
--
2.21.0.225.g810b269d1a
Hi, Greg,
Patch for 4.9 (and below) is here:
https://patchwork.linux-mips.org/patch/21375/
Huacai
------------------ Original ------------------
From: "陈华才"<chenhc(a)lemote.com>;
Date: Thu, Mar 14, 2019 06:55 AM
To: "gregkh"<gregkh(a)linuxfoundation.org>;
Cc: "linux-kernel"<linux-kernel(a)vger.kernel.org>; "stable"<stable(a)vger.kernel.org>; "huangpei"<huangpei(a)loongson.cn>; "Paul Burton"<paul.burton(a)mips.com>; "Ralf Baechle"<ralf(a)linux-mips.org>; "ambrosehua"<ambrosehua(a)gmail.com>; "Steven J . Hill"<Steven.Hill(a)cavium.com>; "linux-mips"<linux-mips(a)linux-mips.org>; "Fuxin Zhang"<zhangfx(a)lemote.com>; "wuzhangjin"<wuzhangjin(a)gmail.com>; "Li Xuefeng"<lixuefeng(a)loongson.cn>; "Xu Chenghua"<xuchenghua(a)loongson.cn>; "Sasha Levin"<sashal(a)kernel.org>;
Subject: Re: Re: Re: [PATCH 4.9 81/96] MIPS: Loongson: Introduce and use loongson_llsc_mb()
Hi, Greg,
Only 4.9 (and below) needs the spinlock.h modification; 4.14 and 4.19 do
not need this, because they have been converted to qspinlock. And sorry for
my poor reply; I can only use a mobile phone now.
--- Original Message ---
From: "Greg Kroah-Hartman" <gregkh(a)linuxfoundation.org>
Date: Thursday, March 14, 2019, 4:58 AM
To: "陈华才" <chenhc(a)lemote.com>
Subject: Re: [PATCH 4.9 81/96] MIPS: Loongson: Introduce and use loongson_llsc_mb()
On Wed, Mar 13, 2019 at 09:17:15PM +0800, 陈华才 wrote:
> Hi, Greg,
>
> 4.9 needs a modification to spinlock.h; please wait for my patch.
>
>
>
> --- Original Message ---
> From: "Greg Kroah-Hartman" <gregkh(a)linuxfoundation.org>
> Date: Wednesday, March 13, 2019, 1:10 AM
> To: "linux-kernel" <linux-kernel(a)vger.kernel.org>
> Subject: [PATCH 4.9 81/96] MIPS: Loongson: Introduce and use loongson_llsc_mb()
> 4.9-stable review patch. If anyone has any objections, please let me know.
>
> ------------------
>
> [ Upstream commit e02e07e3127d8aec1f4bcdfb2fc52a2d99b4859e ]
>
> On the Loongson-2G/2H/3A/3B there is a hardware flaw: ll/sc and lld/scd
> have very weak ordering. We should add sync instructions "before each
> ll/lld" and "at the branch-target between ll/sc" as a workaround.
> Otherwise, this flaw will cause deadlock occasionally (e.g. when doing
> heavy load tests with LTP).
>
> Below is the explanation from the CPU designer:
>
> "For Loongson 3 family, when a memory access instruction (load, store,
> or prefetch)'ecuting occurs between the execution of LL and SC, the
> success or failure of SC is not predictable. Although programmer would
> not insert memory access instructions between LL and SC, the memory
> instructions before LL in program-order, may dynamically executed
> between the execution of LL/SC, so a memory fence (SYNC) is needed
> before LL/LLD to avoid this situation.
>
> Since Loongson-3A R2 (3A2000), we have improved our hardware design to
> handle this case. But we later deduce a rarely circumstance that some
> speculatively executed memory instructions due to branch misprediction
> between LL/SC still fall into the above case, so a memory fence (SYNC)
> at branch-target (if its target is not between LL/SC) is needed for
> Loongson 3A1000, 3B1500, 3A2000 and 3A3000.
>
> Our processor is continually evolving and we aim to to remove all these
> workaround-SYNCs around LL/SC for new-come processor."
>
> Here is an example:
>
> Both cpu1 and cpu2 simultaneously run atomic_add by 1 on the same atomic
> variable. This bug causes the 'sc' run by both cpus (in atomic_add) to
> succeed at the same time ('sc' returns 1), so the variable is sometimes
> only *added by 1*, which is wrong and unacceptable (it should be added
> by 2).
>
> Why disable fix-loongson3-llsc in the compiler? Because the compiler fix
> will cause problems in the kernel's __ex_table section.
>
> This patch fixes all the cases in the kernel, but:
>
> +. the fix at the end of futex_atomic_cmpxchg_inatomic is for the
> branch-target of 'bne'; in other cases smp_mb__before_llsc() and
> smp_llsc_mb() fix the ll and the branch-target coincidentally, such as
> atomic_sub_if_positive/cmpxchg/xchg, just like this one.
>
> +. Loongson 3 does not support CONFIG_EDAC_ATOMIC_SCRUB, so no need to
> touch edac.h
>
> +. local_ops and cmpxchg_local should not be affected by this bug since
> only the owner can write.
>
> +. mips_atomic_set for syscall.c is deprecated and rarely used, so just
> let it go
>
> Signed-off-by: Huacai Chen <chenhc(a)lemote.com>
> Signed-off-by: Huang Pei <huangpei(a)loongson.cn>
> [paul.burton(a)mips.com:
> - Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add
> a comment describing why it's there.
> - Make loongson_llsc_mb() a no-op when
> CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory
> barrier.
> - Add a comment describing the bug & how loongson_llsc_mb() helps
> in asm/barrier.h.]
> Signed-off-by: Paul Burton <paul.burton(a)mips.com>
> Cc: Ralf Baechle <ralf(a)linux-mips.org>
> Cc: ambrosehua(a)gmail.com
> Cc: Steven J . Hill <Steven.Hill(a)cavium.com>
> Cc: linux-mips(a)linux-mips.org
> Cc: Fuxin Zhang <zhangfx(a)lemote.com>
> Cc: Zhangjin Wu <wuzhangjin(a)gmail.com>
> Cc: Li Xuefeng <lixuefeng(a)loongson.cn>
> Cc: Xu Chenghua <xuchenghua(a)loongson.cn>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
> ---
> arch/mips/Kconfig | 15 ++++++++++++++
> arch/mips/include/asm/atomic.h | 6 ++++++
> arch/mips/include/asm/barrier.h | 36 +++++++++++++++++++++++++++++++++
> arch/mips/include/asm/bitops.h | 5 +++++
> arch/mips/include/asm/futex.h | 3 +++
> arch/mips/include/asm/pgtable.h | 2 ++
> arch/mips/loongson64/Platform | 23 +++++++++++++++++++++
> arch/mips/mm/tlbex.c | 10 +++++++++
> 8 files changed, 100 insertions(+)
Ok, I will go drop this from all stable queues now, thanks!
greg k-h
On the Loongson-2G/2H/3A/3B there is a hardware flaw: ll/sc and lld/scd
have very weak ordering. We should add sync instructions "before each
ll/lld" and "at the branch-target between ll/sc" as a workaround.
Otherwise, this flaw will cause deadlock occasionally (e.g. when doing
heavy load tests with LTP).
Below is the explanation from the CPU designer:
"For Loongson 3 family, when a memory access instruction (load, store,
or prefetch)'s executing occurs between the execution of LL and SC, the
success or failure of SC is not predictable. Although programmer would
not insert memory access instructions between LL and SC, the memory
instructions before LL in program-order, may dynamically executed
between the execution of LL/SC, so a memory fence (SYNC) is needed
before LL/LLD to avoid this situation.
Since Loongson-3A R2 (3A2000), we have improved our hardware design to
handle this case. But we later deduce a rarely circumstance that some
speculatively executed memory instructions due to branch misprediction
between LL/SC still fall into the above case, so a memory fence (SYNC)
at branch-target (if its target is not between LL/SC) is needed for
Loongson 3A1000, 3B1500, 3A2000 and 3A3000.
Our processor is continually evolving and we aim to to remove all these
workaround-SYNCs around LL/SC for new-come processor."
Here is an example:

Both cpu1 and cpu2 simultaneously run atomic_add by 1 on the same atomic
variable. This bug causes the 'sc' run by both cpus (in atomic_add) to
succeed at the same time ('sc' returns 1), so the variable is sometimes
only *added by 1*, which is wrong and unacceptable (it should be added
by 2).
Why disable fix-loongson3-llsc in the compiler? Because the compiler fix
will cause problems in the kernel's __ex_table section.
This patch fixes all the cases in the kernel, but:

+. the fix at the end of futex_atomic_cmpxchg_inatomic is for the
branch-target of 'bne'; in other cases smp_mb__before_llsc() and
smp_llsc_mb() fix the ll and the branch-target coincidentally, such as
atomic_sub_if_positive/cmpxchg/xchg, just like this one.

+. Loongson 3 does not support CONFIG_EDAC_ATOMIC_SCRUB, so no need to
touch edac.h

+. local_ops and cmpxchg_local should not be affected by this bug since
only the owner can write.

+. mips_atomic_set for syscall.c is deprecated and rarely used, so just
let it go
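To make the shape of the workaround concrete, here is a hedged sketch of
an ll/sc loop guarded the way the diff below guards the real ones.
loongson_llsc_mb() is the macro this patch introduces; the function itself
is only illustrative and not part of the patch:

	static inline void sketch_atomic_add(int i, atomic_t *v)
	{
		int temp;

		loongson_llsc_mb();	/* SYNC before the ll, as described above */
		do {
			__asm__ __volatile__(
			"	.set	"MIPS_ISA_LEVEL"		\n"
			"	ll	%0, %1	# sketch_atomic_add	\n"
			"	addu	%0, %2				\n"
			"	sc	%0, %1				\n"
			"	.set	mips0				\n"
			: "=&r" (temp), "+" GCC_OFF_SMALL_ASM() (v->counter)
			: "Ir" (i));
		} while (unlikely(!temp));
	}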
Signed-off-by: Huacai Chen <chenhc(a)lemote.com>
Signed-off-by: Huang Pei <huangpei(a)loongson.cn>
[paul.burton(a)mips.com:
- Simplify the addition of -mno-fix-loongson3-llsc to cflags, and add
a comment describing why it's there.
- Make loongson_llsc_mb() a no-op when
CONFIG_CPU_LOONGSON3_WORKAROUNDS=n, rather than a compiler memory
barrier.
- Add a comment describing the bug & how loongson_llsc_mb() helps
in asm/barrier.h.]
Signed-off-by: Paul Burton <paul.burton(a)mips.com>
Cc: Ralf Baechle <ralf(a)linux-mips.org>
Cc: ambrosehua(a)gmail.com
Cc: Steven J . Hill <Steven.Hill(a)cavium.com>
Cc: linux-mips(a)linux-mips.org
Cc: Fuxin Zhang <zhangfx(a)lemote.com>
Cc: Zhangjin Wu <wuzhangjin(a)gmail.com>
Cc: Li Xuefeng <lixuefeng(a)loongson.cn>
Cc: Xu Chenghua <xuchenghua(a)loongson.cn>
Cc: stable(a)vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/mips/Kconfig | 15 +++++++++++++++
arch/mips/include/asm/atomic.h | 6 ++++++
arch/mips/include/asm/barrier.h | 36 ++++++++++++++++++++++++++++++++++++
arch/mips/include/asm/bitops.h | 5 +++++
arch/mips/include/asm/futex.h | 3 +++
arch/mips/include/asm/pgtable.h | 2 ++
arch/mips/include/asm/spinlock.h | 7 +++++++
arch/mips/loongson64/Platform | 23 +++++++++++++++++++++++
arch/mips/mm/tlbex.c | 10 ++++++++++
9 files changed, 107 insertions(+)
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index bb9940c..4766262 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1396,6 +1396,21 @@ config LOONGSON3_ENHANCEMENT
please say 'N' here. If you want a high-performance kernel to run on
new Loongson 3 machines only, please say 'Y' here.
+config CPU_LOONGSON3_WORKAROUNDS
+ bool "Old Loongson 3 LLSC Workarounds"
+ default y if SMP
+ depends on CPU_LOONGSON3
+ help
+ Loongson 3 processors have the llsc issues which require workarounds.
+ Without workarounds the system may hang unexpectedly.
+
+ Newer Loongson 3 will fix these issues and no workarounds are needed.
+ The workarounds have no significant side effect on them but may
+ decrease the performance of the system so this option should be
+ disabled unless the kernel is intended to be run on old systems.
+
+ If unsure, please say Y.
+
config CPU_LOONGSON2E
bool "Loongson 2E"
depends on SYS_HAS_CPU_LOONGSON2E
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index 0ab176b..31ecdda 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -59,6 +59,7 @@ static __inline__ void atomic_##op(int i, atomic_t * v) \
} else if (kernel_uses_llsc) { \
int temp; \
\
+ loongson_llsc_mb(); \
do { \
__asm__ __volatile__( \
" .set "MIPS_ISA_LEVEL" \n" \
@@ -100,6 +101,7 @@ static __inline__ int atomic_##op##_return_relaxed(int i, atomic_t * v) \
} else if (kernel_uses_llsc) { \
int temp; \
\
+ loongson_llsc_mb(); \
do { \
__asm__ __volatile__( \
" .set "MIPS_ISA_LEVEL" \n" \
@@ -148,6 +150,7 @@ static __inline__ int atomic_fetch_##op##_relaxed(int i, atomic_t * v) \
} else if (kernel_uses_llsc) { \
int temp; \
\
+ loongson_llsc_mb(); \
do { \
__asm__ __volatile__( \
" .set "MIPS_ISA_LEVEL" \n" \
@@ -401,6 +404,7 @@ static __inline__ void atomic64_##op(long i, atomic64_t * v) \
} else if (kernel_uses_llsc) { \
long temp; \
\
+ loongson_llsc_mb(); \
do { \
__asm__ __volatile__( \
" .set "MIPS_ISA_LEVEL" \n" \
@@ -442,6 +446,7 @@ static __inline__ long atomic64_##op##_return_relaxed(long i, atomic64_t * v) \
} else if (kernel_uses_llsc) { \
long temp; \
\
+ loongson_llsc_mb(); \
do { \
__asm__ __volatile__( \
" .set "MIPS_ISA_LEVEL" \n" \
@@ -491,6 +496,7 @@ static __inline__ long atomic64_fetch_##op##_relaxed(long i, atomic64_t * v) \
} else if (kernel_uses_llsc) { \
long temp; \
\
+ loongson_llsc_mb(); \
do { \
__asm__ __volatile__( \
" .set "MIPS_ISA_LEVEL" \n" \
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index a5eb1bb..b7f6ac5 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -222,6 +222,42 @@
#define __smp_mb__before_atomic() __smp_mb__before_llsc()
#define __smp_mb__after_atomic() smp_llsc_mb()
+/*
+ * Some Loongson 3 CPUs have a bug wherein execution of a memory access (load,
+ * store or pref) in between an ll & sc can cause the sc instruction to
+ * erroneously succeed, breaking atomicity. Whilst it's unusual to write code
+ * containing such sequences, this bug bites harder than we might otherwise
+ * expect due to reordering & speculation:
+ *
+ * 1) A memory access appearing prior to the ll in program order may actually
+ * be executed after the ll - this is the reordering case.
+ *
+ * In order to avoid this we need to place a memory barrier (ie. a sync
+ * instruction) prior to every ll instruction, in between it & any earlier
+ * memory access instructions. Many of these cases are already covered by
+ * smp_mb__before_llsc() but for the remaining cases, typically ones in
+ * which multiple CPUs may operate on a memory location but ordering is not
+ * usually guaranteed, we use loongson_llsc_mb() below.
+ *
+ * This reordering case is fixed by 3A R2 CPUs, ie. 3A2000 models and later.
+ *
+ * 2) If a conditional branch exists between an ll & sc with a target outside
+ * of the ll-sc loop, for example an exit upon value mismatch in cmpxchg()
+ * or similar, then misprediction of the branch may allow speculative
+ * execution of memory accesses from outside of the ll-sc loop.
+ *
+ * In order to avoid this we need a memory barrier (ie. a sync instruction)
+ * at each affected branch target, for which we also use loongson_llsc_mb()
+ * defined below.
+ *
+ * This case affects all current Loongson 3 CPUs.
+ */
+#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS /* Loongson-3's LLSC workaround */
+#define loongson_llsc_mb() __asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
+#else
+#define loongson_llsc_mb() do { } while (0)
+#endif
+
#include <asm-generic/barrier.h>
#endif /* __ASM_BARRIER_H */
diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
index fa57cef..38a162d 100644
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -68,6 +68,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *addr)
: "ir" (1UL << bit), GCC_OFF_SMALL_ASM() (*m));
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
} else if (kernel_uses_llsc && __builtin_constant_p(bit)) {
+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" " __LL "%0, %1 # set_bit \n"
@@ -78,6 +79,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *addr)
} while (unlikely(!temp));
#endif /* CONFIG_CPU_MIPSR2 || CONFIG_CPU_MIPSR6 */
} else if (kernel_uses_llsc) {
+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" .set "MIPS_ISA_ARCH_LEVEL" \n"
@@ -120,6 +122,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *addr)
: "ir" (~(1UL << bit)));
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
} else if (kernel_uses_llsc && __builtin_constant_p(bit)) {
+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" " __LL "%0, %1 # clear_bit \n"
@@ -130,6 +133,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *addr)
} while (unlikely(!temp));
#endif /* CONFIG_CPU_MIPSR2 || CONFIG_CPU_MIPSR6 */
} else if (kernel_uses_llsc) {
+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" .set "MIPS_ISA_ARCH_LEVEL" \n"
@@ -188,6 +192,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *addr)
unsigned long *m = ((unsigned long *) addr) + (nr >> SZLONG_LOG);
unsigned long temp;
+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" .set "MIPS_ISA_ARCH_LEVEL" \n"
diff --git a/arch/mips/include/asm/futex.h b/arch/mips/include/asm/futex.h
index a9e61ea..0a62a91 100644
--- a/arch/mips/include/asm/futex.h
+++ b/arch/mips/include/asm/futex.h
@@ -50,6 +50,7 @@
"i" (-EFAULT) \
: "memory"); \
} else if (cpu_has_llsc) { \
+ loongson_llsc_mb(); \
__asm__ __volatile__( \
" .set push \n" \
" .set noat \n" \
@@ -162,6 +163,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
"i" (-EFAULT)
: "memory");
} else if (cpu_has_llsc) {
+ loongson_llsc_mb();
__asm__ __volatile__(
"# futex_atomic_cmpxchg_inatomic \n"
" .set push \n"
@@ -190,6 +192,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
: GCC_OFF_SMALL_ASM() (*uaddr), "Jr" (oldval), "Jr" (newval),
"i" (-EFAULT)
: "memory");
+ loongson_llsc_mb();
} else
return -ENOSYS;
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 9e9e944..aab7b38 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -229,6 +229,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval)
: [buddy] "+m" (buddy->pte), [tmp] "=&r" (tmp)
: [global] "r" (page_global));
} else if (kernel_uses_llsc) {
+ loongson_llsc_mb();
__asm__ __volatile__ (
" .set "MIPS_ISA_ARCH_LEVEL" \n"
" .set push \n"
@@ -244,6 +245,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval)
" .set mips0 \n"
: [buddy] "+m" (buddy->pte), [tmp] "=&r" (tmp)
: [global] "r" (page_global));
+ loongson_llsc_mb();
}
#else /* !CONFIG_SMP */
if (pte_none(*buddy))
diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
index a8df44d..30f980b 100644
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -115,6 +115,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
[my_ticket] "=&r" (my_ticket)
: [inc] "r" (inc));
} else {
+ loongson_llsc_mb();
__asm__ __volatile__ (
" .set push # arch_spin_lock \n"
" .set noreorder \n"
@@ -190,6 +191,7 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock)
[now_serving] "=&r" (tmp3)
: [inc] "r" (inc));
} else {
+ loongson_llsc_mb();
__asm__ __volatile__ (
" .set push # arch_spin_trylock \n"
" .set noreorder \n"
@@ -259,6 +261,7 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
: GCC_OFF_SMALL_ASM() (rw->lock)
: "memory");
} else {
+ loongson_llsc_mb();
do {
__asm__ __volatile__(
"1: ll %1, %2 # arch_read_lock \n"
@@ -320,6 +323,7 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
: GCC_OFF_SMALL_ASM() (rw->lock)
: "memory");
} else {
+ loongson_llsc_mb();
do {
__asm__ __volatile__(
"1: ll %1, %2 # arch_write_lock \n"
@@ -345,6 +349,7 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
: "=m" (rw->lock)
: "m" (rw->lock)
: "memory");
+ nudge_writes();
}
static inline int arch_read_trylock(arch_rwlock_t *rw)
@@ -370,6 +375,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
: GCC_OFF_SMALL_ASM() (rw->lock)
: "memory");
} else {
+ loongson_llsc_mb();
__asm__ __volatile__(
" .set noreorder # arch_read_trylock \n"
" li %2, 0 \n"
@@ -414,6 +420,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
: GCC_OFF_SMALL_ASM() (rw->lock)
: "memory");
} else {
+ loongson_llsc_mb();
do {
__asm__ __volatile__(
" ll %1, %3 # arch_write_trylock \n"
diff --git a/arch/mips/loongson64/Platform b/arch/mips/loongson64/Platform
index 0fce460..c1a4d4d 100644
--- a/arch/mips/loongson64/Platform
+++ b/arch/mips/loongson64/Platform
@@ -23,6 +23,29 @@ ifdef CONFIG_CPU_LOONGSON2F_WORKAROUNDS
endif
cflags-$(CONFIG_CPU_LOONGSON3) += -Wa,--trap
+
+#
+# Some versions of binutils, not currently mainline as of 2019/02/04, support
+# an -mfix-loongson3-llsc flag which emits a sync prior to each ll instruction
+# to work around a CPU bug (see loongson_llsc_mb() in asm/barrier.h for a
+# description).
+#
+# We disable this in order to prevent the assembler meddling with the
+# instruction that labels refer to, ie. if we label an ll instruction:
+#
+# 1: ll v0, 0(a0)
+#
+# ...then with the assembler fix applied the label may actually point at a sync
+# instruction inserted by the assembler, and if we were using the label in an
+# exception table the table would no longer contain the address of the ll
+# instruction.
+#
+# Avoid this by explicitly disabling that assembler behaviour. If upstream
+# binutils does not merge support for the flag then we can revisit & remove
+# this later - for now it ensures vendor toolchains don't cause problems.
+#
+cflags-$(CONFIG_CPU_LOONGSON3) += $(call as-option,-Wa$(comma)-mno-fix-loongson3-llsc,)
+
#
# binutils from v2.25 on and gcc starting from v4.9.0 treat -march=loongson3a
# as MIPS64 R2; older versions as just R1. This leaves the possibility open
diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c
index 2da5649..0026c77 100644
--- a/arch/mips/mm/tlbex.c
+++ b/arch/mips/mm/tlbex.c
@@ -931,6 +931,8 @@ build_get_pgd_vmalloc64(u32 **p, struct uasm_label **l, struct uasm_reloc **r,
* to mimic that here by taking a load/istream page
* fault.
*/
+ if (IS_ENABLED(CONFIG_CPU_LOONGSON3_WORKAROUNDS))
+ uasm_i_sync(p, 0);
UASM_i_LA(p, ptr, (unsigned long)tlb_do_page_fault_0);
uasm_i_jr(p, ptr);
@@ -1637,6 +1639,8 @@ static void
iPTE_LW(u32 **p, unsigned int pte, unsigned int ptr)
{
#ifdef CONFIG_SMP
+ if (IS_ENABLED(CONFIG_CPU_LOONGSON3_WORKAROUNDS))
+ uasm_i_sync(p, 0);
# ifdef CONFIG_PHYS_ADDR_T_64BIT
if (cpu_has_64bits)
uasm_i_lld(p, pte, 0, ptr);
@@ -2218,6 +2222,8 @@ static void build_r4000_tlb_load_handler(void)
#endif
uasm_l_nopage_tlbl(&l, p);
+ if (IS_ENABLED(CONFIG_CPU_LOONGSON3_WORKAROUNDS))
+ uasm_i_sync(&p, 0);
build_restore_work_registers(&p);
#ifdef CONFIG_CPU_MICROMIPS
if ((unsigned long)tlb_do_page_fault_0 & 1) {
@@ -2273,6 +2279,8 @@ static void build_r4000_tlb_store_handler(void)
#endif
uasm_l_nopage_tlbs(&l, p);
+ if (IS_ENABLED(CONFIG_CPU_LOONGSON3_WORKAROUNDS))
+ uasm_i_sync(&p, 0);
build_restore_work_registers(&p);
#ifdef CONFIG_CPU_MICROMIPS
if ((unsigned long)tlb_do_page_fault_1 & 1) {
@@ -2329,6 +2337,8 @@ static void build_r4000_tlb_modify_handler(void)
#endif
uasm_l_nopage_tlbm(&l, p);
+ if (IS_ENABLED(CONFIG_CPU_LOONGSON3_WORKAROUNDS))
+ uasm_i_sync(&p, 0);
build_restore_work_registers(&p);
#ifdef CONFIG_CPU_MICROMIPS
if ((unsigned long)tlb_do_page_fault_1 & 1) {
--
2.7.0