From: "Jonathan (Zhixiong) Zhang" zjzhang@codeaurora.org
On a platform with APEI (ACPI Platform Error Interface) enabled, firmware updates a memory region with hardware error records using the nocache attribute. When the OS reads the region, since it maps the region with the cached attribute even though the EFI memory map defines this region as uncached, the OS gets stale data and erroneously reports that there is no new HW error.
When ghes driver maps the memory region, it uses the cache attribute according to EFI memory map, if EFI memory map feature is enabled at runtime.
Since both arch/x86 and arch/ia64 implemented the architecture-agnostic EFI memory map attribute lookup function efi_mem_attributes(), the code is moved from arch/x86 and arch/ia64 into the EFI subsystem. On top of that, efi_remap() is added.
V2: 1. Rebased to v4.1-rc5. 2. Split removal of efi_mem_attributes() and creation of efi_ioremap() into two patches.
Jonathan (Zhixiong) Zhang (3): efi: arch, x86: arch, ia64: move efi_mem_attributes() efi: add efi_remap() acpi, apei: use EFI memmap to map GHES memory
arch/ia64/kernel/efi.c | 11 ----------- arch/x86/platform/efi/efi.c | 18 ------------------ drivers/acpi/apei/ghes.c | 13 +++++++++++++ drivers/firmware/efi/efi.c | 27 +++++++++++++++++++++++++++ include/linux/efi.h | 1 + 5 files changed, 41 insertions(+), 29 deletions(-)
From: "Jonathan (Zhixiong) Zhang" zjzhang@codeaurora.org
Both x86 and ia64 implemented efi_mem_attributes(), which is architecture agnostic. This function is moved from arch/x86 and arch/ia64 to drivers/firmware/efi.
Signed-off-by: Jonathan (Zhixiong) Zhang zjzhang@codeaurora.org --- arch/ia64/kernel/efi.c | 11 ----------- arch/x86/platform/efi/efi.c | 18 ------------------ drivers/firmware/efi/efi.c | 18 ++++++++++++++++++ 3 files changed, 18 insertions(+), 29 deletions(-)
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index c52d7540dc05..ef20ec784b04 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -771,17 +771,6 @@ efi_mem_type (unsigned long phys_addr) }
u64 -efi_mem_attributes (unsigned long phys_addr) -{ - efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); - - if (md) - return md->attribute; - return 0; -} -EXPORT_SYMBOL(efi_mem_attributes); - -u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size) { unsigned long end = phys_addr + size; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 02744df576d5..10bd5289c593 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -926,24 +926,6 @@ u32 efi_mem_type(unsigned long phys_addr) return 0; }
-u64 efi_mem_attributes(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - if (!efi_enabled(EFI_MEMMAP)) - return 0; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->attribute; - } - return 0; -} - static int __init arch_parse_efi_cmdline(char *str) { if (parse_option_str(str, "old_map")) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3061bb8629dc..86da85368778 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -517,3 +517,21 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, attr & EFI_MEMORY_UC ? "UC" : ""); return buf; } + +u64 efi_mem_attributes(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + if (!efi_enabled(EFI_MEMMAP)) + return 0; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && + (phys_addr < (md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT)))) + return md->attribute; + } + return 0; +}
(Cc'ing Tony for ia64 input)
On Mon, 01 Jun, at 12:12:18PM, Jonathan (Zhixiong) Zhang wrote:
From: "Jonathan (Zhixiong) Zhang" zjzhang@codeaurora.org
Both x86 and ia64 implemented efi_mem_attributes(), which is architecture agnostic. This function is moved from arch/x86 and arch/ia64 to drivers/firmware/efi.
Signed-off-by: Jonathan (Zhixiong) Zhang zjzhang@codeaurora.org
arch/ia64/kernel/efi.c | 11 ----------- arch/x86/platform/efi/efi.c | 18 ------------------ drivers/firmware/efi/efi.c | 18 ++++++++++++++++++ 3 files changed, 18 insertions(+), 29 deletions(-)
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index c52d7540dc05..ef20ec784b04 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -771,17 +771,6 @@ efi_mem_type (unsigned long phys_addr) } u64 -efi_mem_attributes (unsigned long phys_addr) -{
- efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
- if (md)
return md->attribute;
- return 0;
-} -EXPORT_SYMBOL(efi_mem_attributes);
-u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size) { unsigned long end = phys_addr + size; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 02744df576d5..10bd5289c593 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -926,24 +926,6 @@ u32 efi_mem_type(unsigned long phys_addr) return 0; } -u64 efi_mem_attributes(unsigned long phys_addr) -{
- efi_memory_desc_t *md;
- void *p;
- if (!efi_enabled(EFI_MEMMAP))
return 0;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
if ((md->phys_addr <= phys_addr) &&
(phys_addr < (md->phys_addr +
(md->num_pages << EFI_PAGE_SHIFT))))
return md->attribute;
- }
- return 0;
-}
static int __init arch_parse_efi_cmdline(char *str) { if (parse_option_str(str, "old_map")) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3061bb8629dc..86da85368778 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -517,3 +517,21 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, attr & EFI_MEMORY_UC ? "UC" : ""); return buf; }
+u64 efi_mem_attributes(unsigned long phys_addr) +{
- efi_memory_desc_t *md;
- void *p;
- if (!efi_enabled(EFI_MEMMAP))
return 0;
Umm... ia64 doesn't appear to set EFI_MEMMAP. So doesn't this change actively break ia64?
While I like the idea of de-duplication, the two implementations of efi_mem_attributes() are not equivalent.
Thank you for the feedback, Matt.
Given that IA64 does not set EFI_MEMMAP, it appears to me there are the following options: A. Keep the status quo and copy x86's efi_mem_attributes() code to arm64. B. In the EFI subsystem, provide a weak default efi_mem_attributes(). In the meantime, IA64 continues to have its own implementation. C. Add EFI_MEMMAP support (and related bits) in IA64.
Which option do you prefer? Once there is a consensus, I am willing to submit patch accordingly for review.
Regards, Jonathan
On 6/2/2015 6:36 AM, Matt Fleming wrote:
(Cc'ing Tony for ia64 input)
On Mon, 01 Jun, at 12:12:18PM, Jonathan (Zhixiong) Zhang wrote:
From: "Jonathan (Zhixiong) Zhang" zjzhang@codeaurora.org
Both x86 and ia64 implemented efi_mem_attributes(), which is architecture agnostic. This function is moved from arch/x86 and arch/ia64 to drivers/firmware/efi.
Signed-off-by: Jonathan (Zhixiong) Zhang zjzhang@codeaurora.org
arch/ia64/kernel/efi.c | 11 ----------- arch/x86/platform/efi/efi.c | 18 ------------------ drivers/firmware/efi/efi.c | 18 ++++++++++++++++++ 3 files changed, 18 insertions(+), 29 deletions(-)
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index c52d7540dc05..ef20ec784b04 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -771,17 +771,6 @@ efi_mem_type (unsigned long phys_addr) }
u64 -efi_mem_attributes (unsigned long phys_addr) -{
- efi_memory_desc_t *md = efi_memory_descriptor(phys_addr);
- if (md)
return md->attribute;
- return 0;
-} -EXPORT_SYMBOL(efi_mem_attributes);
-u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size) { unsigned long end = phys_addr + size; diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 02744df576d5..10bd5289c593 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -926,24 +926,6 @@ u32 efi_mem_type(unsigned long phys_addr) return 0; }
-u64 efi_mem_attributes(unsigned long phys_addr) -{
- efi_memory_desc_t *md;
- void *p;
- if (!efi_enabled(EFI_MEMMAP))
return 0;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
if ((md->phys_addr <= phys_addr) &&
(phys_addr < (md->phys_addr +
(md->num_pages << EFI_PAGE_SHIFT))))
return md->attribute;
- }
- return 0;
-}
- static int __init arch_parse_efi_cmdline(char *str) { if (parse_option_str(str, "old_map"))
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 3061bb8629dc..86da85368778 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -517,3 +517,21 @@ char * __init efi_md_typeattr_format(char *buf, size_t size, attr & EFI_MEMORY_UC ? "UC" : ""); return buf; }
+u64 efi_mem_attributes(unsigned long phys_addr) +{
- efi_memory_desc_t *md;
- void *p;
- if (!efi_enabled(EFI_MEMMAP))
return 0;
Umm... ia64 doesn't appear to set EFI_MEMMAP. So doesn't this change actively break ia64?
While I like the idea of de-duplication, the two implementations of efi_mem_attributes() are not equivalent.
On Tue, 02 Jun, at 05:09:14PM, Zhang, Jonathan Zhixiong wrote:
Thank you for the feedback, Matt.
Given that IA64 does not set EFI_MEMMAP, it appears to me there are the following options: A. Keep the status quo and copy x86's efi_mem_attributes() code to arm64.
Let's avoid this option.
B. In the EFI subsystem, provide a weak default efi_mem_attributes(). In the meantime, IA64 continues to have its own implementation.
While I'm not a huge fan of using __weak this makes the most sense to me because the alternative is to rename either the ia64 or x86 implementation and that just seems silly.
C. Add EFI_MEMMAP support (and related bits) in IA64.
C. isn't an option because the ia64 memory map doesn't work the same way as x86 and arm64.
Which option do you prefer? Once there is a consensus, I am willing to submit patch accordingly for review.
Let's go with B. but please provide a comment above the weak implementation explaining *why* it's declared as weak and that any new architecture probably doesn't want to override it. Explain that the ia64 EFI memory map is special.
Sure, I will go with B, with a clear comment.
Thanks, Jonathan
On 6/5/2015 2:23 AM, Matt Fleming wrote:
On Tue, 02 Jun, at 05:09:14PM, Zhang, Jonathan Zhixiong wrote:
Thank you for the feedback, Matt.
Given that IA64 does not set EFI_MEMMAP, it appears to me there are the following options: A. Keep the status quo and copy x86's efi_mem_attributes() code to arm64.
Let's avoid this option.
B. In the EFI subsystem, provide a weak default efi_mem_attributes(). In the meantime, IA64 continues to have its own implementation.
While I'm not a huge fan of using __weak this makes the most sense to me because the alternative is to rename either the ia64 or x86 implementation and that just seems silly.
C. Add EFI_MEMMAP support (and related bits) in IA64.
C. isn't an option because the ia64 memory map doesn't work the same way as x86 and arm64.
Which option do you prefer? Once there is a consensus, I am willing to submit patch accordingly for review.
Let's go with B. but please provide a comment above the weak implementation explaining *why* it's declared as weak and that any new architecture probably doesn't want to override it. Explain that the ia64 EFI memory map is special.
From: "Jonathan (Zhixiong) Zhang" zjzhang@codeaurora.org
If it is an EFI system, the EFI memmap is enabled, and a memory region has the EFI_MEMORY_UC attribute, map it as uncached.
On x86, EFI memmap is unmapped in efi_free_boot_services(), before kernel finishes booting. So when efi_remap() is called during runtime on x86, it maps the region as cached.
Signed-off-by: Jonathan (Zhixiong) Zhang zjzhang@codeaurora.org --- drivers/firmware/efi/efi.c | 9 +++++++++ include/linux/efi.h | 1 + 2 files changed, 10 insertions(+)
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 86da85368778..5b42bb6d1fde 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -535,3 +535,12 @@ u64 efi_mem_attributes(unsigned long phys_addr) } return 0; } + +void __iomem *efi_remap(phys_addr_t phys_addr, size_t size) +{ + if (efi_enabled(EFI_MEMMAP) && + (efi_mem_attributes(phys_addr) & EFI_MEMORY_UC)) + return ioremap(phys_addr, size); + else + return ioremap_cache(phys_addr, size); +} diff --git a/include/linux/efi.h b/include/linux/efi.h index af5be0368dec..0a0aa25d44d7 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -888,6 +888,7 @@ extern void efi_get_time(struct timespec *now); extern void efi_reserve_boot_services(void); extern int efi_get_fdt_params(struct efi_fdt_params *params, int verbose); extern struct efi_memory_map memmap; +extern void __iomem *efi_remap(phys_addr_t phys_addr, size_t size);
extern int efi_reboot_quirk_mode; extern bool efi_poweroff_required(void);
From: "Jonathan (Zhixiong) Zhang" zjzhang@codeaurora.org
With ACPI APEI firmware first handling, generic hardware error record is updated by firmware in GHES memory region. When firmware updated GHES memory region in DDR without going through cache, Linux reads stale data from cache.
GHES memory region should be mapped with cache attributes according to EFI memory map when applicable. If firmware updates DDR directly, EFI memory map has GHES memory region defined as uncached; If firmware updates cache, EFI memory map has GHES memory region defined as cached.
When EFI is configured, map the IRQ page using efi_remap() provided by the EFI subsystem.
Signed-off-by: Jonathan (Zhixiong) Zhang zjzhang@codeaurora.org --- drivers/acpi/apei/ghes.c | 13 +++++++++++++ 1 file changed, 13 insertions(+)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index e82d0976a5d0..56875ca76aa7 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -48,6 +48,7 @@ #include <linux/pci.h> #include <linux/aer.h> #include <linux/nmi.h> +#include <linux/efi.h>
#include <acpi/ghes.h> #include <acpi/apei.h> @@ -159,6 +160,7 @@ static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) return (void __iomem *)vaddr; }
+#ifndef CONFIG_EFI static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) { unsigned long vaddr; @@ -169,6 +171,7 @@ static void __iomem *ghes_ioremap_pfn_irq(u64 pfn)
return (void __iomem *)vaddr; } +#endif
static void ghes_iounmap_nmi(void __iomem *vaddr_ptr) { @@ -180,6 +183,7 @@ static void ghes_iounmap_nmi(void __iomem *vaddr_ptr) arch_apei_flush_tlb_one(vaddr); }
+#ifndef CONFIG_EFI static void ghes_iounmap_irq(void __iomem *vaddr_ptr) { unsigned long vaddr = (unsigned long __force)vaddr_ptr; @@ -189,6 +193,7 @@ static void ghes_iounmap_irq(void __iomem *vaddr_ptr) unmap_kernel_range_noflush(vaddr, PAGE_SIZE); arch_apei_flush_tlb_one(vaddr); } +#endif
static int ghes_estatus_pool_init(void) { @@ -309,7 +314,11 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT); } else { spin_lock_irqsave(&ghes_ioremap_lock_irq, flags); +#ifdef CONFIG_EFI + vaddr = efi_remap(paddr & PAGE_MASK, PAGE_SIZE); +#else vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT); +#endif } trunk = PAGE_SIZE - offset; trunk = min(trunk, len); @@ -324,7 +333,11 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, ghes_iounmap_nmi(vaddr); raw_spin_unlock(&ghes_ioremap_lock_nmi); } else { +#ifdef CONFIG_EFI + iounmap(vaddr); +#else ghes_iounmap_irq(vaddr); +#endif spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags); } }
[ Cc'ing Boris and Tony. Folks original patch is here, https://lkml.kernel.org/r/1433185940-24770-4-git-send-email-zjzhang@codeauro... ]
On Mon, 01 Jun, at 12:12:20PM, Jonathan (Zhixiong) Zhang wrote:
From: "Jonathan (Zhixiong) Zhang" zjzhang@codeaurora.org
With ACPI APEI firmware first handling, generic hardware error record is updated by firmware in GHES memory region. When firmware updated GHES memory region in DDR without going through cache, Linux reads stale data from cache.
GHES memory region should be mapped with cache attributes according to EFI memory map when applicable. If firmware updates DDR directly, EFI memory map has GHES memory region defined as uncached; If firmware updates cache, EFI memory map has GHES memory region defined as cached.
When EFI is configured, map the IRQ page using efi_remap() provided by the EFI subsystem.
[...]
@@ -159,6 +160,7 @@ static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) return (void __iomem *)vaddr; } +#ifndef CONFIG_EFI static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) { unsigned long vaddr;
Sprinkling CONFIG_EFI like this is wrong. On x86 we run kernels built with CONFIG_EFI on machines with BIOS - you can't make the EFI vs. non-EFI decision at compile-time.
So this patch looks like a potential regression to me since on x86 ghes_ioremap_pfn_irq() would not be used anymore and instead we'd be using efi_remap() which will perform an ioremap_nocache() if it gets called after efi_free_boot_services().
And based on the comments in the apei code, that's going to cause issues because ioremap() does not work in atomic context, not to mention the fact that we've gone from a cached mapping to an uncached one.
Instead, I suggest you modify ghes_ioremap_* to query the EFI memmap (if it's available at runtime) to lookup the correct mapping attributes.
But I've Cc'd some more people who have actually worked on this code, since I'm not one of them.
On Fri, Jun 05, 2015 at 10:57:01AM +0100, Matt Fleming wrote:
[ Cc'ing Boris and Tony. Folks original patch is here, https://lkml.kernel.org/r/1433185940-24770-4-git-send-email-zjzhang@codeauro... ]
On Mon, 01 Jun, at 12:12:20PM, Jonathan (Zhixiong) Zhang wrote:
From: "Jonathan (Zhixiong) Zhang" zjzhang@codeaurora.org
With ACPI APEI firmware first handling, generic hardware error record is updated by firmware in GHES memory region. When firmware updated GHES memory region in DDR without going through cache,
What is DDR?
I think this needs to be clarified first before we go any further.
I picked up on the sidelines that this might be arm64-specific stuff. If so, your approach is wrong: you're merging efi_* facilities from x86 and ia64 into generic efi ones but then doing CONFIG_EFI ifdeffery in GHES.
What you should do instead is have arch-specific:
ghes_ioremap_pfn_irq() ghes_iounmap_irq() ...
and whatever else functionality which is different on your arch and which get called from the generic ghes.c driver.
In the arch-specific ones you can go wild with the ifdeffery and whatnot is needed on that specific arch.
Something like that, at least.
Thank you Borislav for the review. Pls. see comments inline...
On 6/5/2015 3:25 AM, Borislav Petkov wrote:
On Fri, Jun 05, 2015 at 10:57:01AM +0100, Matt Fleming wrote:
[ Cc'ing Boris and Tony. Folks original patch is here, https://lkml.kernel.org/r/1433185940-24770-4-git-send-email-zjzhang@codeauro... ]
On Mon, 01 Jun, at 12:12:20PM, Jonathan (Zhixiong) Zhang wrote:
From: "Jonathan (Zhixiong) Zhang" zjzhang@codeaurora.org
With ACPI APEI firmware first handling, generic hardware error record is updated by firmware in GHES memory region. When firmware updated GHES memory region in DDR without going through cache,
What is DDR?
I think this needs to be clarified first before we go any further.
I thought the word "memory" might be confusing, because there are memories on the system that are not accessible by Linux. In this context, the APEI error data is accessed (read and written) by both Linux and platform firmware; hence both sides should access the memory using the same cache attribute. I wanted to emphasize the idea that even though DDR is normally cacheable, in this case, when the platform accesses it with the un-cached attribute, Linux should do the same. I will try to make it clearer in the next version of the patch.
I picked up on the sidelines that this might be arm64-specific stuff. If so, your approach is wrong: you're merging efi_* facilities from x86 and ia64 into generic efi ones but then doing CONFIG_EFI ifdeffery in GHES.
What you should do instead is have arch-specific:
ghes_ioremap_pfn_irq() ghes_iounmap_irq() ...
and whatever else functionality which is different on your arch and which get called from the generic ghes.c driver.
In the arch-specific ones you can go wild with the ifdeffery and whatnot is needed on that specific arch.
Something like that, at least.
Makes total sense. I was trying to reduce binary size for non-EFI system, but as Matt pointed out in another feedback, on x86 even BIOS based system has CONFIG_EFI enabled. I will submit a new version accordingly.
On Fri, Jun 05, 2015 at 10:05:13AM -0700, Zhang, Jonathan Zhixiong wrote:
What is DDR?
I think this needs to be clarified first before we go any further.
I thought the word "memory" might be confusing, because there are
So you mean normal RAM here?
memories on the system that are not accessible by Linux. In this context, the APEI error data is accessed (read and written) by both Linux and platform firmware; hence both sides should access the memory using the same cache attribute. I wanted to emphasize the idea that even though DDR is normally cacheable, in this case, when the platform accesses it with the un-cached attribute, Linux should do the same.
Makes sense.
Btw, do we need synchronization between firmware and Linux then? Does Linux need to know when it is ok to touch that memory?
On 6/5/2015 10:12 AM, Borislav Petkov wrote:
On Fri, Jun 05, 2015 at 10:05:13AM -0700, Zhang, Jonathan Zhixiong wrote:
What is DDR?
I think this needs to be clarified first before we go any further.
I thought the word "memory" might be confusing, because there are
So you mean normal RAM here?
Yes, exactly. I should use this word RAM instead.
memories on the system that are not accessible by Linux. In this context, the APEI error data is accessed (read and written) by both Linux and platform firmware; hence both sides should access the memory using the same cache attribute. I wanted to emphasize the idea that even though DDR is normally cacheable, in this case, when the platform accesses it with the un-cached attribute, Linux should do the same.
Makes sense.
Btw, do we need synchronization between firmware and Linux then? Does Linux need to know when it is ok to touch that memory?
Good question. Linux zeros out error status code in the error data after the data is consumed, this is good; but it alone does not solve the synchronization concern.
For interrupt notification type (SCI or NMI) error source, this may not be an issue since both sides can operate under the rule that the error data is only overwritten but never appended. But what about poll notification type? In this case, platform gathers error, updates the memory region as needed; Linux checks the same memory region periodically.
An ACPI APEI proposal intended to solve this concern has been discussed in the UEFI forum. The idea is to have the OS send the platform a signal (by updating a designated register) after the error data is consumed. Therefore, while the OS is accessing the memory region, the platform does not try to access the same memory region in the meantime.
After this proposal is approved and published, I will submit a patch to implement it.
Thanks Matt for the review. Yes, you are right on, I am following this:
modify ghes_ioremap_* to query the EFI memmap (if it's available at runtime) to lookup the correct mapping attributes.
Jonathan
On 6/5/2015 2:57 AM, Matt Fleming wrote:
[ Cc'ing Boris and Tony. Folks original patch is here, https://lkml.kernel.org/r/1433185940-24770-4-git-send-email-zjzhang@codeauro... ]
On Mon, 01 Jun, at 12:12:20PM, Jonathan (Zhixiong) Zhang wrote:
From: "Jonathan (Zhixiong) Zhang" zjzhang@codeaurora.org
With ACPI APEI firmware first handling, generic hardware error record is updated by firmware in GHES memory region. When firmware updated GHES memory region in DDR without going through cache, Linux reads stale data from cache.
GHES memory region should be mapped with cache attributes according to EFI memory map when applicable. If firmware updates DDR directly, EFI memory map has GHES memory region defined as uncached; If firmware updates cache, EFI memory map has GHES memory region defined as cached.
When EFI is configured, map the IRQ page using efi_remap() provided by the EFI subsystem.
[...]
@@ -159,6 +160,7 @@ static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) return (void __iomem *)vaddr; }
+#ifndef CONFIG_EFI static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) { unsigned long vaddr;
Sprinkling CONFIG_EFI like this is wrong. On x86 we run kernels built with CONFIG_EFI on machines with BIOS - you can't make the EFI vs. non-EFI decision at compile-time.
So this patch looks like a potential regression to me since on x86 ghes_ioremap_pfn_irq() would not be used anymore and instead we'd be using efi_remap() which will perform an ioremap_nocache() if it gets called after efi_free_boot_services().
And based on the comments in the apei code, that's going to cause issues because ioremap() does not work in atomic context, not to mention the fact that we've gone from a cached mapping to an uncached one.
Instead, I suggest you modify ghes_ioremap_* to query the EFI memmap (if it's available at runtime) to lookup the correct mapping attributes.
But I've Cc'd some more people who have actually worked on this code, since I'm not one of them.
On Fri, Jun 05, 2015 at 09:43:26AM -0700, Zhang, Jonathan Zhixiong wrote:
Thanks Matt for the review. Yes, you are right on, I am following this:
modify ghes_ioremap_* to query the EFI memmap (if it's available at runtime) to lookup the correct mapping attributes.
A: Because it messes up the order in which people normally read text. Q: Why is top-posting such a bad thing? A: Top-posting. Q: What is the most annoying thing in e-mail?
So please do not top-post.
Thanks.
On 6/5/2015 9:50 AM, Borislav Petkov wrote:
On Fri, Jun 05, 2015 at 09:43:26AM -0700, Zhang, Jonathan Zhixiong wrote:
Thanks Matt for the review. Yes, you are right on, I am following this:
modify ghes_ioremap_* to query the EFI memmap (if it's available at runtime) to lookup the correct mapping attributes.
A: Because it messes up the order in which people normally read text. Q: Why is top-posting such a bad thing? A: Top-posting. Q: What is the most annoying thing in e-mail?
So please do not top-post.
Thanks.
Will do. Thanks for the advice, Borislav.