When using Secure TSC, the GUEST_TSC_FREQ MSR reports a frequency based on
the nominal P0 frequency, which deviates slightly (typically ~0.2%) from
the actual mean TSC frequency due to clocking parameters. Over extended VM
uptime, this discrepancy accumulates, causing clock skew between the
hypervisor and the SEV-SNP VM and leading to early timer interrupts as
perceived by the guest. At a 0.2% deviation, that is roughly 170 seconds
of skew per day of uptime.

The guest kernel relies on the reported nominal frequency for TSC-based
timekeeping, while the actual frequency set during SNP_LAUNCH_START may
differ. This mismatch results in inaccurate time calculations, causing the
guest to perceive hrtimers as firing earlier than expected.

Utilize the TSC_FACTOR from the SEV firmware's secrets page (see "Secrets
Page Format" in the SNP Firmware ABI Specification) to calculate the mean
TSC frequency, ensuring accurate timekeeping and mitigating clock skew in
SEV-SNP VMs.
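
As a worked example with hypothetical numbers (a nominal GUEST_TSC_FREQ of
2 GHz and a TSC_FACTOR of 200, i.e. 0.2%), the scaling macro added in the
hunk below reduces the reported frequency like this:

	/* 2000000 - (2000000 * 200) / 100000 = 2000000 - 4000 = 1996000 kHz */
	snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(2000000ULL, 200);
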
Use early_memremap_encrypted() to map the secrets page, as
ioremap_encrypted() uses kmalloc(), which is not available during early
TSC initialization and causes a panic.

Fixes: 73bbf3b0fbba ("x86/tsc: Init the TSC for Secure TSC guests")
Cc: stable(a)vger.kernel.org
Signed-off-by: Nikunj A Dadhania <nikunj(a)amd.com>
---
v2:
* Move the SNP TSC scaling constant to the header (Dionna)
* Drop the unsigned long cast and add it in securetsc_get_tsc_khz() (Tom)
* Drop the RB from Tom as the code has changed
---
arch/x86/include/asm/sev.h | 18 +++++++++++++++++-
arch/x86/coco/sev/core.c | 16 ++++++++++++++--
2 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index fbb616fcbfb8..869355367210 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -223,6 +223,19 @@ struct snp_tsc_info_resp {
u8 rsvd2[100];
} __packed;
+
+/*
+ * Obtain the mean TSC frequency by decreasing the nominal TSC frequency with
+ * TSC_FACTOR as documented in the SNP Firmware ABI specification:
+ *
+ * GUEST_TSC_FREQ * (1 - (TSC_FACTOR * 0.00001))
+ *
+ * which is equivalent to:
+ *
+ * GUEST_TSC_FREQ -= (GUEST_TSC_FREQ * TSC_FACTOR) / 100000;
+ */
+#define SNP_SCALE_TSC_FREQ(freq, factor) ((freq) - ((freq) * (factor)) / 100000)
+
struct snp_guest_req {
void *req_buf;
size_t req_sz;
@@ -283,8 +296,11 @@ struct snp_secrets_page {
u8 svsm_guest_vmpl;
u8 rsvd3[3];
+ /* The percentage decrease from nominal to mean TSC frequency. */
+ u32 tsc_factor;
+
/* Remainder of page */
- u8 rsvd4[3744];
+ u8 rsvd4[3740];
} __packed;
struct snp_msg_desc {
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index 8375ca7fbd8a..36f419ff25d4 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -2156,20 +2156,32 @@ void __init snp_secure_tsc_prepare(void)
static unsigned long securetsc_get_tsc_khz(void)
{
- return snp_tsc_freq_khz;
+ return (unsigned long)snp_tsc_freq_khz;
}
void __init snp_secure_tsc_init(void)
{
+ struct snp_secrets_page *secrets;
unsigned long long tsc_freq_mhz;
+ void *mem;
if (!cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
return;
+ mem = early_memremap_encrypted(sev_secrets_pa, PAGE_SIZE);
+ if (!mem) {
+ pr_err("Unable to get TSC_FACTOR: failed to map the SNP secrets page.\n");
+ sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SECURE_TSC);
+ }
+
+ secrets = (__force struct snp_secrets_page *)mem;
+
setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
rdmsrq(MSR_AMD64_GUEST_TSC_FREQ, tsc_freq_mhz);
- snp_tsc_freq_khz = (unsigned long)(tsc_freq_mhz * 1000);
+ snp_tsc_freq_khz = SNP_SCALE_TSC_FREQ(tsc_freq_mhz * 1000, secrets->tsc_factor);
x86_platform.calibrate_cpu = securetsc_get_tsc_khz;
x86_platform.calibrate_tsc = securetsc_get_tsc_khz;
+
+ early_memunmap(mem, PAGE_SIZE);
}
base-commit: 49151ac6671fe0372261054daf5e4da3567b8271
--
2.43.0
Commit <4f1492efb495> ("iommu/vt-d: Revert ATS timing change to fix boot
failure") placed the enabling of ATS in the probe_finalize callback. This
occurs after the default domain attachment, which is when the ATS cache
tag is assigned. Consequently, the device TLB cache tag is missed when the
domain is attached, leading to the device TLB not being invalidated in the
iommu_unmap paths.
Fix this by assigning the CACHE_TAG_DEVTLB cache tag when ATS is enabled.
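
The tag matters because the unmap-side invalidation walks the domain's
cache-tag list and only issues device TLB (ATS) invalidations for tags of
type CACHE_TAG_DEVTLB. A simplified sketch of that flow (illustrative, not
the exact driver code):

	struct cache_tag *tag;

	list_for_each_entry(tag, &domain->cache_tags, node) {
		switch (tag->type) {
		case CACHE_TAG_IOTLB:
			/* IOMMU TLB invalidation always happens */
			break;
		case CACHE_TAG_DEVTLB:
			/* device TLB invalidation: silently skipped if
			 * this tag was never assigned at attach time
			 */
			break;
		}
	}
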
Fixes: 4f1492efb495 ("iommu/vt-d: Revert ATS timing change to fix boot failure")
Cc: stable(a)vger.kernel.org
Suggested-by: Kevin Tian <kevin.tian(a)intel.com>
Signed-off-by: Lu Baolu <baolu.lu(a)linux.intel.com>
Tested-by: Shuicheng Lin <shuicheng.lin(a)intel.com>
---
drivers/iommu/intel/cache.c | 5 ++---
drivers/iommu/intel/iommu.c | 11 ++++++++++-
drivers/iommu/intel/iommu.h | 2 ++
3 files changed, 14 insertions(+), 4 deletions(-)
Change log:
v2:
- The v1 solution has a flaw: ATS will never be enabled for drivers with
driver_managed_dma enabled, as their devices are not expected to be
automatically attached to the default domain.
v1: https://lore.kernel.org/linux-iommu/20250620060802.3036137-1-baolu.lu@linux…
diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c
index fc35cba59145..47692cbfaabd 100644
--- a/drivers/iommu/intel/cache.c
+++ b/drivers/iommu/intel/cache.c
@@ -40,9 +40,8 @@ static bool cache_tage_match(struct cache_tag *tag, u16 domain_id,
}
/* Assign a cache tag with specified type to domain. */
-static int cache_tag_assign(struct dmar_domain *domain, u16 did,
- struct device *dev, ioasid_t pasid,
- enum cache_tag_type type)
+int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev,
+ ioasid_t pasid, enum cache_tag_type type)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 7aa3932251b2..148b944143b8 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3780,8 +3780,17 @@ static void intel_iommu_probe_finalize(struct device *dev)
!pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
info->pasid_enabled = 1;
- if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev))
+ if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
iommu_enable_pci_ats(info);
+ /* Assign a DEVTLB cache tag to the default domain. */
+ if (info->ats_enabled && info->domain) {
+ u16 did = domain_id_iommu(info->domain, iommu);
+
+ if (cache_tag_assign(info->domain, did, dev,
+ IOMMU_NO_PASID, CACHE_TAG_DEVTLB))
+ iommu_disable_pci_ats(info);
+ }
+ }
iommu_enable_pci_pri(info);
}
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 3ddbcc603de2..2d1afab5eedc 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -1289,6 +1289,8 @@ struct cache_tag {
unsigned int users;
};
+int cache_tag_assign(struct dmar_domain *domain, u16 did, struct device *dev,
+ ioasid_t pasid, enum cache_tag_type type);
int cache_tag_assign_domain(struct dmar_domain *domain,
struct device *dev, ioasid_t pasid);
void cache_tag_unassign_domain(struct dmar_domain *domain,
--
2.43.0
The i2c_dw_xfer_init() function requires msgs and msg_write_idx from the
dev context to be initialized.

amd_i2c_dw_xfer_quirk() initializes msgs and msgs_num, but not
msg_write_idx. A stale msg_write_idx left over from a previous transfer
could allow an out-of-bounds access of the new msgs array.

Initialize msg_write_idx before calling i2c_dw_xfer_init().
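
For context, i2c_dw_xfer_init() selects the message to address using
msg_write_idx, roughly as below (abridged for illustration); a stale index
left over from a previous transfer would read past the new msgs array:

	/* Abridged from i2c_dw_xfer_init() */
	if (dev->msgs[dev->msg_write_idx].flags & I2C_M_TEN)
		ic_con |= DW_IC_CON_10BITADDR_MASTER;

	regmap_write(dev->map, DW_IC_TAR,
		     dev->msgs[dev->msg_write_idx].addr | ic_tar);
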
Fixes: 17631e8ca2d3 ("i2c: designware: Add driver support for AMD NAVI GPU")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Michael J. Ruhl <michael.j.ruhl(a)intel.com>
---
drivers/i2c/busses/i2c-designware-master.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c
index c5394229b77f..40aa5114bf8c 100644
--- a/drivers/i2c/busses/i2c-designware-master.c
+++ b/drivers/i2c/busses/i2c-designware-master.c
@@ -363,6 +363,7 @@ static int amd_i2c_dw_xfer_quirk(struct i2c_adapter *adap, struct i2c_msg *msgs,
dev->msgs = msgs;
dev->msgs_num = num_msgs;
+ dev->msg_write_idx = 0;
i2c_dw_xfer_init(dev);
/* Initiate messages read/write transaction */
--
2.49.0
PL1 cannot be disabled on some platforms: the ENABLE bit remains set after
software clears it. This behavior leads to a scenario where, upon a user
request to disable the Power Limit through the powercap sysfs, the ENABLE
bit remains set while the CLAMPING bit is inadvertently cleared.

According to the Intel Software Developer's Manual, the CLAMPING bit,
"When set, allows the processor to go below the OS requested P states in
order to maintain the power below specified Platform Power Limit value."
Clearing it while the ENABLE bit stays set therefore lets the system
operate at higher power levels than intended on such platforms.

Enhance the code to read the ENABLE bit back after writing it, and stop
further processing (leaving the CLAMPING bit untouched) if the ENABLE bit
cannot be changed.
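
For orientation, the two bits involved sit next to each other in the PL1
control field of the package power-limit MSR. A sketch of the layout the
SDM describes (illustrative names, not the driver's own primitives):

	#include <linux/bits.h>

	/* PL1 control fields in MSR_PKG_POWER_LIMIT (0x610), per the SDM */
	#define PL1_POWER_LIMIT_MASK	GENMASK_ULL(14, 0)	/* limit value */
	#define PL1_ENABLE		BIT_ULL(15)		/* limit enable */
	#define PL1_CLAMPING		BIT_ULL(16)	/* allow below-P-state throttling */
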
Cc: stable(a)vger.kernel.org
Reported-by: Srinivas Pandruvada <srinivas.pandruvada(a)linux.intel.com>
Fixes: 2d281d8196e3 ("PowerCap: Introduce Intel RAPL power capping driver")
Signed-off-by: Zhang Rui <rui.zhang(a)intel.com>
---
Changes since V1:
- Add Fixes tag
- CC stable kernel
---
drivers/powercap/intel_rapl_common.c | 17 ++++++++++++++++-
1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index e3be40adc0d7..602f540cbe15 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -341,12 +341,27 @@ static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
{
struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone);
struct rapl_defaults *defaults = get_defaults(rd->rp);
+ u64 val;
int ret;
cpus_read_lock();
ret = rapl_write_pl_data(rd, POWER_LIMIT1, PL_ENABLE, mode);
- if (!ret && defaults->set_floor_freq)
+ if (ret)
+ goto end;
+
+ ret = rapl_read_pl_data(rd, POWER_LIMIT1, PL_ENABLE, false, &val);
+ if (ret)
+ goto end;
+
+ if (mode != val) {
+ pr_debug("%s cannot be %s\n", power_zone->name, mode ? "enabled" : "disabled");
+ goto end;
+ }
+
+ if (defaults->set_floor_freq)
defaults->set_floor_freq(rd, mode);
+
+end:
cpus_read_unlock();
return ret;
--
2.43.0
From: Maíra Canal <mcanal(a)igalia.com>
[ Upstream commit a0e6a017ab56936c0405fe914a793b241ed25ee0 ]
Currently, it is possible for the composer to be set as enabled and then
as disabled without an intervening call to vkms_vblank_simulate(). This is
problematic, because the driver would skip one CRC output, causing CRC
tests to fail. Therefore, we need to make sure that, each time the
composer is set as enabled, a composer job is added to the queue.

In order to provide this guarantee, add a mutex that is locked before the
composer is set as enabled and unlocked only after the composer job has
been added to the queue. This ensures that the driver won't skip a CRC
entry.
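
Condensed from the diff below, the handoff works like this: the commit
path takes the new mutex and, when the composer is enabled, leaves it
held, so the vblank handler releases it only after the composer state has
been sampled:

	/* vkms_set_composer() - atomic commit path */
	mutex_lock(&out->enabled_lock);
	out->composer_enabled = enabled;
	if (!out->composer_enabled)
		mutex_unlock(&out->enabled_lock); /* disabling: nothing to queue */

	/* vkms_vblank_simulate() - vblank hrtimer path */
	composer_enabled = output->composer_enabled;
	mutex_unlock(&output->enabled_lock); /* state sampled, job will be queued */
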
This race condition affects the IGT test "writeback-check-output", making
the test fail and also leaking writeback framebuffers, as the writeback
job is queued but never signaled. This patch avoids both problems.

[v2]:
* Create a new mutex and keep the spinlock across the atomic commit in
order to avoid interrupts that could result in deadlocks.
[ Backport to 5.15: context cleanly applied with no semantic changes.
Build-tested. ]
Signed-off-by: Maíra Canal <mcanal(a)igalia.com>
Reviewed-by: Arthur Grillo <arthurgrillo(a)riseup.net>
Signed-off-by: Maíra Canal <mairacanal(a)riseup.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20230523123207.173976-1-mcana…
Signed-off-by: Pranav Tyagi <pranav.tyagi03(a)gmail.com>
---
drivers/gpu/drm/vkms/vkms_composer.c | 9 +++++++--
drivers/gpu/drm/vkms/vkms_crtc.c | 9 +++++----
drivers/gpu/drm/vkms/vkms_drv.h | 4 +++-
3 files changed, 15 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/vkms/vkms_composer.c b/drivers/gpu/drm/vkms/vkms_composer.c
index 9e8204be9a14..77fced36af55 100644
--- a/drivers/gpu/drm/vkms/vkms_composer.c
+++ b/drivers/gpu/drm/vkms/vkms_composer.c
@@ -332,10 +332,15 @@ void vkms_set_composer(struct vkms_output *out, bool enabled)
if (enabled)
drm_crtc_vblank_get(&out->crtc);
- spin_lock_irq(&out->lock);
+ mutex_lock(&out->enabled_lock);
old_enabled = out->composer_enabled;
out->composer_enabled = enabled;
- spin_unlock_irq(&out->lock);
+
+ /* the composition wasn't enabled, so unlock the lock to make sure the lock
+ * will be balanced even if we have a failed commit
+ */
+ if (!out->composer_enabled)
+ mutex_unlock(&out->enabled_lock);
if (old_enabled)
drm_crtc_vblank_put(&out->crtc);
diff --git a/drivers/gpu/drm/vkms/vkms_crtc.c b/drivers/gpu/drm/vkms/vkms_crtc.c
index 57bbd32e9beb..1b02dee8587a 100644
--- a/drivers/gpu/drm/vkms/vkms_crtc.c
+++ b/drivers/gpu/drm/vkms/vkms_crtc.c
@@ -16,7 +16,7 @@ static enum hrtimer_restart vkms_vblank_simulate(struct hrtimer *timer)
struct drm_crtc *crtc = &output->crtc;
struct vkms_crtc_state *state;
u64 ret_overrun;
- bool ret, fence_cookie;
+ bool ret, fence_cookie, composer_enabled;
fence_cookie = dma_fence_begin_signalling();
@@ -25,15 +25,15 @@ static enum hrtimer_restart vkms_vblank_simulate(struct hrtimer *timer)
if (ret_overrun != 1)
pr_warn("%s: vblank timer overrun\n", __func__);
- spin_lock(&output->lock);
ret = drm_crtc_handle_vblank(crtc);
if (!ret)
DRM_ERROR("vkms failure on handling vblank");
state = output->composer_state;
- spin_unlock(&output->lock);
+ composer_enabled = output->composer_enabled;
+ mutex_unlock(&output->enabled_lock);
- if (state && output->composer_enabled) {
+ if (state && composer_enabled) {
u64 frame = drm_crtc_accurate_vblank_count(crtc);
/* update frame_start only if a queued vkms_composer_worker()
@@ -293,6 +293,7 @@ int vkms_crtc_init(struct drm_device *dev, struct drm_crtc *crtc,
spin_lock_init(&vkms_out->lock);
spin_lock_init(&vkms_out->composer_lock);
+ mutex_init(&vkms_out->enabled_lock);
vkms_out->composer_workq = alloc_ordered_workqueue("vkms_composer", 0);
if (!vkms_out->composer_workq)
diff --git a/drivers/gpu/drm/vkms/vkms_drv.h b/drivers/gpu/drm/vkms/vkms_drv.h
index d48c23d40ce5..666997e2bcab 100644
--- a/drivers/gpu/drm/vkms/vkms_drv.h
+++ b/drivers/gpu/drm/vkms/vkms_drv.h
@@ -83,8 +83,10 @@ struct vkms_output {
struct workqueue_struct *composer_workq;
/* protects concurrent access to composer */
spinlock_t lock;
+ /* guarantees that if the composer is enabled, a job will be queued */
+ struct mutex enabled_lock;
- /* protected by @lock */
+ /* protected by @enabled_lock */
bool composer_enabled;
struct vkms_crtc_state *composer_state;
--
2.49.0