Merge tag 'acpi-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
Pull ACPI updates from Rafael Wysocki:
"These include a couple of fixes, a new ACPI backlight quirk for Apple
MacbookPro11,2 and Air7,2 and a bunch of cleanups:
- Fix _CPC register setting issue for registers located in memory in
the ACPI CPPC library code (Lifeng Zheng)
- Use DEFINE_SIMPLE_DEV_PM_OPS in the ACPI battery driver, make it
use devm_ for initializing mutexes and allocating driver data, and
make it check the register_pm_notifier() return value (Thomas
Weißschuh, Andy Shevchenko)
- Make the ACPI EC driver support compile-time conditional and allow
ACPI to be built without CONFIG_HAS_IOPORT (Arnd Bergmann)
- Remove a redundant error check from the pfr_telemetry driver (Colin
Ian King)
- Rearrange the processor_perflib code in the ACPI processor driver
to avoid compiling x86-specific code on other architectures (Arnd
Bergmann)
- Add adev NULL check to acpi_quirk_skip_serdev_enumeration() and
make UART skip quirks work on PCI UARTs without an UID (Hans de
Goede)
- Force native backlight handling Apple MacbookPro11,2 and Air7,2 in
the ACPI video driver (Jonathan Denose)
- Switch several ACPI platform drivers back to using struct
platform_driver::remove() (Uwe Kleine-König)
- Replace strcpy() with strscpy() in multiple places in the ACPI
subsystem (Muhammad Qasim Abdul Majeed, Abdul Rahim)"
* tag 'acpi-6.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm: (24 commits)
ACPI: video: force native for Apple MacbookPro11,2 and Air7,2
ACPI: CPPC: Fix _CPC register setting issue
ACPI: Switch back to struct platform_driver::remove()
ACPI: x86: Add adev NULL check to acpi_quirk_skip_serdev_enumeration()
ACPI: x86: Make UART skip quirks work on PCI UARTs without an UID
ACPI: allow building without CONFIG_HAS_IOPORT
ACPI: processor_perflib: extend X86 dependency
ACPI: scan: Use strscpy() instead of strcpy()
ACPI: SBSHC: Use strscpy() instead of strcpy()
ACPI: SBS: Use strscpy() instead of strcpy()
ACPI: power: Use strscpy() instead of strcpy()
ACPI: pci_root: Use strscpy() instead of strcpy()
ACPI: pci_link: Use strscpy() instead of strcpy()
ACPI: event: Use strscpy() instead of strcpy()
ACPI: EC: Use strscpy() instead of strcpy()
ACPI: APD: Use strscpy() instead of strcpy()
ACPI: thermal: Use strscpy() instead of strcpy()
ACPI: battery: Check for error code from devm_mutex_init() call
ACPI: EC: make EC support compile-time conditional
ACPI: pfr_telemetry: remove redundant error check on ret
...
diff --git a/.get_maintainer.ignore b/.get_maintainer.ignore
index 7d1b30a..b458815 100644
--- a/.get_maintainer.ignore
+++ b/.get_maintainer.ignore
@@ -3,3 +3,4 @@
Christoph Hellwig <hch@lst.de>
Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Marc Gonzalez <marc.w.gonzalez@free.fr>
+Ralf Baechle <ralf@linux-mips.org>
diff --git a/CREDITS b/CREDITS
index 96660c6..4e68055 100644
--- a/CREDITS
+++ b/CREDITS
@@ -185,6 +185,11 @@
D: Linux/MIPS port
D: Linux/68k hacker
D: AX25 maintainer
+D: EDAC-CAVIUM OCTEON maintainer
+D: IOC3 ETHERNET DRIVER maintainer
+D: NETROM NETWORK LAYER maintainer
+D: ROSE NETWORK LAYER maintainer
+D: TURBOCHANNEL SUBSYSTEM maintainer
S: Hauptstrasse 19
S: 79837 St. Blasien
S: Germany
diff --git a/Documentation/ABI/obsolete/sysfs-selinux-user b/Documentation/ABI/obsolete/sysfs-selinux-user
new file mode 100644
index 0000000..8ab7557
--- /dev/null
+++ b/Documentation/ABI/obsolete/sysfs-selinux-user
@@ -0,0 +1,12 @@
+What: /sys/fs/selinux/user
+Date: April 2005 (predates git)
+KernelVersion: 2.6.12-rc2 (predates git)
+Contact: selinux@vger.kernel.org
+Description:
+
+ The selinuxfs "user" node allows userspace to request a list
+ of security contexts that can be reached for a given SELinux
+ user from a given starting context. This was used by libselinux
+ when various login-style programs requested contexts for
+ users, but libselinux stopped using it in 2020.
+ Kernel support will be removed no sooner than Dec 2025.
diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index cea8856f..0cceb2b 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -424,6 +424,13 @@
[RW] This file is used to control (on/off) the iostats
accounting of the disk.
+What: /sys/block/<disk>/queue/iostats_passthrough
+Date: October 2024
+Contact: linux-block@vger.kernel.org
+Description:
+ [RW] This file is used to control (on/off) the iostats
+ accounting of the disk for passthrough commands.
+
What: /sys/block/<disk>/queue/logical_block_size
Date: May 2009
@@ -594,6 +601,9 @@
[RW] Maximum number of kilobytes to read-ahead for filesystems
on this block device.
+ For MADV_HUGEPAGE, the readahead size may exceed this setting
+ since its granularity is based on the hugepage size.
+
What: /sys/block/<disk>/queue/rotational
Date: January 2009
diff --git a/Documentation/ABI/testing/debugfs-hisi-hpre b/Documentation/ABI/testing/debugfs-hisi-hpre
index d4e16ef..29fb7d5 100644
--- a/Documentation/ABI/testing/debugfs-hisi-hpre
+++ b/Documentation/ABI/testing/debugfs-hisi-hpre
@@ -184,3 +184,10 @@
Contact: linux-crypto@vger.kernel.org
Description: Dump the total number of time out requests.
Available for both PF and VF, and take no other effect on HPRE.
+
+What: /sys/kernel/debug/hisi_hpre/<bdf>/cap_regs
+Date: Oct 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the values of the qm and hpre capability bit registers and
+ support the query of device specifications to facilitate fault locating.
+ Available for both PF and VF, and take no other effect on HPRE.
diff --git a/Documentation/ABI/testing/debugfs-hisi-sec b/Documentation/ABI/testing/debugfs-hisi-sec
index 6c6c9a6..82bf4a0 100644
--- a/Documentation/ABI/testing/debugfs-hisi-sec
+++ b/Documentation/ABI/testing/debugfs-hisi-sec
@@ -157,3 +157,10 @@
Description: Dump the total number of completed but marked error requests
to be received.
Available for both PF and VF, and take no other effect on SEC.
+
+What: /sys/kernel/debug/hisi_sec2/<bdf>/cap_regs
+Date: Oct 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the values of the qm and sec capability bit registers and
+ support the query of device specifications to facilitate fault locating.
+ Available for both PF and VF, and take no other effect on SEC.
diff --git a/Documentation/ABI/testing/debugfs-hisi-zip b/Documentation/ABI/testing/debugfs-hisi-zip
index a22dd69..0abd65d 100644
--- a/Documentation/ABI/testing/debugfs-hisi-zip
+++ b/Documentation/ABI/testing/debugfs-hisi-zip
@@ -158,3 +158,10 @@
Description: Dump the total number of BD type error requests
to be received.
Available for both PF and VF, and take no other effect on ZIP.
+
+What: /sys/kernel/debug/hisi_zip/<bdf>/cap_regs
+Date: Oct 2024
+Contact: linux-crypto@vger.kernel.org
+Description: Dump the values of the qm and zip capability bit registers and
+ support the query of device specifications to facilitate fault locating.
+ Available for both PF and VF, and take no other effect on ZIP.
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 69af217..2cb58da 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1599,6 +1599,15 @@
pglazyfreed (npn)
Amount of reclaimed lazyfree pages
+ swpin_zero
+ Number of pages swapped into memory and filled with zero, where I/O
+ was optimized out because the page content was detected to be zero
+ during swapout.
+
+ swpout_zero
+ Number of zero-filled pages swapped out with I/O skipped due to the
+ content being detected as zero.
+
zswpin
Number of pages moved in to memory from zswap.
@@ -2945,7 +2954,7 @@
a queue (device) has been associated with the bio and
before submission.
- wbc_account_cgroup_owner(@wbc, @page, @bytes)
+ wbc_account_cgroup_owner(@wbc, @folio, @bytes)
Should be called for each data segment being written out.
While this function doesn't care exactly when it's called
during the writeback session, it's the easiest and most
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1666576..f287548 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -446,6 +446,9 @@
arm64.nobti [ARM64] Unconditionally disable Branch Target
Identification support
+ arm64.nogcs [ARM64] Unconditionally disable Guarded Control Stack
+ support
+
arm64.nomops [ARM64] Unconditionally disable Memory Copy and Memory
Set instructions support
@@ -6727,6 +6730,15 @@
torture.verbose_sleep_duration= [KNL]
Duration of each verbose-printk() sleep in jiffies.
+ tpm.disable_pcr_integrity= [HW,TPM]
+ Do not protect PCR registers from unintended physical
+ access, or interposers in the bus by the means of
+ having an integrity protected session wrapped around
+ TPM2_PCR_Extend command. Consider this in a situation
+ where TPM is heavily utilized by IMA, thus protection
+ causing a major performance hit, and the space where
+ machines are deployed is by other means guarded.
+
tpm_suspend_pcr=[HW,TPM]
Format: integer pcr id
Specify that at suspend time, the tpm driver
diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
index 8502bc1..a58bd3f 100644
--- a/Documentation/admin-guide/perf/index.rst
+++ b/Documentation/admin-guide/perf/index.rst
@@ -26,3 +26,4 @@
meson-ddr-pmu
cxl
ampere_cspmu
+ mrvl-pem-pmu
diff --git a/Documentation/admin-guide/perf/mrvl-pem-pmu.rst b/Documentation/admin-guide/perf/mrvl-pem-pmu.rst
new file mode 100644
index 0000000..c390071
--- /dev/null
+++ b/Documentation/admin-guide/perf/mrvl-pem-pmu.rst
@@ -0,0 +1,56 @@
+=================================================================
+Marvell Odyssey PEM Performance Monitoring Unit (PMU UNCORE)
+=================================================================
+
+The PCI Express Interface Units(PEM) are associated with a corresponding
+monitoring unit. This includes performance counters to track various
+characteristics of the data that is transmitted over the PCIe link.
+
+The counters track inbound and outbound transactions which
+includes separate counters for posted/non-posted/completion TLPs.
+Also, inbound and outbound memory read requests along with their
+latencies can also be monitored. Address Translation Services(ATS)events
+such as ATS Translation, ATS Page Request, ATS Invalidation along with
+their corresponding latencies are also tracked.
+
+There are separate 64 bit counters to measure posted/non-posted/completion
+tlps in inbound and outbound transactions. ATS events are measured by
+different counters.
+
+The PMU driver exposes the available events and format options under sysfs,
+/sys/bus/event_source/devices/mrvl_pcie_rc_pmu_<>/events/
+/sys/bus/event_source/devices/mrvl_pcie_rc_pmu_<>/format/
+
+Examples::
+
+ # perf list | grep mrvl_pcie_rc_pmu
+ mrvl_pcie_rc_pmu_<>/ats_inv/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ats_inv_latency/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ats_pri/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ats_pri_latency/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ats_trans/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ats_trans_latency/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ib_inflight/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ib_reads/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ib_req_no_ro_ebus/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ib_req_no_ro_ncb/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ib_tlp_cpl_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_cpl_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_npr/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ib_tlp_dwords_pr/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ib_tlp_npr/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ib_tlp_pr/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_inflight_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_merges_cpl_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_merges_npr_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_merges_pr_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_reads_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_tlp_cpl_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_cpl_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_npr_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_tlp_dwords_pr_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_tlp_npr_partid/ [Kernel PMU event]
+ mrvl_pcie_rc_pmu_<>/ob_tlp_pr_partid/ [Kernel PMU event]
+
+
+ # perf stat -e ib_inflight,ib_reads,ib_req_no_ro_ebus,ib_req_no_ro_ncb <workload>
diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst
index 47499a1..30c6147 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -38,6 +38,11 @@
``aio-max-nr`` does not result in the
pre-allocation or re-sizing of any kernel data structures.
+dentry-negative
+----------------------------
+
+Policy for negative dentries. Set to 1 to to always delete the dentry when a
+file is removed, and 0 to disable it. By default, this behavior is disabled.
dentry-state
------------
diff --git a/Documentation/arch/arm64/arm-cca.rst b/Documentation/arch/arm64/arm-cca.rst
new file mode 100644
index 0000000..c48b7d4
--- /dev/null
+++ b/Documentation/arch/arm64/arm-cca.rst
@@ -0,0 +1,69 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================
+Arm Confidential Compute Architecture
+=====================================
+
+Arm systems that support the Realm Management Extension (RME) contain
+hardware to allow a VM guest to be run in a way which protects the code
+and data of the guest from the hypervisor. It extends the older "two
+world" model (Normal and Secure World) into four worlds: Normal, Secure,
+Root and Realm. Linux can then also be run as a guest to a monitor
+running in the Realm world.
+
+The monitor running in the Realm world is known as the Realm Management
+Monitor (RMM) and implements the Realm Management Monitor
+specification[1]. The monitor acts a bit like a hypervisor (e.g. it runs
+in EL2 and manages the stage 2 page tables etc of the guests running in
+Realm world), however much of the control is handled by a hypervisor
+running in the Normal World. The Normal World hypervisor uses the Realm
+Management Interface (RMI) defined by the RMM specification to request
+the RMM to perform operations (e.g. mapping memory or executing a vCPU).
+
+The RMM defines an environment for guests where the address space (IPA)
+is split into two. The lower half is protected - any memory that is
+mapped in this half cannot be seen by the Normal World and the RMM
+restricts what operations the Normal World can perform on this memory
+(e.g. the Normal World cannot replace pages in this region without the
+guest's cooperation). The upper half is shared, the Normal World is free
+to make changes to the pages in this region, and is able to emulate MMIO
+devices in this region too.
+
+A guest running in a Realm may also communicate with the RMM using the
+Realm Services Interface (RSI) to request changes in its environment or
+to perform attestation about its environment. In particular it may
+request that areas of the protected address space are transitioned
+between 'RAM' and 'EMPTY' (in either direction). This allows a Realm
+guest to give up memory to be returned to the Normal World, or to
+request new memory from the Normal World. Without an explicit request
+from the Realm guest the RMM will otherwise prevent the Normal World
+from making these changes.
+
+Linux as a Realm Guest
+----------------------
+
+To run Linux as a guest within a Realm, the following must be provided
+either by the VMM or by a `boot loader` run in the Realm before Linux:
+
+ * All protected RAM described to Linux (by DT or ACPI) must be marked
+ RIPAS RAM before handing control over to Linux.
+
+ * MMIO devices must be either unprotected (e.g. emulated by the Normal
+ World) or marked RIPAS DEV.
+
+ * MMIO devices emulated by the Normal World and used very early in boot
+ (specifically earlycon) must be specified in the upper half of IPA.
+ For earlycon this can be done by specifying the address on the
+ command line, e.g. with an IPA size of 33 bits and the base address
+ of the emulated UART at 0x1000000: ``earlycon=uart,mmio,0x101000000``
+
+ * Linux will use bounce buffers for communicating with unprotected
+ devices. It will transition some protected memory to RIPAS EMPTY and
+ expect to be able to access unprotected pages at the same IPA address
+ but with the highest valid IPA bit set. The expectation is that the
+ VMM will remove the physical pages from the protected mapping and
+ provide those pages as unprotected pages.
+
+References
+----------
+[1] https://developer.arm.com/documentation/den0137/
diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst
index b57776a..3278fb4 100644
--- a/Documentation/arch/arm64/booting.rst
+++ b/Documentation/arch/arm64/booting.rst
@@ -41,6 +41,9 @@
the RAM in the machine, or any other method the boot loader designer
sees fit.)
+For Arm Confidential Compute Realms this includes ensuring that all
+protected RAM has a Realm IPA state (RIPAS) of "RAM".
+
2. Setup the device tree
-------------------------
@@ -385,6 +388,9 @@
- HCRX_EL2.MSCEn (bit 11) must be initialised to 0b1.
+ - HCRX_EL2.MCE2 (bit 10) must be initialised to 0b1 and the hypervisor
+ must handle MOPS exceptions as described in :ref:`arm64_mops_hyp`.
+
For CPUs with the Extended Translation Control Register feature (FEAT_TCR2):
- If EL3 is present:
@@ -411,6 +417,38 @@
- HFGRWR_EL2.nPIRE0_EL1 (bit 57) must be initialised to 0b1.
+ - For CPUs with Guarded Control Stacks (FEAT_GCS):
+
+ - GCSCR_EL1 must be initialised to 0.
+
+ - GCSCRE0_EL1 must be initialised to 0.
+
+ - If EL3 is present:
+
+ - SCR_EL3.GCSEn (bit 39) must be initialised to 0b1.
+
+ - If EL2 is present:
+
+ - GCSCR_EL2 must be initialised to 0.
+
+ - If the kernel is entered at EL1 and EL2 is present:
+
+ - HCRX_EL2.GCSEn must be initialised to 0b1.
+
+ - HFGITR_EL2.nGCSEPP (bit 59) must be initialised to 0b1.
+
+ - HFGITR_EL2.nGCSSTR_EL1 (bit 58) must be initialised to 0b1.
+
+ - HFGITR_EL2.nGCSPUSHM_EL1 (bit 57) must be initialised to 0b1.
+
+ - HFGRTR_EL2.nGCS_EL1 (bit 53) must be initialised to 0b1.
+
+ - HFGRTR_EL2.nGCS_EL0 (bit 52) must be initialised to 0b1.
+
+ - HFGWTR_EL2.nGCS_EL1 (bit 53) must be initialised to 0b1.
+
+ - HFGWTR_EL2.nGCS_EL0 (bit 52) must be initialised to 0b1.
+
The requirements described above for CPU mode, caches, MMUs, architected
timers, coherency and system registers apply to all CPUs. All CPUs must
enter the kernel in the same exception level. Where the values documented
diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst
index 694f67f..2ff922a 100644
--- a/Documentation/arch/arm64/elf_hwcaps.rst
+++ b/Documentation/arch/arm64/elf_hwcaps.rst
@@ -16,9 +16,9 @@
kernel exposes the presence of these features to userspace through a set
of flags called hwcaps, exposed in the auxiliary vector.
-Userspace software can test for features by acquiring the AT_HWCAP or
-AT_HWCAP2 entry of the auxiliary vector, and testing whether the relevant
-flags are set, e.g.::
+Userspace software can test for features by acquiring the AT_HWCAP,
+AT_HWCAP2 or AT_HWCAP3 entry of the auxiliary vector, and testing
+whether the relevant flags are set, e.g.::
bool floating_point_is_present(void)
{
@@ -170,6 +170,10 @@
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
Documentation/arch/arm64/pointer-authentication.rst.
+HWCAP_GCS
+ Functionality implied by ID_AA64PFR1_EL1.GCS == 0b1, as
+ described by Documentation/arch/arm64/gcs.rst.
+
HWCAP2_DCPODP
Functionality implied by ID_AA64ISAR1_EL1.DPB == 0b0010.
diff --git a/Documentation/arch/arm64/gcs.rst b/Documentation/arch/arm64/gcs.rst
new file mode 100644
index 0000000..1f65a31
--- /dev/null
+++ b/Documentation/arch/arm64/gcs.rst
@@ -0,0 +1,227 @@
+===============================================
+Guarded Control Stack support for AArch64 Linux
+===============================================
+
+This document outlines briefly the interface provided to userspace by Linux in
+order to support use of the ARM Guarded Control Stack (GCS) feature.
+
+This is an outline of the most important features and issues only and not
+intended to be exhaustive.
+
+
+
+1. General
+-----------
+
+* GCS is an architecture feature intended to provide greater protection
+ against return oriented programming (ROP) attacks and to simplify the
+ implementation of features that need to collect stack traces such as
+ profiling.
+
+* When GCS is enabled a separate guarded control stack is maintained by the
+ PE which is writeable only through specific GCS operations. This
+ stores the call stack only, when a procedure call instruction is
+ performed the current PC is pushed onto the GCS and on RET the
+ address in the LR is verified against that on the top of the GCS.
+
+* When active the current GCS pointer is stored in the system register
+ GCSPR_EL0. This is readable by userspace but can only be updated
+ via specific GCS instructions.
+
+* The architecture provides instructions for switching between guarded
+ control stacks with checks to ensure that the new stack is a valid
+ target for switching.
+
+* The functionality of GCS is similar to that provided by the x86 Shadow
+ Stack feature, due to sharing of userspace interfaces the ABI refers to
+ shadow stacks rather than GCS.
+
+* Support for GCS is reported to userspace via HWCAP_GCS in the aux vector
+ AT_HWCAP2 entry.
+
+* GCS is enabled per thread. While there is support for disabling GCS
+ at runtime this should be done with great care.
+
+* GCS memory access faults are reported as normal memory access faults.
+
+* GCS specific errors (those reported with EC 0x2d) will be reported as
+ SIGSEGV with a si_code of SEGV_CPERR (control protection error).
+
+* GCS is supported only for AArch64.
+
+* On systems where GCS is supported GCSPR_EL0 is always readable by EL0
+ regardless of the GCS configuration for the thread.
+
+* The architecture supports enabling GCS without verifying that return values
+ in LR match those in the GCS, the LR will be ignored. This is not supported
+ by Linux.
+
+
+
+2. Enabling and disabling Guarded Control Stacks
+-------------------------------------------------
+
+* GCS is enabled and disabled for a thread via the PR_SET_SHADOW_STACK_STATUS
+ prctl(), this takes a single flags argument specifying which GCS features
+ should be used.
+
+* When set PR_SHADOW_STACK_ENABLE flag allocates a Guarded Control Stack
+ and enables GCS for the thread, enabling the functionality controlled by
+ GCSCRE0_EL1.{nTR, RVCHKEN, PCRSEL}.
+
+* When set the PR_SHADOW_STACK_PUSH flag enables the functionality controlled
+ by GCSCRE0_EL1.PUSHMEn, allowing explicit GCS pushes.
+
+* When set the PR_SHADOW_STACK_WRITE flag enables the functionality controlled
+ by GCSCRE0_EL1.STREn, allowing explicit stores to the Guarded Control Stack.
+
+* Any unknown flags will cause PR_SET_SHADOW_STACK_STATUS to return -EINVAL.
+
+* PR_LOCK_SHADOW_STACK_STATUS is passed a bitmask of features with the same
+ values as used for PR_SET_SHADOW_STACK_STATUS. Any future changes to the
+ status of the specified GCS mode bits will be rejected.
+
+* PR_LOCK_SHADOW_STACK_STATUS allows any bit to be locked, this allows
+ userspace to prevent changes to any future features.
+
+* There is no support for a process to remove a lock that has been set for
+ it.
+
+* PR_SET_SHADOW_STACK_STATUS and PR_LOCK_SHADOW_STACK_STATUS affect only the
+ thread that called them, any other running threads will be unaffected.
+
+* New threads inherit the GCS configuration of the thread that created them.
+
+* GCS is disabled on exec().
+
+* The current GCS configuration for a thread may be read with the
+ PR_GET_SHADOW_STACK_STATUS prctl(), this returns the same flags that
+ are passed to PR_SET_SHADOW_STACK_STATUS.
+
+* If GCS is disabled for a thread after having previously been enabled then
+ the stack will remain allocated for the lifetime of the thread. At present
+ any attempt to reenable GCS for the thread will be rejected, this may be
+ revisited in future.
+
+* It should be noted that since enabling GCS will result in GCS becoming
+ active immediately it is not normally possible to return from the function
+ that invoked the prctl() that enabled GCS. It is expected that the normal
+ usage will be that GCS is enabled very early in execution of a program.
+
+
+
+3. Allocation of Guarded Control Stacks
+----------------------------------------
+
+* When GCS is enabled for a thread a new Guarded Control Stack will be
+ allocated for it of half the standard stack size or 2 gigabytes,
+ whichever is smaller.
+
+* When a new thread is created by a thread which has GCS enabled then a
+ new Guarded Control Stack will be allocated for the new thread with
+ half the size of the standard stack.
+
+* When a stack is allocated by enabling GCS or during thread creation then
+ the top 8 bytes of the stack will be initialised to 0 and GCSPR_EL0 will
+ be set to point to the address of this 0 value, this can be used to
+ detect the top of the stack.
+
+* Additional Guarded Control Stacks can be allocated using the
+ map_shadow_stack() system call.
+
+* Stacks allocated using map_shadow_stack() can optionally have an end of
+ stack marker and cap placed at the top of the stack. If the flag
+ SHADOW_STACK_SET_TOKEN is specified a cap will be placed on the stack,
+ if SHADOW_STACK_SET_MARKER is not specified the cap will be the top 8
+ bytes of the stack and if it is specified then the cap will be the next
+ 8 bytes. While specifying just SHADOW_STACK_SET_MARKER by itself is
+ valid since the marker is all bits 0 it has no observable effect.
+
+* Stacks allocated using map_shadow_stack() must have a size which is a
+ multiple of 8 bytes larger than 8 bytes and must be 8 bytes aligned.
+
+* An address can be specified to map_shadow_stack(), if one is provided then
+ it must be aligned to a page boundary.
+
+* When a thread is freed the Guarded Control Stack initially allocated for
+ that thread will be freed. Note carefully that if the stack has been
+ switched this may not be the stack currently in use by the thread.
+
+
+4. Signal handling
+--------------------
+
+* A new signal frame record gcs_context encodes the current GCS mode and
+ pointer for the interrupted context on signal delivery. This will always
+ be present on systems that support GCS.
+
+* The record contains a flag field which reports the current GCS configuration
+ for the interrupted context as PR_GET_SHADOW_STACK_STATUS would.
+
+* The signal handler is run with the same GCS configuration as the interrupted
+ context.
+
+* When GCS is enabled for the interrupted thread a signal handling specific
+ GCS cap token will be written to the GCS, this is an architectural GCS cap
+ with the token type (bits 0..11) all clear. The GCSPR_EL0 reported in the
+ signal frame will point to this cap token.
+
+* The signal handler will use the same GCS as the interrupted context.
+
+* When GCS is enabled on signal entry a frame with the address of the signal
+ return handler will be pushed onto the GCS, allowing return from the signal
+ handler via RET as normal. This will not be reported in the gcs_context in
+ the signal frame.
+
+
+5. Signal return
+-----------------
+
+When returning from a signal handler:
+
+* If there is a gcs_context record in the signal frame then the GCS flags
+ and GCSPR_EL0 will be restored from that context prior to further
+ validation.
+
+* If there is no gcs_context record in the signal frame then the GCS
+ configuration will be unchanged.
+
+* If GCS is enabled on return from a signal handler then GCSPR_EL0 must
+ point to a valid GCS signal cap record, this will be popped from the
+ GCS prior to signal return.
+
+* If the GCS configuration is locked when returning from a signal then any
+ attempt to change the GCS configuration will be treated as an error. This
+ is true even if GCS was not enabled prior to signal entry.
+
+* GCS may be disabled via signal return but any attempt to enable GCS via
+ signal return will be rejected.
+
+
+6. ptrace extensions
+---------------------
+
+* A new regset NT_ARM_GCS is defined for use with PTRACE_GETREGSET and
+ PTRACE_SETREGSET.
+
+* The GCS mode, including enable and disable, may be configured via ptrace.
+ If GCS is enabled via ptrace no new GCS will be allocated for the thread.
+
+* Configuration via ptrace ignores locking of GCS mode bits.
+
+
+7. ELF coredump extensions
+---------------------------
+
+* NT_ARM_GCS notes will be added to each coredump for each thread of the
+ dumped process. The contents will be equivalent to the data that would
+ have been read if a PTRACE_GETREGSET of the corresponding type were
+ executed for each thread when the coredump was generated.
+
+
+
+8. /proc extensions
+--------------------
+
+* Guarded Control Stack pages will include "ss" in their VmFlags in
+ /proc/<pid>/smaps.
diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst
index 78544de..6a012c9 100644
--- a/Documentation/arch/arm64/index.rst
+++ b/Documentation/arch/arm64/index.rst
@@ -10,16 +10,19 @@
acpi_object_usage
amu
arm-acpi
+ arm-cca
asymmetric-32bit
booting
cpu-feature-registers
cpu-hotplug
elf_hwcaps
+ gcs
hugetlbpage
kdump
legacy_instructions
memory
memory-tagging-extension
+ mops
perf
pointer-authentication
ptdump
diff --git a/Documentation/arch/arm64/mops.rst b/Documentation/arch/arm64/mops.rst
new file mode 100644
index 0000000..2ef5b14
--- /dev/null
+++ b/Documentation/arch/arm64/mops.rst
@@ -0,0 +1,44 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================
+Memory copy/set instructions (MOPS)
+===================================
+
+A MOPS memory copy/set operation consists of three consecutive CPY* or SET*
+instructions: a prologue, main and epilogue (for example: CPYP, CPYM, CPYE).
+
+A main or epilogue instruction can take a MOPS exception for various reasons,
+for example when a task is migrated to a CPU with a different MOPS
+implementation, or when the instruction's alignment and size requirements are
+not met. The software exception handler is then expected to reset the registers
+and restart execution from the prologue instruction. Normally this is handled
+by the kernel.
+
+For more details refer to "D1.3.5.7 Memory Copy and Memory Set exceptions" in
+the Arm Architecture Reference Manual DDI 0487K.a (Arm ARM).
+
+.. _arm64_mops_hyp:
+
+Hypervisor requirements
+-----------------------
+
+A hypervisor running a Linux guest must handle all MOPS exceptions from the
+guest kernel, as Linux may not be able to handle the exception at all times.
+For example, a MOPS exception can be taken when the hypervisor migrates a vCPU
+to another physical CPU with a different MOPS implementation.
+
+To do this, the hypervisor must:
+
+ - Set HCRX_EL2.MCE2 to 1 so that the exception is taken to the hypervisor.
+
+ - Have an exception handler that implements the algorithm from the Arm ARM
+ rules CNTMJ and MWFQH.
+
+ - Set the guest's PSTATE.SS to 0 in the exception handler, to handle a
+ potential step of the current instruction.
+
+ Note: Clearing PSTATE.SS is needed so that a single step exception is taken
+ on the next instruction (the prologue instruction). Otherwise prologue
+ would get silently stepped over and the single step exception taken on the
+ main instruction. Note that if the guest instruction is not being stepped
+ then clearing PSTATE.SS has no effect.
diff --git a/Documentation/arch/arm64/sme.rst b/Documentation/arch/arm64/sme.rst
index be317d4..b2fa01f 100644
--- a/Documentation/arch/arm64/sme.rst
+++ b/Documentation/arch/arm64/sme.rst
@@ -346,6 +346,10 @@
* Writes to NT_ARM_ZT will set PSTATE.ZA to 1.
+* If any register data is provided along with SME_PT_VL_ONEXEC then the
+ registers data will be interpreted with the current vector length, not
+ the vector length configured for use on exec.
+
8. ELF coredump extensions
---------------------------
diff --git a/Documentation/arch/arm64/sve.rst b/Documentation/arch/arm64/sve.rst
index 8d8837f..2815249 100644
--- a/Documentation/arch/arm64/sve.rst
+++ b/Documentation/arch/arm64/sve.rst
@@ -402,6 +402,10 @@
streaming mode and any SETREGSET of NT_ARM_SSVE will enter streaming mode
if the target was not in streaming mode.
+* If any register data is provided along with SVE_PT_VL_ONEXEC then the
+ registers data will be interpreted with the current vector length, not
+ the vector length configured for use on exec.
+
* The effect of writing a partial, incomplete payload is unspecified.
diff --git a/Documentation/block/cmdline-partition.rst b/Documentation/block/cmdline-partition.rst
index 530bedf..526ba20 100644
--- a/Documentation/block/cmdline-partition.rst
+++ b/Documentation/block/cmdline-partition.rst
@@ -39,13 +39,16 @@
create a link to block device partition with the name "PARTNAME".
User space application can access partition by partition name.
+ro
+ read-only. Flag the partition as read-only.
+
Example:
eMMC disk names are "mmcblk0" and "mmcblk0boot0".
bootargs::
- 'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)'
+ 'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot)ro,-(kernel)'
dmesg::
diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst
index ff74b3e..51665a3 100644
--- a/Documentation/block/ublk.rst
+++ b/Documentation/block/ublk.rst
@@ -199,24 +199,36 @@
- user recovery feature description
- Two new features are added for user recovery: ``UBLK_F_USER_RECOVERY`` and
- ``UBLK_F_USER_RECOVERY_REISSUE``.
+ Three new features are added for user recovery: ``UBLK_F_USER_RECOVERY``,
+ ``UBLK_F_USER_RECOVERY_REISSUE``, and ``UBLK_F_USER_RECOVERY_FAIL_IO``. To
+ enable recovery of ublk devices after the ublk server exits, the ublk server
+ should specify the ``UBLK_F_USER_RECOVERY`` flag when creating the device. The
+ ublk server may additionally specify at most one of
+ ``UBLK_F_USER_RECOVERY_REISSUE`` and ``UBLK_F_USER_RECOVERY_FAIL_IO`` to
+ modify how I/O is handled while the ublk server is dying/dead (this is called
+ the ``nosrv`` case in the driver code).
- With ``UBLK_F_USER_RECOVERY`` set, after one ubq_daemon(ublk server's io
+ With just ``UBLK_F_USER_RECOVERY`` set, after one ubq_daemon(ublk server's io
handler) is dying, ublk does not delete ``/dev/ublkb*`` during the whole
recovery stage and ublk device ID is kept. It is ublk server's
responsibility to recover the device context by its own knowledge.
Requests which have not been issued to userspace are requeued. Requests
which have been issued to userspace are aborted.
- With ``UBLK_F_USER_RECOVERY_REISSUE`` set, after one ubq_daemon(ublk
- server's io handler) is dying, contrary to ``UBLK_F_USER_RECOVERY``,
+ With ``UBLK_F_USER_RECOVERY_REISSUE`` additionally set, after one ubq_daemon
+ (ublk server's io handler) is dying, contrary to ``UBLK_F_USER_RECOVERY``,
requests which have been issued to userspace are requeued and will be
re-issued to the new process after handling ``UBLK_CMD_END_USER_RECOVERY``.
``UBLK_F_USER_RECOVERY_REISSUE`` is designed for backends who tolerate
double-write since the driver may issue the same I/O request twice. It
might be useful to a read-only FS or a VM backend.
+ With ``UBLK_F_USER_RECOVERY_FAIL_IO`` additionally set, after the ublk server
+ exits, requests which have issued to userspace are failed, as are any
+ subsequently issued requests. Applications continuously issuing I/O against
+ devices with this flag set will see a stream of I/O errors until a new ublk
+ server recovers the device.
+
Unprivileged ublk device is supported by passing ``UBLK_F_UNPRIVILEGED_DEV``.
Once the flag is set, all control commands can be sent by unprivileged
user. Except for command of ``UBLK_CMD_ADD_DEV``, permission check on
diff --git a/Documentation/crypto/api-akcipher.rst b/Documentation/crypto/api-akcipher.rst
index 40aa874..ca1ecdd 100644
--- a/Documentation/crypto/api-akcipher.rst
+++ b/Documentation/crypto/api-akcipher.rst
@@ -8,10 +8,10 @@
---------------------
.. kernel-doc:: include/crypto/akcipher.h
- :doc: Generic Public Key API
+ :doc: Generic Public Key Cipher API
.. kernel-doc:: include/crypto/akcipher.h
- :functions: crypto_alloc_akcipher crypto_free_akcipher crypto_akcipher_set_pub_key crypto_akcipher_set_priv_key crypto_akcipher_maxsize crypto_akcipher_encrypt crypto_akcipher_decrypt crypto_akcipher_sign crypto_akcipher_verify
+ :functions: crypto_alloc_akcipher crypto_free_akcipher crypto_akcipher_set_pub_key crypto_akcipher_set_priv_key crypto_akcipher_maxsize crypto_akcipher_encrypt crypto_akcipher_decrypt
Asymmetric Cipher Request Handle
--------------------------------
diff --git a/Documentation/crypto/api-sig.rst b/Documentation/crypto/api-sig.rst
new file mode 100644
index 0000000..aaec18e
--- /dev/null
+++ b/Documentation/crypto/api-sig.rst
@@ -0,0 +1,15 @@
+Asymmetric Signature Algorithm Definitions
+------------------------------------------
+
+.. kernel-doc:: include/crypto/sig.h
+ :functions: sig_alg
+
+Asymmetric Signature API
+------------------------
+
+.. kernel-doc:: include/crypto/sig.h
+ :doc: Generic Public Key Signature API
+
+.. kernel-doc:: include/crypto/sig.h
+ :functions: crypto_alloc_sig crypto_free_sig crypto_sig_set_pubkey crypto_sig_set_privkey crypto_sig_keysize crypto_sig_maxsize crypto_sig_digestsize crypto_sig_sign crypto_sig_verify
+
diff --git a/Documentation/crypto/api.rst b/Documentation/crypto/api.rst
index ff31c30..8b2a905 100644
--- a/Documentation/crypto/api.rst
+++ b/Documentation/crypto/api.rst
@@ -10,4 +10,5 @@
api-digest
api-rng
api-akcipher
+ api-sig
api-kpp
diff --git a/Documentation/crypto/architecture.rst b/Documentation/crypto/architecture.rst
index 646c338..15dcd62 100644
--- a/Documentation/crypto/architecture.rst
+++ b/Documentation/crypto/architecture.rst
@@ -214,6 +214,8 @@
- CRYPTO_ALG_TYPE_AKCIPHER Asymmetric cipher
+- CRYPTO_ALG_TYPE_SIG Asymmetric signature
+
- CRYPTO_ALG_TYPE_PCOMPRESS Enhanced version of
CRYPTO_ALG_TYPE_COMPRESS allowing for segmented compression /
decompression instead of performing the operation on one segment
diff --git a/Documentation/devicetree/bindings/arm/pmu.yaml b/Documentation/devicetree/bindings/arm/pmu.yaml
index 528544d..a148ff5 100644
--- a/Documentation/devicetree/bindings/arm/pmu.yaml
+++ b/Documentation/devicetree/bindings/arm/pmu.yaml
@@ -74,6 +74,7 @@
- qcom,krait-pmu
- qcom,scorpion-pmu
- qcom,scorpion-mp-pmu
+ - samsung,mongoose-pmu
interrupts:
# Don't know how many CPUs, so no constraints to specify
diff --git a/Documentation/devicetree/bindings/ata/ahci-platform.yaml b/Documentation/devicetree/bindings/ata/ahci-platform.yaml
index ef19468..cc35cdc 100644
--- a/Documentation/devicetree/bindings/ata/ahci-platform.yaml
+++ b/Documentation/devicetree/bindings/ata/ahci-platform.yaml
@@ -84,6 +84,9 @@
minItems: 1
maxItems: 3
+ iommus:
+ maxItems: 1
+
patternProperties:
"^sata-port@[0-9a-f]+$":
$ref: /schemas/ata/ahci-common.yaml#/$defs/ahci-port
diff --git a/Documentation/devicetree/bindings/crypto/qcom-qce.yaml b/Documentation/devicetree/bindings/crypto/qcom-qce.yaml
index e285e38..c09be97 100644
--- a/Documentation/devicetree/bindings/crypto/qcom-qce.yaml
+++ b/Documentation/devicetree/bindings/crypto/qcom-qce.yaml
@@ -44,6 +44,7 @@
- items:
- enum:
+ - qcom,sa8775p-qce
- qcom,sc7280-qce
- qcom,sm6350-qce
- qcom,sm8250-qce
diff --git a/Documentation/devicetree/bindings/mmc/mmc-card.yaml b/Documentation/devicetree/bindings/mmc/mmc-card.yaml
index fd34712..1d91d42 100644
--- a/Documentation/devicetree/bindings/mmc/mmc-card.yaml
+++ b/Documentation/devicetree/bindings/mmc/mmc-card.yaml
@@ -13,6 +13,10 @@
This documents describes the devicetree bindings for a mmc-host controller
child node describing a mmc-card / an eMMC.
+ It's possible to define a fixed partition table for an eMMC for the user
+ partition, the 2 BOOT partition (boot1/2) and the 4 GP (gp1/2/3/4) if supported
+ by the eMMC.
+
properties:
compatible:
const: mmc-card
@@ -26,6 +30,24 @@
Use this to indicate that the mmc-card has a broken hpi
implementation, and that hpi should not be used.
+patternProperties:
+ "^partitions(-boot[12]|-gp[14])?$":
+ $ref: /schemas/mtd/partitions/partitions.yaml
+
+ patternProperties:
+ "^partition@[0-9a-f]+$":
+ $ref: /schemas/mtd/partitions/partition.yaml
+
+ properties:
+ reg:
+ description: Must be multiple of 512 as it's converted
+ internally from bytes to SECTOR_SIZE (512 bytes)
+
+ required:
+ - reg
+
+ unevaluatedProperties: false
+
required:
- compatible
- reg
@@ -42,6 +64,36 @@
compatible = "mmc-card";
reg = <0>;
broken-hpi;
+
+ partitions {
+ compatible = "fixed-partitions";
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ partition@0 {
+ label = "kernel"; /* Kernel */
+ reg = <0x0 0x2000000>; /* 32 MB */
+ };
+
+ partition@2000000 {
+ label = "rootfs";
+ reg = <0x2000000 0x40000000>; /* 1GB */
+ };
+ };
+
+ partitions-boot1 {
+ compatible = "fixed-partitions";
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ partition@0 {
+ label = "bl";
+ reg = <0x0 0x2000000>; /* 32MB */
+ read-only;
+ };
+ };
};
};
diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml
index 37e8b98..8597ea6 100644
--- a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml
+++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml
@@ -31,7 +31,9 @@
- const: fsl,imx8dxl-ddr-pmu
- const: fsl,imx8-ddr-pmu
- items:
- - const: fsl,imx95-ddr-pmu
+ - enum:
+ - fsl,imx91-ddr-pmu
+ - fsl,imx95-ddr-pmu
- const: fsl,imx93-ddr-pmu
reg:
diff --git a/Documentation/devicetree/bindings/rng/airoha,en7581-trng.yaml b/Documentation/devicetree/bindings/rng/airoha,en7581-trng.yaml
new file mode 100644
index 0000000..dfc6d24
--- /dev/null
+++ b/Documentation/devicetree/bindings/rng/airoha,en7581-trng.yaml
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rng/airoha,en7581-trng.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Airoha EN7851 True Random Number Generator
+
+maintainers:
+ - Christian Marangi <ansuelsmth@gmail.com>
+
+properties:
+ compatible:
+ const: airoha,en7581-trng
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+ #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+ rng@1faa1000 {
+ compatible = "airoha,en7581-trng";
+ reg = <0x1faa1000 0x1000>;
+ interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>;
+ };
diff --git a/Documentation/devicetree/bindings/rng/brcm,bcm74110-rng.yaml b/Documentation/devicetree/bindings/rng/brcm,bcm74110-rng.yaml
new file mode 100644
index 0000000..8e89d4a
--- /dev/null
+++ b/Documentation/devicetree/bindings/rng/brcm,bcm74110-rng.yaml
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rng/brcm,bcm74110-rng.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: BCM74110 Random number generator
+
+description:
+ Random number generator used on the BCM74110.
+
+maintainers:
+ - Markus Mayer <mmayer@broadcom.com>
+ - Florian Fainelli <florian.fainelli@broadcom.com>
+
+properties:
+ compatible:
+ enum:
+ - brcm,bcm74110-rng
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+examples:
+ - |
+ rng@83ba000 {
+ compatible = "brcm,bcm74110-rng";
+ reg = <0x83ba000 0x14>;
+ };
diff --git a/Documentation/devicetree/bindings/rng/imx-rng.yaml b/Documentation/devicetree/bindings/rng/imx-rng.yaml
index 07f6ff8..252fa9a 100644
--- a/Documentation/devicetree/bindings/rng/imx-rng.yaml
+++ b/Documentation/devicetree/bindings/rng/imx-rng.yaml
@@ -14,8 +14,8 @@
oneOf:
- const: fsl,imx21-rnga
- const: fsl,imx25-rngb
+ - const: fsl,imx31-rnga
- items:
- - const: fsl,imx31-rnga
- const: fsl,imx21-rnga
- items:
- enum:
diff --git a/Documentation/devicetree/bindings/rng/omap_rng.yaml b/Documentation/devicetree/bindings/rng/inside-secure,safexcel-eip76.yaml
similarity index 79%
rename from Documentation/devicetree/bindings/rng/omap_rng.yaml
rename to Documentation/devicetree/bindings/rng/inside-secure,safexcel-eip76.yaml
index c0ac4f6..0877eb4 100644
--- a/Documentation/devicetree/bindings/rng/omap_rng.yaml
+++ b/Documentation/devicetree/bindings/rng/inside-secure,safexcel-eip76.yaml
@@ -1,20 +1,25 @@
# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
%YAML 1.2
---
-$id: http://devicetree.org/schemas/rng/omap_rng.yaml#
+$id: http://devicetree.org/schemas/rng/inside-secure,safexcel-eip76.yaml#
$schema: http://devicetree.org/meta-schemas/core.yaml#
-title: OMAP SoC and Inside-Secure HWRNG Module
+title: Inside-Secure HWRNG Module
maintainers:
- Jayesh Choudhary <j-choudhary@ti.com>
properties:
compatible:
- enum:
- - ti,omap2-rng
- - ti,omap4-rng
- - inside-secure,safexcel-eip76
+ oneOf:
+ - enum:
+ - ti,omap2-rng
+ - ti,omap4-rng
+ - inside-secure,safexcel-eip76
+ - items:
+ - enum:
+ - marvell,armada-8k-rng
+ - const: inside-secure,safexcel-eip76
ti,hwmods:
const: rng
diff --git a/Documentation/devicetree/bindings/rng/st,stm32-rng.yaml b/Documentation/devicetree/bindings/rng/st,stm32-rng.yaml
index 340d01d..7db65f4 100644
--- a/Documentation/devicetree/bindings/rng/st,stm32-rng.yaml
+++ b/Documentation/devicetree/bindings/rng/st,stm32-rng.yaml
@@ -18,12 +18,19 @@
enum:
- st,stm32-rng
- st,stm32mp13-rng
+ - st,stm32mp25-rng
reg:
maxItems: 1
clocks:
- maxItems: 1
+ minItems: 1
+ maxItems: 2
+
+ clock-names:
+ items:
+ - const: core
+ - const: bus
resets:
maxItems: 1
@@ -57,6 +64,25 @@
properties:
st,rng-lock-conf: false
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - st,stm32-rng
+ - st,stm32mp13-rng
+ then:
+ properties:
+ clocks:
+ maxItems: 1
+ clock-names: false
+ else:
+ properties:
+ clocks:
+ minItems: 2
+ required:
+ - clock-names
+
additionalProperties: false
examples:
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
index e8e496d..44e9e77 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -29,6 +29,7 @@
fiemap
files
locks
+ multigrain-ts
mount_api
quota
seq_file
diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index b93115a..ef082e5 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -513,6 +513,21 @@
if the mapping is unwritten and the filesystem cannot handle zeroing
the unaligned regions without exposing stale contents.
+ * ``IOMAP_ATOMIC``: This write is being issued with torn-write
+ protection.
+ Only a single bio can be created for the write, and the write must
+ not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be
+ set.
+ The file range to write must be aligned to satisfy the requirements
+ of both the filesystem and the underlying block device's atomic
+ commit capabilities.
+ If filesystem metadata updates are required (e.g. unwritten extent
+ conversion or copy on write), all updates for the entire file range
+ must be committed atomically as well.
+ Only one space mapping is allowed per untorn write.
+ Untorn writes must be aligned to, and must not be longer than, a
+ single file block.
+
Callers commonly hold ``i_rwsem`` in shared or exclusive mode before
calling this function.
diff --git a/Documentation/filesystems/multigrain-ts.rst b/Documentation/filesystems/multigrain-ts.rst
new file mode 100644
index 0000000..c779e47
--- /dev/null
+++ b/Documentation/filesystems/multigrain-ts.rst
@@ -0,0 +1,125 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+Multigrain Timestamps
+=====================
+
+Introduction
+============
+Historically, the kernel has always used coarse time values to stamp inodes.
+This value is updated every jiffy, so any change that happens within that jiffy
+will end up with the same timestamp.
+
+When the kernel goes to stamp an inode (due to a read or write), it first gets
+the current time and then compares it to the existing timestamp(s) to see
+whether anything will change. If nothing changed, then it can avoid updating
+the inode's metadata.
+
+Coarse timestamps are therefore good from a performance standpoint, since they
+reduce the need for metadata updates, but bad from the standpoint of
+determining whether anything has changed, since a lot of things can happen in a
+jiffy.
+
+They are particularly troublesome with NFSv3, where unchanging timestamps can
+make it difficult to tell whether to invalidate caches. NFSv4 provides a
+dedicated change attribute that should always show a visible change, but not
+all filesystems implement this properly, causing the NFS server to substitute
+the ctime in many cases.
+
+Multigrain timestamps aim to remedy this by selectively using fine-grained
+timestamps when a file has had its timestamps queried recently, and the current
+coarse-grained time does not cause a change.
+
+Inode Timestamps
+================
+There are currently 3 timestamps in the inode that are updated to the current
+wallclock time on different activity:
+
+ctime:
+ The inode change time. This is stamped with the current time whenever
+ the inode's metadata is changed. Note that this value is not settable
+ from userland.
+
+mtime:
+ The inode modification time. This is stamped with the current time
+ any time a file's contents change.
+
+atime:
+ The inode access time. This is stamped whenever an inode's contents are
+ read. Widely considered to be a terrible mistake. Usually avoided with
+ options like noatime or relatime.
+
+Updating the mtime always implies a change to the ctime, but updating the
+atime due to a read request does not.
+
+Multigrain timestamps are only tracked for the ctime and the mtime. atimes are
+not affected and always use the coarse-grained value (subject to the floor).
+
+Inode Timestamp Ordering
+========================
+
+In addition to just providing info about changes to individual files, file
+timestamps also serve an important purpose in applications like "make". These
+programs measure timestamps in order to determine whether source files might be
+newer than cached objects.
+
+Userland applications like make can only determine ordering based on
+operational boundaries. For a syscall those are the syscall entry and exit
+points. For io_uring or nfsd operations, that's the request submission and
+response. In the case of concurrent operations, userland can make no
+determination about the order in which things will occur.
+
+For instance, if a single thread modifies one file, and then another file in
+sequence, the second file must show an equal or later mtime than the first. The
+same is true if two threads are issuing similar operations that do not overlap
+in time.
+
+If however, two threads have racing syscalls that overlap in time, then there
+is no such guarantee, and the second file may appear to have been modified
+before, after or at the same time as the first, regardless of which one was
+submitted first.
+
+Note that the above assumes that the system doesn't experience a backward jump
+of the realtime clock. If that occurs at an inopportune time, then timestamps
+can appear to go backward, even on a properly functioning system.
+
+Multigrain Timestamp Implementation
+===================================
+Multigrain timestamps are aimed at ensuring that changes to a single file are
+always recognizable, without violating the ordering guarantees when multiple
+different files are modified. This affects the mtime and the ctime, but the
+atime will always use coarse-grained timestamps.
+
+It uses an unused bit in the i_ctime_nsec field to indicate whether the mtime
+or ctime has been queried. If either or both have, then the kernel takes
+special care to ensure the next timestamp update will display a visible change.
+This ensures tight cache coherency for use-cases like NFS, without sacrificing
+the benefits of reduced metadata updates when files aren't being watched.
+
+The Ctime Floor Value
+=====================
+It's not sufficient to simply use fine or coarse-grained timestamps based on
+whether the mtime or ctime has been queried. A file could get a fine grained
+timestamp, and then a second file modified later could get a coarse-grained one
+that appears earlier than the first, which would break the kernel's timestamp
+ordering guarantees.
+
+To mitigate this problem, maintain a global floor value that ensures that
+this can't happen. The two files in the above example may appear to have been
+modified at the same time in such a case, but they will never show the reverse
+order. To avoid problems with realtime clock jumps, the floor is managed as a
+monotonic ktime_t, and the values are converted to realtime clock values as
+needed.
+
+Implementation Notes
+====================
+Multigrain timestamps are intended for use by local filesystems that get
+ctime values from the local clock. This is in contrast to network filesystems
+and the like that just mirror timestamp values from a server.
+
+For most filesystems, it's sufficient to just set the FS_MGTIME flag in the
+fstype->fs_flags in order to opt-in, providing the ctime is only ever set via
+inode_set_ctime_current(). If the filesystem has a ->getattr routine that
+doesn't call generic_fillattr, then it should call fill_mg_cmtime() to
+fill those values. For setattr, it should use setattr_copy() to update the
+timestamps, or otherwise mimic its behavior.
diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst
index f04ce12..de64d2d 100644
--- a/Documentation/filesystems/nfs/exporting.rst
+++ b/Documentation/filesystems/nfs/exporting.rst
@@ -238,10 +238,3 @@
all of an inode's dirty data on last close. Exports that behave this
way should set EXPORT_OP_FLUSH_ON_CLOSE so that NFSD knows to skip
waiting for writeback when closing such files.
-
- EXPORT_OP_ASYNC_LOCK - Indicates a capable filesystem to do async lock
- requests from lockd. Only set EXPORT_OP_ASYNC_LOCK if the filesystem has
- it's own ->lock() functionality as core posix_lock_file() implementation
- has no async lock request handling yet. For more information about how to
- indicate an async lock request from a ->lock() file_operations struct, see
- fs/locks.c and comment for the function vfs_lock_file().
diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst
index 3436447..4c8387e 100644
--- a/Documentation/filesystems/overlayfs.rst
+++ b/Documentation/filesystems/overlayfs.rst
@@ -440,6 +440,23 @@
fsconfig(fs_fd, FSCONFIG_SET_STRING, "datadir+", "/do2", 0);
+Specifying layers via file descriptors
+--------------------------------------
+
+Since kernel v6.13, overlayfs supports specifying layers via file descriptors in
+addition to specifying them as paths. This feature is available for the
+"datadir+", "lowerdir+", "upperdir", and "workdir+" mount options with the
+fsconfig syscall from the new mount api::
+
+ fsconfig(fs_fd, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower1);
+ fsconfig(fs_fd, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower2);
+ fsconfig(fs_fd, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower3);
+ fsconfig(fs_fd, FSCONFIG_SET_FD, "datadir+", NULL, fd_data1);
+ fsconfig(fs_fd, FSCONFIG_SET_FD, "datadir+", NULL, fd_data2);
+ fsconfig(fs_fd, FSCONFIG_SET_FD, "workdir", NULL, fd_work);
+ fsconfig(fs_fd, FSCONFIG_SET_FD, "upperdir", NULL, fd_upper);
+
+
fs-verity support
-----------------
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index e834779..6a882c5 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -579,7 +579,7 @@
mt arm64 MTE allocation tags are enabled
um userfaultfd missing tracking
uw userfaultfd wr-protect tracking
- ss shadow stack page
+ ss shadow/guarded control stack page
sl sealed
== =======================================
diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst
index 56a26c8..d677e04 100644
--- a/Documentation/filesystems/tmpfs.rst
+++ b/Documentation/filesystems/tmpfs.rst
@@ -241,6 +241,28 @@
will give you tmpfs instance on /mytmpfs which can allocate 10GB
RAM/SWAP in 10240 inodes and it is only accessible by root.
+tmpfs has the following mounting options for case-insensitive lookup support:
+
+================= ==============================================================
+casefold Enable casefold support at this mount point using the given
+ argument as the encoding standard. Currently only UTF-8
+ encodings are supported. If no argument is used, it will load
+ the latest UTF-8 encoding available.
+strict_encoding Enable strict encoding at this mount point (disabled by
+ default). In this mode, the filesystem refuses to create file
+ and directory with names containing invalid UTF-8 characters.
+================= ==============================================================
+
+This option doesn't render the entire filesystem case-insensitive. One needs to
+still set the casefold flag per directory, by flipping the +F attribute in an
+empty directory. Nevertheless, new directories will inherit the attribute. The
+mountpoint itself cannot be made case-insensitive.
+
+Example::
+
+ $ mount -t tmpfs -o casefold=utf8-12.1.0,strict_encoding fs_name /mytmpfs
+ $ mount -t tmpfs -o casefold fs_name /mytmpfs
+
:Author:
Christoph Rohland <cr@sap.com>, 1.12.01
@@ -250,3 +272,5 @@
KOSAKI Motohiro, 16 Mar 2010
:Updated:
Chris Down, 13 July 2020
+:Updated:
+ André Almeida, 23 Aug 2024
diff --git a/Documentation/networking/devmem.rst b/Documentation/networking/devmem.rst
index a55bf21..d953636 100644
--- a/Documentation/networking/devmem.rst
+++ b/Documentation/networking/devmem.rst
@@ -225,6 +225,15 @@
Failure to do so will exhaust the limited dmabuf that is bound to the RX queue
and will lead to packet drops.
+The user must pass no more than 128 tokens, with no more than 1024 total frags
+among the token->token_count across all the tokens. If the user provides more
+than 1024 frags, the kernel will free up to 1024 frags and return early.
+
+The kernel returns the number of actual frags freed. The number of frags freed
+can be less than the tokens provided by the user in case of:
+
+(a) an internal kernel leak bug.
+(b) the user passed more than 1024 frags.
Implementation & Caveats
========================
diff --git a/Documentation/security/landlock.rst b/Documentation/security/landlock.rst
index 36f2650..59ecdb1 100644
--- a/Documentation/security/landlock.rst
+++ b/Documentation/security/landlock.rst
@@ -11,18 +11,18 @@
Landlock's goal is to create scoped access-control (i.e. sandboxing). To
harden a whole system, this feature should be available to any process,
-including unprivileged ones. Because such process may be compromised or
+including unprivileged ones. Because such a process may be compromised or
backdoored (i.e. untrusted), Landlock's features must be safe to use from the
kernel and other processes point of view. Landlock's interface must therefore
expose a minimal attack surface.
Landlock is designed to be usable by unprivileged processes while following the
system security policy enforced by other access control mechanisms (e.g. DAC,
-LSM). Indeed, a Landlock rule shall not interfere with other access-controls
-enforced on the system, only add more restrictions.
+LSM). A Landlock rule shall not interfere with other access-controls enforced
+on the system, only add more restrictions.
Any user can enforce Landlock rulesets on their processes. They are merged and
-evaluated according to the inherited ones in a way that ensures that only more
+evaluated against inherited rulesets in a way that ensures that only more
constraints can be added.
User space documentation can be found here:
@@ -43,7 +43,7 @@
only impact the processes requesting them.
* Resources (e.g. file descriptors) directly obtained from the kernel by a
sandboxed process shall retain their scoped accesses (at the time of resource
- acquisition) whatever process use them.
+ acquisition) whatever process uses them.
Cf. `File descriptor access rights`_.
Design choices
@@ -71,7 +71,7 @@
Taking the ``LANDLOCK_ACCESS_FS_TRUNCATE`` right as an example, it may be
allowed to open a file for writing without being allowed to
:manpage:`ftruncate` the resulting file descriptor if the related file
-hierarchy doesn't grant such access right. The following sequences of
+hierarchy doesn't grant that access right. The following sequences of
operations have the same semantic and should then have the same result:
* ``truncate(path);``
@@ -81,7 +81,7 @@
attached to file descriptors are retained even if they are passed between
processes (e.g. through a Unix domain socket). Such access rights will then be
enforced even if the receiving process is not sandboxed by Landlock. Indeed,
-this is required to keep a consistent access control over the whole system, and
+this is required to keep access controls consistent over the whole system, and
this avoids unattended bypasses through file descriptor passing (i.e. confused
deputy attack).
diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst
index c8d3e46..d639c61 100644
--- a/Documentation/userspace-api/landlock.rst
+++ b/Documentation/userspace-api/landlock.rst
@@ -8,13 +8,13 @@
=====================================
:Author: Mickaël Salaün
-:Date: September 2024
+:Date: October 2024
-The goal of Landlock is to enable to restrict ambient rights (e.g. global
+The goal of Landlock is to enable restriction of ambient rights (e.g. global
filesystem or network access) for a set of processes. Because Landlock
-is a stackable LSM, it makes possible to create safe security sandboxes as new
-security layers in addition to the existing system-wide access-controls. This
-kind of sandbox is expected to help mitigate the security impact of bugs or
+is a stackable LSM, it makes it possible to create safe security sandboxes as
+new security layers in addition to the existing system-wide access-controls.
+This kind of sandbox is expected to help mitigate the security impact of bugs or
unexpected/malicious behaviors in user space applications. Landlock empowers
any process, including unprivileged ones, to securely restrict themselves.
@@ -86,8 +86,8 @@
LANDLOCK_SCOPE_SIGNAL,
};
-Because we may not know on which kernel version an application will be
-executed, it is safer to follow a best-effort security approach. Indeed, we
+Because we may not know which kernel version an application will be executed
+on, it is safer to follow a best-effort security approach. Indeed, we
should try to protect users as much as possible whatever the kernel they are
using.
@@ -129,7 +129,7 @@
LANDLOCK_SCOPE_SIGNAL);
}
-This enables to create an inclusive ruleset that will contain our rules.
+This enables the creation of an inclusive ruleset that will contain our rules.
.. code-block:: c
@@ -219,42 +219,41 @@
now restricted and this policy will be enforced on all its subsequently created
children as well. Once a thread is landlocked, there is no way to remove its
security policy; only adding more restrictions is allowed. These threads are
-now in a new Landlock domain, merge of their parent one (if any) with the new
-ruleset.
+now in a new Landlock domain, which is a merger of their parent one (if any)
+with the new ruleset.
Full working code can be found in `samples/landlock/sandboxer.c`_.
Good practices
--------------
-It is recommended setting access rights to file hierarchy leaves as much as
+It is recommended to set access rights to file hierarchy leaves as much as
possible. For instance, it is better to be able to have ``~/doc/`` as a
read-only hierarchy and ``~/tmp/`` as a read-write hierarchy, compared to
``~/`` as a read-only hierarchy and ``~/tmp/`` as a read-write hierarchy.
Following this good practice leads to self-sufficient hierarchies that do not
depend on their location (i.e. parent directories). This is particularly
relevant when we want to allow linking or renaming. Indeed, having consistent
-access rights per directory enables to change the location of such directory
+access rights per directory enables changing the location of such directories
without relying on the destination directory access rights (except those that
are required for this operation, see ``LANDLOCK_ACCESS_FS_REFER``
documentation).
Having self-sufficient hierarchies also helps to tighten the required access
rights to the minimal set of data. This also helps avoid sinkhole directories,
-i.e. directories where data can be linked to but not linked from. However,
+i.e. directories where data can be linked to but not linked from. However,
this depends on data organization, which might not be controlled by developers.
In this case, granting read-write access to ``~/tmp/``, instead of write-only
-access, would potentially allow to move ``~/tmp/`` to a non-readable directory
+access, would potentially allow moving ``~/tmp/`` to a non-readable directory
and still keep the ability to list the content of ``~/tmp/``.
Layers of file path access rights
---------------------------------
Each time a thread enforces a ruleset on itself, it updates its Landlock domain
-with a new layer of policy. Indeed, this complementary policy is stacked with
-the potentially other rulesets already restricting this thread. A sandboxed
-thread can then safely add more constraints to itself with a new enforced
-ruleset.
+with a new layer of policy. This complementary policy is stacked with any
+other rulesets potentially already restricting this thread. A sandboxed thread
+can then safely add more constraints to itself with a new enforced ruleset.
One policy layer grants access to a file path if at least one of its rules
encountered on the path grants the access. A sandboxed thread can only access
@@ -265,7 +264,7 @@
Bind mounts and OverlayFS
-------------------------
-Landlock enables to restrict access to file hierarchies, which means that these
+Landlock enables restricting access to file hierarchies, which means that these
access rights can be propagated with bind mounts (cf.
Documentation/filesystems/sharedsubtree.rst) but not with
Documentation/filesystems/overlayfs.rst.
@@ -278,21 +277,21 @@
are the result of bind mounts or not.
An OverlayFS mount point consists of upper and lower layers. These layers are
-combined in a merge directory, result of the mount point. This merge hierarchy
-may include files from the upper and lower layers, but modifications performed
-on the merge hierarchy only reflects on the upper layer. From a Landlock
-policy point of view, each OverlayFS layers and merge hierarchies are
-standalone and contains their own set of files and directories, which is
-different from bind mounts. A policy restricting an OverlayFS layer will not
-restrict the resulted merged hierarchy, and vice versa. Landlock users should
-then only think about file hierarchies they want to allow access to, regardless
-of the underlying filesystem.
+combined in a merge directory, and that merged directory becomes available at
+the mount point. This merge hierarchy may include files from the upper and
+lower layers, but modifications performed on the merge hierarchy only reflect
+on the upper layer. From a Landlock policy point of view, all OverlayFS layers
+and merge hierarchies are standalone and each contains their own set of files
+and directories, which is different from bind mounts. A policy restricting an
+OverlayFS layer will not restrict the resulted merged hierarchy, and vice versa.
+Landlock users should then only think about file hierarchies they want to allow
+access to, regardless of the underlying filesystem.
Inheritance
-----------
Every new thread resulting from a :manpage:`clone(2)` inherits Landlock domain
-restrictions from its parent. This is similar to the seccomp inheritance (cf.
+restrictions from its parent. This is similar to seccomp inheritance (cf.
Documentation/userspace-api/seccomp_filter.rst) or any other LSM dealing with
task's :manpage:`credentials(7)`. For instance, one process's thread may apply
Landlock rules to itself, but they will not be automatically applied to other
@@ -311,8 +310,8 @@
A sandboxed process has less privileges than a non-sandboxed process and must
then be subject to additional restrictions when manipulating another process.
To be allowed to use :manpage:`ptrace(2)` and related syscalls on a target
-process, a sandboxed process should have a subset of the target process rules,
-which means the tracee must be in a sub-domain of the tracer.
+process, a sandboxed process should have a superset of the target process's
+access rights, which means the tracee must be in a sub-domain of the tracer.
IPC scoping
-----------
@@ -322,7 +321,7 @@
for a set of actions by specifying it on a ruleset. For example, if a
sandboxed process should not be able to :manpage:`connect(2)` to a
non-sandboxed process through abstract :manpage:`unix(7)` sockets, we can
-specify such restriction with ``LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET``.
+specify such a restriction with ``LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET``.
Moreover, if a sandboxed process should not be able to send a signal to a
non-sandboxed process, we can specify this restriction with
``LANDLOCK_SCOPE_SIGNAL``.
@@ -394,7 +393,7 @@
Landlock is designed to be compatible with past and future versions of the
kernel. This is achieved thanks to the system call attributes and the
associated bitflags, particularly the ruleset's ``handled_access_fs``. Making
-handled access right explicit enables the kernel and user space to have a clear
+handled access rights explicit enables the kernel and user space to have a clear
contract with each other. This is required to make sure sandboxing will not
get stricter with a system update, which could break applications.
@@ -563,33 +562,34 @@
Starting with the Landlock ABI version 3, it is now possible to securely control
truncation thanks to the new ``LANDLOCK_ACCESS_FS_TRUNCATE`` access right.
-Network support (ABI < 4)
--------------------------
+TCP bind and connect (ABI < 4)
+------------------------------
Starting with the Landlock ABI version 4, it is now possible to restrict TCP
bind and connect actions to only a set of allowed ports thanks to the new
``LANDLOCK_ACCESS_NET_BIND_TCP`` and ``LANDLOCK_ACCESS_NET_CONNECT_TCP``
access rights.
-IOCTL (ABI < 5)
----------------
+Device IOCTL (ABI < 5)
+----------------------
IOCTL operations could not be denied before the fifth Landlock ABI, so
:manpage:`ioctl(2)` is always allowed when using a kernel that only supports an
earlier ABI.
Starting with the Landlock ABI version 5, it is possible to restrict the use of
-:manpage:`ioctl(2)` using the new ``LANDLOCK_ACCESS_FS_IOCTL_DEV`` right.
+:manpage:`ioctl(2)` on character and block devices using the new
+``LANDLOCK_ACCESS_FS_IOCTL_DEV`` right.
-Abstract UNIX socket scoping (ABI < 6)
---------------------------------------
+Abstract UNIX socket (ABI < 6)
+------------------------------
Starting with the Landlock ABI version 6, it is possible to restrict
connections to an abstract :manpage:`unix(7)` socket by setting
``LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET`` to the ``scoped`` ruleset attribute.
-Signal scoping (ABI < 6)
-------------------------
+Signal (ABI < 6)
+----------------
Starting with the Landlock ABI version 6, it is possible to restrict
:manpage:`signal(7)` sending by setting ``LANDLOCK_SCOPE_SIGNAL`` to the
@@ -605,9 +605,9 @@
Landlock was first introduced in Linux 5.13 but it must be configured at build
time with ``CONFIG_SECURITY_LANDLOCK=y``. Landlock must also be enabled at boot
-time as the other security modules. The list of security modules enabled by
+time like other security modules. The list of security modules enabled by
default is set with ``CONFIG_LSM``. The kernel configuration should then
-contains ``CONFIG_LSM=landlock,[...]`` with ``[...]`` as the list of other
+contain ``CONFIG_LSM=landlock,[...]`` with ``[...]`` as the list of other
potentially useful security modules for the running system (see the
``CONFIG_LSM`` help).
@@ -669,7 +669,7 @@
What about user space sandbox managers?
---------------------------------------
-Using user space process to enforce restrictions on kernel resources can lead
+Using user space processes to enforce restrictions on kernel resources can lead
to race conditions or inconsistent evaluations (i.e. `Incorrect mirroring of
the OS code and state
<https://www.ndss-symposium.org/ndss2003/traps-and-pitfalls-practical-problems-system-call-interposition-based-security-tools/>`_).
diff --git a/Documentation/virt/kvm/s390/s390-diag.rst b/Documentation/virt/kvm/s390/s390-diag.rst
index ca85f03..3e4f9e3 100644
--- a/Documentation/virt/kvm/s390/s390-diag.rst
+++ b/Documentation/virt/kvm/s390/s390-diag.rst
@@ -35,20 +35,24 @@
documentation for the s390 hypervisors defining them.
-DIAGNOSE function code 'X'500' - KVM virtio functions
------------------------------------------------------
+DIAGNOSE function code 'X'500' - KVM functions
+----------------------------------------------
-If the function code specifies 0x500, various virtio-related functions
-are performed.
+If the function code specifies 0x500, various KVM-specific functions
+are performed, including virtio functions.
-General register 1 contains the virtio subfunction code. Supported
-virtio subfunctions depend on KVM's userspace. Generally, userspace
-provides either s390-virtio (subcodes 0-2) or virtio-ccw (subcode 3).
+General register 1 contains the subfunction code. Supported subfunctions
+depend on KVM's userspace. Regarding virtio subfunctions, generally
+userspace provides either s390-virtio (subcodes 0-2) or virtio-ccw
+(subcode 3).
Upon completion of the DIAGNOSE instruction, general register 2 contains
the function's return code, which is either a return code or a subcode
specific value.
+If the specified subfunction is not supported, a SPECIFICATION exception
+will be triggered.
+
Subcode 0 - s390-virtio notification and early console printk
Handled by userspace.
@@ -76,6 +80,23 @@
See also the virtio standard for a discussion of this hypercall.
+Subcode 4 - storage-limit
+ Handled by userspace.
+
+ After completion of the DIAGNOSE call, general register 2 will
+ contain the storage limit: the maximum physical address that might be
+ used for storage throughout the lifetime of the VM.
+
+ The storage limit does not indicate currently usable storage, it may
+ include holes, standby storage and areas reserved for other means, such
+ as memory hotplug or virtio-mem devices. Other interfaces for detecting
+ actually usable storage, such as SCLP, must be used in conjunction with
+ this subfunction.
+
+ Note that the storage limit can be larger, but never smaller than the
+ maximum storage address indicated by SCLP via the "maximum storage
+ increment" and the "increment size".
+
DIAGNOSE function code 'X'501 - KVM breakpoint
----------------------------------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 21fdaa1..6b3ef11 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6301,7 +6301,6 @@
M: "Maciej W. Rozycki" <macro@orcam.me.uk>
L: linux-mips@vger.kernel.org
S: Maintained
-W: http://www.linux-mips.org/wiki/DECstation
F: arch/mips/dec/
F: arch/mips/include/asm/dec/
F: arch/mips/include/asm/mach-dec/
@@ -8061,10 +8060,10 @@
F: drivers/edac/highbank*
EDAC-CAVIUM OCTEON
-M: Ralf Baechle <ralf@linux-mips.org>
+M: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
L: linux-edac@vger.kernel.org
L: linux-mips@vger.kernel.org
-S: Supported
+S: Maintained
F: drivers/edac/octeon_edac*
EDAC-CAVIUM THUNDERX
@@ -10493,6 +10492,7 @@
F: Documentation/mm/vmemmap_dedup.rst
F: fs/hugetlbfs/
F: include/linux/hugetlb.h
+F: include/trace/events/hugetlbfs.h
F: mm/hugetlb.c
F: mm/hugetlb_vmemmap.c
F: mm/hugetlb_vmemmap.h
@@ -11447,7 +11447,7 @@
F: drivers/dma/ioat*
INTEL IAA CRYPTO DRIVER
-M: Tom Zanussi <tom.zanussi@linux.intel.com>
+M: Kristen Accardi <kristen.c.accardi@intel.com>
L: linux-crypto@vger.kernel.org
S: Supported
F: Documentation/driver-api/crypto/iaa/iaa-crypto.rst
@@ -11885,7 +11885,7 @@
F: drivers/iio/gyro/mpu3050*
IOC3 ETHERNET DRIVER
-M: Ralf Baechle <ralf@linux-mips.org>
+M: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
L: linux-mips@vger.kernel.org
S: Maintained
F: drivers/net/ethernet/sgi/ioc3-eth.c
@@ -13816,6 +13816,12 @@
F: Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst
F: drivers/net/ethernet/marvell/octeontx2/af/
+MARVELL PEM PMU DRIVER
+M: Linu Cherian <lcherian@marvell.com>
+M: Gowthami Thiagarajan <gthiagarajan@marvell.com>
+S: Supported
+F: drivers/perf/marvell_pem_pmu.c
+
MARVELL PRESTERA ETHERNET SWITCH DRIVER
M: Taras Chornyi <taras.chornyi@plvision.eu>
S: Supported
@@ -15480,7 +15486,6 @@
M: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
L: linux-mips@vger.kernel.org
S: Maintained
-W: http://www.linux-mips.org/
Q: https://patchwork.kernel.org/project/linux-mips/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/mips/linux.git
F: Documentation/devicetree/bindings/mips/
@@ -15993,9 +15998,8 @@
F: tools/testing/selftests/net/netfilter/
NETROM NETWORK LAYER
-M: Ralf Baechle <ralf@linux-mips.org>
L: linux-hams@vger.kernel.org
-S: Maintained
+S: Orphan
W: https://linux-ax25.in-berlin.de
F: include/net/netrom.h
F: include/uapi/linux/netrom.h
@@ -19579,6 +19583,17 @@
F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.yaml
F: drivers/i2c/busses/i2c-emev2.c
+RENESAS ETHERNET AVB DRIVER
+M: Paul Barker <paul.barker.ct@bp.renesas.com>
+M: Niklas Söderlund <niklas.soderlund@ragnatech.se>
+L: netdev@vger.kernel.org
+L: linux-renesas-soc@vger.kernel.org
+S: Supported
+F: Documentation/devicetree/bindings/net/renesas,etheravb.yaml
+F: drivers/net/ethernet/renesas/Kconfig
+F: drivers/net/ethernet/renesas/Makefile
+F: drivers/net/ethernet/renesas/ravb*
+
RENESAS ETHERNET SWITCH DRIVER
R: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
L: netdev@vger.kernel.org
@@ -19628,6 +19643,14 @@
F: drivers/i2c/busses/i2c-rcar.c
F: drivers/i2c/busses/i2c-sh_mobile.c
+RENESAS R-CAR SATA DRIVER
+M: Geert Uytterhoeven <geert+renesas@glider.be>
+L: linux-ide@vger.kernel.org
+L: linux-renesas-soc@vger.kernel.org
+S: Supported
+F: Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml
+F: drivers/ata/sata_rcar.c
+
RENESAS R-CAR THERMAL DRIVERS
M: Niklas Söderlund <niklas.soderlund@ragnatech.se>
L: linux-renesas-soc@vger.kernel.org
@@ -19703,6 +19726,17 @@
F: Documentation/devicetree/bindings/i2c/renesas,rzv2m.yaml
F: drivers/i2c/busses/i2c-rzv2m.c
+RENESAS SUPERH ETHERNET DRIVER
+M: Niklas Söderlund <niklas.soderlund@ragnatech.se>
+L: netdev@vger.kernel.org
+L: linux-renesas-soc@vger.kernel.org
+S: Supported
+F: Documentation/devicetree/bindings/net/renesas,ether.yaml
+F: drivers/net/ethernet/renesas/Kconfig
+F: drivers/net/ethernet/renesas/Makefile
+F: drivers/net/ethernet/renesas/sh_eth*
+F: include/linux/sh_eth.h
+
RENESAS USB PHY DRIVER
M: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
L: linux-renesas-soc@vger.kernel.org
@@ -20035,9 +20069,8 @@
F: include/linux/mfd/rohm-shared.h
ROSE NETWORK LAYER
-M: Ralf Baechle <ralf@linux-mips.org>
L: linux-hams@vger.kernel.org
-S: Maintained
+S: Orphan
W: https://linux-ax25.in-berlin.de
F: include/net/rose.h
F: include/uapi/linux/rose.h
@@ -20193,6 +20226,16 @@
S: Supported
F: drivers/s390/cio/
+S390 CRYPTO MODULES, PRNG DRIVER, ARCH RANDOM
+M: Harald Freudenberger <freude@linux.ibm.com>
+M: Holger Dengler <dengler@linux.ibm.com>
+L: linux-crypto@vger.kernel.org
+L: linux-s390@vger.kernel.org
+S: Supported
+F: arch/s390/crypto/
+F: arch/s390/include/asm/archrandom.h
+F: arch/s390/include/asm/cpacf.h
+
S390 DASD DRIVER
M: Stefan Haberland <sth@linux.ibm.com>
M: Jan Hoeppner <hoeppner@linux.ibm.com>
@@ -20202,6 +20245,14 @@
F: drivers/s390/block/dasd*
F: include/linux/dasd_mod.h
+S390 HWRANDOM TRNG DRIVER
+M: Harald Freudenberger <freude@linux.ibm.com>
+M: Holger Dengler <dengler@linux.ibm.com>
+L: linux-crypto@vger.kernel.org
+L: linux-s390@vger.kernel.org
+S: Supported
+F: drivers/char/hw_random/s390-trng.c
+
S390 IOMMU (PCI)
M: Niklas Schnelle <schnelle@linux.ibm.com>
M: Matthew Rosato <mjrosato@linux.ibm.com>
@@ -20283,10 +20334,16 @@
F: drivers/vfio/pci/vfio_pci_zdev.c
F: include/uapi/linux/vfio_zdev.h
-S390 ZCRYPT DRIVER
+S390 ZCRYPT AND PKEY DRIVER AND AP BUS
M: Harald Freudenberger <freude@linux.ibm.com>
+M: Holger Dengler <dengler@linux.ibm.com>
L: linux-s390@vger.kernel.org
S: Supported
+F: arch/s390/include/asm/ap.h
+F: arch/s390/include/asm/pkey.h
+F: arch/s390/include/asm/trace/zcrypt.h
+F: arch/s390/include/uapi/asm/pkey.h
+F: arch/s390/include/uapi/asm/zcrypt.h
F: drivers/s390/crypto/
S390 ZFCP DRIVER
@@ -20773,6 +20830,7 @@
B: mailto:linux-security-module@vger.kernel.org
P: https://github.com/LinuxSecurityModule/kernel/blob/main/README.md
T: git https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/lsm.git
+F: include/linux/lsm/
F: include/linux/lsm_audit.h
F: include/linux/lsm_hook_defs.h
F: include/linux/lsm_hooks.h
@@ -21362,11 +21420,11 @@
SOFTWARE RAID (Multiple Disks) SUPPORT
M: Song Liu <song@kernel.org>
-R: Yu Kuai <yukuai3@huawei.com>
+M: Yu Kuai <yukuai3@huawei.com>
L: linux-raid@vger.kernel.org
S: Supported
Q: https://patchwork.kernel.org/project/linux-raid/list/
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/mdraid/linux.git
F: drivers/md/Kconfig
F: drivers/md/Makefile
F: drivers/md/md*
@@ -23555,10 +23613,9 @@
TURBOCHANNEL SUBSYSTEM
M: "Maciej W. Rozycki" <macro@orcam.me.uk>
-M: Ralf Baechle <ralf@linux-mips.org>
L: linux-mips@vger.kernel.org
S: Maintained
-Q: http://patchwork.linux-mips.org/project/linux-mips/list/
+Q: https://patchwork.kernel.org/project/linux-mips/list/
F: drivers/tc/
F: include/linux/tc.h
diff --git a/Makefile b/Makefile
index 79192a3..68a8faf 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 6
PATCHLEVEL = 12
SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
NAME = Baby Opossum Posse
# *DOCUMENTATION*
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index c0424de..8618502 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -152,7 +152,7 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
long __user *, basep)
{
int error;
- struct fd arg = fdget_pos(fd);
+ CLASS(fd_pos, arg)(fd);
struct osf_dirent_callback buf = {
.ctx.actor = osf_filldir,
.dirent = dirent,
@@ -160,7 +160,7 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
.count = count
};
- if (!fd_file(arg))
+ if (fd_empty(arg))
return -EBADF;
error = iterate_dir(fd_file(arg), &buf.ctx);
@@ -169,7 +169,6 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
if (count != buf.count)
error = count - buf.count;
- fdput_pos(arg);
return error;
}
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 7472066..c59d53d 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -502,3 +502,7 @@
570 common lsm_set_self_attr sys_lsm_set_self_attr
571 common lsm_list_modules sys_lsm_list_modules
572 common mseal sys_mseal
+573 common setxattrat sys_setxattrat
+574 common getxattrat sys_getxattrat
+575 common listxattrat sys_listxattrat
+576 common removexattrat sys_removexattrat
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 749179a..202397b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1598,6 +1598,9 @@
config ARCH_SUPPORTS_CRASH_DUMP
def_bool y
+config ARCH_DEFAULT_CRASH_DUMP
+ def_bool y
+
config AUTO_ZRELADDR
bool "Auto calculation of the decompressed kernel image address" if !ARCH_MULTIPLATFORM
default !(ARCH_FOOTBRIDGE || ARCH_RPC || ARCH_SA1100)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
index 46c02c51..2bbf2df 100644
--- a/arch/arm/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -112,55 +112,120 @@
FOLD_CONST_L .req q10l
FOLD_CONST_H .req q10h
+ /*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ * { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant. The resulting 80-bit vectors are XOR'ed together.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ * 0 w0*x0 ^ | y0*z0 ^
+ * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
+ * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
+ * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
+ * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
+ * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
+ * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
+ * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
+ * 8 w1*x7 << 64 | y1*z7 << 64
+ *
+ * The inputs can be reorganized into
+ *
+ * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and XORed together to produce the
+ * final 80-bit result.
+ */
+ .macro pmull16x64_p8, v16, v64
+ vext.8 q11, \v64, \v64, #1
+ vld1.64 {q12}, [r4, :128]
+ vuzp.8 q11, \v64
+ vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
+ vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
+ bl __pmull16x64_p8
+ veor \v64, q12, q14
+ .endm
+
+__pmull16x64_p8:
+ vmull.p8 q13, d23, d24
+ vmull.p8 q14, d23, d25
+ vmull.p8 q15, d22, d24
+ vmull.p8 q12, d22, d25
+
+ veor q14, q14, q15
+ veor d24, d24, d25
+ veor d26, d26, d27
+ veor d28, d28, d29
+ vmov.i32 d25, #0
+ vmov.i32 d29, #0
+ vext.8 q12, q12, q12, #14
+ vext.8 q14, q14, q14, #15
+ veor d24, d24, d26
+ bx lr
+ENDPROC(__pmull16x64_p8)
+
+ .macro pmull16x64_p64, v16, v64
+ vmull.p64 q11, \v64\()l, \v16\()_L
+ vmull.p64 \v64, \v64\()h, \v16\()_H
+ veor \v64, \v64, q11
+ .endm
+
// Fold reg1, reg2 into the next 32 data bytes, storing the result back
// into reg1, reg2.
- .macro fold_32_bytes, reg1, reg2
- vld1.64 {q11-q12}, [buf]!
+ .macro fold_32_bytes, reg1, reg2, p
+ vld1.64 {q8-q9}, [buf]!
- vmull.p64 q8, \reg1\()h, FOLD_CONST_H
- vmull.p64 \reg1, \reg1\()l, FOLD_CONST_L
- vmull.p64 q9, \reg2\()h, FOLD_CONST_H
- vmull.p64 \reg2, \reg2\()l, FOLD_CONST_L
+ pmull16x64_\p FOLD_CONST, \reg1
+ pmull16x64_\p FOLD_CONST, \reg2
-CPU_LE( vrev64.8 q11, q11 )
-CPU_LE( vrev64.8 q12, q12 )
- vswp q11l, q11h
- vswp q12l, q12h
+CPU_LE( vrev64.8 q8, q8 )
+CPU_LE( vrev64.8 q9, q9 )
+ vswp q8l, q8h
+ vswp q9l, q9h
veor.8 \reg1, \reg1, q8
veor.8 \reg2, \reg2, q9
- veor.8 \reg1, \reg1, q11
- veor.8 \reg2, \reg2, q12
.endm
// Fold src_reg into dst_reg, optionally loading the next fold constants
- .macro fold_16_bytes, src_reg, dst_reg, load_next_consts
- vmull.p64 q8, \src_reg\()l, FOLD_CONST_L
- vmull.p64 \src_reg, \src_reg\()h, FOLD_CONST_H
+ .macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts
+ pmull16x64_\p FOLD_CONST, \src_reg
.ifnb \load_next_consts
vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
.endif
- veor.8 \dst_reg, \dst_reg, q8
veor.8 \dst_reg, \dst_reg, \src_reg
.endm
- .macro __adrl, out, sym
- movw \out, #:lower16:\sym
- movt \out, #:upper16:\sym
- .endm
-
-//
-// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-ENTRY(crc_t10dif_pmull)
-
+ .macro crct10dif, p
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
cmp len, #256
- blt .Lless_than_256_bytes
+ blt .Lless_than_256_bytes\@
- __adrl fold_consts_ptr, .Lfold_across_128_bytes_consts
+ mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts
// Load the first 128 data bytes. Byte swapping is necessary to make
// the bit order match the polynomial coefficient order.
@@ -199,27 +264,27 @@
// While >= 128 data bytes remain (not counting q0-q7), fold the 128
// bytes q0-q7 into them, storing the result back into q0-q7.
-.Lfold_128_bytes_loop:
- fold_32_bytes q0, q1
- fold_32_bytes q2, q3
- fold_32_bytes q4, q5
- fold_32_bytes q6, q7
+.Lfold_128_bytes_loop\@:
+ fold_32_bytes q0, q1, \p
+ fold_32_bytes q2, q3, \p
+ fold_32_bytes q4, q5, \p
+ fold_32_bytes q6, q7, \p
subs len, len, #128
- bge .Lfold_128_bytes_loop
+ bge .Lfold_128_bytes_loop\@
// Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
// Fold across 64 bytes.
vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
- fold_16_bytes q0, q4
- fold_16_bytes q1, q5
- fold_16_bytes q2, q6
- fold_16_bytes q3, q7, 1
+ fold_16_bytes q0, q4, \p
+ fold_16_bytes q1, q5, \p
+ fold_16_bytes q2, q6, \p
+ fold_16_bytes q3, q7, \p, 1
// Fold across 32 bytes.
- fold_16_bytes q4, q6
- fold_16_bytes q5, q7, 1
+ fold_16_bytes q4, q6, \p
+ fold_16_bytes q5, q7, \p, 1
// Fold across 16 bytes.
- fold_16_bytes q6, q7
+ fold_16_bytes q6, q7, \p
// Add 128 to get the correct number of data bytes remaining in 0...127
// (not counting q7), following the previous extra subtraction by 128.
@@ -229,25 +294,23 @@
// While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
// into them, storing the result back into q7.
- blt .Lfold_16_bytes_loop_done
-.Lfold_16_bytes_loop:
- vmull.p64 q8, q7l, FOLD_CONST_L
- vmull.p64 q7, q7h, FOLD_CONST_H
- veor.8 q7, q7, q8
+ blt .Lfold_16_bytes_loop_done\@
+.Lfold_16_bytes_loop\@:
+ pmull16x64_\p FOLD_CONST, q7
vld1.64 {q0}, [buf]!
CPU_LE( vrev64.8 q0, q0 )
vswp q0l, q0h
veor.8 q7, q7, q0
subs len, len, #16
- bge .Lfold_16_bytes_loop
+ bge .Lfold_16_bytes_loop\@
-.Lfold_16_bytes_loop_done:
+.Lfold_16_bytes_loop_done\@:
// Add 16 to get the correct number of data bytes remaining in 0...15
// (not counting q7), following the previous extra subtraction by 16.
adds len, len, #16
- beq .Lreduce_final_16_bytes
+ beq .Lreduce_final_16_bytes\@
-.Lhandle_partial_segment:
+.Lhandle_partial_segment\@:
// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
// 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
// do this without needing a fold constant for each possible 'len',
@@ -262,9 +325,9 @@
vswp q0l, q0h
// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
- __adrl r3, .Lbyteshift_table + 16
- sub r3, r3, len
- vld1.8 {q2}, [r3]
+ mov_l r1, .Lbyteshift_table + 16
+ sub r1, r1, len
+ vld1.8 {q2}, [r1]
vtbl.8 q1l, {q7l-q7h}, q2l
vtbl.8 q1h, {q7l-q7h}, q2h
@@ -282,12 +345,46 @@
vbsl.8 q2, q1, q0
// Fold the first chunk into the second chunk, storing the result in q7.
- vmull.p64 q0, q3l, FOLD_CONST_L
- vmull.p64 q7, q3h, FOLD_CONST_H
- veor.8 q7, q7, q0
- veor.8 q7, q7, q2
+ pmull16x64_\p FOLD_CONST, q3
+ veor.8 q7, q3, q2
+ b .Lreduce_final_16_bytes\@
-.Lreduce_final_16_bytes:
+.Lless_than_256_bytes\@:
+ // Checksumming a buffer of length 16...255 bytes
+
+ mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+ // Load the first 16 data bytes.
+ vld1.64 {q7}, [buf]!
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
+
+ // XOR the first 16 data *bits* with the initial CRC value.
+ vmov.i8 q0h, #0
+ vmov.u16 q0h[3], init_crc
+ veor.8 q7h, q7h, q0h
+
+ // Load the fold-across-16-bytes constants.
+ vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+ cmp len, #16
+ beq .Lreduce_final_16_bytes\@ // len == 16
+ subs len, len, #32
+ addlt len, len, #16
+ blt .Lhandle_partial_segment\@ // 17 <= len <= 31
+ b .Lfold_16_bytes_loop\@ // 32 <= len <= 255
+
+.Lreduce_final_16_bytes\@:
+ .endm
+
+//
+// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+ENTRY(crc_t10dif_pmull64)
+ crct10dif p64
+
// Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
@@ -320,32 +417,19 @@
vmov.u16 r0, q0l[0]
bx lr
+ENDPROC(crc_t10dif_pmull64)
-.Lless_than_256_bytes:
- // Checksumming a buffer of length 16...255 bytes
+ENTRY(crc_t10dif_pmull8)
+ push {r4, lr}
+ mov_l r4, .L16x64perm
- __adrl fold_consts_ptr, .Lfold_across_16_bytes_consts
+ crct10dif p8
- // Load the first 16 data bytes.
- vld1.64 {q7}, [buf]!
CPU_LE( vrev64.8 q7, q7 )
vswp q7l, q7h
-
- // XOR the first 16 data *bits* with the initial CRC value.
- vmov.i8 q0h, #0
- vmov.u16 q0h[3], init_crc
- veor.8 q7h, q7h, q0h
-
- // Load the fold-across-16-bytes constants.
- vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
- cmp len, #16
- beq .Lreduce_final_16_bytes // len == 16
- subs len, len, #32
- addlt len, len, #16
- blt .Lhandle_partial_segment // 17 <= len <= 31
- b .Lfold_16_bytes_loop // 32 <= len <= 255
-ENDPROC(crc_t10dif_pmull)
+ vst1.64 {q7}, [r3, :128]
+ pop {r4, pc}
+ENDPROC(crc_t10dif_pmull8)
.section ".rodata", "a"
.align 4
@@ -379,3 +463,6 @@
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
+
+.L16x64perm:
+ .quad 0x808080800000000, 0x909090901010101
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
index 79f3b20..a8b7452 100644
--- a/arch/arm/crypto/crct10dif-ce-glue.c
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -19,7 +19,9 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
+ u8 out[16]);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -29,14 +31,14 @@ static int crct10dif_init(struct shash_desc *desc)
return 0;
}
-static int crct10dif_update(struct shash_desc *desc, const u8 *data,
- unsigned int length)
+static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
{
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
kernel_neon_begin();
- *crc = crc_t10dif_pmull(*crc, data, length);
+ *crc = crc_t10dif_pmull64(*crc, data, length);
kernel_neon_end();
} else {
*crc = crc_t10dif_generic(*crc, data, length);
@@ -45,6 +47,27 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data,
return 0;
}
+static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ u16 *crcp = shash_desc_ctx(desc);
+ u8 buf[16] __aligned(16);
+ u16 crc = *crcp;
+
+ if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc_t10dif_pmull8(crc, data, length, buf);
+ kernel_neon_end();
+
+ crc = 0;
+ data = buf;
+ length = sizeof(buf);
+ }
+
+ *crcp = crc_t10dif_generic(crc, data, length);
+ return 0;
+}
+
static int crct10dif_final(struct shash_desc *desc, u8 *out)
{
u16 *crc = shash_desc_ctx(desc);
@@ -53,10 +76,22 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
return 0;
}
-static struct shash_alg crc_t10dif_alg = {
+static struct shash_alg algs[] = {{
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
- .update = crct10dif_update,
+ .update = crct10dif_update_neon,
+ .final = crct10dif_final,
+ .descsize = CRC_T10DIF_DIGEST_SIZE,
+
+ .base.cra_name = "crct10dif",
+ .base.cra_driver_name = "crct10dif-arm-neon",
+ .base.cra_priority = 150,
+ .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+}, {
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = crct10dif_init,
+ .update = crct10dif_update_ce,
.final = crct10dif_final,
.descsize = CRC_T10DIF_DIGEST_SIZE,
@@ -65,19 +100,19 @@ static struct shash_alg crc_t10dif_alg = {
.base.cra_priority = 200,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
-};
+}};
static int __init crc_t10dif_mod_init(void)
{
- if (!(elf_hwcap2 & HWCAP2_PMULL))
+ if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
- return crypto_register_shash(&crc_t10dif_alg);
+ return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
static void __exit crc_t10dif_mod_exit(void)
{
- crypto_unregister_shash(&crc_t10dif_alg);
+ crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
module_init(crc_t10dif_mod_init);
diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h
index f63ba89..2ec0e5e 100644
--- a/arch/arm/include/asm/arm_pmuv3.h
+++ b/arch/arm/include/asm/arm_pmuv3.h
@@ -212,6 +212,8 @@ static inline void write_pmuserenr(u32 val)
write_sysreg(val, PMUSERENR);
}
+static inline void write_pmuacr(u64 val) {}
+
static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {}
static inline void kvm_clr_pmu_events(u32 clr) {}
static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr)
@@ -231,6 +233,7 @@ static inline void kvm_vcpu_pmu_resync_el0(void) {}
#define ARMV8_PMU_DFR_VER_V3P1 0x4
#define ARMV8_PMU_DFR_VER_V3P4 0x5
#define ARMV8_PMU_DFR_VER_V3P5 0x6
+#define ARMV8_PMU_DFR_VER_V3P9 0x9
#define ARMV8_PMU_DFR_VER_IMP_DEF 0xF
static inline bool pmuv3_implemented(int pmuver)
@@ -249,6 +252,11 @@ static inline bool is_pmuv3p5(int pmuver)
return pmuver >= ARMV8_PMU_DFR_VER_V3P5;
}
+static inline bool is_pmuv3p9(int pmuver)
+{
+ return pmuver >= ARMV8_PMU_DFR_VER_V3P9;
+}
+
static inline u64 read_pmceid0(void)
{
u64 val = read_sysreg(PMCEID0);
diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
index 1ec35f0..f22c50d 100644
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -252,18 +252,23 @@
*/
add r0, r4, #KERNEL_OFFSET >> (SECTION_SHIFT - PMD_ENTRY_ORDER)
ldr r6, =(_end - 1)
+
+ /* For XIP, kernel_sec_start/kernel_sec_end are currently in RO memory */
+#ifndef CONFIG_XIP_KERNEL
adr_l r5, kernel_sec_start @ _pa(kernel_sec_start)
#if defined CONFIG_CPU_ENDIAN_BE8 || defined CONFIG_CPU_ENDIAN_BE32
str r8, [r5, #4] @ Save physical start of kernel (BE)
#else
str r8, [r5] @ Save physical start of kernel (LE)
#endif
+#endif
orr r3, r8, r7 @ Add the MMU flags
add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ENTRY_ORDER)
1: str r3, [r0], #1 << PMD_ENTRY_ORDER
add r3, r3, #1 << SECTION_SHIFT
cmp r0, r6
bls 1b
+#ifndef CONFIG_XIP_KERNEL
eor r3, r3, r7 @ Remove the MMU flags
adr_l r5, kernel_sec_end @ _pa(kernel_sec_end)
#if defined CONFIG_CPU_ENDIAN_BE8 || defined CONFIG_CPU_ENDIAN_BE32
@@ -271,8 +276,7 @@
#else
str r3, [r5] @ Save physical end of kernel (LE)
#endif
-
-#ifdef CONFIG_XIP_KERNEL
+#else
/*
* Map the kernel image separately as it is not located in RAM.
*/
@@ -407,7 +411,11 @@
/*
* Use the page tables supplied from __cpu_up.
*/
+#ifdef CONFIG_XIP_KERNEL
+ ldr r3, =(secondary_data + PLAT_PHYS_OFFSET - PAGE_OFFSET)
+#else
adr_l r3, secondary_data
+#endif
mov_l r12, __secondary_switched
ldrd r4, r5, [r3, #0] @ get secondary_data.pgdir
ARM_BE8(eor r4, r4, r5) @ Swap r5 and r4 in BE:
diff --git a/arch/arm/kernel/psci_smp.c b/arch/arm/kernel/psci_smp.c
index d4392e17..3bb0c4d 100644
--- a/arch/arm/kernel/psci_smp.c
+++ b/arch/arm/kernel/psci_smp.c
@@ -45,8 +45,15 @@ extern void secondary_startup(void);
static int psci_boot_secondary(unsigned int cpu, struct task_struct *idle)
{
if (psci_ops.cpu_on)
+#ifdef CONFIG_XIP_KERNEL
+ return psci_ops.cpu_on(cpu_logical_map(cpu),
+ ((phys_addr_t)(&secondary_startup)
+ - XIP_VIRT_ADDR(CONFIG_XIP_PHYS_ADDR)
+ + CONFIG_XIP_PHYS_ADDR));
+#else
return psci_ops.cpu_on(cpu_logical_map(cpu),
virt_to_idmap(&secondary_startup));
+#endif
return -ENODEV;
}
diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c
index f5781ff..2944721 100644
--- a/arch/arm/kernel/sys_oabi-compat.c
+++ b/arch/arm/kernel/sys_oabi-compat.c
@@ -235,12 +235,12 @@ asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd,
unsigned long arg)
{
void __user *argp = (void __user *)arg;
- struct fd f = fdget_raw(fd);
+ CLASS(fd_raw, f)(fd);
struct flock64 flock;
- long err = -EBADF;
+ long err;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
switch (cmd) {
case F_GETLK64:
@@ -271,8 +271,6 @@ asmlinkage long sys_oabi_fcntl64(unsigned int fd, unsigned int cmd,
err = sys_fcntl64(fd, cmd, arg);
break;
}
- fdput(f);
-out:
return err;
}
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 480e307..6ea6459 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -570,6 +570,7 @@ static int bad_syscall(int n, struct pt_regs *regs)
static inline int
__do_cache_op(unsigned long start, unsigned long end)
{
+ unsigned int ua_flags;
int ret;
do {
@@ -578,7 +579,9 @@ __do_cache_op(unsigned long start, unsigned long end)
if (fatal_signal_pending(current))
return 0;
+ ua_flags = uaccess_save_and_enable();
ret = flush_icache_user_range(start, start + chunk);
+ uaccess_restore(ua_flags);
if (ret)
return ret;
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index 97db539..fecac10 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -39,7 +39,7 @@ void arch_setup_dma_ops(struct device *dev, bool coherent)
/*
* Cache support for v7m is optional, so can be treated as
* coherent if no cache has been detected. Note that it is not
- * enough to check if MPU is in use or not since in absense of
+ * enough to check if MPU is in use or not since in absence of
* MPU system memory map is used.
*/
dev->dma_coherent = cacheid ? coherent : true;
diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c
index 448e57c..4a833e8 100644
--- a/arch/arm/mm/idmap.c
+++ b/arch/arm/mm/idmap.c
@@ -84,8 +84,15 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start,
unsigned long addr, end;
unsigned long next;
+#ifdef CONFIG_XIP_KERNEL
+ addr = (phys_addr_t)(text_start) - XIP_VIRT_ADDR(CONFIG_XIP_PHYS_ADDR)
+ + CONFIG_XIP_PHYS_ADDR;
+ end = (phys_addr_t)(text_end) - XIP_VIRT_ADDR(CONFIG_XIP_PHYS_ADDR)
+ + CONFIG_XIP_PHYS_ADDR;
+#else
addr = virt_to_idmap(text_start);
end = virt_to_idmap(text_end);
+#endif
pr_info("Setting up static identity map for 0x%lx - 0x%lx\n", addr, end);
prot |= PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_AF;
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index f85c177..f5b7a16 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1403,18 +1403,6 @@ static void __init devicemaps_init(const struct machine_desc *mdesc)
}
/*
- * Map the kernel if it is XIP.
- * It is always first in the modulearea.
- */
-#ifdef CONFIG_XIP_KERNEL
- map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK);
- map.virtual = MODULES_VADDR;
- map.length = ((unsigned long)_exiprom - map.virtual + ~SECTION_MASK) & SECTION_MASK;
- map.type = MT_ROM;
- create_mapping(&map);
-#endif
-
- /*
* Map the cache flushing regions.
*/
#ifdef FLUSH_BASE
@@ -1603,12 +1591,27 @@ static void __init map_kernel(void)
* This will only persist until we turn on proper memory management later on
* and we remap the whole kernel with page granularity.
*/
+#ifdef CONFIG_XIP_KERNEL
+ phys_addr_t kernel_nx_start = kernel_sec_start;
+#else
phys_addr_t kernel_x_start = kernel_sec_start;
phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE);
phys_addr_t kernel_nx_start = kernel_x_end;
+#endif
phys_addr_t kernel_nx_end = kernel_sec_end;
struct map_desc map;
+ /*
+ * Map the kernel if it is XIP.
+ * It is always first in the modulearea.
+ */
+#ifdef CONFIG_XIP_KERNEL
+ map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK);
+ map.virtual = MODULES_VADDR;
+ map.length = ((unsigned long)_exiprom - map.virtual + ~SECTION_MASK) & SECTION_MASK;
+ map.type = MT_ROM;
+ create_mapping(&map);
+#else
map.pfn = __phys_to_pfn(kernel_x_start);
map.virtual = __phys_to_virt(kernel_x_start);
map.length = kernel_x_end - kernel_x_start;
@@ -1618,7 +1621,7 @@ static void __init map_kernel(void)
/* If the nx part is small it may end up covered by the tail of the RWX section */
if (kernel_x_end == kernel_nx_end)
return;
-
+#endif
map.pfn = __phys_to_pfn(kernel_nx_start);
map.virtual = __phys_to_virt(kernel_nx_start);
map.length = kernel_nx_end - kernel_nx_start;
@@ -1764,6 +1767,11 @@ void __init paging_init(const struct machine_desc *mdesc)
{
void *zero_page;
+#ifdef CONFIG_XIP_KERNEL
+ /* Store the kernel RW RAM region start/end in these variables */
+ kernel_sec_start = CONFIG_PHYS_OFFSET & SECTION_MASK;
+ kernel_sec_end = round_up(__pa(_end), SECTION_SIZE);
+#endif
pr_debug("physical kernel sections: 0x%08llx-0x%08llx\n",
kernel_sec_start, kernel_sec_end);
diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
index 5fb9a6a..2cd9333 100644
--- a/arch/arm/mm/proc-v7.S
+++ b/arch/arm/mm/proc-v7.S
@@ -94,7 +94,7 @@
ret lr
SYM_FUNC_END(cpu_v7_dcache_clean_area)
-#ifdef CONFIG_ARM_PSCI
+#if defined(CONFIG_ARM_PSCI) && defined(CONFIG_HARDEN_BRANCH_PREDICTOR)
.arch_extension sec
SYM_TYPED_FUNC_START(cpu_v7_smc_switch_mm)
stmfd sp!, {r0 - r3}
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 23c9820..49eeb2a 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -477,3 +477,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 70d7f4f..d743737 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -21,6 +21,7 @@
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_CACHE_LINE_SIZE
+ select ARCH_HAS_CC_PLATFORM
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
@@ -38,12 +39,15 @@
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT
select ARCH_HAS_PTE_DEVMAP
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_HW_PTE_YOUNG
select ARCH_HAS_SETUP_DMA_OPS
select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_SET_MEMORY
+ select ARCH_HAS_MEM_ENCRYPT
+ select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select ARCH_STACKWALK
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
@@ -1576,6 +1580,9 @@
config ARCH_SUPPORTS_CRASH_DUMP
def_bool y
+config ARCH_DEFAULT_CRASH_DUMP
+ def_bool y
+
config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
def_bool CRASH_RESERVE
@@ -2156,6 +2163,9 @@
if the cpu does not implement the feature.
endmenu # "ARMv8.7 architectural features"
+config AS_HAS_MOPS
+ def_bool $(as-instr,.arch_extension mops)
+
menu "ARMv8.9 architectural features"
config ARM64_POE
@@ -2177,8 +2187,44 @@
int
default 3
+config ARM64_HAFT
+ bool "Support for Hardware managed Access Flag for Table Descriptors"
+ depends on ARM64_HW_AFDBM
+ default y
+ help
+ The ARMv8.9/ARMv9.5 introduces the feature Hardware managed Access
+ Flag for Table descriptors. When enabled an architectural executed
+ memory access will update the Access Flag in each Table descriptor
+ which is accessed during the translation table walk and for which
+ the Access Flag is 0. The Access Flag of the Table descriptor use
+ the same bit of PTE_AF.
+
+ The feature will only be enabled if all the CPUs in the system
+ support this feature. If unsure, say Y.
+
endmenu # "ARMv8.9 architectural features"
+menu "v9.4 architectural features"
+
+config ARM64_GCS
+ bool "Enable support for Guarded Control Stack (GCS)"
+ default y
+ select ARCH_HAS_USER_SHADOW_STACK
+ select ARCH_USES_HIGH_VMA_FLAGS
+ depends on !UPROBES
+ help
+ Guarded Control Stack (GCS) provides support for a separate
+ stack with restricted access which contains only return
+ addresses. This can be used to harden against some attacks
+ by comparing return address used by the program with what is
+ stored in the GCS, and may also be used to efficiently obtain
+ the call stack for applications such as profiling.
+
+ The feature is detected at runtime, and will remain disabled
+ if the system does not implement the feature.
+
+endmenu # "v9.4 architectural features"
+
config ARM64_SVE
bool "ARM Scalable Vector Extension support"
default y
diff --git a/arch/arm64/crypto/crct10dif-ce-core.S b/arch/arm64/crypto/crct10dif-ce-core.S
index 5604de6..87dd6d4 100644
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -1,8 +1,11 @@
//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
-// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
-// Copyright (C) 2019 Google LLC <ebiggers@google.com>
+// Copyright (C) 2016 Linaro Ltd
+// Copyright (C) 2019-2024 Google LLC
+//
+// Authors: Ard Biesheuvel <ardb@google.com>
+// Eric Biggers <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
@@ -71,161 +74,117 @@
init_crc .req w0
buf .req x1
len .req x2
- fold_consts_ptr .req x3
+ fold_consts_ptr .req x5
fold_consts .req v10
- ad .req v14
-
- k00_16 .req v15
- k32_48 .req v16
-
t3 .req v17
t4 .req v18
t5 .req v19
t6 .req v20
t7 .req v21
t8 .req v22
- t9 .req v23
- perm1 .req v24
- perm2 .req v25
- perm3 .req v26
- perm4 .req v27
+ perm .req v27
- bd1 .req v28
- bd2 .req v29
- bd3 .req v30
- bd4 .req v31
-
- .macro __pmull_init_p64
+ .macro pmull16x64_p64, a16, b64, c64
+ pmull2 \c64\().1q, \a16\().2d, \b64\().2d
+ pmull \b64\().1q, \a16\().1d, \b64\().1d
.endm
- .macro __pmull_pre_p64, bd
+ /*
+ * Pairwise long polynomial multiplication of two 16-bit values
+ *
+ * { w0, w1 }, { y0, y1 }
+ *
+ * by two 64-bit values
+ *
+ * { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
+ *
+ * where each vector element is a byte, ordered from least to most
+ * significant.
+ *
+ * This can be implemented using 8x8 long polynomial multiplication, by
+ * reorganizing the input so that each pairwise 8x8 multiplication
+ * produces one of the terms from the decomposition below, and
+ * combining the results of each rank and shifting them into place.
+ *
+ * Rank
+ * 0 w0*x0 ^ | y0*z0 ^
+ * 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
+ * 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
+ * 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
+ * 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
+ * 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
+ * 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
+ * 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
+ * 8 w1*x7 << 64 | y1*z7 << 64
+ *
+ * The inputs can be reorganized into
+ *
+ * { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
+ * { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
+ *
+ * and after performing 8x8->16 bit long polynomial multiplication of
+ * each of the halves of the first vector with those of the second one,
+ * we obtain the following four vectors of 16-bit elements:
+ *
+ * a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
+ * b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
+ * c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
+ * d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
+ *
+ * Results b and c can be XORed together, as the vector elements have
+ * matching ranks. Then, the final XOR (*) can be pulled forward, and
+ * applied between the halves of each of the remaining three vectors,
+ * which are then shifted into place, and combined to produce two
+ * 80-bit results.
+ *
+ * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
+ * to the 64x64 bit one above, but XOR'ing the outputs together will
+ * produce the expected result, and this is sufficient in the context of
+ * this algorithm.
+ */
+ .macro pmull16x64_p8, a16, b64, c64
+ ext t7.16b, \b64\().16b, \b64\().16b, #1
+ tbl t5.16b, {\a16\().16b}, perm.16b
+ uzp1 t7.16b, \b64\().16b, t7.16b
+ bl __pmull_p8_16x64
+ ext \b64\().16b, t4.16b, t4.16b, #15
+ eor \c64\().16b, t8.16b, t5.16b
.endm
- .macro __pmull_init_p8
- // k00_16 := 0x0000000000000000_000000000000ffff
- // k32_48 := 0x00000000ffffffff_0000ffffffffffff
- movi k32_48.2d, #0xffffffff
- mov k32_48.h[2], k32_48.h[0]
- ushr k00_16.2d, k32_48.2d, #32
+SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
+ ext t6.16b, t5.16b, t5.16b, #8
- // prepare the permutation vectors
- mov_q x5, 0x080f0e0d0c0b0a09
- movi perm4.8b, #8
- dup perm1.2d, x5
- eor perm1.16b, perm1.16b, perm4.16b
- ushr perm2.2d, perm1.2d, #8
- ushr perm3.2d, perm1.2d, #16
- ushr perm4.2d, perm1.2d, #24
- sli perm2.2d, perm1.2d, #56
- sli perm3.2d, perm1.2d, #48
- sli perm4.2d, perm1.2d, #40
- .endm
+ pmull t3.8h, t7.8b, t5.8b
+ pmull t4.8h, t7.8b, t6.8b
+ pmull2 t5.8h, t7.16b, t5.16b
+ pmull2 t6.8h, t7.16b, t6.16b
- .macro __pmull_pre_p8, bd
- tbl bd1.16b, {\bd\().16b}, perm1.16b
- tbl bd2.16b, {\bd\().16b}, perm2.16b
- tbl bd3.16b, {\bd\().16b}, perm3.16b
- tbl bd4.16b, {\bd\().16b}, perm4.16b
- .endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_core)
-.L__pmull_p8_core:
- ext t4.8b, ad.8b, ad.8b, #1 // A1
- ext t5.8b, ad.8b, ad.8b, #2 // A2
- ext t6.8b, ad.8b, ad.8b, #3 // A3
-
- pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B
- pmull t8.8h, ad.8b, bd1.8b // E = A*B1
- pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B
- pmull t7.8h, ad.8b, bd2.8b // G = A*B2
- pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B
- pmull t9.8h, ad.8b, bd3.8b // I = A*B3
- pmull t3.8h, ad.8b, bd4.8b // K = A*B4
- b 0f
-
-.L__pmull_p8_core2:
- tbl t4.16b, {ad.16b}, perm1.16b // A1
- tbl t5.16b, {ad.16b}, perm2.16b // A2
- tbl t6.16b, {ad.16b}, perm3.16b // A3
-
- pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B
- pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
- pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B
- pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
- pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B
- pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
- pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
-
-0: eor t4.16b, t4.16b, t8.16b // L = E + F
- eor t5.16b, t5.16b, t7.16b // M = G + H
- eor t6.16b, t6.16b, t9.16b // N = I + J
-
- uzp1 t8.2d, t4.2d, t5.2d
- uzp2 t4.2d, t4.2d, t5.2d
- uzp1 t7.2d, t6.2d, t3.2d
- uzp2 t6.2d, t6.2d, t3.2d
-
- // t4 = (L) (P0 + P1) << 8
- // t5 = (M) (P2 + P3) << 16
- eor t8.16b, t8.16b, t4.16b
- and t4.16b, t4.16b, k32_48.16b
-
- // t6 = (N) (P4 + P5) << 24
- // t7 = (K) (P6 + P7) << 32
- eor t7.16b, t7.16b, t6.16b
- and t6.16b, t6.16b, k00_16.16b
-
- eor t8.16b, t8.16b, t4.16b
- eor t7.16b, t7.16b, t6.16b
-
- zip2 t5.2d, t8.2d, t4.2d
- zip1 t4.2d, t8.2d, t4.2d
- zip2 t3.2d, t7.2d, t6.2d
- zip1 t6.2d, t7.2d, t6.2d
-
- ext t4.16b, t4.16b, t4.16b, #15
+ ext t8.16b, t3.16b, t3.16b, #8
+ eor t4.16b, t4.16b, t6.16b
+ ext t7.16b, t5.16b, t5.16b, #8
+ ext t6.16b, t4.16b, t4.16b, #8
+ eor t8.8b, t8.8b, t3.8b
+ eor t5.8b, t5.8b, t7.8b
+ eor t4.8b, t4.8b, t6.8b
ext t5.16b, t5.16b, t5.16b, #14
- ext t6.16b, t6.16b, t6.16b, #13
- ext t3.16b, t3.16b, t3.16b, #12
-
- eor t4.16b, t4.16b, t5.16b
- eor t6.16b, t6.16b, t3.16b
ret
-SYM_FUNC_END(__pmull_p8_core)
+SYM_FUNC_END(__pmull_p8_16x64)
- .macro __pmull_p8, rq, ad, bd, i
- .ifnc \bd, fold_consts
- .err
- .endif
- mov ad.16b, \ad\().16b
- .ifb \i
- pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
- .else
- pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
- .endif
-
- bl .L__pmull_p8_core\i
-
- eor \rq\().16b, \rq\().16b, t4.16b
- eor \rq\().16b, \rq\().16b, t6.16b
- .endm
// Fold reg1, reg2 into the next 32 data bytes, storing the result back
// into reg1, reg2.
.macro fold_32_bytes, p, reg1, reg2
ldp q11, q12, [buf], #0x20
- __pmull_\p v8, \reg1, fold_consts, 2
- __pmull_\p \reg1, \reg1, fold_consts
+ pmull16x64_\p fold_consts, \reg1, v8
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
- __pmull_\p v9, \reg2, fold_consts, 2
- __pmull_\p \reg2, \reg2, fold_consts
+ pmull16x64_\p fold_consts, \reg2, v9
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
@@ -238,26 +197,15 @@
// Fold src_reg into dst_reg, optionally loading the next fold constants
.macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
- __pmull_\p v8, \src_reg, fold_consts
- __pmull_\p \src_reg, \src_reg, fold_consts, 2
+ pmull16x64_\p fold_consts, \src_reg, v8
.ifnb \load_next_consts
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
.endif
eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
.endm
- .macro __pmull_p64, rd, rn, rm, n
- .ifb \n
- pmull \rd\().1q, \rn\().1d, \rm\().1d
- .else
- pmull2 \rd\().1q, \rn\().2d, \rm\().2d
- .endif
- .endm
-
.macro crc_t10dif_pmull, p
- __pmull_init_\p
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
cmp len, #256
@@ -296,7 +244,6 @@
// Load the constants for folding across 128 bytes.
ld1 {fold_consts.2d}, [fold_consts_ptr]
- __pmull_pre_\p fold_consts
// Subtract 128 for the 128 data bytes just consumed. Subtract another
// 128 to simplify the termination condition of the following loop.
@@ -318,7 +265,6 @@
// Fold across 64 bytes.
add fold_consts_ptr, fold_consts_ptr, #16
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
fold_16_bytes \p, v0, v4
fold_16_bytes \p, v1, v5
fold_16_bytes \p, v2, v6
@@ -339,8 +285,7 @@
// into them, storing the result back into v7.
b.lt .Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
- __pmull_\p v8, v7, fold_consts
- __pmull_\p v7, v7, fold_consts, 2
+ pmull16x64_\p fold_consts, v7, v8
eor v7.16b, v7.16b, v8.16b
ldr q0, [buf], #16
CPU_LE( rev64 v0.16b, v0.16b )
@@ -387,51 +332,10 @@
bsl v2.16b, v1.16b, v0.16b
// Fold the first chunk into the second chunk, storing the result in v7.
- __pmull_\p v0, v3, fold_consts
- __pmull_\p v7, v3, fold_consts, 2
- eor v7.16b, v7.16b, v0.16b
+ pmull16x64_\p fold_consts, v3, v0
+ eor v7.16b, v3.16b, v0.16b
eor v7.16b, v7.16b, v2.16b
-
-.Lreduce_final_16_bytes_\@:
- // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
- movi v2.16b, #0 // init zero register
-
- // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
- ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
-
- // Fold the high 64 bits into the low 64 bits, while also multiplying by
- // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
- // whose low 48 bits are 0.
- ext v0.16b, v2.16b, v7.16b, #8
- __pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x))
- eor v0.16b, v0.16b, v7.16b // + low bits * x^64
-
- // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
- // value congruent to x^64 * M(x) and whose low 48 bits are 0.
- ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
- mov v0.s[3], v2.s[0] // zero high 32 bits
- __pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
- eor v0.16b, v0.16b, v1.16b // + low bits
-
- // Load G(x) and floor(x^48 / G(x)).
- ld1 {fold_consts.2d}, [fold_consts_ptr]
- __pmull_pre_\p fold_consts
-
- // Use Barrett reduction to compute the final CRC value.
- __pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x))
- ushr v1.2d, v1.2d, #32 // /= x^32
- __pmull_\p v1, v1, fold_consts // *= G(x)
- ushr v0.2d, v0.2d, #48
- eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
- // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-
- umov w0, v0.h[0]
- .ifc \p, p8
- frame_pop
- .endif
- ret
+ b .Lreduce_final_16_bytes_\@
.Lless_than_256_bytes_\@:
// Checksumming a buffer of length 16...255 bytes
@@ -450,7 +354,6 @@
// Load the fold-across-16-bytes constants.
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
- __pmull_pre_\p fold_consts
cmp len, #16
b.eq .Lreduce_final_16_bytes_\@ // len == 16
@@ -458,6 +361,8 @@
b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
add len, len, #16
b .Lhandle_partial_segment_\@ // 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
.endm
//
@@ -467,7 +372,22 @@
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
frame_push 1
+
+ // Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+ movi perm.4h, #8, lsl #8
+ orr perm.2s, #1, lsl #16
+ orr perm.2s, #1, lsl #24
+ zip1 perm.16b, perm.16b, perm.16b
+ zip1 perm.16b, perm.16b, perm.16b
+
crc_t10dif_pmull p8
+
+CPU_LE( rev64 v7.16b, v7.16b )
+CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
+ str q7, [x3]
+
+ frame_pop
+ ret
SYM_FUNC_END(crc_t10dif_pmull_p8)
.align 5
@@ -478,6 +398,41 @@
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
crc_t10dif_pmull p64
+
+ // Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+ movi v2.16b, #0 // init zero register
+
+ // Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ ld1 {fold_consts.2d}, [fold_consts_ptr], #16
+
+ // Fold the high 64 bits into the low 64 bits, while also multiplying by
+ // x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ // whose low 48 bits are 0.
+ ext v0.16b, v2.16b, v7.16b, #8
+ pmull2 v7.1q, v7.2d, fold_consts.2d // high bits * x^48 * (x^80 mod G(x))
+ eor v0.16b, v0.16b, v7.16b // + low bits * x^64
+
+ // Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ // value congruent to x^64 * M(x) and whose low 48 bits are 0.
+ ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
+ mov v0.s[3], v2.s[0] // zero high 32 bits
+ pmull v1.1q, v1.1d, fold_consts.1d // high 32 bits * x^48 * (x^48 mod G(x))
+ eor v0.16b, v0.16b, v1.16b // + low bits
+
+ // Load G(x) and floor(x^48 / G(x)).
+ ld1 {fold_consts.2d}, [fold_consts_ptr]
+
+ // Use Barrett reduction to compute the final CRC value.
+ pmull2 v1.1q, v0.2d, fold_consts.2d // high 32 bits * floor(x^48 / G(x))
+ ushr v1.2d, v1.2d, #32 // /= x^32
+ pmull v1.1q, v1.1d, fold_consts.1d // *= G(x)
+ ushr v0.2d, v0.2d, #48
+ eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
+ // Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+
+ umov w0, v0.h[0]
+ ret
SYM_FUNC_END(crc_t10dif_pmull_p64)
.section ".rodata", "a"
diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c
index 606d25c..08bcbd8 100644
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -20,7 +20,8 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
-asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+ u8 out[16]);
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
static int crct10dif_init(struct shash_desc *desc)
@@ -34,25 +35,21 @@ static int crct10dif_init(struct shash_desc *desc)
static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
- u16 *crc = shash_desc_ctx(desc);
+ u16 *crcp = shash_desc_ctx(desc);
+ u16 crc = *crcp;
+ u8 buf[16];
- if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
- do {
- unsigned int chunk = length;
+ if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc_t10dif_pmull_p8(crc, data, length, buf);
+ kernel_neon_end();
- if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
- chunk = SZ_4K;
-
- kernel_neon_begin();
- *crc = crc_t10dif_pmull_p8(*crc, data, chunk);
- kernel_neon_end();
- data += chunk;
- length -= chunk;
- } while (length);
- } else {
- *crc = crc_t10dif_generic(*crc, data, length);
+ crc = 0;
+ data = buf;
+ length = sizeof(buf);
}
+ *crcp = crc_t10dif_generic(crc, data, length);
return 0;
}
@@ -62,18 +59,9 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
- do {
- unsigned int chunk = length;
-
- if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
- chunk = SZ_4K;
-
- kernel_neon_begin();
- *crc = crc_t10dif_pmull_p64(*crc, data, chunk);
- kernel_neon_end();
- data += chunk;
- length -= chunk;
- } while (length);
+ kernel_neon_begin();
+ *crc = crc_t10dif_pmull_p64(*crc, data, length);
+ kernel_neon_end();
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h
index 468a049..8a777de 100644
--- a/arch/arm64/include/asm/arm_pmuv3.h
+++ b/arch/arm64/include/asm/arm_pmuv3.h
@@ -152,6 +152,11 @@ static inline void write_pmuserenr(u32 val)
write_sysreg(val, pmuserenr_el0);
}
+static inline void write_pmuacr(u64 val)
+{
+ write_sysreg_s(val, SYS_PMUACR_EL1);
+}
+
static inline u64 read_pmceid0(void)
{
return read_sysreg(pmceid0_el0);
@@ -178,4 +183,9 @@ static inline bool is_pmuv3p5(int pmuver)
return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5;
}
+static inline bool is_pmuv3p9(int pmuver)
+{
+ return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P9;
+}
+
#endif
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index bc0b0d7..3d8d534 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -249,13 +249,6 @@ alternative_endif
.endm
/*
- * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm)
- */
- .macro vma_vm_mm, rd, rn
- ldr \rd, [\rn, #VMA_VM_MM]
- .endm
-
-/*
* read_ctr - read CTR_EL0. If the system has mismatched register fields,
* provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val
*/
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index a6e5b07..a08a121 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -42,6 +42,8 @@ cpucap_is_possible(const unsigned int cap)
return IS_ENABLED(CONFIG_ARM64_BTI);
case ARM64_HAS_TLB_RANGE:
return IS_ENABLED(CONFIG_ARM64_TLB_RANGE);
+ case ARM64_HAS_S1POE:
+ return IS_ENABLED(CONFIG_ARM64_POE);
case ARM64_UNMAP_KERNEL_AT_EL0:
return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0);
case ARM64_WORKAROUND_843419:
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 3d261cc..3d63c20 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
#include <asm/hwcap.h>
#include <asm/sysreg.h>
-#define MAX_CPU_FEATURES 128
+#define MAX_CPU_FEATURES 192
#define cpu_feature(x) KERNEL_HWCAP_ ## x
#define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0
@@ -438,6 +438,7 @@ void cpu_set_feature(unsigned int num);
bool cpu_have_feature(unsigned int num);
unsigned long cpu_get_elf_hwcap(void);
unsigned long cpu_get_elf_hwcap2(void);
+unsigned long cpu_get_elf_hwcap3(void);
#define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name))
#define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name))
@@ -834,8 +835,19 @@ static inline bool system_supports_lpa2(void)
static inline bool system_supports_poe(void)
{
- return IS_ENABLED(CONFIG_ARM64_POE) &&
- alternative_has_cap_unlikely(ARM64_HAS_S1POE);
+ return alternative_has_cap_unlikely(ARM64_HAS_S1POE);
+}
+
+static inline bool system_supports_gcs(void)
+{
+ return IS_ENABLED(CONFIG_ARM64_GCS) &&
+ alternative_has_cap_unlikely(ARM64_HAS_GCS);
+}
+
+static inline bool system_supports_haft(void)
+{
+ return IS_ENABLED(CONFIG_ARM64_HAFT) &&
+ cpus_have_final_cap(ARM64_HAFT);
}
int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h
index 55f57df..fbb5c99 100644
--- a/arch/arm64/include/asm/daifflags.h
+++ b/arch/arm64/include/asm/daifflags.h
@@ -132,7 +132,7 @@ static inline void local_daif_inherit(struct pt_regs *regs)
trace_hardirqs_on();
if (system_uses_irq_prio_masking())
- gic_write_pmr(regs->pmr_save);
+ gic_write_pmr(regs->pmr);
/*
* We can't use local_daif_restore(regs->pstate) here as
diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h
index 13d437b..8f6ba31 100644
--- a/arch/arm64/include/asm/debug-monitors.h
+++ b/arch/arm64/include/asm/debug-monitors.h
@@ -105,6 +105,7 @@ void kernel_enable_single_step(struct pt_regs *regs);
void kernel_disable_single_step(void);
int kernel_active_single_step(void);
void kernel_rewind_single_step(struct pt_regs *regs);
+void kernel_fastforward_single_step(struct pt_regs *regs);
#ifdef CONFIG_HAVE_HW_BREAKPOINT
int reinstall_suspended_bps(struct pt_regs *regs);
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index e0ffdf1..27086a8 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -27,6 +27,14 @@
ubfx x0, x0, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4
cbz x0, .Lskip_hcrx_\@
mov_q x0, HCRX_HOST_FLAGS
+
+ /* Enable GCS if supported */
+ mrs_s x1, SYS_ID_AA64PFR1_EL1
+ ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
+ cbz x1, .Lset_hcrx_\@
+ orr x0, x0, #HCRX_EL2_GCSEn
+
+.Lset_hcrx_\@:
msr_s SYS_HCRX_EL2, x0
.Lskip_hcrx_\@:
.endm
@@ -200,6 +208,16 @@
orr x0, x0, #HFGxTR_EL2_nPOR_EL0
.Lskip_poe_fgt_\@:
+ /* GCS depends on PIE so we don't check it if PIE is absent */
+ mrs_s x1, SYS_ID_AA64PFR1_EL1
+ ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
+ cbz x1, .Lset_fgt_\@
+
+ /* Disable traps of access to GCS registers at EL0 and EL1 */
+ orr x0, x0, #HFGxTR_EL2_nGCS_EL1_MASK
+ orr x0, x0, #HFGxTR_EL2_nGCS_EL0_MASK
+
+.Lset_fgt_\@:
msr_s SYS_HFGRTR_EL2, x0
msr_s SYS_HFGWTR_EL2, x0
msr_s SYS_HFGITR_EL2, xzr
@@ -215,6 +233,17 @@
.Lskip_fgt_\@:
.endm
+.macro __init_el2_gcs
+ mrs_s x1, SYS_ID_AA64PFR1_EL1
+ ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4
+ cbz x1, .Lskip_gcs_\@
+
+ /* Ensure GCS is not enabled when we start trying to do BLs */
+ msr_s SYS_GCSCR_EL1, xzr
+ msr_s SYS_GCSCRE0_EL1, xzr
+.Lskip_gcs_\@:
+.endm
+
.macro __init_el2_nvhe_prepare_eret
mov x0, #INIT_PSTATE_EL1
msr spsr_el2, x0
@@ -240,6 +269,7 @@
__init_el2_nvhe_idregs
__init_el2_cptr
__init_el2_fgt
+ __init_el2_gcs
.endm
#ifndef __KVM_NVHE_HYPERVISOR__
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index da6d2c1..d1b1a33 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -51,7 +51,8 @@
#define ESR_ELx_EC_FP_EXC32 UL(0x28)
/* Unallocated EC: 0x29 - 0x2B */
#define ESR_ELx_EC_FP_EXC64 UL(0x2C)
-/* Unallocated EC: 0x2D - 0x2E */
+#define ESR_ELx_EC_GCS UL(0x2D)
+/* Unallocated EC: 0x2E */
#define ESR_ELx_EC_SERROR UL(0x2F)
#define ESR_ELx_EC_BREAKPT_LOW UL(0x30)
#define ESR_ELx_EC_BREAKPT_CUR UL(0x31)
@@ -386,6 +387,31 @@
#define ESR_ELx_MOPS_ISS_SRCREG(esr) (((esr) & (UL(0x1f) << 5)) >> 5)
#define ESR_ELx_MOPS_ISS_SIZEREG(esr) (((esr) & (UL(0x1f) << 0)) >> 0)
+/* ISS field definitions for GCS */
+#define ESR_ELx_ExType_SHIFT (20)
+#define ESR_ELx_ExType_MASK GENMASK(23, 20)
+#define ESR_ELx_Raddr_SHIFT (10)
+#define ESR_ELx_Raddr_MASK GENMASK(14, 10)
+#define ESR_ELx_Rn_SHIFT (5)
+#define ESR_ELx_Rn_MASK GENMASK(9, 5)
+#define ESR_ELx_Rvalue_SHIFT 5
+#define ESR_ELx_Rvalue_MASK GENMASK(9, 5)
+#define ESR_ELx_IT_SHIFT (0)
+#define ESR_ELx_IT_MASK GENMASK(4, 0)
+
+#define ESR_ELx_ExType_DATA_CHECK 0
+#define ESR_ELx_ExType_EXLOCK 1
+#define ESR_ELx_ExType_STR 2
+
+#define ESR_ELx_IT_RET 0
+#define ESR_ELx_IT_GCSPOPM 1
+#define ESR_ELx_IT_RET_KEYA 2
+#define ESR_ELx_IT_RET_KEYB 3
+#define ESR_ELx_IT_GCSSS1 4
+#define ESR_ELx_IT_GCSSS2 5
+#define ESR_ELx_IT_GCSPOPCX 6
+#define ESR_ELx_IT_GCSPOPX 7
+
#ifndef __ASSEMBLY__
#include <asm/types.h>
diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index f296662..d48fc16 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -57,6 +57,8 @@ void do_el0_undef(struct pt_regs *regs, unsigned long esr);
void do_el1_undef(struct pt_regs *regs, unsigned long esr);
void do_el0_bti(struct pt_regs *regs);
void do_el1_bti(struct pt_regs *regs, unsigned long esr);
+void do_el0_gcs(struct pt_regs *regs, unsigned long esr);
+void do_el1_gcs(struct pt_regs *regs, unsigned long esr);
void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
struct pt_regs *regs);
void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs);
@@ -73,6 +75,7 @@ void do_el0_svc_compat(struct pt_regs *regs);
void do_el0_fpac(struct pt_regs *regs, unsigned long esr);
void do_el1_fpac(struct pt_regs *regs, unsigned long esr);
void do_el0_mops(struct pt_regs *regs, unsigned long esr);
+void do_el1_mops(struct pt_regs *regs, unsigned long esr);
void do_serror(struct pt_regs *regs, unsigned long esr);
void do_signal(struct pt_regs *regs);
diff --git a/arch/arm64/include/asm/gcs.h b/arch/arm64/include/asm/gcs.h
new file mode 100644
index 0000000..f506606
--- /dev/null
+++ b/arch/arm64/include/asm/gcs.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 ARM Ltd.
+ */
+#ifndef __ASM_GCS_H
+#define __ASM_GCS_H
+
+#include <asm/types.h>
+#include <asm/uaccess.h>
+
+struct kernel_clone_args;
+struct ksignal;
+
+static inline void gcsb_dsync(void)
+{
+ asm volatile(".inst 0xd503227f" : : : "memory");
+}
+
+static inline void gcsstr(u64 *addr, u64 val)
+{
+ register u64 *_addr __asm__ ("x0") = addr;
+ register long _val __asm__ ("x1") = val;
+
+ /* GCSSTTR x1, x0 */
+ asm volatile(
+ ".inst 0xd91f1c01\n"
+ :
+ : "rZ" (_val), "r" (_addr)
+ : "memory");
+}
+
+static inline void gcsss1(u64 Xt)
+{
+ asm volatile (
+ "sys #3, C7, C7, #2, %0\n"
+ :
+ : "rZ" (Xt)
+ : "memory");
+}
+
+static inline u64 gcsss2(void)
+{
+ u64 Xt;
+
+ asm volatile(
+ "SYSL %0, #3, C7, C7, #3\n"
+ : "=r" (Xt)
+ :
+ : "memory");
+
+ return Xt;
+}
+
+#define PR_SHADOW_STACK_SUPPORTED_STATUS_MASK \
+ (PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH)
+
+#ifdef CONFIG_ARM64_GCS
+
+static inline bool task_gcs_el0_enabled(struct task_struct *task)
+{
+ return current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE;
+}
+
+void gcs_set_el0_mode(struct task_struct *task);
+void gcs_free(struct task_struct *task);
+void gcs_preserve_current_state(void);
+unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
+ const struct kernel_clone_args *args);
+
+static inline int gcs_check_locked(struct task_struct *task,
+ unsigned long new_val)
+{
+ unsigned long cur_val = task->thread.gcs_el0_mode;
+
+ cur_val &= task->thread.gcs_el0_locked;
+ new_val &= task->thread.gcs_el0_locked;
+
+ if (cur_val != new_val)
+ return -EBUSY;
+
+ return 0;
+}
+
+#else
+
+static inline bool task_gcs_el0_enabled(struct task_struct *task)
+{
+ return false;
+}
+
+static inline void gcs_set_el0_mode(struct task_struct *task) { }
+static inline void gcs_free(struct task_struct *task) { }
+static inline void gcs_preserve_current_state(void) { }
+static inline unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
+ const struct kernel_clone_args *args)
+{
+ return -ENOTSUPP;
+}
+static inline int gcs_check_locked(struct task_struct *task,
+ unsigned long new_val)
+{
+ return 0;
+}
+
+#endif
+
+#endif
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 293f880..c6dff3e 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -11,6 +11,7 @@
#define __ASM_HUGETLB_H
#include <asm/cacheflush.h>
+#include <asm/mte.h>
#include <asm/page.h>
#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
@@ -21,6 +22,13 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h);
static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
clear_bit(PG_dcache_clean, &folio->flags);
+
+#ifdef CONFIG_ARM64_MTE
+ if (system_supports_mte()) {
+ clear_bit(PG_mte_tagged, &folio->flags);
+ clear_bit(PG_mte_lock, &folio->flags);
+ }
+#endif
}
#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index a775add..2b6c61c6 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -92,6 +92,7 @@
#define KERNEL_HWCAP_SB __khwcap_feature(SB)
#define KERNEL_HWCAP_PACA __khwcap_feature(PACA)
#define KERNEL_HWCAP_PACG __khwcap_feature(PACG)
+#define KERNEL_HWCAP_GCS __khwcap_feature(GCS)
#define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 64)
#define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP)
@@ -159,17 +160,21 @@
#define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2)
#define KERNEL_HWCAP_POE __khwcap2_feature(POE)
+#define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128)
+
/*
* This yields a mask that user programs can use to figure out what
* instruction set this cpu supports.
*/
#define ELF_HWCAP cpu_get_elf_hwcap()
#define ELF_HWCAP2 cpu_get_elf_hwcap2()
+#define ELF_HWCAP3 cpu_get_elf_hwcap3()
#ifdef CONFIG_COMPAT
#define COMPAT_ELF_HWCAP (compat_elf_hwcap)
#define COMPAT_ELF_HWCAP2 (compat_elf_hwcap2)
-extern unsigned int compat_elf_hwcap, compat_elf_hwcap2;
+#define COMPAT_ELF_HWCAP3 (compat_elf_hwcap3)
+extern unsigned int compat_elf_hwcap, compat_elf_hwcap2, compat_elf_hwcap3;
#endif
enum {
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 8c0a36f..e390c43 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -353,6 +353,7 @@ __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000)
__AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000)
__AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000)
__AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000)
+__AARCH64_INSN_FUNCS(mops, 0x3B200C00, 0x19000400)
__AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000)
__AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000)
__AARCH64_INSN_FUNCS(stp_post, 0x7FC00000, 0x28800000)
@@ -575,6 +576,11 @@ static __always_inline u32 aarch64_insn_gen_nop(void)
return aarch64_insn_gen_hint(AARCH64_INSN_HINT_NOP);
}
+static __always_inline bool aarch64_insn_is_nop(u32 insn)
+{
+ return insn == aarch64_insn_gen_nop();
+}
+
u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg,
enum aarch64_insn_branch_type type);
u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 1ada23a..8688343 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -17,6 +17,7 @@
#include <asm/early_ioremap.h>
#include <asm/alternative.h>
#include <asm/cpufeature.h>
+#include <asm/rsi.h>
/*
* Generic IO read/write. These perform native-endian accesses.
@@ -318,4 +319,11 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
unsigned long flags);
#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap
+static inline bool arm64_is_protected_mmio(phys_addr_t phys_addr, size_t size)
+{
+ if (unlikely(is_realm_world()))
+ return __arm64_is_protected_mmio(phys_addr, size);
+ return false;
+}
+
#endif /* __ASM_IO_H */
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index bf05a77..fd5a084 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -26,7 +26,6 @@
#define SWAPPER_SKIP_LEVEL 0
#endif
#define SWAPPER_BLOCK_SIZE (UL(1) << SWAPPER_BLOCK_SHIFT)
-#define SWAPPER_TABLE_SHIFT (SWAPPER_BLOCK_SHIFT + PAGE_SHIFT - 3)
#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - SWAPPER_SKIP_LEVEL)
#define INIT_IDMAP_PGTABLE_LEVELS (IDMAP_LEVELS - SWAPPER_SKIP_LEVEL)
diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h
index b0c9a86..f8f78f6 100644
--- a/arch/arm64/include/asm/mem_encrypt.h
+++ b/arch/arm64/include/asm/mem_encrypt.h
@@ -2,6 +2,8 @@
#ifndef __ASM_MEM_ENCRYPT_H
#define __ASM_MEM_ENCRYPT_H
+#include <asm/rsi.h>
+
struct arm64_mem_crypt_ops {
int (*encrypt)(unsigned long addr, int numpages);
int (*decrypt)(unsigned long addr, int numpages);
@@ -12,4 +14,11 @@ int arm64_mem_crypt_ops_register(const struct arm64_mem_crypt_ops *ops);
int set_memory_encrypted(unsigned long addr, int numpages);
int set_memory_decrypted(unsigned long addr, int numpages);
+int realm_register_memory_enc_ops(void);
+
+static inline bool force_dma_unencrypted(struct device *dev)
+{
+ return is_realm_world();
+}
+
#endif /* __ASM_MEM_ENCRYPT_H */
diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index 798d965..1d53022 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -41,9 +41,12 @@ static inline unsigned long arch_calc_vm_flag_bits(struct file *file,
* backed by tags-capable memory. The vm_flags may be overridden by a
* filesystem supporting MTE (RAM-based).
*/
- if (system_supports_mte() &&
- ((flags & MAP_ANONYMOUS) || shmem_file(file)))
- return VM_MTE_ALLOWED;
+ if (system_supports_mte()) {
+ if (flags & (MAP_ANONYMOUS | MAP_HUGETLB))
+ return VM_MTE_ALLOWED;
+ if (shmem_file(file))
+ return VM_MTE_ALLOWED;
+ }
return 0;
}
@@ -66,11 +69,26 @@ static inline bool arch_validate_prot(unsigned long prot,
static inline bool arch_validate_flags(unsigned long vm_flags)
{
- if (!system_supports_mte())
- return true;
+ if (system_supports_mte()) {
+ /*
+ * only allow VM_MTE if VM_MTE_ALLOWED has been set
+ * previously
+ */
+ if ((vm_flags & VM_MTE) && !(vm_flags & VM_MTE_ALLOWED))
+ return false;
+ }
- /* only allow VM_MTE if VM_MTE_ALLOWED has been set previously */
- return !(vm_flags & VM_MTE) || (vm_flags & VM_MTE_ALLOWED);
+ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
+ /* An executable GCS isn't a good idea. */
+ if (vm_flags & VM_EXEC)
+ return false;
+
+ /* The memory management core should prevent this */
+ VM_WARN_ON(vm_flags & VM_SHARED);
+ }
+
+ return true;
+
}
#define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags)
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 7c09d47..48b3d95 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -20,6 +20,7 @@
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/daifflags.h>
+#include <asm/gcs.h>
#include <asm/proc-fns.h>
#include <asm/cputype.h>
#include <asm/sysreg.h>
@@ -311,6 +312,14 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return por_el0_allows_pkey(vma_pkey(vma), write, execute);
}
+#define deactivate_mm deactivate_mm
+static inline void deactivate_mm(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ gcs_free(tsk);
+}
+
+
#include <asm-generic/mmu_context.h>
#endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 0f84518..6567df8ec 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -41,6 +41,8 @@ void mte_free_tag_storage(char *storage);
static inline void set_page_mte_tagged(struct page *page)
{
+ VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
+
/*
* Ensure that the tags written prior to this function are visible
* before the page flags update.
@@ -53,6 +55,8 @@ static inline bool page_mte_tagged(struct page *page)
{
bool ret = test_bit(PG_mte_tagged, &page->flags);
+ VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
+
/*
* If the page is tagged, ensure ordering with a likely subsequent
* read of the tags.
@@ -76,6 +80,8 @@ static inline bool page_mte_tagged(struct page *page)
*/
static inline bool try_page_mte_tagging(struct page *page)
{
+ VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
+
if (!test_and_set_bit(PG_mte_lock, &page->flags))
return true;
@@ -157,6 +163,67 @@ static inline int mte_ptrace_copy_tags(struct task_struct *child,
#endif /* CONFIG_ARM64_MTE */
+#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_ARM64_MTE)
+static inline void folio_set_hugetlb_mte_tagged(struct folio *folio)
+{
+ VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
+
+ /*
+ * Ensure that the tags written prior to this function are visible
+ * before the folio flags update.
+ */
+ smp_wmb();
+ set_bit(PG_mte_tagged, &folio->flags);
+
+}
+
+static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
+{
+ bool ret = test_bit(PG_mte_tagged, &folio->flags);
+
+ VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
+
+ /*
+ * If the folio is tagged, ensure ordering with a likely subsequent
+ * read of the tags.
+ */
+ if (ret)
+ smp_rmb();
+ return ret;
+}
+
+static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio)
+{
+ VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
+
+ if (!test_and_set_bit(PG_mte_lock, &folio->flags))
+ return true;
+
+ /*
+ * The tags are either being initialised or may have been initialised
+ * already. Check if the PG_mte_tagged flag has been set or wait
+ * otherwise.
+ */
+ smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged));
+
+ return false;
+}
+#else
+static inline void folio_set_hugetlb_mte_tagged(struct folio *folio)
+{
+}
+
+static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
+{
+ return false;
+}
+
+static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio)
+{
+ return false;
+}
+#endif
+
static inline void mte_disable_tco_entry(struct task_struct *task)
{
if (!system_supports_mte())
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 8ff5f2a..e754228 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -28,7 +28,7 @@ static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp)
{
- pudval_t pudval = PUD_TYPE_TABLE;
+ pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_AF;
pudval |= (mm == &init_mm) ? PUD_TABLE_UXN : PUD_TABLE_PXN;
__pud_populate(pudp, __pa(pmdp), pudval);
@@ -50,7 +50,7 @@ static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp)
{
- p4dval_t p4dval = P4D_TYPE_TABLE;
+ p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_AF;
p4dval |= (mm == &init_mm) ? P4D_TABLE_UXN : P4D_TABLE_PXN;
__p4d_populate(p4dp, __pa(pudp), p4dval);
@@ -79,7 +79,7 @@ static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot)
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp)
{
- pgdval_t pgdval = PGD_TYPE_TABLE;
+ pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_AF;
pgdval |= (mm == &init_mm) ? PGD_TABLE_UXN : PGD_TABLE_PXN;
__pgd_populate(pgdp, __pa(p4dp), pgdval);
@@ -127,14 +127,16 @@ static inline void
pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
{
VM_BUG_ON(mm && mm != &init_mm);
- __pmd_populate(pmdp, __pa(ptep), PMD_TYPE_TABLE | PMD_TABLE_UXN);
+ __pmd_populate(pmdp, __pa(ptep),
+ PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_UXN);
}
static inline void
pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
{
VM_BUG_ON(mm == &init_mm);
- __pmd_populate(pmdp, page_to_phys(ptep), PMD_TYPE_TABLE | PMD_TABLE_PXN);
+ __pmd_populate(pmdp, page_to_phys(ptep),
+ PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_PXN);
}
#endif
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index fd330c1..c78a988 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -99,6 +99,7 @@
#define PGD_TYPE_TABLE (_AT(pgdval_t, 3) << 0)
#define PGD_TABLE_BIT (_AT(pgdval_t, 1) << 1)
#define PGD_TYPE_MASK (_AT(pgdval_t, 3) << 0)
+#define PGD_TABLE_AF (_AT(pgdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */
#define PGD_TABLE_PXN (_AT(pgdval_t, 1) << 59)
#define PGD_TABLE_UXN (_AT(pgdval_t, 1) << 60)
@@ -110,6 +111,7 @@
#define P4D_TYPE_MASK (_AT(p4dval_t, 3) << 0)
#define P4D_TYPE_SECT (_AT(p4dval_t, 1) << 0)
#define P4D_SECT_RDONLY (_AT(p4dval_t, 1) << 7) /* AP[2] */
+#define P4D_TABLE_AF (_AT(p4dval_t, 1) << 10) /* Ignored if no FEAT_HAFT */
#define P4D_TABLE_PXN (_AT(p4dval_t, 1) << 59)
#define P4D_TABLE_UXN (_AT(p4dval_t, 1) << 60)
@@ -121,6 +123,7 @@
#define PUD_TYPE_MASK (_AT(pudval_t, 3) << 0)
#define PUD_TYPE_SECT (_AT(pudval_t, 1) << 0)
#define PUD_SECT_RDONLY (_AT(pudval_t, 1) << 7) /* AP[2] */
+#define PUD_TABLE_AF (_AT(pudval_t, 1) << 10) /* Ignored if no FEAT_HAFT */
#define PUD_TABLE_PXN (_AT(pudval_t, 1) << 59)
#define PUD_TABLE_UXN (_AT(pudval_t, 1) << 60)
@@ -131,6 +134,7 @@
#define PMD_TYPE_TABLE (_AT(pmdval_t, 3) << 0)
#define PMD_TYPE_SECT (_AT(pmdval_t, 1) << 0)
#define PMD_TABLE_BIT (_AT(pmdval_t, 1) << 1)
+#define PMD_TABLE_AF (_AT(pmdval_t, 1) << 10) /* Ignored if no FEAT_HAFT */
/*
* Section
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 2a11d0c..9f9cf13 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -35,7 +35,6 @@
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
#define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)
-#define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF)
#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF)
@@ -68,8 +67,12 @@
#include <asm/cpufeature.h>
#include <asm/pgtable-types.h>
+#include <asm/rsi.h>
extern bool arm64_use_ng_mappings;
+extern unsigned long prot_ns_shared;
+
+#define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0)
#define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0)
#define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0)
@@ -144,15 +147,23 @@ static inline bool __pure lpa2_is_enabled(void)
/* 6: PTE_PXN | PTE_WRITE */
/* 7: PAGE_SHARED_EXEC PTE_PXN | PTE_WRITE | PTE_USER */
/* 8: PAGE_KERNEL_ROX PTE_UXN */
-/* 9: PTE_UXN | PTE_USER */
+/* 9: PAGE_GCS_RO PTE_UXN | PTE_USER */
/* a: PAGE_KERNEL_EXEC PTE_UXN | PTE_WRITE */
-/* b: PTE_UXN | PTE_WRITE | PTE_USER */
+/* b: PAGE_GCS PTE_UXN | PTE_WRITE | PTE_USER */
/* c: PAGE_KERNEL_RO PTE_UXN | PTE_PXN */
/* d: PAGE_READONLY PTE_UXN | PTE_PXN | PTE_USER */
/* e: PAGE_KERNEL PTE_UXN | PTE_PXN | PTE_WRITE */
/* f: PAGE_SHARED PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */
+#define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER)
+#define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER)
+
+#define PAGE_GCS __pgprot(_PAGE_GCS)
+#define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO)
+
#define PIE_E0 ( \
+ PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_GCS) | \
+ PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \
@@ -160,6 +171,8 @@ static inline bool __pure lpa2_is_enabled(void)
PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED), PIE_RW_O))
#define PIE_E1 ( \
+ PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \
+ PIRx_ELx_PERM(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \
PIRx_ELx_PERM(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index c329ea0..6986345 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -265,8 +265,7 @@ static inline pte_t pte_mkspecial(pte_t pte)
static inline pte_t pte_mkcont(pte_t pte)
{
- pte = set_pte_bit(pte, __pgprot(PTE_CONT));
- return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE));
+ return set_pte_bit(pte, __pgprot(PTE_CONT));
}
static inline pte_t pte_mknoncont(pte_t pte)
@@ -338,7 +337,7 @@ static inline pte_t __ptep_get(pte_t *ptep)
}
extern void __sync_icache_dcache(pte_t pteval);
-bool pgattr_change_is_safe(u64 old, u64 new);
+bool pgattr_change_is_safe(pteval_t old, pteval_t new);
/*
* PTE bits configuration in the presence of hardware Dirty Bit Management
@@ -439,11 +438,6 @@ static inline void __set_ptes(struct mm_struct *mm,
}
/*
- * Huge pte definitions.
- */
-#define pte_mkhuge(pte) (__pte(pte_val(pte) & ~PTE_TABLE_BIT))
-
-/*
* Hugetlb definitions.
*/
#define HUGE_MAX_HSTATE 4
@@ -684,6 +678,11 @@ static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
#define pgprot_nx(prot) \
__pgprot_modify(prot, PTE_MAYBE_GP, PTE_PXN)
+#define pgprot_decrypted(prot) \
+ __pgprot_modify(prot, PROT_NS_SHARED, PROT_NS_SHARED)
+#define pgprot_encrypted(prot) \
+ __pgprot_modify(prot, PROT_NS_SHARED, 0)
+
/*
* Mark the prot value as uncacheable and unbufferable.
*/
@@ -927,6 +926,9 @@ static inline phys_addr_t p4d_page_paddr(p4d_t p4d)
static inline pud_t *p4d_to_folded_pud(p4d_t *p4dp, unsigned long addr)
{
+ /* Ensure that 'p4dp' indexes a page table according to 'addr' */
+ VM_BUG_ON(((addr >> P4D_SHIFT) ^ ((u64)p4dp >> 3)) % PTRS_PER_P4D);
+
return (pud_t *)PTR_ALIGN_DOWN(p4dp, PAGE_SIZE) + pud_index(addr);
}
@@ -1051,6 +1053,9 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
static inline p4d_t *pgd_to_folded_p4d(pgd_t *pgdp, unsigned long addr)
{
+ /* Ensure that 'pgdp' indexes a page table according to 'addr' */
+ VM_BUG_ON(((addr >> PGDIR_SHIFT) ^ ((u64)pgdp >> 3)) % PTRS_PER_PGD);
+
return (p4d_t *)PTR_ALIGN_DOWN(pgdp, PAGE_SIZE) + p4d_index(addr);
}
@@ -1259,15 +1264,17 @@ static inline int __ptep_clear_flush_young(struct vm_area_struct *vma,
return young;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
{
+ /* Operation applies to PMD table entry only if FEAT_HAFT is enabled */
+ VM_WARN_ON(pmd_table(READ_ONCE(*pmdp)) && !system_supports_haft());
return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp);
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
unsigned long address, pte_t *ptep)
@@ -1502,6 +1509,10 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
*/
#define arch_has_hw_pte_young cpu_has_hw_af
+#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
+#define arch_has_hw_nonleaf_pmd_young system_supports_haft
+#endif
+
/*
* Experimentally, it's cheap to set the access flag in hardware and we
* benefit from prefaulting mappings as 'old' to start with.
diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h
index 0069467..d493688 100644
--- a/arch/arm64/include/asm/probes.h
+++ b/arch/arm64/include/asm/probes.h
@@ -9,21 +9,18 @@
#include <asm/insn.h>
-typedef u32 probe_opcode_t;
typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *);
-/* architecture specific copy of original instruction */
struct arch_probe_insn {
- probe_opcode_t *insn;
- pstate_check_t *pstate_cc;
probes_handler_t *handler;
- /* restore address after step xol */
- unsigned long restore;
};
#ifdef CONFIG_KPROBES
-typedef u32 kprobe_opcode_t;
+typedef __le32 kprobe_opcode_t;
struct arch_specific_insn {
struct arch_probe_insn api;
+ kprobe_opcode_t *xol_insn;
+ /* restore address after step xol */
+ unsigned long xol_restore;
};
#endif
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 1438424..1bf1a3b 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -185,6 +185,13 @@ struct thread_struct {
u64 svcr;
u64 tpidr2_el0;
u64 por_el0;
+#ifdef CONFIG_ARM64_GCS
+ unsigned int gcs_el0_mode;
+ unsigned int gcs_el0_locked;
+ u64 gcspr_el0;
+ u64 gcs_base;
+ u64 gcs_size;
+#endif
};
static inline unsigned int thread_get_vl(struct thread_struct *thread,
@@ -285,22 +292,44 @@ void tls_preserve_current_state(void);
.fpsimd_cpu = NR_CPUS, \
}
-static inline void start_thread_common(struct pt_regs *regs, unsigned long pc)
+static inline void start_thread_common(struct pt_regs *regs, unsigned long pc,
+ unsigned long pstate)
{
- s32 previous_syscall = regs->syscallno;
- memset(regs, 0, sizeof(*regs));
- regs->syscallno = previous_syscall;
- regs->pc = pc;
+ /*
+ * Ensure all GPRs are zeroed, and initialize PC + PSTATE.
+ * The SP (or compat SP) will be initialized later.
+ */
+ regs->user_regs = (struct user_pt_regs) {
+ .pc = pc,
+ .pstate = pstate,
+ };
+ /*
+ * To allow the syscalls:sys_exit_execve tracepoint we need to preserve
+ * syscallno, but do not need orig_x0 or the original GPRs.
+ */
+ regs->orig_x0 = 0;
+
+ /*
+ * An exec from a kernel thread won't have an existing PMR value.
+ */
if (system_uses_irq_prio_masking())
- regs->pmr_save = GIC_PRIO_IRQON;
+ regs->pmr = GIC_PRIO_IRQON;
+
+ /*
+ * The pt_regs::stackframe field must remain valid throughout this
+ * function as a stacktrace can be taken at any time. Any user or
+ * kernel task should have a valid final frame.
+ */
+ WARN_ON_ONCE(regs->stackframe.record.fp != 0);
+ WARN_ON_ONCE(regs->stackframe.record.lr != 0);
+ WARN_ON_ONCE(regs->stackframe.type != FRAME_META_TYPE_FINAL);
}
static inline void start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
{
- start_thread_common(regs, pc);
- regs->pstate = PSR_MODE_EL0t;
+ start_thread_common(regs, pc, PSR_MODE_EL0t);
spectre_v4_enable_task_mitigation(current);
regs->sp = sp;
}
@@ -309,15 +338,13 @@ static inline void start_thread(struct pt_regs *regs, unsigned long pc,
static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
{
- start_thread_common(regs, pc);
- regs->pstate = PSR_AA32_MODE_USR;
+ unsigned long pstate = PSR_AA32_MODE_USR;
if (pc & 1)
- regs->pstate |= PSR_AA32_T_BIT;
+ pstate |= PSR_AA32_T_BIT;
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
+ pstate |= PSR_AA32_E_BIT;
-#ifdef __AARCH64EB__
- regs->pstate |= PSR_AA32_E_BIT;
-#endif
-
+ start_thread_common(regs, pc, pstate);
spectre_v4_enable_task_mitigation(current);
regs->compat_sp = sp;
}
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 0abe975..47ff865 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -98,6 +98,8 @@
#include <linux/bug.h>
#include <linux/types.h>
+#include <asm/stacktrace/frame.h>
+
/* sizeof(struct user) for AArch32 */
#define COMPAT_USER_SZ 296
@@ -149,8 +151,7 @@ static inline unsigned long pstate_to_compat_psr(const unsigned long pstate)
/*
* This struct defines the way the registers are stored on the stack during an
- * exception. Note that sizeof(struct pt_regs) has to be a multiple of 16 (for
- * stack alignment). struct user_pt_regs must form a prefix of struct pt_regs.
+ * exception. struct user_pt_regs must form a prefix of struct pt_regs.
*/
struct pt_regs {
union {
@@ -163,23 +164,20 @@ struct pt_regs {
};
};
u64 orig_x0;
-#ifdef __AARCH64EB__
- u32 unused2;
s32 syscallno;
-#else
- s32 syscallno;
- u32 unused2;
-#endif
+ u32 pmr;
+
u64 sdei_ttbr1;
- /* Only valid when ARM64_HAS_GIC_PRIO_MASKING is enabled. */
- u64 pmr_save;
- u64 stackframe[2];
+ struct frame_record_meta stackframe;
/* Only valid for some EL1 exceptions. */
u64 lockdep_hardirqs;
u64 exit_rcu;
};
+/* For correct stack alignment, pt_regs has to be a multiple of 16 bytes. */
+static_assert(IS_ALIGNED(sizeof(struct pt_regs), 16));
+
static inline bool in_syscall(struct pt_regs const *regs)
{
return regs->syscallno != NO_SYSCALL;
@@ -213,7 +211,7 @@ static inline void forget_syscall(struct pt_regs *regs)
#define irqs_priority_unmasked(regs) \
(system_uses_irq_prio_masking() ? \
- (regs)->pmr_save == GIC_PRIO_IRQON : \
+ (regs)->pmr == GIC_PRIO_IRQON : \
true)
#define interrupts_enabled(regs) \
diff --git a/arch/arm64/include/asm/rsi.h b/arch/arm64/include/asm/rsi.h
new file mode 100644
index 0000000..188cbb9
--- /dev/null
+++ b/arch/arm64/include/asm/rsi.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2024 ARM Ltd.
+ */
+
+#ifndef __ASM_RSI_H_
+#define __ASM_RSI_H_
+
+#include <linux/errno.h>
+#include <linux/jump_label.h>
+#include <asm/rsi_cmds.h>
+
+DECLARE_STATIC_KEY_FALSE(rsi_present);
+
+void __init arm64_rsi_init(void);
+
+bool __arm64_is_protected_mmio(phys_addr_t base, size_t size);
+
+static inline bool is_realm_world(void)
+{
+ return static_branch_unlikely(&rsi_present);
+}
+
+static inline int rsi_set_memory_range(phys_addr_t start, phys_addr_t end,
+ enum ripas state, unsigned long flags)
+{
+ unsigned long ret;
+ phys_addr_t top;
+
+ while (start != end) {
+ ret = rsi_set_addr_range_state(start, end, state, flags, &top);
+ if (ret || top < start || top > end)
+ return -EINVAL;
+ start = top;
+ }
+
+ return 0;
+}
+
+/*
+ * Convert the specified range to RAM. Do not use this if you rely on the
+ * contents of a page that may already be in RAM state.
+ */
+static inline int rsi_set_memory_range_protected(phys_addr_t start,
+ phys_addr_t end)
+{
+ return rsi_set_memory_range(start, end, RSI_RIPAS_RAM,
+ RSI_CHANGE_DESTROYED);
+}
+
+/*
+ * Convert the specified range to RAM. Do not convert any pages that may have
+ * been DESTROYED, without our permission.
+ */
+static inline int rsi_set_memory_range_protected_safe(phys_addr_t start,
+ phys_addr_t end)
+{
+ return rsi_set_memory_range(start, end, RSI_RIPAS_RAM,
+ RSI_NO_CHANGE_DESTROYED);
+}
+
+static inline int rsi_set_memory_range_shared(phys_addr_t start,
+ phys_addr_t end)
+{
+ return rsi_set_memory_range(start, end, RSI_RIPAS_EMPTY,
+ RSI_CHANGE_DESTROYED);
+}
+#endif /* __ASM_RSI_H_ */
diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h
new file mode 100644
index 0000000..e6a2110
--- /dev/null
+++ b/arch/arm64/include/asm/rsi_cmds.h
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 ARM Ltd.
+ */
+
+#ifndef __ASM_RSI_CMDS_H
+#define __ASM_RSI_CMDS_H
+
+#include <linux/arm-smccc.h>
+
+#include <asm/rsi_smc.h>
+
+#define RSI_GRANULE_SHIFT 12
+#define RSI_GRANULE_SIZE (_AC(1, UL) << RSI_GRANULE_SHIFT)
+
+enum ripas {
+ RSI_RIPAS_EMPTY = 0,
+ RSI_RIPAS_RAM = 1,
+ RSI_RIPAS_DESTROYED = 2,
+ RSI_RIPAS_DEV = 3,
+};
+
+static inline unsigned long rsi_request_version(unsigned long req,
+ unsigned long *out_lower,
+ unsigned long *out_higher)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_smc(SMC_RSI_ABI_VERSION, req, 0, 0, 0, 0, 0, 0, &res);
+
+ if (out_lower)
+ *out_lower = res.a1;
+ if (out_higher)
+ *out_higher = res.a2;
+
+ return res.a0;
+}
+
+static inline unsigned long rsi_get_realm_config(struct realm_config *cfg)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_smc(SMC_RSI_REALM_CONFIG, virt_to_phys(cfg),
+ 0, 0, 0, 0, 0, 0, &res);
+ return res.a0;
+}
+
+static inline unsigned long rsi_ipa_state_get(phys_addr_t start,
+ phys_addr_t end,
+ enum ripas *state,
+ phys_addr_t *top)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_smc(SMC_RSI_IPA_STATE_GET,
+ start, end, 0, 0, 0, 0, 0,
+ &res);
+
+ if (res.a0 == RSI_SUCCESS) {
+ if (top)
+ *top = res.a1;
+ if (state)
+ *state = res.a2;
+ }
+
+ return res.a0;
+}
+
+static inline long rsi_set_addr_range_state(phys_addr_t start,
+ phys_addr_t end,
+ enum ripas state,
+ unsigned long flags,
+ phys_addr_t *top)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_smc(SMC_RSI_IPA_STATE_SET, start, end, state,
+ flags, 0, 0, 0, &res);
+
+ if (top)
+ *top = res.a1;
+
+ if (res.a2 != RSI_ACCEPT)
+ return -EPERM;
+
+ return res.a0;
+}
+
+/**
+ * rsi_attestation_token_init - Initialise the operation to retrieve an
+ * attestation token.
+ *
+ * @challenge: The challenge data to be used in the attestation token
+ * generation.
+ * @size: Size of the challenge data in bytes.
+ *
+ * Initialises the attestation token generation and returns an upper bound
+ * on the attestation token size that can be used to allocate an adequate
+ * buffer. The caller is expected to subsequently call
+ * rsi_attestation_token_continue() to retrieve the attestation token data on
+ * the same CPU.
+ *
+ * Returns:
+ * On success, returns the upper limit of the attestation report size.
+ * Otherwise, -EINVAL
+ */
+static inline long
+rsi_attestation_token_init(const u8 *challenge, unsigned long size)
+{
+ struct arm_smccc_1_2_regs regs = { 0 };
+
+ /* The challenge must be at least 32bytes and at most 64bytes */
+ if (!challenge || size < 32 || size > 64)
+ return -EINVAL;
+
+ regs.a0 = SMC_RSI_ATTESTATION_TOKEN_INIT;
+ memcpy(®s.a1, challenge, size);
+ arm_smccc_1_2_smc(®s, ®s);
+
+ if (regs.a0 == RSI_SUCCESS)
+ return regs.a1;
+
+ return -EINVAL;
+}
+
+/**
+ * rsi_attestation_token_continue - Continue the operation to retrieve an
+ * attestation token.
+ *
+ * @granule: {I}PA of the Granule to which the token will be written.
+ * @offset: Offset within Granule to start of buffer in bytes.
+ * @size: The size of the buffer.
+ * @len: The number of bytes written to the buffer.
+ *
+ * Retrieves up to a RSI_GRANULE_SIZE worth of token data per call. The caller
+ * is expected to call rsi_attestation_token_init() before calling this
+ * function to retrieve the attestation token.
+ *
+ * Return:
+ * * %RSI_SUCCESS - Attestation token retrieved successfully.
+ * * %RSI_INCOMPLETE - Token generation is not complete.
+ * * %RSI_ERROR_INPUT - A parameter was not valid.
+ * * %RSI_ERROR_STATE - Attestation not in progress.
+ */
+static inline unsigned long rsi_attestation_token_continue(phys_addr_t granule,
+ unsigned long offset,
+ unsigned long size,
+ unsigned long *len)
+{
+ struct arm_smccc_res res;
+
+ arm_smccc_1_1_invoke(SMC_RSI_ATTESTATION_TOKEN_CONTINUE,
+ granule, offset, size, 0, &res);
+
+ if (len)
+ *len = res.a1;
+ return res.a0;
+}
+
+#endif /* __ASM_RSI_CMDS_H */
diff --git a/arch/arm64/include/asm/rsi_smc.h b/arch/arm64/include/asm/rsi_smc.h
new file mode 100644
index 0000000..6cb070e
--- /dev/null
+++ b/arch/arm64/include/asm/rsi_smc.h
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 ARM Ltd.
+ */
+
+#ifndef __ASM_RSI_SMC_H_
+#define __ASM_RSI_SMC_H_
+
+#include <linux/arm-smccc.h>
+
+/*
+ * This file describes the Realm Services Interface (RSI) Application Binary
+ * Interface (ABI) for SMC calls made from within the Realm to the RMM and
+ * serviced by the RMM.
+ */
+
+/*
+ * The major version number of the RSI implementation. This is increased when
+ * the binary format or semantics of the SMC calls change.
+ */
+#define RSI_ABI_VERSION_MAJOR UL(1)
+
+/*
+ * The minor version number of the RSI implementation. This is increased when
+ * a bug is fixed, or a feature is added without breaking binary compatibility.
+ */
+#define RSI_ABI_VERSION_MINOR UL(0)
+
+#define RSI_ABI_VERSION ((RSI_ABI_VERSION_MAJOR << 16) | \
+ RSI_ABI_VERSION_MINOR)
+
+#define RSI_ABI_VERSION_GET_MAJOR(_version) ((_version) >> 16)
+#define RSI_ABI_VERSION_GET_MINOR(_version) ((_version) & 0xFFFF)
+
+#define RSI_SUCCESS UL(0)
+#define RSI_ERROR_INPUT UL(1)
+#define RSI_ERROR_STATE UL(2)
+#define RSI_INCOMPLETE UL(3)
+#define RSI_ERROR_UNKNOWN UL(4)
+
+#define SMC_RSI_FID(n) ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \
+ ARM_SMCCC_SMC_64, \
+ ARM_SMCCC_OWNER_STANDARD, \
+ n)
+
+/*
+ * Returns RSI version.
+ *
+ * arg1 == Requested interface revision
+ * ret0 == Status / error
+ * ret1 == Lower implemented interface revision
+ * ret2 == Higher implemented interface revision
+ */
+#define SMC_RSI_ABI_VERSION SMC_RSI_FID(0x190)
+
+/*
+ * Read feature register.
+ *
+ * arg1 == Feature register index
+ * ret0 == Status / error
+ * ret1 == Feature register value
+ */
+#define SMC_RSI_FEATURES SMC_RSI_FID(0x191)
+
+/*
+ * Read measurement for the current Realm.
+ *
+ * arg1 == Index, which measurements slot to read
+ * ret0 == Status / error
+ * ret1 == Measurement value, bytes: 0 - 7
+ * ret2 == Measurement value, bytes: 8 - 15
+ * ret3 == Measurement value, bytes: 16 - 23
+ * ret4 == Measurement value, bytes: 24 - 31
+ * ret5 == Measurement value, bytes: 32 - 39
+ * ret6 == Measurement value, bytes: 40 - 47
+ * ret7 == Measurement value, bytes: 48 - 55
+ * ret8 == Measurement value, bytes: 56 - 63
+ */
+#define SMC_RSI_MEASUREMENT_READ SMC_RSI_FID(0x192)
+
+/*
+ * Extend Realm Extensible Measurement (REM) value.
+ *
+ * arg1 == Index, which measurements slot to extend
+ * arg2 == Size of realm measurement in bytes, max 64 bytes
+ * arg3 == Measurement value, bytes: 0 - 7
+ * arg4 == Measurement value, bytes: 8 - 15
+ * arg5 == Measurement value, bytes: 16 - 23
+ * arg6 == Measurement value, bytes: 24 - 31
+ * arg7 == Measurement value, bytes: 32 - 39
+ * arg8 == Measurement value, bytes: 40 - 47
+ * arg9 == Measurement value, bytes: 48 - 55
+ * arg10 == Measurement value, bytes: 56 - 63
+ * ret0 == Status / error
+ */
+#define SMC_RSI_MEASUREMENT_EXTEND SMC_RSI_FID(0x193)
+
+/*
+ * Initialize the operation to retrieve an attestation token.
+ *
+ * arg1 == Challenge value, bytes: 0 - 7
+ * arg2 == Challenge value, bytes: 8 - 15
+ * arg3 == Challenge value, bytes: 16 - 23
+ * arg4 == Challenge value, bytes: 24 - 31
+ * arg5 == Challenge value, bytes: 32 - 39
+ * arg6 == Challenge value, bytes: 40 - 47
+ * arg7 == Challenge value, bytes: 48 - 55
+ * arg8 == Challenge value, bytes: 56 - 63
+ * ret0 == Status / error
+ * ret1 == Upper bound of token size in bytes
+ */
+#define SMC_RSI_ATTESTATION_TOKEN_INIT SMC_RSI_FID(0x194)
+
+/*
+ * Continue the operation to retrieve an attestation token.
+ *
+ * arg1 == The IPA of token buffer
+ * arg2 == Offset within the granule of the token buffer
+ * arg3 == Size of the granule buffer
+ * ret0 == Status / error
+ * ret1 == Length of token bytes copied to the granule buffer
+ */
+#define SMC_RSI_ATTESTATION_TOKEN_CONTINUE SMC_RSI_FID(0x195)
+
+#ifndef __ASSEMBLY__
+
+struct realm_config {
+ union {
+ struct {
+ unsigned long ipa_bits; /* Width of IPA in bits */
+ unsigned long hash_algo; /* Hash algorithm */
+ };
+ u8 pad[0x200];
+ };
+ union {
+ u8 rpv[64]; /* Realm Personalization Value */
+ u8 pad2[0xe00];
+ };
+ /*
+ * The RMM requires the configuration structure to be aligned to a 4k
+ * boundary, ensure this happens by aligning this structure.
+ */
+} __aligned(0x1000);
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * Read configuration for the current Realm.
+ *
+ * arg1 == struct realm_config addr
+ * ret0 == Status / error
+ */
+#define SMC_RSI_REALM_CONFIG SMC_RSI_FID(0x196)
+
+/*
+ * Request RIPAS of a target IPA range to be changed to a specified value.
+ *
+ * arg1 == Base IPA address of target region
+ * arg2 == Top of the region
+ * arg3 == RIPAS value
+ * arg4 == flags
+ * ret0 == Status / error
+ * ret1 == Top of modified IPA range
+ * ret2 == Whether the Host accepted or rejected the request
+ */
+#define SMC_RSI_IPA_STATE_SET SMC_RSI_FID(0x197)
+
+#define RSI_NO_CHANGE_DESTROYED UL(0)
+#define RSI_CHANGE_DESTROYED UL(1)
+
+#define RSI_ACCEPT UL(0)
+#define RSI_REJECT UL(1)
+
+/*
+ * Get RIPAS of a target IPA range.
+ *
+ * arg1 == Base IPA of target region
+ * arg2 == End of target IPA region
+ * ret0 == Status / error
+ * ret1 == Top of IPA region which has the reported RIPAS value
+ * ret2 == RIPAS value
+ */
+#define SMC_RSI_IPA_STATE_GET SMC_RSI_FID(0x198)
+
+/*
+ * Make a Host call.
+ *
+ * arg1 == IPA of host call structure
+ * ret0 == Status / error
+ */
+#define SMC_RSI_HOST_CALL SMC_RSI_FID(0x199)
+
+#endif /* __ASM_RSI_SMC_H_ */
diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h
index 2e010ea..a76f9b3 100644
--- a/arch/arm64/include/asm/scs.h
+++ b/arch/arm64/include/asm/scs.h
@@ -46,8 +46,14 @@ static inline void dynamic_scs_init(void)
static inline void dynamic_scs_init(void) {}
#endif
+enum {
+ EDYNSCS_INVALID_CIE_HEADER = 1,
+ EDYNSCS_INVALID_CIE_SDATA_SIZE = 2,
+ EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE = 3,
+ EDYNSCS_INVALID_CFA_OPCODE = 4,
+};
+
int __pi_scs_patch(const u8 eh_frame[], int size);
-asmlinkage void __pi_scs_patch_vmlinux(void);
#endif /* __ASSEMBLY __ */
diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h
index 917761f..37774c7 100644
--- a/arch/arm64/include/asm/set_memory.h
+++ b/arch/arm64/include/asm/set_memory.h
@@ -15,4 +15,7 @@ int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
bool kernel_page_present(struct page *page);
+int set_memory_encrypted(unsigned long addr, int numpages);
+int set_memory_decrypted(unsigned long addr, int numpages);
+
#endif /* _ASM_ARM64_SET_MEMORY_H */
diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h
index f63dc65..821a8fd 100644
--- a/arch/arm64/include/asm/stacktrace/common.h
+++ b/arch/arm64/include/asm/stacktrace/common.h
@@ -60,13 +60,27 @@ static inline void unwind_init_common(struct unwind_state *state)
state->stack = stackinfo_get_unknown();
}
-static struct stack_info *unwind_find_next_stack(const struct unwind_state *state,
- unsigned long sp,
- unsigned long size)
+/**
+ * unwind_find_stack() - Find the accessible stack which entirely contains an
+ * object.
+ *
+ * @state: the current unwind state.
+ * @sp: the base address of the object.
+ * @size: the size of the object.
+ *
+ * Return: a pointer to the relevant stack_info if found; NULL otherwise.
+ */
+static struct stack_info *unwind_find_stack(struct unwind_state *state,
+ unsigned long sp,
+ unsigned long size)
{
- for (int i = 0; i < state->nr_stacks; i++) {
- struct stack_info *info = &state->stacks[i];
+ struct stack_info *info = &state->stack;
+ if (stackinfo_on_stack(info, sp, size))
+ return info;
+
+ for (int i = 0; i < state->nr_stacks; i++) {
+ info = &state->stacks[i];
if (stackinfo_on_stack(info, sp, size))
return info;
}
@@ -75,36 +89,31 @@ static struct stack_info *unwind_find_next_stack(const struct unwind_state *stat
}
/**
- * unwind_consume_stack() - Check if an object is on an accessible stack,
- * updating stack boundaries so that future unwind steps cannot consume this
- * object again.
+ * unwind_consume_stack() - Update stack boundaries so that future unwind steps
+ * cannot consume this object again.
*
* @state: the current unwind state.
+ * @info: the stack_info of the stack containing the object.
* @sp: the base address of the object.
* @size: the size of the object.
*
* Return: 0 upon success, an error code otherwise.
*/
-static inline int unwind_consume_stack(struct unwind_state *state,
- unsigned long sp,
- unsigned long size)
+static inline void unwind_consume_stack(struct unwind_state *state,
+ struct stack_info *info,
+ unsigned long sp,
+ unsigned long size)
{
- struct stack_info *next;
-
- if (stackinfo_on_stack(&state->stack, sp, size))
- goto found;
-
- next = unwind_find_next_stack(state, sp, size);
- if (!next)
- return -EINVAL;
+ struct stack_info tmp;
/*
* Stack transitions are strictly one-way, and once we've
* transitioned from one stack to another, it's never valid to
* unwind back to the old stack.
*
- * Remove the current stack from the list of stacks so that it cannot
- * be found on a subsequent transition.
+ * Destroy the old stack info so that it cannot be found upon a
+ * subsequent transition. If the stack has not changed, we'll
+ * immediately restore the current stack info.
*
* Note that stacks can nest in several valid orders, e.g.
*
@@ -115,16 +124,15 @@ static inline int unwind_consume_stack(struct unwind_state *state,
* ... so we do not check the specific order of stack
* transitions.
*/
- state->stack = *next;
- *next = stackinfo_get_unknown();
+ tmp = *info;
+ *info = stackinfo_get_unknown();
+ state->stack = tmp;
-found:
/*
* Future unwind steps can only consume stack above this frame record.
* Update the current stack to start immediately above it.
*/
state->stack.low = sp + size;
- return 0;
}
/**
@@ -137,21 +145,25 @@ static inline int unwind_consume_stack(struct unwind_state *state,
static inline int
unwind_next_frame_record(struct unwind_state *state)
{
+ struct stack_info *info;
+ struct frame_record *record;
unsigned long fp = state->fp;
- int err;
if (fp & 0x7)
return -EINVAL;
- err = unwind_consume_stack(state, fp, 16);
- if (err)
- return err;
+ info = unwind_find_stack(state, fp, sizeof(*record));
+ if (!info)
+ return -EINVAL;
+
+ unwind_consume_stack(state, info, fp, sizeof(*record));
/*
* Record this frame record's values.
*/
- state->fp = READ_ONCE(*(unsigned long *)(fp));
- state->pc = READ_ONCE(*(unsigned long *)(fp + 8));
+ record = (struct frame_record *)fp;
+ state->fp = READ_ONCE(record->fp);
+ state->pc = READ_ONCE(record->lr);
return 0;
}
diff --git a/arch/arm64/include/asm/stacktrace/frame.h b/arch/arm64/include/asm/stacktrace/frame.h
new file mode 100644
index 0000000..0ee0f6b
--- /dev/null
+++ b/arch/arm64/include/asm/stacktrace/frame.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_STACKTRACE_FRAME_H
+#define __ASM_STACKTRACE_FRAME_H
+
+/*
+ * - FRAME_META_TYPE_NONE
+ *
+ * This value is reserved.
+ *
+ * - FRAME_META_TYPE_FINAL
+ *
+ * The record is the last entry on the stack.
+ * Unwinding should terminate successfully.
+ *
+ * - FRAME_META_TYPE_PT_REGS
+ *
+ * The record is embedded within a struct pt_regs, recording the registers at
+ * an arbitrary point in time.
+ * Unwinding should consume pt_regs::pc, followed by pt_regs::lr.
+ *
+ * Note: all other values are reserved and should result in unwinding
+ * terminating with an error.
+ */
+#define FRAME_META_TYPE_NONE 0
+#define FRAME_META_TYPE_FINAL 1
+#define FRAME_META_TYPE_PT_REGS 2
+
+#ifndef __ASSEMBLY__
+/*
+ * A standard AAPCS64 frame record.
+ */
+struct frame_record {
+ u64 fp;
+ u64 lr;
+};
+
+/*
+ * A metadata frame record indicating a special unwind.
+ * The record::{fp,lr} fields must be zero to indicate the presence of
+ * metadata.
+ */
+struct frame_record_meta {
+ struct frame_record record;
+ u64 type;
+};
+#endif /* __ASSEMBLY */
+
+#endif /* __ASM_STACKTRACE_FRAME_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 9ea97dd..9c98ff4 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -1101,6 +1101,26 @@
/* Initial value for Permission Overlay Extension for EL0 */
#define POR_EL0_INIT POE_RXW
+/*
+ * Definitions for Guarded Control Stack
+ */
+
+#define GCS_CAP_ADDR_MASK GENMASK(63, 12)
+#define GCS_CAP_ADDR_SHIFT 12
+#define GCS_CAP_ADDR_WIDTH 52
+#define GCS_CAP_ADDR(x) FIELD_GET(GCS_CAP_ADDR_MASK, x)
+
+#define GCS_CAP_TOKEN_MASK GENMASK(11, 0)
+#define GCS_CAP_TOKEN_SHIFT 0
+#define GCS_CAP_TOKEN_WIDTH 12
+#define GCS_CAP_TOKEN(x) FIELD_GET(GCS_CAP_TOKEN_MASK, x)
+
+#define GCS_CAP_VALID_TOKEN 0x1
+#define GCS_CAP_IN_PROGRESS_TOKEN 0x5
+
+#define GCS_CAP(x) ((((unsigned long)x) & GCS_CAP_ADDR_MASK) | \
+ GCS_CAP_VALID_TOKEN)
+
#define ARM64_FEATURE_FIELD_BITS 4
/* Defined for compatibility only, do not add new users. */
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 95fbc8c..bc94e03 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -431,6 +431,23 @@ do { \
#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
+static inline bool __flush_tlb_range_limit_excess(unsigned long start,
+ unsigned long end, unsigned long pages, unsigned long stride)
+{
+ /*
+ * When the system does not support TLB range based flush
+ * operation, (MAX_DVM_OPS - 1) pages can be handled. But
+ * with TLB range based operation, MAX_TLBI_RANGE_PAGES
+ * pages can be handled.
+ */
+ if ((!system_supports_tlb_range() &&
+ (end - start) >= (MAX_DVM_OPS * stride)) ||
+ pages > MAX_TLBI_RANGE_PAGES)
+ return true;
+
+ return false;
+}
+
static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long stride, bool last_level,
@@ -442,15 +459,7 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
end = round_up(end, stride);
pages = (end - start) >> PAGE_SHIFT;
- /*
- * When not uses TLB range ops, we can handle up to
- * (MAX_DVM_OPS - 1) pages;
- * When uses TLB range ops, we can handle up to
- * MAX_TLBI_RANGE_PAGES pages.
- */
- if ((!system_supports_tlb_range() &&
- (end - start) >= (MAX_DVM_OPS * stride)) ||
- pages > MAX_TLBI_RANGE_PAGES) {
+ if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
flush_tlb_mm(vma->vm_mm);
return;
}
@@ -492,19 +501,21 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- unsigned long addr;
+ const unsigned long stride = PAGE_SIZE;
+ unsigned long pages;
- if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
+ start = round_down(start, stride);
+ end = round_up(end, stride);
+ pages = (end - start) >> PAGE_SHIFT;
+
+ if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
flush_tlb_all();
return;
}
- start = __TLBI_VADDR(start, 0);
- end = __TLBI_VADDR(end, 0);
-
dsb(ishst);
- for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
- __tlbi(vaale1is, addr);
+ __flush_tlb_range_op(vaale1is, start, pages, stride, 0,
+ TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
dsb(ish);
isb();
}
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 1aa4ecb..5b91803 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -502,4 +502,44 @@ static inline size_t probe_subpage_writeable(const char __user *uaddr,
#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */
+#ifdef CONFIG_ARM64_GCS
+
+static inline int gcssttr(unsigned long __user *addr, unsigned long val)
+{
+ register unsigned long __user *_addr __asm__ ("x0") = addr;
+ register unsigned long _val __asm__ ("x1") = val;
+ int err = 0;
+
+ /* GCSSTTR x1, x0 */
+ asm volatile(
+ "1: .inst 0xd91f1c01\n"
+ "2: \n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0)
+ : "+r" (err)
+ : "rZ" (_val), "r" (_addr)
+ : "memory");
+
+ return err;
+}
+
+static inline void put_user_gcs(unsigned long val, unsigned long __user *addr,
+ int *err)
+{
+ int ret;
+
+ if (!access_ok((char __user *)addr, sizeof(u64))) {
+ *err = -EFAULT;
+ return;
+ }
+
+ uaccess_ttbr0_enable();
+ ret = gcssttr(addr, val);
+ if (ret != 0)
+ *err = ret;
+ uaccess_ttbr0_disable();
+}
+
+
+#endif /* CONFIG_ARM64_GCS */
+
#endif /* __ASM_UACCESS_H */
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 055381b..48d46b7 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -21,7 +21,7 @@
* HWCAP flags - for AT_HWCAP
*
* Bits 62 and 63 are reserved for use by libc.
- * Bits 32-61 are unallocated for potential use by libc.
+ * Bits 33-61 are unallocated for potential use by libc.
*/
#define HWCAP_FP (1 << 0)
#define HWCAP_ASIMD (1 << 1)
@@ -55,6 +55,7 @@
#define HWCAP_SB (1 << 29)
#define HWCAP_PACA (1 << 30)
#define HWCAP_PACG (1UL << 31)
+#define HWCAP_GCS (1UL << 32)
/*
* HWCAP2 flags - for AT_HWCAP2
@@ -124,4 +125,8 @@
#define HWCAP2_SME_SF8DP2 (1UL << 62)
#define HWCAP2_POE (1UL << 63)
+/*
+ * HWCAP3 flags - for AT_HWCAP3
+ */
+
#endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index 7fa2f703..0f39ba4 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -324,6 +324,14 @@ struct user_za_header {
#define ZA_PT_SIZE(vq) \
(ZA_PT_ZA_OFFSET + ZA_PT_ZA_SIZE(vq))
+/* GCS state (NT_ARM_GCS) */
+
+struct user_gcs {
+ __u64 features_enabled;
+ __u64 features_locked;
+ __u64 gcspr_el0;
+};
+
#endif /* __ASSEMBLY__ */
#endif /* _UAPI__ASM_PTRACE_H */
diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index bb7af77..d42f7a9 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -183,6 +183,15 @@ struct zt_context {
__u16 __reserved[3];
};
+#define GCS_MAGIC 0x47435300
+
+struct gcs_context {
+ struct _aarch64_ctx head;
+ __u64 gcspr;
+ __u64 features_enabled;
+ __u64 reserved;
+};
+
#endif /* !__ASSEMBLY__ */
#include <asm/sve_context.h>
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 2b112f3..71c29a2 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -33,7 +33,8 @@
return_address.o cpuinfo.o cpu_errata.o \
cpufeature.o alternative.o cacheinfo.o \
smp.o smp_spin_table.o topology.o smccc-call.o \
- syscall.o proton-pack.o idle.o patching.o pi/
+ syscall.o proton-pack.o idle.o patching.o pi/ \
+ rsi.o
obj-$(CONFIG_COMPAT) += sys32.o signal32.o \
sys_compat.o
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index b21dd24..f89df36 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -12,15 +12,12 @@
#include <linux/ftrace.h>
#include <linux/kexec.h>
#include <linux/mm.h>
-#include <linux/dma-mapping.h>
#include <linux/kvm_host.h>
-#include <linux/preempt.h>
#include <linux/suspend.h>
#include <asm/cpufeature.h>
#include <asm/fixmap.h>
#include <asm/thread_info.h>
#include <asm/memory.h>
-#include <asm/signal32.h>
#include <asm/smp_plat.h>
#include <asm/suspend.h>
#include <linux/kbuild.h>
@@ -28,8 +25,6 @@
int main(void)
{
- DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm));
- BLANK();
DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu));
DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags));
DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count));
@@ -79,8 +74,9 @@ int main(void)
DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate));
DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno));
DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1));
- DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save));
+ DEFINE(S_PMR, offsetof(struct pt_regs, pmr));
DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe));
+ DEFINE(S_STACKFRAME_TYPE, offsetof(struct pt_regs, stackframe.type));
DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs));
BLANK();
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
@@ -99,25 +95,6 @@ int main(void)
DEFINE(FREGS_SIZE, sizeof(struct ftrace_regs));
BLANK();
#endif
-#ifdef CONFIG_COMPAT
- DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0));
- DEFINE(COMPAT_RT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_rt_sigframe, sig.uc.uc_mcontext.arm_r0));
- BLANK();
-#endif
- DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter));
- BLANK();
- DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm));
- DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags));
- BLANK();
- DEFINE(VM_EXEC, VM_EXEC);
- BLANK();
- DEFINE(PAGE_SZ, PAGE_SIZE);
- BLANK();
- DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE);
- DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE);
- BLANK();
- DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET);
- BLANK();
DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task));
BLANK();
DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val));
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 718728a..351aa82 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -103,6 +103,7 @@ static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly;
COMPAT_HWCAP_LPAE)
unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT;
unsigned int compat_elf_hwcap2 __read_mostly;
+unsigned int compat_elf_hwcap3 __read_mostly;
#endif
DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS);
@@ -228,6 +229,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
};
static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0),
@@ -291,6 +293,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
};
static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
+ ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0),
@@ -2358,6 +2362,14 @@ static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused)
}
#endif
+#ifdef CONFIG_ARM64_GCS
+static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused)
+{
+ /* GCSPR_EL0 is always readable */
+ write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1);
+}
+#endif
+
/* Internal helper functions to match cpu capability type */
static bool
cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
@@ -2591,6 +2603,21 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM)
},
#endif
+#ifdef CONFIG_ARM64_HAFT
+ {
+ .desc = "Hardware managed Access Flag for Table Descriptors",
+ /*
+ * Contrary to the page/block access flag, the table access flag
+ * cannot be emulated in software (no access fault will occur).
+ * Therefore this should be used only if it's supported system
+ * wide.
+ */
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .capability = ARM64_HAFT,
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT)
+ },
+#endif
{
.desc = "CRC32 instructions",
.capability = ARM64_HAS_CRC32,
@@ -2890,6 +2917,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP)
},
#endif
+#ifdef CONFIG_ARM64_GCS
+ {
+ .desc = "Guarded Control Stack (GCS)",
+ .capability = ARM64_HAS_GCS,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .cpu_enable = cpu_enable_gcs,
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP)
+ },
+#endif
{},
};
@@ -3006,6 +3043,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM),
HWCAP_CAP(ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM),
#endif
+#ifdef CONFIG_ARM64_GCS
+ HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS),
+#endif
HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS),
#ifdef CONFIG_ARM64_BTI
HWCAP_CAP(ID_AA64PFR1_EL1, BT, IMP, CAP_HWCAP, KERNEL_HWCAP_BTI),
@@ -3499,6 +3539,11 @@ unsigned long cpu_get_elf_hwcap2(void)
return elf_hwcap[1];
}
+unsigned long cpu_get_elf_hwcap3(void)
+{
+ return elf_hwcap[2];
+}
+
static void __init setup_boot_cpu_capabilities(void)
{
/*
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 44718d0..f2f92c6 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -80,6 +80,7 @@ static const char *const hwcap_str[] = {
[KERNEL_HWCAP_SB] = "sb",
[KERNEL_HWCAP_PACA] = "paca",
[KERNEL_HWCAP_PACG] = "pacg",
+ [KERNEL_HWCAP_GCS] = "gcs",
[KERNEL_HWCAP_DCPODP] = "dcpodp",
[KERNEL_HWCAP_SVE2] = "sve2",
[KERNEL_HWCAP_SVEAES] = "sveaes",
diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c
index 024a7b2..58f047d 100644
--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -303,7 +303,6 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr)
{
struct break_hook *hook;
struct list_head *list;
- int (*fn)(struct pt_regs *regs, unsigned long esr) = NULL;
list = user_mode(regs) ? &user_break_hook : &kernel_break_hook;
@@ -313,10 +312,10 @@ static int call_break_hook(struct pt_regs *regs, unsigned long esr)
*/
list_for_each_entry_rcu(hook, list, node) {
if ((esr_brk_comment(esr) & ~hook->mask) == hook->imm)
- fn = hook->fn;
+ return hook->fn(regs, esr);
}
- return fn ? fn(regs, esr) : DBG_HOOK_ERROR;
+ return DBG_HOOK_ERROR;
}
NOKPROBE_SYMBOL(call_break_hook);
@@ -441,6 +440,11 @@ void kernel_rewind_single_step(struct pt_regs *regs)
set_regs_spsr_ss(regs);
}
+void kernel_fastforward_single_step(struct pt_regs *regs)
+{
+ clear_regs_spsr_ss(regs);
+}
+
/* ptrace API */
void user_enable_single_step(struct task_struct *task)
{
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 712718a..1d25d88 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -34,8 +34,16 @@ static __init pteval_t create_mapping_protection(efi_memory_desc_t *md)
u64 attr = md->attribute;
u32 type = md->type;
- if (type == EFI_MEMORY_MAPPED_IO)
- return PROT_DEVICE_nGnRE;
+ if (type == EFI_MEMORY_MAPPED_IO) {
+ pgprot_t prot = __pgprot(PROT_DEVICE_nGnRE);
+
+ if (arm64_is_protected_mmio(md->phys_addr,
+ md->num_pages << EFI_PAGE_SHIFT))
+ prot = pgprot_encrypted(prot);
+ else
+ prot = pgprot_decrypted(prot);
+ return pgprot_val(prot);
+ }
if (region_is_misaligned(md)) {
static bool __initdata code_is_misaligned;
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 3fcd9d0..b260ddc 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -463,6 +463,24 @@ static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr)
exit_to_kernel_mode(regs);
}
+static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr)
+{
+ enter_from_kernel_mode(regs);
+ local_daif_inherit(regs);
+ do_el1_gcs(regs, esr);
+ local_daif_mask();
+ exit_to_kernel_mode(regs);
+}
+
+static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr)
+{
+ enter_from_kernel_mode(regs);
+ local_daif_inherit(regs);
+ do_el1_mops(regs, esr);
+ local_daif_mask();
+ exit_to_kernel_mode(regs);
+}
+
static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr)
{
unsigned long far = read_sysreg(far_el1);
@@ -505,6 +523,12 @@ asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
case ESR_ELx_EC_BTI:
el1_bti(regs, esr);
break;
+ case ESR_ELx_EC_GCS:
+ el1_gcs(regs, esr);
+ break;
+ case ESR_ELx_EC_MOPS:
+ el1_mops(regs, esr);
+ break;
case ESR_ELx_EC_BREAKPT_CUR:
case ESR_ELx_EC_SOFTSTP_CUR:
case ESR_ELx_EC_WATCHPT_CUR:
@@ -684,6 +708,14 @@ static void noinstr el0_mops(struct pt_regs *regs, unsigned long esr)
exit_to_user_mode(regs);
}
+static void noinstr el0_gcs(struct pt_regs *regs, unsigned long esr)
+{
+ enter_from_user_mode(regs);
+ local_daif_restore(DAIF_PROCCTX);
+ do_el0_gcs(regs, esr);
+ exit_to_user_mode(regs);
+}
+
static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr)
{
enter_from_user_mode(regs);
@@ -766,6 +798,9 @@ asmlinkage void noinstr el0t_64_sync_handler(struct pt_regs *regs)
case ESR_ELx_EC_MOPS:
el0_mops(regs, esr);
break;
+ case ESR_ELx_EC_GCS:
+ el0_gcs(regs, esr);
+ break;
case ESR_ELx_EC_BREAKPT_LOW:
case ESR_ELx_EC_SOFTSTP_LOW:
case ESR_ELx_EC_WATCHPT_LOW:
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 7ef0e12..5ae2a34 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -25,6 +25,7 @@
#include <asm/processor.h>
#include <asm/ptrace.h>
#include <asm/scs.h>
+#include <asm/stacktrace/frame.h>
#include <asm/thread_info.h>
#include <asm/asm-uaccess.h>
#include <asm/unistd.h>
@@ -284,15 +285,16 @@
stp lr, x21, [sp, #S_LR]
/*
- * For exceptions from EL0, create a final frame record.
- * For exceptions from EL1, create a synthetic frame record so the
- * interrupted code shows up in the backtrace.
+ * Create a metadata frame record. The unwinder will use this to
+ * identify and unwind exception boundaries.
*/
- .if \el == 0
stp xzr, xzr, [sp, #S_STACKFRAME]
+ .if \el == 0
+ mov x0, #FRAME_META_TYPE_FINAL
.else
- stp x29, x22, [sp, #S_STACKFRAME]
+ mov x0, #FRAME_META_TYPE_PT_REGS
.endif
+ str x0, [sp, #S_STACKFRAME_TYPE]
add x29, sp, #S_STACKFRAME
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
@@ -315,7 +317,7 @@
alternative_else_nop_endif
mrs_s x20, SYS_ICC_PMR_EL1
- str x20, [sp, #S_PMR_SAVE]
+ str w20, [sp, #S_PMR]
mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET
msr_s SYS_ICC_PMR_EL1, x20
@@ -342,7 +344,7 @@
b .Lskip_pmr_restore\@
alternative_else_nop_endif
- ldr x20, [sp, #S_PMR_SAVE]
+ ldr w20, [sp, #S_PMR]
msr_s SYS_ICC_PMR_EL1, x20
/* Ensure priority change is seen by redistributor */
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 6d21971..8c4c1a2 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -386,7 +386,7 @@ static void task_fpsimd_load(void)
* fpsimd_save_user_state() or memory corruption, we
* should always record an explicit format
* when we save. We always at least have the
- * memory allocated for FPSMID registers so
+ * memory allocated for FPSIMD registers so
* try that and hope for the best.
*/
WARN_ON_ONCE(1);
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index cb68adc..5ab1970 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -32,6 +32,7 @@
#include <asm/scs.h>
#include <asm/smp.h>
#include <asm/sysreg.h>
+#include <asm/stacktrace/frame.h>
#include <asm/thread_info.h>
#include <asm/virt.h>
@@ -199,6 +200,8 @@
sub sp, sp, #PT_REGS_SIZE
stp xzr, xzr, [sp, #S_STACKFRAME]
+ mov \tmp1, #FRAME_META_TYPE_FINAL
+ str \tmp1, [sp, #S_STACKFRAME_TYPE]
add x29, sp, #S_STACKFRAME
scs_load_current
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 7b11d84..18749e9 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -266,9 +266,15 @@ static int swsusp_mte_save_tags(void)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
struct page *page = pfn_to_online_page(pfn);
+ struct folio *folio;
if (!page)
continue;
+ folio = page_folio(page);
+
+ if (folio_test_hugetlb(folio) &&
+ !folio_test_hugetlb_mte_tagged(folio))
+ continue;
if (!page_mte_tagged(page))
continue;
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 36b25af..06bb680 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -462,14 +462,20 @@ int module_finalize(const Elf_Ehdr *hdr,
struct module *me)
{
const Elf_Shdr *s;
+ int ret;
+
s = find_section(hdr, sechdrs, ".altinstructions");
if (s)
apply_alternatives_module((void *)s->sh_addr, s->sh_size);
if (scs_is_dynamic()) {
s = find_section(hdr, sechdrs, ".init.eh_frame");
- if (s)
- __pi_scs_patch((void *)s->sh_addr, s->sh_size);
+ if (s) {
+ ret = __pi_scs_patch((void *)s->sh_addr, s->sh_size);
+ if (ret)
+ pr_err("module %s: error occurred during dynamic SCS patching (%d)\n",
+ me->name, ret);
+ }
}
return module_init_ftrace_plt(hdr, sechdrs, me);
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 6174671..2fbfd27 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -38,7 +38,24 @@ EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode);
void mte_sync_tags(pte_t pte, unsigned int nr_pages)
{
struct page *page = pte_page(pte);
- unsigned int i;
+ struct folio *folio = page_folio(page);
+ unsigned long i;
+
+ if (folio_test_hugetlb(folio)) {
+ unsigned long nr = folio_nr_pages(folio);
+
+ /* Hugetlb MTE flags are set for head page only */
+ if (folio_try_hugetlb_mte_tagging(folio)) {
+ for (i = 0; i < nr; i++, page++)
+ mte_clear_page_tags(page_address(page));
+ folio_set_hugetlb_mte_tagged(folio);
+ }
+
+ /* ensure the tags are visible before the PTE is set */
+ smp_wmb();
+
+ return;
+ }
/* if PG_mte_tagged is set, tags have already been initialised */
for (i = 0; i < nr_pages; i++, page++) {
@@ -410,6 +427,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
void *maddr;
struct page *page = get_user_page_vma_remote(mm, addr,
gup_flags, &vma);
+ struct folio *folio;
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -428,7 +446,12 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
put_page(page);
break;
}
- WARN_ON_ONCE(!page_mte_tagged(page));
+
+ folio = page_folio(page);
+ if (folio_test_hugetlb(folio))
+ WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio));
+ else
+ WARN_ON_ONCE(!page_mte_tagged(page));
/* limit access to the end of the page */
offset = offset_in_page(addr);
diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c
index 29d4b62..2215925 100644
--- a/arch/arm64/kernel/pi/idreg-override.c
+++ b/arch/arm64/kernel/pi/idreg-override.c
@@ -38,6 +38,15 @@ struct ftr_set_desc {
#define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f }
+static const struct ftr_set_desc mmfr0 __prel64_initconst = {
+ .name = "id_aa64mmfr0",
+ .override = &id_aa64mmfr0_override,
+ .fields = {
+ FIELD("ecv", ID_AA64MMFR0_EL1_ECV_SHIFT, NULL),
+ {}
+ },
+};
+
static bool __init mmfr1_vh_filter(u64 val)
{
/*
@@ -133,6 +142,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = {
.override = &id_aa64pfr1_override,
.fields = {
FIELD("bt", ID_AA64PFR1_EL1_BT_SHIFT, NULL ),
+ FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL),
FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL),
FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter),
{}
@@ -196,6 +206,7 @@ static const struct ftr_set_desc sw_features __prel64_initconst = {
static const
PREL64(const struct ftr_set_desc, reg) regs[] __prel64_initconst = {
+ { &mmfr0 },
{ &mmfr1 },
{ &mmfr2 },
{ &pfr0 },
@@ -215,6 +226,7 @@ static const struct {
{ "arm64.nosve", "id_aa64pfr0.sve=0" },
{ "arm64.nosme", "id_aa64pfr1.sme=0" },
{ "arm64.nobti", "id_aa64pfr1.bt=0" },
+ { "arm64.nogcs", "id_aa64pfr1.gcs=0" },
{ "arm64.nopauth",
"id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 "
"id_aa64isar1.api=0 id_aa64isar1.apa=0 "
diff --git a/arch/arm64/kernel/pi/map_range.c b/arch/arm64/kernel/pi/map_range.c
index 5410b2c..2b69e3b 100644
--- a/arch/arm64/kernel/pi/map_range.c
+++ b/arch/arm64/kernel/pi/map_range.c
@@ -30,7 +30,7 @@ void __init map_range(u64 *pte, u64 start, u64 end, u64 pa, pgprot_t prot,
int level, pte_t *tbl, bool may_use_cont, u64 va_offset)
{
u64 cmask = (level == 3) ? CONT_PTE_SIZE - 1 : U64_MAX;
- u64 protval = pgprot_val(prot) & ~PTE_TYPE_MASK;
+ pteval_t protval = pgprot_val(prot) & ~PTE_TYPE_MASK;
int lshift = (3 - level) * (PAGE_SHIFT - 3);
u64 lmask = (PAGE_SIZE << lshift) - 1;
diff --git a/arch/arm64/kernel/pi/patch-scs.c b/arch/arm64/kernel/pi/patch-scs.c
index 49d8b40..55d0cd6 100644
--- a/arch/arm64/kernel/pi/patch-scs.c
+++ b/arch/arm64/kernel/pi/patch-scs.c
@@ -50,6 +50,10 @@ bool dynamic_scs_is_enabled;
#define DW_CFA_GNU_negative_offset_extended 0x2f
#define DW_CFA_hi_user 0x3f
+#define DW_EH_PE_sdata4 0x0b
+#define DW_EH_PE_sdata8 0x0c
+#define DW_EH_PE_pcrel 0x10
+
enum {
PACIASP = 0xd503233f,
AUTIASP = 0xd50323bf,
@@ -120,7 +124,12 @@ struct eh_frame {
union {
struct { // CIE
u8 version;
- u8 augmentation_string[];
+ u8 augmentation_string[3];
+ u8 code_alignment_factor;
+ u8 data_alignment_factor;
+ u8 return_address_register;
+ u8 augmentation_data_size;
+ u8 fde_pointer_format;
};
struct { // FDE
@@ -128,30 +137,39 @@ struct eh_frame {
s32 range;
u8 opcodes[];
};
+
+ struct { // FDE
+ s64 initial_loc64;
+ s64 range64;
+ u8 opcodes64[];
+ };
};
};
static int scs_handle_fde_frame(const struct eh_frame *frame,
- bool fde_has_augmentation_data,
int code_alignment_factor,
+ bool use_sdata8,
bool dry_run)
{
int size = frame->size - offsetof(struct eh_frame, opcodes) + 4;
u64 loc = (u64)offset_to_ptr(&frame->initial_loc);
const u8 *opcode = frame->opcodes;
+ int l;
- if (fde_has_augmentation_data) {
- int l;
-
- // assume single byte uleb128_t
- if (WARN_ON(*opcode & BIT(7)))
- return -ENOEXEC;
-
- l = *opcode++;
- opcode += l;
- size -= l + 1;
+ if (use_sdata8) {
+ loc = (u64)&frame->initial_loc64 + frame->initial_loc64;
+ opcode = frame->opcodes64;
+ size -= 8;
}
+ // assume single byte uleb128_t for augmentation data size
+ if (*opcode & BIT(7))
+ return EDYNSCS_INVALID_FDE_AUGM_DATA_SIZE;
+
+ l = *opcode++;
+ opcode += l;
+ size -= l + 1;
+
/*
* Starting from 'loc', apply the CFA opcodes that advance the location
* pointer, and identify the locations of the PAC instructions.
@@ -201,7 +219,7 @@ static int scs_handle_fde_frame(const struct eh_frame *frame,
break;
default:
- return -ENOEXEC;
+ return EDYNSCS_INVALID_CFA_OPCODE;
}
}
return 0;
@@ -209,12 +227,12 @@ static int scs_handle_fde_frame(const struct eh_frame *frame,
int scs_patch(const u8 eh_frame[], int size)
{
+ int code_alignment_factor = 1;
+ bool fde_use_sdata8 = false;
const u8 *p = eh_frame;
while (size > 4) {
const struct eh_frame *frame = (const void *)p;
- bool fde_has_augmentation_data = true;
- int code_alignment_factor = 1;
int ret;
if (frame->size == 0 ||
@@ -223,28 +241,47 @@ int scs_patch(const u8 eh_frame[], int size)
break;
if (frame->cie_id_or_pointer == 0) {
- const u8 *p = frame->augmentation_string;
-
- /* a 'z' in the augmentation string must come first */
- fde_has_augmentation_data = *p == 'z';
+ /*
+ * Require presence of augmentation data (z) with a
+ * specifier for the size of the FDE initial_loc and
+ * range fields (R), and nothing else.
+ */
+ if (strcmp(frame->augmentation_string, "zR"))
+ return EDYNSCS_INVALID_CIE_HEADER;
/*
* The code alignment factor is a uleb128 encoded field
* but given that the only sensible values are 1 or 4,
- * there is no point in decoding the whole thing.
+ * there is no point in decoding the whole thing. Also
+ * sanity check the size of the data alignment factor
+ * field, and the values of the return address register
+ * and augmentation data size fields.
*/
- p += strlen(p) + 1;
- if (!WARN_ON(*p & BIT(7)))
- code_alignment_factor = *p;
+ if ((frame->code_alignment_factor & BIT(7)) ||
+ (frame->data_alignment_factor & BIT(7)) ||
+ frame->return_address_register != 30 ||
+ frame->augmentation_data_size != 1)
+ return EDYNSCS_INVALID_CIE_HEADER;
+
+ code_alignment_factor = frame->code_alignment_factor;
+
+ switch (frame->fde_pointer_format) {
+ case DW_EH_PE_pcrel | DW_EH_PE_sdata4:
+ fde_use_sdata8 = false;
+ break;
+ case DW_EH_PE_pcrel | DW_EH_PE_sdata8:
+ fde_use_sdata8 = true;
+ break;
+ default:
+ return EDYNSCS_INVALID_CIE_SDATA_SIZE;
+ }
} else {
- ret = scs_handle_fde_frame(frame,
- fde_has_augmentation_data,
- code_alignment_factor,
- true);
+ ret = scs_handle_fde_frame(frame, code_alignment_factor,
+ fde_use_sdata8, true);
if (ret)
return ret;
- scs_handle_fde_frame(frame, fde_has_augmentation_data,
- code_alignment_factor, false);
+ scs_handle_fde_frame(frame, code_alignment_factor,
+ fde_use_sdata8, false);
}
p += sizeof(frame->size) + frame->size;
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c
index 3496d61..6438bf6 100644
--- a/arch/arm64/kernel/probes/decode-insn.c
+++ b/arch/arm64/kernel/probes/decode-insn.c
@@ -58,10 +58,13 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn)
* Instructions which load PC relative literals are not going to work
* when executed from an XOL slot. Instructions doing an exclusive
* load/store are not going to complete successfully when single-step
- * exception handling happens in the middle of the sequence.
+ * exception handling happens in the middle of the sequence. Memory
+ * copy/set instructions require that all three instructions be placed
+ * consecutively in memory.
*/
if (aarch64_insn_uses_literal(insn) ||
- aarch64_insn_is_exclusive(insn))
+ aarch64_insn_is_exclusive(insn) ||
+ aarch64_insn_is_mops(insn))
return false;
return true;
@@ -73,9 +76,18 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn)
* INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot.
*/
enum probe_insn __kprobes
-arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api)
+arm_probe_decode_insn(u32 insn, struct arch_probe_insn *api)
{
/*
+ * While 'nop' instruction can execute in the out-of-line slot,
+ * simulating them in breakpoint handling offers better performance.
+ */
+ if (aarch64_insn_is_nop(insn)) {
+ api->handler = simulate_nop;
+ return INSN_GOOD_NO_SLOT;
+ }
+
+ /*
* Instructions reading or modifying the PC won't work from the XOL
* slot.
*/
@@ -133,8 +145,8 @@ enum probe_insn __kprobes
arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi)
{
enum probe_insn decoded;
- probe_opcode_t insn = le32_to_cpu(*addr);
- probe_opcode_t *scan_end = NULL;
+ u32 insn = le32_to_cpu(*addr);
+ kprobe_opcode_t *scan_end = NULL;
unsigned long size = 0, offset = 0;
struct arch_probe_insn *api = &asi->api;
diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h
index 8b758c5..0e4195d 100644
--- a/arch/arm64/kernel/probes/decode-insn.h
+++ b/arch/arm64/kernel/probes/decode-insn.h
@@ -28,6 +28,6 @@ enum probe_insn __kprobes
arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi);
#endif
enum probe_insn __kprobes
-arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi);
+arm_probe_decode_insn(u32 insn, struct arch_probe_insn *asi);
#endif /* _ARM_KERNEL_KPROBES_ARM64_H */
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index 4268678..48d88e0 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -43,7 +43,7 @@ post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *);
static void __kprobes arch_prepare_ss_slot(struct kprobe *p)
{
- kprobe_opcode_t *addr = p->ainsn.api.insn;
+ kprobe_opcode_t *addr = p->ainsn.xol_insn;
/*
* Prepare insn slot, Mark Rutland points out it depends on a coupe of
@@ -64,20 +64,20 @@ static void __kprobes arch_prepare_ss_slot(struct kprobe *p)
* the BRK exception handler, so it is unnecessary to generate
* Contex-Synchronization-Event via ISB again.
*/
- aarch64_insn_patch_text_nosync(addr, p->opcode);
+ aarch64_insn_patch_text_nosync(addr, le32_to_cpu(p->opcode));
aarch64_insn_patch_text_nosync(addr + 1, BRK64_OPCODE_KPROBES_SS);
/*
* Needs restoring of return address after stepping xol.
*/
- p->ainsn.api.restore = (unsigned long) p->addr +
+ p->ainsn.xol_restore = (unsigned long) p->addr +
sizeof(kprobe_opcode_t);
}
static void __kprobes arch_prepare_simulate(struct kprobe *p)
{
/* This instructions is not executed xol. No need to adjust the PC */
- p->ainsn.api.restore = 0;
+ p->ainsn.xol_restore = 0;
}
static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs)
@@ -85,7 +85,7 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs)
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
if (p->ainsn.api.handler)
- p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs);
+ p->ainsn.api.handler(le32_to_cpu(p->opcode), (long)p->addr, regs);
/* single step simulated, now go for post processing */
post_kprobe_handler(p, kcb, regs);
@@ -99,7 +99,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
return -EINVAL;
/* copy instruction */
- p->opcode = le32_to_cpu(*p->addr);
+ p->opcode = *p->addr;
if (search_exception_tables(probe_addr))
return -EINVAL;
@@ -110,18 +110,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
return -EINVAL;
case INSN_GOOD_NO_SLOT: /* insn need simulation */
- p->ainsn.api.insn = NULL;
+ p->ainsn.xol_insn = NULL;
break;
case INSN_GOOD: /* instruction uses slot */
- p->ainsn.api.insn = get_insn_slot();
- if (!p->ainsn.api.insn)
+ p->ainsn.xol_insn = get_insn_slot();
+ if (!p->ainsn.xol_insn)
return -ENOMEM;
break;
}
/* prepare the instruction */
- if (p->ainsn.api.insn)
+ if (p->ainsn.xol_insn)
arch_prepare_ss_slot(p);
else
arch_prepare_simulate(p);
@@ -142,15 +142,16 @@ void __kprobes arch_arm_kprobe(struct kprobe *p)
void __kprobes arch_disarm_kprobe(struct kprobe *p)
{
void *addr = p->addr;
+ u32 insn = le32_to_cpu(p->opcode);
- aarch64_insn_patch_text(&addr, &p->opcode, 1);
+ aarch64_insn_patch_text(&addr, &insn, 1);
}
void __kprobes arch_remove_kprobe(struct kprobe *p)
{
- if (p->ainsn.api.insn) {
- free_insn_slot(p->ainsn.api.insn, 0);
- p->ainsn.api.insn = NULL;
+ if (p->ainsn.xol_insn) {
+ free_insn_slot(p->ainsn.xol_insn, 0);
+ p->ainsn.xol_insn = NULL;
}
}
@@ -205,9 +206,9 @@ static void __kprobes setup_singlestep(struct kprobe *p,
}
- if (p->ainsn.api.insn) {
+ if (p->ainsn.xol_insn) {
/* prepare for single stepping */
- slot = (unsigned long)p->ainsn.api.insn;
+ slot = (unsigned long)p->ainsn.xol_insn;
kprobes_save_local_irqflag(kcb, regs);
instruction_pointer_set(regs, slot);
@@ -245,8 +246,8 @@ static void __kprobes
post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs)
{
/* return addr restore if non-branching insn */
- if (cur->ainsn.api.restore != 0)
- instruction_pointer_set(regs, cur->ainsn.api.restore);
+ if (cur->ainsn.xol_restore != 0)
+ instruction_pointer_set(regs, cur->ainsn.xol_restore);
/* restore back original saved kprobe variables and continue */
if (kcb->kprobe_status == KPROBE_REENTER) {
@@ -348,7 +349,7 @@ kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned long esr)
struct kprobe *cur = kprobe_running();
if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) &&
- ((unsigned long)&cur->ainsn.api.insn[1] == addr)) {
+ ((unsigned long)&cur->ainsn.xol_insn[1] == addr)) {
kprobes_restore_local_irqflag(kcb, regs);
post_kprobe_handler(cur, kcb, regs);
diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c
index b65334a..4c6d2d7 100644
--- a/arch/arm64/kernel/probes/simulate-insn.c
+++ b/arch/arm64/kernel/probes/simulate-insn.c
@@ -196,3 +196,9 @@ simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs)
instruction_pointer_set(regs, instruction_pointer(regs) + 4);
}
+
+void __kprobes
+simulate_nop(u32 opcode, long addr, struct pt_regs *regs)
+{
+ arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
+}
diff --git a/arch/arm64/kernel/probes/simulate-insn.h b/arch/arm64/kernel/probes/simulate-insn.h
index e065dc9..efb2803 100644
--- a/arch/arm64/kernel/probes/simulate-insn.h
+++ b/arch/arm64/kernel/probes/simulate-insn.h
@@ -16,5 +16,6 @@ void simulate_cbz_cbnz(u32 opcode, long addr, struct pt_regs *regs);
void simulate_tbz_tbnz(u32 opcode, long addr, struct pt_regs *regs);
void simulate_ldr_literal(u32 opcode, long addr, struct pt_regs *regs);
void simulate_ldrsw_literal(u32 opcode, long addr, struct pt_regs *regs);
+void simulate_nop(u32 opcode, long addr, struct pt_regs *regs);
#endif /* _ARM_KERNEL_KPROBES_SIMULATE_INSN_H */
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index a2f137a..cb3d05a 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -17,12 +17,20 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
void *xol_page_kaddr = kmap_atomic(page);
void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK);
+ /*
+ * Initial cache maintenance of the xol page done via set_pte_at().
+ * Subsequent CMOs only needed if the xol slot changes.
+ */
+ if (!memcmp(dst, src, len))
+ goto done;
+
/* Initialize the slot */
memcpy(dst, src, len);
/* flush caches (dcache/icache) */
sync_icache_aliases((unsigned long)dst, (unsigned long)dst + len);
+done:
kunmap_atomic(xol_page_kaddr);
}
@@ -34,7 +42,7 @@ unsigned long uprobe_get_swbp_addr(struct pt_regs *regs)
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
unsigned long addr)
{
- probe_opcode_t insn;
+ u32 insn;
/* TODO: Currently we do not support AARCH32 instruction probing */
if (mm->context.flags & MMCF_AARCH32)
@@ -102,7 +110,7 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t)
bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- probe_opcode_t insn;
+ u32 insn;
unsigned long addr;
if (!auprobe->simulate)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 3e7c8c8..2968a33b 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -49,6 +49,7 @@
#include <asm/cacheflush.h>
#include <asm/exec.h>
#include <asm/fpsimd.h>
+#include <asm/gcs.h>
#include <asm/mmu_context.h>
#include <asm/mte.h>
#include <asm/processor.h>
@@ -227,7 +228,7 @@ void __show_regs(struct pt_regs *regs)
printk("sp : %016llx\n", sp);
if (system_uses_irq_prio_masking())
- printk("pmr_save: %08llx\n", regs->pmr_save);
+ printk("pmr: %08x\n", regs->pmr);
i = top_reg;
@@ -280,6 +281,51 @@ static void flush_poe(void)
write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
}
+#ifdef CONFIG_ARM64_GCS
+
+static void flush_gcs(void)
+{
+ if (!system_supports_gcs())
+ return;
+
+ gcs_free(current);
+ current->thread.gcs_el0_mode = 0;
+ write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1);
+ write_sysreg_s(0, SYS_GCSPR_EL0);
+}
+
+static int copy_thread_gcs(struct task_struct *p,
+ const struct kernel_clone_args *args)
+{
+ unsigned long gcs;
+
+ if (!system_supports_gcs())
+ return 0;
+
+ p->thread.gcs_base = 0;
+ p->thread.gcs_size = 0;
+
+ gcs = gcs_alloc_thread_stack(p, args);
+ if (IS_ERR_VALUE(gcs))
+ return PTR_ERR((void *)gcs);
+
+ p->thread.gcs_el0_mode = current->thread.gcs_el0_mode;
+ p->thread.gcs_el0_locked = current->thread.gcs_el0_locked;
+
+ return 0;
+}
+
+#else
+
+static void flush_gcs(void) { }
+static int copy_thread_gcs(struct task_struct *p,
+ const struct kernel_clone_args *args)
+{
+ return 0;
+}
+
+#endif
+
void flush_thread(void)
{
fpsimd_flush_thread();
@@ -287,11 +333,13 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(current);
flush_tagged_addr_state();
flush_poe();
+ flush_gcs();
}
void arch_release_task_struct(struct task_struct *tsk)
{
fpsimd_release_task(tsk);
+ gcs_free(tsk);
}
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
@@ -355,6 +403,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
unsigned long stack_start = args->stack;
unsigned long tls = args->tls;
struct pt_regs *childregs = task_pt_regs(p);
+ int ret;
memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
@@ -399,6 +448,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
p->thread.uw.tp_value = tls;
p->thread.tpidr2_el0 = 0;
}
+
+ ret = copy_thread_gcs(p, args);
+ if (ret != 0)
+ return ret;
} else {
/*
* A kthread has no context to ERET to, so ensure any buggy
@@ -409,6 +462,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
*/
memset(childregs, 0, sizeof(struct pt_regs));
childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT;
+ childregs->stackframe.type = FRAME_META_TYPE_FINAL;
p->thread.cpu_context.x19 = (unsigned long)args->fn;
p->thread.cpu_context.x20 = (unsigned long)args->fn_arg;
@@ -422,7 +476,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
* For the benefit of the unwinder, set up childregs->stackframe
* as the final frame for the new task.
*/
- p->thread.cpu_context.fp = (unsigned long)childregs->stackframe;
+ p->thread.cpu_context.fp = (unsigned long)&childregs->stackframe;
ptrace_hw_copy_thread(p);
@@ -442,7 +496,7 @@ static void tls_thread_switch(struct task_struct *next)
if (is_compat_thread(task_thread_info(next)))
write_sysreg(next->thread.uw.tp_value, tpidrro_el0);
- else if (!arm64_kernel_unmapped_at_el0())
+ else
write_sysreg(0, tpidrro_el0);
write_sysreg(*task_user_tls(next), tpidr_el0);
@@ -487,6 +541,46 @@ static void entry_task_switch(struct task_struct *next)
__this_cpu_write(__entry_task, next);
}
+#ifdef CONFIG_ARM64_GCS
+
+void gcs_preserve_current_state(void)
+{
+ current->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0);
+}
+
+static void gcs_thread_switch(struct task_struct *next)
+{
+ if (!system_supports_gcs())
+ return;
+
+ /* GCSPR_EL0 is always readable */
+ gcs_preserve_current_state();
+ write_sysreg_s(next->thread.gcspr_el0, SYS_GCSPR_EL0);
+
+ if (current->thread.gcs_el0_mode != next->thread.gcs_el0_mode)
+ gcs_set_el0_mode(next);
+
+ /*
+ * Ensure that GCS memory effects of the 'prev' thread are
+ * ordered before other memory accesses with release semantics
+ * (or preceded by a DMB) on the current PE. In addition, any
+ * memory accesses with acquire semantics (or succeeded by a
+ * DMB) are ordered before GCS memory effects of the 'next'
+ * thread. This will ensure that the GCS memory effects are
+ * visible to other PEs in case of migration.
+ */
+ if (task_gcs_el0_enabled(current) || task_gcs_el0_enabled(next))
+ gcsb_dsync();
+}
+
+#else
+
+static void gcs_thread_switch(struct task_struct *next)
+{
+}
+
+#endif
+
/*
* Handle sysreg updates for ARM erratum 1418040 which affects the 32bit view of
* CNTVCT, various other errata which require trapping all CNTVCT{,_EL0}
@@ -583,6 +677,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
cntkctl_thread_switch(prev, next);
ptrauth_thread_switch_user(next);
permission_overlay_switch(next);
+ gcs_thread_switch(next);
/*
* Complete any pending TLB or cache maintenance on this CPU in case
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index b756578..e4437f6 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -34,6 +34,7 @@
#include <asm/cpufeature.h>
#include <asm/debug-monitors.h>
#include <asm/fpsimd.h>
+#include <asm/gcs.h>
#include <asm/mte.h>
#include <asm/pointer_auth.h>
#include <asm/stacktrace.h>
@@ -898,7 +899,11 @@ static int sve_set_common(struct task_struct *target,
if (ret)
goto out;
- /* Actual VL set may be less than the user asked for: */
+ /*
+ * Actual VL set may be different from what the user asked
+ * for, or we may have configured the _ONEXEC VL not the
+ * current VL:
+ */
vq = sve_vq_from_vl(task_get_vl(target, type));
/* Enter/exit streaming mode */
@@ -1125,7 +1130,11 @@ static int za_set(struct task_struct *target,
if (ret)
goto out;
- /* Actual VL set may be less than the user asked for: */
+ /*
+ * Actual VL set may be different from what the user asked
+ * for, or we may have configured the _ONEXEC rather than
+ * current VL:
+ */
vq = sve_vq_from_vl(task_get_sme_vl(target));
/* Ensure there is some SVE storage for streaming mode */
@@ -1473,6 +1482,52 @@ static int poe_set(struct task_struct *target, const struct
}
#endif
+#ifdef CONFIG_ARM64_GCS
+static int gcs_get(struct task_struct *target,
+ const struct user_regset *regset,
+ struct membuf to)
+{
+ struct user_gcs user_gcs;
+
+ if (!system_supports_gcs())
+ return -EINVAL;
+
+ if (target == current)
+ gcs_preserve_current_state();
+
+ user_gcs.features_enabled = target->thread.gcs_el0_mode;
+ user_gcs.features_locked = target->thread.gcs_el0_locked;
+ user_gcs.gcspr_el0 = target->thread.gcspr_el0;
+
+ return membuf_write(&to, &user_gcs, sizeof(user_gcs));
+}
+
+static int gcs_set(struct task_struct *target, const struct
+ user_regset *regset, unsigned int pos,
+ unsigned int count, const void *kbuf, const
+ void __user *ubuf)
+{
+ int ret;
+ struct user_gcs user_gcs;
+
+ if (!system_supports_gcs())
+ return -EINVAL;
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1);
+ if (ret)
+ return ret;
+
+ if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
+ return -EINVAL;
+
+ target->thread.gcs_el0_mode = user_gcs.features_enabled;
+ target->thread.gcs_el0_locked = user_gcs.features_locked;
+ target->thread.gcspr_el0 = user_gcs.gcspr_el0;
+
+ return 0;
+}
+#endif
+
enum aarch64_regset {
REGSET_GPR,
REGSET_FPR,
@@ -1503,7 +1558,10 @@ enum aarch64_regset {
REGSET_TAGGED_ADDR_CTRL,
#endif
#ifdef CONFIG_ARM64_POE
- REGSET_POE
+ REGSET_POE,
+#endif
+#ifdef CONFIG_ARM64_GCS
+ REGSET_GCS,
#endif
};
@@ -1674,6 +1732,16 @@ static const struct user_regset aarch64_regsets[] = {
.set = poe_set,
},
#endif
+#ifdef CONFIG_ARM64_GCS
+ [REGSET_GCS] = {
+ .core_note_type = NT_ARM_GCS,
+ .n = sizeof(struct user_gcs) / sizeof(u64),
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .regset_get = gcs_get,
+ .set = gcs_set,
+ },
+#endif
};
static const struct user_regset_view user_aarch64_view = {
diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c
new file mode 100644
index 0000000..3031f25
--- /dev/null
+++ b/arch/arm64/kernel/rsi.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Ltd.
+ */
+
+#include <linux/jump_label.h>
+#include <linux/memblock.h>
+#include <linux/psci.h>
+#include <linux/swiotlb.h>
+#include <linux/cc_platform.h>
+
+#include <asm/io.h>
+#include <asm/mem_encrypt.h>
+#include <asm/rsi.h>
+
+static struct realm_config config;
+
+unsigned long prot_ns_shared;
+EXPORT_SYMBOL(prot_ns_shared);
+
+DEFINE_STATIC_KEY_FALSE_RO(rsi_present);
+EXPORT_SYMBOL(rsi_present);
+
+bool cc_platform_has(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_MEM_ENCRYPT:
+ return is_realm_world();
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
+
+static bool rsi_version_matches(void)
+{
+ unsigned long ver_lower, ver_higher;
+ unsigned long ret = rsi_request_version(RSI_ABI_VERSION,
+ &ver_lower,
+ &ver_higher);
+
+ if (ret == SMCCC_RET_NOT_SUPPORTED)
+ return false;
+
+ if (ret != RSI_SUCCESS) {
+ pr_err("RME: RMM doesn't support RSI version %lu.%lu. Supported range: %lu.%lu-%lu.%lu\n",
+ RSI_ABI_VERSION_MAJOR, RSI_ABI_VERSION_MINOR,
+ RSI_ABI_VERSION_GET_MAJOR(ver_lower),
+ RSI_ABI_VERSION_GET_MINOR(ver_lower),
+ RSI_ABI_VERSION_GET_MAJOR(ver_higher),
+ RSI_ABI_VERSION_GET_MINOR(ver_higher));
+ return false;
+ }
+
+ pr_info("RME: Using RSI version %lu.%lu\n",
+ RSI_ABI_VERSION_GET_MAJOR(ver_lower),
+ RSI_ABI_VERSION_GET_MINOR(ver_lower));
+
+ return true;
+}
+
+static void __init arm64_rsi_setup_memory(void)
+{
+ u64 i;
+ phys_addr_t start, end;
+
+ /*
+ * Iterate over the available memory ranges and convert the state to
+ * protected memory. We should take extra care to ensure that we DO NOT
+ * permit any "DESTROYED" pages to be converted to "RAM".
+ *
+ * panic() is used because if the attempt to switch the memory to
+ * protected has failed here, then future accesses to the memory are
+ * simply going to be reflected as a SEA (Synchronous External Abort)
+ * which we can't handle. Bailing out early prevents the guest limping
+ * on and dying later.
+ */
+ for_each_mem_range(i, &start, &end) {
+ if (rsi_set_memory_range_protected_safe(start, end)) {
+ panic("Failed to set memory range to protected: %pa-%pa",
+ &start, &end);
+ }
+ }
+}
+
+bool __arm64_is_protected_mmio(phys_addr_t base, size_t size)
+{
+ enum ripas ripas;
+ phys_addr_t end, top;
+
+ /* Overflow ? */
+ if (WARN_ON(base + size <= base))
+ return false;
+
+ end = ALIGN(base + size, RSI_GRANULE_SIZE);
+ base = ALIGN_DOWN(base, RSI_GRANULE_SIZE);
+
+ while (base < end) {
+ if (WARN_ON(rsi_ipa_state_get(base, end, &ripas, &top)))
+ break;
+ if (WARN_ON(top <= base))
+ break;
+ if (ripas != RSI_RIPAS_DEV)
+ break;
+ base = top;
+ }
+
+ return base >= end;
+}
+EXPORT_SYMBOL(__arm64_is_protected_mmio);
+
+static int realm_ioremap_hook(phys_addr_t phys, size_t size, pgprot_t *prot)
+{
+ if (__arm64_is_protected_mmio(phys, size))
+ *prot = pgprot_encrypted(*prot);
+ else
+ *prot = pgprot_decrypted(*prot);
+
+ return 0;
+}
+
+void __init arm64_rsi_init(void)
+{
+ if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_SMC)
+ return;
+ if (!rsi_version_matches())
+ return;
+ if (WARN_ON(rsi_get_realm_config(&config)))
+ return;
+ prot_ns_shared = BIT(config.ipa_bits - 1);
+
+ if (arm64_ioremap_prot_hook_register(realm_ioremap_hook))
+ return;
+
+ if (realm_register_memory_enc_ops())
+ return;
+
+ arm64_rsi_setup_memory();
+
+ static_branch_enable(&rsi_present);
+}
+
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index b22d28e..b5e1e306 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -43,6 +43,7 @@
#include <asm/cpu_ops.h>
#include <asm/kasan.h>
#include <asm/numa.h>
+#include <asm/rsi.h>
#include <asm/scs.h>
#include <asm/sections.h>
#include <asm/setup.h>
@@ -351,6 +352,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
else
psci_acpi_init();
+ arm64_rsi_init();
+
init_bootcpu_ops();
smp_init_cpus();
smp_build_mpidr_hash();
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index c7d311d..14ac6fd 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -26,6 +26,7 @@
#include <asm/elf.h>
#include <asm/exception.h>
#include <asm/cacheflush.h>
+#include <asm/gcs.h>
#include <asm/ucontext.h>
#include <asm/unistd.h>
#include <asm/fpsimd.h>
@@ -35,6 +36,15 @@
#include <asm/traps.h>
#include <asm/vdso.h>
+#ifdef CONFIG_ARM64_GCS
+#define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK)
+
+static bool gcs_signal_cap_valid(u64 addr, u64 val)
+{
+ return val == GCS_SIGNAL_CAP(addr);
+}
+#endif
+
/*
* Do a signal return; undo the signal stack. These are aligned to 128-bit.
*/
@@ -43,11 +53,6 @@ struct rt_sigframe {
struct ucontext uc;
};
-struct frame_record {
- u64 fp;
- u64 lr;
-};
-
struct rt_sigframe_user_layout {
struct rt_sigframe __user *sigframe;
struct frame_record __user *next_frame;
@@ -57,6 +62,7 @@ struct rt_sigframe_user_layout {
unsigned long fpsimd_offset;
unsigned long esr_offset;
+ unsigned long gcs_offset;
unsigned long sve_offset;
unsigned long tpidr2_offset;
unsigned long za_offset;
@@ -79,7 +85,6 @@ struct user_access_state {
u64 por_el0;
};
-#define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16)
#define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16)
#define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16)
@@ -242,6 +247,8 @@ struct user_ctxs {
u32 fpmr_size;
struct poe_context __user *poe;
u32 poe_size;
+ struct gcs_context __user *gcs;
+ u32 gcs_size;
};
static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
@@ -689,6 +696,82 @@ extern int restore_zt_context(struct user_ctxs *user);
#endif /* ! CONFIG_ARM64_SME */
+#ifdef CONFIG_ARM64_GCS
+
+static int preserve_gcs_context(struct gcs_context __user *ctx)
+{
+ int err = 0;
+ u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0);
+
+ /*
+ * If GCS is enabled we will add a cap token to the frame,
+ * include it in the GCSPR_EL0 we report to support stack
+ * switching via sigreturn if GCS is enabled. We do not allow
+ * enabling via sigreturn so the token is only relevant for
+ * threads with GCS enabled.
+ */
+ if (task_gcs_el0_enabled(current))
+ gcspr -= 8;
+
+ __put_user_error(GCS_MAGIC, &ctx->head.magic, err);
+ __put_user_error(sizeof(*ctx), &ctx->head.size, err);
+ __put_user_error(gcspr, &ctx->gcspr, err);
+ __put_user_error(0, &ctx->reserved, err);
+ __put_user_error(current->thread.gcs_el0_mode,
+ &ctx->features_enabled, err);
+
+ return err;
+}
+
+static int restore_gcs_context(struct user_ctxs *user)
+{
+ u64 gcspr, enabled;
+ int err = 0;
+
+ if (user->gcs_size != sizeof(*user->gcs))
+ return -EINVAL;
+
+ __get_user_error(gcspr, &user->gcs->gcspr, err);
+ __get_user_error(enabled, &user->gcs->features_enabled, err);
+ if (err)
+ return err;
+
+ /* Don't allow unknown modes */
+ if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
+ return -EINVAL;
+
+ err = gcs_check_locked(current, enabled);
+ if (err != 0)
+ return err;
+
+ /* Don't allow enabling */
+ if (!task_gcs_el0_enabled(current) &&
+ (enabled & PR_SHADOW_STACK_ENABLE))
+ return -EINVAL;
+
+ /* If we are disabling disable everything */
+ if (!(enabled & PR_SHADOW_STACK_ENABLE))
+ enabled = 0;
+
+ current->thread.gcs_el0_mode = enabled;
+
+ /*
+ * We let userspace set GCSPR_EL0 to anything here, we will
+ * validate later in gcs_restore_signal().
+ */
+ write_sysreg_s(gcspr, SYS_GCSPR_EL0);
+
+ return 0;
+}
+
+#else /* ! CONFIG_ARM64_GCS */
+
+/* Turn any non-optimised out attempts to use these into a link error: */
+extern int preserve_gcs_context(void __user *ctx);
+extern int restore_gcs_context(struct user_ctxs *user);
+
+#endif /* ! CONFIG_ARM64_GCS */
+
static int parse_user_sigframe(struct user_ctxs *user,
struct rt_sigframe __user *sf)
{
@@ -707,6 +790,7 @@ static int parse_user_sigframe(struct user_ctxs *user,
user->zt = NULL;
user->fpmr = NULL;
user->poe = NULL;
+ user->gcs = NULL;
if (!IS_ALIGNED((unsigned long)base, 16))
goto invalid;
@@ -823,6 +907,17 @@ static int parse_user_sigframe(struct user_ctxs *user,
user->fpmr_size = size;
break;
+ case GCS_MAGIC:
+ if (!system_supports_gcs())
+ goto invalid;
+
+ if (user->gcs)
+ goto invalid;
+
+ user->gcs = (struct gcs_context __user *)head;
+ user->gcs_size = size;
+ break;
+
case EXTRA_MAGIC:
if (have_extra_context)
goto invalid;
@@ -943,6 +1038,9 @@ static int restore_sigframe(struct pt_regs *regs,
err = restore_fpsimd_context(&user);
}
+ if (err == 0 && system_supports_gcs() && user.gcs)
+ err = restore_gcs_context(&user);
+
if (err == 0 && system_supports_tpidr2() && user.tpidr2)
err = restore_tpidr2_context(&user);
@@ -961,6 +1059,58 @@ static int restore_sigframe(struct pt_regs *regs,
return err;
}
+#ifdef CONFIG_ARM64_GCS
+static int gcs_restore_signal(void)
+{
+ unsigned long __user *gcspr_el0;
+ u64 cap;
+ int ret;
+
+ if (!system_supports_gcs())
+ return 0;
+
+ if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE))
+ return 0;
+
+ gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0);
+
+ /*
+ * Ensure that any changes to the GCS done via GCS operations
+ * are visible to the normal reads we do to validate the
+ * token.
+ */
+ gcsb_dsync();
+
+ /*
+ * GCSPR_EL0 should be pointing at a capped GCS, read the cap.
+ * We don't enforce that this is in a GCS page, if it is not
+ * then faults will be generated on GCS operations - the main
+ * concern is to protect GCS pages.
+ */
+ ret = copy_from_user(&cap, gcspr_el0, sizeof(cap));
+ if (ret)
+ return -EFAULT;
+
+ /*
+ * Check that the cap is the actual GCS before replacing it.
+ */
+ if (!gcs_signal_cap_valid((u64)gcspr_el0, cap))
+ return -EINVAL;
+
+ /* Invalidate the token to prevent reuse */
+ put_user_gcs(0, (__user void*)gcspr_el0, &ret);
+ if (ret != 0)
+ return -EFAULT;
+
+ write_sysreg_s(gcspr_el0 + 1, SYS_GCSPR_EL0);
+
+ return 0;
+}
+
+#else
+static int gcs_restore_signal(void) { return 0; }
+#endif
+
SYSCALL_DEFINE0(rt_sigreturn)
{
struct pt_regs *regs = current_pt_regs();
@@ -985,6 +1135,9 @@ SYSCALL_DEFINE0(rt_sigreturn)
if (restore_sigframe(regs, frame, &ua_state))
goto badframe;
+ if (gcs_restore_signal())
+ goto badframe;
+
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
@@ -1024,6 +1177,15 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
return err;
}
+#ifdef CONFIG_ARM64_GCS
+ if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) {
+ err = sigframe_alloc(user, &user->gcs_offset,
+ sizeof(struct gcs_context));
+ if (err)
+ return err;
+ }
+#endif
+
if (system_supports_sve() || system_supports_sme()) {
unsigned int vq = 0;
@@ -1132,6 +1294,12 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
__put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
}
+ if (system_supports_gcs() && err == 0 && user->gcs_offset) {
+ struct gcs_context __user *gcs_ctx =
+ apply_user_offset(user, user->gcs_offset);
+ err |= preserve_gcs_context(gcs_ctx);
+ }
+
/* Scalable Vector Extension state (including streaming), if present */
if ((system_supports_sve() || system_supports_sme()) &&
err == 0 && user->sve_offset) {
@@ -1154,7 +1322,7 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user,
err |= preserve_fpmr_context(fpmr_ctx);
}
- if (system_supports_poe() && err == 0 && user->poe_offset) {
+ if (system_supports_poe() && err == 0) {
struct poe_context __user *poe_ctx =
apply_user_offset(user, user->poe_offset);
@@ -1249,7 +1417,48 @@ static int get_sigframe(struct rt_sigframe_user_layout *user,
return 0;
}
-static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
+#ifdef CONFIG_ARM64_GCS
+
+static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig)
+{
+ unsigned long __user *gcspr_el0;
+ int ret = 0;
+
+ if (!system_supports_gcs())
+ return 0;
+
+ if (!task_gcs_el0_enabled(current))
+ return 0;
+
+ /*
+ * We are entering a signal handler, current register state is
+ * active.
+ */
+ gcspr_el0 = (unsigned long __user *)read_sysreg_s(SYS_GCSPR_EL0);
+
+ /*
+ * Push a cap and the GCS entry for the trampoline onto the GCS.
+ */
+ put_user_gcs((unsigned long)sigtramp, gcspr_el0 - 2, &ret);
+ put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 1), gcspr_el0 - 1, &ret);
+ if (ret != 0)
+ return ret;
+
+ gcspr_el0 -= 2;
+ write_sysreg_s((unsigned long)gcspr_el0, SYS_GCSPR_EL0);
+
+ return 0;
+}
+#else
+
+static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig)
+{
+ return 0;
+}
+
+#endif
+
+static int setup_return(struct pt_regs *regs, struct ksignal *ksig,
struct rt_sigframe_user_layout *user, int usig)
{
__sigrestore_t sigtramp;
@@ -1257,7 +1466,7 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
regs->regs[0] = usig;
regs->sp = (unsigned long)user->sigframe;
regs->regs[29] = (unsigned long)&user->next_frame->fp;
- regs->pc = (unsigned long)ka->sa.sa_handler;
+ regs->pc = (unsigned long)ksig->ka.sa.sa_handler;
/*
* Signal delivery is a (wacky) indirect function call in
@@ -1297,12 +1506,14 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka,
sme_smstop();
}
- if (ka->sa.sa_flags & SA_RESTORER)
- sigtramp = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ sigtramp = ksig->ka.sa.sa_restorer;
else
sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp);
regs->regs[30] = (unsigned long)sigtramp;
+
+ return gcs_signal_entry(sigtramp, ksig);
}
static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
@@ -1327,7 +1538,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
err |= setup_sigframe(&user, regs, set, &ua_state);
if (err == 0) {
- setup_return(regs, &ksig->ka, &user, usig);
+ err = setup_return(regs, ksig, &user, usig);
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
err |= copy_siginfo_to_user(&frame->info, &ksig->info);
regs->regs[1] = (unsigned long)&frame->info;
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 2729faa..caef854 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -20,6 +20,23 @@
#include <asm/stack_pointer.h>
#include <asm/stacktrace.h>
+enum kunwind_source {
+ KUNWIND_SOURCE_UNKNOWN,
+ KUNWIND_SOURCE_FRAME,
+ KUNWIND_SOURCE_CALLER,
+ KUNWIND_SOURCE_TASK,
+ KUNWIND_SOURCE_REGS_PC,
+ KUNWIND_SOURCE_REGS_LR,
+};
+
+union unwind_flags {
+ unsigned long all;
+ struct {
+ unsigned long fgraph : 1,
+ kretprobe : 1;
+ };
+};
+
/*
* Kernel unwind state
*
@@ -37,6 +54,9 @@ struct kunwind_state {
#ifdef CONFIG_KRETPROBES
struct llist_node *kr_cur;
#endif
+ enum kunwind_source source;
+ union unwind_flags flags;
+ struct pt_regs *regs;
};
static __always_inline void
@@ -45,6 +65,9 @@ kunwind_init(struct kunwind_state *state,
{
unwind_init_common(&state->common);
state->task = task;
+ state->source = KUNWIND_SOURCE_UNKNOWN;
+ state->flags.all = 0;
+ state->regs = NULL;
}
/*
@@ -60,8 +83,10 @@ kunwind_init_from_regs(struct kunwind_state *state,
{
kunwind_init(state, current);
+ state->regs = regs;
state->common.fp = regs->regs[29];
state->common.pc = regs->pc;
+ state->source = KUNWIND_SOURCE_REGS_PC;
}
/*
@@ -79,6 +104,7 @@ kunwind_init_from_caller(struct kunwind_state *state)
state->common.fp = (unsigned long)__builtin_frame_address(1);
state->common.pc = (unsigned long)__builtin_return_address(0);
+ state->source = KUNWIND_SOURCE_CALLER;
}
/*
@@ -99,6 +125,7 @@ kunwind_init_from_task(struct kunwind_state *state,
state->common.fp = thread_saved_fp(task);
state->common.pc = thread_saved_pc(task);
+ state->source = KUNWIND_SOURCE_TASK;
}
static __always_inline int
@@ -114,6 +141,7 @@ kunwind_recover_return_address(struct kunwind_state *state)
if (WARN_ON_ONCE(state->common.pc == orig_pc))
return -EINVAL;
state->common.pc = orig_pc;
+ state->flags.fgraph = 1;
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -124,12 +152,110 @@ kunwind_recover_return_address(struct kunwind_state *state)
(void *)state->common.fp,
&state->kr_cur);
state->common.pc = orig_pc;
+ state->flags.kretprobe = 1;
}
#endif /* CONFIG_KRETPROBES */
return 0;
}
+static __always_inline
+int kunwind_next_regs_pc(struct kunwind_state *state)
+{
+ struct stack_info *info;
+ unsigned long fp = state->common.fp;
+ struct pt_regs *regs;
+
+ regs = container_of((u64 *)fp, struct pt_regs, stackframe.record.fp);
+
+ info = unwind_find_stack(&state->common, (unsigned long)regs, sizeof(*regs));
+ if (!info)
+ return -EINVAL;
+
+ unwind_consume_stack(&state->common, info, (unsigned long)regs,
+ sizeof(*regs));
+
+ state->regs = regs;
+ state->common.pc = regs->pc;
+ state->common.fp = regs->regs[29];
+ state->source = KUNWIND_SOURCE_REGS_PC;
+ return 0;
+}
+
+static __always_inline int
+kunwind_next_regs_lr(struct kunwind_state *state)
+{
+ /*
+ * The stack for the regs was consumed by kunwind_next_regs_pc(), so we
+ * cannot consume that again here, but we know the regs are safe to
+ * access.
+ */
+ state->common.pc = state->regs->regs[30];
+ state->common.fp = state->regs->regs[29];
+ state->regs = NULL;
+ state->source = KUNWIND_SOURCE_REGS_LR;
+
+ return 0;
+}
+
+static __always_inline int
+kunwind_next_frame_record_meta(struct kunwind_state *state)
+{
+ struct task_struct *tsk = state->task;
+ unsigned long fp = state->common.fp;
+ struct frame_record_meta *meta;
+ struct stack_info *info;
+
+ info = unwind_find_stack(&state->common, fp, sizeof(*meta));
+ if (!info)
+ return -EINVAL;
+
+ meta = (struct frame_record_meta *)fp;
+ switch (READ_ONCE(meta->type)) {
+ case FRAME_META_TYPE_FINAL:
+ if (meta == &task_pt_regs(tsk)->stackframe)
+ return -ENOENT;
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ case FRAME_META_TYPE_PT_REGS:
+ return kunwind_next_regs_pc(state);
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+}
+
+static __always_inline int
+kunwind_next_frame_record(struct kunwind_state *state)
+{
+ unsigned long fp = state->common.fp;
+ struct frame_record *record;
+ struct stack_info *info;
+ unsigned long new_fp, new_pc;
+
+ if (fp & 0x7)
+ return -EINVAL;
+
+ info = unwind_find_stack(&state->common, fp, sizeof(*record));
+ if (!info)
+ return -EINVAL;
+
+ record = (struct frame_record *)fp;
+ new_fp = READ_ONCE(record->fp);
+ new_pc = READ_ONCE(record->lr);
+
+ if (!new_fp && !new_pc)
+ return kunwind_next_frame_record_meta(state);
+
+ unwind_consume_stack(&state->common, info, fp, sizeof(*record));
+
+ state->common.fp = new_fp;
+ state->common.pc = new_pc;
+ state->source = KUNWIND_SOURCE_FRAME;
+
+ return 0;
+}
+
/*
* Unwind from one frame record (A) to the next frame record (B).
*
@@ -140,15 +266,24 @@ kunwind_recover_return_address(struct kunwind_state *state)
static __always_inline int
kunwind_next(struct kunwind_state *state)
{
- struct task_struct *tsk = state->task;
- unsigned long fp = state->common.fp;
int err;
- /* Final frame; nothing to unwind */
- if (fp == (unsigned long)task_pt_regs(tsk)->stackframe)
- return -ENOENT;
+ state->flags.all = 0;
- err = unwind_next_frame_record(&state->common);
+ switch (state->source) {
+ case KUNWIND_SOURCE_FRAME:
+ case KUNWIND_SOURCE_CALLER:
+ case KUNWIND_SOURCE_TASK:
+ case KUNWIND_SOURCE_REGS_LR:
+ err = kunwind_next_frame_record(state);
+ break;
+ case KUNWIND_SOURCE_REGS_PC:
+ err = kunwind_next_regs_lr(state);
+ break;
+ default:
+ err = -EINVAL;
+ }
+
if (err)
return err;
@@ -294,10 +429,33 @@ noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u6
kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL);
}
-static bool dump_backtrace_entry(void *arg, unsigned long where)
+static const char *state_source_string(const struct kunwind_state *state)
{
+ switch (state->source) {
+ case KUNWIND_SOURCE_FRAME: return NULL;
+ case KUNWIND_SOURCE_CALLER: return "C";
+ case KUNWIND_SOURCE_TASK: return "T";
+ case KUNWIND_SOURCE_REGS_PC: return "P";
+ case KUNWIND_SOURCE_REGS_LR: return "L";
+ default: return "U";
+ }
+}
+
+static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg)
+{
+ const char *source = state_source_string(state);
+ union unwind_flags flags = state->flags;
+ bool has_info = source || flags.all;
char *loglvl = arg;
- printk("%s %pSb\n", loglvl, (void *)where);
+
+ printk("%s %pSb%s%s%s%s%s\n", loglvl,
+ (void *)state->common.pc,
+ has_info ? " (" : "",
+ source ? source : "",
+ flags.fgraph ? "F" : "",
+ flags.kretprobe ? "K" : "",
+ has_info ? ")" : "");
+
return true;
}
@@ -316,7 +474,7 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
return;
printk("%sCall trace:\n", loglvl);
- arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs);
+ kunwind_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs);
put_task_stack(tsk);
}
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 563cbce..ee318f6 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -506,6 +506,16 @@ void do_el1_bti(struct pt_regs *regs, unsigned long esr)
die("Oops - BTI", regs, esr);
}
+void do_el0_gcs(struct pt_regs *regs, unsigned long esr)
+{
+ force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0);
+}
+
+void do_el1_gcs(struct pt_regs *regs, unsigned long esr)
+{
+ die("Oops - GCS", regs, esr);
+}
+
void do_el0_fpac(struct pt_regs *regs, unsigned long esr)
{
force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr);
@@ -531,6 +541,13 @@ void do_el0_mops(struct pt_regs *regs, unsigned long esr)
user_fastforward_single_step(current);
}
+void do_el1_mops(struct pt_regs *regs, unsigned long esr)
+{
+ arm64_mops_reset_regs(®s->user_regs, esr);
+
+ kernel_fastforward_single_step(regs);
+}
+
#define __user_cache_maint(insn, address, res) \
if (address >= TASK_SIZE_MAX) { \
res = -EFAULT; \
@@ -852,6 +869,7 @@ static const char *esr_class_str[] = {
[ESR_ELx_EC_MOPS] = "MOPS",
[ESR_ELx_EC_FP_EXC32] = "FP (AArch32)",
[ESR_ELx_EC_FP_EXC64] = "FP (AArch64)",
+ [ESR_ELx_EC_GCS] = "Guarded Control Stack",
[ESR_ELx_EC_SERROR] = "SError",
[ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)",
[ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)",
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 58d89d9..f84c71f 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -287,6 +287,9 @@
__initdata_end = .;
__init_end = .;
+ .data.rel.ro : { *(.data.rel.ro) }
+ ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!")
+
_data = .;
_sdata = .;
RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
@@ -343,9 +346,6 @@
*(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt)
}
ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
-
- .data.rel.ro : { *(.data.rel.ro) }
- ASSERT(SIZEOF(.data.rel.ro) == 0, "Unexpected RELRO detected!")
}
#include "image-vars.h"
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 962f985..e738a35 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -1055,6 +1055,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
void *maddr;
unsigned long num_tags;
struct page *page;
+ struct folio *folio;
if (is_error_noslot_pfn(pfn)) {
ret = -EFAULT;
@@ -1068,10 +1069,13 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
ret = -EFAULT;
goto out;
}
+ folio = page_folio(page);
maddr = page_address(page);
if (!write) {
- if (page_mte_tagged(page))
+ if ((folio_test_hugetlb(folio) &&
+ folio_test_hugetlb_mte_tagged(folio)) ||
+ page_mte_tagged(page))
num_tags = mte_copy_tags_to_user(tags, maddr,
MTE_GRANULES_PER_PAGE);
else
@@ -1085,14 +1089,20 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
* __set_ptes() in the VMM but still overriding the
* tags, hence ignoring the return value.
*/
- try_page_mte_tagging(page);
+ if (folio_test_hugetlb(folio))
+ folio_try_hugetlb_mte_tagging(folio);
+ else
+ try_page_mte_tagging(page);
num_tags = mte_copy_tags_from_user(maddr, tags,
MTE_GRANULES_PER_PAGE);
/* uaccess failed, don't leave stale tags */
if (num_tags != MTE_GRANULES_PER_PAGE)
mte_clear_page_tags(maddr);
- set_page_mte_tagged(page);
+ if (folio_test_hugetlb(folio))
+ folio_set_hugetlb_mte_tagged(folio);
+ else
+ set_page_mte_tagged(page);
kvm_release_pfn_dirty(pfn);
}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 0f7658a..56d9a7f 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1402,10 +1402,21 @@ static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
{
unsigned long i, nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
if (!kvm_has_mte(kvm))
return;
+ if (folio_test_hugetlb(folio)) {
+ /* Hugetlb has MTE flags set on head page only */
+ if (folio_try_hugetlb_mte_tagging(folio)) {
+ for (i = 0; i < nr_pages; i++, page++)
+ mte_clear_page_tags(page_address(page));
+ folio_set_hugetlb_mte_tagged(folio);
+ }
+ return;
+ }
+
for (i = 0; i < nr_pages; i++, page++) {
if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 13e6a28..8e882f4 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -13,7 +13,7 @@
lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o
-obj-$(CONFIG_CRC32) += crc32.o
+obj-$(CONFIG_CRC32) += crc32.o crc32-glue.o
obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S
index ebde40e..bd6f7d5 100644
--- a/arch/arm64/lib/clear_page.S
+++ b/arch/arm64/lib/clear_page.S
@@ -15,6 +15,19 @@
* x0 - dest
*/
SYM_FUNC_START(__pi_clear_page)
+#ifdef CONFIG_AS_HAS_MOPS
+ .arch_extension mops
+alternative_if_not ARM64_HAS_MOPS
+ b .Lno_mops
+alternative_else_nop_endif
+
+ mov x1, #PAGE_SIZE
+ setpn [x0]!, x1!, xzr
+ setmn [x0]!, x1!, xzr
+ seten [x0]!, x1!, xzr
+ ret
+.Lno_mops:
+#endif
mrs x1, dczid_el0
tbnz x1, #4, 2f /* Branch if DC ZVA is prohibited */
and w1, w1, #0xf
diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index 6a56d7c..e6374e7 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -18,6 +18,19 @@
* x1 - src
*/
SYM_FUNC_START(__pi_copy_page)
+#ifdef CONFIG_AS_HAS_MOPS
+ .arch_extension mops
+alternative_if_not ARM64_HAS_MOPS
+ b .Lno_mops
+alternative_else_nop_endif
+
+ mov x2, #PAGE_SIZE
+ cpypwn [x0]!, [x1]!, x2!
+ cpymwn [x0]!, [x1]!, x2!
+ cpyewn [x0]!, [x1]!, x2!
+ ret
+.Lno_mops:
+#endif
ldp x2, x3, [x1]
ldp x4, x5, [x1, #16]
ldp x6, x7, [x1, #32]
diff --git a/arch/arm64/lib/crc32-glue.c b/arch/arm64/lib/crc32-glue.c
new file mode 100644
index 0000000..295ae3e
--- /dev/null
+++ b/arch/arm64/lib/crc32-glue.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/crc32.h>
+#include <linux/linkage.h>
+
+#include <asm/alternative.h>
+#include <asm/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+#include <crypto/internal/simd.h>
+
+// The minimum input length to consider the 4-way interleaved code path
+static const size_t min_len = 1024;
+
+asmlinkage u32 crc32_le_arm64(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32c_le_arm64(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32_be_arm64(u32 crc, unsigned char const *p, size_t len);
+
+asmlinkage u32 crc32_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32c_le_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+asmlinkage u32 crc32_be_arm64_4way(u32 crc, unsigned char const *p, size_t len);
+
+u32 __pure crc32_le(u32 crc, unsigned char const *p, size_t len)
+{
+ if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return crc32_le_base(crc, p, len);
+
+ if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc = crc32_le_arm64_4way(crc, p, len);
+ kernel_neon_end();
+
+ p += round_down(len, 64);
+ len %= 64;
+
+ if (!len)
+ return crc;
+ }
+
+ return crc32_le_arm64(crc, p, len);
+}
+
+u32 __pure __crc32c_le(u32 crc, unsigned char const *p, size_t len)
+{
+ if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return __crc32c_le_base(crc, p, len);
+
+ if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc = crc32c_le_arm64_4way(crc, p, len);
+ kernel_neon_end();
+
+ p += round_down(len, 64);
+ len %= 64;
+
+ if (!len)
+ return crc;
+ }
+
+ return crc32c_le_arm64(crc, p, len);
+}
+
+u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
+{
+ if (!alternative_has_cap_likely(ARM64_HAS_CRC32))
+ return crc32_be_base(crc, p, len);
+
+ if (len >= min_len && cpu_have_named_feature(PMULL) && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc = crc32_be_arm64_4way(crc, p, len);
+ kernel_neon_end();
+
+ p += round_down(len, 64);
+ len %= 64;
+
+ if (!len)
+ return crc;
+ }
+
+ return crc32_be_arm64(crc, p, len);
+}
diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
index 8340dcc..6882531 100644
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32.S
@@ -1,54 +1,60 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Accelerated CRC32(C) using AArch64 CRC instructions
+ * Accelerated CRC32(C) using AArch64 CRC and PMULL instructions
*
- * Copyright (C) 2016 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2016 - 2018 Linaro Ltd.
+ * Copyright (C) 2024 Google LLC
+ *
+ * Author: Ard Biesheuvel <ardb@kernel.org>
*/
#include <linux/linkage.h>
-#include <asm/alternative.h>
#include <asm/assembler.h>
- .arch armv8-a+crc
+ .cpu generic+crc+crypto
- .macro byteorder, reg, be
- .if \be
-CPU_LE( rev \reg, \reg )
- .else
-CPU_BE( rev \reg, \reg )
- .endif
+ .macro bitle, reg
.endm
- .macro byteorder16, reg, be
- .if \be
-CPU_LE( rev16 \reg, \reg )
- .else
-CPU_BE( rev16 \reg, \reg )
- .endif
- .endm
-
- .macro bitorder, reg, be
- .if \be
+ .macro bitbe, reg
rbit \reg, \reg
- .endif
.endm
- .macro bitorder16, reg, be
- .if \be
- rbit \reg, \reg
- lsr \reg, \reg, #16
- .endif
+ .macro bytele, reg
.endm
- .macro bitorder8, reg, be
- .if \be
+ .macro bytebe, reg
rbit \reg, \reg
lsr \reg, \reg, #24
- .endif
.endm
- .macro __crc32, c, be=0
- bitorder w0, \be
+ .macro hwordle, reg
+CPU_BE( rev16 \reg, \reg )
+ .endm
+
+ .macro hwordbe, reg
+CPU_LE( rev \reg, \reg )
+ rbit \reg, \reg
+CPU_BE( lsr \reg, \reg, #16 )
+ .endm
+
+ .macro le, regs:vararg
+ .irp r, \regs
+CPU_BE( rev \r, \r )
+ .endr
+ .endm
+
+ .macro be, regs:vararg
+ .irp r, \regs
+CPU_LE( rev \r, \r )
+ .endr
+ .irp r, \regs
+ rbit \r, \r
+ .endr
+ .endm
+
+ .macro __crc32, c, order=le
+ bit\order w0
cmp x2, #16
b.lt 8f // less than 16 bytes
@@ -61,14 +67,7 @@
add x8, x8, x1
add x1, x1, x7
ldp x5, x6, [x8]
- byteorder x3, \be
- byteorder x4, \be
- byteorder x5, \be
- byteorder x6, \be
- bitorder x3, \be
- bitorder x4, \be
- bitorder x5, \be
- bitorder x6, \be
+ \order x3, x4, x5, x6
tst x7, #8
crc32\c\()x w8, w0, x3
@@ -96,65 +95,268 @@
32: ldp x3, x4, [x1], #32
sub x2, x2, #32
ldp x5, x6, [x1, #-16]
- byteorder x3, \be
- byteorder x4, \be
- byteorder x5, \be
- byteorder x6, \be
- bitorder x3, \be
- bitorder x4, \be
- bitorder x5, \be
- bitorder x6, \be
+ \order x3, x4, x5, x6
crc32\c\()x w0, w0, x3
crc32\c\()x w0, w0, x4
crc32\c\()x w0, w0, x5
crc32\c\()x w0, w0, x6
cbnz x2, 32b
-0: bitorder w0, \be
+0: bit\order w0
ret
8: tbz x2, #3, 4f
ldr x3, [x1], #8
- byteorder x3, \be
- bitorder x3, \be
+ \order x3
crc32\c\()x w0, w0, x3
4: tbz x2, #2, 2f
ldr w3, [x1], #4
- byteorder w3, \be
- bitorder w3, \be
+ \order w3
crc32\c\()w w0, w0, w3
2: tbz x2, #1, 1f
ldrh w3, [x1], #2
- byteorder16 w3, \be
- bitorder16 w3, \be
+ hword\order w3
crc32\c\()h w0, w0, w3
1: tbz x2, #0, 0f
ldrb w3, [x1]
- bitorder8 w3, \be
+ byte\order w3
crc32\c\()b w0, w0, w3
-0: bitorder w0, \be
+0: bit\order w0
ret
.endm
.align 5
-SYM_FUNC_START(crc32_le)
-alternative_if_not ARM64_HAS_CRC32
- b crc32_le_base
-alternative_else_nop_endif
+SYM_FUNC_START(crc32_le_arm64)
__crc32
-SYM_FUNC_END(crc32_le)
+SYM_FUNC_END(crc32_le_arm64)
.align 5
-SYM_FUNC_START(__crc32c_le)
-alternative_if_not ARM64_HAS_CRC32
- b __crc32c_le_base
-alternative_else_nop_endif
+SYM_FUNC_START(crc32c_le_arm64)
__crc32 c
-SYM_FUNC_END(__crc32c_le)
+SYM_FUNC_END(crc32c_le_arm64)
.align 5
-SYM_FUNC_START(crc32_be)
-alternative_if_not ARM64_HAS_CRC32
- b crc32_be_base
-alternative_else_nop_endif
- __crc32 be=1
-SYM_FUNC_END(crc32_be)
+SYM_FUNC_START(crc32_be_arm64)
+ __crc32 order=be
+SYM_FUNC_END(crc32_be_arm64)
+
+ in .req x1
+ len .req x2
+
+ /*
+ * w0: input CRC at entry, output CRC at exit
+ * x1: pointer to input buffer
+ * x2: length of input in bytes
+ */
+ .macro crc4way, insn, table, order=le
+ bit\order w0
+ lsr len, len, #6 // len := # of 64-byte blocks
+
+ /* Process up to 64 blocks of 64 bytes at a time */
+.La\@: mov x3, #64
+ cmp len, #64
+ csel x3, x3, len, hi // x3 := min(len, 64)
+ sub len, len, x3
+
+ /* Divide the input into 4 contiguous blocks */
+ add x4, x3, x3, lsl #1 // x4 := 3 * x3
+ add x7, in, x3, lsl #4 // x7 := in + 16 * x3
+ add x8, in, x3, lsl #5 // x8 := in + 32 * x3
+ add x9, in, x4, lsl #4 // x9 := in + 16 * x4
+
+ /* Load the folding coefficients from the lookup table */
+ adr_l x5, \table - 12 // entry 0 omitted
+ add x5, x5, x4, lsl #2 // x5 += 12 * x3
+ ldp s0, s1, [x5]
+ ldr s2, [x5, #8]
+
+ /* Zero init partial CRCs for this iteration */
+ mov w4, wzr
+ mov w5, wzr
+ mov w6, wzr
+ mov x17, xzr
+
+.Lb\@: sub x3, x3, #1
+ \insn w6, w6, x17
+ ldp x10, x11, [in], #16
+ ldp x12, x13, [x7], #16
+ ldp x14, x15, [x8], #16
+ ldp x16, x17, [x9], #16
+
+ \order x10, x11, x12, x13, x14, x15, x16, x17
+
+ /* Apply the CRC transform to 4 16-byte blocks in parallel */
+ \insn w0, w0, x10
+ \insn w4, w4, x12
+ \insn w5, w5, x14
+ \insn w6, w6, x16
+ \insn w0, w0, x11
+ \insn w4, w4, x13
+ \insn w5, w5, x15
+ cbnz x3, .Lb\@
+
+ /* Combine the 4 partial results into w0 */
+ mov v3.d[0], x0
+ mov v4.d[0], x4
+ mov v5.d[0], x5
+ pmull v0.1q, v0.1d, v3.1d
+ pmull v1.1q, v1.1d, v4.1d
+ pmull v2.1q, v2.1d, v5.1d
+ eor v0.8b, v0.8b, v1.8b
+ eor v0.8b, v0.8b, v2.8b
+ mov x5, v0.d[0]
+ eor x5, x5, x17
+ \insn w0, w6, x5
+
+ mov in, x9
+ cbnz len, .La\@
+
+ bit\order w0
+ ret
+ .endm
+
+ .align 5
+SYM_FUNC_START(crc32c_le_arm64_4way)
+ crc4way crc32cx, .L0
+SYM_FUNC_END(crc32c_le_arm64_4way)
+
+ .align 5
+SYM_FUNC_START(crc32_le_arm64_4way)
+ crc4way crc32x, .L1
+SYM_FUNC_END(crc32_le_arm64_4way)
+
+ .align 5
+SYM_FUNC_START(crc32_be_arm64_4way)
+ crc4way crc32x, .L1, be
+SYM_FUNC_END(crc32_be_arm64_4way)
+
+ .section .rodata, "a", %progbits
+ .align 6
+.L0: .long 0xddc0152b, 0xba4fc28e, 0x493c7d27
+ .long 0x0715ce53, 0x9e4addf8, 0xba4fc28e
+ .long 0xc96cfdc0, 0x0715ce53, 0xddc0152b
+ .long 0xab7aff2a, 0x0d3b6092, 0x9e4addf8
+ .long 0x299847d5, 0x878a92a7, 0x39d3b296
+ .long 0xb6dd949b, 0xab7aff2a, 0x0715ce53
+ .long 0xa60ce07b, 0x83348832, 0x47db8317
+ .long 0xd270f1a2, 0xb9e02b86, 0x0d3b6092
+ .long 0x65863b64, 0xb6dd949b, 0xc96cfdc0
+ .long 0xb3e32c28, 0xbac2fd7b, 0x878a92a7
+ .long 0xf285651c, 0xce7f39f4, 0xdaece73e
+ .long 0x271d9844, 0xd270f1a2, 0xab7aff2a
+ .long 0x6cb08e5c, 0x2b3cac5d, 0x2162d385
+ .long 0xcec3662e, 0x1b03397f, 0x83348832
+ .long 0x8227bb8a, 0xb3e32c28, 0x299847d5
+ .long 0xd7a4825c, 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf6076544, 0x10746f3c, 0x18b33a4e
+ .long 0x98d8d9cb, 0x271d9844, 0xb6dd949b
+ .long 0x57a3d037, 0x93a5f730, 0x78d9ccb7
+ .long 0x3771e98f, 0x6b749fb2, 0xbac2fd7b
+ .long 0xe0ac139e, 0xcec3662e, 0xa60ce07b
+ .long 0x6f345e45, 0xe6fc4e6a, 0xce7f39f4
+ .long 0xa2b73df1, 0xb0cd4768, 0x61d82e56
+ .long 0x86d8e4d2, 0xd7a4825c, 0xd270f1a2
+ .long 0xa90fd27a, 0x0167d312, 0xc619809d
+ .long 0xca6ef3ac, 0x26f6a60a, 0x2b3cac5d
+ .long 0x4597456a, 0x98d8d9cb, 0x65863b64
+ .long 0xc9c8b782, 0x68bce87a, 0x1b03397f
+ .long 0x62ec6c6d, 0x6956fc3b, 0xebb883bd
+ .long 0x2342001e, 0x3771e98f, 0xb3e32c28
+ .long 0xe8b6368b, 0x2178513a, 0x064f7f26
+ .long 0x9ef68d35, 0x170076fa, 0xdd7e3b0c
+ .long 0x0b0bf8ca, 0x6f345e45, 0xf285651c
+ .long 0x02ee03b2, 0xff0dba97, 0x10746f3c
+ .long 0x135c83fd, 0xf872e54c, 0xc7a68855
+ .long 0x00bcf5f6, 0x86d8e4d2, 0x271d9844
+ .long 0x58ca5f00, 0x5bb8f1bc, 0x8e766a0c
+ .long 0xded288f8, 0xb3af077a, 0x93a5f730
+ .long 0x37170390, 0xca6ef3ac, 0x6cb08e5c
+ .long 0xf48642e9, 0xdd66cbbb, 0x6b749fb2
+ .long 0xb25b29f2, 0xe9e28eb4, 0x1393e203
+ .long 0x45cddf4e, 0xc9c8b782, 0xcec3662e
+ .long 0xdfd94fb2, 0x93e106a4, 0x96c515bb
+ .long 0x021ac5ef, 0xd813b325, 0xe6fc4e6a
+ .long 0x8e1450f7, 0x2342001e, 0x8227bb8a
+ .long 0xe0cdcf86, 0x6d9a4957, 0xb0cd4768
+ .long 0x613eee91, 0xd2c3ed1a, 0x39c7ff35
+ .long 0xbedc6ba1, 0x9ef68d35, 0xd7a4825c
+ .long 0x0cd1526a, 0xf2271e60, 0x0ab3844b
+ .long 0xd6c3a807, 0x2664fd8b, 0x0167d312
+ .long 0x1d31175f, 0x02ee03b2, 0xf6076544
+ .long 0x4be7fd90, 0x363bd6b3, 0x26f6a60a
+ .long 0x6eeed1c9, 0x5fabe670, 0xa741c1bf
+ .long 0xb3a6da94, 0x00bcf5f6, 0x98d8d9cb
+ .long 0x2e7d11a7, 0x17f27698, 0x49c3cc9c
+ .long 0x889774e1, 0xaa7c7ad5, 0x68bce87a
+ .long 0x8a074012, 0xded288f8, 0x57a3d037
+ .long 0xbd0bb25f, 0x6d390dec, 0x6956fc3b
+ .long 0x3be3c09b, 0x6353c1cc, 0x42d98888
+ .long 0x465a4eee, 0xf48642e9, 0x3771e98f
+ .long 0x2e5f3c8c, 0xdd35bc8d, 0xb42ae3d9
+ .long 0xa52f58ec, 0x9a5ede41, 0x2178513a
+ .long 0x47972100, 0x45cddf4e, 0xe0ac139e
+ .long 0x359674f7, 0xa51b6135, 0x170076fa
+
+.L1: .long 0xaf449247, 0x81256527, 0xccaa009e
+ .long 0x57c54819, 0x1d9513d7, 0x81256527
+ .long 0x3f41287a, 0x57c54819, 0xaf449247
+ .long 0xf5e48c85, 0x910eeec1, 0x1d9513d7
+ .long 0x1f0c2cdd, 0x9026d5b1, 0xae0b5394
+ .long 0x71d54a59, 0xf5e48c85, 0x57c54819
+ .long 0x1c63267b, 0xfe807bbd, 0x0cbec0ed
+ .long 0xd31343ea, 0xe95c1271, 0x910eeec1
+ .long 0xf9d9c7ee, 0x71d54a59, 0x3f41287a
+ .long 0x9ee62949, 0xcec97417, 0x9026d5b1
+ .long 0xa55d1514, 0xf183c71b, 0xd1df2327
+ .long 0x21aa2b26, 0xd31343ea, 0xf5e48c85
+ .long 0x9d842b80, 0xeea395c4, 0x3c656ced
+ .long 0xd8110ff1, 0xcd669a40, 0xfe807bbd
+ .long 0x3f9e9356, 0x9ee62949, 0x1f0c2cdd
+ .long 0x1d6708a0, 0x0c30f51d, 0xe95c1271
+ .long 0xef82aa68, 0xdb3935ea, 0xb918a347
+ .long 0xd14bcc9b, 0x21aa2b26, 0x71d54a59
+ .long 0x99cce860, 0x356d209f, 0xff6f2fc2
+ .long 0xd8af8e46, 0xc352f6de, 0xcec97417
+ .long 0xf1996890, 0xd8110ff1, 0x1c63267b
+ .long 0x631bc508, 0xe95c7216, 0xf183c71b
+ .long 0x8511c306, 0x8e031a19, 0x9b9bdbd0
+ .long 0xdb3839f3, 0x1d6708a0, 0xd31343ea
+ .long 0x7a92fffb, 0xf7003835, 0x4470ac44
+ .long 0x6ce68f2a, 0x00eba0c8, 0xeea395c4
+ .long 0x4caaa263, 0xd14bcc9b, 0xf9d9c7ee
+ .long 0xb46f7cff, 0x9a1b53c8, 0xcd669a40
+ .long 0x60290934, 0x81b6f443, 0x6d40f445
+ .long 0x8e976a7d, 0xd8af8e46, 0x9ee62949
+ .long 0xdcf5088a, 0x9dbdc100, 0x145575d5
+ .long 0x1753ab84, 0xbbf2f6d6, 0x0c30f51d
+ .long 0x255b139e, 0x631bc508, 0xa55d1514
+ .long 0xd784eaa8, 0xce26786c, 0xdb3935ea
+ .long 0x6d2c864a, 0x8068c345, 0x2586d334
+ .long 0x02072e24, 0xdb3839f3, 0x21aa2b26
+ .long 0x06689b0a, 0x5efd72f5, 0xe0575528
+ .long 0x1e52f5ea, 0x4117915b, 0x356d209f
+ .long 0x1d3d1db6, 0x6ce68f2a, 0x9d842b80
+ .long 0x3796455c, 0xb8e0e4a8, 0xc352f6de
+ .long 0xdf3a4eb3, 0xc55a2330, 0xb84ffa9c
+ .long 0x28ae0976, 0xb46f7cff, 0xd8110ff1
+ .long 0x9764bc8d, 0xd7e7a22c, 0x712510f0
+ .long 0x13a13e18, 0x3e9a43cd, 0xe95c7216
+ .long 0xb8ee242e, 0x8e976a7d, 0x3f9e9356
+ .long 0x0c540e7b, 0x753c81ff, 0x8e031a19
+ .long 0x9924c781, 0xb9220208, 0x3edcde65
+ .long 0x3954de39, 0x1753ab84, 0x1d6708a0
+ .long 0xf32238b5, 0xbec81497, 0x9e70b943
+ .long 0xbbd2cd2c, 0x0925d861, 0xf7003835
+ .long 0xcc401304, 0xd784eaa8, 0xef82aa68
+ .long 0x4987e684, 0x6044fbb0, 0x00eba0c8
+ .long 0x3aa11427, 0x18fe3b4a, 0x87441142
+ .long 0x297aad60, 0x02072e24, 0xd14bcc9b
+ .long 0xf60c5e51, 0x6ef6f487, 0x5b7fdd0a
+ .long 0x632d78c5, 0x3fc33de4, 0x9a1b53c8
+ .long 0x25b8822a, 0x1e52f5ea, 0x99cce860
+ .long 0xd4fc84bc, 0x1af62fb8, 0x81b6f443
+ .long 0x5690aa32, 0xa91fdefb, 0x688a110e
+ .long 0x1357a093, 0x3796455c, 0xd8af8e46
+ .long 0x798fdd33, 0xaaa18a37, 0x357b9517
+ .long 0xc2815395, 0x54d42691, 0x9dbdc100
+ .long 0x21cfc0f7, 0x28ae0976, 0xf1996890
+ .long 0xa0decef3, 0x7b4aa8b7, 0xbbf2f6d6
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 4ab48d4..9b99106 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -57,7 +57,7 @@
The loop tail is handled by always copying 64 bytes from the end.
*/
-SYM_FUNC_START(__pi_memcpy)
+SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
@@ -238,7 +238,24 @@
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
ret
+SYM_FUNC_END(__pi_memcpy_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+ .arch_extension mops
+SYM_FUNC_START(__pi_memcpy)
+alternative_if_not ARM64_HAS_MOPS
+ b __pi_memcpy_generic
+alternative_else_nop_endif
+
+ mov dst, dstin
+ cpyp [dst]!, [src]!, count!
+ cpym [dst]!, [src]!, count!
+ cpye [dst]!, [src]!, count!
+ ret
SYM_FUNC_END(__pi_memcpy)
+#else
+SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic)
+#endif
SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
EXPORT_SYMBOL(__memcpy)
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index a5aebe82..97157da 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -26,6 +26,7 @@
*/
dstin .req x0
+val_x .req x1
val .req w1
count .req x2
tmp1 .req x3
@@ -42,7 +43,7 @@
tmp3w .req w9
tmp3 .req x9
-SYM_FUNC_START(__pi_memset)
+SYM_FUNC_START_LOCAL(__pi_memset_generic)
mov dst, dstin /* Preserve return value. */
and A_lw, val, #255
orr A_lw, A_lw, A_lw, lsl #8
@@ -201,7 +202,24 @@
ands count, count, zva_bits_x
b.ne .Ltail_maybe_long
ret
+SYM_FUNC_END(__pi_memset_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+ .arch_extension mops
+SYM_FUNC_START(__pi_memset)
+alternative_if_not ARM64_HAS_MOPS
+ b __pi_memset_generic
+alternative_else_nop_endif
+
+ mov dst, dstin
+ setp [dst]!, count!, val_x
+ setm [dst]!, count!, val_x
+ sete [dst]!, count!, val_x
+ ret
SYM_FUNC_END(__pi_memset)
+#else
+SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic)
+#endif
SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 2fc8c6d..fc92170 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -11,6 +11,7 @@
obj-$(CONFIG_TRANS_TABLE) += trans_pgd-asm.o
obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_ARM64_MTE) += mteswap.o
+obj-$(CONFIG_ARM64_GCS) += gcs.o
KASAN_SANITIZE_physaddr.o += n
obj-$(CONFIG_KASAN) += kasan_init.o
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index a7bb200..87b3f1a 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -18,15 +18,40 @@ void copy_highpage(struct page *to, struct page *from)
{
void *kto = page_address(to);
void *kfrom = page_address(from);
+ struct folio *src = page_folio(from);
+ struct folio *dst = page_folio(to);
+ unsigned int i, nr_pages;
copy_page(kto, kfrom);
if (kasan_hw_tags_enabled())
page_kasan_tag_reset(to);
- if (system_supports_mte() && page_mte_tagged(from)) {
+ if (!system_supports_mte())
+ return;
+
+ if (folio_test_hugetlb(src) &&
+ folio_test_hugetlb_mte_tagged(src)) {
+ if (!folio_try_hugetlb_mte_tagging(dst))
+ return;
+
+ /*
+ * Populate tags for all subpages.
+ *
+ * Don't assume the first page is head page since
+ * huge page copy may start from any subpage.
+ */
+ nr_pages = folio_nr_pages(src);
+ for (i = 0; i < nr_pages; i++) {
+ kfrom = page_address(folio_page(src, i));
+ kto = page_address(folio_page(dst, i));
+ mte_copy_page_tags(kto, kfrom);
+ }
+ folio_set_hugetlb_mte_tagged(dst);
+ } else if (page_mte_tagged(from)) {
/* It's a new page, shouldn't have been tagged yet */
WARN_ON_ONCE(!try_page_mte_tagging(to));
+
mte_copy_page_tags(kto, kfrom);
set_page_mte_tagged(to);
}
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 8b281cf..c2f89a6 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -504,6 +504,14 @@ static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
false);
}
+static bool is_gcs_fault(unsigned long esr)
+{
+ if (!esr_is_data_abort(esr))
+ return false;
+
+ return ESR_ELx_ISS2(esr) & ESR_ELx_GCS;
+}
+
static bool is_el0_instruction_abort(unsigned long esr)
{
return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
@@ -518,6 +526,23 @@ static bool is_write_abort(unsigned long esr)
return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}
+static bool is_invalid_gcs_access(struct vm_area_struct *vma, u64 esr)
+{
+ if (!system_supports_gcs())
+ return false;
+
+ if (unlikely(is_gcs_fault(esr))) {
+ /* GCS accesses must be performed on a GCS page */
+ if (!(vma->vm_flags & VM_SHADOW_STACK))
+ return true;
+ } else if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) {
+ /* Only GCS operations can write to a GCS page */
+ return esr_is_data_abort(esr) && is_write_abort(esr);
+ }
+
+ return false;
+}
+
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
struct pt_regs *regs)
{
@@ -554,6 +579,14 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
/* It was exec fault */
vm_flags = VM_EXEC;
mm_flags |= FAULT_FLAG_INSTRUCTION;
+ } else if (is_gcs_fault(esr)) {
+ /*
+ * The GCS permission on a page implies both read and
+ * write so always handle any GCS fault as a write fault,
+ * we need to trigger CoW even for GCS reads.
+ */
+ vm_flags = VM_WRITE;
+ mm_flags |= FAULT_FLAG_WRITE;
} else if (is_write_abort(esr)) {
/* It was write fault */
vm_flags = VM_WRITE;
@@ -587,6 +620,13 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
if (!vma)
goto lock_mmap;
+ if (is_invalid_gcs_access(vma, esr)) {
+ vma_end_read(vma);
+ fault = 0;
+ si_code = SEGV_ACCERR;
+ goto bad_area;
+ }
+
if (!(vma->vm_flags & vm_flags)) {
vma_end_read(vma);
fault = 0;
diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c
index de1e09d..c5c5425 100644
--- a/arch/arm64/mm/fixmap.c
+++ b/arch/arm64/mm/fixmap.c
@@ -47,7 +47,8 @@ static void __init early_fixmap_init_pte(pmd_t *pmdp, unsigned long addr)
if (pmd_none(pmd)) {
ptep = bm_pte[BM_PTE_TABLE_IDX(addr)];
- __pmd_populate(pmdp, __pa_symbol(ptep), PMD_TYPE_TABLE);
+ __pmd_populate(pmdp, __pa_symbol(ptep),
+ PMD_TYPE_TABLE | PMD_TABLE_AF);
}
}
@@ -59,7 +60,8 @@ static void __init early_fixmap_init_pmd(pud_t *pudp, unsigned long addr,
pmd_t *pmdp;
if (pud_none(pud))
- __pud_populate(pudp, __pa_symbol(bm_pmd), PUD_TYPE_TABLE);
+ __pud_populate(pudp, __pa_symbol(bm_pmd),
+ PUD_TYPE_TABLE | PUD_TABLE_AF);
pmdp = pmd_offset_kimg(pudp, addr);
do {
@@ -86,7 +88,8 @@ static void __init early_fixmap_init_pud(p4d_t *p4dp, unsigned long addr,
}
if (p4d_none(p4d))
- __p4d_populate(p4dp, __pa_symbol(bm_pud), P4D_TYPE_TABLE);
+ __p4d_populate(p4dp, __pa_symbol(bm_pud),
+ P4D_TYPE_TABLE | P4D_TABLE_AF);
pudp = pud_offset_kimg(p4dp, addr);
early_fixmap_init_pmd(pudp, addr, end);
diff --git a/arch/arm64/mm/gcs.c b/arch/arm64/mm/gcs.c
new file mode 100644
index 0000000..5c46ec5
--- /dev/null
+++ b/arch/arm64/mm/gcs.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/syscalls.h>
+#include <linux/types.h>
+
+#include <asm/cmpxchg.h>
+#include <asm/cpufeature.h>
+#include <asm/gcs.h>
+#include <asm/page.h>
+
+static unsigned long alloc_gcs(unsigned long addr, unsigned long size)
+{
+ int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+ struct mm_struct *mm = current->mm;
+ unsigned long mapped_addr, unused;
+
+ if (addr)
+ flags |= MAP_FIXED_NOREPLACE;
+
+ mmap_write_lock(mm);
+ mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
+ VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
+ mmap_write_unlock(mm);
+
+ return mapped_addr;
+}
+
+static unsigned long gcs_size(unsigned long size)
+{
+ if (size)
+ return PAGE_ALIGN(size);
+
+ /* Allocate RLIMIT_STACK/2 with limits of PAGE_SIZE..2G */
+ size = PAGE_ALIGN(min_t(unsigned long long,
+ rlimit(RLIMIT_STACK) / 2, SZ_2G));
+ return max(PAGE_SIZE, size);
+}
+
+unsigned long gcs_alloc_thread_stack(struct task_struct *tsk,
+ const struct kernel_clone_args *args)
+{
+ unsigned long addr, size;
+
+ if (!system_supports_gcs())
+ return 0;
+
+ if (!task_gcs_el0_enabled(tsk))
+ return 0;
+
+ if ((args->flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM) {
+ tsk->thread.gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0);
+ return 0;
+ }
+
+ size = args->stack_size / 2;
+
+ size = gcs_size(size);
+ addr = alloc_gcs(0, size);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ tsk->thread.gcs_base = addr;
+ tsk->thread.gcs_size = size;
+ tsk->thread.gcspr_el0 = addr + size - sizeof(u64);
+
+ return addr;
+}
+
+SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
+{
+ unsigned long alloc_size;
+ unsigned long __user *cap_ptr;
+ unsigned long cap_val;
+ int ret = 0;
+ int cap_offset;
+
+ if (!system_supports_gcs())
+ return -EOPNOTSUPP;
+
+ if (flags & ~(SHADOW_STACK_SET_TOKEN | SHADOW_STACK_SET_MARKER))
+ return -EINVAL;
+
+ if (!PAGE_ALIGNED(addr))
+ return -EINVAL;
+
+ if (size == 8 || !IS_ALIGNED(size, 8))
+ return -EINVAL;
+
+ /*
+ * An overflow would result in attempting to write the restore token
+ * to the wrong location. Not catastrophic, but just return the right
+ * error code and block it.
+ */
+ alloc_size = PAGE_ALIGN(size);
+ if (alloc_size < size)
+ return -EOVERFLOW;
+
+ addr = alloc_gcs(addr, alloc_size);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ /*
+ * Put a cap token at the end of the allocated region so it
+ * can be switched to.
+ */
+ if (flags & SHADOW_STACK_SET_TOKEN) {
+ /* Leave an extra empty frame as a top of stack marker? */
+ if (flags & SHADOW_STACK_SET_MARKER)
+ cap_offset = 2;
+ else
+ cap_offset = 1;
+
+ cap_ptr = (unsigned long __user *)(addr + size -
+ (cap_offset * sizeof(unsigned long)));
+ cap_val = GCS_CAP(cap_ptr);
+
+ put_user_gcs(cap_val, cap_ptr, &ret);
+ if (ret != 0) {
+ vm_munmap(addr, size);
+ return -EFAULT;
+ }
+
+ /*
+ * Ensure the new cap is ordered before standard
+ * memory accesses to the same location.
+ */
+ gcsb_dsync();
+ }
+
+ return addr;
+}
+
+/*
+ * Apply the GCS mode configured for the specified task to the
+ * hardware.
+ */
+void gcs_set_el0_mode(struct task_struct *task)
+{
+ u64 gcscre0_el1 = GCSCRE0_EL1_nTR;
+
+ if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)
+ gcscre0_el1 |= GCSCRE0_EL1_RVCHKEN | GCSCRE0_EL1_PCRSEL;
+
+ if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_WRITE)
+ gcscre0_el1 |= GCSCRE0_EL1_STREn;
+
+ if (task->thread.gcs_el0_mode & PR_SHADOW_STACK_PUSH)
+ gcscre0_el1 |= GCSCRE0_EL1_PUSHMEn;
+
+ write_sysreg_s(gcscre0_el1, SYS_GCSCRE0_EL1);
+}
+
+void gcs_free(struct task_struct *task)
+{
+ if (!system_supports_gcs())
+ return;
+
+ /*
+ * When fork() with CLONE_VM fails, the child (tsk) already
+ * has a GCS allocated, and exit_thread() calls this function
+ * to free it. In this case the parent (current) and the
+ * child share the same mm struct.
+ */
+ if (!task->mm || task->mm != current->mm)
+ return;
+
+ if (task->thread.gcs_base)
+ vm_munmap(task->thread.gcs_base, task->thread.gcs_size);
+
+ task->thread.gcspr_el0 = 0;
+ task->thread.gcs_base = 0;
+ task->thread.gcs_size = 0;
+}
+
+int arch_set_shadow_stack_status(struct task_struct *task, unsigned long arg)
+{
+ unsigned long gcs, size;
+ int ret;
+
+ if (!system_supports_gcs())
+ return -EINVAL;
+
+ if (is_compat_thread(task_thread_info(task)))
+ return -EINVAL;
+
+ /* Reject unknown flags */
+ if (arg & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
+ return -EINVAL;
+
+ ret = gcs_check_locked(task, arg);
+ if (ret != 0)
+ return ret;
+
+ /* If we are enabling GCS then make sure we have a stack */
+ if (arg & PR_SHADOW_STACK_ENABLE &&
+ !task_gcs_el0_enabled(task)) {
+ /* Do not allow GCS to be reenabled */
+ if (task->thread.gcs_base || task->thread.gcspr_el0)
+ return -EINVAL;
+
+ if (task != current)
+ return -EBUSY;
+
+ size = gcs_size(0);
+ gcs = alloc_gcs(0, size);
+ if (!gcs)
+ return -ENOMEM;
+
+ task->thread.gcspr_el0 = gcs + size - sizeof(u64);
+ task->thread.gcs_base = gcs;
+ task->thread.gcs_size = size;
+ if (task == current)
+ write_sysreg_s(task->thread.gcspr_el0,
+ SYS_GCSPR_EL0);
+ }
+
+ task->thread.gcs_el0_mode = arg;
+ if (task == current)
+ gcs_set_el0_mode(task);
+
+ return 0;
+}
+
+int arch_get_shadow_stack_status(struct task_struct *task,
+ unsigned long __user *arg)
+{
+ if (!system_supports_gcs())
+ return -EINVAL;
+
+ if (is_compat_thread(task_thread_info(task)))
+ return -EINVAL;
+
+ return put_user(task->thread.gcs_el0_mode, arg);
+}
+
+int arch_lock_shadow_stack_status(struct task_struct *task,
+ unsigned long arg)
+{
+ if (!system_supports_gcs())
+ return -EINVAL;
+
+ if (is_compat_thread(task_thread_info(task)))
+ return -EINVAL;
+
+ /*
+ * We support locking unknown bits so applications can prevent
+ * any changes in a future proof manner.
+ */
+ task->thread.gcs_el0_locked |= arg;
+
+ return 0;
+}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 5f1e2103..3215adf 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -361,14 +361,25 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
size_t pagesize = 1UL << shift;
- entry = pte_mkhuge(entry);
- if (pagesize == CONT_PTE_SIZE) {
- entry = pte_mkcont(entry);
- } else if (pagesize == CONT_PMD_SIZE) {
+ switch (pagesize) {
+#ifndef __PAGETABLE_PMD_FOLDED
+ case PUD_SIZE:
+ entry = pud_pte(pud_mkhuge(pte_pud(entry)));
+ break;
+#endif
+ case CONT_PMD_SIZE:
entry = pmd_pte(pmd_mkcont(pte_pmd(entry)));
- } else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) {
+ fallthrough;
+ case PMD_SIZE:
+ entry = pmd_pte(pmd_mkhuge(pte_pmd(entry)));
+ break;
+ case CONT_PTE_SIZE:
+ entry = pte_mkcont(entry);
+ break;
+ default:
pr_warn("%s: unrecognized huge page size 0x%lx\n",
__func__, pagesize);
+ break;
}
return entry;
}
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 27a32ff..d21f67d 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -41,6 +41,7 @@
#include <asm/kvm_host.h>
#include <asm/memory.h>
#include <asm/numa.h>
+#include <asm/rsi.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/sizes.h>
@@ -366,8 +367,14 @@ void __init bootmem_init(void)
*/
void __init mem_init(void)
{
+ unsigned int flags = SWIOTLB_VERBOSE;
bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
+ if (is_realm_world()) {
+ swiotlb = true;
+ flags |= SWIOTLB_FORCE;
+ }
+
if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb) {
/*
* If no bouncing needed for ZONE_DMA, reduce the swiotlb
@@ -379,7 +386,8 @@ void __init mem_init(void)
swiotlb = true;
}
- swiotlb_init(swiotlb, SWIOTLB_VERBOSE);
+ swiotlb_init(swiotlb, flags);
+ swiotlb_update_mem_attributes();
/* this will put all unused low memory onto the freelists */
memblock_free_all();
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 7e3ad97..07aeab8 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -83,8 +83,15 @@ arch_initcall(adjust_protection_map);
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
- pteval_t prot = pgprot_val(protection_map[vm_flags &
+ pteval_t prot;
+
+ /* Short circuit GCS to avoid bloating the table. */
+ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
+ prot = _PAGE_GCS_RO;
+ } else {
+ prot = pgprot_val(protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
+ }
if (vm_flags & VM_ARM64_BTI)
prot |= PTE_GP;
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index e55b02f..e2739b6 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -119,7 +119,7 @@ static phys_addr_t __init early_pgtable_alloc(int shift)
return phys;
}
-bool pgattr_change_is_safe(u64 old, u64 new)
+bool pgattr_change_is_safe(pteval_t old, pteval_t new)
{
/*
* The following mapping attributes may be updated in live
@@ -201,7 +201,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
BUG_ON(pmd_sect(pmd));
if (pmd_none(pmd)) {
- pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
+ pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
phys_addr_t pte_phys;
if (flags & NO_EXEC_MAPPINGS)
@@ -288,7 +288,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
*/
BUG_ON(pud_sect(pud));
if (pud_none(pud)) {
- pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
+ pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
phys_addr_t pmd_phys;
if (flags & NO_EXEC_MAPPINGS)
@@ -333,7 +333,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
pud_t *pudp;
if (p4d_none(p4d)) {
- p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
+ p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF;
phys_addr_t pud_phys;
if (flags & NO_EXEC_MAPPINGS)
@@ -391,7 +391,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
p4d_t *p4dp;
if (pgd_none(pgd)) {
- pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN;
+ pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF;
phys_addr_t p4d_phys;
if (flags & NO_EXEC_MAPPINGS)
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 0e270a1..6ae6ae8 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -5,10 +5,12 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/mem_encrypt.h>
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <asm/cacheflush.h>
+#include <asm/pgtable-prot.h>
#include <asm/set_memory.h>
#include <asm/tlbflush.h>
#include <asm/kfence.h>
@@ -23,14 +25,16 @@ bool rodata_full __ro_after_init = IS_ENABLED(CONFIG_RODATA_FULL_DEFAULT_ENABLED
bool can_set_direct_map(void)
{
/*
- * rodata_full and DEBUG_PAGEALLOC require linear map to be
- * mapped at page granularity, so that it is possible to
+ * rodata_full, DEBUG_PAGEALLOC and a Realm guest all require linear
+ * map to be mapped at page granularity, so that it is possible to
* protect/unprotect single pages.
*
* KFENCE pool requires page-granular mapping if initialized late.
+ *
+ * Realms need to make pages shared/protected at page granularity.
*/
return rodata_full || debug_pagealloc_enabled() ||
- arm64_kfence_can_set_direct_map();
+ arm64_kfence_can_set_direct_map() || is_realm_world();
}
static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
@@ -60,7 +64,13 @@ static int __change_memory_common(unsigned long start, unsigned long size,
ret = apply_to_page_range(&init_mm, start, size, change_page_range,
&data);
- flush_tlb_kernel_range(start, start + size);
+ /*
+ * If the memory is being made valid without changing any other bits
+ * then a TLBI isn't required as a non-valid entry cannot be cached in
+ * the TLB.
+ */
+ if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask))
+ flush_tlb_kernel_range(start, start + size);
return ret;
}
@@ -192,6 +202,86 @@ int set_direct_map_default_noflush(struct page *page)
PAGE_SIZE, change_page_range, &data);
}
+static int __set_memory_enc_dec(unsigned long addr,
+ int numpages,
+ bool encrypt)
+{
+ unsigned long set_prot = 0, clear_prot = 0;
+ phys_addr_t start, end;
+ int ret;
+
+ if (!is_realm_world())
+ return 0;
+
+ if (!__is_lm_address(addr))
+ return -EINVAL;
+
+ start = __virt_to_phys(addr);
+ end = start + numpages * PAGE_SIZE;
+
+ if (encrypt)
+ clear_prot = PROT_NS_SHARED;
+ else
+ set_prot = PROT_NS_SHARED;
+
+ /*
+ * Break the mapping before we make any changes to avoid stale TLB
+ * entries or Synchronous External Aborts caused by RIPAS_EMPTY
+ */
+ ret = __change_memory_common(addr, PAGE_SIZE * numpages,
+ __pgprot(set_prot),
+ __pgprot(clear_prot | PTE_VALID));
+
+ if (ret)
+ return ret;
+
+ if (encrypt)
+ ret = rsi_set_memory_range_protected(start, end);
+ else
+ ret = rsi_set_memory_range_shared(start, end);
+
+ if (ret)
+ return ret;
+
+ return __change_memory_common(addr, PAGE_SIZE * numpages,
+ __pgprot(PTE_VALID),
+ __pgprot(0));
+}
+
+static int realm_set_memory_encrypted(unsigned long addr, int numpages)
+{
+ int ret = __set_memory_enc_dec(addr, numpages, true);
+
+ /*
+ * If the request to change state fails, then the only sensible cause
+ * of action for the caller is to leak the memory
+ */
+ WARN(ret, "Failed to encrypt memory, %d pages will be leaked",
+ numpages);
+
+ return ret;
+}
+
+static int realm_set_memory_decrypted(unsigned long addr, int numpages)
+{
+ int ret = __set_memory_enc_dec(addr, numpages, false);
+
+ WARN(ret, "Failed to decrypt memory, %d pages will be leaked",
+ numpages);
+
+ return ret;
+}
+
+static const struct arm64_mem_crypt_ops realm_crypt_ops = {
+ .encrypt = realm_set_memory_encrypted,
+ .decrypt = realm_set_memory_decrypted,
+};
+
+int realm_register_memory_enc_ops(void)
+{
+ return arm64_mem_crypt_ops_register(&realm_crypt_ops);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 8abdc7f..b8edc57 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -465,10 +465,12 @@
*/
mair .req x17
tcr .req x16
+ tcr2 .req x15
mov_q mair, MAIR_EL1_SET
mov_q tcr, TCR_T0SZ(IDMAP_VA_BITS) | TCR_T1SZ(VA_BITS_MIN) | TCR_CACHE_FLAGS | \
TCR_SHARED | TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
TCR_TBI0 | TCR_A1 | TCR_KASAN_SW_FLAGS | TCR_MTE_FLAGS
+ mov tcr2, xzr
tcr_clear_errata_bits tcr, x9, x5
@@ -493,9 +495,14 @@
* via capabilities.
*/
mrs x9, ID_AA64MMFR1_EL1
- and x9, x9, ID_AA64MMFR1_EL1_HAFDBS_MASK
+ ubfx x9, x9, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, #4
cbz x9, 1f
orr tcr, tcr, #TCR_HA // hardware Access flag update
+#ifdef CONFIG_ARM64_HAFT
+ cmp x9, ID_AA64MMFR1_EL1_HAFDBS_HAFT
+ b.lt 1f
+ orr tcr2, tcr2, TCR2_EL1x_HAFT
+#endif /* CONFIG_ARM64_HAFT */
1:
#endif /* CONFIG_ARM64_HW_AFDBM */
msr mair_el1, mair
@@ -525,11 +532,16 @@
#undef PTE_MAYBE_NG
#undef PTE_MAYBE_SHARED
- mov x0, TCR2_EL1x_PIE
- msr REG_TCR2_EL1, x0
+ orr tcr2, tcr2, TCR2_EL1x_PIE
.Lskip_indirection:
+ mrs_s x1, SYS_ID_AA64MMFR3_EL1
+ ubfx x1, x1, #ID_AA64MMFR3_EL1_TCRX_SHIFT, #4
+ cbz x1, 1f
+ msr REG_TCR2_EL1, tcr2
+1:
+
/*
* Prepare SCTLR
*/
@@ -538,4 +550,5 @@
.unreq mair
.unreq tcr
+ .unreq tcr2
SYM_FUNC_END(__cpu_setup)
diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c
index 264c5f9..688fbe0 100644
--- a/arch/arm64/mm/ptdump.c
+++ b/arch/arm64/mm/ptdump.c
@@ -80,10 +80,10 @@ static const struct ptdump_prot_bits pte_bits[] = {
.set = "CON",
.clear = " ",
}, {
- .mask = PTE_TABLE_BIT,
- .val = PTE_TABLE_BIT,
- .set = " ",
- .clear = "BLK",
+ .mask = PTE_TABLE_BIT | PTE_VALID,
+ .val = PTE_VALID,
+ .set = "BLK",
+ .clear = " ",
}, {
.mask = PTE_UXN,
.val = PTE_UXN,
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index eedb5ac..8dfb2fa 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -29,6 +29,7 @@
HAS_FPMR
HAS_FGT
HAS_FPSIMD
+HAS_GCS
HAS_GENERIC_AUTH
HAS_GENERIC_AUTH_ARCH_QARMA3
HAS_GENERIC_AUTH_ARCH_QARMA5
@@ -56,6 +57,7 @@
HAS_VA52
HAS_VIRT_HOST_EXTN
HAS_WFXT
+HAFT
HW_DBM
KVM_HVHE
KVM_PROTECTED_MODE
diff --git a/arch/arm64/tools/syscall_32.tbl b/arch/arm64/tools/syscall_32.tbl
index 9a37930..69a8299 100644
--- a/arch/arm64/tools/syscall_32.tbl
+++ b/arch/arm64/tools/syscall_32.tbl
@@ -474,3 +474,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 8d637ac..283279a 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1238,6 +1238,7 @@
0b0110 V3P5
0b0111 V3P7
0b1000 V3P8
+ 0b1001 V3P9
0b1111 IMP_DEF
EndEnum
UnsignedEnum 7:4 TraceVer
@@ -1648,6 +1649,8 @@
UnsignedEnum 39:36 ETS
0b0000 NI
0b0001 IMP
+ 0b0010 ETS2
+ 0b0011 ETS3
EndEnum
UnsignedEnum 35:32 TWED
0b0000 NI
@@ -1688,6 +1691,8 @@
0b0000 NI
0b0001 AF
0b0010 DBM
+ 0b0011 HAFT
+ 0b0100 HDBSS
EndEnum
EndSysreg
@@ -2178,6 +2183,13 @@
Field 3:0 ALIGN
EndSysreg
+Sysreg PMUACR_EL1 3 0 9 14 4
+Res0 63:33
+Field 32 F0
+Field 31 C
+Field 30:0 P
+EndSysreg
+
Sysreg PMSELR_EL0 3 3 9 12 5
Res0 63:5
Field 4:0 SEL
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index bb35c34..d9fce0f 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -604,6 +604,9 @@
config ARCH_SUPPORTS_CRASH_DUMP
def_bool y
+config ARCH_DEFAULT_CRASH_DUMP
+ def_bool y
+
config ARCH_SELECTS_CRASH_DUMP
def_bool y
depends on CRASH_DUMP
diff --git a/arch/loongarch/include/asm/kasan.h b/arch/loongarch/include/asm/kasan.h
index c6bce5f..7f52bd3 100644
--- a/arch/loongarch/include/asm/kasan.h
+++ b/arch/loongarch/include/asm/kasan.h
@@ -25,6 +25,7 @@
/* 64-bit segment value. */
#define XKPRANGE_UC_SEG (0x8000)
#define XKPRANGE_CC_SEG (0x9000)
+#define XKPRANGE_WC_SEG (0xa000)
#define XKVRANGE_VC_SEG (0xffff)
/* Cached */
@@ -41,20 +42,28 @@
#define XKPRANGE_UC_SHADOW_SIZE (XKPRANGE_UC_SIZE >> KASAN_SHADOW_SCALE_SHIFT)
#define XKPRANGE_UC_SHADOW_END (XKPRANGE_UC_KASAN_OFFSET + XKPRANGE_UC_SHADOW_SIZE)
+/* WriteCombine */
+#define XKPRANGE_WC_START WRITECOMBINE_BASE
+#define XKPRANGE_WC_SIZE XRANGE_SIZE
+#define XKPRANGE_WC_KASAN_OFFSET XKPRANGE_UC_SHADOW_END
+#define XKPRANGE_WC_SHADOW_SIZE (XKPRANGE_WC_SIZE >> KASAN_SHADOW_SCALE_SHIFT)
+#define XKPRANGE_WC_SHADOW_END (XKPRANGE_WC_KASAN_OFFSET + XKPRANGE_WC_SHADOW_SIZE)
+
/* VMALLOC (Cached or UnCached) */
#define XKVRANGE_VC_START MODULES_VADDR
#define XKVRANGE_VC_SIZE round_up(KFENCE_AREA_END - MODULES_VADDR + 1, PGDIR_SIZE)
-#define XKVRANGE_VC_KASAN_OFFSET XKPRANGE_UC_SHADOW_END
+#define XKVRANGE_VC_KASAN_OFFSET XKPRANGE_WC_SHADOW_END
#define XKVRANGE_VC_SHADOW_SIZE (XKVRANGE_VC_SIZE >> KASAN_SHADOW_SCALE_SHIFT)
#define XKVRANGE_VC_SHADOW_END (XKVRANGE_VC_KASAN_OFFSET + XKVRANGE_VC_SHADOW_SIZE)
/* KAsan shadow memory start right after vmalloc. */
#define KASAN_SHADOW_START round_up(KFENCE_AREA_END, PGDIR_SIZE)
#define KASAN_SHADOW_SIZE (XKVRANGE_VC_SHADOW_END - XKPRANGE_CC_KASAN_OFFSET)
-#define KASAN_SHADOW_END round_up(KASAN_SHADOW_START + KASAN_SHADOW_SIZE, PGDIR_SIZE)
+#define KASAN_SHADOW_END (round_up(KASAN_SHADOW_START + KASAN_SHADOW_SIZE, PGDIR_SIZE) - 1)
#define XKPRANGE_CC_SHADOW_OFFSET (KASAN_SHADOW_START + XKPRANGE_CC_KASAN_OFFSET)
#define XKPRANGE_UC_SHADOW_OFFSET (KASAN_SHADOW_START + XKPRANGE_UC_KASAN_OFFSET)
+#define XKPRANGE_WC_SHADOW_OFFSET (KASAN_SHADOW_START + XKPRANGE_WC_KASAN_OFFSET)
#define XKVRANGE_VC_SHADOW_OFFSET (KASAN_SHADOW_START + XKVRANGE_VC_KASAN_OFFSET)
extern bool kasan_early_stage;
diff --git a/arch/loongarch/include/asm/page.h b/arch/loongarch/include/asm/page.h
index e85df33..8f21567 100644
--- a/arch/loongarch/include/asm/page.h
+++ b/arch/loongarch/include/asm/page.h
@@ -113,10 +113,7 @@ struct page *tlb_virt_to_page(unsigned long kaddr);
extern int __virt_addr_valid(volatile void *kaddr);
#define virt_addr_valid(kaddr) __virt_addr_valid((volatile void *)(kaddr))
-#define VM_DATA_DEFAULT_FLAGS \
- (VM_READ | VM_WRITE | \
- ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
- VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c
index f1a74b8..382a09a7 100644
--- a/arch/loongarch/kernel/acpi.c
+++ b/arch/loongarch/kernel/acpi.c
@@ -58,48 +58,48 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
return ioremap_cache(phys, size);
}
-static int cpu_enumerated = 0;
-
#ifdef CONFIG_SMP
-static int set_processor_mask(u32 id, u32 flags)
+static int set_processor_mask(u32 id, u32 pass)
{
- int nr_cpus;
- int cpu, cpuid = id;
+ int cpu = -1, cpuid = id;
- if (!cpu_enumerated)
- nr_cpus = NR_CPUS;
- else
- nr_cpus = nr_cpu_ids;
-
- if (num_processors >= nr_cpus) {
+ if (num_processors >= NR_CPUS) {
pr_warn(PREFIX "nr_cpus limit of %i reached."
- " processor 0x%x ignored.\n", nr_cpus, cpuid);
+ " processor 0x%x ignored.\n", NR_CPUS, cpuid);
return -ENODEV;
}
+
if (cpuid == loongson_sysconf.boot_cpu_id)
cpu = 0;
- else
- cpu = find_first_zero_bit(cpumask_bits(cpu_present_mask), NR_CPUS);
- if (!cpu_enumerated)
- set_cpu_possible(cpu, true);
-
- if (flags & ACPI_MADT_ENABLED) {
+ switch (pass) {
+ case 1: /* Pass 1 handle enabled processors */
+ if (cpu < 0)
+ cpu = find_first_zero_bit(cpumask_bits(cpu_present_mask), NR_CPUS);
num_processors++;
set_cpu_present(cpu, true);
- __cpu_number_map[cpuid] = cpu;
- __cpu_logical_map[cpu] = cpuid;
- } else
+ break;
+ case 2: /* Pass 2 handle disabled processors */
+ if (cpu < 0)
+ cpu = find_first_zero_bit(cpumask_bits(cpu_possible_mask), NR_CPUS);
disabled_cpus++;
+ break;
+ default:
+ return cpu;
+ }
+
+ set_cpu_possible(cpu, true);
+ __cpu_number_map[cpuid] = cpu;
+ __cpu_logical_map[cpu] = cpuid;
return cpu;
}
#endif
static int __init
-acpi_parse_processor(union acpi_subtable_headers *header, const unsigned long end)
+acpi_parse_p1_processor(union acpi_subtable_headers *header, const unsigned long end)
{
struct acpi_madt_core_pic *processor = NULL;
@@ -110,13 +110,30 @@ acpi_parse_processor(union acpi_subtable_headers *header, const unsigned long en
acpi_table_print_madt_entry(&header->common);
#ifdef CONFIG_SMP
acpi_core_pic[processor->core_id] = *processor;
- set_processor_mask(processor->core_id, processor->flags);
+ if (processor->flags & ACPI_MADT_ENABLED)
+ set_processor_mask(processor->core_id, 1);
#endif
return 0;
}
static int __init
+acpi_parse_p2_processor(union acpi_subtable_headers *header, const unsigned long end)
+{
+ struct acpi_madt_core_pic *processor = NULL;
+
+ processor = (struct acpi_madt_core_pic *)header;
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+#ifdef CONFIG_SMP
+ if (!(processor->flags & ACPI_MADT_ENABLED))
+ set_processor_mask(processor->core_id, 2);
+#endif
+
+ return 0;
+}
+static int __init
acpi_parse_eio_master(union acpi_subtable_headers *header, const unsigned long end)
{
static int core = 0;
@@ -143,12 +160,14 @@ static void __init acpi_process_madt(void)
}
#endif
acpi_table_parse_madt(ACPI_MADT_TYPE_CORE_PIC,
- acpi_parse_processor, MAX_CORE_PIC);
+ acpi_parse_p1_processor, MAX_CORE_PIC);
+
+ acpi_table_parse_madt(ACPI_MADT_TYPE_CORE_PIC,
+ acpi_parse_p2_processor, MAX_CORE_PIC);
acpi_table_parse_madt(ACPI_MADT_TYPE_EIO_PIC,
acpi_parse_eio_master, MAX_IO_PICS);
- cpu_enumerated = 1;
loongson_sysconf.nr_cpus = num_processors;
}
@@ -310,6 +329,10 @@ static int __ref acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
int nid;
nid = acpi_get_node(handle);
+
+ if (nid != NUMA_NO_NODE)
+ nid = early_cpu_to_node(cpu);
+
if (nid != NUMA_NO_NODE) {
set_cpuid_to_node(physid, nid);
node_set(nid, numa_nodes_parsed);
@@ -324,12 +347,14 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu
{
int cpu;
- cpu = set_processor_mask(physid, ACPI_MADT_ENABLED);
- if (cpu < 0) {
+ cpu = cpu_number_map(physid);
+ if (cpu < 0 || cpu >= nr_cpu_ids) {
pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
- return cpu;
+ return -ERANGE;
}
+ num_processors++;
+ set_cpu_present(cpu, true);
acpi_map_cpu2node(handle, cpu, physid);
*pcpu = cpu;
diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
index a5fc61f..e5a39bb 100644
--- a/arch/loongarch/kernel/paravirt.c
+++ b/arch/loongarch/kernel/paravirt.c
@@ -51,11 +51,18 @@ static u64 paravt_steal_clock(int cpu)
}
#ifdef CONFIG_SMP
+static struct smp_ops native_ops;
+
static void pv_send_ipi_single(int cpu, unsigned int action)
{
int min, old;
irq_cpustat_t *info = &per_cpu(irq_stat, cpu);
+ if (unlikely(action == ACTION_BOOT_CPU)) {
+ native_ops.send_ipi_single(cpu, action);
+ return;
+ }
+
old = atomic_fetch_or(BIT(action), &info->message);
if (old)
return;
@@ -75,6 +82,11 @@ static void pv_send_ipi_mask(const struct cpumask *mask, unsigned int action)
if (cpumask_empty(mask))
return;
+ if (unlikely(action == ACTION_BOOT_CPU)) {
+ native_ops.send_ipi_mask(mask, action);
+ return;
+ }
+
action = BIT(action);
for_each_cpu(i, mask) {
info = &per_cpu(irq_stat, i);
@@ -147,6 +159,8 @@ static void pv_init_ipi(void)
{
int r, swi;
+ /* Init native ipi irq for ACTION_BOOT_CPU */
+ native_ops.init_ipi();
swi = get_percpu_irq(INT_SWI0);
if (swi < 0)
panic("SWI0 IRQ mapping failed\n");
@@ -193,6 +207,7 @@ int __init pv_ipi_init(void)
return 0;
#ifdef CONFIG_SMP
+ native_ops = mp_ops;
mp_ops.init_ipi = pv_init_ipi;
mp_ops.send_ipi_single = pv_send_ipi_single;
mp_ops.send_ipi_mask = pv_send_ipi_mask;
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index 9afc2d8..5d59e9c 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -302,7 +302,7 @@ static void __init fdt_smp_setup(void)
__cpu_number_map[cpuid] = cpu;
__cpu_logical_map[cpu] = cpuid;
- early_numa_add_cpu(cpu, 0);
+ early_numa_add_cpu(cpuid, 0);
set_cpuid_to_node(cpuid, 0);
}
@@ -331,11 +331,11 @@ void __init loongson_prepare_cpus(unsigned int max_cpus)
int i = 0;
parse_acpi_topology();
+ cpu_data[0].global_id = cpu_logical_map(0);
for (i = 0; i < loongson_sysconf.nr_cpus; i++) {
set_cpu_present(i, true);
csr_mail_send(0, __cpu_logical_map[i], 0);
- cpu_data[i].global_id = __cpu_logical_map[i];
}
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
@@ -380,6 +380,7 @@ void loongson_init_secondary(void)
cpu_logical_map(cpu) / loongson_sysconf.cores_per_package;
cpu_data[cpu].core = pptt_enabled ? cpu_data[cpu].core :
cpu_logical_map(cpu) % loongson_sysconf.cores_per_package;
+ cpu_data[cpu].global_id = cpu_logical_map(cpu);
}
void loongson_smp_finish(void)
diff --git a/arch/loongarch/mm/kasan_init.c b/arch/loongarch/mm/kasan_init.c
index 427d6b1..d268127 100644
--- a/arch/loongarch/mm/kasan_init.c
+++ b/arch/loongarch/mm/kasan_init.c
@@ -13,6 +13,13 @@
static pgd_t kasan_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
+#ifdef __PAGETABLE_P4D_FOLDED
+#define __pgd_none(early, pgd) (0)
+#else
+#define __pgd_none(early, pgd) (early ? (pgd_val(pgd) == 0) : \
+(__pa(pgd_val(pgd)) == (unsigned long)__pa(kasan_early_shadow_p4d)))
+#endif
+
#ifdef __PAGETABLE_PUD_FOLDED
#define __p4d_none(early, p4d) (0)
#else
@@ -55,6 +62,9 @@ void *kasan_mem_to_shadow(const void *addr)
case XKPRANGE_UC_SEG:
offset = XKPRANGE_UC_SHADOW_OFFSET;
break;
+ case XKPRANGE_WC_SEG:
+ offset = XKPRANGE_WC_SHADOW_OFFSET;
+ break;
case XKVRANGE_VC_SEG:
offset = XKVRANGE_VC_SHADOW_OFFSET;
break;
@@ -79,6 +89,8 @@ const void *kasan_shadow_to_mem(const void *shadow_addr)
if (addr >= XKVRANGE_VC_SHADOW_OFFSET)
return (void *)(((addr - XKVRANGE_VC_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT) + XKVRANGE_VC_START);
+ else if (addr >= XKPRANGE_WC_SHADOW_OFFSET)
+ return (void *)(((addr - XKPRANGE_WC_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT) + XKPRANGE_WC_START);
else if (addr >= XKPRANGE_UC_SHADOW_OFFSET)
return (void *)(((addr - XKPRANGE_UC_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT) + XKPRANGE_UC_START);
else if (addr >= XKPRANGE_CC_SHADOW_OFFSET)
@@ -142,6 +154,19 @@ static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node,
return pud_offset(p4dp, addr);
}
+static p4d_t *__init kasan_p4d_offset(pgd_t *pgdp, unsigned long addr, int node, bool early)
+{
+ if (__pgd_none(early, pgdp_get(pgdp))) {
+ phys_addr_t p4d_phys = early ?
+ __pa_symbol(kasan_early_shadow_p4d) : kasan_alloc_zeroed_page(node);
+ if (!early)
+ memcpy(__va(p4d_phys), kasan_early_shadow_p4d, sizeof(kasan_early_shadow_p4d));
+ pgd_populate(&init_mm, pgdp, (p4d_t *)__va(p4d_phys));
+ }
+
+ return p4d_offset(pgdp, addr);
+}
+
static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,
unsigned long end, int node, bool early)
{
@@ -178,19 +203,19 @@ static void __init kasan_pud_populate(p4d_t *p4dp, unsigned long addr,
do {
next = pud_addr_end(addr, end);
kasan_pmd_populate(pudp, addr, next, node, early);
- } while (pudp++, addr = next, addr != end);
+ } while (pudp++, addr = next, addr != end && __pud_none(early, READ_ONCE(*pudp)));
}
static void __init kasan_p4d_populate(pgd_t *pgdp, unsigned long addr,
unsigned long end, int node, bool early)
{
unsigned long next;
- p4d_t *p4dp = p4d_offset(pgdp, addr);
+ p4d_t *p4dp = kasan_p4d_offset(pgdp, addr, node, early);
do {
next = p4d_addr_end(addr, end);
kasan_pud_populate(p4dp, addr, next, node, early);
- } while (p4dp++, addr = next, addr != end);
+ } while (p4dp++, addr = next, addr != end && __p4d_none(early, READ_ONCE(*p4dp)));
}
static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
@@ -218,7 +243,7 @@ static void __init kasan_map_populate(unsigned long start, unsigned long end,
asmlinkage void __init kasan_early_init(void)
{
BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE));
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE));
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END + 1, PGDIR_SIZE));
}
static inline void kasan_set_pgd(pgd_t *pgdp, pgd_t pgdval)
@@ -233,7 +258,7 @@ static void __init clear_pgds(unsigned long start, unsigned long end)
* swapper_pg_dir. pgd_clear() can't be used
* here because it's nop on 2,3-level pagetable setups
*/
- for (; start < end; start += PGDIR_SIZE)
+ for (; start < end; start = pgd_addr_end(start, end))
kasan_set_pgd((pgd_t *)pgd_offset_k(start), __pgd(0));
}
@@ -243,6 +268,17 @@ void __init kasan_init(void)
phys_addr_t pa_start, pa_end;
/*
+ * If PGDIR_SIZE is too large for cpu_vabits, KASAN_SHADOW_END will
+ * overflow UINTPTR_MAX and then looks like a user space address.
+ * For example, PGDIR_SIZE of CONFIG_4KB_4LEVEL is 2^39, which is too
+ * large for Loongson-2K series whose cpu_vabits = 39.
+ */
+ if (KASAN_SHADOW_END < vm_map_base) {
+ pr_warn("PGDIR_SIZE too large for cpu_vabits, KernelAddressSanitizer disabled.\n");
+ return;
+ }
+
+ /*
* PGD was populated as invalid_pmd_table or invalid_pud_table
* in pagetable_init() which depends on how many levels of page
* table you are using, but we had to clean the gpd of kasan
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index cc26df9..7c4f7bc 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -84,24 +84,23 @@
support by paged memory management. If unsure, say 'Y'.
config MMU_MOTOROLA
- bool
+ def_bool MMU && M68KCLASSIC
select HAVE_PAGE_SIZE_4KB
config MMU_COLDFIRE
+ def_bool MMU && COLDFIRE
select HAVE_PAGE_SIZE_8KB
- bool
config MMU_SUN3
- bool
+ def_bool MMU && SUN3
select HAVE_PAGE_SIZE_8KB
- depends on MMU && !MMU_MOTOROLA && !MMU_COLDFIRE
config ARCH_SUPPORTS_KEXEC
- def_bool M68KCLASSIC && MMU
+ def_bool (M68KCLASSIC || SUN3) && MMU
config BOOTINFO_PROC
bool "Export bootinfo in procfs"
- depends on KEXEC && M68KCLASSIC
+ depends on KEXEC && (M68KCLASSIC || SUN3)
help
Say Y to export the bootinfo used to boot the kernel in a
"bootinfo" file in procfs. This is useful with kexec.
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index c777a12..c9a7e60 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -2,7 +2,7 @@
comment "Processor Type"
choice
- prompt "CPU family support"
+ prompt "CPU/machine family support"
default M68KCLASSIC if MMU
default COLDFIRE if !MMU
help
@@ -19,8 +19,9 @@
processor, select COLDFIRE.
config M68KCLASSIC
- bool "Classic M68K CPU family support"
+ bool "Classic M68K CPU/machine family support"
select HAVE_ARCH_PFN_VALID
+ select M68020 if MMU && !(M68030 || M68040 || M68060)
config COLDFIRE
bool "Coldfire CPU family support"
@@ -32,13 +33,23 @@
select HAVE_LEGACY_CLK
select HAVE_PAGE_SIZE_8KB if !MMU
+config SUN3
+ bool "Sun3 machine support"
+ depends on MMU
+ select HAVE_ARCH_PFN_VALID
+ select LEGACY_TIMER_TICK
+ select NO_DMA
+ select M68020
+ help
+ This option enables support for the Sun 3 series of workstations
+ (3/50, 3/60, 3/1xx, 3/2xx systems). These use a classic 68020 CPU
+ but the custom memory management unit makes them incompatible with
+ all other classic m68k machines, including Sun 3x.
+
endchoice
-if M68KCLASSIC
-
config M68000
- def_bool y
- depends on !MMU
+ def_bool M68KCLASSIC && !MMU
select CPU_HAS_NO_BITFIELDS
select CPU_HAS_NO_CAS
select CPU_HAS_NO_MULDIV64
@@ -56,7 +67,7 @@
a paging MMU.
config M68020
- bool "68020 support"
+ bool "68020 support" if M68KCLASSIC
depends on MMU
select FPU
select CPU_HAS_ADDRESS_SPACES
@@ -66,9 +77,10 @@
68851 MMU (Memory Management Unit) to run Linux/m68k, except on the
Sun 3, which provides its own version.
+if M68KCLASSIC && MMU
+
config M68030
bool "68030 support"
- depends on MMU && !MMU_SUN3
select FPU
select CPU_HAS_ADDRESS_SPACES
help
@@ -78,7 +90,6 @@
config M68040
bool "68040 support"
- depends on MMU && !MMU_SUN3
select FPU
select CPU_HAS_ADDRESS_SPACES
help
@@ -89,13 +100,14 @@
config M68060
bool "68060 support"
- depends on MMU && !MMU_SUN3
select FPU
select CPU_HAS_ADDRESS_SPACES
help
If you anticipate running this kernel on a computer with a MC68060
processor, say Y. Otherwise, say N.
+endif # M68KCLASSIC
+
config M68328
bool
depends on !MMU
@@ -117,8 +129,6 @@
help
Motorola 68VZ328 processor support.
-endif # M68KCLASSIC
-
if COLDFIRE
choice
@@ -325,7 +335,7 @@
config M68KFPU_EMU
bool "Math emulation support"
- depends on M68KCLASSIC && FPU
+ depends on (M68KCLASSIC || SUN3) && FPU
help
At some point in the future, this will cause floating-point math
instructions to be emulated by the kernel on machines that lack a
diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine
index d06b1c5..de39f23 100644
--- a/arch/m68k/Kconfig.machine
+++ b/arch/m68k/Kconfig.machine
@@ -6,7 +6,6 @@
config AMIGA
bool "Amiga support"
depends on MMU
- select MMU_MOTOROLA if MMU
select LEGACY_TIMER_TICK
help
This option enables support for the Amiga series of computers. If
@@ -16,7 +15,6 @@
config ATARI
bool "Atari support"
depends on MMU
- select MMU_MOTOROLA if MMU
select HAVE_ARCH_NVRAM_OPS
select LEGACY_TIMER_TICK
help
@@ -31,7 +29,6 @@
config MAC
bool "Macintosh support"
depends on MMU
- select MMU_MOTOROLA if MMU
select HAVE_ARCH_NVRAM_OPS
select HAVE_PATA_PLATFORM
select LEGACY_TIMER_TICK
@@ -44,7 +41,6 @@
config APOLLO
bool "Apollo support"
depends on MMU
- select MMU_MOTOROLA if MMU
select LEGACY_TIMER_TICK
help
Say Y here if you want to run Linux on an MC680x0-based Apollo
@@ -53,7 +49,6 @@
config VME
bool "VME (Motorola and BVM) support"
depends on MMU
- select MMU_MOTOROLA if MMU
help
Say Y here if you want to build a kernel for a 680x0 based VME
board. Boards currently supported include Motorola boards MVME147,
@@ -97,7 +92,6 @@
config HP300
bool "HP9000/300 and HP9000/400 support"
depends on MMU
- select MMU_MOTOROLA if MMU
select LEGACY_TIMER_TICK
help
This option enables support for the HP9000/300 and HP9000/400 series
@@ -110,7 +104,6 @@
bool "Sun3x support"
depends on MMU
select LEGACY_TIMER_TICK
- select MMU_MOTOROLA if MMU
select M68030
help
This option enables support for the Sun 3x series of workstations.
@@ -124,7 +117,6 @@
config Q40
bool "Q40/Q60 support"
depends on MMU
- select MMU_MOTOROLA if MMU
select LEGACY_TIMER_TICK
help
The Q40 is a Motorola 68040-based successor to the Sinclair QL
@@ -133,22 +125,6 @@
Q60. Select your CPU below. For 68LC060 don't forget to enable FPU
emulation.
-config SUN3
- bool "Sun3 support"
- depends on MMU
- depends on !MMU_MOTOROLA
- select MMU_SUN3 if MMU
- select LEGACY_TIMER_TICK
- select NO_DMA
- select M68020
- help
- This option enables support for the Sun 3 series of workstations
- (3/50, 3/60, 3/1xx, 3/2xx systems). Enabling this option requires
- that all other hardware types must be disabled, as Sun 3 kernels
- are incompatible with all other m68k targets (including Sun 3x!).
-
- If you don't want to compile a kernel exclusively for a Sun 3, say N.
-
config VIRT
bool "Virtual M68k Machine support"
depends on MMU
@@ -157,7 +133,6 @@
select GOLDFISH_TIMER
select GOLDFISH_TTY
select M68040
- select MMU_MOTOROLA if MMU
select RTC_CLASS
select RTC_DRV_GOLDFISH
select TTY
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index d01dc47..a70aec9a 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -620,6 +620,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 46808e5..312853f 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -577,6 +577,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 4469a78..0853e43 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -597,6 +597,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index c071932..f738202 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -569,6 +569,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 8d429e6..74f74e0 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -579,6 +579,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index bafd33d..14c8f1b 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -596,6 +596,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 6f5ca3f..41c8112 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -682,6 +682,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index d16b328..e72d37e 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -568,6 +568,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 80f6c15..733f1fc 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -569,6 +569,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 0e81589..3efe254 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -586,6 +586,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index 8cd7852..1b8ea0e7 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -566,6 +566,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 7803536..5bda93f 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -567,6 +567,7 @@
CONFIG_TEST_DHRY=m
CONFIG_TEST_MIN_HEAP=m
CONFIG_TEST_DIV64=m
+CONFIG_TEST_MULDIV64=m
CONFIG_REED_SOLOMON_TEST=m
CONFIG_ATOMIC64_SELFTEST=m
CONFIG_ASYNC_RAID6_TEST=m
diff --git a/arch/m68k/include/asm/irq.h b/arch/m68k/include/asm/irq.h
index 14992fd..2263e92 100644
--- a/arch/m68k/include/asm/irq.h
+++ b/arch/m68k/include/asm/irq.h
@@ -28,10 +28,8 @@
#define NR_IRQS 32
#elif defined(CONFIG_APOLLO)
#define NR_IRQS 24
-#elif defined(CONFIG_HP300)
+#else /* CONFIG_HP300 etc. */
#define NR_IRQS 8
-#else
-#define NR_IRQS 0
#endif
#if defined(CONFIG_M68020) || defined(CONFIG_M68030) || \
diff --git a/arch/m68k/include/asm/mvme147hw.h b/arch/m68k/include/asm/mvme147hw.h
index e28eb1c..dbf8805 100644
--- a/arch/m68k/include/asm/mvme147hw.h
+++ b/arch/m68k/include/asm/mvme147hw.h
@@ -93,8 +93,8 @@ struct pcc_regs {
#define M147_SCC_B_ADDR 0xfffe3000
#define M147_SCC_PCLK 5000000
-#define MVME147_IRQ_SCSI_PORT (IRQ_USER+0x45)
-#define MVME147_IRQ_SCSI_DMA (IRQ_USER+0x46)
+#define MVME147_IRQ_SCSI_PORT (IRQ_USER + 5)
+#define MVME147_IRQ_SCSI_DMA (IRQ_USER + 6)
/* SCC interrupts, for MVME147 */
diff --git a/arch/m68k/kernel/Makefile b/arch/m68k/kernel/Makefile
index f335bf3..6c732ed 100644
--- a/arch/m68k/kernel/Makefile
+++ b/arch/m68k/kernel/Makefile
@@ -5,16 +5,8 @@
extra-y += vmlinux.lds
-obj-$(CONFIG_AMIGA) := head.o
-obj-$(CONFIG_ATARI) := head.o
-obj-$(CONFIG_MAC) := head.o
-obj-$(CONFIG_APOLLO) := head.o
-obj-$(CONFIG_VME) := head.o
-obj-$(CONFIG_HP300) := head.o
-obj-$(CONFIG_Q40) := head.o
-obj-$(CONFIG_SUN3X) := head.o
-obj-$(CONFIG_VIRT) := head.o
-obj-$(CONFIG_SUN3) := sun3-head.o
+obj-$(CONFIG_MMU_MOTOROLA) := head.o
+obj-$(CONFIG_SUN3) := sun3-head.o
obj-y += entry.o irq.o module.o process.o ptrace.o
obj-y += setup.o signal.o sys_m68k.o syscalltable.o time.o traps.o
diff --git a/arch/m68k/kernel/early_printk.c b/arch/m68k/kernel/early_printk.c
index 3cc944d..f11ef9f 100644
--- a/arch/m68k/kernel/early_printk.c
+++ b/arch/m68k/kernel/early_printk.c
@@ -13,6 +13,7 @@
#include <asm/setup.h>
+#include "../mvme147/mvme147.h"
#include "../mvme16x/mvme16x.h"
asmlinkage void __init debug_cons_nputs(const char *s, unsigned n);
@@ -22,7 +23,9 @@ static void __ref debug_cons_write(struct console *c,
{
#if !(defined(CONFIG_SUN3) || defined(CONFIG_M68000) || \
defined(CONFIG_COLDFIRE))
- if (MACH_IS_MVME16x)
+ if (MACH_IS_MVME147)
+ mvme147_scc_write(c, s, n);
+ else if (MACH_IS_MVME16x)
mvme16x_cons_write(c, s, n);
else
debug_cons_nputs(s, n);
diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c
index 10310b0..15c1a59 100644
--- a/arch/m68k/kernel/setup_mm.c
+++ b/arch/m68k/kernel/setup_mm.c
@@ -249,7 +249,11 @@ void __init setup_arch(char **cmdline_p)
process_uboot_commandline(&m68k_command_line[0], CL_SIZE);
*cmdline_p = m68k_command_line;
memcpy(boot_command_line, *cmdline_p, CL_SIZE);
-
+ /*
+ * Initialise the static keys early as they may be enabled by the
+ * cpufeature code and early parameters.
+ */
+ jump_label_init();
parse_early_param();
switch (m68k_machtype) {
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 22a3cbd..f5ed71f 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -462,3 +462,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index a97600b..acd4c2d 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -62,7 +62,7 @@ void timer_heartbeat(void)
}
#endif /* CONFIG_HEARTBEAT */
-#ifdef CONFIG_M68KCLASSIC
+#if defined(CONFIG_M68KCLASSIC) || defined(CONFIG_SUN3)
/* machine dependent timer functions */
int (*mach_hwclk) (int, struct rtc_time*);
EXPORT_SYMBOL(mach_hwclk);
@@ -149,7 +149,7 @@ static int __init rtc_init(void)
module_init(rtc_init);
#endif /* CONFIG_RTC_DRV_GENERIC */
-#endif /* CONFIG M68KCLASSIC */
+#endif /* CONFIG_M68KCLASSIC || SUN3 */
void __init time_init(void)
{
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index 53d0cf3..d2f25e8 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -383,7 +383,7 @@ static inline void bus_error030 (struct frame *fp)
fp->ptregs.format == 0xa ? fp->ptregs.pc + 4 : fp->un.fmtb.baddr);
if (ssw & DF)
pr_debug("Data %s fault at %#010lx in %s (pc=%#lx)\n",
- ssw & RW ? "read" : "write",
+ str_read_write(ssw & RW),
fp->un.fmtb.daddr,
space_names[ssw & DFC], fp->ptregs.pc);
@@ -419,7 +419,7 @@ static inline void bus_error030 (struct frame *fp)
}
pr_err("Data %s fault at %#010lx in %s (pc=%#lx)\n",
- ssw & RW ? "read" : "write",
+ str_read_write(ssw & RW),
fp->un.fmtb.daddr,
space_names[ssw & DFC], fp->ptregs.pc);
}
@@ -455,7 +455,7 @@ static inline void bus_error030 (struct frame *fp)
pr_debug("*** unexpected busfault type=%#04x\n",
buserr_type);
pr_debug("invalid %s access at %#lx from pc %#lx\n",
- !(ssw & RW) ? "write" : "read", addr,
+ str_read_write(ssw & RW), addr,
fp->ptregs.pc);
die_if_kernel ("Oops", &fp->ptregs, buserr_type);
force_sig (SIGBUS);
@@ -514,7 +514,7 @@ static inline void bus_error030 (struct frame *fp)
fp->ptregs.format == 0xa ? fp->ptregs.pc + 4 : fp->un.fmtb.baddr);
if (ssw & DF)
pr_debug("Data %s fault at %#010lx in %s (pc=%#lx)\n",
- ssw & RW ? "read" : "write",
+ str_read_write(ssw & RW),
fp->un.fmtb.daddr,
space_names[ssw & DFC], fp->ptregs.pc);
@@ -548,7 +548,7 @@ static inline void bus_error030 (struct frame *fp)
/* We might have an exception table for this PC */
if (ssw & 4 && !search_exception_tables(fp->ptregs.pc)) {
pr_err("Data %s fault at %#010lx in %s (pc=%#lx)\n",
- ssw & RW ? "read" : "write",
+ str_read_write(ssw & RW),
fp->un.fmtb.daddr,
space_names[ssw & DFC], fp->ptregs.pc);
goto buserr;
@@ -564,7 +564,7 @@ static inline void bus_error030 (struct frame *fp)
mmusr);
} else if (mmusr & (MMU_B|MMU_L|MMU_S)) {
pr_err("invalid %s access at %#lx from pc %#lx\n",
- !(ssw & RW) ? "write" : "read", addr,
+ str_read_write(ssw & RW), addr,
fp->ptregs.pc);
die_if_kernel("Oops",&fp->ptregs,mmusr);
force_sig(SIGSEGV);
@@ -575,7 +575,7 @@ static inline void bus_error030 (struct frame *fp)
#endif
pr_err("weird %s access at %#lx from pc %#lx (ssw is %#x)\n",
- !(ssw & RW) ? "write" : "read", addr,
+ str_read_write(ssw & RW), addr,
fp->ptregs.pc, ssw);
asm volatile ("ptestr #1,%1@,#0\n\t"
"pmove %%psr,%0"
@@ -991,7 +991,7 @@ static void bad_super_trap(struct frame *fp)
fp->ptregs.pc + 4 : fp->un.fmtb.baddr);
if (ssw & DF)
pr_err("Data %s fault at %#010lx in %s (pc=%#lx)\n",
- ssw & RW ? "read" : "write",
+ str_read_write(ssw & RW),
fp->un.fmtb.daddr, space_names[ssw & DFC],
fp->ptregs.pc);
}
diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c
index 8b5dc07..824c42a 100644
--- a/arch/m68k/mvme147/config.c
+++ b/arch/m68k/mvme147/config.c
@@ -32,9 +32,10 @@
#include <asm/mvme147hw.h>
#include <asm/config.h>
+#include "mvme147.h"
static void mvme147_get_model(char *model);
-extern void mvme147_sched_init(void);
+static void __init mvme147_sched_init(void);
extern int mvme147_hwclk (int, struct rtc_time *);
extern void mvme147_reset (void);
@@ -123,7 +124,7 @@ static irqreturn_t mvme147_timer_int (int irq, void *dev_id)
}
-void mvme147_sched_init (void)
+static void __init mvme147_sched_init(void)
{
if (request_irq(PCC_IRQ_TIMER1, mvme147_timer_int, IRQF_TIMER,
"timer 1", NULL))
@@ -185,3 +186,32 @@ int mvme147_hwclk(int op, struct rtc_time *t)
}
return 0;
}
+
+static void scc_delay(void)
+{
+ __asm__ __volatile__ ("nop; nop;");
+}
+
+static void scc_write(char ch)
+{
+ do {
+ scc_delay();
+ } while (!(in_8(M147_SCC_A_ADDR) & BIT(2)));
+ scc_delay();
+ out_8(M147_SCC_A_ADDR, 8);
+ scc_delay();
+ out_8(M147_SCC_A_ADDR, ch);
+}
+
+void mvme147_scc_write(struct console *co, const char *str, unsigned int count)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ while (count--) {
+ if (*str == '\n')
+ scc_write('\r');
+ scc_write(*str++);
+ }
+ local_irq_restore(flags);
+}
diff --git a/arch/m68k/mvme147/mvme147.h b/arch/m68k/mvme147/mvme147.h
new file mode 100644
index 0000000..140bc98
--- /dev/null
+++ b/arch/m68k/mvme147/mvme147.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+struct console;
+
+/* config.c */
+void mvme147_scc_write(struct console *co, const char *str, unsigned int count);
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 2b81a6b..680f568 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -468,3 +468,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 397edf0..467b10f 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2876,6 +2876,9 @@
config ARCH_SUPPORTS_CRASH_DUMP
def_bool y
+config ARCH_DEFAULT_CRASH_DUMP
+ def_bool y
+
config PHYSICAL_START
hex "Physical address where the kernel is loaded"
default "0xffffffff84000000"
diff --git a/arch/mips/boot/dts/brcm/bcm6358.dtsi b/arch/mips/boot/dts/brcm/bcm6358.dtsi
index 777c437..5e487f6 100644
--- a/arch/mips/boot/dts/brcm/bcm6358.dtsi
+++ b/arch/mips/boot/dts/brcm/bcm6358.dtsi
@@ -13,6 +13,7 @@ cpus {
#size-cells = <0>;
mips-hpt-frequency = <150000000>;
+ brcm,bmips-cbr-reg = <0xff400000>;
cpu@0 {
compatible = "brcm,bmips4350";
diff --git a/arch/mips/boot/dts/brcm/bcm6368.dtsi b/arch/mips/boot/dts/brcm/bcm6368.dtsi
index fc15e20..087f329 100644
--- a/arch/mips/boot/dts/brcm/bcm6368.dtsi
+++ b/arch/mips/boot/dts/brcm/bcm6368.dtsi
@@ -13,6 +13,7 @@ cpus {
#size-cells = <0>;
mips-hpt-frequency = <200000000>;
+ brcm,bmips-cbr-reg = <0xff400000>;
cpu@0 {
compatible = "brcm,bmips4350";
diff --git a/arch/mips/boot/dts/mobileye/eyeq6h-epm6.dts b/arch/mips/boot/dts/mobileye/eyeq6h-epm6.dts
index ebc0d36..59a3e95 100644
--- a/arch/mips/boot/dts/mobileye/eyeq6h-epm6.dts
+++ b/arch/mips/boot/dts/mobileye/eyeq6h-epm6.dts
@@ -8,7 +8,7 @@
#include "eyeq6h.dtsi"
/ {
- compatible = "mobileye,eyeq6-epm6", "mobileye,eyeq6";
+ compatible = "mobileye,eyeq6h-epm6", "mobileye,eyeq6h";
model = "Mobile EyeQ6H MP6 Evaluation board";
chosen {
diff --git a/arch/mips/boot/dts/realtek/cameo-rtl9302c-2x-rtl8224-2xge.dts b/arch/mips/boot/dts/realtek/cameo-rtl9302c-2x-rtl8224-2xge.dts
index 77d2566..6789bf3 100644
--- a/arch/mips/boot/dts/realtek/cameo-rtl9302c-2x-rtl8224-2xge.dts
+++ b/arch/mips/boot/dts/realtek/cameo-rtl9302c-2x-rtl8224-2xge.dts
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/dts-v1/;
-#include "rtl930x.dtsi"
+#include "rtl9302c.dtsi"
#include <dt-bindings/input/input.h>
#include <dt-bindings/gpio/gpio.h>
diff --git a/arch/mips/boot/dts/realtek/rtl9302c.dtsi b/arch/mips/boot/dts/realtek/rtl9302c.dtsi
new file mode 100644
index 0000000..8690433
--- /dev/null
+++ b/arch/mips/boot/dts/realtek/rtl9302c.dtsi
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later OR BSD-2-Clause
+
+#include "rtl930x.dtsi"
+
+&switch0 {
+ compatible = "realtek,rtl9302c-switch", "syscon", "simple-mfd";
+};
+
+&i2c0 {
+ compatible = "realtek,rtl9302c-i2c", "realtek,rtl9301-i2c";
+};
+
+&i2c1 {
+ compatible = "realtek,rtl9302c-i2c", "realtek,rtl9301-i2c";
+};
diff --git a/arch/mips/boot/dts/realtek/rtl930x.dtsi b/arch/mips/boot/dts/realtek/rtl930x.dtsi
index f271940f..6a6f3f3 100644
--- a/arch/mips/boot/dts/realtek/rtl930x.dtsi
+++ b/arch/mips/boot/dts/realtek/rtl930x.dtsi
@@ -29,6 +29,35 @@ lx_clk: clock-175mhz {
#clock-cells = <0>;
clock-frequency = <175000000>;
};
+
+ switch0: switch@1b000000 {
+ compatible = "realtek,rtl9301-switch", "syscon", "simple-mfd";
+ reg = <0x1b000000 0x10000>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ reboot@c {
+ compatible = "syscon-reboot";
+ reg = <0x0c 0x4>;
+ value = <0x01>;
+ };
+
+ i2c0: i2c@36c {
+ compatible = "realtek,rtl9301-i2c";
+ reg = <0x36c 0x14>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ status = "disabled";
+ };
+
+ i2c1: i2c@388 {
+ compatible = "realtek,rtl9301-i2c";
+ reg = <0x388 0x14>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ status = "disabled";
+ };
+ };
};
&soc {
diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig
index 78f4987..98844b4 100644
--- a/arch/mips/configs/loongson3_defconfig
+++ b/arch/mips/configs/loongson3_defconfig
@@ -5,6 +5,8 @@
CONFIG_AUDIT=y
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BPF_JIT=y
CONFIG_PREEMPT=y
CONFIG_BSD_PROCESS_ACCT=y
CONFIG_BSD_PROCESS_ACCT_V3=y
@@ -22,18 +24,16 @@
CONFIG_NAMESPACES=y
CONFIG_USER_NS=y
CONFIG_SCHED_AUTOGROUP=y
-CONFIG_SYSFS_DEPRECATED=y
CONFIG_RELAY=y
CONFIG_BLK_DEV_INITRD=y
-CONFIG_BPF_SYSCALL=y
CONFIG_EXPERT=y
CONFIG_PERF_EVENTS=y
+CONFIG_KEXEC=y
CONFIG_MACH_LOONGSON64=y
CONFIG_CPU_HAS_MSA=y
CONFIG_NUMA=y
CONFIG_NR_CPUS=16
CONFIG_HZ_256=y
-CONFIG_KEXEC=y
CONFIG_MIPS32_O32=y
CONFIG_MIPS32_N32=y
CONFIG_VIRTUALIZATION=y
@@ -47,15 +47,12 @@
CONFIG_PARTITION_ADVANCED=y
CONFIG_MQ_IOSCHED_DEADLINE=m
CONFIG_IOSCHED_BFQ=y
-CONFIG_BFQ_GROUP_IOSCHED=y
CONFIG_BINFMT_MISC=m
CONFIG_KSM=y
CONFIG_NET=y
CONFIG_PACKET=y
-CONFIG_UNIX=y
CONFIG_XFRM_USER=y
CONFIG_NET_KEY=y
-CONFIG_INET=y
CONFIG_IP_MULTICAST=y
CONFIG_IP_ADVANCED_ROUTER=y
CONFIG_IP_MULTIPLE_TABLES=y
@@ -106,7 +103,6 @@
CONFIG_IP_NF_TARGET_TTL=m
CONFIG_IP_NF_RAW=m
CONFIG_IP_NF_SECURITY=m
-CONFIG_IP_NF_ARPTABLES=m
CONFIG_IP_NF_ARPFILTER=m
CONFIG_IP_NF_ARP_MANGLE=m
CONFIG_NF_TABLES_IPV6=y
@@ -128,7 +124,6 @@
CONFIG_BRIDGE=m
CONFIG_VSOCKETS=m
CONFIG_VIRTIO_VSOCKETS=m
-CONFIG_BPF_JIT=y
CONFIG_CFG80211=m
CONFIG_CFG80211_WEXT=y
CONFIG_MAC80211=m
@@ -146,6 +141,7 @@
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=8192
CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_NVME=m
CONFIG_RAID_ATTRS=m
CONFIG_BLK_DEV_SD=y
CONFIG_BLK_DEV_SR=y
@@ -167,12 +163,10 @@
CONFIG_PATA_ATIIXP=y
CONFIG_MD=y
CONFIG_BLK_DEV_MD=m
-CONFIG_MD_LINEAR=m
CONFIG_MD_RAID0=m
CONFIG_MD_RAID1=m
CONFIG_MD_RAID10=m
CONFIG_MD_RAID456=m
-CONFIG_MD_MULTIPATH=m
CONFIG_BLK_DEV_DM=m
CONFIG_DM_CRYPT=m
CONFIG_DM_SNAPSHOT=m
@@ -196,7 +190,6 @@
# CONFIG_NET_VENDOR_ARC is not set
# CONFIG_NET_VENDOR_ATHEROS is not set
# CONFIG_NET_VENDOR_BROADCOM is not set
-# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_CHELSIO is not set
# CONFIG_NET_VENDOR_CIRRUS is not set
# CONFIG_NET_VENDOR_CISCO is not set
@@ -216,6 +209,7 @@
# CONFIG_NET_VENDOR_NVIDIA is not set
# CONFIG_NET_VENDOR_OKI is not set
# CONFIG_NET_VENDOR_QLOGIC is not set
+# CONFIG_NET_VENDOR_BROCADE is not set
# CONFIG_NET_VENDOR_RDC is not set
CONFIG_8139CP=m
CONFIG_8139TOO=m
@@ -242,7 +236,6 @@
CONFIG_PPP_ASYNC=m
CONFIG_PPP_SYNC_TTY=m
CONFIG_ATH9K=m
-CONFIG_HOSTAP=m
CONFIG_INPUT_SPARSEKMAP=y
CONFIG_INPUT_MOUSEDEV=y
CONFIG_INPUT_MOUSEDEV_PSAUX=y
@@ -276,23 +269,20 @@
CONFIG_MEDIA_USB_SUPPORT=y
CONFIG_USB_VIDEO_CLASS=m
CONFIG_DRM=y
+CONFIG_DRM_RADEON=m
CONFIG_DRM_AMDGPU=m
CONFIG_DRM_AMDGPU_SI=y
CONFIG_DRM_AMDGPU_CIK=y
CONFIG_DRM_AMDGPU_USERPTR=y
CONFIG_DRM_AMD_ACP=y
-CONFIG_DRM_AMD_DC=y
CONFIG_DRM_AMD_DC_SI=y
CONFIG_DRM_AST=m
-CONFIG_DRM_RADEON=m
CONFIG_DRM_QXL=y
CONFIG_DRM_VIRTIO_GPU=y
CONFIG_FB=y
CONFIG_FB_RADEON=y
CONFIG_LCD_CLASS_DEVICE=y
CONFIG_LCD_PLATFORM=m
-# CONFIG_VGA_CONSOLE is not set
-CONFIG_FRAMEBUFFER_CONSOLE=y
CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y
CONFIG_LOGO=y
CONFIG_SOUND=y
@@ -350,13 +340,11 @@
CONFIG_XFS_FS=y
CONFIG_XFS_POSIX_ACL=y
CONFIG_QUOTA=y
-# CONFIG_PRINT_QUOTA_WARNING is not set
CONFIG_QFMT_V1=m
CONFIG_QFMT_V2=m
CONFIG_AUTOFS_FS=y
CONFIG_FUSE_FS=m
CONFIG_VIRTIO_FS=m
-CONFIG_NETFS_SUPPORT=m
CONFIG_FSCACHE=y
CONFIG_ISO9660_FS=m
CONFIG_JOLIET=y
@@ -391,23 +379,21 @@
CONFIG_SECURITY_PATH=y
CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
-CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_DEFAULT_SECURITY_DAC=y
-CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPTO_MD5=y
-CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_BLOWFISH=m
CONFIG_CRYPTO_CAST5=m
CONFIG_CRYPTO_CAST6=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_DEFLATE=m
CONFIG_PRINTK_TIME=y
CONFIG_STRIP_ASM_SYMS=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_FS=y
# CONFIG_SCHED_DEBUG is not set
-# CONFIG_DEBUG_PREEMPT is not set
CONFIG_FUNCTION_TRACER=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_CMDLINE_BOOL=y
diff --git a/arch/mips/crypto/crc32-mips.c b/arch/mips/crypto/crc32-mips.c
index a7a1d43..90eacf0 100644
--- a/arch/mips/crypto/crc32-mips.c
+++ b/arch/mips/crypto/crc32-mips.c
@@ -123,20 +123,20 @@ static u32 crc32c_mips_le_hw(u32 crc_, const u8 *p, unsigned int len)
for (; len >= sizeof(u64); p += sizeof(u64), len -= sizeof(u64)) {
u64 value = get_unaligned_le64(p);
- CRC32(crc, value, d);
+ CRC32C(crc, value, d);
}
if (len & sizeof(u32)) {
u32 value = get_unaligned_le32(p);
- CRC32(crc, value, w);
+ CRC32C(crc, value, w);
p += sizeof(u32);
}
} else {
for (; len >= sizeof(u32); len -= sizeof(u32)) {
u32 value = get_unaligned_le32(p);
- CRC32(crc, value, w);
+ CRC32C(crc, value, w);
p += sizeof(u32);
}
}
diff --git a/arch/mips/include/asm/mips-cm.h b/arch/mips/include/asm/mips-cm.h
index 1e78227..23ce951 100644
--- a/arch/mips/include/asm/mips-cm.h
+++ b/arch/mips/include/asm/mips-cm.h
@@ -326,7 +326,9 @@ GCR_CX_ACCESSOR_RW(32, 0x018, other)
/* GCR_Cx_RESET_BASE - Configure where powered up cores will fetch from */
GCR_CX_ACCESSOR_RW(32, 0x020, reset_base)
+GCR_CX_ACCESSOR_RW(64, 0x020, reset64_base)
#define CM_GCR_Cx_RESET_BASE_BEVEXCBASE GENMASK(31, 12)
+#define CM_GCR_Cx_RESET64_BASE_BEVEXCBASE GENMASK_ULL(47, 12)
#define CM_GCR_Cx_RESET_BASE_MODE BIT(1)
/* GCR_Cx_ID - Identify the current core */
diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h
index a4374b4..d6ccd53 100644
--- a/arch/mips/include/asm/switch_to.h
+++ b/arch/mips/include/asm/switch_to.h
@@ -97,7 +97,7 @@ do { \
} \
} while (0)
#else
-# define __sanitize_fcr31(next)
+# define __sanitize_fcr31(next) do { (void) (next); } while (0)
#endif
/*
diff --git a/arch/mips/kernel/proc.c b/arch/mips/kernel/proc.c
index 8eba5a1..8f0a000 100644
--- a/arch/mips/kernel/proc.c
+++ b/arch/mips/kernel/proc.c
@@ -66,24 +66,23 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "BogoMIPS\t\t: %u.%02u\n",
cpu_data[n].udelay_val / (500000/HZ),
(cpu_data[n].udelay_val / (5000/HZ)) % 100);
- seq_printf(m, "wait instruction\t: %s\n", cpu_wait ? "yes" : "no");
+ seq_printf(m, "wait instruction\t: %s\n", str_yes_no(cpu_wait));
seq_printf(m, "microsecond timers\t: %s\n",
- cpu_has_counter ? "yes" : "no");
+ str_yes_no(cpu_has_counter));
seq_printf(m, "tlb_entries\t\t: %d\n", cpu_data[n].tlbsize);
seq_printf(m, "extra interrupt vector\t: %s\n",
- cpu_has_divec ? "yes" : "no");
- seq_printf(m, "hardware watchpoint\t: %s",
- cpu_has_watch ? "yes, " : "no\n");
+ str_yes_no(cpu_has_divec));
+ seq_printf(m, "hardware watchpoint\t: %s", str_yes_no(cpu_has_watch));
if (cpu_has_watch) {
- seq_printf(m, "count: %d, address/irw mask: [",
+ seq_printf(m, ", count: %d, address/irw mask: [",
cpu_data[n].watch_reg_count);
for (i = 0; i < cpu_data[n].watch_reg_count; i++)
seq_printf(m, "%s0x%04x", i ? ", " : "",
cpu_data[n].watch_reg_masks[i]);
- seq_puts(m, "]\n");
+ seq_puts(m, "]");
}
- seq_puts(m, "isa\t\t\t:");
+ seq_puts(m, "\nisa\t\t\t:");
if (cpu_has_mips_1)
seq_puts(m, " mips1");
if (cpu_has_mips_2)
@@ -155,7 +154,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (cpu_has_mmips) {
seq_printf(m, "micromips kernel\t: %s\n",
- (read_c0_config3() & MIPS_CONF3_ISA_OE) ? "yes" : "no");
+ str_yes_no(read_c0_config3() & MIPS_CONF3_ISA_OE));
}
seq_puts(m, "Options implemented\t:");
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index 395622c..82c8f9b 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -37,7 +37,7 @@ enum label_id {
UASM_L_LA(_not_nmi)
static DECLARE_BITMAP(core_power, NR_CPUS);
-static uint32_t core_entry_reg;
+static u64 core_entry_reg;
static phys_addr_t cps_vec_pa;
struct core_boot_config *mips_cps_core_bootcfg;
@@ -94,6 +94,20 @@ static void __init *mips_cps_build_core_entry(void *addr)
return p;
}
+static bool __init check_64bit_reset(void)
+{
+ bool cx_64bit_reset = false;
+
+ mips_cm_lock_other(0, 0, 0, CM_GCR_Cx_OTHER_BLOCK_LOCAL);
+ write_gcr_co_reset64_base(CM_GCR_Cx_RESET64_BASE_BEVEXCBASE);
+ if ((read_gcr_co_reset64_base() & CM_GCR_Cx_RESET64_BASE_BEVEXCBASE) ==
+ CM_GCR_Cx_RESET64_BASE_BEVEXCBASE)
+ cx_64bit_reset = true;
+ mips_cm_unlock_other();
+
+ return cx_64bit_reset;
+}
+
static int __init allocate_cps_vecs(void)
{
/* Try to allocate in KSEG1 first */
@@ -105,11 +119,23 @@ static int __init allocate_cps_vecs(void)
CM_GCR_Cx_RESET_BASE_BEVEXCBASE;
if (!cps_vec_pa && mips_cm_is64) {
- cps_vec_pa = memblock_phys_alloc_range(BEV_VEC_SIZE, BEV_VEC_ALIGN,
- 0x0, SZ_4G - 1);
- if (cps_vec_pa)
- core_entry_reg = (cps_vec_pa & CM_GCR_Cx_RESET_BASE_BEVEXCBASE) |
+ phys_addr_t end;
+
+ if (check_64bit_reset()) {
+ pr_info("VP Local Reset Exception Base support 47 bits address\n");
+ end = MEMBLOCK_ALLOC_ANYWHERE;
+ } else {
+ end = SZ_4G - 1;
+ }
+ cps_vec_pa = memblock_phys_alloc_range(BEV_VEC_SIZE, BEV_VEC_ALIGN, 0, end);
+ if (cps_vec_pa) {
+ if (check_64bit_reset())
+ core_entry_reg = (cps_vec_pa & CM_GCR_Cx_RESET64_BASE_BEVEXCBASE) |
CM_GCR_Cx_RESET_BASE_MODE;
+ else
+ core_entry_reg = (cps_vec_pa & CM_GCR_Cx_RESET_BASE_BEVEXCBASE) |
+ CM_GCR_Cx_RESET_BASE_MODE;
+ }
}
if (!cps_vec_pa)
@@ -308,7 +334,10 @@ static void boot_core(unsigned int core, unsigned int vpe_id)
mips_cm_lock_other(0, core, 0, CM_GCR_Cx_OTHER_BLOCK_LOCAL);
/* Set its reset vector */
- write_gcr_co_reset_base(core_entry_reg);
+ if (mips_cm_is64)
+ write_gcr_co_reset64_base(core_entry_reg);
+ else
+ write_gcr_co_reset_base(core_entry_reg);
/* Ensure its coherency is disabled */
write_gcr_co_coherence(0);
@@ -411,7 +440,10 @@ static int cps_boot_secondary(int cpu, struct task_struct *idle)
if (cpu_has_vp) {
mips_cm_lock_other(0, core, vpe_id, CM_GCR_Cx_OTHER_BLOCK_LOCAL);
- write_gcr_co_reset_base(core_entry_reg);
+ if (mips_cm_is64)
+ write_gcr_co_reset64_base(core_entry_reg);
+ else
+ write_gcr_co_reset_base(core_entry_reg);
mips_cm_unlock_other();
}
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 953f5b7..0b9b7e2 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -401,3 +401,7 @@
460 n32 lsm_set_self_attr sys_lsm_set_self_attr
461 n32 lsm_list_modules sys_lsm_list_modules
462 n32 mseal sys_mseal
+463 n32 setxattrat sys_setxattrat
+464 n32 getxattrat sys_getxattrat
+465 n32 listxattrat sys_listxattrat
+466 n32 removexattrat sys_removexattrat
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 1464c6b..c844cd5 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -377,3 +377,7 @@
460 n64 lsm_set_self_attr sys_lsm_set_self_attr
461 n64 lsm_list_modules sys_lsm_list_modules
462 n64 mseal sys_mseal
+463 n64 setxattrat sys_setxattrat
+464 n64 getxattrat sys_getxattrat
+465 n64 listxattrat sys_listxattrat
+466 n64 removexattrat sys_removexattrat
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 2439a24..349b8aa 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -450,3 +450,7 @@
460 o32 lsm_set_self_attr sys_lsm_set_self_attr
461 o32 lsm_list_modules sys_lsm_list_modules
462 o32 mseal sys_mseal
+463 o32 setxattrat sys_setxattrat
+464 o32 getxattrat sys_getxattrat
+465 o32 listxattrat sys_listxattrat
+466 o32 removexattrat sys_removexattrat
diff --git a/arch/mips/sgi-ip22/ip22-gio.c b/arch/mips/sgi-ip22/ip22-gio.c
index d20eec7..5893ea4 100644
--- a/arch/mips/sgi-ip22/ip22-gio.c
+++ b/arch/mips/sgi-ip22/ip22-gio.c
@@ -165,9 +165,8 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *a,
char *buf)
{
struct gio_device *gio_dev = to_gio_device(dev);
- int len = snprintf(buf, PAGE_SIZE, "gio:%x\n", gio_dev->id.id);
- return (len >= PAGE_SIZE) ? (PAGE_SIZE - 1) : len;
+ return sysfs_emit(buf, "gio:%x\n", gio_dev->id.id);
}
static DEVICE_ATTR_RO(modalias);
@@ -177,7 +176,7 @@ static ssize_t name_show(struct device *dev,
struct gio_device *giodev;
giodev = to_gio_device(dev);
- return sprintf(buf, "%s", giodev->name);
+ return sysfs_emit(buf, "%s\n", giodev->name);
}
static DEVICE_ATTR_RO(name);
@@ -187,7 +186,7 @@ static ssize_t id_show(struct device *dev,
struct gio_device *giodev;
giodev = to_gio_device(dev);
- return sprintf(buf, "%x", giodev->id.id);
+ return sysfs_emit(buf, "%x\n", giodev->id.id);
}
static DEVICE_ATTR_RO(id);
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 66dc406..d9fc94c 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -461,3 +461,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8094a01..1a2ff02 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -684,6 +684,10 @@
config ARCH_SUPPORTS_CRASH_DUMP
def_bool PPC64 || PPC_BOOK3S_32 || PPC_85xx || (44x && !SMP)
+config ARCH_DEFAULT_CRASH_DUMP
+ bool
+ default y if !PPC_BOOK3S_32
+
config ARCH_SELECTS_CRASH_DUMP
def_bool y
depends on CRASH_DUMP
diff --git a/arch/powerpc/crypto/Kconfig b/arch/powerpc/crypto/Kconfig
index 46a4c85..951a437 100644
--- a/arch/powerpc/crypto/Kconfig
+++ b/arch/powerpc/crypto/Kconfig
@@ -107,12 +107,12 @@
config CRYPTO_AES_GCM_P10
tristate "Stitched AES/GCM acceleration support on P10 or later CPU (PPC)"
- depends on BROKEN
depends on PPC64 && CPU_LITTLE_ENDIAN && VSX
select CRYPTO_LIB_AES
select CRYPTO_ALGAPI
select CRYPTO_AEAD
select CRYPTO_SKCIPHER
+ select CRYPTO_SIMD
help
AEAD cipher: AES cipher algorithms (FIPS-197)
GCM (Galois/Counter Mode) authenticated encryption mode (NIST SP800-38D)
diff --git a/arch/powerpc/crypto/aes-gcm-p10-glue.c b/arch/powerpc/crypto/aes-gcm-p10-glue.c
index f66ad56e..f37b3d1 100644
--- a/arch/powerpc/crypto/aes-gcm-p10-glue.c
+++ b/arch/powerpc/crypto/aes-gcm-p10-glue.c
@@ -8,6 +8,7 @@
#include <linux/unaligned.h>
#include <asm/simd.h>
#include <asm/switch_to.h>
+#include <crypto/gcm.h>
#include <crypto/aes.h>
#include <crypto/algapi.h>
#include <crypto/b128ops.h>
@@ -24,6 +25,7 @@
#define PPC_ALIGN 16
#define GCM_IV_SIZE 12
+#define RFC4106_NONCE_SIZE 4
MODULE_DESCRIPTION("PPC64le AES-GCM with Stitched implementation");
MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com");
@@ -31,7 +33,7 @@ MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("aes");
asmlinkage int aes_p10_set_encrypt_key(const u8 *userKey, const int bits,
- void *key);
+ void *key);
asmlinkage void aes_p10_encrypt(const u8 *in, u8 *out, const void *key);
asmlinkage void aes_p10_gcm_encrypt(u8 *in, u8 *out, size_t len,
void *rkey, u8 *iv, void *Xi);
@@ -39,7 +41,8 @@ asmlinkage void aes_p10_gcm_decrypt(u8 *in, u8 *out, size_t len,
void *rkey, u8 *iv, void *Xi);
asmlinkage void gcm_init_htable(unsigned char htable[], unsigned char Xi[]);
asmlinkage void gcm_ghash_p10(unsigned char *Xi, unsigned char *Htable,
- unsigned char *aad, unsigned int alen);
+ unsigned char *aad, unsigned int alen);
+asmlinkage void gcm_update(u8 *iv, void *Xi);
struct aes_key {
u8 key[AES_MAX_KEYLENGTH];
@@ -52,6 +55,7 @@ struct gcm_ctx {
u8 aad_hash[16];
u64 aadLen;
u64 Plen; /* offset 56 - used in aes_p10_gcm_{en/de}crypt */
+ u8 pblock[16];
};
struct Hash_ctx {
u8 H[16]; /* subkey */
@@ -60,17 +64,20 @@ struct Hash_ctx {
struct p10_aes_gcm_ctx {
struct aes_key enc_key;
+ u8 nonce[RFC4106_NONCE_SIZE];
};
static void vsx_begin(void)
{
preempt_disable();
+ pagefault_disable();
enable_kernel_vsx();
}
static void vsx_end(void)
{
disable_kernel_vsx();
+ pagefault_enable();
preempt_enable();
}
@@ -185,7 +192,7 @@ static int set_authsize(struct crypto_aead *tfm, unsigned int authsize)
}
static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
- unsigned int keylen)
+ unsigned int keylen)
{
struct crypto_tfm *tfm = crypto_aead_tfm(aead);
struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -198,7 +205,8 @@ static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
return ret ? -EINVAL : 0;
}
-static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
+static int p10_aes_gcm_crypt(struct aead_request *req, u8 *riv,
+ int assoclen, int enc)
{
struct crypto_tfm *tfm = req->base.tfm;
struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -210,7 +218,6 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
struct skcipher_walk walk;
u8 *assocmem = NULL;
u8 *assoc;
- unsigned int assoclen = req->assoclen;
unsigned int cryptlen = req->cryptlen;
unsigned char ivbuf[AES_BLOCK_SIZE+PPC_ALIGN];
unsigned char *iv = PTR_ALIGN((void *)ivbuf, PPC_ALIGN);
@@ -218,11 +225,12 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
unsigned long auth_tag_len = crypto_aead_authsize(__crypto_aead_cast(tfm));
u8 otag[16];
int total_processed = 0;
+ int nbytes;
memset(databuf, 0, sizeof(databuf));
memset(hashbuf, 0, sizeof(hashbuf));
memset(ivbuf, 0, sizeof(ivbuf));
- memcpy(iv, req->iv, GCM_IV_SIZE);
+ memcpy(iv, riv, GCM_IV_SIZE);
/* Linearize assoc, if not already linear */
if (req->src->length >= assoclen && req->src->length) {
@@ -257,19 +265,25 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
if (ret)
return ret;
- while (walk.nbytes > 0 && ret == 0) {
+ while ((nbytes = walk.nbytes) > 0 && ret == 0) {
+ u8 *src = walk.src.virt.addr;
+ u8 *dst = walk.dst.virt.addr;
+ u8 buf[AES_BLOCK_SIZE];
+
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ src = dst = memcpy(buf, src, nbytes);
vsx_begin();
if (enc)
- aes_p10_gcm_encrypt(walk.src.virt.addr,
- walk.dst.virt.addr,
- walk.nbytes,
+ aes_p10_gcm_encrypt(src, dst, nbytes,
&ctx->enc_key, gctx->iv, hash->Htable);
else
- aes_p10_gcm_decrypt(walk.src.virt.addr,
- walk.dst.virt.addr,
- walk.nbytes,
+ aes_p10_gcm_decrypt(src, dst, nbytes,
&ctx->enc_key, gctx->iv, hash->Htable);
+
+ if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
+ memcpy(walk.dst.virt.addr, buf, nbytes);
+
vsx_end();
total_processed += walk.nbytes;
@@ -281,6 +295,7 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
/* Finalize hash */
vsx_begin();
+ gcm_update(gctx->iv, hash->Htable);
finish_tag(gctx, hash, total_processed);
vsx_end();
@@ -302,17 +317,63 @@ static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
return 0;
}
+static int rfc4106_setkey(struct crypto_aead *tfm, const u8 *inkey,
+ unsigned int keylen)
+{
+ struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(tfm);
+ int err;
+
+ keylen -= RFC4106_NONCE_SIZE;
+ err = p10_aes_gcm_setkey(tfm, inkey, keylen);
+ if (err)
+ return err;
+
+ memcpy(ctx->nonce, inkey + keylen, RFC4106_NONCE_SIZE);
+ return 0;
+}
+
+static int rfc4106_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+ return crypto_rfc4106_check_authsize(authsize);
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(aead);
+ u8 iv[AES_BLOCK_SIZE];
+
+ memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE);
+ memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE);
+
+ return crypto_ipsec_check_assoclen(req->assoclen) ?:
+ p10_aes_gcm_crypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE, 1);
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+ struct crypto_aead *aead = crypto_aead_reqtfm(req);
+ struct p10_aes_gcm_ctx *ctx = crypto_aead_ctx(aead);
+ u8 iv[AES_BLOCK_SIZE];
+
+ memcpy(iv, ctx->nonce, RFC4106_NONCE_SIZE);
+ memcpy(iv + RFC4106_NONCE_SIZE, req->iv, GCM_RFC4106_IV_SIZE);
+
+ return crypto_ipsec_check_assoclen(req->assoclen) ?:
+ p10_aes_gcm_crypt(req, iv, req->assoclen - GCM_RFC4106_IV_SIZE, 0);
+}
+
static int p10_aes_gcm_encrypt(struct aead_request *req)
{
- return p10_aes_gcm_crypt(req, 1);
+ return p10_aes_gcm_crypt(req, req->iv, req->assoclen, 1);
}
static int p10_aes_gcm_decrypt(struct aead_request *req)
{
- return p10_aes_gcm_crypt(req, 0);
+ return p10_aes_gcm_crypt(req, req->iv, req->assoclen, 0);
}
-static struct aead_alg gcm_aes_alg = {
+static struct aead_alg gcm_aes_algs[] = {{
.ivsize = GCM_IV_SIZE,
.maxauthsize = 16,
@@ -321,23 +382,57 @@ static struct aead_alg gcm_aes_alg = {
.encrypt = p10_aes_gcm_encrypt,
.decrypt = p10_aes_gcm_decrypt,
- .base.cra_name = "gcm(aes)",
- .base.cra_driver_name = "aes_gcm_p10",
+ .base.cra_name = "__gcm(aes)",
+ .base.cra_driver_name = "__aes_gcm_p10",
.base.cra_priority = 2100,
.base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx),
+ .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx)+
+ 4 * sizeof(u64[2]),
.base.cra_module = THIS_MODULE,
-};
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+}, {
+ .ivsize = GCM_RFC4106_IV_SIZE,
+ .maxauthsize = 16,
+ .setkey = rfc4106_setkey,
+ .setauthsize = rfc4106_setauthsize,
+ .encrypt = rfc4106_encrypt,
+ .decrypt = rfc4106_decrypt,
+
+ .base.cra_name = "__rfc4106(gcm(aes))",
+ .base.cra_driver_name = "__rfc4106_aes_gcm_p10",
+ .base.cra_priority = 2100,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct p10_aes_gcm_ctx) +
+ 4 * sizeof(u64[2]),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_flags = CRYPTO_ALG_INTERNAL,
+}};
+
+static struct simd_aead_alg *p10_simd_aeads[ARRAY_SIZE(gcm_aes_algs)];
static int __init p10_init(void)
{
- return crypto_register_aead(&gcm_aes_alg);
+ int ret;
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_31))
+ return 0;
+
+ ret = simd_register_aeads_compat(gcm_aes_algs,
+ ARRAY_SIZE(gcm_aes_algs),
+ p10_simd_aeads);
+ if (ret) {
+ simd_unregister_aeads(gcm_aes_algs, ARRAY_SIZE(gcm_aes_algs),
+ p10_simd_aeads);
+ return ret;
+ }
+ return 0;
}
static void __exit p10_exit(void)
{
- crypto_unregister_aead(&gcm_aes_alg);
+ simd_unregister_aeads(gcm_aes_algs, ARRAY_SIZE(gcm_aes_algs),
+ p10_simd_aeads);
}
-module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init);
+module_init(p10_init);
module_exit(p10_exit);
diff --git a/arch/powerpc/crypto/aes-gcm-p10.S b/arch/powerpc/crypto/aes-gcm-p10.S
index a51f4b2..89f50ee 100644
--- a/arch/powerpc/crypto/aes-gcm-p10.S
+++ b/arch/powerpc/crypto/aes-gcm-p10.S
@@ -1,42 +1,42 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
- #
- # Accelerated AES-GCM stitched implementation for ppc64le.
- #
- # Copyright 2022- IBM Inc. All rights reserved
- #
- #===================================================================================
- # Written by Danny Tsen <dtsen@linux.ibm.com>
- #
- # GHASH is based on the Karatsuba multiplication method.
- #
- # Xi xor X1
- #
- # X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
- # (X1.h * H4.h + xX.l * H4.l + X1 * H4) +
- # (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
- # (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
- # (X4.h * H.h + X4.l * H.l + X4 * H)
- #
- # Xi = v0
- # H Poly = v2
- # Hash keys = v3 - v14
- # ( H.l, H, H.h)
- # ( H^2.l, H^2, H^2.h)
- # ( H^3.l, H^3, H^3.h)
- # ( H^4.l, H^4, H^4.h)
- #
- # v30 is IV
- # v31 - counter 1
- #
- # AES used,
- # vs0 - vs14 for round keys
- # v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
- #
- # This implementation uses stitched AES-GCM approach to improve overall performance.
- # AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
- #
- # ===================================================================================
- #
+#
+# Accelerated AES-GCM stitched implementation for ppc64le.
+#
+# Copyright 2024- IBM Inc.
+#
+#===================================================================================
+# Written by Danny Tsen <dtsen@us.ibm.com>
+#
+# GHASH is based on the Karatsuba multiplication method.
+#
+# Xi xor X1
+#
+# X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
+# (X1.h * H4.h + xX.l * H4.l + X1 * H4) +
+# (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
+# (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
+# (X4.h * H.h + X4.l * H.l + X4 * H)
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+# ( H.l, H, H.h)
+# ( H^2.l, H^2, H^2.h)
+# ( H^3.l, H^3, H^3.h)
+# ( H^4.l, H^4, H^4.h)
+#
+# v30 is IV
+# v31 - counter 1
+#
+# AES used,
+# vs0 - round key 0
+# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
+#
+# This implementation uses stitched AES-GCM approach to improve overall performance.
+# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
+#
+# ===================================================================================
+#
#include <asm/ppc_asm.h>
#include <linux/linkage.h>
@@ -44,483 +44,224 @@
.machine "any"
.text
- # 4x loops
- # v15 - v18 - input states
- # vs1 - vs9 - round keys
- #
-.macro Loop_aes_middle4x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- vcipher 15, 15, 19
- vcipher 16, 16, 19
- vcipher 17, 17, 19
- vcipher 18, 18, 19
-
- vcipher 15, 15, 20
- vcipher 16, 16, 20
- vcipher 17, 17, 20
- vcipher 18, 18, 20
-
- vcipher 15, 15, 21
- vcipher 16, 16, 21
- vcipher 17, 17, 21
- vcipher 18, 18, 21
-
- vcipher 15, 15, 22
- vcipher 16, 16, 22
- vcipher 17, 17, 22
- vcipher 18, 18, 22
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- vcipher 15, 15, 19
- vcipher 16, 16, 19
- vcipher 17, 17, 19
- vcipher 18, 18, 19
-
- vcipher 15, 15, 20
- vcipher 16, 16, 20
- vcipher 17, 17, 20
- vcipher 18, 18, 20
-
- vcipher 15, 15, 21
- vcipher 16, 16, 21
- vcipher 17, 17, 21
- vcipher 18, 18, 21
-
- vcipher 15, 15, 22
- vcipher 16, 16, 22
- vcipher 17, 17, 22
- vcipher 18, 18, 22
-
- xxlor 23+32, 9, 9
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
+.macro SAVE_GPR GPR OFFSET FRAME
+ std \GPR,\OFFSET(\FRAME)
.endm
- # 8x loops
- # v15 - v22 - input states
- # vs1 - vs9 - round keys
- #
-.macro Loop_aes_middle8x
- xxlor 23+32, 1, 1
- xxlor 24+32, 2, 2
- xxlor 25+32, 3, 3
- xxlor 26+32, 4, 4
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- vcipher 15, 15, 25
- vcipher 16, 16, 25
- vcipher 17, 17, 25
- vcipher 18, 18, 25
- vcipher 19, 19, 25
- vcipher 20, 20, 25
- vcipher 21, 21, 25
- vcipher 22, 22, 25
-
- vcipher 15, 15, 26
- vcipher 16, 16, 26
- vcipher 17, 17, 26
- vcipher 18, 18, 26
- vcipher 19, 19, 26
- vcipher 20, 20, 26
- vcipher 21, 21, 26
- vcipher 22, 22, 26
-
- xxlor 23+32, 5, 5
- xxlor 24+32, 6, 6
- xxlor 25+32, 7, 7
- xxlor 26+32, 8, 8
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- vcipher 15, 15, 25
- vcipher 16, 16, 25
- vcipher 17, 17, 25
- vcipher 18, 18, 25
- vcipher 19, 19, 25
- vcipher 20, 20, 25
- vcipher 21, 21, 25
- vcipher 22, 22, 25
-
- vcipher 15, 15, 26
- vcipher 16, 16, 26
- vcipher 17, 17, 26
- vcipher 18, 18, 26
- vcipher 19, 19, 26
- vcipher 20, 20, 26
- vcipher 21, 21, 26
- vcipher 22, 22, 26
-
- xxlor 23+32, 9, 9
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
+.macro SAVE_VRS VRS OFFSET FRAME
+ stxv \VRS+32, \OFFSET(\FRAME)
.endm
-.macro Loop_aes_middle_1x
- xxlor 19+32, 1, 1
- xxlor 20+32, 2, 2
- xxlor 21+32, 3, 3
- xxlor 22+32, 4, 4
-
- vcipher 15, 15, 19
- vcipher 15, 15, 20
- vcipher 15, 15, 21
- vcipher 15, 15, 22
-
- xxlor 19+32, 5, 5
- xxlor 20+32, 6, 6
- xxlor 21+32, 7, 7
- xxlor 22+32, 8, 8
-
- vcipher 15, 15, 19
- vcipher 15, 15, 20
- vcipher 15, 15, 21
- vcipher 15, 15, 22
-
- xxlor 19+32, 9, 9
- vcipher 15, 15, 19
+.macro RESTORE_GPR GPR OFFSET FRAME
+ ld \GPR,\OFFSET(\FRAME)
.endm
- #
- # Compute 4x hash values based on Karatsuba method.
- #
-.macro ppc_aes_gcm_ghash
- vxor 15, 15, 0
-
- vpmsumd 23, 12, 15 # H4.L * X.L
- vpmsumd 24, 9, 16
- vpmsumd 25, 6, 17
- vpmsumd 26, 3, 18
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26 # L
-
- vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
- vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
- vpmsumd 26, 7, 17
- vpmsumd 27, 4, 18
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27 # M
-
- # sum hash and reduction with H Poly
- vpmsumd 28, 23, 2 # reduction
-
- vxor 29, 29, 29
- vsldoi 26, 24, 29, 8 # mL
- vsldoi 29, 29, 24, 8 # mH
- vxor 23, 23, 26 # mL + L
-
- vsldoi 23, 23, 23, 8 # swap
- vxor 23, 23, 28
-
- vpmsumd 24, 14, 15 # H4.H * X.H
- vpmsumd 25, 11, 16
- vpmsumd 26, 8, 17
- vpmsumd 27, 5, 18
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27
-
- vxor 24, 24, 29
-
- # sum hash and reduction with H Poly
- vsldoi 27, 23, 23, 8 # swap
- vpmsumd 23, 23, 2
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32 # update hash
-
-.endm
-
- #
- # Combine two 4x ghash
- # v15 - v22 - input blocks
- #
-.macro ppc_aes_gcm_ghash2_4x
- # first 4x hash
- vxor 15, 15, 0 # Xi + X
-
- vpmsumd 23, 12, 15 # H4.L * X.L
- vpmsumd 24, 9, 16
- vpmsumd 25, 6, 17
- vpmsumd 26, 3, 18
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26 # L
-
- vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
- vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
- vpmsumd 26, 7, 17
- vpmsumd 27, 4, 18
-
- vxor 24, 24, 25
- vxor 24, 24, 26
-
- # sum hash and reduction with H Poly
- vpmsumd 28, 23, 2 # reduction
-
- vxor 29, 29, 29
-
- vxor 24, 24, 27 # M
- vsldoi 26, 24, 29, 8 # mL
- vsldoi 29, 29, 24, 8 # mH
- vxor 23, 23, 26 # mL + L
-
- vsldoi 23, 23, 23, 8 # swap
- vxor 23, 23, 28
-
- vpmsumd 24, 14, 15 # H4.H * X.H
- vpmsumd 25, 11, 16
- vpmsumd 26, 8, 17
- vpmsumd 27, 5, 18
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27 # H
-
- vxor 24, 24, 29 # H + mH
-
- # sum hash and reduction with H Poly
- vsldoi 27, 23, 23, 8 # swap
- vpmsumd 23, 23, 2
- vxor 27, 27, 24
- vxor 27, 23, 27 # 1st Xi
-
- # 2nd 4x hash
- vpmsumd 24, 9, 20
- vpmsumd 25, 6, 21
- vpmsumd 26, 3, 22
- vxor 19, 19, 27 # Xi + X
- vpmsumd 23, 12, 19 # H4.L * X.L
-
- vxor 23, 23, 24
- vxor 23, 23, 25
- vxor 23, 23, 26 # L
-
- vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
- vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
- vpmsumd 26, 7, 21
- vpmsumd 27, 4, 22
-
- vxor 24, 24, 25
- vxor 24, 24, 26
-
- # sum hash and reduction with H Poly
- vpmsumd 28, 23, 2 # reduction
-
- vxor 29, 29, 29
-
- vxor 24, 24, 27 # M
- vsldoi 26, 24, 29, 8 # mL
- vsldoi 29, 29, 24, 8 # mH
- vxor 23, 23, 26 # mL + L
-
- vsldoi 23, 23, 23, 8 # swap
- vxor 23, 23, 28
-
- vpmsumd 24, 14, 19 # H4.H * X.H
- vpmsumd 25, 11, 20
- vpmsumd 26, 8, 21
- vpmsumd 27, 5, 22
-
- vxor 24, 24, 25
- vxor 24, 24, 26
- vxor 24, 24, 27 # H
-
- vxor 24, 24, 29 # H + mH
-
- # sum hash and reduction with H Poly
- vsldoi 27, 23, 23, 8 # swap
- vpmsumd 23, 23, 2
- vxor 27, 27, 24
- vxor 23, 23, 27
-
- xxlor 32, 23+32, 23+32 # update hash
-
-.endm
-
- #
- # Compute update single hash
- #
-.macro ppc_update_hash_1x
- vxor 28, 28, 0
-
- vxor 19, 19, 19
-
- vpmsumd 22, 3, 28 # L
- vpmsumd 23, 4, 28 # M
- vpmsumd 24, 5, 28 # H
-
- vpmsumd 27, 22, 2 # reduction
-
- vsldoi 25, 23, 19, 8 # mL
- vsldoi 26, 19, 23, 8 # mH
- vxor 22, 22, 25 # LL + LL
- vxor 24, 24, 26 # HH + HH
-
- vsldoi 22, 22, 22, 8 # swap
- vxor 22, 22, 27
-
- vsldoi 20, 22, 22, 8 # swap
- vpmsumd 22, 22, 2 # reduction
- vxor 20, 20, 24
- vxor 22, 22, 20
-
- vmr 0, 22 # update hash
-
+.macro RESTORE_VRS VRS OFFSET FRAME
+ lxv \VRS+32, \OFFSET(\FRAME)
.endm
.macro SAVE_REGS
- stdu 1,-640(1)
mflr 0
+ std 0, 16(1)
+ stdu 1,-512(1)
- std 14,112(1)
- std 15,120(1)
- std 16,128(1)
- std 17,136(1)
- std 18,144(1)
- std 19,152(1)
- std 20,160(1)
- std 21,168(1)
- li 9, 256
- stvx 20, 9, 1
- addi 9, 9, 16
- stvx 21, 9, 1
- addi 9, 9, 16
- stvx 22, 9, 1
- addi 9, 9, 16
- stvx 23, 9, 1
- addi 9, 9, 16
- stvx 24, 9, 1
- addi 9, 9, 16
- stvx 25, 9, 1
- addi 9, 9, 16
- stvx 26, 9, 1
- addi 9, 9, 16
- stvx 27, 9, 1
- addi 9, 9, 16
- stvx 28, 9, 1
- addi 9, 9, 16
- stvx 29, 9, 1
- addi 9, 9, 16
- stvx 30, 9, 1
- addi 9, 9, 16
- stvx 31, 9, 1
- stxv 14, 464(1)
- stxv 15, 480(1)
- stxv 16, 496(1)
- stxv 17, 512(1)
- stxv 18, 528(1)
- stxv 19, 544(1)
- stxv 20, 560(1)
- stxv 21, 576(1)
- stxv 22, 592(1)
- std 0, 656(1)
-.endm
+ SAVE_GPR 14, 112, 1
+ SAVE_GPR 15, 120, 1
+ SAVE_GPR 16, 128, 1
+ SAVE_GPR 17, 136, 1
+ SAVE_GPR 18, 144, 1
+ SAVE_GPR 19, 152, 1
+ SAVE_GPR 20, 160, 1
+ SAVE_GPR 21, 168, 1
+ SAVE_GPR 22, 176, 1
+ SAVE_GPR 23, 184, 1
+ SAVE_GPR 24, 192, 1
+
+ addi 9, 1, 256
+ SAVE_VRS 20, 0, 9
+ SAVE_VRS 21, 16, 9
+ SAVE_VRS 22, 32, 9
+ SAVE_VRS 23, 48, 9
+ SAVE_VRS 24, 64, 9
+ SAVE_VRS 25, 80, 9
+ SAVE_VRS 26, 96, 9
+ SAVE_VRS 27, 112, 9
+ SAVE_VRS 28, 128, 9
+ SAVE_VRS 29, 144, 9
+ SAVE_VRS 30, 160, 9
+ SAVE_VRS 31, 176, 9
+.endm # SAVE_REGS
.macro RESTORE_REGS
- lxv 14, 464(1)
- lxv 15, 480(1)
- lxv 16, 496(1)
- lxv 17, 512(1)
- lxv 18, 528(1)
- lxv 19, 544(1)
- lxv 20, 560(1)
- lxv 21, 576(1)
- lxv 22, 592(1)
- li 9, 256
- lvx 20, 9, 1
- addi 9, 9, 16
- lvx 21, 9, 1
- addi 9, 9, 16
- lvx 22, 9, 1
- addi 9, 9, 16
- lvx 23, 9, 1
- addi 9, 9, 16
- lvx 24, 9, 1
- addi 9, 9, 16
- lvx 25, 9, 1
- addi 9, 9, 16
- lvx 26, 9, 1
- addi 9, 9, 16
- lvx 27, 9, 1
- addi 9, 9, 16
- lvx 28, 9, 1
- addi 9, 9, 16
- lvx 29, 9, 1
- addi 9, 9, 16
- lvx 30, 9, 1
- addi 9, 9, 16
- lvx 31, 9, 1
+ addi 9, 1, 256
+ RESTORE_VRS 20, 0, 9
+ RESTORE_VRS 21, 16, 9
+ RESTORE_VRS 22, 32, 9
+ RESTORE_VRS 23, 48, 9
+ RESTORE_VRS 24, 64, 9
+ RESTORE_VRS 25, 80, 9
+ RESTORE_VRS 26, 96, 9
+ RESTORE_VRS 27, 112, 9
+ RESTORE_VRS 28, 128, 9
+ RESTORE_VRS 29, 144, 9
+ RESTORE_VRS 30, 160, 9
+ RESTORE_VRS 31, 176, 9
- ld 0, 656(1)
- ld 14,112(1)
- ld 15,120(1)
- ld 16,128(1)
- ld 17,136(1)
- ld 18,144(1)
- ld 19,152(1)
- ld 20,160(1)
- ld 21,168(1)
+ RESTORE_GPR 14, 112, 1
+ RESTORE_GPR 15, 120, 1
+ RESTORE_GPR 16, 128, 1
+ RESTORE_GPR 17, 136, 1
+ RESTORE_GPR 18, 144, 1
+ RESTORE_GPR 19, 152, 1
+ RESTORE_GPR 20, 160, 1
+ RESTORE_GPR 21, 168, 1
+ RESTORE_GPR 22, 176, 1
+ RESTORE_GPR 23, 184, 1
+ RESTORE_GPR 24, 192, 1
- mtlr 0
- addi 1, 1, 640
+ addi 1, 1, 512
+ ld 0, 16(1)
+ mtlr 0
+.endm # RESTORE_REGS
+
+# 4x loops
+.macro AES_CIPHER_4x _VCIPHER ST r
+ \_VCIPHER \ST, \ST, \r
+ \_VCIPHER \ST+1, \ST+1, \r
+ \_VCIPHER \ST+2, \ST+2, \r
+ \_VCIPHER \ST+3, \ST+3, \r
.endm
+# 8x loops
+.macro AES_CIPHER_8x _VCIPHER ST r
+ \_VCIPHER \ST, \ST, \r
+ \_VCIPHER \ST+1, \ST+1, \r
+ \_VCIPHER \ST+2, \ST+2, \r
+ \_VCIPHER \ST+3, \ST+3, \r
+ \_VCIPHER \ST+4, \ST+4, \r
+ \_VCIPHER \ST+5, \ST+5, \r
+ \_VCIPHER \ST+6, \ST+6, \r
+ \_VCIPHER \ST+7, \ST+7, \r
+.endm
+
+.macro LOOP_8AES_STATE
+ xxlor 32+23, 1, 1
+ xxlor 32+24, 2, 2
+ xxlor 32+25, 3, 3
+ xxlor 32+26, 4, 4
+ AES_CIPHER_8x vcipher, 15, 23
+ AES_CIPHER_8x vcipher, 15, 24
+ AES_CIPHER_8x vcipher, 15, 25
+ AES_CIPHER_8x vcipher, 15, 26
+ xxlor 32+23, 5, 5
+ xxlor 32+24, 6, 6
+ xxlor 32+25, 7, 7
+ xxlor 32+26, 8, 8
+ AES_CIPHER_8x vcipher, 15, 23
+ AES_CIPHER_8x vcipher, 15, 24
+ AES_CIPHER_8x vcipher, 15, 25
+ AES_CIPHER_8x vcipher, 15, 26
+.endm
+
+#
+# PPC_GHASH4x(H, S1, S2, S3, S4): Compute 4x hash values based on Karatsuba method.
+# H: returning digest
+# S#: states
+#
+# S1 should xor with the previous digest
+#
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+# Scratch: v23 - v29
+#
+.macro PPC_GHASH4x H S1 S2 S3 S4
+
+ vpmsumd 23, 12, \S1 # H4.L * X.L
+ vpmsumd 24, 9, \S2
+ vpmsumd 25, 6, \S3
+ vpmsumd 26, 3, \S4
+
+ vpmsumd 27, 13, \S1 # H4.L * X.H + H4.H * X.L
+ vpmsumd 28, 10, \S2 # H3.L * X1.H + H3.H * X1.L
+
+ vxor 23, 23, 24
+ vxor 23, 23, 25
+ vxor 23, 23, 26 # L
+
+ vxor 24, 27, 28
+ vpmsumd 25, 7, \S3
+ vpmsumd 26, 4, \S4
+
+ vxor 24, 24, 25
+ vxor 24, 24, 26 # M
+
+ # sum hash and reduction with H Poly
+ vpmsumd 28, 23, 2 # reduction
+
+ vxor 1, 1, 1
+ vsldoi 25, 24, 1, 8 # mL
+ vsldoi 1, 1, 24, 8 # mH
+ vxor 23, 23, 25 # mL + L
+
+ # This performs swap and xor like,
+ # vsldoi 23, 23, 23, 8 # swap
+ # vxor 23, 23, 28
+ xxlor 32+25, 10, 10
+ vpermxor 23, 23, 28, 25
+
+ vpmsumd 26, 14, \S1 # H4.H * X.H
+ vpmsumd 27, 11, \S2
+ vpmsumd 28, 8, \S3
+ vpmsumd 29, 5, \S4
+
+ vxor 24, 26, 27
+ vxor 24, 24, 28
+ vxor 24, 24, 29
+
+ vxor 24, 24, 1
+
+ # sum hash and reduction with H Poly
+ vsldoi 25, 23, 23, 8 # swap
+ vpmsumd 23, 23, 2
+ vxor 27, 25, 24
+ vxor \H, 23, 27
+.endm
+
+#
+# Compute update single ghash
+# scratch: v1, v22..v27
+#
+.macro PPC_GHASH1x H S1
+
+ vxor 1, 1, 1
+
+ vpmsumd 22, 3, \S1 # L
+ vpmsumd 23, 4, \S1 # M
+ vpmsumd 24, 5, \S1 # H
+
+ vpmsumd 27, 22, 2 # reduction
+
+ vsldoi 25, 23, 1, 8 # mL
+ vsldoi 26, 1, 23, 8 # mH
+ vxor 22, 22, 25 # LL + LL
+ vxor 24, 24, 26 # HH + HH
+
+ xxlor 32+25, 10, 10
+ vpermxor 22, 22, 27, 25
+
+ vsldoi 23, 22, 22, 8 # swap
+ vpmsumd 22, 22, 2 # reduction
+ vxor 23, 23, 24
+ vxor \H, 22, 23
+.endm
+
+#
+# LOAD_HASH_TABLE
+# Xi = v0
+# H Poly = v2
+# Hash keys = v3 - v14
+#
.macro LOAD_HASH_TABLE
# Load Xi
lxvb16x 32, 0, 8 # load Xi
@@ -557,116 +298,434 @@
lxvd2x 14+32, 10, 8 # H^4h
.endm
- #
- # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
- # const char *rk, unsigned char iv[16], void *Xip);
- #
- # r3 - inp
- # r4 - out
- # r5 - len
- # r6 - AES round keys
- # r7 - iv and other data
- # r8 - Xi, HPoli, hash keys
- #
- # rounds is at offset 240 in rk
- # Xi is at 0 in gcm_table (Xip).
- #
-_GLOBAL(aes_p10_gcm_encrypt)
-.align 5
+################################################################################
+# Compute AES and ghash one block at a time.
+# r23: AES rounds
+# v30: current IV
+# vs0: roundkey 0
+#
+################################################################################
+SYM_FUNC_START_LOCAL(aes_gcm_crypt_1x)
+
+ cmpdi 5, 16
+ bge __More_1x
+ blr
+__More_1x:
+ li 10, 16
+ divdu 12, 5, 10
+
+ xxlxor 32+15, 32+30, 0
+
+ # Pre-load 8 AES rounds to scratch vectors.
+ xxlor 32+16, 1, 1
+ xxlor 32+17, 2, 2
+ xxlor 32+18, 3, 3
+ xxlor 32+19, 4, 4
+ xxlor 32+20, 5, 5
+ xxlor 32+21, 6, 6
+ xxlor 32+28, 7, 7
+ xxlor 32+29, 8, 8
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -9 # remaing AES rounds
+
+ cmpdi 12, 0
+ bgt __Loop_1x
+ blr
+
+__Loop_1x:
+ mtctr 22
+ addi 10, 6, 144
+ vcipher 15, 15, 16
+ vcipher 15, 15, 17
+ vcipher 15, 15, 18
+ vcipher 15, 15, 19
+ vcipher 15, 15, 20
+ vcipher 15, 15, 21
+ vcipher 15, 15, 28
+ vcipher 15, 15, 29
+
+__Loop_aes_1state:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_1state
+ lxv 32+1, 0(10) # last round key
+ lxvb16x 11, 0, 14 # load input block
+ vcipherlast 15, 15, 1
+
+ xxlxor 32+15, 32+15, 11
+ stxvb16x 32+15, 0, 9 # store output
+ addi 14, 14, 16
+ addi 9, 9, 16
+
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_1x
+ xxlor 15+32, 11, 11
+__Encrypt_1x:
+ vxor 15, 15, 0
+ PPC_GHASH1x 0, 15
+
+ addi 5, 5, -16
+ addi 11, 11, 16
+
+ vadduwm 30, 30, 31 # IV + counter
+ xxlxor 32+15, 32+30, 0
+ addi 12, 12, -1
+ cmpdi 12, 0
+ bgt __Loop_1x
+
+ stxvb16x 32+30, 0, 7 # update IV
+ stxvb16x 32+0, 0, 8 # update Xi
+ blr
+SYM_FUNC_END(aes_gcm_crypt_1x)
+
+################################################################################
+# Process a normal partial block when we come here.
+# Compute partial mask, Load and store partial block to stack.
+# Update partial_len and pblock.
+# pblock is (encrypted ^ AES state) for encrypt
+# and (input ^ AES state) for decrypt.
+#
+################################################################################
+SYM_FUNC_START_LOCAL(__Process_partial)
+
+ # create partial mask
+ vspltisb 16, -1
+ li 12, 16
+ sub 12, 12, 5
+ sldi 12, 12, 3
+ mtvsrdd 32+17, 0, 12
+ vslo 16, 16, 17 # partial block mask
+
+ lxvb16x 11, 0, 14 # load partial block
+ xxland 11, 11, 32+16
+
+ # AES crypt partial
+ xxlxor 32+15, 32+30, 0
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -1 # loop - 1
+ mtctr 22
+ addi 10, 6, 16
+
+__Loop_aes_pstate:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_pstate
+ lxv 32+1, 0(10) # last round key
+ vcipherlast 15, 15, 1
+
+ xxlxor 32+15, 32+15, 11
+ vand 15, 15, 16
+
+ # AES crypt output v15
+ # Write partial
+ li 10, 224
+ stxvb16x 15+32, 10, 1 # write v15 to stack
+ addi 10, 1, 223
+ addi 12, 9, -1
+ mtctr 5 # partial block len
+__Write_partial:
+ lbzu 22, 1(10)
+ stbu 22, 1(12)
+ bdnz __Write_partial
+
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_partial
+ xxlor 32+15, 11, 11 # decrypt using the input block
+__Encrypt_partial:
+ #vxor 15, 15, 0 # ^ previous hash
+ #PPC_GHASH1x 0, 15
+
+ add 14, 14, 5
+ add 9, 9, 5
+ std 5, 56(7) # update partial
+ sub 11, 11, 5
+ li 5, 0 # done last byte
+
+ #
+ # Don't increase IV since this is the last partial.
+ # It should get updated in gcm_update if no more data blocks.
+ #vadduwm 30, 30, 31 # increase IV
+ stxvb16x 32+30, 0, 7 # update IV
+ li 10, 64
+ stxvb16x 32+0, 0, 8 # Update X1
+ stxvb16x 32+15, 10, 7 # Update pblock
+ blr
+SYM_FUNC_END(__Process_partial)
+
+################################################################################
+# Combine partial blocks and ghash when we come here.
+#
+# The partial block has to be shifted to the right location to encrypt/decrypt
+# and compute ghash if combing the previous partial block is needed.
+# - Compute ghash for a full block. Clear Partial_len and pblock. Update IV.
+# Write Xi.
+# - Don't compute ghash if not full block. gcm_update will take care of it
+# is the last block. Update Partial_len and pblock.
+#
+################################################################################
+SYM_FUNC_START_LOCAL(__Combine_partial)
+
+ ld 12, 56(7)
+ mr 21, 5 # these bytes to be processed
+
+ li 17, 0
+ li 16, 16
+ sub 22, 16, 12 # bytes to complete a block
+ sub 17, 22, 5 # remaining bytes in a block
+ cmpdi 5, 16
+ ble __Inp_msg_less16
+ li 17, 0
+ mr 21, 22
+ b __Combine_continue
+__Inp_msg_less16:
+ cmpd 22, 5
+ bgt __Combine_continue
+ li 17, 0
+ mr 21, 22 # these bytes to be processed
+
+__Combine_continue:
+ # load msg and shift to the proper location and mask
+ vspltisb 16, -1
+ sldi 15, 12, 3
+ mtvsrdd 32+17, 0, 15
+ vslo 16, 16, 17
+ vsro 16, 16, 17
+ sldi 15, 17, 3
+ mtvsrdd 32+17, 0, 15
+ vsro 16, 16, 17
+ vslo 16, 16, 17 # mask
+
+ lxvb16x 32+19, 0, 14 # load partial block
+ sldi 15, 12, 3
+ mtvsrdd 32+17, 0, 15
+ vsro 19, 19, 17 # 0x00..xxxx??..??
+ sldi 15, 17, 3
+ mtvsrdd 32+17, 0, 15
+ vsro 19, 19, 17 # 0x00..xxxx
+ vslo 19, 19, 17 # shift back to form 0x00..xxxx00..00
+
+ # AES crypt partial
+ xxlxor 32+15, 32+30, 0
+ lwz 23, 240(6) # n rounds
+ addi 22, 23, -1 # loop - 1
+ mtctr 22
+ addi 10, 6, 16
+
+__Loop_aes_cpstate:
+ lxv 32+1, 0(10)
+ vcipher 15, 15, 1
+ addi 10, 10, 16
+ bdnz __Loop_aes_cpstate
+ lxv 32+1, 0(10) # last round key
+ vcipherlast 15, 15, 1
+
+ vxor 15, 15, 19
+ vand 15, 15, 16
+
+ # AES crypt output v15
+ # Write partial
+ li 10, 224
+ stxvb16x 15+32, 10, 1 # write v15 to stack
+ addi 10, 1, 223
+ add 10, 10, 12 # add offset
+ addi 15, 9, -1
+ mtctr 21 # partial block len
+__Write_combine_partial:
+ lbzu 22, 1(10)
+ stbu 22, 1(15)
+ bdnz __Write_combine_partial
+
+ add 14, 14, 21
+ add 11, 11, 21
+ add 9, 9, 21
+ sub 5, 5, 21
+
+ # Encrypt/Decrypt?
+ cmpdi 24, 0 # decrypt?
+ bne __Encrypt_combine_partial
+ vmr 15, 19 # decrypt using the input block
+
+__Encrypt_combine_partial:
+ #
+ # Update partial flag and combine ghash.
+__Update_partial_ghash:
+ li 10, 64
+ lxvb16x 32+17, 10, 7 # load previous pblock
+ add 12, 12, 21 # combined pprocessed
+ vxor 15, 15, 17 # combined pblock
+
+ cmpdi 12, 16
+ beq __Clear_partial_flag
+ std 12, 56(7) # update partial len
+ stxvb16x 32+15, 10, 7 # Update current pblock
+ blr
+
+__Clear_partial_flag:
+ li 12, 0
+ std 12, 56(7)
+ # Update IV and ghash here
+ vadduwm 30, 30, 31 # increase IV
+ stxvb16x 32+30, 0, 7 # update IV
+
+ # v15 either is either (input blockor encrypted)^(AES state)
+ vxor 15, 15, 0
+ PPC_GHASH1x 0, 15
+ stxvb16x 32+0, 10, 7 # update pblock for debug?
+ stxvb16x 32+0, 0, 8 # update Xi
+ blr
+SYM_FUNC_END(__Combine_partial)
+
+################################################################################
+# gcm_update(iv, Xi) - compute last hash
+#
+################################################################################
+SYM_FUNC_START(gcm_update)
+
+ ld 10, 56(3)
+ cmpdi 10, 0
+ beq __no_update
+
+ lxvb16x 32, 0, 4 # load Xi
+ # load Hash - h^4, h^3, h^2, h
+ li 10, 32
+ lxvd2x 2+32, 10, 4 # H Poli
+ li 10, 48
+ lxvd2x 3+32, 10, 4 # Hl
+ li 10, 64
+ lxvd2x 4+32, 10, 4 # H
+ li 10, 80
+ lxvd2x 5+32, 10, 4 # Hh
+
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+
+ li 9, 64
+ lxvb16x 32+6, 9, 3 # load pblock
+ vxor 6, 6, 0
+
+ vxor 1, 1, 1
+ vpmsumd 12, 3, 6 # L
+ vpmsumd 13, 4, 6 # M
+ vpmsumd 14, 5, 6 # H
+ vpmsumd 17, 12, 2 # reduction
+ vsldoi 15, 13, 1, 8 # mL
+ vsldoi 16, 1, 13, 8 # mH
+ vxor 12, 12, 15 # LL + LL
+ vxor 14, 14, 16 # HH + HH
+ xxlor 32+15, 10, 10
+ vpermxor 12, 12, 17, 15
+ vsldoi 13, 12, 12, 8 # swap
+ vpmsumd 12, 12, 2 # reduction
+ vxor 13, 13, 14
+ vxor 7, 12, 13
+
+ #vxor 0, 0, 0
+ #stxvb16x 32+0, 9, 3
+ li 10, 0
+ std 10, 56(3)
+ stxvb16x 32+7, 0, 4
+
+__no_update:
+ blr
+SYM_FUNC_END(gcm_update)
+
+################################################################################
+# aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+#
+# r3 - inp
+# r4 - out
+# r5 - len
+# r6 - AES round keys
+# r7 - iv and other data
+# r8 - Xi, HPoli, hash keys
+#
+# rounds is at offset 240 in rk
+# Xi is at 0 in gcm_table (Xip).
+#
+################################################################################
+SYM_FUNC_START(aes_p10_gcm_encrypt)
+
+ cmpdi 5, 0
+ ble __Invalid_msg_len
SAVE_REGS
-
LOAD_HASH_TABLE
# initialize ICB: GHASH( IV ), IV - r7
lxvb16x 30+32, 0, 7 # load IV - v30
- mr 12, 5 # length
- li 11, 0 # block index
+ mr 14, 3
+ mr 9, 4
# counter 1
vxor 31, 31, 31
vspltisb 22, 1
vsldoi 31, 31, 22,1 # counter 1
- # load round key to VSR
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
+
+ # load 9 round keys to VSR
+ lxv 0, 0(6) # round key 0
+ lxv 1, 16(6) # round key 1
+ lxv 2, 32(6) # round key 2
+ lxv 3, 48(6) # round key 3
+ lxv 4, 64(6) # round key 4
+ lxv 5, 80(6) # round key 5
+ lxv 6, 96(6) # round key 6
+ lxv 7, 112(6) # round key 7
+ lxv 8, 128(6) # round key 8
# load rounds - 10 (128), 12 (192), 14 (256)
- lwz 9,240(6)
+ lwz 23, 240(6) # n rounds
+ li 24, 1 # encrypt
+__Process_encrypt:
#
- # vxor state, state, w # addroundkey
- xxlor 32+29, 0, 0
- vxor 15, 30, 29 # IV + round key - add round key 0
-
- cmpdi 9, 10
- beq Loop_aes_gcm_8x
-
- # load 2 more round keys (v11, v12)
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
-
- cmpdi 9, 12
- beq Loop_aes_gcm_8x
-
- # load 2 more round keys (v11, v12, v13, v14)
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq Loop_aes_gcm_8x
-
- b aes_gcm_out
-
-.align 5
-Loop_aes_gcm_8x:
- mr 14, 3
- mr 9, 4
-
+ # Process different blocks
#
- # check partial block
- #
-Continue_partial_check:
- ld 15, 56(7)
- cmpdi 15, 0
- beq Continue
- bgt Final_block
- cmpdi 15, 16
- blt Final_block
+ ld 12, 56(7)
+ cmpdi 12, 0
+ bgt __Do_combine_enc
+ cmpdi 5, 128
+ blt __Process_more_enc
-Continue:
- # n blcoks
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_enc:
+ # 8x blcoks
li 10, 128
- divdu 10, 12, 10 # n 128 bytes-blocks
- cmpdi 10, 0
- beq Loop_last_block
+ divdu 12, 5, 10 # n 128 bytes-blocks
- vaddudm 30, 30, 31 # IV + counter
- vxor 16, 30, 29
- vaddudm 30, 30, 31
- vxor 17, 30, 29
- vaddudm 30, 30, 31
- vxor 18, 30, 29
- vaddudm 30, 30, 31
- vxor 19, 30, 29
- vaddudm 30, 30, 31
- vxor 20, 30, 29
- vaddudm 30, 30, 31
- vxor 21, 30, 29
- vaddudm 30, 30, 31
- vxor 22, 30, 29
+ addi 12, 12, -1 # loop - 1
- mtctr 10
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ # vxor state, state, w # addroundkey
+ xxlor 32+29, 0, 0
+ vxor 15, 15, 29 # IV + round key - add round key 0
+ vxor 16, 16, 29
+ vxor 17, 17, 29
+ vxor 18, 18, 29
+ vxor 19, 19, 29
+ vxor 20, 20, 29
+ vxor 21, 21, 29
+ vxor 22, 22, 29
li 15, 16
li 16, 32
@@ -676,846 +735,502 @@
li 20, 96
li 21, 112
- lwz 10, 240(6)
-
-Loop_8x_block:
-
- lxvb16x 15, 0, 14 # load block
- lxvb16x 16, 15, 14 # load block
- lxvb16x 17, 16, 14 # load block
- lxvb16x 18, 17, 14 # load block
- lxvb16x 19, 18, 14 # load block
- lxvb16x 20, 19, 14 # load block
- lxvb16x 21, 20, 14 # load block
- lxvb16x 22, 21, 14 # load block
- addi 14, 14, 128
-
- Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_ghash
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_ghash
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_ghash
- b aes_gcm_out
-
-Do_next_ghash:
-
#
- # last round
- vcipherlast 15, 15, 23
- vcipherlast 16, 16, 23
+ # Pre-compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9 # store output
+ LOOP_8AES_STATE # process 8 AES keys
- vcipherlast 17, 17, 23
- vcipherlast 18, 18, 23
+__PreLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state
+ lxv 32+1, 0(10) # last round key (v1)
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9 # store output
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9 # store output
+ cmpdi 12, 0 # Only one loop (8 block)
+ beq __Finish_ghash
- vcipherlast 19, 19, 23
- vcipherlast 20, 20, 23
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_enc:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9 # store output
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9 # store output
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
- vcipherlast 21, 21, 23
- vcipherlast 22, 22, 23
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9 # store output
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9 # store output
-
- addi 9, 9, 128
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
# ghash here
- ppc_aes_gcm_ghash2_4x
+ vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
- xxlor 27+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vmr 29, 30
- vxor 15, 30, 27 # add round key
- vaddudm 30, 30, 31
- vxor 16, 30, 27
- vaddudm 30, 30, 31
- vxor 17, 30, 27
- vaddudm 30, 30, 31
- vxor 18, 30, 27
- vaddudm 30, 30, 31
- vxor 19, 30, 27
- vaddudm 30, 30, 31
- vxor 20, 30, 27
- vaddudm 30, 30, 31
- vxor 21, 30, 27
- vaddudm 30, 30, 31
- vxor 22, 30, 27
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
- addi 12, 12, -128
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlor 32+27, 0, 0 # restore roundkey 0
+ vxor 15, 15, 27 # IV + round key - add round key 0
+ vxor 16, 16, 27
+ vxor 17, 17, 27
+ vxor 18, 18, 27
+ vxor 19, 19, 27
+ vxor 20, 20, 27
+ vxor 21, 21, 27
+ vxor 22, 22, 27
+
+ addi 5, 5, -128
addi 11, 11, 128
- bdnz Loop_8x_block
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+__LastLoop_aes_state:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state
+ lxv 32+1, 0(10) # last round key (v1)
- vmr 30, 29
- stxvb16x 30+32, 0, 7 # update IV
+ addi 12, 12, -1
+ cmpdi 12, 0
+ bne __Loop_8x_block_enc
-Loop_last_block:
- cmpdi 12, 0
+__Finish_ghash:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
+
+ vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+30, 0, 7 # update IV
+ stxvb16x 32+0, 0, 8 # update Xi
+
+ addi 5, 5, -128
+ addi 11, 11, 128
+
+ #
+ # Done 8x blocks
+ #
+
+ cmpdi 5, 0
beq aes_gcm_out
- # loop last few blocks
- li 10, 16
- divdu 10, 12, 10
+__Process_more_enc:
+ li 24, 1 # encrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
- mtctr 10
+ bl __Process_partial
+ cmpdi 5, 0
+ beq aes_gcm_out
+__Do_combine_enc:
+ bl __Combine_partial
+ cmpdi 5, 0
+ bgt __Process_encrypt
+ b aes_gcm_out
- lwz 10, 240(6)
+SYM_FUNC_END(aes_p10_gcm_encrypt)
- cmpdi 12, 16
- blt Final_block
+################################################################################
+# aes_p10_gcm_decrypt (const void *inp, void *out, size_t len,
+# const char *rk, unsigned char iv[16], void *Xip);
+# 8x Decrypt
+#
+################################################################################
+SYM_FUNC_START(aes_p10_gcm_decrypt)
-Next_rem_block:
- lxvb16x 15, 0, 14 # load block
+ cmpdi 5, 0
+ ble __Invalid_msg_len
- Loop_aes_middle_1x
+ SAVE_REGS
+ LOAD_HASH_TABLE
- xxlor 23+32, 10, 10
+ # initialize ICB: GHASH( IV ), IV - r7
+ lxvb16x 30+32, 0, 7 # load IV - v30
- cmpdi 10, 10
- beq Do_next_1x
+ mr 14, 3
+ mr 9, 4
- # 192 bits
- xxlor 24+32, 11, 11
+ # counter 1
+ vxor 31, 31, 31
+ vspltisb 22, 1
+ vsldoi 31, 31, 22,1 # counter 1
- vcipher 15, 15, 23
- vcipher 15, 15, 24
+ addis 11, 2, permx@toc@ha
+ addi 11, 11, permx@toc@l
+ lxv 10, 0(11) # vs10: vpermxor vector
+ li 11, 0
- xxlor 23+32, 12, 12
+ # load 9 round keys to VSR
+ lxv 0, 0(6) # round key 0
+ lxv 1, 16(6) # round key 1
+ lxv 2, 32(6) # round key 2
+ lxv 3, 48(6) # round key 3
+ lxv 4, 64(6) # round key 4
+ lxv 5, 80(6) # round key 5
+ lxv 6, 96(6) # round key 6
+ lxv 7, 112(6) # round key 7
+ lxv 8, 128(6) # round key 8
- cmpdi 10, 12
- beq Do_next_1x
+ # load rounds - 10 (128), 12 (192), 14 (256)
+ lwz 23, 240(6) # n rounds
+ li 24, 0 # decrypt
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_1x
-
-Do_next_1x:
- vcipherlast 15, 15, 23
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- addi 14, 14, 16
- addi 9, 9, 16
-
- vmr 28, 15
- ppc_update_hash_1x
-
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vxor 15, 30, 19 # add round key
-
- bdnz Next_rem_block
-
- li 15, 0
- std 15, 56(7) # clear partial?
- stxvb16x 30+32, 0, 7 # update IV
+__Process_decrypt:
+ #
+ # Process different blocks
+ #
+ ld 12, 56(7)
cmpdi 12, 0
- beq aes_gcm_out
+ bgt __Do_combine_dec
+ cmpdi 5, 128
+ blt __Process_more_dec
-Final_block:
- lwz 10, 240(6)
- Loop_aes_middle_1x
+#
+# Process 8x AES/GCM blocks
+#
+__Process_8x_dec:
+ # 8x blcoks
+ li 10, 128
+ divdu 12, 5, 10 # n 128 bytes-blocks
- xxlor 23+32, 10, 10
+ addi 12, 12, -1 # loop - 1
- cmpdi 10, 10
- beq Do_final_1x
+ vmr 15, 30 # first state: IV
+ vadduwm 16, 15, 31 # state + counter
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_final_1x
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_final_1x
-
-Do_final_1x:
- vcipherlast 15, 15, 23
-
- # check partial block
- li 21, 0 # encrypt
- ld 15, 56(7) # partial?
- cmpdi 15, 0
- beq Normal_block
- bl Do_partial_block
-
- cmpdi 12, 0
- ble aes_gcm_out
-
- b Continue_partial_check
-
-Normal_block:
- lxvb16x 15, 0, 14 # load last block
- xxlxor 47, 47, 15
-
- # create partial block mask
- li 15, 16
- sub 15, 15, 12 # index to the mask
-
- vspltisb 16, -1 # first 16 bytes - 0xffff...ff
- vspltisb 17, 0 # second 16 bytes - 0x0000...00
- li 10, 192
- stvx 16, 10, 1
- addi 10, 10, 16
- stvx 17, 10, 1
-
- addi 10, 1, 192
- lxvb16x 16, 15, 10 # load partial block mask
- xxland 47, 47, 16
-
- vmr 28, 15
- ppc_update_hash_1x
-
- # * should store only the remaining bytes.
- bl Write_partial_block
-
- stxvb16x 30+32, 0, 7 # update IV
- std 12, 56(7) # update partial?
- li 16, 16
-
- stxvb16x 32, 0, 8 # write out Xi
- stxvb16x 32, 16, 8 # write out Xi
- b aes_gcm_out
-
- #
- # Compute data mask
- #
-.macro GEN_MASK _mask _start _end
- vspltisb 16, -1 # first 16 bytes - 0xffff...ff
- vspltisb 17, 0 # second 16 bytes - 0x0000...00
- li 10, 192
- stxvb16x 17+32, 10, 1
- add 10, 10, \_start
- stxvb16x 16+32, 10, 1
- add 10, 10, \_end
- stxvb16x 17+32, 10, 1
-
- addi 10, 1, 192
- lxvb16x \_mask, 0, 10 # load partial block mask
-.endm
-
- #
- # Handle multiple partial blocks for encrypt and decrypt
- # operations.
- #
-SYM_FUNC_START_LOCAL(Do_partial_block)
- add 17, 15, 5
- cmpdi 17, 16
- bgt Big_block
- GEN_MASK 18, 15, 5
- b _Partial
-SYM_FUNC_END(Do_partial_block)
-Big_block:
- li 16, 16
- GEN_MASK 18, 15, 16
-
-_Partial:
- lxvb16x 17+32, 0, 14 # load last block
- sldi 16, 15, 3
- mtvsrdd 32+16, 0, 16
- vsro 17, 17, 16
- xxlxor 47, 47, 17+32
- xxland 47, 47, 18
-
- vxor 0, 0, 0 # clear Xi
- vmr 28, 15
-
- cmpdi 21, 0 # encrypt/decrypt ops?
- beq Skip_decrypt
- xxland 32+28, 32+17, 18
-
-Skip_decrypt:
-
- ppc_update_hash_1x
-
- li 16, 16
- lxvb16x 32+29, 16, 8
- vxor 0, 0, 29
- stxvb16x 32, 0, 8 # save Xi
- stxvb16x 32, 16, 8 # save Xi
-
- # store partial block
- # loop the rest of the stream if any
- sldi 16, 15, 3
- mtvsrdd 32+16, 0, 16
- vslo 15, 15, 16
- #stxvb16x 15+32, 0, 9 # last block
-
- li 16, 16
- sub 17, 16, 15 # 16 - partial
-
- add 16, 15, 5
- cmpdi 16, 16
- bgt Larger_16
- mr 17, 5
-Larger_16:
-
- # write partial
- li 10, 192
- stxvb16x 15+32, 10, 1 # save current block
-
- addi 10, 9, -1
- addi 16, 1, 191
- mtctr 17 # move partial byte count
-
-Write_last_partial:
- lbzu 18, 1(16)
- stbu 18, 1(10)
- bdnz Write_last_partial
- # Complete loop partial
-
- add 14, 14, 17
- add 9, 9, 17
- sub 12, 12, 17
- add 11, 11, 17
-
- add 15, 15, 5
- cmpdi 15, 16
- blt Save_partial
-
- vaddudm 30, 30, 31
- stxvb16x 30+32, 0, 7 # update IV
+ # vxor state, state, w # addroundkey
xxlor 32+29, 0, 0
- vxor 15, 30, 29 # IV + round key - add round key 0
- li 15, 0
- std 15, 56(7) # partial done - clear
- b Partial_done
-Save_partial:
- std 15, 56(7) # partial
+ vxor 15, 15, 29 # IV + round key - add round key 0
+ vxor 16, 16, 29
+ vxor 17, 17, 29
+ vxor 18, 18, 29
+ vxor 19, 19, 29
+ vxor 20, 20, 29
+ vxor 21, 21, 29
+ vxor 22, 22, 29
-Partial_done:
- blr
+ li 15, 16
+ li 16, 32
+ li 17, 48
+ li 18, 64
+ li 19, 80
+ li 20, 96
+ li 21, 112
- #
- # Write partial block
- # r9 - output
- # r12 - remaining bytes
- # v15 - partial input data
- #
-SYM_FUNC_START_LOCAL(Write_partial_block)
- li 10, 192
- stxvb16x 15+32, 10, 1 # last block
+ #
+ # Pre-compute first 8 AES state and leave 1/3/5 more rounds
+ # for the loop.
+ #
+ addi 22, 23, -9 # process 8 keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
- addi 10, 9, -1
- addi 16, 1, 191
+ LOOP_8AES_STATE # process 8 AES keys
- mtctr 12 # remaining bytes
- li 15, 0
+__PreLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __PreLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
-Write_last_byte:
- lbzu 14, 1(16)
- stbu 14, 1(10)
- bdnz Write_last_byte
- blr
-SYM_FUNC_END(Write_partial_block)
+ cmpdi 12, 0 # Only one loop (8 block)
+ beq __Finish_ghash_dec
-aes_gcm_out:
- # out = state
- stxvb16x 32, 0, 8 # write out Xi
- add 3, 11, 12 # return count
+#
+# Loop 8x blocks and compute ghash
+#
+__Loop_8x_block_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+
+ addi 9, 9, 128
+
+ vmr 15, 23
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
+
+ # ghash here
+ vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 32+15, 9, 9 # last state
+ vadduwm 15, 15, 31 # state + counter
+ vadduwm 16, 15, 31
+ vadduwm 17, 16, 31
+ vadduwm 18, 17, 31
+ vadduwm 19, 18, 31
+ vadduwm 20, 19, 31
+ vadduwm 21, 20, 31
+ vadduwm 22, 21, 31
+ xxlor 9, 32+22, 32+22 # save last state
+
+ xxlor 32+27, 0, 0 # restore roundkey 0
+ vxor 15, 15, 27 # IV + round key - add round key 0
+ vxor 16, 16, 27
+ vxor 17, 17, 27
+ vxor 18, 18, 27
+ vxor 19, 19, 27
+ vxor 20, 20, 27
+ vxor 21, 21, 27
+ vxor 22, 22, 27
+
+ addi 5, 5, -128
+ addi 11, 11, 128
+
+ LOOP_8AES_STATE # process 8 AES keys
+ mtctr 22 # AES key loop
+ addi 10, 6, 144
+__LastLoop_aes_state_dec:
+ lxv 32+1, 0(10) # round key
+ AES_CIPHER_8x vcipher 15 1
+ addi 10, 10, 16
+ bdnz __LastLoop_aes_state_dec
+ lxv 32+1, 0(10) # last round key (v1)
+
+ addi 12, 12, -1
+ cmpdi 12, 0
+ bne __Loop_8x_block_dec
+
+__Finish_ghash_dec:
+ vcipherlast 15, 15, 1
+ vcipherlast 16, 16, 1
+ vcipherlast 17, 17, 1
+ vcipherlast 18, 18, 1
+ vcipherlast 19, 19, 1
+ vcipherlast 20, 20, 1
+ vcipherlast 21, 21, 1
+ vcipherlast 22, 22, 1
+
+ lxvb16x 32+23, 0, 14 # load block
+ lxvb16x 32+24, 15, 14 # load block
+ lxvb16x 32+25, 16, 14 # load block
+ lxvb16x 32+26, 17, 14 # load block
+ lxvb16x 32+27, 18, 14 # load block
+ lxvb16x 32+28, 19, 14 # load block
+ lxvb16x 32+29, 20, 14 # load block
+ lxvb16x 32+30, 21, 14 # load block
+ addi 14, 14, 128
+
+ vxor 15, 15, 23
+ vxor 16, 16, 24
+ vxor 17, 17, 25
+ vxor 18, 18, 26
+ vxor 19, 19, 27
+ vxor 20, 20, 28
+ vxor 21, 21, 29
+ vxor 22, 22, 30
+
+ stxvb16x 47, 0, 9 # store output
+ stxvb16x 48, 15, 9 # store output
+ stxvb16x 49, 16, 9 # store output
+ stxvb16x 50, 17, 9 # store output
+ stxvb16x 51, 18, 9 # store output
+ stxvb16x 52, 19, 9 # store output
+ stxvb16x 53, 20, 9 # store output
+ stxvb16x 54, 21, 9 # store output
+ addi 9, 9, 128
+
+ #vmr 15, 23
+ vxor 15, 23, 0
+ vmr 16, 24
+ vmr 17, 25
+ vmr 18, 26
+ vmr 19, 27
+ vmr 20, 28
+ vmr 21, 29
+ vmr 22, 30
+
+ #vxor 15, 15, 0
+ PPC_GHASH4x 0, 15, 16, 17, 18
+
+ vxor 19, 19, 0
+ PPC_GHASH4x 0, 19, 20, 21, 22
+
+ xxlor 30+32, 9, 9 # last ctr
+ vadduwm 30, 30, 31 # increase ctr
+ stxvb16x 32+30, 0, 7 # update IV
+ stxvb16x 32+0, 0, 8 # update Xi
+
+ addi 5, 5, -128
+ addi 11, 11, 128
+
+ #
+ # Done 8x blocks
+ #
+
+ cmpdi 5, 0
+ beq aes_gcm_out
+
+__Process_more_dec:
+ li 24, 0 # decrypt
+ bl aes_gcm_crypt_1x
+ cmpdi 5, 0
+ beq aes_gcm_out
+
+ bl __Process_partial
+ cmpdi 5, 0
+ beq aes_gcm_out
+__Do_combine_dec:
+ bl __Combine_partial
+ cmpdi 5, 0
+ bgt __Process_decrypt
+ b aes_gcm_out
+SYM_FUNC_END(aes_p10_gcm_decrypt)
+
+SYM_FUNC_START_LOCAL(aes_gcm_out)
+
+ mr 3, 11 # return count
RESTORE_REGS
blr
- #
- # 8x Decrypt
- #
-_GLOBAL(aes_p10_gcm_decrypt)
-.align 5
+__Invalid_msg_len:
+ li 3, 0
+ blr
+SYM_FUNC_END(aes_gcm_out)
- SAVE_REGS
-
- LOAD_HASH_TABLE
-
- # initialize ICB: GHASH( IV ), IV - r7
- lxvb16x 30+32, 0, 7 # load IV - v30
-
- mr 12, 5 # length
- li 11, 0 # block index
-
- # counter 1
- vxor 31, 31, 31
- vspltisb 22, 1
- vsldoi 31, 31, 22,1 # counter 1
-
- # load round key to VSR
- lxv 0, 0(6)
- lxv 1, 0x10(6)
- lxv 2, 0x20(6)
- lxv 3, 0x30(6)
- lxv 4, 0x40(6)
- lxv 5, 0x50(6)
- lxv 6, 0x60(6)
- lxv 7, 0x70(6)
- lxv 8, 0x80(6)
- lxv 9, 0x90(6)
- lxv 10, 0xa0(6)
-
- # load rounds - 10 (128), 12 (192), 14 (256)
- lwz 9,240(6)
-
- #
- # vxor state, state, w # addroundkey
- xxlor 32+29, 0, 0
- vxor 15, 30, 29 # IV + round key - add round key 0
-
- cmpdi 9, 10
- beq Loop_aes_gcm_8x_dec
-
- # load 2 more round keys (v11, v12)
- lxv 11, 0xb0(6)
- lxv 12, 0xc0(6)
-
- cmpdi 9, 12
- beq Loop_aes_gcm_8x_dec
-
- # load 2 more round keys (v11, v12, v13, v14)
- lxv 13, 0xd0(6)
- lxv 14, 0xe0(6)
- cmpdi 9, 14
- beq Loop_aes_gcm_8x_dec
-
- b aes_gcm_out
-
-.align 5
-Loop_aes_gcm_8x_dec:
- mr 14, 3
- mr 9, 4
-
- #
- # check partial block
- #
-Continue_partial_check_dec:
- ld 15, 56(7)
- cmpdi 15, 0
- beq Continue_dec
- bgt Final_block_dec
- cmpdi 15, 16
- blt Final_block_dec
-
-Continue_dec:
- # n blcoks
- li 10, 128
- divdu 10, 12, 10 # n 128 bytes-blocks
- cmpdi 10, 0
- beq Loop_last_block_dec
-
- vaddudm 30, 30, 31 # IV + counter
- vxor 16, 30, 29
- vaddudm 30, 30, 31
- vxor 17, 30, 29
- vaddudm 30, 30, 31
- vxor 18, 30, 29
- vaddudm 30, 30, 31
- vxor 19, 30, 29
- vaddudm 30, 30, 31
- vxor 20, 30, 29
- vaddudm 30, 30, 31
- vxor 21, 30, 29
- vaddudm 30, 30, 31
- vxor 22, 30, 29
-
- mtctr 10
-
- li 15, 16
- li 16, 32
- li 17, 48
- li 18, 64
- li 19, 80
- li 20, 96
- li 21, 112
-
- lwz 10, 240(6)
-
-Loop_8x_block_dec:
-
- lxvb16x 15, 0, 14 # load block
- lxvb16x 16, 15, 14 # load block
- lxvb16x 17, 16, 14 # load block
- lxvb16x 18, 17, 14 # load block
- lxvb16x 19, 18, 14 # load block
- lxvb16x 20, 19, 14 # load block
- lxvb16x 21, 20, 14 # load block
- lxvb16x 22, 21, 14 # load block
- addi 14, 14, 128
-
- Loop_aes_middle8x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_ghash_dec
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_ghash_dec
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 16, 16, 23
- vcipher 17, 17, 23
- vcipher 18, 18, 23
- vcipher 19, 19, 23
- vcipher 20, 20, 23
- vcipher 21, 21, 23
- vcipher 22, 22, 23
-
- vcipher 15, 15, 24
- vcipher 16, 16, 24
- vcipher 17, 17, 24
- vcipher 18, 18, 24
- vcipher 19, 19, 24
- vcipher 20, 20, 24
- vcipher 21, 21, 24
- vcipher 22, 22, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_ghash_dec
- b aes_gcm_out
-
-Do_next_ghash_dec:
-
- #
- # last round
- vcipherlast 15, 15, 23
- vcipherlast 16, 16, 23
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- xxlxor 48, 48, 16
- stxvb16x 48, 15, 9 # store output
-
- vcipherlast 17, 17, 23
- vcipherlast 18, 18, 23
-
- xxlxor 49, 49, 17
- stxvb16x 49, 16, 9 # store output
- xxlxor 50, 50, 18
- stxvb16x 50, 17, 9 # store output
-
- vcipherlast 19, 19, 23
- vcipherlast 20, 20, 23
-
- xxlxor 51, 51, 19
- stxvb16x 51, 18, 9 # store output
- xxlxor 52, 52, 20
- stxvb16x 52, 19, 9 # store output
-
- vcipherlast 21, 21, 23
- vcipherlast 22, 22, 23
-
- xxlxor 53, 53, 21
- stxvb16x 53, 20, 9 # store output
- xxlxor 54, 54, 22
- stxvb16x 54, 21, 9 # store output
-
- addi 9, 9, 128
-
- xxlor 15+32, 15, 15
- xxlor 16+32, 16, 16
- xxlor 17+32, 17, 17
- xxlor 18+32, 18, 18
- xxlor 19+32, 19, 19
- xxlor 20+32, 20, 20
- xxlor 21+32, 21, 21
- xxlor 22+32, 22, 22
-
- # ghash here
- ppc_aes_gcm_ghash2_4x
-
- xxlor 27+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vmr 29, 30
- vxor 15, 30, 27 # add round key
- vaddudm 30, 30, 31
- vxor 16, 30, 27
- vaddudm 30, 30, 31
- vxor 17, 30, 27
- vaddudm 30, 30, 31
- vxor 18, 30, 27
- vaddudm 30, 30, 31
- vxor 19, 30, 27
- vaddudm 30, 30, 31
- vxor 20, 30, 27
- vaddudm 30, 30, 31
- vxor 21, 30, 27
- vaddudm 30, 30, 31
- vxor 22, 30, 27
-
- addi 12, 12, -128
- addi 11, 11, 128
-
- bdnz Loop_8x_block_dec
-
- vmr 30, 29
- stxvb16x 30+32, 0, 7 # update IV
-
-Loop_last_block_dec:
- cmpdi 12, 0
- beq aes_gcm_out
-
- # loop last few blocks
- li 10, 16
- divdu 10, 12, 10
-
- mtctr 10
-
- lwz 10, 240(6)
-
- cmpdi 12, 16
- blt Final_block_dec
-
-Next_rem_block_dec:
- lxvb16x 15, 0, 14 # load block
-
- Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_next_1x_dec
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_next_1x_dec
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_next_1x_dec
-
-Do_next_1x_dec:
- vcipherlast 15, 15, 23
-
- xxlxor 47, 47, 15
- stxvb16x 47, 0, 9 # store output
- addi 14, 14, 16
- addi 9, 9, 16
-
- xxlor 28+32, 15, 15
- #vmr 28, 15
- ppc_update_hash_1x
-
- addi 12, 12, -16
- addi 11, 11, 16
- xxlor 19+32, 0, 0
- vaddudm 30, 30, 31 # IV + counter
- vxor 15, 30, 19 # add round key
-
- bdnz Next_rem_block_dec
-
- li 15, 0
- std 15, 56(7) # clear partial?
- stxvb16x 30+32, 0, 7 # update IV
- cmpdi 12, 0
- beq aes_gcm_out
-
-Final_block_dec:
- lwz 10, 240(6)
- Loop_aes_middle_1x
-
- xxlor 23+32, 10, 10
-
- cmpdi 10, 10
- beq Do_final_1x_dec
-
- # 192 bits
- xxlor 24+32, 11, 11
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 12, 12
-
- cmpdi 10, 12
- beq Do_final_1x_dec
-
- # 256 bits
- xxlor 24+32, 13, 13
-
- vcipher 15, 15, 23
- vcipher 15, 15, 24
-
- xxlor 23+32, 14, 14
-
- cmpdi 10, 14
- beq Do_final_1x_dec
-
-Do_final_1x_dec:
- vcipherlast 15, 15, 23
-
- # check partial block
- li 21, 1 # decrypt
- ld 15, 56(7) # partial?
- cmpdi 15, 0
- beq Normal_block_dec
- bl Do_partial_block
- cmpdi 12, 0
- ble aes_gcm_out
-
- b Continue_partial_check_dec
-
-Normal_block_dec:
- lxvb16x 15, 0, 14 # load last block
- xxlxor 47, 47, 15
-
- # create partial block mask
- li 15, 16
- sub 15, 15, 12 # index to the mask
-
- vspltisb 16, -1 # first 16 bytes - 0xffff...ff
- vspltisb 17, 0 # second 16 bytes - 0x0000...00
- li 10, 192
- stvx 16, 10, 1
- addi 10, 10, 16
- stvx 17, 10, 1
-
- addi 10, 1, 192
- lxvb16x 16, 15, 10 # load partial block mask
- xxland 47, 47, 16
-
- xxland 32+28, 15, 16
- #vmr 28, 15
- ppc_update_hash_1x
-
- # * should store only the remaining bytes.
- bl Write_partial_block
-
- stxvb16x 30+32, 0, 7 # update IV
- std 12, 56(7) # update partial?
- li 16, 16
-
- stxvb16x 32, 0, 8 # write out Xi
- stxvb16x 32, 16, 8 # write out Xi
- b aes_gcm_out
+SYM_DATA_START_LOCAL(PERMX)
+.align 4
+# for vector permute and xor
+permx:
+.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3
+SYM_DATA_END(permx)
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index ebae841..d8b4ab7 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -553,3 +553,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 34c0adb..742aa58 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -115,10 +115,9 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
struct iommu_table_group *table_group;
long i;
struct kvmppc_spapr_tce_iommu_table *stit;
- struct fd f;
+ CLASS(fd, f)(tablefd);
- f = fdget(tablefd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
rcu_read_lock();
@@ -130,16 +129,12 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
}
rcu_read_unlock();
- if (!found) {
- fdput(f);
+ if (!found)
return -EINVAL;
- }
table_group = iommu_group_get_iommudata(grp);
- if (WARN_ON(!table_group)) {
- fdput(f);
+ if (WARN_ON(!table_group))
return -EFAULT;
- }
for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
struct iommu_table *tbltmp = table_group->tables[i];
@@ -160,10 +155,8 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
break;
}
}
- if (!tbl) {
- fdput(f);
+ if (!tbl)
return -EINVAL;
- }
rcu_read_lock();
list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
@@ -174,7 +167,6 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
/* stit is being destroyed */
iommu_tce_table_put(tbl);
rcu_read_unlock();
- fdput(f);
return -ENOTTY;
}
/*
@@ -182,7 +174,6 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
* its KVM reference counter and can return.
*/
rcu_read_unlock();
- fdput(f);
return 0;
}
rcu_read_unlock();
@@ -190,7 +181,6 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
stit = kzalloc(sizeof(*stit), GFP_KERNEL);
if (!stit) {
iommu_tce_table_put(tbl);
- fdput(f);
return -ENOMEM;
}
@@ -199,7 +189,6 @@ long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
list_add_rcu(&stit->next, &stt->iommu_tables);
- fdput(f);
return 0;
}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index f143299..b3b37ea 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1933,12 +1933,11 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
#endif
#ifdef CONFIG_KVM_MPIC
case KVM_CAP_IRQ_MPIC: {
- struct fd f;
+ CLASS(fd, f)(cap->args[0]);
struct kvm_device *dev;
r = -EBADF;
- f = fdget(cap->args[0]);
- if (!fd_file(f))
+ if (fd_empty(f))
break;
r = -EPERM;
@@ -1946,18 +1945,16 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
if (dev)
r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
- fdput(f);
break;
}
#endif
#ifdef CONFIG_KVM_XICS
case KVM_CAP_IRQ_XICS: {
- struct fd f;
+ CLASS(fd, f)(cap->args[0]);
struct kvm_device *dev;
r = -EBADF;
- f = fdget(cap->args[0]);
- if (!fd_file(f))
+ if (fd_empty(f))
break;
r = -EPERM;
@@ -1968,34 +1965,27 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
else
r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
}
-
- fdput(f);
break;
}
#endif /* CONFIG_KVM_XICS */
#ifdef CONFIG_KVM_XIVE
case KVM_CAP_PPC_IRQ_XIVE: {
- struct fd f;
+ CLASS(fd, f)(cap->args[0]);
struct kvm_device *dev;
r = -EBADF;
- f = fdget(cap->args[0]);
- if (!fd_file(f))
+ if (fd_empty(f))
break;
r = -ENXIO;
- if (!xive_enabled()) {
- fdput(f);
+ if (!xive_enabled())
break;
- }
r = -EPERM;
dev = kvm_device_from_filp(fd_file(f));
if (dev)
r = kvmppc_xive_native_connect_vcpu(dev, vcpu,
cap->args[1]);
-
- fdput(f);
break;
}
#endif /* CONFIG_KVM_XIVE */
diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c
index cd7d42f..000894e 100644
--- a/arch/powerpc/platforms/cell/spu_syscalls.c
+++ b/arch/powerpc/platforms/cell/spu_syscalls.c
@@ -36,6 +36,9 @@ static inline struct spufs_calls *spufs_calls_get(void)
static inline void spufs_calls_put(struct spufs_calls *calls)
{
+ if (!calls)
+ return;
+
BUG_ON(calls != spufs_calls);
/* we don't need to rcu this, as we hold a reference to the module */
@@ -53,82 +56,55 @@ static inline void spufs_calls_put(struct spufs_calls *calls) { }
#endif /* CONFIG_SPU_FS_MODULE */
+DEFINE_CLASS(spufs_calls, struct spufs_calls *, spufs_calls_put(_T), spufs_calls_get(), void)
+
SYSCALL_DEFINE4(spu_create, const char __user *, name, unsigned int, flags,
umode_t, mode, int, neighbor_fd)
{
- long ret;
- struct spufs_calls *calls;
-
- calls = spufs_calls_get();
+ CLASS(spufs_calls, calls)();
if (!calls)
return -ENOSYS;
if (flags & SPU_CREATE_AFFINITY_SPU) {
- struct fd neighbor = fdget(neighbor_fd);
- ret = -EBADF;
- if (fd_file(neighbor)) {
- ret = calls->create_thread(name, flags, mode, fd_file(neighbor));
- fdput(neighbor);
- }
- } else
- ret = calls->create_thread(name, flags, mode, NULL);
-
- spufs_calls_put(calls);
- return ret;
+ CLASS(fd, neighbor)(neighbor_fd);
+ if (fd_empty(neighbor))
+ return -EBADF;
+ return calls->create_thread(name, flags, mode, fd_file(neighbor));
+ } else {
+ return calls->create_thread(name, flags, mode, NULL);
+ }
}
SYSCALL_DEFINE3(spu_run,int, fd, __u32 __user *, unpc, __u32 __user *, ustatus)
{
- long ret;
- struct fd arg;
- struct spufs_calls *calls;
-
- calls = spufs_calls_get();
+ CLASS(spufs_calls, calls)();
if (!calls)
return -ENOSYS;
- ret = -EBADF;
- arg = fdget(fd);
- if (fd_file(arg)) {
- ret = calls->spu_run(fd_file(arg), unpc, ustatus);
- fdput(arg);
- }
+ CLASS(fd, arg)(fd);
+ if (fd_empty(arg))
+ return -EBADF;
- spufs_calls_put(calls);
- return ret;
+ return calls->spu_run(fd_file(arg), unpc, ustatus);
}
#ifdef CONFIG_COREDUMP
int elf_coredump_extra_notes_size(void)
{
- struct spufs_calls *calls;
- int ret;
-
- calls = spufs_calls_get();
+ CLASS(spufs_calls, calls)();
if (!calls)
return 0;
- ret = calls->coredump_extra_notes_size();
-
- spufs_calls_put(calls);
-
- return ret;
+ return calls->coredump_extra_notes_size();
}
int elf_coredump_extra_notes_write(struct coredump_params *cprm)
{
- struct spufs_calls *calls;
- int ret;
-
- calls = spufs_calls_get();
+ CLASS(spufs_calls, calls)();
if (!calls)
return 0;
- ret = calls->coredump_extra_notes_write(cprm);
-
- spufs_calls_put(calls);
-
- return ret;
+ return calls->coredump_extra_notes_write(cprm);
}
#endif
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 18daafb..301ee7d 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -73,9 +73,7 @@ static struct spu_context *coredump_next_context(int *fd)
return NULL;
*fd = n - 1;
- rcu_read_lock();
- file = lookup_fdget_rcu(*fd);
- rcu_read_unlock();
+ file = fget_raw(*fd);
if (file) {
ctx = SPUFS_I(file_inode(file))->i_ctx;
get_spu_context(ctx);
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index f4c5705..fa8f2da 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -898,6 +898,9 @@
config ARCH_SUPPORTS_CRASH_DUMP
def_bool y
+config ARCH_DEFAULT_CRASH_DUMP
+ def_bool y
+
config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
def_bool CRASH_RESERVE
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d339fe4f..a45259d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -52,6 +52,13 @@
depends on KASAN
default 0x1C000000000000
+config GCC_ASM_FLAG_OUTPUT_BROKEN
+ def_bool CC_IS_GCC && GCC_VERSION < 140200
+ help
+ GCC versions before 14.2.0 may die with an internal
+ compiler error in some configurations if flag output
+ operands are used within inline assemblies.
+
config S390
def_bool y
#
@@ -224,6 +231,7 @@
select HAVE_VIRT_CPU_ACCOUNTING_IDLE
select IOMMU_HELPER if PCI
select IOMMU_SUPPORT if PCI
+ select LOCK_MM_AND_FIND_VMA
select MMU_GATHER_MERGE_VMAS
select MMU_GATHER_NO_GATHER
select MMU_GATHER_RCU_TABLE_FREE
@@ -276,6 +284,9 @@
This option also enables s390 zfcpdump.
See also <file:Documentation/arch/s390/zfcpdump.rst>
+config ARCH_DEFAULT_CRASH_DUMP
+ def_bool y
+
menu "Processor type and features"
config HAVE_MARCH_Z10_FEATURES
diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c
index 1d131a8..7617aa2 100644
--- a/arch/s390/boot/physmem_info.c
+++ b/arch/s390/boot/physmem_info.c
@@ -9,6 +9,7 @@
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/sclp.h>
+#include <asm/asm.h>
#include <asm/uv.h>
#include "decompressor.h"
#include "boot.h"
@@ -59,13 +60,13 @@ static int __diag260(unsigned long rx1, unsigned long rx2)
{
unsigned long reg1, reg2, ry;
union register_pair rx;
+ int cc, exception;
psw_t old;
- int rc;
rx.even = rx1;
rx.odd = rx2;
ry = 0x10; /* storage configuration */
- rc = -1; /* fail */
+ exception = 1;
asm volatile(
" mvc 0(16,%[psw_old]),0(%[psw_pgm])\n"
" epsw %[reg1],%[reg2]\n"
@@ -74,20 +75,22 @@ static int __diag260(unsigned long rx1, unsigned long rx2)
" larl %[reg1],1f\n"
" stg %[reg1],8(%[psw_pgm])\n"
" diag %[rx],%[ry],0x260\n"
- " ipm %[rc]\n"
- " srl %[rc],28\n"
+ " lhi %[exc],0\n"
"1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
- : [reg1] "=&d" (reg1),
+ CC_IPM(cc)
+ : CC_OUT(cc, cc),
+ [exc] "+d" (exception),
+ [reg1] "=&d" (reg1),
[reg2] "=&a" (reg2),
- [rc] "+&d" (rc),
[ry] "+&d" (ry),
"+Q" (get_lowcore()->program_new_psw),
"=Q" (old)
: [rx] "d" (rx.pair),
[psw_old] "a" (&old),
[psw_pgm] "a" (&get_lowcore()->program_new_psw)
- : "cc", "memory");
- return rc == 0 ? ry : -1;
+ : CC_CLOBBER_LIST("memory"));
+ cc = exception ? -1 : CC_TRANSFORM(cc);
+ return cc == 0 ? ry : -1;
}
static int diag260(void)
@@ -109,10 +112,12 @@ static int diag260(void)
return 0;
}
-static int tprot(unsigned long addr)
+#define DIAG500_SC_STOR_LIMIT 4
+
+static int diag500_storage_limit(unsigned long *max_physmem_end)
{
+ unsigned long storage_limit;
unsigned long reg1, reg2;
- int rc = -EFAULT;
psw_t old;
asm volatile(
@@ -122,20 +127,57 @@ static int tprot(unsigned long addr)
" st %[reg2],4(%[psw_pgm])\n"
" larl %[reg1],1f\n"
" stg %[reg1],8(%[psw_pgm])\n"
- " tprot 0(%[addr]),0\n"
- " ipm %[rc]\n"
- " srl %[rc],28\n"
+ " lghi 1,%[subcode]\n"
+ " lghi 2,0\n"
+ " diag 2,4,0x500\n"
"1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
+ " lgr %[slimit],2\n"
: [reg1] "=&d" (reg1),
[reg2] "=&a" (reg2),
- [rc] "+&d" (rc),
+ [slimit] "=d" (storage_limit),
+ "=Q" (get_lowcore()->program_new_psw),
+ "=Q" (old)
+ : [psw_old] "a" (&old),
+ [psw_pgm] "a" (&get_lowcore()->program_new_psw),
+ [subcode] "i" (DIAG500_SC_STOR_LIMIT)
+ : "memory", "1", "2");
+ if (!storage_limit)
+ return -EINVAL;
+ /* Convert inclusive end to exclusive end */
+ *max_physmem_end = storage_limit + 1;
+ return 0;
+}
+
+static int tprot(unsigned long addr)
+{
+ unsigned long reg1, reg2;
+ int cc, exception;
+ psw_t old;
+
+ exception = 1;
+ asm volatile(
+ " mvc 0(16,%[psw_old]),0(%[psw_pgm])\n"
+ " epsw %[reg1],%[reg2]\n"
+ " st %[reg1],0(%[psw_pgm])\n"
+ " st %[reg2],4(%[psw_pgm])\n"
+ " larl %[reg1],1f\n"
+ " stg %[reg1],8(%[psw_pgm])\n"
+ " tprot 0(%[addr]),0\n"
+ " lhi %[exc],0\n"
+ "1: mvc 0(16,%[psw_pgm]),0(%[psw_old])\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc),
+ [exc] "+d" (exception),
+ [reg1] "=&d" (reg1),
+ [reg2] "=&a" (reg2),
"=Q" (get_lowcore()->program_new_psw.addr),
"=Q" (old)
: [psw_old] "a" (&old),
[psw_pgm] "a" (&get_lowcore()->program_new_psw),
[addr] "a" (addr)
- : "cc", "memory");
- return rc;
+ : CC_CLOBBER_LIST("memory"));
+ cc = exception ? -EFAULT : CC_TRANSFORM(cc);
+ return cc;
}
static unsigned long search_mem_end(void)
@@ -157,7 +199,9 @@ unsigned long detect_max_physmem_end(void)
{
unsigned long max_physmem_end = 0;
- if (!sclp_early_get_memsize(&max_physmem_end)) {
+ if (!diag500_storage_limit(&max_physmem_end)) {
+ physmem_info.info_source = MEM_DETECT_DIAG500_STOR_LIMIT;
+ } else if (!sclp_early_get_memsize(&max_physmem_end)) {
physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
} else {
max_physmem_end = search_mem_end();
@@ -170,6 +214,13 @@ void detect_physmem_online_ranges(unsigned long max_physmem_end)
{
if (!sclp_early_read_storage_info()) {
physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO;
+ } else if (physmem_info.info_source == MEM_DETECT_DIAG500_STOR_LIMIT) {
+ unsigned long online_end;
+
+ if (!sclp_early_get_memsize(&online_end)) {
+ physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
+ add_physmem_online_range(0, online_end);
+ }
} else if (!diag260()) {
physmem_info.info_source = MEM_DETECT_DIAG260;
} else if (max_physmem_end) {
diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c
index c8f149a..abe6e6c 100644
--- a/arch/s390/boot/startup.c
+++ b/arch/s390/boot/startup.c
@@ -182,12 +182,15 @@ static void kaslr_adjust_got(unsigned long offset)
* Merge information from several sources into a single ident_map_size value.
* "ident_map_size" represents the upper limit of physical memory we may ever
* reach. It might not be all online memory, but also include standby (offline)
- * memory. "ident_map_size" could be lower then actual standby or even online
+ * memory or memory areas reserved for other means (e.g., memory devices such as
+ * virtio-mem).
+ *
+ * "ident_map_size" could be lower then actual standby/reserved or even online
* memory present, due to limiting factors. We should never go above this limit.
* It is the size of our identity mapping.
*
* Consider the following factors:
- * 1. max_physmem_end - end of physical memory online or standby.
+ * 1. max_physmem_end - end of physical memory online, standby or reserved.
* Always >= end of the last online memory range (get_physmem_online_end()).
* 2. CONFIG_MAX_PHYSMEM_BITS - the maximum size of physical memory the
* kernel is able to support.
@@ -480,7 +483,7 @@ void startup_kernel(void)
* __vmlinux_relocs_64_end as the lower range address. However,
* .amode31 section is written to by the decompressed kernel - at
* that time the contents of .vmlinux.relocs is not needed anymore.
- * Conversly, .vmlinux.relocs is read only by the decompressor, even
+ * Conversely, .vmlinux.relocs is read only by the decompressor, even
* before the kernel started. Therefore, in case the two sections
* overlap there is no risk of corrupting any data.
*/
diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c
index 318e6ba..4568e8f 100644
--- a/arch/s390/boot/uv.c
+++ b/arch/s390/boot/uv.c
@@ -22,8 +22,8 @@ void uv_query_info(void)
if (!test_facility(158))
return;
- /* rc==0x100 means that there is additional data we do not process */
- if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100)
+ /* Ignore that there might be more data we do not process */
+ if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != UVC_RC_MORE_DATA)
return;
if (IS_ENABLED(CONFIG_KVM)) {
@@ -46,7 +46,8 @@ void uv_query_info(void)
uv_info.supp_add_secret_req_ver = uvcb.supp_add_secret_req_ver;
uv_info.supp_add_secret_pcf = uvcb.supp_add_secret_pcf;
uv_info.supp_secret_types = uvcb.supp_secret_types;
- uv_info.max_secrets = uvcb.max_secrets;
+ uv_info.max_assoc_secrets = uvcb.max_assoc_secrets;
+ uv_info.max_retr_secrets = uvcb.max_retr_secrets;
}
if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) &&
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index fb0e9a1..d8d227a 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -625,6 +625,7 @@
CONFIG_MLX5_VFIO_PCI=m
CONFIG_VIRTIO_PCI=m
CONFIG_VIRTIO_BALLOON=m
+CONFIG_VIRTIO_MEM=m
CONFIG_VIRTIO_INPUT=y
CONFIG_VHOST_NET=m
CONFIG_VHOST_VSOCK=m
@@ -810,6 +811,7 @@
CONFIG_PKEY_CCA=m
CONFIG_PKEY_EP11=m
CONFIG_PKEY_PCKMO=m
+CONFIG_PKEY_UV=m
CONFIG_CRYPTO_PAES_S390=m
CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_SYSTEM_BLACKLIST_KEYRING=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 88be0a7..6c2f2bb 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -615,6 +615,7 @@
CONFIG_MLX5_VFIO_PCI=m
CONFIG_VIRTIO_PCI=m
CONFIG_VIRTIO_BALLOON=m
+CONFIG_VIRTIO_MEM=m
CONFIG_VIRTIO_INPUT=y
CONFIG_VHOST_NET=m
CONFIG_VHOST_VSOCK=m
@@ -797,6 +798,7 @@
CONFIG_PKEY_CCA=m
CONFIG_PKEY_EP11=m
CONFIG_PKEY_PCKMO=m
+CONFIG_PKEY_UV=m
CONFIG_CRYPTO_PAES_S390=m
CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_SYSTEM_BLACKLIST_KEYRING=y
diff --git a/arch/s390/crypto/paes_s390.c b/arch/s390/crypto/paes_s390.c
index ef4491c..5110937 100644
--- a/arch/s390/crypto/paes_s390.c
+++ b/arch/s390/crypto/paes_s390.c
@@ -34,14 +34,22 @@
* is called. As paes can handle different kinds of key blobs
* and padding is also possible, the limits need to be generous.
*/
-#define PAES_MIN_KEYSIZE 16
-#define PAES_MAX_KEYSIZE MAXEP11AESKEYBLOBSIZE
+#define PAES_MIN_KEYSIZE 16
+#define PAES_MAX_KEYSIZE MAXEP11AESKEYBLOBSIZE
+#define PAES_256_PROTKEY_SIZE (32 + 32) /* key + verification pattern */
+#define PXTS_256_PROTKEY_SIZE (32 + 32 + 32) /* k1 + k2 + verification pattern */
static u8 *ctrblk;
static DEFINE_MUTEX(ctrblk_lock);
static cpacf_mask_t km_functions, kmc_functions, kmctr_functions;
+struct paes_protkey {
+ u32 type;
+ u32 len;
+ u8 protkey[PXTS_256_PROTKEY_SIZE];
+};
+
struct key_blob {
/*
* Small keys will be stored in the keybuf. Larger keys are
@@ -55,31 +63,43 @@ struct key_blob {
unsigned int keylen;
};
-static inline int _key_to_kb(struct key_blob *kb,
- const u8 *key,
- unsigned int keylen)
+/*
+ * make_clrkey_token() - wrap the raw key ck with pkey clearkey token
+ * information.
+ * @returns the size of the clearkey token
+ */
+static inline u32 make_clrkey_token(const u8 *ck, size_t cklen, u8 *dest)
{
- struct clearkey_header {
+ struct clrkey_token {
u8 type;
u8 res0[3];
u8 version;
u8 res1[3];
u32 keytype;
u32 len;
- } __packed * h;
+ u8 key[];
+ } __packed *token = (struct clrkey_token *)dest;
+ token->type = 0x00;
+ token->version = 0x02;
+ token->keytype = (cklen - 8) >> 3;
+ token->len = cklen;
+ memcpy(token->key, ck, cklen);
+
+ return sizeof(*token) + cklen;
+}
+
+static inline int _key_to_kb(struct key_blob *kb,
+ const u8 *key,
+ unsigned int keylen)
+{
switch (keylen) {
case 16:
case 24:
case 32:
/* clear key value, prepare pkey clear key token in keybuf */
memset(kb->keybuf, 0, sizeof(kb->keybuf));
- h = (struct clearkey_header *) kb->keybuf;
- h->version = 0x02; /* TOKVER_CLEAR_KEY */
- h->keytype = (keylen - 8) >> 3;
- h->len = keylen;
- memcpy(kb->keybuf + sizeof(*h), key, keylen);
- kb->keylen = sizeof(*h) + keylen;
+ kb->keylen = make_clrkey_token(key, keylen, kb->keybuf);
kb->key = kb->keybuf;
break;
default:
@@ -99,6 +119,40 @@ static inline int _key_to_kb(struct key_blob *kb,
return 0;
}
+static inline int _xts_key_to_kb(struct key_blob *kb,
+ const u8 *key,
+ unsigned int keylen)
+{
+ size_t cklen = keylen / 2;
+
+ memset(kb->keybuf, 0, sizeof(kb->keybuf));
+
+ switch (keylen) {
+ case 32:
+ case 64:
+ /* clear key value, prepare pkey clear key tokens in keybuf */
+ kb->key = kb->keybuf;
+ kb->keylen = make_clrkey_token(key, cklen, kb->key);
+ kb->keylen += make_clrkey_token(key + cklen, cklen,
+ kb->key + kb->keylen);
+ break;
+ default:
+ /* other key material, let pkey handle this */
+ if (keylen <= sizeof(kb->keybuf)) {
+ kb->key = kb->keybuf;
+ } else {
+ kb->key = kmalloc(keylen, GFP_KERNEL);
+ if (!kb->key)
+ return -ENOMEM;
+ }
+ memcpy(kb->key, key, keylen);
+ kb->keylen = keylen;
+ break;
+ }
+
+ return 0;
+}
+
static inline void _free_kb_keybuf(struct key_blob *kb)
{
if (kb->key && kb->key != kb->keybuf
@@ -106,52 +160,53 @@ static inline void _free_kb_keybuf(struct key_blob *kb)
kfree_sensitive(kb->key);
kb->key = NULL;
}
+ memzero_explicit(kb->keybuf, sizeof(kb->keybuf));
}
struct s390_paes_ctx {
struct key_blob kb;
- struct pkey_protkey pk;
+ struct paes_protkey pk;
spinlock_t pk_lock;
unsigned long fc;
};
struct s390_pxts_ctx {
- struct key_blob kb[2];
- struct pkey_protkey pk[2];
+ struct key_blob kb;
+ struct paes_protkey pk[2];
spinlock_t pk_lock;
unsigned long fc;
};
-static inline int __paes_keyblob2pkey(struct key_blob *kb,
- struct pkey_protkey *pk)
+static inline int __paes_keyblob2pkey(const u8 *key, unsigned int keylen,
+ struct paes_protkey *pk)
{
- int i, ret = -EIO;
+ int i, rc = -EIO;
/* try three times in case of busy card */
- for (i = 0; ret && i < 3; i++) {
- if (ret == -EBUSY && in_task()) {
+ for (i = 0; rc && i < 3; i++) {
+ if (rc == -EBUSY && in_task()) {
if (msleep_interruptible(1000))
return -EINTR;
}
- ret = pkey_key2protkey(kb->key, kb->keylen,
- pk->protkey, &pk->len, &pk->type);
+ rc = pkey_key2protkey(key, keylen, pk->protkey, &pk->len,
+ &pk->type);
}
- return ret;
+ return rc;
}
static inline int __paes_convert_key(struct s390_paes_ctx *ctx)
{
- int ret;
- struct pkey_protkey pkey;
+ struct paes_protkey pk;
+ int rc;
- pkey.len = sizeof(pkey.protkey);
- ret = __paes_keyblob2pkey(&ctx->kb, &pkey);
- if (ret)
- return ret;
+ pk.len = sizeof(pk.protkey);
+ rc = __paes_keyblob2pkey(ctx->kb.key, ctx->kb.keylen, &pk);
+ if (rc)
+ return rc;
spin_lock_bh(&ctx->pk_lock);
- memcpy(&ctx->pk, &pkey, sizeof(pkey));
+ memcpy(&ctx->pk, &pk, sizeof(pk));
spin_unlock_bh(&ctx->pk_lock);
return 0;
@@ -176,8 +231,8 @@ static void ecb_paes_exit(struct crypto_skcipher *tfm)
static inline int __ecb_paes_set_key(struct s390_paes_ctx *ctx)
{
- int rc;
unsigned long fc;
+ int rc;
rc = __paes_convert_key(ctx);
if (rc)
@@ -197,8 +252,8 @@ static inline int __ecb_paes_set_key(struct s390_paes_ctx *ctx)
static int ecb_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
- int rc;
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int rc;
_free_kb_keybuf(&ctx->kb);
rc = _key_to_kb(&ctx->kb, in_key, key_len);
@@ -212,19 +267,19 @@ static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct {
+ u8 key[PAES_256_PROTKEY_SIZE];
+ } param;
struct skcipher_walk walk;
unsigned int nbytes, n, k;
- int ret;
- struct {
- u8 key[MAXPROTKEYSIZE];
- } param;
+ int rc;
- ret = skcipher_walk_virt(&walk, req, false);
- if (ret)
- return ret;
+ rc = skcipher_walk_virt(&walk, req, false);
+ if (rc)
+ return rc;
spin_lock_bh(&ctx->pk_lock);
- memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
spin_unlock_bh(&ctx->pk_lock);
while ((nbytes = walk.nbytes) != 0) {
@@ -233,16 +288,16 @@ static int ecb_paes_crypt(struct skcipher_request *req, unsigned long modifier)
k = cpacf_km(ctx->fc | modifier, ¶m,
walk.dst.virt.addr, walk.src.virt.addr, n);
if (k)
- ret = skcipher_walk_done(&walk, nbytes - k);
+ rc = skcipher_walk_done(&walk, nbytes - k);
if (k < n) {
if (__paes_convert_key(ctx))
return skcipher_walk_done(&walk, -EIO);
spin_lock_bh(&ctx->pk_lock);
- memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
spin_unlock_bh(&ctx->pk_lock);
}
}
- return ret;
+ return rc;
}
static int ecb_paes_encrypt(struct skcipher_request *req)
@@ -291,8 +346,8 @@ static void cbc_paes_exit(struct crypto_skcipher *tfm)
static inline int __cbc_paes_set_key(struct s390_paes_ctx *ctx)
{
- int rc;
unsigned long fc;
+ int rc;
rc = __paes_convert_key(ctx);
if (rc)
@@ -312,8 +367,8 @@ static inline int __cbc_paes_set_key(struct s390_paes_ctx *ctx)
static int cbc_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
- int rc;
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int rc;
_free_kb_keybuf(&ctx->kb);
rc = _key_to_kb(&ctx->kb, in_key, key_len);
@@ -327,21 +382,21 @@ static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- unsigned int nbytes, n, k;
- int ret;
struct {
u8 iv[AES_BLOCK_SIZE];
- u8 key[MAXPROTKEYSIZE];
+ u8 key[PAES_256_PROTKEY_SIZE];
} param;
+ struct skcipher_walk walk;
+ unsigned int nbytes, n, k;
+ int rc;
- ret = skcipher_walk_virt(&walk, req, false);
- if (ret)
- return ret;
+ rc = skcipher_walk_virt(&walk, req, false);
+ if (rc)
+ return rc;
memcpy(param.iv, walk.iv, AES_BLOCK_SIZE);
spin_lock_bh(&ctx->pk_lock);
- memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
spin_unlock_bh(&ctx->pk_lock);
while ((nbytes = walk.nbytes) != 0) {
@@ -351,17 +406,17 @@ static int cbc_paes_crypt(struct skcipher_request *req, unsigned long modifier)
walk.dst.virt.addr, walk.src.virt.addr, n);
if (k) {
memcpy(walk.iv, param.iv, AES_BLOCK_SIZE);
- ret = skcipher_walk_done(&walk, nbytes - k);
+ rc = skcipher_walk_done(&walk, nbytes - k);
}
if (k < n) {
if (__paes_convert_key(ctx))
return skcipher_walk_done(&walk, -EIO);
spin_lock_bh(&ctx->pk_lock);
- memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
spin_unlock_bh(&ctx->pk_lock);
}
}
- return ret;
+ return rc;
}
static int cbc_paes_encrypt(struct skcipher_request *req)
@@ -396,8 +451,7 @@ static int xts_paes_init(struct crypto_skcipher *tfm)
{
struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
- ctx->kb[0].key = NULL;
- ctx->kb[1].key = NULL;
+ ctx->kb.key = NULL;
spin_lock_init(&ctx->pk_lock);
return 0;
@@ -407,24 +461,51 @@ static void xts_paes_exit(struct crypto_skcipher *tfm)
{
struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
- _free_kb_keybuf(&ctx->kb[0]);
- _free_kb_keybuf(&ctx->kb[1]);
+ _free_kb_keybuf(&ctx->kb);
}
static inline int __xts_paes_convert_key(struct s390_pxts_ctx *ctx)
{
- struct pkey_protkey pkey0, pkey1;
+ struct paes_protkey pk0, pk1;
+ size_t split_keylen;
+ int rc;
- pkey0.len = sizeof(pkey0.protkey);
- pkey1.len = sizeof(pkey1.protkey);
+ pk0.len = sizeof(pk0.protkey);
+ pk1.len = sizeof(pk1.protkey);
- if (__paes_keyblob2pkey(&ctx->kb[0], &pkey0) ||
- __paes_keyblob2pkey(&ctx->kb[1], &pkey1))
+ rc = __paes_keyblob2pkey(ctx->kb.key, ctx->kb.keylen, &pk0);
+ if (rc)
+ return rc;
+
+ switch (pk0.type) {
+ case PKEY_KEYTYPE_AES_128:
+ case PKEY_KEYTYPE_AES_256:
+ /* second keytoken required */
+ if (ctx->kb.keylen % 2)
+ return -EINVAL;
+ split_keylen = ctx->kb.keylen / 2;
+
+ rc = __paes_keyblob2pkey(ctx->kb.key + split_keylen,
+ split_keylen, &pk1);
+ if (rc)
+ return rc;
+
+ if (pk0.type != pk1.type)
+ return -EINVAL;
+ break;
+ case PKEY_KEYTYPE_AES_XTS_128:
+ case PKEY_KEYTYPE_AES_XTS_256:
+ /* single key */
+ pk1.type = 0;
+ break;
+ default:
+ /* unsupported protected keytype */
return -EINVAL;
+ }
spin_lock_bh(&ctx->pk_lock);
- memcpy(&ctx->pk[0], &pkey0, sizeof(pkey0));
- memcpy(&ctx->pk[1], &pkey1, sizeof(pkey1));
+ ctx->pk[0] = pk0;
+ ctx->pk[1] = pk1;
spin_unlock_bh(&ctx->pk_lock);
return 0;
@@ -433,17 +514,30 @@ static inline int __xts_paes_convert_key(struct s390_pxts_ctx *ctx)
static inline int __xts_paes_set_key(struct s390_pxts_ctx *ctx)
{
unsigned long fc;
+ int rc;
- if (__xts_paes_convert_key(ctx))
- return -EINVAL;
-
- if (ctx->pk[0].type != ctx->pk[1].type)
- return -EINVAL;
+ rc = __xts_paes_convert_key(ctx);
+ if (rc)
+ return rc;
/* Pick the correct function code based on the protected key type */
- fc = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? CPACF_KM_PXTS_128 :
- (ctx->pk[0].type == PKEY_KEYTYPE_AES_256) ?
- CPACF_KM_PXTS_256 : 0;
+ switch (ctx->pk[0].type) {
+ case PKEY_KEYTYPE_AES_128:
+ fc = CPACF_KM_PXTS_128;
+ break;
+ case PKEY_KEYTYPE_AES_256:
+ fc = CPACF_KM_PXTS_256;
+ break;
+ case PKEY_KEYTYPE_AES_XTS_128:
+ fc = CPACF_KM_PXTS_128_FULL;
+ break;
+ case PKEY_KEYTYPE_AES_XTS_256:
+ fc = CPACF_KM_PXTS_256_FULL;
+ break;
+ default:
+ fc = 0;
+ break;
+ }
/* Check if the function code is available */
ctx->fc = (fc && cpacf_test_func(&km_functions, fc)) ? fc : 0;
@@ -452,24 +546,19 @@ static inline int __xts_paes_set_key(struct s390_pxts_ctx *ctx)
}
static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
- unsigned int xts_key_len)
+ unsigned int in_keylen)
{
- int rc;
struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
u8 ckey[2 * AES_MAX_KEY_SIZE];
- unsigned int ckey_len, key_len;
+ unsigned int ckey_len;
+ int rc;
- if (xts_key_len % 2)
+ if ((in_keylen == 32 || in_keylen == 64) &&
+ xts_verify_key(tfm, in_key, in_keylen))
return -EINVAL;
- key_len = xts_key_len / 2;
-
- _free_kb_keybuf(&ctx->kb[0]);
- _free_kb_keybuf(&ctx->kb[1]);
- rc = _key_to_kb(&ctx->kb[0], in_key, key_len);
- if (rc)
- return rc;
- rc = _key_to_kb(&ctx->kb[1], in_key + key_len, key_len);
+ _free_kb_keybuf(&ctx->kb);
+ rc = _xts_key_to_kb(&ctx->kb, in_key, in_keylen);
if (rc)
return rc;
@@ -478,6 +567,13 @@ static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
return rc;
/*
+ * It is not possible on a single protected key (e.g. full AES-XTS) to
+ * check, if k1 and k2 are the same.
+ */
+ if (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128 ||
+ ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_256)
+ return 0;
+ /*
* xts_verify_key verifies the key length is not odd and makes
* sure that the two keys are not the same. This can be done
* on the two protected keys as well
@@ -489,28 +585,82 @@ static int xts_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
return xts_verify_key(tfm, ckey, 2*ckey_len);
}
-static int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier)
+static int paes_xts_crypt_full(struct skcipher_request *req,
+ unsigned long modifier)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
unsigned int keylen, offset, nbytes, n, k;
- int ret;
struct {
- u8 key[MAXPROTKEYSIZE]; /* key + verification pattern */
+ u8 key[64];
+ u8 tweak[16];
+ u8 nap[16];
+ u8 wkvp[32];
+ } fxts_param = {
+ .nap = {0},
+ };
+ struct skcipher_walk walk;
+ int rc;
+
+ rc = skcipher_walk_virt(&walk, req, false);
+ if (rc)
+ return rc;
+
+ keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128) ? 32 : 64;
+ offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_XTS_128) ? 32 : 0;
+
+ spin_lock_bh(&ctx->pk_lock);
+ memcpy(fxts_param.key + offset, ctx->pk[0].protkey, keylen);
+ memcpy(fxts_param.wkvp, ctx->pk[0].protkey + keylen,
+ sizeof(fxts_param.wkvp));
+ spin_unlock_bh(&ctx->pk_lock);
+ memcpy(fxts_param.tweak, walk.iv, sizeof(fxts_param.tweak));
+ fxts_param.nap[0] = 0x01; /* initial alpha power (1, little-endian) */
+
+ while ((nbytes = walk.nbytes) != 0) {
+ /* only use complete blocks */
+ n = nbytes & ~(AES_BLOCK_SIZE - 1);
+ k = cpacf_km(ctx->fc | modifier, fxts_param.key + offset,
+ walk.dst.virt.addr, walk.src.virt.addr, n);
+ if (k)
+ rc = skcipher_walk_done(&walk, nbytes - k);
+ if (k < n) {
+ if (__xts_paes_convert_key(ctx))
+ return skcipher_walk_done(&walk, -EIO);
+ spin_lock_bh(&ctx->pk_lock);
+ memcpy(fxts_param.key + offset, ctx->pk[0].protkey,
+ keylen);
+ memcpy(fxts_param.wkvp, ctx->pk[0].protkey + keylen,
+ sizeof(fxts_param.wkvp));
+ spin_unlock_bh(&ctx->pk_lock);
+ }
+ }
+
+ return rc;
+}
+
+static int paes_xts_crypt(struct skcipher_request *req, unsigned long modifier)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
+ unsigned int keylen, offset, nbytes, n, k;
+ struct {
+ u8 key[PAES_256_PROTKEY_SIZE];
u8 tweak[16];
u8 block[16];
u8 bit[16];
u8 xts[16];
} pcc_param;
struct {
- u8 key[MAXPROTKEYSIZE]; /* key + verification pattern */
+ u8 key[PAES_256_PROTKEY_SIZE];
u8 init[16];
} xts_param;
+ struct skcipher_walk walk;
+ int rc;
- ret = skcipher_walk_virt(&walk, req, false);
- if (ret)
- return ret;
+ rc = skcipher_walk_virt(&walk, req, false);
+ if (rc)
+ return rc;
keylen = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 48 : 64;
offset = (ctx->pk[0].type == PKEY_KEYTYPE_AES_128) ? 16 : 0;
@@ -530,7 +680,7 @@ static int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier)
k = cpacf_km(ctx->fc | modifier, xts_param.key + offset,
walk.dst.virt.addr, walk.src.virt.addr, n);
if (k)
- ret = skcipher_walk_done(&walk, nbytes - k);
+ rc = skcipher_walk_done(&walk, nbytes - k);
if (k < n) {
if (__xts_paes_convert_key(ctx))
return skcipher_walk_done(&walk, -EIO);
@@ -541,7 +691,24 @@ static int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier)
}
}
- return ret;
+ return rc;
+}
+
+static inline int xts_paes_crypt(struct skcipher_request *req, unsigned long modifier)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct s390_pxts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ switch (ctx->fc) {
+ case CPACF_KM_PXTS_128:
+ case CPACF_KM_PXTS_256:
+ return paes_xts_crypt(req, modifier);
+ case CPACF_KM_PXTS_128_FULL:
+ case CPACF_KM_PXTS_256_FULL:
+ return paes_xts_crypt_full(req, modifier);
+ default:
+ return -EINVAL;
+ }
}
static int xts_paes_encrypt(struct skcipher_request *req)
@@ -591,8 +758,8 @@ static void ctr_paes_exit(struct crypto_skcipher *tfm)
static inline int __ctr_paes_set_key(struct s390_paes_ctx *ctx)
{
- int rc;
unsigned long fc;
+ int rc;
rc = __paes_convert_key(ctx);
if (rc)
@@ -613,8 +780,8 @@ static inline int __ctr_paes_set_key(struct s390_paes_ctx *ctx)
static int ctr_paes_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
- int rc;
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
+ int rc;
_free_kb_keybuf(&ctx->kb);
rc = _key_to_kb(&ctx->kb, in_key, key_len);
@@ -644,19 +811,19 @@ static int ctr_paes_crypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct s390_paes_ctx *ctx = crypto_skcipher_ctx(tfm);
u8 buf[AES_BLOCK_SIZE], *ctrptr;
+ struct {
+ u8 key[PAES_256_PROTKEY_SIZE];
+ } param;
struct skcipher_walk walk;
unsigned int nbytes, n, k;
- int ret, locked;
- struct {
- u8 key[MAXPROTKEYSIZE];
- } param;
+ int rc, locked;
- ret = skcipher_walk_virt(&walk, req, false);
- if (ret)
- return ret;
+ rc = skcipher_walk_virt(&walk, req, false);
+ if (rc)
+ return rc;
spin_lock_bh(&ctx->pk_lock);
- memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
spin_unlock_bh(&ctx->pk_lock);
locked = mutex_trylock(&ctrblk_lock);
@@ -673,7 +840,7 @@ static int ctr_paes_crypt(struct skcipher_request *req)
memcpy(walk.iv, ctrptr + k - AES_BLOCK_SIZE,
AES_BLOCK_SIZE);
crypto_inc(walk.iv, AES_BLOCK_SIZE);
- ret = skcipher_walk_done(&walk, nbytes - k);
+ rc = skcipher_walk_done(&walk, nbytes - k);
}
if (k < n) {
if (__paes_convert_key(ctx)) {
@@ -682,7 +849,7 @@ static int ctr_paes_crypt(struct skcipher_request *req)
return skcipher_walk_done(&walk, -EIO);
}
spin_lock_bh(&ctx->pk_lock);
- memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
spin_unlock_bh(&ctx->pk_lock);
}
}
@@ -702,15 +869,15 @@ static int ctr_paes_crypt(struct skcipher_request *req)
if (__paes_convert_key(ctx))
return skcipher_walk_done(&walk, -EIO);
spin_lock_bh(&ctx->pk_lock);
- memcpy(param.key, ctx->pk.protkey, MAXPROTKEYSIZE);
+ memcpy(param.key, ctx->pk.protkey, PAES_256_PROTKEY_SIZE);
spin_unlock_bh(&ctx->pk_lock);
}
memcpy(walk.dst.virt.addr, buf, nbytes);
crypto_inc(walk.iv, AES_BLOCK_SIZE);
- ret = skcipher_walk_done(&walk, nbytes);
+ rc = skcipher_walk_done(&walk, nbytes);
}
- return ret;
+ return rc;
}
static struct skcipher_alg ctr_paes_alg = {
@@ -750,7 +917,7 @@ static void paes_s390_fini(void)
static int __init paes_s390_init(void)
{
- int ret;
+ int rc;
/* Query available functions for KM, KMC and KMCTR */
cpacf_query(CPACF_KM, &km_functions);
@@ -760,23 +927,23 @@ static int __init paes_s390_init(void)
if (cpacf_test_func(&km_functions, CPACF_KM_PAES_128) ||
cpacf_test_func(&km_functions, CPACF_KM_PAES_192) ||
cpacf_test_func(&km_functions, CPACF_KM_PAES_256)) {
- ret = crypto_register_skcipher(&ecb_paes_alg);
- if (ret)
+ rc = crypto_register_skcipher(&ecb_paes_alg);
+ if (rc)
goto out_err;
}
if (cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_128) ||
cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_192) ||
cpacf_test_func(&kmc_functions, CPACF_KMC_PAES_256)) {
- ret = crypto_register_skcipher(&cbc_paes_alg);
- if (ret)
+ rc = crypto_register_skcipher(&cbc_paes_alg);
+ if (rc)
goto out_err;
}
if (cpacf_test_func(&km_functions, CPACF_KM_PXTS_128) ||
cpacf_test_func(&km_functions, CPACF_KM_PXTS_256)) {
- ret = crypto_register_skcipher(&xts_paes_alg);
- if (ret)
+ rc = crypto_register_skcipher(&xts_paes_alg);
+ if (rc)
goto out_err;
}
@@ -785,18 +952,18 @@ static int __init paes_s390_init(void)
cpacf_test_func(&kmctr_functions, CPACF_KMCTR_PAES_256)) {
ctrblk = (u8 *) __get_free_page(GFP_KERNEL);
if (!ctrblk) {
- ret = -ENOMEM;
+ rc = -ENOMEM;
goto out_err;
}
- ret = crypto_register_skcipher(&ctr_paes_alg);
- if (ret)
+ rc = crypto_register_skcipher(&ctr_paes_alg);
+ if (rc)
goto out_err;
}
return 0;
out_err:
paes_s390_fini();
- return ret;
+ return rc;
}
module_init(paes_s390_init);
diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c
index a077087..2becd77d 100644
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -679,7 +679,7 @@ static ssize_t prng_chunksize_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "%u\n", prng_chunk_size);
+ return sysfs_emit(buf, "%u\n", prng_chunk_size);
}
static DEVICE_ATTR(chunksize, 0444, prng_chunksize_show, NULL);
@@ -698,7 +698,7 @@ static ssize_t prng_counter_show(struct device *dev,
counter = prng_data->prngws.byte_counter;
mutex_unlock(&prng_data->mutex);
- return scnprintf(buf, PAGE_SIZE, "%llu\n", counter);
+ return sysfs_emit(buf, "%llu\n", counter);
}
static DEVICE_ATTR(byte_counter, 0444, prng_counter_show, NULL);
@@ -707,7 +707,7 @@ static ssize_t prng_errorflag_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "%d\n", prng_errorflag);
+ return sysfs_emit(buf, "%d\n", prng_errorflag);
}
static DEVICE_ATTR(errorflag, 0444, prng_errorflag_show, NULL);
@@ -717,9 +717,9 @@ static ssize_t prng_mode_show(struct device *dev,
char *buf)
{
if (prng_mode == PRNG_MODE_TDES)
- return scnprintf(buf, PAGE_SIZE, "TDES\n");
+ return sysfs_emit(buf, "TDES\n");
else
- return scnprintf(buf, PAGE_SIZE, "SHA512\n");
+ return sysfs_emit(buf, "SHA512\n");
}
static DEVICE_ATTR(mode, 0444, prng_mode_show, NULL);
@@ -742,7 +742,7 @@ static ssize_t prng_reseed_limit_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "%u\n", prng_reseed_limit);
+ return sysfs_emit(buf, "%u\n", prng_reseed_limit);
}
static ssize_t prng_reseed_limit_store(struct device *dev,
struct device_attribute *attr,
@@ -773,7 +773,7 @@ static ssize_t prng_strength_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return scnprintf(buf, PAGE_SIZE, "256\n");
+ return sysfs_emit(buf, "256\n");
}
static DEVICE_ATTR(strength, 0444, prng_strength_show, NULL);
diff --git a/arch/s390/include/asm/asm.h b/arch/s390/include/asm/asm.h
new file mode 100644
index 0000000..ec011b9
--- /dev/null
+++ b/arch/s390/include/asm/asm.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_ASM_H
+#define _ASM_S390_ASM_H
+
+#include <linux/stringify.h>
+
+/*
+ * Helper macros to be used for flag output operand handling.
+ * Inline assemblies must use four of the five supplied macros:
+ *
+ * Use CC_IPM(sym) at the end of the inline assembly; this extracts the
+ * condition code and program mask with the ipm instruction and writes it to
+ * the variable with symbolic name [sym] if the compiler has no support for
+ * flag output operands. If the compiler has support for flag output operands
+ * this generates no code.
+ *
+ * Use CC_OUT(sym, var) at the output operand list of an inline assembly. This
+ * defines an output operand with symbolic name [sym] for the variable
+ * [var]. [var] must be an int variable and [sym] must be identical with [sym]
+ * used with CC_IPM().
+ *
+ * Use either CC_CLOBBER or CC_CLOBBER_LIST() for the clobber list. Use
+ * CC_CLOBBER if the clobber list contains only "cc", otherwise use
+ * CC_CLOBBER_LIST() and add all clobbers as argument to the macro.
+ *
+ * Use CC_TRANSFORM() to convert the variable [var] which contains the
+ * extracted condition code. If the condition code is extracted with ipm, the
+ * [var] also contains the program mask. CC_TRANSFORM() moves the condition
+ * code to the two least significant bits and sets all other bits to zero.
+ */
+#if defined(__GCC_ASM_FLAG_OUTPUTS__) && !(IS_ENABLED(CONFIG_GCC_ASM_FLAG_OUTPUT_BROKEN))
+
+#define __HAVE_ASM_FLAG_OUTPUTS__
+
+#define CC_IPM(sym)
+#define CC_OUT(sym, var) "=@cc" (var)
+#define CC_TRANSFORM(cc) ({ cc; })
+#define CC_CLOBBER
+#define CC_CLOBBER_LIST(...) __VA_ARGS__
+
+#else
+
+#define CC_IPM(sym) " ipm %[" __stringify(sym) "]\n"
+#define CC_OUT(sym, var) [sym] "=d" (var)
+#define CC_TRANSFORM(cc) ({ (cc) >> 28; })
+#define CC_CLOBBER "cc"
+#define CC_CLOBBER_LIST(...) "cc", __VA_ARGS__
+
+#endif
+
+#endif /* _ASM_S390_ASM_H */
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 0c4cad7..6723fca64 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -72,14 +72,24 @@ ATOMIC_OPS(xor)
#define arch_atomic_fetch_or arch_atomic_fetch_or
#define arch_atomic_fetch_xor arch_atomic_fetch_xor
-#define arch_atomic_xchg(v, new) (arch_xchg(&((v)->counter), new))
+static __always_inline int arch_atomic_xchg(atomic_t *v, int new)
+{
+ return arch_xchg(&v->counter, new);
+}
+#define arch_atomic_xchg arch_atomic_xchg
static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
- return __atomic_cmpxchg(&v->counter, old, new);
+ return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic_cmpxchg arch_atomic_cmpxchg
+static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
+{
+ return arch_try_cmpxchg(&v->counter, old, new);
+}
+#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg
+
#define ATOMIC64_INIT(i) { (i) }
static __always_inline s64 arch_atomic64_read(const atomic64_t *v)
@@ -112,14 +122,24 @@ static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
}
#define arch_atomic64_add arch_atomic64_add
-#define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new))
+static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new)
+{
+ return arch_xchg(&v->counter, new);
+}
+#define arch_atomic64_xchg arch_atomic64_xchg
static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
- return __atomic64_cmpxchg((long *)&v->counter, old, new);
+ return arch_cmpxchg(&v->counter, old, new);
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
+static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
+{
+ return arch_try_cmpxchg(&v->counter, old, new);
+}
+#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
+
#define ATOMIC64_OPS(op) \
static __always_inline void arch_atomic64_##op(s64 i, atomic64_t *v) \
{ \
diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h
index 65380da..1d6b205 100644
--- a/arch/s390/include/asm/atomic_ops.h
+++ b/arch/s390/include/asm/atomic_ops.h
@@ -169,79 +169,4 @@ __ATOMIC64_OPS(__atomic64_xor, "xgr")
#endif /* MARCH_HAS_Z196_FEATURES */
-static __always_inline int __atomic_cmpxchg(int *ptr, int old, int new)
-{
- asm volatile(
- " cs %[old],%[new],%[ptr]"
- : [old] "+d" (old), [ptr] "+Q" (*ptr)
- : [new] "d" (new)
- : "cc", "memory");
- return old;
-}
-
-static __always_inline long __atomic64_cmpxchg(long *ptr, long old, long new)
-{
- asm volatile(
- " csg %[old],%[new],%[ptr]"
- : [old] "+d" (old), [ptr] "+QS" (*ptr)
- : [new] "d" (new)
- : "cc", "memory");
- return old;
-}
-
-/* GCC versions before 14.2.0 may die with an ICE in some configurations. */
-#if defined(__GCC_ASM_FLAG_OUTPUTS__) && !(IS_ENABLED(CONFIG_CC_IS_GCC) && (GCC_VERSION < 140200))
-
-static __always_inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new)
-{
- int cc;
-
- asm volatile(
- " cs %[old],%[new],%[ptr]"
- : [old] "+d" (old), [ptr] "+Q" (*ptr), "=@cc" (cc)
- : [new] "d" (new)
- : "memory");
- return cc == 0;
-}
-
-static __always_inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new)
-{
- int cc;
-
- asm volatile(
- " csg %[old],%[new],%[ptr]"
- : [old] "+d" (old), [ptr] "+QS" (*ptr), "=@cc" (cc)
- : [new] "d" (new)
- : "memory");
- return cc == 0;
-}
-
-#else /* __GCC_ASM_FLAG_OUTPUTS__ */
-
-static __always_inline bool __atomic_cmpxchg_bool(int *ptr, int old, int new)
-{
- int old_expected = old;
-
- asm volatile(
- " cs %[old],%[new],%[ptr]"
- : [old] "+d" (old), [ptr] "+Q" (*ptr)
- : [new] "d" (new)
- : "cc", "memory");
- return old == old_expected;
-}
-
-static __always_inline bool __atomic64_cmpxchg_bool(long *ptr, long old, long new)
-{
- long old_expected = old;
-
- asm volatile(
- " csg %[old],%[new],%[ptr]"
- : [old] "+d" (old), [ptr] "+QS" (*ptr)
- : [new] "d" (new)
- : "cc", "memory");
- return old == old_expected;
-}
-
-#endif /* __GCC_ASM_FLAG_OUTPUTS__ */
-
#endif /* __ARCH_S390_ATOMIC_OPS__ */
diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h
index aae0315..a9e2006 100644
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -11,185 +11,231 @@
#include <linux/mmdebug.h>
#include <linux/types.h>
#include <linux/bug.h>
+#include <asm/asm.h>
+
+void __cmpxchg_called_with_bad_pointer(void);
+
+static __always_inline u32 __cs_asm(u64 ptr, u32 old, u32 new)
+{
+ asm volatile(
+ " cs %[old],%[new],%[ptr]\n"
+ : [old] "+d" (old), [ptr] "+Q" (*(u32 *)ptr)
+ : [new] "d" (new)
+ : "memory", "cc");
+ return old;
+}
+
+static __always_inline u64 __csg_asm(u64 ptr, u64 old, u64 new)
+{
+ asm volatile(
+ " csg %[old],%[new],%[ptr]\n"
+ : [old] "+d" (old), [ptr] "+QS" (*(u64 *)ptr)
+ : [new] "d" (new)
+ : "memory", "cc");
+ return old;
+}
+
+static inline u8 __arch_cmpxchg1(u64 ptr, u8 old, u8 new)
+{
+ union {
+ u8 b[4];
+ u32 w;
+ } old32, new32;
+ u32 prev;
+ int i;
+
+ i = ptr & 3;
+ ptr &= ~0x3;
+ prev = READ_ONCE(*(u32 *)ptr);
+ do {
+ old32.w = prev;
+ if (old32.b[i] != old)
+ return old32.b[i];
+ new32.w = old32.w;
+ new32.b[i] = new;
+ prev = __cs_asm(ptr, old32.w, new32.w);
+ } while (prev != old32.w);
+ return old;
+}
+
+static inline u16 __arch_cmpxchg2(u64 ptr, u16 old, u16 new)
+{
+ union {
+ u16 b[2];
+ u32 w;
+ } old32, new32;
+ u32 prev;
+ int i;
+
+ i = (ptr & 3) >> 1;
+ ptr &= ~0x3;
+ prev = READ_ONCE(*(u32 *)ptr);
+ do {
+ old32.w = prev;
+ if (old32.b[i] != old)
+ return old32.b[i];
+ new32.w = old32.w;
+ new32.b[i] = new;
+ prev = __cs_asm(ptr, old32.w, new32.w);
+ } while (prev != old32.w);
+ return old;
+}
+
+static __always_inline u64 __arch_cmpxchg(u64 ptr, u64 old, u64 new, int size)
+{
+ switch (size) {
+ case 1: return __arch_cmpxchg1(ptr, old & 0xff, new & 0xff);
+ case 2: return __arch_cmpxchg2(ptr, old & 0xffff, new & 0xffff);
+ case 4: return __cs_asm(ptr, old & 0xffffffff, new & 0xffffffff);
+ case 8: return __csg_asm(ptr, old, new);
+ default: __cmpxchg_called_with_bad_pointer();
+ }
+ return old;
+}
+
+#define arch_cmpxchg(ptr, o, n) \
+({ \
+ (__typeof__(*(ptr)))__arch_cmpxchg((unsigned long)(ptr), \
+ (unsigned long)(o), \
+ (unsigned long)(n), \
+ sizeof(*(ptr))); \
+})
+
+#define arch_cmpxchg64 arch_cmpxchg
+#define arch_cmpxchg_local arch_cmpxchg
+#define arch_cmpxchg64_local arch_cmpxchg
+
+#ifdef __HAVE_ASM_FLAG_OUTPUTS__
+
+#define arch_try_cmpxchg(ptr, oldp, new) \
+({ \
+ __typeof__(ptr) __oldp = (__typeof__(ptr))(oldp); \
+ __typeof__(*(ptr)) __old = *__oldp; \
+ __typeof__(*(ptr)) __new = (new); \
+ __typeof__(*(ptr)) __prev; \
+ int __cc; \
+ \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ case 2: { \
+ __prev = arch_cmpxchg((ptr), (__old), (__new)); \
+ __cc = (__prev != __old); \
+ if (unlikely(__cc)) \
+ *__oldp = __prev; \
+ break; \
+ } \
+ case 4: { \
+ asm volatile( \
+ " cs %[__old],%[__new],%[__ptr]\n" \
+ : [__old] "+d" (*__oldp), \
+ [__ptr] "+Q" (*(ptr)), \
+ "=@cc" (__cc) \
+ : [__new] "d" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case 8: { \
+ asm volatile( \
+ " csg %[__old],%[__new],%[__ptr]\n" \
+ : [__old] "+d" (*__oldp), \
+ [__ptr] "+QS" (*(ptr)), \
+ "=@cc" (__cc) \
+ : [__new] "d" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ default: \
+ __cmpxchg_called_with_bad_pointer(); \
+ } \
+ likely(__cc == 0); \
+})
+
+#else /* __HAVE_ASM_FLAG_OUTPUTS__ */
+
+#define arch_try_cmpxchg(ptr, oldp, new) \
+({ \
+ __typeof__((ptr)) __oldp = (__typeof__(ptr))(oldp); \
+ __typeof__(*(ptr)) __old = *__oldp; \
+ __typeof__(*(ptr)) __new = (new); \
+ __typeof__(*(ptr)) __prev; \
+ \
+ __prev = arch_cmpxchg((ptr), (__old), (__new)); \
+ if (unlikely(__prev != __old)) \
+ *__oldp = __prev; \
+ likely(__prev == __old); \
+})
+
+#endif /* __HAVE_ASM_FLAG_OUTPUTS__ */
+
+#define arch_try_cmpxchg64 arch_try_cmpxchg
+#define arch_try_cmpxchg_local arch_try_cmpxchg
+#define arch_try_cmpxchg64_local arch_try_cmpxchg
void __xchg_called_with_bad_pointer(void);
-static __always_inline unsigned long
-__arch_xchg(unsigned long x, unsigned long address, int size)
+static inline u8 __arch_xchg1(u64 ptr, u8 x)
{
- unsigned long old;
- int shift;
+ int shift = (3 ^ (ptr & 3)) << 3;
+ u32 mask, old, new;
+ ptr &= ~0x3;
+ mask = ~(0xff << shift);
+ old = READ_ONCE(*(u32 *)ptr);
+ do {
+ new = old & mask;
+ new |= x << shift;
+ } while (!arch_try_cmpxchg((u32 *)ptr, &old, new));
+ return old >> shift;
+}
+
+static inline u16 __arch_xchg2(u64 ptr, u16 x)
+{
+ int shift = (2 ^ (ptr & 2)) << 3;
+ u32 mask, old, new;
+
+ ptr &= ~0x3;
+ mask = ~(0xffff << shift);
+ old = READ_ONCE(*(u32 *)ptr);
+ do {
+ new = old & mask;
+ new |= x << shift;
+ } while (!arch_try_cmpxchg((u32 *)ptr, &old, new));
+ return old >> shift;
+}
+
+static __always_inline u64 __arch_xchg(u64 ptr, u64 x, int size)
+{
switch (size) {
case 1:
- shift = (3 ^ (address & 3)) << 3;
- address ^= address & 3;
- asm volatile(
- " l %0,%1\n"
- "0: lr 0,%0\n"
- " nr 0,%3\n"
- " or 0,%2\n"
- " cs %0,0,%1\n"
- " jl 0b\n"
- : "=&d" (old), "+Q" (*(int *) address)
- : "d" ((x & 0xff) << shift), "d" (~(0xff << shift))
- : "memory", "cc", "0");
- return old >> shift;
+ return __arch_xchg1(ptr, x & 0xff);
case 2:
- shift = (2 ^ (address & 2)) << 3;
- address ^= address & 2;
- asm volatile(
- " l %0,%1\n"
- "0: lr 0,%0\n"
- " nr 0,%3\n"
- " or 0,%2\n"
- " cs %0,0,%1\n"
- " jl 0b\n"
- : "=&d" (old), "+Q" (*(int *) address)
- : "d" ((x & 0xffff) << shift), "d" (~(0xffff << shift))
- : "memory", "cc", "0");
- return old >> shift;
- case 4:
- asm volatile(
- " l %0,%1\n"
- "0: cs %0,%2,%1\n"
- " jl 0b\n"
- : "=&d" (old), "+Q" (*(int *) address)
- : "d" (x)
- : "memory", "cc");
+ return __arch_xchg2(ptr, x & 0xffff);
+ case 4: {
+ u32 old = READ_ONCE(*(u32 *)ptr);
+
+ do {
+ } while (!arch_try_cmpxchg((u32 *)ptr, &old, x & 0xffffffff));
return old;
- case 8:
- asm volatile(
- " lg %0,%1\n"
- "0: csg %0,%2,%1\n"
- " jl 0b\n"
- : "=&d" (old), "+QS" (*(long *) address)
- : "d" (x)
- : "memory", "cc");
+ }
+ case 8: {
+ u64 old = READ_ONCE(*(u64 *)ptr);
+
+ do {
+ } while (!arch_try_cmpxchg((u64 *)ptr, &old, x));
return old;
}
+ }
__xchg_called_with_bad_pointer();
return x;
}
#define arch_xchg(ptr, x) \
({ \
- __typeof__(*(ptr)) __ret; \
- \
- __ret = (__typeof__(*(ptr))) \
- __arch_xchg((unsigned long)(x), (unsigned long)(ptr), \
- sizeof(*(ptr))); \
- __ret; \
+ (__typeof__(*(ptr)))__arch_xchg((unsigned long)(ptr), \
+ (unsigned long)(x), \
+ sizeof(*(ptr))); \
})
-void __cmpxchg_called_with_bad_pointer(void);
-
-static __always_inline unsigned long __cmpxchg(unsigned long address,
- unsigned long old,
- unsigned long new, int size)
-{
- switch (size) {
- case 1: {
- unsigned int prev, shift, mask;
-
- shift = (3 ^ (address & 3)) << 3;
- address ^= address & 3;
- old = (old & 0xff) << shift;
- new = (new & 0xff) << shift;
- mask = ~(0xff << shift);
- asm volatile(
- " l %[prev],%[address]\n"
- " nr %[prev],%[mask]\n"
- " xilf %[mask],0xffffffff\n"
- " or %[new],%[prev]\n"
- " or %[prev],%[tmp]\n"
- "0: lr %[tmp],%[prev]\n"
- " cs %[prev],%[new],%[address]\n"
- " jnl 1f\n"
- " xr %[tmp],%[prev]\n"
- " xr %[new],%[tmp]\n"
- " nr %[tmp],%[mask]\n"
- " jz 0b\n"
- "1:"
- : [prev] "=&d" (prev),
- [address] "+Q" (*(int *)address),
- [tmp] "+&d" (old),
- [new] "+&d" (new),
- [mask] "+&d" (mask)
- :: "memory", "cc");
- return prev >> shift;
- }
- case 2: {
- unsigned int prev, shift, mask;
-
- shift = (2 ^ (address & 2)) << 3;
- address ^= address & 2;
- old = (old & 0xffff) << shift;
- new = (new & 0xffff) << shift;
- mask = ~(0xffff << shift);
- asm volatile(
- " l %[prev],%[address]\n"
- " nr %[prev],%[mask]\n"
- " xilf %[mask],0xffffffff\n"
- " or %[new],%[prev]\n"
- " or %[prev],%[tmp]\n"
- "0: lr %[tmp],%[prev]\n"
- " cs %[prev],%[new],%[address]\n"
- " jnl 1f\n"
- " xr %[tmp],%[prev]\n"
- " xr %[new],%[tmp]\n"
- " nr %[tmp],%[mask]\n"
- " jz 0b\n"
- "1:"
- : [prev] "=&d" (prev),
- [address] "+Q" (*(int *)address),
- [tmp] "+&d" (old),
- [new] "+&d" (new),
- [mask] "+&d" (mask)
- :: "memory", "cc");
- return prev >> shift;
- }
- case 4: {
- unsigned int prev = old;
-
- asm volatile(
- " cs %[prev],%[new],%[address]\n"
- : [prev] "+&d" (prev),
- [address] "+Q" (*(int *)address)
- : [new] "d" (new)
- : "memory", "cc");
- return prev;
- }
- case 8: {
- unsigned long prev = old;
-
- asm volatile(
- " csg %[prev],%[new],%[address]\n"
- : [prev] "+&d" (prev),
- [address] "+QS" (*(long *)address)
- : [new] "d" (new)
- : "memory", "cc");
- return prev;
- }
- }
- __cmpxchg_called_with_bad_pointer();
- return old;
-}
-
-#define arch_cmpxchg(ptr, o, n) \
-({ \
- __typeof__(*(ptr)) __ret; \
- \
- __ret = (__typeof__(*(ptr))) \
- __cmpxchg((unsigned long)(ptr), (unsigned long)(o), \
- (unsigned long)(n), sizeof(*(ptr))); \
- __ret; \
-})
-
-#define arch_cmpxchg64 arch_cmpxchg
-#define arch_cmpxchg_local arch_cmpxchg
-#define arch_cmpxchg64_local arch_cmpxchg
-
#define system_has_cmpxchg128() 1
static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 new)
@@ -203,5 +249,25 @@ static __always_inline u128 arch_cmpxchg128(volatile u128 *ptr, u128 old, u128 n
}
#define arch_cmpxchg128 arch_cmpxchg128
+#define arch_cmpxchg128_local arch_cmpxchg128
+
+#ifdef __HAVE_ASM_FLAG_OUTPUTS__
+
+static __always_inline bool arch_try_cmpxchg128(volatile u128 *ptr, u128 *oldp, u128 new)
+{
+ int cc;
+
+ asm volatile(
+ " cdsg %[old],%[new],%[ptr]\n"
+ : [old] "+d" (*oldp), [ptr] "+QS" (*ptr), "=@cc" (cc)
+ : [new] "d" (new)
+ : "memory");
+ return likely(cc == 0);
+}
+
+#define arch_try_cmpxchg128 arch_try_cmpxchg128
+#define arch_try_cmpxchg128_local arch_try_cmpxchg128
+
+#endif /* __HAVE_ASM_FLAG_OUTPUTS__ */
#endif /* __ASM_CMPXCHG_H */
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 1d3a4b0..59ab119 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -56,6 +56,8 @@
#define CPACF_KM_PXTS_256 0x3c
#define CPACF_KM_XTS_128_FULL 0x52
#define CPACF_KM_XTS_256_FULL 0x54
+#define CPACF_KM_PXTS_128_FULL 0x5a
+#define CPACF_KM_PXTS_256_FULL 0x5c
/*
* Function codes for the KMC (CIPHER MESSAGE WITH CHAINING)
diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index 9e4bbc3..e1a279e 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -13,6 +13,7 @@
#include <linux/kmsan-checks.h>
#include <asm/asm-extable.h>
#include <asm/facility.h>
+#include <asm/asm.h>
asm(".include \"asm/cpu_mf-insn.h\"\n");
@@ -185,11 +186,12 @@ static inline int lcctl(u64 ctl)
int cc;
asm volatile (
- " lcctl %1\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (cc) : "Q" (ctl) : "cc");
- return cc;
+ " lcctl %[ctl]\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
+ : [ctl] "Q" (ctl)
+ : CC_CLOBBER);
+ return CC_TRANSFORM(cc);
}
/* Extract CPU counter */
@@ -199,12 +201,13 @@ static inline int __ecctr(u64 ctr, u64 *content)
int cc;
asm volatile (
- " ecctr %0,%2\n"
- " ipm %1\n"
- " srl %1,28\n"
- : "=d" (_content), "=d" (cc) : "d" (ctr) : "cc");
+ " ecctr %[_content],%[ctr]\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [_content] "=d" (_content)
+ : [ctr] "d" (ctr)
+ : CC_CLOBBER);
*content = _content;
- return cc;
+ return CC_TRANSFORM(cc);
}
/* Extract CPU counter */
@@ -234,18 +237,17 @@ static __always_inline int stcctm(enum stcctm_ctr_set set, u64 range, u64 *dest)
int cc;
asm volatile (
- " STCCTM %2,%3,%1\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (cc)
- : "Q" (*dest), "d" (range), "i" (set)
- : "cc", "memory");
+ " STCCTM %[range],%[set],%[dest]\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
+ : [dest] "Q" (*dest), [range] "d" (range), [set] "i" (set)
+ : CC_CLOBBER_LIST("memory"));
/*
* If cc == 2, less than RANGE counters are stored, but it's not easy
* to tell how many. Always unpoison the whole range for simplicity.
*/
kmsan_unpoison_memory(dest, range * sizeof(u64));
- return cc;
+ return CC_TRANSFORM(cc);
}
/* Query sampling information */
@@ -265,19 +267,20 @@ static inline int qsi(struct hws_qsi_info_block *info)
/* Load sampling controls */
static inline int lsctl(struct hws_lsctl_request_block *req)
{
- int cc;
+ int cc, exception;
- cc = 1;
+ exception = 1;
asm volatile(
- "0: lsctl 0(%1)\n"
- "1: ipm %0\n"
- " srl %0,28\n"
+ "0: lsctl %[req]\n"
+ "1: lhi %[exc],0\n"
"2:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
- : "+d" (cc), "+a" (req)
- : "m" (*req)
- : "cc", "memory");
-
- return cc ? -EINVAL : 0;
+ : CC_OUT(cc, cc), [exc] "+d" (exception)
+ : [req] "Q" (*req)
+ : CC_CLOBBER);
+ if (exception || CC_TRANSFORM(cc))
+ return -EINVAL;
+ return 0;
}
#endif /* _ASM_S390_CPU_MF_H */
diff --git a/arch/s390/include/asm/facility.h b/arch/s390/include/asm/facility.h
index 715bcf8..5f5b1aa 100644
--- a/arch/s390/include/asm/facility.h
+++ b/arch/s390/include/asm/facility.h
@@ -88,7 +88,7 @@ static __always_inline bool test_facility(unsigned long nr)
return __test_facility(nr, &stfle_fac_list);
}
-static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size)
+static inline unsigned long __stfle_asm(u64 *fac_list, int size)
{
unsigned long reg0 = size - 1;
@@ -96,7 +96,7 @@ static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size)
" lgr 0,%[reg0]\n"
" .insn s,0xb2b00000,%[list]\n" /* stfle */
" lgr %[reg0],0\n"
- : [reg0] "+&d" (reg0), [list] "+Q" (*stfle_fac_list)
+ : [reg0] "+&d" (reg0), [list] "+Q" (*fac_list)
:
: "memory", "cc", "0");
return reg0;
@@ -104,10 +104,10 @@ static inline unsigned long __stfle_asm(u64 *stfle_fac_list, int size)
/**
* stfle - Store facility list extended
- * @stfle_fac_list: array where facility list can be stored
+ * @fac_list: array where facility list can be stored
* @size: size of passed in array in double words
*/
-static inline void __stfle(u64 *stfle_fac_list, int size)
+static inline void __stfle(u64 *fac_list, int size)
{
unsigned long nr;
u32 stfl_fac_list;
@@ -116,20 +116,20 @@ static inline void __stfle(u64 *stfle_fac_list, int size)
" stfl 0(0)\n"
: "=m" (get_lowcore()->stfl_fac_list));
stfl_fac_list = get_lowcore()->stfl_fac_list;
- memcpy(stfle_fac_list, &stfl_fac_list, 4);
+ memcpy(fac_list, &stfl_fac_list, 4);
nr = 4; /* bytes stored by stfl */
if (stfl_fac_list & 0x01000000) {
/* More facility bits available with stfle */
- nr = __stfle_asm(stfle_fac_list, size);
+ nr = __stfle_asm(fac_list, size);
nr = min_t(unsigned long, (nr + 1) * 8, size * 8);
}
- memset((char *) stfle_fac_list + nr, 0, size * 8 - nr);
+ memset((char *)fac_list + nr, 0, size * 8 - nr);
}
-static inline void stfle(u64 *stfle_fac_list, int size)
+static inline void stfle(u64 *fac_list, int size)
{
preempt_disable();
- __stfle(stfle_fac_list, size);
+ __stfle(fac_list, size);
preempt_enable();
}
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index 9725586..64761c7 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -107,9 +107,6 @@ void gmap_remove(struct gmap *gmap);
struct gmap *gmap_get(struct gmap *gmap);
void gmap_put(struct gmap *gmap);
-void gmap_enable(struct gmap *gmap);
-void gmap_disable(struct gmap *gmap);
-struct gmap *gmap_get_enabled(void);
int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long to, unsigned long len);
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h
index 1bd08eb..9084b75 100644
--- a/arch/s390/include/asm/kexec.h
+++ b/arch/s390/include/asm/kexec.h
@@ -94,6 +94,9 @@ void arch_kexec_protect_crashkres(void);
void arch_kexec_unprotect_crashkres(void);
#define arch_kexec_unprotect_crashkres arch_kexec_unprotect_crashkres
+
+bool is_kdump_kernel(void);
+#define is_kdump_kernel is_kdump_kernel
#endif
#ifdef CONFIG_KEXEC_FILE
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8e77afb..51201b4 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -527,6 +527,9 @@ struct kvm_vcpu_stat {
#define PGM_REGION_FIRST_TRANS 0x39
#define PGM_REGION_SECOND_TRANS 0x3a
#define PGM_REGION_THIRD_TRANS 0x3b
+#define PGM_SECURE_STORAGE_ACCESS 0x3d
+#define PGM_NON_SECURE_STORAGE_ACCESS 0x3e
+#define PGM_SECURE_STORAGE_VIOLATION 0x3f
#define PGM_MONITOR 0x40
#define PGM_PER 0x80
#define PGM_CRYPTO_OPERATION 0x119
@@ -747,8 +750,6 @@ struct kvm_vcpu_arch {
struct hrtimer ckc_timer;
struct kvm_s390_pgm_info pgm;
struct gmap *gmap;
- /* backup location for the currently enabled gmap when scheduled out */
- struct gmap *enabled_gmap;
struct kvm_guestdbg_info_arch guestdbg;
unsigned long pfault_token;
unsigned long pfault_select;
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 48c6471..42a092f 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -165,8 +165,7 @@ struct lowcore {
__u64 percpu_offset; /* 0x03b8 */
__u8 pad_0x03c0[0x03c8-0x03c0]; /* 0x03c0 */
__u64 machine_flags; /* 0x03c8 */
- __u64 gmap; /* 0x03d0 */
- __u8 pad_0x03d8[0x0400-0x03d8]; /* 0x03d8 */
+ __u8 pad_0x03d0[0x0400-0x03d0]; /* 0x03d0 */
__u32 return_lpswe; /* 0x0400 */
__u32 return_mcck_lpswe; /* 0x0404 */
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 73e1e03..4405084 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -10,6 +10,7 @@
#include <linux/const.h>
#include <asm/types.h>
+#include <asm/asm.h>
#define _PAGE_SHIFT CONFIG_PAGE_SHIFT
#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT)
@@ -148,11 +149,12 @@ static inline int page_reset_referenced(unsigned long addr)
int cc;
asm volatile(
- " rrbe 0,%1\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (cc) : "a" (addr) : "cc");
- return cc;
+ " rrbe 0,%[addr]\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
+ : [addr] "a" (addr)
+ : CC_CLOBBER);
+ return CC_TRANSFORM(cc);
}
/* Bits int the storage key */
diff --git a/arch/s390/include/asm/pai.h b/arch/s390/include/asm/pai.h
index 25f2077..ebeabd0 100644
--- a/arch/s390/include/asm/pai.h
+++ b/arch/s390/include/asm/pai.h
@@ -11,6 +11,7 @@
#include <linux/jump_label.h>
#include <asm/lowcore.h>
#include <asm/ptrace.h>
+#include <asm/asm.h>
struct qpaci_info_block {
u64 header;
@@ -33,12 +34,11 @@ static inline int qpaci(struct qpaci_info_block *info)
" lgr 0,%[size]\n"
" .insn s,0xb28f0000,%[info]\n"
" lgr %[size],0\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=d" (cc), [info] "=Q" (*info), [size] "+&d" (size)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [info] "=Q" (*info), [size] "+&d" (size)
:
- : "0", "cc", "memory");
- return cc ? (size + 1) * sizeof(u64) : 0;
+ : CC_CLOBBER_LIST("0", "memory"));
+ return CC_TRANSFORM(cc) ? (size + 1) * sizeof(u64) : 0;
}
#define PAI_CRYPTO_BASE 0x1000 /* First event number */
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 9d920ce..5013a69 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -107,9 +107,10 @@ struct zpci_bus {
struct list_head resources;
struct list_head bus_next;
struct resource bus_resource;
- int pchid;
+ int topo; /* TID if topo_is_tid, PCHID otherwise */
int domain_nr;
- bool multifunction;
+ u8 multifunction : 1;
+ u8 topo_is_tid : 1;
enum pci_bus_speed max_bus_speed;
};
@@ -130,9 +131,12 @@ struct zpci_dev {
u16 vfn; /* virtual function number */
u16 pchid; /* physical channel ID */
u16 maxstbl; /* Maximum store block size */
+ u16 rid; /* RID as supplied by firmware */
+ u16 tid; /* Topology for which RID is valid */
u8 pfgid; /* function group ID */
u8 pft; /* pci function type */
u8 port;
+ u8 fidparm;
u8 dtsm; /* Supported DT mask */
u8 rid_available : 1;
u8 has_hp_slot : 1;
@@ -140,7 +144,8 @@ struct zpci_dev {
u8 is_physfn : 1;
u8 util_str_avail : 1;
u8 irqs_registered : 1;
- u8 reserved : 2;
+ u8 tid_avail : 1;
+ u8 reserved : 1;
unsigned int devfn; /* DEVFN part of the RID*/
u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */
@@ -210,12 +215,14 @@ extern struct airq_iv *zpci_aif_sbv;
----------------------------------------------------------------------------- */
/* Base stuff */
struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state);
+int zpci_add_device(struct zpci_dev *zdev);
int zpci_enable_device(struct zpci_dev *);
int zpci_disable_device(struct zpci_dev *);
int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh);
int zpci_deconfigure_device(struct zpci_dev *zdev);
void zpci_device_reserved(struct zpci_dev *zdev);
bool zpci_is_device_configured(struct zpci_dev *zdev);
+int zpci_scan_devices(void);
int zpci_hot_reset_device(struct zpci_dev *zdev);
int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64, u8 *);
@@ -225,7 +232,7 @@ void zpci_update_fh(struct zpci_dev *zdev, u32 fh);
/* CLP */
int clp_setup_writeback_mio(void);
-int clp_scan_pci_devices(void);
+int clp_scan_pci_devices(struct list_head *scan_list);
int clp_query_pci_fn(struct zpci_dev *zdev);
int clp_enable_fh(struct zpci_dev *zdev, u32 *fh, u8 nr_dma_as);
int clp_disable_fh(struct zpci_dev *zdev, u32 *fh);
diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h
index f0c677d..3fff2f7 100644
--- a/arch/s390/include/asm/pci_clp.h
+++ b/arch/s390/include/asm/pci_clp.h
@@ -110,7 +110,8 @@ struct clp_req_query_pci {
struct clp_rsp_query_pci {
struct clp_rsp_hdr hdr;
u16 vfn; /* virtual fn number */
- u16 : 3;
+ u16 : 2;
+ u16 tid_avail : 1;
u16 rid_avail : 1;
u16 is_physfn : 1;
u16 reserved1 : 1;
@@ -122,16 +123,18 @@ struct clp_rsp_query_pci {
u16 pchid;
__le32 bar[PCI_STD_NUM_BARS];
u8 pfip[CLP_PFIP_NR_SEGMENTS]; /* pci function internal path */
- u16 : 12;
- u16 port : 4;
+ u8 fidparm;
+ u8 reserved3 : 4;
+ u8 port : 4;
u8 fmb_len;
u8 pft; /* pci function type */
u64 sdma; /* start dma as */
u64 edma; /* end dma as */
#define ZPCI_RID_MASK_DEVFN 0x00ff
u16 rid; /* BUS/DEVFN PCI address */
- u16 reserved0;
- u32 reserved[10];
+ u32 reserved0;
+ u16 tid;
+ u32 reserved[9];
u32 uid; /* user defined id */
u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */
u32 reserved2[16];
diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h
index 2686bee..43a5ea4 100644
--- a/arch/s390/include/asm/pci_io.h
+++ b/arch/s390/include/asm/pci_io.h
@@ -143,7 +143,7 @@ static inline int zpci_get_max_io_size(u64 src, u64 dst, int len, int max)
static inline int zpci_memcpy_fromio(void *dst,
const volatile void __iomem *src,
- unsigned long n)
+ size_t n)
{
int size, rc = 0;
@@ -162,7 +162,7 @@ static inline int zpci_memcpy_fromio(void *dst,
}
static inline int zpci_memcpy_toio(volatile void __iomem *dst,
- const void *src, unsigned long n)
+ const void *src, size_t n)
{
int size, rc = 0;
@@ -187,7 +187,7 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst,
}
static inline int zpci_memset_io(volatile void __iomem *dst,
- unsigned char val, size_t count)
+ int val, size_t count)
{
u8 *src = kmalloc(count, GFP_KERNEL);
int rc;
diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h
index f45cfc8..51b68a4 100644
--- a/arch/s390/include/asm/physmem_info.h
+++ b/arch/s390/include/asm/physmem_info.h
@@ -9,6 +9,7 @@ enum physmem_info_source {
MEM_DETECT_NONE = 0,
MEM_DETECT_SCLP_STOR_INFO,
MEM_DETECT_DIAG260,
+ MEM_DETECT_DIAG500_STOR_LIMIT,
MEM_DETECT_SCLP_READ_INFO,
MEM_DETECT_BIN_SEARCH
};
@@ -107,6 +108,8 @@ static inline const char *get_physmem_info_source(void)
return "sclp storage info";
case MEM_DETECT_DIAG260:
return "diag260";
+ case MEM_DETECT_DIAG500_STOR_LIMIT:
+ return "diag500 storage limit";
case MEM_DETECT_SCLP_READ_INFO:
return "sclp read info";
case MEM_DETECT_BIN_SEARCH:
diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h
index deca3f2..0cde7e2 100644
--- a/arch/s390/include/asm/preempt.h
+++ b/arch/s390/include/asm/preempt.h
@@ -5,6 +5,7 @@
#include <asm/current.h>
#include <linux/thread_info.h>
#include <asm/atomic_ops.h>
+#include <asm/cmpxchg.h>
#include <asm/march.h>
#ifdef MARCH_HAS_Z196_FEATURES
@@ -22,12 +23,10 @@ static __always_inline void preempt_count_set(int pc)
{
int old, new;
+ old = READ_ONCE(get_lowcore()->preempt_count);
do {
- old = READ_ONCE(get_lowcore()->preempt_count);
- new = (old & PREEMPT_NEED_RESCHED) |
- (pc & ~PREEMPT_NEED_RESCHED);
- } while (__atomic_cmpxchg(&get_lowcore()->preempt_count,
- old, new) != old);
+ new = (old & PREEMPT_NEED_RESCHED) | (pc & ~PREEMPT_NEED_RESCHED);
+ } while (!arch_try_cmpxchg(&get_lowcore()->preempt_count, &old, new));
}
static __always_inline void set_preempt_need_resched(void)
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 9a5236a..8761fd0 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -39,6 +39,7 @@
#include <asm/runtime_instr.h>
#include <asm/irqflags.h>
#include <asm/alternative.h>
+#include <asm/fault.h>
struct pcpu {
unsigned long ec_mask; /* bit mask for ec_xxx functions */
@@ -187,10 +188,8 @@ struct thread_struct {
unsigned long hardirq_timer; /* task cputime in hardirq context */
unsigned long softirq_timer; /* task cputime in softirq context */
const sys_call_ptr_t *sys_call_table; /* system call table address */
- unsigned long gmap_addr; /* address of last gmap fault. */
- unsigned int gmap_write_flag; /* gmap fault write indication */
+ union teid gmap_teid; /* address and flags of last gmap fault */
unsigned int gmap_int_code; /* int code of last gmap fault */
- unsigned int gmap_pfault; /* signal of a pending guest pfault */
int ufpu_flags; /* user fpu flags */
int kfpu_flags; /* kernel fpu flags */
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index 2ad9324..788bc44 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -14,11 +14,13 @@
#define PIF_SYSCALL 0 /* inside a system call */
#define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */
#define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */
+#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */
#define PIF_FTRACE_FULL_REGS 4 /* all register contents valid (ftrace) */
#define _PIF_SYSCALL BIT(PIF_SYSCALL)
#define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART)
#define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET)
+#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT)
#define _PIF_FTRACE_FULL_REGS BIT(PIF_FTRACE_FULL_REGS)
#define PSW32_MASK_PER _AC(0x40000000, UL)
diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h
index 06fbabe..cb4cc0f 100644
--- a/arch/s390/include/asm/set_memory.h
+++ b/arch/s390/include/asm/set_memory.h
@@ -62,5 +62,6 @@ __SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K)
int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
+bool kernel_page_present(struct page *page);
#endif
diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h
index edee63d..472943b 100644
--- a/arch/s390/include/asm/sigp.h
+++ b/arch/s390/include/asm/sigp.h
@@ -38,6 +38,8 @@
#ifndef __ASSEMBLY__
+#include <asm/asm.h>
+
static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm,
u32 *status)
{
@@ -46,13 +48,12 @@ static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm,
asm volatile(
" sigp %[r1],%[addr],0(%[order])\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=&d" (cc), [r1] "+&d" (r1.pair)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [r1] "+d" (r1.pair)
: [addr] "d" (addr), [order] "a" (order)
- : "cc");
+ : CC_CLOBBER);
*status = r1.even;
- return cc;
+ return CC_TRANSFORM(cc);
}
static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm,
diff --git a/arch/s390/include/asm/sparsemem.h b/arch/s390/include/asm/sparsemem.h
index c549893..668dfc5 100644
--- a/arch/s390/include/asm/sparsemem.h
+++ b/arch/s390/include/asm/sparsemem.h
@@ -2,7 +2,23 @@
#ifndef _ASM_S390_SPARSEMEM_H
#define _ASM_S390_SPARSEMEM_H
-#define SECTION_SIZE_BITS 28
+#define SECTION_SIZE_BITS 27
#define MAX_PHYSMEM_BITS CONFIG_MAX_PHYSMEM_BITS
+#ifdef CONFIG_NUMA
+
+static inline int memory_add_physaddr_to_nid(u64 addr)
+{
+ return 0;
+}
+#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
+
+static inline int phys_to_target_node(u64 start)
+{
+ return 0;
+}
+#define phys_to_target_node phys_to_target_node
+
+#endif /* CONFIG_NUMA */
+
#endif /* _ASM_S390_SPARSEMEM_H */
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 77d5e80..ac868a9 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -57,8 +57,10 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lp)
static inline int arch_spin_trylock_once(arch_spinlock_t *lp)
{
+ int old = 0;
+
barrier();
- return likely(__atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL));
+ return likely(arch_try_cmpxchg(&lp->lock, &old, SPINLOCK_LOCKVAL));
}
static inline void arch_spin_lock(arch_spinlock_t *lp)
@@ -118,7 +120,9 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
static inline void arch_write_lock(arch_rwlock_t *rw)
{
- if (!__atomic_cmpxchg_bool(&rw->cnts, 0, 0x30000))
+ int old = 0;
+
+ if (!arch_try_cmpxchg(&rw->cnts, &old, 0x30000))
arch_write_lock_wait(rw);
}
@@ -133,8 +137,7 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
int old;
old = READ_ONCE(rw->cnts);
- return (!(old & 0xffff0000) &&
- __atomic_cmpxchg_bool(&rw->cnts, old, old + 1));
+ return (!(old & 0xffff0000) && arch_try_cmpxchg(&rw->cnts, &old, old + 1));
}
static inline int arch_write_trylock(arch_rwlock_t *rw)
@@ -142,7 +145,7 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
int old;
old = READ_ONCE(rw->cnts);
- return !old && __atomic_cmpxchg_bool(&rw->cnts, 0, 0x30000);
+ return !old && arch_try_cmpxchg(&rw->cnts, &old, 0x30000);
}
#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index 640901f..8fe5645 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -13,6 +13,7 @@
#include <linux/preempt.h>
#include <linux/time64.h>
#include <asm/lowcore.h>
+#include <asm/asm.h>
/* The value of the TOD clock for 1.1.1970. */
#define TOD_UNIX_EPOCH 0x7d91048bca000000ULL
@@ -44,11 +45,12 @@ static inline int set_tod_clock(__u64 time)
int cc;
asm volatile(
- " sck %1\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (cc) : "Q" (time) : "cc");
- return cc;
+ " sck %[time]\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
+ : [time] "Q" (time)
+ : CC_CLOBBER);
+ return CC_TRANSFORM(cc);
}
static inline int store_tod_clock_ext_cc(union tod_clock *clk)
@@ -56,11 +58,12 @@ static inline int store_tod_clock_ext_cc(union tod_clock *clk)
int cc;
asm volatile(
- " stcke %1\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (cc), "=Q" (*clk) : : "cc");
- return cc;
+ " stcke %[clk]\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [clk] "=Q" (*clk)
+ :
+ : CC_CLOBBER);
+ return CC_TRANSFORM(cc);
}
static __always_inline void store_tod_clock_ext(union tod_clock *tod)
@@ -149,12 +152,11 @@ struct ptff_qui {
" lgr 0,%[reg0]\n" \
" lgr 1,%[reg1]\n" \
" ptff\n" \
- " ipm %[rc]\n" \
- " srl %[rc],28\n" \
- : [rc] "=&d" (rc), "+m" (*(struct addrtype *)reg1) \
+ CC_IPM(rc) \
+ : CC_OUT(rc, rc), "+m" (*(struct addrtype *)reg1) \
: [reg0] "d" (reg0), [reg1] "d" (reg1) \
- : "cc", "0", "1"); \
- rc; \
+ : CC_CLOBBER_LIST("0", "1")); \
+ CC_TRANSFORM(rc); \
})
static inline unsigned long local_tick_disable(void)
diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h
index 153d934..dc33260 100644
--- a/arch/s390/include/asm/uv.h
+++ b/arch/s390/include/asm/uv.h
@@ -2,7 +2,7 @@
/*
* Ultravisor Interfaces
*
- * Copyright IBM Corp. 2019, 2022
+ * Copyright IBM Corp. 2019, 2024
*
* Author(s):
* Vasily Gorbik <gor@linux.ibm.com>
@@ -17,6 +17,7 @@
#include <linux/sched.h>
#include <asm/page.h>
#include <asm/gmap.h>
+#include <asm/asm.h>
#define UVC_CC_OK 0
#define UVC_CC_ERROR 1
@@ -28,9 +29,11 @@
#define UVC_RC_INV_STATE 0x0003
#define UVC_RC_INV_LEN 0x0005
#define UVC_RC_NO_RESUME 0x0007
+#define UVC_RC_MORE_DATA 0x0100
#define UVC_RC_NEED_DESTROY 0x8000
#define UVC_CMD_QUI 0x0001
+#define UVC_CMD_QUERY_KEYS 0x0002
#define UVC_CMD_INIT_UV 0x000f
#define UVC_CMD_CREATE_SEC_CONF 0x0100
#define UVC_CMD_DESTROY_SEC_CONF 0x0101
@@ -61,6 +64,7 @@
#define UVC_CMD_ADD_SECRET 0x1031
#define UVC_CMD_LIST_SECRETS 0x1033
#define UVC_CMD_LOCK_SECRETS 0x1034
+#define UVC_CMD_RETR_SECRET 0x1035
/* Bits in installed uv calls */
enum uv_cmds_inst {
@@ -94,6 +98,8 @@ enum uv_cmds_inst {
BIT_UVC_CMD_ADD_SECRET = 29,
BIT_UVC_CMD_LIST_SECRETS = 30,
BIT_UVC_CMD_LOCK_SECRETS = 31,
+ BIT_UVC_CMD_RETR_SECRET = 33,
+ BIT_UVC_CMD_QUERY_KEYS = 34,
};
enum uv_feat_ind {
@@ -140,11 +146,27 @@ struct uv_cb_qui {
u64 reservedf0; /* 0x00f0 */
u64 supp_add_secret_req_ver; /* 0x00f8 */
u64 supp_add_secret_pcf; /* 0x0100 */
- u64 supp_secret_types; /* 0x0180 */
- u16 max_secrets; /* 0x0110 */
- u8 reserved112[0x120 - 0x112]; /* 0x0112 */
+ u64 supp_secret_types; /* 0x0108 */
+ u16 max_assoc_secrets; /* 0x0110 */
+ u16 max_retr_secrets; /* 0x0112 */
+ u8 reserved114[0x120 - 0x114]; /* 0x0114 */
} __packed __aligned(8);
+struct uv_key_hash {
+ u64 dword[4];
+} __packed __aligned(8);
+
+#define UVC_QUERY_KEYS_IDX_HK 0
+#define UVC_QUERY_KEYS_IDX_BACK_HK 1
+
+/* Query Ultravisor Keys */
+struct uv_cb_query_keys {
+ struct uv_cb_header header; /* 0x0000 */
+ u64 reserved08[3]; /* 0x0008 */
+ struct uv_key_hash key_hashes[15]; /* 0x0020 */
+} __packed __aligned(8);
+static_assert(sizeof(struct uv_cb_query_keys) == 0x200);
+
/* Initialize Ultravisor */
struct uv_cb_init {
struct uv_cb_header header;
@@ -317,7 +339,6 @@ struct uv_cb_dump_complete {
* A common UV call struct for pv guests that contains a single address
* Examples:
* Add Secret
- * List Secrets
*/
struct uv_cb_guest_addr {
struct uv_cb_header header;
@@ -326,18 +347,102 @@ struct uv_cb_guest_addr {
u64 reserved28[4];
} __packed __aligned(8);
+#define UVC_RC_RETR_SECR_BUF_SMALL 0x0109
+#define UVC_RC_RETR_SECR_STORE_EMPTY 0x010f
+#define UVC_RC_RETR_SECR_INV_IDX 0x0110
+#define UVC_RC_RETR_SECR_INV_SECRET 0x0111
+
+struct uv_cb_retr_secr {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u16 secret_idx;
+ u16 reserved1a;
+ u32 buf_size;
+ u64 buf_addr;
+ u64 reserved28[4];
+} __packed __aligned(8);
+
+struct uv_cb_list_secrets {
+ struct uv_cb_header header;
+ u64 reserved08[2];
+ u8 reserved18[6];
+ u16 start_idx;
+ u64 list_addr;
+ u64 reserved28[4];
+} __packed __aligned(8);
+
+enum uv_secret_types {
+ UV_SECRET_INVAL = 0x0,
+ UV_SECRET_NULL = 0x1,
+ UV_SECRET_ASSOCIATION = 0x2,
+ UV_SECRET_PLAIN = 0x3,
+ UV_SECRET_AES_128 = 0x4,
+ UV_SECRET_AES_192 = 0x5,
+ UV_SECRET_AES_256 = 0x6,
+ UV_SECRET_AES_XTS_128 = 0x7,
+ UV_SECRET_AES_XTS_256 = 0x8,
+ UV_SECRET_HMAC_SHA_256 = 0x9,
+ UV_SECRET_HMAC_SHA_512 = 0xa,
+ /* 0x0b - 0x10 reserved */
+ UV_SECRET_ECDSA_P256 = 0x11,
+ UV_SECRET_ECDSA_P384 = 0x12,
+ UV_SECRET_ECDSA_P521 = 0x13,
+ UV_SECRET_ECDSA_ED25519 = 0x14,
+ UV_SECRET_ECDSA_ED448 = 0x15,
+};
+
+/**
+ * uv_secret_list_item_hdr - UV secret metadata.
+ * @index: Index of the secret in the secret list.
+ * @type: Type of the secret. See `enum uv_secret_types`.
+ * @length: Length of the stored secret.
+ */
+struct uv_secret_list_item_hdr {
+ u16 index;
+ u16 type;
+ u32 length;
+} __packed __aligned(8);
+
+#define UV_SECRET_ID_LEN 32
+/**
+ * uv_secret_list_item - UV secret entry.
+ * @hdr: The metadata of this secret.
+ * @id: The ID of this secret, not the secret itself.
+ */
+struct uv_secret_list_item {
+ struct uv_secret_list_item_hdr hdr;
+ u64 reserverd08;
+ u8 id[UV_SECRET_ID_LEN];
+} __packed __aligned(8);
+
+/**
+ * uv_secret_list - UV secret-metadata list.
+ * @num_secr_stored: Number of secrets stored in this list.
+ * @total_num_secrets: Number of secrets stored in the UV for this guest.
+ * @next_secret_idx: positive number if there are more secrets available or zero.
+ * @secrets: Up to 85 UV-secret metadata entries.
+ */
+struct uv_secret_list {
+ u16 num_secr_stored;
+ u16 total_num_secrets;
+ u16 next_secret_idx;
+ u16 reserved_06;
+ u64 reserved_08;
+ struct uv_secret_list_item secrets[85];
+} __packed __aligned(8);
+static_assert(sizeof(struct uv_secret_list) == PAGE_SIZE);
+
static inline int __uv_call(unsigned long r1, unsigned long r2)
{
int cc;
asm volatile(
- " .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=d" (cc)
+ " .insn rrf,0xb9a40000,%[r1],%[r2],0,0\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
: [r1] "a" (r1), [r2] "a" (r2)
- : "memory", "cc");
- return cc;
+ : CC_CLOBBER_LIST("memory"));
+ return CC_TRANSFORM(cc);
}
static inline int uv_call(unsigned long r1, unsigned long r2)
@@ -382,6 +487,48 @@ static inline int uv_cmd_nodata(u64 handle, u16 cmd, u16 *rc, u16 *rrc)
return cc ? -EINVAL : 0;
}
+/**
+ * uv_list_secrets() - Do a List Secrets UVC.
+ *
+ * @buf: Buffer to write list into; size of one page.
+ * @start_idx: The smallest index that should be included in the list.
+ * For the fist invocation use 0.
+ * @rc: Pointer to store the return code or NULL.
+ * @rrc: Pointer to store the return reason code or NULL.
+ *
+ * This function calls the List Secrets UVC. The result is written into `buf`,
+ * that needs to be at least one page of writable memory.
+ * `buf` consists of:
+ * * %struct uv_secret_list_hdr
+ * * %struct uv_secret_list_item (multiple)
+ *
+ * For `start_idx` use _0_ for the first call. If there are more secrets available
+ * but could not fit into the page then `rc` is `UVC_RC_MORE_DATA`.
+ * In this case use `uv_secret_list_hdr.next_secret_idx` for `start_idx`.
+ *
+ * Context: might sleep.
+ *
+ * Return: The UVC condition code.
+ */
+static inline int uv_list_secrets(struct uv_secret_list *buf, u16 start_idx,
+ u16 *rc, u16 *rrc)
+{
+ struct uv_cb_list_secrets uvcb = {
+ .header.len = sizeof(uvcb),
+ .header.cmd = UVC_CMD_LIST_SECRETS,
+ .start_idx = start_idx,
+ .list_addr = (u64)buf,
+ };
+ int cc = uv_call_sched(0, (u64)&uvcb);
+
+ if (rc)
+ *rc = uvcb.header.rc;
+ if (rrc)
+ *rrc = uvcb.header.rrc;
+
+ return cc;
+}
+
struct uv_info {
unsigned long inst_calls_list[4];
unsigned long uv_base_stor_len;
@@ -402,7 +549,8 @@ struct uv_info {
unsigned long supp_add_secret_req_ver;
unsigned long supp_add_secret_pcf;
unsigned long supp_secret_types;
- unsigned short max_secrets;
+ unsigned short max_assoc_secrets;
+ unsigned short max_retr_secrets;
};
extern struct uv_info uv_info;
@@ -468,6 +616,10 @@ static inline int uv_remove_shared(unsigned long addr)
return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS);
}
+int uv_get_secret_metadata(const u8 secret_id[UV_SECRET_ID_LEN],
+ struct uv_secret_list_item_hdr *secret);
+int uv_retrieve_secret(u16 secret_idx, u8 *buf, size_t buf_size);
+
extern int prot_virt_host;
static inline int is_prot_virt_host(void)
diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h
index b11d988..7c364b3 100644
--- a/arch/s390/include/uapi/asm/dasd.h
+++ b/arch/s390/include/uapi/asm/dasd.h
@@ -294,7 +294,7 @@ struct dasd_snid_ioctl_data {
/********************************************************************************
* SECTION: Definition of IOCTLs
*
- * Here ist how the ioctl-nr should be used:
+ * Here is how the ioctl-nr should be used:
* 0 - 31 DASD driver itself
* 32 - 239 still open
* 240 - 255 reserved for EMC
diff --git a/arch/s390/include/uapi/asm/pkey.h b/arch/s390/include/uapi/asm/pkey.h
index 60431d0..ca42e941 100644
--- a/arch/s390/include/uapi/asm/pkey.h
+++ b/arch/s390/include/uapi/asm/pkey.h
@@ -48,21 +48,22 @@
/* the newer ioctls use a pkey_key_type enum for type information */
enum pkey_key_type {
- PKEY_TYPE_CCA_DATA = (__u32) 1,
- PKEY_TYPE_CCA_CIPHER = (__u32) 2,
- PKEY_TYPE_EP11 = (__u32) 3,
- PKEY_TYPE_CCA_ECC = (__u32) 0x1f,
- PKEY_TYPE_EP11_AES = (__u32) 6,
- PKEY_TYPE_EP11_ECC = (__u32) 7,
- PKEY_TYPE_PROTKEY = (__u32) 8,
+ PKEY_TYPE_CCA_DATA = (__u32)1,
+ PKEY_TYPE_CCA_CIPHER = (__u32)2,
+ PKEY_TYPE_EP11 = (__u32)3,
+ PKEY_TYPE_CCA_ECC = (__u32)0x1f,
+ PKEY_TYPE_EP11_AES = (__u32)6,
+ PKEY_TYPE_EP11_ECC = (__u32)7,
+ PKEY_TYPE_PROTKEY = (__u32)8,
+ PKEY_TYPE_UVSECRET = (__u32)9,
};
/* the newer ioctls use a pkey_key_size enum for key size information */
enum pkey_key_size {
- PKEY_SIZE_AES_128 = (__u32) 128,
- PKEY_SIZE_AES_192 = (__u32) 192,
- PKEY_SIZE_AES_256 = (__u32) 256,
- PKEY_SIZE_UNKNOWN = (__u32) 0xFFFFFFFF,
+ PKEY_SIZE_AES_128 = (__u32)128,
+ PKEY_SIZE_AES_192 = (__u32)192,
+ PKEY_SIZE_AES_256 = (__u32)256,
+ PKEY_SIZE_UNKNOWN = (__u32)0xFFFFFFFF,
};
/* some of the newer ioctls use these flags */
@@ -125,6 +126,7 @@ struct pkey_genseck {
__u32 keytype; /* in: key type to generate */
struct pkey_seckey seckey; /* out: the secure key blob */
};
+
#define PKEY_GENSECK _IOWR(PKEY_IOCTL_MAGIC, 0x01, struct pkey_genseck)
/*
@@ -137,6 +139,7 @@ struct pkey_clr2seck {
struct pkey_clrkey clrkey; /* in: the clear key value */
struct pkey_seckey seckey; /* out: the secure key blob */
};
+
#define PKEY_CLR2SECK _IOWR(PKEY_IOCTL_MAGIC, 0x02, struct pkey_clr2seck)
/*
@@ -148,6 +151,7 @@ struct pkey_sec2protk {
struct pkey_seckey seckey; /* in: the secure key blob */
struct pkey_protkey protkey; /* out: the protected key */
};
+
#define PKEY_SEC2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x03, struct pkey_sec2protk)
/*
@@ -158,6 +162,7 @@ struct pkey_clr2protk {
struct pkey_clrkey clrkey; /* in: the clear key value */
struct pkey_protkey protkey; /* out: the protected key */
};
+
#define PKEY_CLR2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x04, struct pkey_clr2protk)
/*
@@ -169,6 +174,7 @@ struct pkey_findcard {
__u16 cardnr; /* out: card number */
__u16 domain; /* out: domain number */
};
+
#define PKEY_FINDCARD _IOWR(PKEY_IOCTL_MAGIC, 0x05, struct pkey_findcard)
/*
@@ -178,6 +184,7 @@ struct pkey_skey2pkey {
struct pkey_seckey seckey; /* in: the secure key blob */
struct pkey_protkey protkey; /* out: the protected key */
};
+
#define PKEY_SKEY2PKEY _IOWR(PKEY_IOCTL_MAGIC, 0x06, struct pkey_skey2pkey)
/*
@@ -195,6 +202,7 @@ struct pkey_verifykey {
__u16 keysize; /* out: key size in bits */
__u32 attributes; /* out: attribute bits */
};
+
#define PKEY_VERIFYKEY _IOWR(PKEY_IOCTL_MAGIC, 0x07, struct pkey_verifykey)
#define PKEY_VERIFY_ATTR_AES 0x00000001 /* key is an AES key */
#define PKEY_VERIFY_ATTR_OLD_MKVP 0x00000100 /* key has old MKVP value */
@@ -226,6 +234,7 @@ struct pkey_kblob2pkey {
__u32 keylen; /* in: the key blob length */
struct pkey_protkey protkey; /* out: the protected key */
};
+
#define PKEY_KBLOB2PROTK _IOWR(PKEY_IOCTL_MAGIC, 0x0A, struct pkey_kblob2pkey)
/*
@@ -258,6 +267,7 @@ struct pkey_genseck2 {
__u32 keylen; /* in: available key blob buffer size */
/* out: actual key blob size */
};
+
#define PKEY_GENSECK2 _IOWR(PKEY_IOCTL_MAGIC, 0x11, struct pkey_genseck2)
/*
@@ -292,6 +302,7 @@ struct pkey_clr2seck2 {
__u32 keylen; /* in: available key blob buffer size */
/* out: actual key blob size */
};
+
#define PKEY_CLR2SECK2 _IOWR(PKEY_IOCTL_MAGIC, 0x12, struct pkey_clr2seck2)
/*
@@ -329,6 +340,7 @@ struct pkey_verifykey2 {
enum pkey_key_size size; /* out: the key size */
__u32 flags; /* out: additional key info flags */
};
+
#define PKEY_VERIFYKEY2 _IOWR(PKEY_IOCTL_MAGIC, 0x17, struct pkey_verifykey2)
/*
@@ -351,6 +363,7 @@ struct pkey_kblob2pkey2 {
__u32 apqn_entries; /* in: # of apqn target list entries */
struct pkey_protkey protkey; /* out: the protected key */
};
+
#define PKEY_KBLOB2PROTK2 _IOWR(PKEY_IOCTL_MAGIC, 0x1A, struct pkey_kblob2pkey2)
/*
@@ -387,6 +400,7 @@ struct pkey_apqns4key {
__u32 apqn_entries; /* in: max # of apqn entries in the list */
/* out: # apqns stored into the list */
};
+
#define PKEY_APQNS4K _IOWR(PKEY_IOCTL_MAGIC, 0x1B, struct pkey_apqns4key)
/*
@@ -426,6 +440,7 @@ struct pkey_apqns4keytype {
__u32 apqn_entries; /* in: max # of apqn entries in the list */
/* out: # apqns stored into the list */
};
+
#define PKEY_APQNS4KT _IOWR(PKEY_IOCTL_MAGIC, 0x1C, struct pkey_apqns4keytype)
/*
@@ -452,6 +467,7 @@ struct pkey_kblob2pkey3 {
__u32 pkeylen; /* in/out: size of pkey buffer/actual len of pkey */
__u8 __user *pkey; /* in: pkey blob buffer space ptr */
};
+
#define PKEY_KBLOB2PROTK3 _IOWR(PKEY_IOCTL_MAGIC, 0x1D, struct pkey_kblob2pkey3)
#endif /* _UAPI_PKEY_H */
diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h
index b9c2f14..4947f26a 100644
--- a/arch/s390/include/uapi/asm/uvdevice.h
+++ b/arch/s390/include/uapi/asm/uvdevice.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
- * Copyright IBM Corp. 2022
+ * Copyright IBM Corp. 2022, 2024
* Author(s): Steffen Eiden <seiden@linux.ibm.com>
*/
#ifndef __S390_ASM_UVDEVICE_H
@@ -52,7 +52,7 @@ struct uvio_uvdev_info {
__u64 supp_uvio_cmds;
/*
* If bit `n` is set, the Ultravisor(UV) supports the UV-call
- * corresponding to the IOCTL with nr `n` in the calling contextx (host
+ * corresponding to the IOCTL with nr `n` in the calling context (host
* or guest). The value is only valid if the corresponding bit in
* @supp_uvio_cmds is set as well.
*/
@@ -71,6 +71,7 @@ struct uvio_uvdev_info {
#define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000
#define UVIO_ADD_SECRET_MAX_LEN 0x100000
#define UVIO_LIST_SECRETS_LEN 0x1000
+#define UVIO_RETR_SECRET_MAX_LEN 0x2000
#define UVIO_DEVICE_NAME "uv"
#define UVIO_TYPE_UVC 'u'
@@ -81,22 +82,25 @@ enum UVIO_IOCTL_NR {
UVIO_IOCTL_ADD_SECRET_NR,
UVIO_IOCTL_LIST_SECRETS_NR,
UVIO_IOCTL_LOCK_SECRETS_NR,
+ UVIO_IOCTL_RETR_SECRET_NR,
/* must be the last entry */
UVIO_IOCTL_NUM_IOCTLS
};
-#define UVIO_IOCTL(nr) _IOWR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb)
-#define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR)
-#define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR)
-#define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR)
-#define UVIO_IOCTL_LIST_SECRETS UVIO_IOCTL(UVIO_IOCTL_LIST_SECRETS_NR)
-#define UVIO_IOCTL_LOCK_SECRETS UVIO_IOCTL(UVIO_IOCTL_LOCK_SECRETS_NR)
+#define UVIO_IOCTL(nr) _IOWR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb)
+#define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR)
+#define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR)
+#define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR)
+#define UVIO_IOCTL_LIST_SECRETS UVIO_IOCTL(UVIO_IOCTL_LIST_SECRETS_NR)
+#define UVIO_IOCTL_LOCK_SECRETS UVIO_IOCTL(UVIO_IOCTL_LOCK_SECRETS_NR)
+#define UVIO_IOCTL_RETR_SECRET UVIO_IOCTL(UVIO_IOCTL_RETR_SECRET_NR)
-#define UVIO_SUPP_CALL(nr) (1ULL << (nr))
-#define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR)
-#define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR)
-#define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR)
-#define UVIO_SUPP_LIST_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LIST_SECRETS_NR)
-#define UVIO_SUPP_LOCK_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LOCK_SECRETS_NR)
+#define UVIO_SUPP_CALL(nr) (1ULL << (nr))
+#define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR)
+#define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR)
+#define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR)
+#define UVIO_SUPP_LIST_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LIST_SECRETS_NR)
+#define UVIO_SUPP_LOCK_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LOCK_SECRETS_NR)
+#define UVIO_SUPP_RETR_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_RETR_SECRET_NR)
#endif /* __S390_ASM_UVDEVICE_H */
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 5529248..1d7ed0f 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -13,7 +13,6 @@
#include <linux/purgatory.h>
#include <linux/pgtable.h>
#include <linux/ftrace.h>
-#include <asm/gmap.h>
#include <asm/stacktrace.h>
int main(void)
@@ -138,7 +137,6 @@ int main(void)
OFFSET(__LC_USER_ASCE, lowcore, user_asce);
OFFSET(__LC_LPP, lowcore, lpp);
OFFSET(__LC_CURRENT_PID, lowcore, current_pid);
- OFFSET(__LC_GMAP, lowcore, gmap);
OFFSET(__LC_LAST_BREAK, lowcore, last_break);
/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
OFFSET(__LC_DUMP_REIPL, lowcore, ipib);
@@ -161,7 +159,6 @@ int main(void)
OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb);
BLANK();
/* gmap/sie offsets */
- OFFSET(__GMAP_ASCE, gmap, asce);
OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c);
OFFSET(__SIE_PROG20, kvm_s390_sie_block, prog20);
/* kexec_sha_region */
diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index b210a29..2f4174b 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -20,6 +20,7 @@
#include <asm/diag.h>
#include <asm/ebcdic.h>
#include <asm/cpcmd.h>
+#include <asm/asm.h>
static DEFINE_SPINLOCK(cpcmd_lock);
static char cpcmd_buf[241];
@@ -45,12 +46,11 @@ static int diag8_response(int cmdlen, char *response, int *rlen)
ry.odd = *rlen;
asm volatile(
" diag %[rx],%[ry],0x8\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=&d" (cc), [ry] "+&d" (ry.pair)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [ry] "+d" (ry.pair)
: [rx] "d" (rx.pair)
- : "cc");
- if (cc)
+ : CC_CLOBBER);
+ if (CC_TRANSFORM(cc))
*rlen += ry.odd;
else
*rlen = ry.odd;
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index edae134..cd0c93a 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -237,6 +237,17 @@ int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from,
prot);
}
+/*
+ * Return true only when in a kdump or stand-alone kdump environment.
+ * Note that /proc/vmcore might also be available in "standard zfcp/nvme dump"
+ * environments, where this function returns false; see dump_available().
+ */
+bool is_kdump_kernel(void)
+{
+ return oldmem_data.start;
+}
+EXPORT_SYMBOL_GPL(is_kdump_kernel);
+
static const char *nt_name(Elf64_Word type)
{
const char *name = "LINUX";
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index e62bea9..b3f2103 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -38,13 +38,13 @@
typedef struct file_private_info {
loff_t offset; /* offset of last read in file */
- int act_area; /* number of last formated area */
+ int act_area; /* number of last formatted area */
int act_page; /* act page in given area */
- int act_entry; /* last formated entry (offset */
+ int act_entry; /* last formatted entry (offset */
/* relative to beginning of last */
- /* formated page) */
+ /* formatted page) */
size_t act_entry_offset; /* up to this offset we copied */
- /* in last read the last formated */
+ /* in last read the last formatted */
/* entry to userland */
char temp_buf[2048]; /* buffer for output */
debug_info_t *debug_info_org; /* original debug information */
@@ -63,7 +63,7 @@ typedef struct {
long args[];
} debug_sprintf_entry_t;
-/* internal function prototyes */
+/* internal function prototypes */
static int debug_init(void);
static ssize_t debug_output(struct file *file, char __user *user_buf,
@@ -380,7 +380,7 @@ static void debug_info_put(debug_info_t *db_info)
/*
* debug_format_entry:
- * - format one debug entry and return size of formated data
+ * - format one debug entry and return size of formatted data
*/
static int debug_format_entry(file_private_info_t *p_info)
{
@@ -449,7 +449,7 @@ static inline int debug_next_entry(file_private_info_t *p_info)
/*
* debug_output:
* - called for user read()
- * - copies formated debug entries to the user buffer
+ * - copies formatted debug entries to the user buffer
*/
static ssize_t debug_output(struct file *file, /* file descriptor */
char __user *user_buf, /* user buffer */
@@ -523,7 +523,7 @@ static ssize_t debug_input(struct file *file, const char __user *user_buf,
/*
* debug_open:
* - called for user open()
- * - copies formated output to private_data area of the file
+ * - copies formatted output to private_data area of the file
* handle
*/
static int debug_open(struct inode *inode, struct file *file)
@@ -1513,7 +1513,7 @@ int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view,
EXPORT_SYMBOL(debug_dflt_header_fn);
/*
- * prints debug data sprintf-formated:
+ * prints debug data sprintf-formatted:
* debug_sprinf_event/exception calls must be used together with this view
*/
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index 007e179..cdd6e31 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -16,6 +16,7 @@
#include <asm/diag.h>
#include <asm/trace/diag.h>
#include <asm/sections.h>
+#include <asm/asm.h>
#include "entry.h"
struct diag_stat {
@@ -307,16 +308,15 @@ EXPORT_SYMBOL(diag26c);
int diag49c(unsigned long subcode)
{
- int rc;
+ int cc;
diag_stat_inc(DIAG_STAT_X49C);
asm volatile(
" diag %[subcode],0,0x49c\n"
- " ipm %[rc]\n"
- " srl %[rc],28\n"
- : [rc] "=d" (rc)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
: [subcode] "d" (subcode)
- : "cc");
- return rc;
+ : CC_CLOBBER);
+ return CC_TRANSFORM(cc);
}
EXPORT_SYMBOL(diag49c);
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index d6d5317..1ff1323 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -222,17 +222,6 @@
lctlg %c1,%c1,__LC_KERNEL_ASCE(%r14) # load primary asce
lg %r14,__LC_CURRENT(%r14)
mvi __TI_sie(%r14),0
-# some program checks are suppressing. C code (e.g. do_protection_exception)
-# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There
-# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
-# Other instructions between __sie64a and .Lsie_done should not cause program
-# interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
-.Lrewind_pad6:
- nopr 7
-.Lrewind_pad4:
- nopr 7
-.Lrewind_pad2:
- nopr 7
SYM_INNER_LABEL(sie_exit, SYM_L_GLOBAL)
lg %r14,__SF_SIE_SAVEAREA(%r15) # load guest register save area
stmg %r0,%r13,0(%r14) # save guest gprs 0-13
@@ -244,15 +233,6 @@
lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers
lg %r2,__SF_SIE_REASON(%r15) # return exit reason code
BR_EX %r14
-.Lsie_fault:
- lghi %r14,-EFAULT
- stg %r14,__SF_SIE_REASON(%r15) # set exit reason code
- j sie_exit
-
- EX_TABLE(.Lrewind_pad6,.Lsie_fault)
- EX_TABLE(.Lrewind_pad4,.Lsie_fault)
- EX_TABLE(.Lrewind_pad2,.Lsie_fault)
- EX_TABLE(sie_exit,.Lsie_fault)
SYM_FUNC_END(__sie64a)
EXPORT_SYMBOL(__sie64a)
EXPORT_SYMBOL(sie_exit)
@@ -327,13 +307,21 @@
GET_LC %r13
stpt __LC_SYS_ENTER_TIMER(%r13)
BPOFF
- lgr %r10,%r15
lmg %r8,%r9,__LC_PGM_OLD_PSW(%r13)
+ xgr %r10,%r10
tmhh %r8,0x0001 # coming from user space?
jno .Lpgm_skip_asce
lctlg %c1,%c1,__LC_KERNEL_ASCE(%r13)
j 3f # -> fault in user space
.Lpgm_skip_asce:
+#if IS_ENABLED(CONFIG_KVM)
+ lg %r11,__LC_CURRENT(%r13)
+ tm __TI_sie(%r11),0xff
+ jz 1f
+ BPENTER __SF_SIE_FLAGS(%r15),_TIF_ISOLATE_BP_GUEST
+ SIEEXIT __SF_SIE_CONTROL(%r15),%r13
+ lghi %r10,_PIF_GUEST_FAULT
+#endif
1: tmhh %r8,0x4000 # PER bit set in old PSW ?
jnz 2f # -> enabled, can't be a double fault
tm __LC_PGM_ILC+3(%r13),0x80 # check for per exception
@@ -344,21 +332,12 @@
CHECK_VMAP_STACK __LC_SAVE_AREA,%r13,4f
3: lg %r15,__LC_KERNEL_STACK(%r13)
4: la %r11,STACK_FRAME_OVERHEAD(%r15)
- xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
+ stg %r10,__PT_FLAGS(%r11)
xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
stmg %r0,%r7,__PT_R0(%r11)
mvc __PT_R8(64,%r11),__LC_SAVE_AREA(%r13)
mvc __PT_LAST_BREAK(8,%r11),__LC_PGM_LAST_BREAK(%r13)
- stctg %c1,%c1,__PT_CR1(%r11)
-#if IS_ENABLED(CONFIG_KVM)
- ltg %r12,__LC_GMAP(%r13)
- jz 5f
- clc __GMAP_ASCE(8,%r12), __PT_CR1(%r11)
- jne 5f
- BPENTER __SF_SIE_FLAGS(%r10),_TIF_ISOLATE_BP_GUEST
- SIEEXIT __SF_SIE_CONTROL(%r10),%r13
-#endif
-5: stmg %r8,%r9,__PT_PSW(%r11)
+ stmg %r8,%r9,__PT_PSW(%r11)
# clear user controlled registers to prevent speculative use
xgr %r0,%r0
xgr %r1,%r1
@@ -367,6 +346,7 @@
xgr %r5,%r5
xgr %r6,%r6
xgr %r7,%r7
+ xgr %r12,%r12
lgr %r2,%r11
brasl %r14,__do_pgm_check
tmhh %r8,0x0001 # returning to user space?
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index f17bb7b..edbb52ce 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -209,7 +209,7 @@ static ssize_t sys_##_prefix##_##_name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, \
char *page) \
{ \
- return scnprintf(page, PAGE_SIZE, _format, ##args); \
+ return sysfs_emit(page, _format, ##args); \
}
#define IPL_ATTR_CCW_STORE_FN(_prefix, _name, _ipl_blk) \
@@ -372,7 +372,7 @@ EXPORT_SYMBOL_GPL(ipl_info);
static ssize_t ipl_type_show(struct kobject *kobj, struct kobj_attribute *attr,
char *page)
{
- return sprintf(page, "%s\n", ipl_type_str(ipl_info.type));
+ return sysfs_emit(page, "%s\n", ipl_type_str(ipl_info.type));
}
static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type);
@@ -380,7 +380,7 @@ static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type);
static ssize_t ipl_secure_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%i\n", !!ipl_secure_flag);
+ return sysfs_emit(page, "%i\n", !!ipl_secure_flag);
}
static struct kobj_attribute sys_ipl_secure_attr =
@@ -389,7 +389,7 @@ static struct kobj_attribute sys_ipl_secure_attr =
static ssize_t ipl_has_secure_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%i\n", !!sclp.has_sipl);
+ return sysfs_emit(page, "%i\n", !!sclp.has_sipl);
}
static struct kobj_attribute sys_ipl_has_secure_attr =
@@ -402,7 +402,7 @@ static ssize_t ipl_vm_parm_show(struct kobject *kobj,
if (ipl_block_valid && (ipl_block.pb0_hdr.pbt == IPL_PBT_CCW))
ipl_block_get_ascii_vmparm(parm, sizeof(parm), &ipl_block);
- return sprintf(page, "%s\n", parm);
+ return sysfs_emit(page, "%s\n", parm);
}
static struct kobj_attribute sys_ipl_vm_parm_attr =
@@ -413,18 +413,18 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj,
{
switch (ipl_info.type) {
case IPL_TYPE_CCW:
- return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid,
- ipl_block.ccw.devno);
+ return sysfs_emit(page, "0.%x.%04x\n", ipl_block.ccw.ssid,
+ ipl_block.ccw.devno);
case IPL_TYPE_ECKD:
case IPL_TYPE_ECKD_DUMP:
- return sprintf(page, "0.%x.%04x\n", ipl_block.eckd.ssid,
- ipl_block.eckd.devno);
+ return sysfs_emit(page, "0.%x.%04x\n", ipl_block.eckd.ssid,
+ ipl_block.eckd.devno);
case IPL_TYPE_FCP:
case IPL_TYPE_FCP_DUMP:
- return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno);
+ return sysfs_emit(page, "0.0.%04x\n", ipl_block.fcp.devno);
case IPL_TYPE_NVME:
case IPL_TYPE_NVME_DUMP:
- return sprintf(page, "%08ux\n", ipl_block.nvme.fid);
+ return sysfs_emit(page, "%08ux\n", ipl_block.nvme.fid);
default:
return 0;
}
@@ -503,12 +503,12 @@ static ssize_t eckd_##_name##_br_chr_show(struct kobject *kobj, \
if (!ipb->br_chr.cyl && \
!ipb->br_chr.head && \
!ipb->br_chr.record) \
- return sprintf(buf, "auto\n"); \
+ return sysfs_emit(buf, "auto\n"); \
\
- return sprintf(buf, "0x%x,0x%x,0x%x\n", \
- ipb->br_chr.cyl, \
- ipb->br_chr.head, \
- ipb->br_chr.record); \
+ return sysfs_emit(buf, "0x%x,0x%x,0x%x\n", \
+ ipb->br_chr.cyl, \
+ ipb->br_chr.head, \
+ ipb->br_chr.record); \
}
#define IPL_ATTR_BR_CHR_STORE_FN(_name, _ipb) \
@@ -573,11 +573,11 @@ static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj,
char loadparm[LOADPARM_LEN + 1] = {};
if (!sclp_ipl_info.is_valid)
- return sprintf(page, "#unknown#\n");
+ return sysfs_emit(page, "#unknown#\n");
memcpy(loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN);
EBCASC(loadparm, LOADPARM_LEN);
strim(loadparm);
- return sprintf(page, "%s\n", loadparm);
+ return sysfs_emit(page, "%s\n", loadparm);
}
static struct kobj_attribute sys_ipl_ccw_loadparm_attr =
@@ -731,7 +731,7 @@ static ssize_t reipl_generic_vmparm_show(struct ipl_parameter_block *ipb,
char vmparm[DIAG308_VMPARM_SIZE + 1] = {};
ipl_block_get_ascii_vmparm(vmparm, sizeof(vmparm), ipb);
- return sprintf(page, "%s\n", vmparm);
+ return sysfs_emit(page, "%s\n", vmparm);
}
static ssize_t reipl_generic_vmparm_store(struct ipl_parameter_block *ipb,
@@ -839,7 +839,7 @@ static ssize_t reipl_generic_loadparm_show(struct ipl_parameter_block *ipb,
char buf[LOADPARM_LEN + 1];
reipl_get_ascii_loadparm(buf, ipb);
- return sprintf(page, "%s\n", buf);
+ return sysfs_emit(page, "%s\n", buf);
}
static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb,
@@ -895,7 +895,7 @@ DEFINE_GENERIC_LOADPARM(eckd);
static ssize_t reipl_fcp_clear_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%u\n", reipl_fcp_clear);
+ return sysfs_emit(page, "%u\n", reipl_fcp_clear);
}
static ssize_t reipl_fcp_clear_store(struct kobject *kobj,
@@ -963,7 +963,7 @@ static struct attribute_group reipl_nvme_attr_group = {
static ssize_t reipl_nvme_clear_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%u\n", reipl_nvme_clear);
+ return sysfs_emit(page, "%u\n", reipl_nvme_clear);
}
static ssize_t reipl_nvme_clear_store(struct kobject *kobj,
@@ -984,7 +984,7 @@ DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw);
static ssize_t reipl_ccw_clear_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%u\n", reipl_ccw_clear);
+ return sysfs_emit(page, "%u\n", reipl_ccw_clear);
}
static ssize_t reipl_ccw_clear_store(struct kobject *kobj,
@@ -1056,7 +1056,7 @@ static struct attribute_group reipl_eckd_attr_group = {
static ssize_t reipl_eckd_clear_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%u\n", reipl_eckd_clear);
+ return sysfs_emit(page, "%u\n", reipl_eckd_clear);
}
static ssize_t reipl_eckd_clear_store(struct kobject *kobj,
@@ -1086,7 +1086,7 @@ static ssize_t reipl_nss_name_show(struct kobject *kobj,
char nss_name[NSS_NAME_SIZE + 1] = {};
reipl_get_ascii_nss_name(nss_name, reipl_block_nss);
- return sprintf(page, "%s\n", nss_name);
+ return sysfs_emit(page, "%s\n", nss_name);
}
static ssize_t reipl_nss_name_store(struct kobject *kobj,
@@ -1171,7 +1171,7 @@ static int reipl_set_type(enum ipl_type type)
static ssize_t reipl_type_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%s\n", ipl_type_str(reipl_type));
+ return sysfs_emit(page, "%s\n", ipl_type_str(reipl_type));
}
static ssize_t reipl_type_store(struct kobject *kobj,
@@ -1692,7 +1692,7 @@ static int dump_set_type(enum dump_type type)
static ssize_t dump_type_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%s\n", dump_type_str(dump_type));
+ return sysfs_emit(page, "%s\n", dump_type_str(dump_type));
}
static ssize_t dump_type_store(struct kobject *kobj,
@@ -1717,6 +1717,24 @@ static ssize_t dump_type_store(struct kobject *kobj,
static struct kobj_attribute dump_type_attr =
__ATTR(dump_type, 0644, dump_type_show, dump_type_store);
+static ssize_t dump_area_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *page)
+{
+ return sysfs_emit(page, "%lu\n", sclp.hsa_size);
+}
+
+static struct kobj_attribute dump_area_size_attr = __ATTR_RO(dump_area_size);
+
+static struct attribute *dump_attrs[] = {
+ &dump_type_attr.attr,
+ &dump_area_size_attr.attr,
+ NULL,
+};
+
+static struct attribute_group dump_attr_group = {
+ .attrs = dump_attrs,
+};
+
static struct kset *dump_kset;
static void diag308_dump(void *dump_block)
@@ -1853,7 +1871,7 @@ static int __init dump_init(void)
dump_kset = kset_create_and_add("dump", NULL, firmware_kobj);
if (!dump_kset)
return -ENOMEM;
- rc = sysfs_create_file(&dump_kset->kobj, &dump_type_attr.attr);
+ rc = sysfs_create_group(&dump_kset->kobj, &dump_attr_group);
if (rc) {
kset_unregister(dump_kset);
return rc;
@@ -2034,7 +2052,7 @@ static struct shutdown_trigger on_reboot_trigger = {ON_REIPL_STR,
static ssize_t on_reboot_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%s\n", on_reboot_trigger.action->name);
+ return sysfs_emit(page, "%s\n", on_reboot_trigger.action->name);
}
static ssize_t on_reboot_store(struct kobject *kobj,
@@ -2060,7 +2078,7 @@ static struct shutdown_trigger on_panic_trigger = {ON_PANIC_STR, &stop_action};
static ssize_t on_panic_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%s\n", on_panic_trigger.action->name);
+ return sysfs_emit(page, "%s\n", on_panic_trigger.action->name);
}
static ssize_t on_panic_store(struct kobject *kobj,
@@ -2086,7 +2104,7 @@ static struct shutdown_trigger on_restart_trigger = {ON_RESTART_STR,
static ssize_t on_restart_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%s\n", on_restart_trigger.action->name);
+ return sysfs_emit(page, "%s\n", on_restart_trigger.action->name);
}
static ssize_t on_restart_store(struct kobject *kobj,
@@ -2122,7 +2140,7 @@ static struct shutdown_trigger on_halt_trigger = {ON_HALT_STR, &stop_action};
static ssize_t on_halt_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%s\n", on_halt_trigger.action->name);
+ return sysfs_emit(page, "%s\n", on_halt_trigger.action->name);
}
static ssize_t on_halt_store(struct kobject *kobj,
@@ -2148,7 +2166,7 @@ static struct shutdown_trigger on_poff_trigger = {ON_POFF_STR, &stop_action};
static ssize_t on_poff_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- return sprintf(page, "%s\n", on_poff_trigger.action->name);
+ return sysfs_emit(page, "%s\n", on_poff_trigger.action->name);
}
static ssize_t on_poff_store(struct kobject *kobj,
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index 2639a3d..24b625c 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -30,6 +30,7 @@
#include <asm/stacktrace.h>
#include <asm/softirq_stack.h>
#include <asm/vtime.h>
+#include <asm/asm.h>
#include "entry.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat);
@@ -129,9 +130,13 @@ static int irq_pending(struct pt_regs *regs)
{
int cc;
- asm volatile("tpi 0\n"
- "ipm %0" : "=d" (cc) : : "cc");
- return cc >> 28;
+ asm volatile(
+ " tpi 0\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
+ :
+ : CC_CLOBBER);
+ return CC_TRANSFORM(cc);
}
void noinstr do_io_irq(struct pt_regs *regs)
diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c
index a951888..5970dd3 100644
--- a/arch/s390/kernel/nospec-sysfs.c
+++ b/arch/s390/kernel/nospec-sysfs.c
@@ -7,17 +7,17 @@
ssize_t cpu_show_spectre_v1(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "Mitigation: __user pointer sanitization\n");
+ return sysfs_emit(buf, "Mitigation: __user pointer sanitization\n");
}
ssize_t cpu_show_spectre_v2(struct device *dev,
struct device_attribute *attr, char *buf)
{
if (test_facility(156))
- return sprintf(buf, "Mitigation: etokens\n");
+ return sysfs_emit(buf, "Mitigation: etokens\n");
if (nospec_uses_trampoline())
- return sprintf(buf, "Mitigation: execute trampolines\n");
+ return sysfs_emit(buf, "Mitigation: execute trampolines\n");
if (nobp_enabled())
- return sprintf(buf, "Mitigation: limited branch prediction\n");
- return sprintf(buf, "Vulnerable\n");
+ return sysfs_emit(buf, "Mitigation: limited branch prediction\n");
+ return sysfs_emit(buf, "Vulnerable\n");
}
diff --git a/arch/s390/kernel/os_info.c b/arch/s390/kernel/os_info.c
index b695f98..29080d6 100644
--- a/arch/s390/kernel/os_info.c
+++ b/arch/s390/kernel/os_info.c
@@ -180,7 +180,7 @@ static void os_info_old_init(void)
}
/*
- * Return pointer to os infor entry and its size
+ * Return pointer to os info entry and its size
*/
void *os_info_old_entry(int nr, unsigned long *size)
{
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index e2e0aa4..b0bc68d 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -835,7 +835,7 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
return validate_ctr_version(hwc->config, set);
}
-/* Events CPU_CYLCES and INSTRUCTIONS can be submitted with two different
+/* Events CPU_CYCLES and INSTRUCTIONS can be submitted with two different
* attribute::type values:
* - PERF_TYPE_HARDWARE:
* - pmu->type:
@@ -879,8 +879,8 @@ static int hw_perf_event_reset(struct perf_event *event)
u64 prev, new;
int err;
+ prev = local64_read(&event->hw.prev_count);
do {
- prev = local64_read(&event->hw.prev_count);
err = ecctr(event->hw.config, &new);
if (err) {
if (err != 3)
@@ -892,7 +892,7 @@ static int hw_perf_event_reset(struct perf_event *event)
*/
new = 0;
}
- } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev);
+ } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, new));
return err;
}
@@ -902,12 +902,12 @@ static void hw_perf_event_update(struct perf_event *event)
u64 prev, new, delta;
int err;
+ prev = local64_read(&event->hw.prev_count);
do {
- prev = local64_read(&event->hw.prev_count);
err = ecctr(event->hw.config, &new);
if (err)
return;
- } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev);
+ } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, new));
delta = (prev <= new) ? new - prev
: (-1ULL - prev) + new + 1; /* overflow */
@@ -1054,7 +1054,7 @@ static void cpumf_pmu_del(struct perf_event *event, int flags)
*
* When a new perf event has been added but not yet started, this can
* clear enable control and resets all counters in a set. Therefore,
- * cpumf_pmu_start() always has to reenable a counter set.
+ * cpumf_pmu_start() always has to re-enable a counter set.
*/
for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i)
if (!atomic_read(&cpuhw->ctr_set[i]))
@@ -1863,7 +1863,7 @@ static const struct attribute_group *cfdiag_attr_groups[] = {
/* Performance monitoring unit for event CF_DIAG. Since this event
* is also started and stopped via the perf_event_open() system call, use
* the same event enable/disable call back functions. They do not
- * have a pointer to the perf_event strcture as first parameter.
+ * have a pointer to the perf_event structure as first parameter.
*
* The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common.
* Reuse them and distinguish the event (always first parameter) via
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 5b765e3..0cde42f 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -404,7 +404,7 @@ static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc)
static void deallocate_buffers(struct cpu_hw_sf *cpuhw)
{
- if (cpuhw->sfb.sdbt)
+ if (sf_buffer_available(cpuhw))
free_sampling_buffer(&cpuhw->sfb);
}
@@ -559,16 +559,15 @@ static void setup_pmc_cpu(void *flags)
{
struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf);
+ sf_disable();
switch (*((int *)flags)) {
case PMC_INIT:
memset(cpuhw, 0, sizeof(*cpuhw));
qsi(&cpuhw->qsi);
cpuhw->flags |= PMU_F_RESERVED;
- sf_disable();
break;
case PMC_RELEASE:
cpuhw->flags &= ~PMU_F_RESERVED;
- sf_disable();
deallocate_buffers(cpuhw);
break;
}
@@ -759,7 +758,6 @@ static int __hw_perf_event_init(struct perf_event *event)
reserve_pmc_hardware();
refcount_set(&num_events, 1);
}
- mutex_unlock(&pmc_reserve_mutex);
event->destroy = hw_perf_event_destroy;
/* Access per-CPU sampling information (query sampling info) */
@@ -818,7 +816,7 @@ static int __hw_perf_event_init(struct perf_event *event)
/* Use AUX buffer. No need to allocate it by ourself */
if (attr->config == PERF_EVENT_CPUM_SF_DIAG)
- return 0;
+ goto out;
/* Allocate the per-CPU sampling buffer using the CPU information
* from the event. If the event is not pinned to a particular
@@ -848,6 +846,7 @@ static int __hw_perf_event_init(struct perf_event *event)
if (is_default_overflow_handler(event))
event->overflow_handler = cpumsf_output_event_pid;
out:
+ mutex_unlock(&pmc_reserve_mutex);
return err;
}
@@ -910,10 +909,14 @@ static void cpumsf_pmu_enable(struct pmu *pmu)
struct hw_perf_event *hwc;
int err;
- if (cpuhw->flags & PMU_F_ENABLED)
- return;
-
- if (cpuhw->flags & PMU_F_ERR_MASK)
+ /*
+ * Event must be
+ * - added/started on this CPU (PMU_F_IN_USE set)
+ * - and CPU must be available (PMU_F_RESERVED set)
+ * - and not already enabled (PMU_F_ENABLED not set)
+ * - and not in error condition (PMU_F_ERR_MASK not set)
+ */
+ if (cpuhw->flags != (PMU_F_IN_USE | PMU_F_RESERVED))
return;
/* Check whether to extent the sampling buffer.
@@ -927,33 +930,27 @@ static void cpumsf_pmu_enable(struct pmu *pmu)
* facility, but it can be fully re-enabled using sampling controls that
* have been saved in cpumsf_pmu_disable().
*/
- if (cpuhw->event) {
- hwc = &cpuhw->event->hw;
- if (!(SAMPL_DIAG_MODE(hwc))) {
- /*
- * Account number of overflow-designated
- * buffer extents
- */
- sfb_account_overflows(cpuhw, hwc);
- extend_sampling_buffer(&cpuhw->sfb, hwc);
- }
- /* Rate may be adjusted with ioctl() */
- cpuhw->lsctl.interval = SAMPL_RATE(hwc);
+ hwc = &cpuhw->event->hw;
+ if (!(SAMPL_DIAG_MODE(hwc))) {
+ /*
+ * Account number of overflow-designated buffer extents
+ */
+ sfb_account_overflows(cpuhw, hwc);
+ extend_sampling_buffer(&cpuhw->sfb, hwc);
}
+ /* Rate may be adjusted with ioctl() */
+ cpuhw->lsctl.interval = SAMPL_RATE(hwc);
/* (Re)enable the PMU and sampling facility */
- cpuhw->flags |= PMU_F_ENABLED;
- barrier();
-
err = lsctl(&cpuhw->lsctl);
if (err) {
- cpuhw->flags &= ~PMU_F_ENABLED;
pr_err("Loading sampling controls failed: op 1 err %i\n", err);
return;
}
/* Load current program parameter */
lpp(&get_lowcore()->lpp);
+ cpuhw->flags |= PMU_F_ENABLED;
}
static void cpumsf_pmu_disable(struct pmu *pmu)
@@ -1191,8 +1188,8 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
static void hw_perf_event_update(struct perf_event *event, int flush_all)
{
unsigned long long event_overflow, sampl_overflow, num_sdb;
- union hws_trailer_header old, prev, new;
struct hw_perf_event *hwc = &event->hw;
+ union hws_trailer_header prev, new;
struct hws_trailer_entry *te;
unsigned long *sdbt, sdb;
int done;
@@ -1236,13 +1233,11 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all)
/* Reset trailer (using compare-double-and-swap) */
prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- old.val = prev.val;
new.val = prev.val;
new.f = 0;
new.a = 1;
new.overflow = 0;
- prev.val = cmpxchg128(&te->header.val, old.val, new.val);
- } while (prev.val != old.val);
+ } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val));
/* Advance to next sample-data-block */
sdbt++;
@@ -1408,16 +1403,15 @@ static int aux_output_begin(struct perf_output_handle *handle,
static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
unsigned long long *overflow)
{
- union hws_trailer_header old, prev, new;
+ union hws_trailer_header prev, new;
struct hws_trailer_entry *te;
te = aux_sdb_trailer(aux, alert_index);
prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- old.val = prev.val;
new.val = prev.val;
- *overflow = old.overflow;
- if (old.f) {
+ *overflow = prev.overflow;
+ if (prev.f) {
/*
* SDB is already set by hardware.
* Abort and try to set somewhere
@@ -1427,8 +1421,7 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
}
new.a = 1;
new.overflow = 0;
- prev.val = cmpxchg128(&te->header.val, old.val, new.val);
- } while (prev.val != old.val);
+ } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val));
return true;
}
@@ -1457,7 +1450,7 @@ static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index,
static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
unsigned long long *overflow)
{
- union hws_trailer_header old, prev, new;
+ union hws_trailer_header prev, new;
unsigned long i, range_scan, idx;
unsigned long long orig_overflow;
struct hws_trailer_entry *te;
@@ -1489,17 +1482,15 @@ static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range,
te = aux_sdb_trailer(aux, idx);
prev.val = READ_ONCE_ALIGNED_128(te->header.val);
do {
- old.val = prev.val;
new.val = prev.val;
- orig_overflow = old.overflow;
+ orig_overflow = prev.overflow;
new.f = 0;
new.overflow = 0;
if (idx == aux->alert_mark)
new.a = 1;
else
new.a = 0;
- prev.val = cmpxchg128(&te->header.val, old.val, new.val);
- } while (prev.val != old.val);
+ } while (!try_cmpxchg128(&te->header.val, &prev.val, new.val));
*overflow += orig_overflow;
}
@@ -1780,7 +1771,9 @@ static void cpumsf_pmu_stop(struct perf_event *event, int flags)
event->hw.state |= PERF_HES_STOPPED;
if ((flags & PERF_EF_UPDATE) && !(event->hw.state & PERF_HES_UPTODATE)) {
- hw_perf_event_update(event, 1);
+ /* CPU hotplug off removes SDBs. No samples to extract. */
+ if (cpuhw->flags & PMU_F_RESERVED)
+ hw_perf_event_update(event, 1);
event->hw.state |= PERF_HES_UPTODATE;
}
perf_pmu_enable(event->pmu);
@@ -1795,7 +1788,7 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags)
if (cpuhw->flags & PMU_F_IN_USE)
return -EAGAIN;
- if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt)
+ if (!SAMPL_DIAG_MODE(&event->hw) && !sf_buffer_available(cpuhw))
return -EINVAL;
perf_pmu_disable(event->pmu);
@@ -1957,13 +1950,12 @@ static void cpumf_measurement_alert(struct ext_code ext_code,
/* Program alert request */
if (alert & CPU_MF_INT_SF_PRA) {
- if (cpuhw->flags & PMU_F_IN_USE)
+ if (cpuhw->flags & PMU_F_IN_USE) {
if (SAMPL_DIAG_MODE(&cpuhw->event->hw))
hw_collect_aux(cpuhw);
else
hw_perf_event_update(cpuhw->event, 0);
- else
- WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE));
+ }
}
/* Report measurement alerts only for non-PRA codes */
@@ -1984,7 +1976,7 @@ static void cpumf_measurement_alert(struct ext_code ext_code,
/* Invalid sampling buffer entry */
if (alert & (CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE)) {
- pr_err("A sampling buffer entry is incorrect (alert=0x%x)\n",
+ pr_err("A sampling buffer entry is incorrect (alert=%#x)\n",
alert);
cpuhw->flags |= PMU_F_ERR_IBE;
sf_disable();
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index 5fff629..0c65eaf 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -228,5 +228,5 @@ ssize_t cpumf_events_sysfs_show(struct device *dev,
struct perf_pmu_events_attr *pmu_attr;
pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
- return sprintf(page, "event=0x%04llx\n", pmu_attr->id);
+ return sysfs_emit(page, "event=0x%04llx\n", pmu_attr->id);
}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 4df56fd..822d8e6 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -574,7 +574,7 @@ int smp_store_status(int cpu)
/*
* Collect CPU state of the previous, crashed system.
- * There are four cases:
+ * There are three cases:
* 1) standard zfcp/nvme dump
* condition: OLDMEM_BASE == NULL && is_ipl_type_dump() == true
* The state for all CPUs except the boot CPU needs to be collected
@@ -587,16 +587,16 @@ int smp_store_status(int cpu)
* with sigp stop-and-store-status. The firmware or the boot-loader
* stored the registers of the boot CPU in the absolute lowcore in the
* memory of the old system.
- * 3) kdump and the old kernel did not store the CPU state,
- * or stand-alone kdump for DASD
- * condition: OLDMEM_BASE != NULL && !is_kdump_kernel()
+ * 3) kdump or stand-alone kdump for DASD
+ * condition: OLDMEM_BASE != NULL && is_ipl_type_dump() == false
* The state for all CPUs except the boot CPU needs to be collected
* with sigp stop-and-store-status. The kexec code or the boot-loader
* stored the registers of the boot CPU in the memory of the old system.
- * 4) kdump and the old kernel stored the CPU state
- * condition: OLDMEM_BASE != NULL && is_kdump_kernel()
- * This case does not exist for s390 anymore, setup_arch explicitly
- * deactivates the elfcorehdr= kernel parameter
+ *
+ * Note that the legacy kdump mode where the old kernel stored the CPU states
+ * does no longer exist: setup_arch() explicitly deactivates the elfcorehdr=
+ * kernel parameter. The is_kdump_kernel() implementation on s390 is independent
+ * of the elfcorehdr= parameter.
*/
static bool dump_available(void)
{
@@ -1011,7 +1011,7 @@ static ssize_t cpu_configure_show(struct device *dev,
ssize_t count;
mutex_lock(&smp_cpu_state_mutex);
- count = sprintf(buf, "%d\n", per_cpu(pcpu_devices, dev->id).state);
+ count = sysfs_emit(buf, "%d\n", per_cpu(pcpu_devices, dev->id).state);
mutex_unlock(&smp_cpu_state_mutex);
return count;
}
@@ -1083,7 +1083,7 @@ static DEVICE_ATTR(configure, 0644, cpu_configure_show, cpu_configure_store);
static ssize_t show_cpu_address(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", per_cpu(pcpu_devices, dev->id).address);
+ return sysfs_emit(buf, "%d\n", per_cpu(pcpu_devices, dev->id).address);
}
static DEVICE_ATTR(address, 0444, show_cpu_address, NULL);
diff --git a/arch/s390/kernel/sthyi.c b/arch/s390/kernel/sthyi.c
index 1cf2ad0..d40f0b9 100644
--- a/arch/s390/kernel/sthyi.c
+++ b/arch/s390/kernel/sthyi.c
@@ -17,6 +17,7 @@
#include <asm/ebcdic.h>
#include <asm/facility.h>
#include <asm/sthyi.h>
+#include <asm/asm.h>
#include "entry.h"
#define DED_WEIGHT 0xffff
@@ -425,13 +426,12 @@ static int sthyi(u64 vaddr, u64 *rc)
asm volatile(
".insn rre,0xB2560000,%[r1],%[r2]\n"
- "ipm %[cc]\n"
- "srl %[cc],28\n"
- : [cc] "=&d" (cc), [r2] "+&d" (r2.pair)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [r2] "+&d" (r2.pair)
: [r1] "d" (r1.pair)
- : "memory", "cc");
+ : CC_CLOBBER_LIST("memory"));
*rc = r2.odd;
- return cc;
+ return CC_TRANSFORM(cc);
}
static int fill_dst(void *dst, u64 *rc)
diff --git a/arch/s390/kernel/syscalls/Makefile b/arch/s390/kernel/syscalls/Makefile
index 1bb78b9..c5d958a 100644
--- a/arch/s390/kernel/syscalls/Makefile
+++ b/arch/s390/kernel/syscalls/Makefile
@@ -12,7 +12,7 @@
uapi-hdrs-y := $(uapi)/unistd_32.h
uapi-hdrs-y += $(uapi)/unistd_64.h
-targets += $(addprefix ../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y))
+targets += $(addprefix ../../../../,$(gen-y) $(kapi-hdrs-y) $(uapi-hdrs-y))
PHONY += kapi uapi
@@ -23,23 +23,26 @@
# Create output directory if not already present
$(shell mkdir -p $(uapi) $(kapi))
-filechk_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$2" < $<
+quiet_cmd_syshdr = SYSHDR $@
+ cmd_syshdr = $(CONFIG_SHELL) '$(systbl)' -H -a $(syshdr_abi_$(basetarget)) -f "$@" < $< > $@
-filechk_sysnr = $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $<
+quiet_cmd_sysnr = SYSNR $@
+ cmd_sysnr = $(CONFIG_SHELL) '$(systbl)' -N -a $(sysnr_abi_$(basetarget)) < $< > $@
-filechk_syscalls = $(CONFIG_SHELL) '$(systbl)' -S < $<
+quiet_cmd_syscalls = SYSTBL $@
+ cmd_syscalls = $(CONFIG_SHELL) '$(systbl)' -S < $< > $@
syshdr_abi_unistd_32 := common,32
-$(uapi)/unistd_32.h: $(syscall) FORCE
- $(call filechk,syshdr,$@)
+$(uapi)/unistd_32.h: $(syscall) $(systbl) FORCE
+ $(call if_changed,syshdr)
syshdr_abi_unistd_64 := common,64
-$(uapi)/unistd_64.h: $(syscall) FORCE
- $(call filechk,syshdr,$@)
+$(uapi)/unistd_64.h: $(syscall) $(systbl) FORCE
+ $(call if_changed,syshdr)
-$(kapi)/syscall_table.h: $(syscall) FORCE
- $(call filechk,syscalls)
+$(kapi)/syscall_table.h: $(syscall) $(systbl) FORCE
+ $(call if_changed,syscalls)
sysnr_abi_unistd_nr := common,32,64
-$(kapi)/unistd_nr.h: $(syscall) FORCE
- $(call filechk,sysnr)
+$(kapi)/unistd_nr.h: $(syscall) $(systbl) FORCE
+ $(call if_changed,sysnr)
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 0107118..e9115b4 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -465,3 +465,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal sys_mseal
+463 common setxattrat sys_setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat sys_removexattrat
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index b713eff..cd02ed7 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -729,8 +729,8 @@ static ssize_t ctn_id_show(struct device *dev,
mutex_lock(&stp_mutex);
if (stpinfo_valid())
- ret = sprintf(buf, "%016lx\n",
- *(unsigned long *) stp_info.ctnid);
+ ret = sysfs_emit(buf, "%016lx\n",
+ *(unsigned long *)stp_info.ctnid);
mutex_unlock(&stp_mutex);
return ret;
}
@@ -745,7 +745,7 @@ static ssize_t ctn_type_show(struct device *dev,
mutex_lock(&stp_mutex);
if (stpinfo_valid())
- ret = sprintf(buf, "%i\n", stp_info.ctn);
+ ret = sysfs_emit(buf, "%i\n", stp_info.ctn);
mutex_unlock(&stp_mutex);
return ret;
}
@@ -760,7 +760,7 @@ static ssize_t dst_offset_show(struct device *dev,
mutex_lock(&stp_mutex);
if (stpinfo_valid() && (stp_info.vbits & 0x2000))
- ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
+ ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.dsto);
mutex_unlock(&stp_mutex);
return ret;
}
@@ -775,7 +775,7 @@ static ssize_t leap_seconds_show(struct device *dev,
mutex_lock(&stp_mutex);
if (stpinfo_valid() && (stp_info.vbits & 0x8000))
- ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
+ ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.leaps);
mutex_unlock(&stp_mutex);
return ret;
}
@@ -801,11 +801,11 @@ static ssize_t leap_seconds_scheduled_show(struct device *dev,
return ret;
if (!stzi.lsoib.p)
- return sprintf(buf, "0,0\n");
+ return sysfs_emit(buf, "0,0\n");
- return sprintf(buf, "%lu,%d\n",
- tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC,
- stzi.lsoib.nlso - stzi.lsoib.also);
+ return sysfs_emit(buf, "%lu,%d\n",
+ tod_to_ns(stzi.lsoib.nlsout - TOD_UNIX_EPOCH) / NSEC_PER_SEC,
+ stzi.lsoib.nlso - stzi.lsoib.also);
}
static DEVICE_ATTR_RO(leap_seconds_scheduled);
@@ -818,7 +818,7 @@ static ssize_t stratum_show(struct device *dev,
mutex_lock(&stp_mutex);
if (stpinfo_valid())
- ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
+ ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.stratum);
mutex_unlock(&stp_mutex);
return ret;
}
@@ -833,7 +833,7 @@ static ssize_t time_offset_show(struct device *dev,
mutex_lock(&stp_mutex);
if (stpinfo_valid() && (stp_info.vbits & 0x0800))
- ret = sprintf(buf, "%i\n", (int) stp_info.tto);
+ ret = sysfs_emit(buf, "%i\n", (int)stp_info.tto);
mutex_unlock(&stp_mutex);
return ret;
}
@@ -848,7 +848,7 @@ static ssize_t time_zone_offset_show(struct device *dev,
mutex_lock(&stp_mutex);
if (stpinfo_valid() && (stp_info.vbits & 0x4000))
- ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
+ ret = sysfs_emit(buf, "%i\n", (int)(s16)stp_info.tzo);
mutex_unlock(&stp_mutex);
return ret;
}
@@ -863,7 +863,7 @@ static ssize_t timing_mode_show(struct device *dev,
mutex_lock(&stp_mutex);
if (stpinfo_valid())
- ret = sprintf(buf, "%i\n", stp_info.tmd);
+ ret = sysfs_emit(buf, "%i\n", stp_info.tmd);
mutex_unlock(&stp_mutex);
return ret;
}
@@ -878,7 +878,7 @@ static ssize_t timing_state_show(struct device *dev,
mutex_lock(&stp_mutex);
if (stpinfo_valid())
- ret = sprintf(buf, "%i\n", stp_info.tst);
+ ret = sysfs_emit(buf, "%i\n", stp_info.tst);
mutex_unlock(&stp_mutex);
return ret;
}
@@ -889,7 +889,7 @@ static ssize_t online_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
- return sprintf(buf, "%i\n", stp_online);
+ return sysfs_emit(buf, "%i\n", stp_online);
}
static ssize_t online_store(struct device *dev,
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 813e5da..4f9c301 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -26,6 +26,7 @@
#include <linux/node.h>
#include <asm/hiperdispatch.h>
#include <asm/sysinfo.h>
+#include <asm/asm.h>
#define PTF_HORIZONTAL (0UL)
#define PTF_VERTICAL (1UL)
@@ -224,15 +225,15 @@ static void topology_update_polarization_simple(void)
static int ptf(unsigned long fc)
{
- int rc;
+ int cc;
asm volatile(
- " .insn rre,0xb9a20000,%1,%1\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (rc)
- : "d" (fc) : "cc");
- return rc;
+ " .insn rre,0xb9a20000,%[fc],%[fc]\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
+ : [fc] "d" (fc)
+ : CC_CLOBBER);
+ return CC_TRANSFORM(cc);
}
int topology_set_cpu_management(int fc)
@@ -412,7 +413,7 @@ static ssize_t dispatching_show(struct device *dev,
ssize_t count;
mutex_lock(&smp_cpu_state_mutex);
- count = sprintf(buf, "%d\n", cpu_management);
+ count = sysfs_emit(buf, "%d\n", cpu_management);
mutex_unlock(&smp_cpu_state_mutex);
return count;
}
@@ -443,19 +444,19 @@ static ssize_t cpu_polarization_show(struct device *dev,
mutex_lock(&smp_cpu_state_mutex);
switch (smp_cpu_get_polarization(cpu)) {
case POLARIZATION_HRZ:
- count = sprintf(buf, "horizontal\n");
+ count = sysfs_emit(buf, "horizontal\n");
break;
case POLARIZATION_VL:
- count = sprintf(buf, "vertical:low\n");
+ count = sysfs_emit(buf, "vertical:low\n");
break;
case POLARIZATION_VM:
- count = sprintf(buf, "vertical:medium\n");
+ count = sysfs_emit(buf, "vertical:medium\n");
break;
case POLARIZATION_VH:
- count = sprintf(buf, "vertical:high\n");
+ count = sysfs_emit(buf, "vertical:high\n");
break;
default:
- count = sprintf(buf, "unknown\n");
+ count = sysfs_emit(buf, "unknown\n");
break;
}
mutex_unlock(&smp_cpu_state_mutex);
@@ -479,7 +480,7 @@ static ssize_t cpu_dedicated_show(struct device *dev,
ssize_t count;
mutex_lock(&smp_cpu_state_mutex);
- count = sprintf(buf, "%d\n", topology_cpu_dedicated(cpu));
+ count = sysfs_emit(buf, "%d\n", topology_cpu_dedicated(cpu));
mutex_unlock(&smp_cpu_state_mutex);
return count;
}
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 160b2ac..24fee11 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -31,6 +31,7 @@
#include <asm/asm-extable.h>
#include <asm/vtime.h>
#include <asm/fpu.h>
+#include <asm/fault.h>
#include "entry.h"
static inline void __user *get_trap_ip(struct pt_regs *regs)
@@ -317,9 +318,24 @@ void noinstr __do_pgm_check(struct pt_regs *regs)
struct lowcore *lc = get_lowcore();
irqentry_state_t state;
unsigned int trapnr;
+ union teid teid;
+ teid.val = lc->trans_exc_code;
regs->int_code = lc->pgm_int_code;
- regs->int_parm_long = lc->trans_exc_code;
+ regs->int_parm_long = teid.val;
+
+ /*
+ * In case of a guest fault, short-circuit the fault handler and return.
+ * This way the sie64a() function will return 0; fault address and
+ * other relevant bits are saved in current->thread.gmap_teid, and
+ * the fault number in current->thread.gmap_int_code. KVM will be
+ * able to use this information to handle the fault.
+ */
+ if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) {
+ current->thread.gmap_teid.val = regs->int_parm_long;
+ current->thread.gmap_int_code = regs->int_code & 0xffff;
+ return;
+ }
state = irqentry_enter(regs);
@@ -408,8 +424,8 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = {
[0x3b] = do_dat_exception,
[0x3c] = default_trap_handler,
[0x3d] = do_secure_storage_access,
- [0x3e] = do_non_secure_storage_access,
- [0x3f] = do_secure_storage_violation,
+ [0x3e] = default_trap_handler,
+ [0x3f] = default_trap_handler,
[0x40] = monitor_event_exception,
[0x41 ... 0x7f] = default_trap_handler,
};
@@ -420,5 +436,3 @@ static void (*pgm_check_table[128])(struct pt_regs *regs) = {
__stringify(default_trap_handler))
COND_TRAP(do_secure_storage_access);
-COND_TRAP(do_non_secure_storage_access);
-COND_TRAP(do_secure_storage_violation);
diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c
index 9646f77..6f9654a 100644
--- a/arch/s390/kernel/uv.c
+++ b/arch/s390/kernel/uv.c
@@ -2,7 +2,7 @@
/*
* Common Ultravisor functions and initialization
*
- * Copyright IBM Corp. 2019, 2020
+ * Copyright IBM Corp. 2019, 2024
*/
#define KMSG_COMPONENT "prot_virt"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
@@ -696,12 +696,32 @@ static struct kobj_attribute uv_query_supp_secret_types_attr =
static ssize_t uv_query_max_secrets(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sysfs_emit(buf, "%d\n", uv_info.max_secrets);
+ return sysfs_emit(buf, "%d\n",
+ uv_info.max_assoc_secrets + uv_info.max_retr_secrets);
}
static struct kobj_attribute uv_query_max_secrets_attr =
__ATTR(max_secrets, 0444, uv_query_max_secrets, NULL);
+static ssize_t uv_query_max_retr_secrets(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", uv_info.max_retr_secrets);
+}
+
+static struct kobj_attribute uv_query_max_retr_secrets_attr =
+ __ATTR(max_retr_secrets, 0444, uv_query_max_retr_secrets, NULL);
+
+static ssize_t uv_query_max_assoc_secrets(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", uv_info.max_assoc_secrets);
+}
+
+static struct kobj_attribute uv_query_max_assoc_secrets_attr =
+ __ATTR(max_assoc_secrets, 0444, uv_query_max_assoc_secrets, NULL);
+
static struct attribute *uv_query_attrs[] = {
&uv_query_facilities_attr.attr,
&uv_query_feature_indications_attr.attr,
@@ -719,13 +739,81 @@ static struct attribute *uv_query_attrs[] = {
&uv_query_supp_add_secret_pcf_attr.attr,
&uv_query_supp_secret_types_attr.attr,
&uv_query_max_secrets_attr.attr,
+ &uv_query_max_assoc_secrets_attr.attr,
+ &uv_query_max_retr_secrets_attr.attr,
NULL,
};
+static inline struct uv_cb_query_keys uv_query_keys(void)
+{
+ struct uv_cb_query_keys uvcb = {
+ .header.cmd = UVC_CMD_QUERY_KEYS,
+ .header.len = sizeof(uvcb)
+ };
+
+ uv_call(0, (uint64_t)&uvcb);
+ return uvcb;
+}
+
+static inline ssize_t emit_hash(struct uv_key_hash *hash, char *buf, int at)
+{
+ return sysfs_emit_at(buf, at, "%016llx%016llx%016llx%016llx\n",
+ hash->dword[0], hash->dword[1], hash->dword[2], hash->dword[3]);
+}
+
+static ssize_t uv_keys_host_key(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct uv_cb_query_keys uvcb = uv_query_keys();
+
+ return emit_hash(&uvcb.key_hashes[UVC_QUERY_KEYS_IDX_HK], buf, 0);
+}
+
+static struct kobj_attribute uv_keys_host_key_attr =
+ __ATTR(host_key, 0444, uv_keys_host_key, NULL);
+
+static ssize_t uv_keys_backup_host_key(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct uv_cb_query_keys uvcb = uv_query_keys();
+
+ return emit_hash(&uvcb.key_hashes[UVC_QUERY_KEYS_IDX_BACK_HK], buf, 0);
+}
+
+static struct kobj_attribute uv_keys_backup_host_key_attr =
+ __ATTR(backup_host_key, 0444, uv_keys_backup_host_key, NULL);
+
+static ssize_t uv_keys_all(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct uv_cb_query_keys uvcb = uv_query_keys();
+ ssize_t len = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(uvcb.key_hashes); i++)
+ len += emit_hash(uvcb.key_hashes + i, buf, len);
+
+ return len;
+}
+
+static struct kobj_attribute uv_keys_all_attr =
+ __ATTR(all, 0444, uv_keys_all, NULL);
+
static struct attribute_group uv_query_attr_group = {
.attrs = uv_query_attrs,
};
+static struct attribute *uv_keys_attrs[] = {
+ &uv_keys_host_key_attr.attr,
+ &uv_keys_backup_host_key_attr.attr,
+ &uv_keys_all_attr.attr,
+ NULL,
+};
+
+static struct attribute_group uv_keys_attr_group = {
+ .attrs = uv_keys_attrs,
+};
+
static ssize_t uv_is_prot_virt_guest(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -751,9 +839,27 @@ static const struct attribute *uv_prot_virt_attrs[] = {
};
static struct kset *uv_query_kset;
+static struct kset *uv_keys_kset;
static struct kobject *uv_kobj;
-static int __init uv_info_init(void)
+static int __init uv_sysfs_dir_init(const struct attribute_group *grp,
+ struct kset **uv_dir_kset, const char *name)
+{
+ struct kset *kset;
+ int rc;
+
+ kset = kset_create_and_add(name, NULL, uv_kobj);
+ if (!kset)
+ return -ENOMEM;
+ *uv_dir_kset = kset;
+
+ rc = sysfs_create_group(&kset->kobj, grp);
+ if (rc)
+ kset_unregister(kset);
+ return rc;
+}
+
+static int __init uv_sysfs_init(void)
{
int rc = -ENOMEM;
@@ -768,17 +874,16 @@ static int __init uv_info_init(void)
if (rc)
goto out_kobj;
- uv_query_kset = kset_create_and_add("query", NULL, uv_kobj);
- if (!uv_query_kset) {
- rc = -ENOMEM;
+ rc = uv_sysfs_dir_init(&uv_query_attr_group, &uv_query_kset, "query");
+ if (rc)
goto out_ind_files;
- }
- rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group);
- if (!rc)
- return 0;
+ /* Get installed key hashes if available, ignore any errors */
+ if (test_bit_inv(BIT_UVC_CMD_QUERY_KEYS, uv_info.inst_calls_list))
+ uv_sysfs_dir_init(&uv_keys_attr_group, &uv_keys_kset, "keys");
- kset_unregister(uv_query_kset);
+ return 0;
+
out_ind_files:
sysfs_remove_files(uv_kobj, uv_prot_virt_attrs);
out_kobj:
@@ -786,4 +891,131 @@ static int __init uv_info_init(void)
kobject_put(uv_kobj);
return rc;
}
-device_initcall(uv_info_init);
+device_initcall(uv_sysfs_init);
+
+/*
+ * Find the secret with the secret_id in the provided list.
+ *
+ * Context: might sleep.
+ */
+static int find_secret_in_page(const u8 secret_id[UV_SECRET_ID_LEN],
+ const struct uv_secret_list *list,
+ struct uv_secret_list_item_hdr *secret)
+{
+ u16 i;
+
+ for (i = 0; i < list->total_num_secrets; i++) {
+ if (memcmp(secret_id, list->secrets[i].id, UV_SECRET_ID_LEN) == 0) {
+ *secret = list->secrets[i].hdr;
+ return 0;
+ }
+ }
+ return -ENOENT;
+}
+
+/*
+ * Do the actual search for `uv_get_secret_metadata`.
+ *
+ * Context: might sleep.
+ */
+static int find_secret(const u8 secret_id[UV_SECRET_ID_LEN],
+ struct uv_secret_list *list,
+ struct uv_secret_list_item_hdr *secret)
+{
+ u16 start_idx = 0;
+ u16 list_rc;
+ int ret;
+
+ do {
+ uv_list_secrets(list, start_idx, &list_rc, NULL);
+ if (list_rc != UVC_RC_EXECUTED && list_rc != UVC_RC_MORE_DATA) {
+ if (list_rc == UVC_RC_INV_CMD)
+ return -ENODEV;
+ else
+ return -EIO;
+ }
+ ret = find_secret_in_page(secret_id, list, secret);
+ if (ret == 0)
+ return ret;
+ start_idx = list->next_secret_idx;
+ } while (list_rc == UVC_RC_MORE_DATA && start_idx < list->next_secret_idx);
+
+ return -ENOENT;
+}
+
+/**
+ * uv_get_secret_metadata() - get secret metadata for a given secret id.
+ * @secret_id: search pattern.
+ * @secret: output data, containing the secret's metadata.
+ *
+ * Search for a secret with the given secret_id in the Ultravisor secret store.
+ *
+ * Context: might sleep.
+ *
+ * Return:
+ * * %0: - Found entry; secret->idx and secret->type are valid.
+ * * %ENOENT - No entry found.
+ * * %ENODEV: - Not supported: UV not available or command not available.
+ * * %EIO: - Other unexpected UV error.
+ */
+int uv_get_secret_metadata(const u8 secret_id[UV_SECRET_ID_LEN],
+ struct uv_secret_list_item_hdr *secret)
+{
+ struct uv_secret_list *buf;
+ int rc;
+
+ buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ rc = find_secret(secret_id, buf, secret);
+ kfree(buf);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(uv_get_secret_metadata);
+
+/**
+ * uv_retrieve_secret() - get the secret value for the secret index.
+ * @secret_idx: Secret index for which the secret should be retrieved.
+ * @buf: Buffer to store retrieved secret.
+ * @buf_size: Size of the buffer. The correct buffer size is reported as part of
+ * the result from `uv_get_secret_metadata`.
+ *
+ * Calls the Retrieve Secret UVC and translates the UV return code into an errno.
+ *
+ * Context: might sleep.
+ *
+ * Return:
+ * * %0 - Entry found; buffer contains a valid secret.
+ * * %ENOENT: - No entry found or secret at the index is non-retrievable.
+ * * %ENODEV: - Not supported: UV not available or command not available.
+ * * %EINVAL: - Buffer too small for content.
+ * * %EIO: - Other unexpected UV error.
+ */
+int uv_retrieve_secret(u16 secret_idx, u8 *buf, size_t buf_size)
+{
+ struct uv_cb_retr_secr uvcb = {
+ .header.len = sizeof(uvcb),
+ .header.cmd = UVC_CMD_RETR_SECRET,
+ .secret_idx = secret_idx,
+ .buf_addr = (u64)buf,
+ .buf_size = buf_size,
+ };
+
+ uv_call_sched(0, (u64)&uvcb);
+
+ switch (uvcb.header.rc) {
+ case UVC_RC_EXECUTED:
+ return 0;
+ case UVC_RC_INV_CMD:
+ return -ENODEV;
+ case UVC_RC_RETR_SECR_STORE_EMPTY:
+ case UVC_RC_RETR_SECR_INV_SECRET:
+ case UVC_RC_RETR_SECR_INV_IDX:
+ return -ENOENT;
+ case UVC_RC_RETR_SECR_BUF_SMALL:
+ return -EINVAL;
+ default:
+ return -EIO;
+ }
+}
+EXPORT_SYMBOL_GPL(uv_retrieve_secret);
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index b163520..5bbaadf 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -367,7 +367,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
reg2, &srcaddr, GACC_FETCH, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
- rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
+ rc = gmap_fault(vcpu->arch.gmap, srcaddr, 0);
if (rc != 0)
return rc;
@@ -376,7 +376,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
reg1, &dstaddr, GACC_STORE, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
- rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
+ rc = gmap_fault(vcpu->arch.gmap, dstaddr, FAULT_FLAG_WRITE);
if (rc != 0)
return rc;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index bb7134f..deeb320 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -43,6 +43,7 @@
#include <asm/sclp.h>
#include <asm/cpacf.h>
#include <asm/timex.h>
+#include <asm/asm.h>
#include <asm/fpu.h>
#include <asm/ap.h>
#include <asm/uv.h>
@@ -340,12 +341,11 @@ static inline int plo_test_bit(unsigned char nr)
" lgr 0,%[function]\n"
/* Parameter registers are ignored for "test bit" */
" plo 0,0,0,0(0)\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (cc)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
: [function] "d" (function)
- : "cc", "0");
- return cc == 0;
+ : CC_CLOBBER_LIST("0"));
+ return CC_TRANSFORM(cc) == 0;
}
static __always_inline void __sortl_query(u8 (*query)[32])
@@ -3719,7 +3719,6 @@ __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- gmap_enable(vcpu->arch.enabled_gmap);
kvm_s390_set_cpuflags(vcpu, CPUSTAT_RUNNING);
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__start_cpu_timer_accounting(vcpu);
@@ -3732,8 +3731,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__stop_cpu_timer_accounting(vcpu);
kvm_s390_clear_cpuflags(vcpu, CPUSTAT_RUNNING);
- vcpu->arch.enabled_gmap = gmap_get_enabled();
- gmap_disable(vcpu->arch.enabled_gmap);
}
@@ -3751,8 +3748,6 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
}
if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
- /* make vcpu_load load the right gmap on the first trigger */
- vcpu->arch.enabled_gmap = vcpu->arch.gmap;
}
static bool kvm_has_pckmo_subfunc(struct kvm *kvm, unsigned long nr)
@@ -4579,22 +4574,6 @@ int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clo
return 1;
}
-/**
- * kvm_arch_fault_in_page - fault-in guest page if necessary
- * @vcpu: The corresponding virtual cpu
- * @gpa: Guest physical address
- * @writable: Whether the page should be writable or not
- *
- * Make sure that a guest page has been faulted-in on the host.
- *
- * Return: Zero on success, negative error code otherwise.
- */
-long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable)
-{
- return gmap_fault(vcpu->arch.gmap, gpa,
- writable ? FAULT_FLAG_WRITE : 0);
-}
-
static void __kvm_inject_pfault_token(struct kvm_vcpu *vcpu, bool start_token,
unsigned long token)
{
@@ -4662,12 +4641,11 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu)
if (!vcpu->arch.gmap->pfault_enabled)
return false;
- hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr));
- hva += current->thread.gmap_addr & ~PAGE_MASK;
+ hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr);
if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8))
return false;
- return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch);
+ return kvm_setup_async_pf(vcpu, current->thread.gmap_teid.addr * PAGE_SIZE, hva, &arch);
}
static int vcpu_pre_run(struct kvm_vcpu *vcpu)
@@ -4705,6 +4683,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.gisa_int.kicked_mask);
vcpu->arch.sie_block->icptcode = 0;
+ current->thread.gmap_int_code = 0;
cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags);
VCPU_EVENT(vcpu, 6, "entering sie flags %x", cpuflags);
trace_kvm_s390_sie_enter(vcpu, cpuflags);
@@ -4712,7 +4691,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
return 0;
}
-static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
+static int vcpu_post_run_addressing_exception(struct kvm_vcpu *vcpu)
{
struct kvm_s390_pgm_info pgm_info = {
.code = PGM_ADDRESSING,
@@ -4748,10 +4727,106 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
}
+static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu)
+{
+ unsigned int flags = 0;
+ unsigned long gaddr;
+ int rc = 0;
+
+ gaddr = current->thread.gmap_teid.addr * PAGE_SIZE;
+ if (kvm_s390_cur_gmap_fault_is_write())
+ flags = FAULT_FLAG_WRITE;
+
+ switch (current->thread.gmap_int_code & PGM_INT_CODE_MASK) {
+ case 0:
+ vcpu->stat.exit_null++;
+ break;
+ case PGM_NON_SECURE_STORAGE_ACCESS:
+ KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm,
+ "Unexpected program interrupt 0x%x, TEID 0x%016lx",
+ current->thread.gmap_int_code, current->thread.gmap_teid.val);
+ /*
+ * This is normal operation; a page belonging to a protected
+ * guest has not been imported yet. Try to import the page into
+ * the protected guest.
+ */
+ if (gmap_convert_to_secure(vcpu->arch.gmap, gaddr) == -EINVAL)
+ send_sig(SIGSEGV, current, 0);
+ break;
+ case PGM_SECURE_STORAGE_ACCESS:
+ case PGM_SECURE_STORAGE_VIOLATION:
+ KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm,
+ "Unexpected program interrupt 0x%x, TEID 0x%016lx",
+ current->thread.gmap_int_code, current->thread.gmap_teid.val);
+ /*
+ * This can happen after a reboot with asynchronous teardown;
+ * the new guest (normal or protected) will run on top of the
+ * previous protected guest. The old pages need to be destroyed
+ * so the new guest can use them.
+ */
+ if (gmap_destroy_page(vcpu->arch.gmap, gaddr)) {
+ /*
+ * Either KVM messed up the secure guest mapping or the
+ * same page is mapped into multiple secure guests.
+ *
+ * This exception is only triggered when a guest 2 is
+ * running and can therefore never occur in kernel
+ * context.
+ */
+ pr_warn_ratelimited("Secure storage violation (%x) in task: %s, pid %d\n",
+ current->thread.gmap_int_code, current->comm,
+ current->pid);
+ send_sig(SIGSEGV, current, 0);
+ }
+ break;
+ case PGM_PROTECTION:
+ case PGM_SEGMENT_TRANSLATION:
+ case PGM_PAGE_TRANSLATION:
+ case PGM_ASCE_TYPE:
+ case PGM_REGION_FIRST_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_THIRD_TRANS:
+ KVM_BUG(current->thread.gmap_teid.as != PSW_BITS_AS_PRIMARY, vcpu->kvm,
+ "Unexpected program interrupt 0x%x, TEID 0x%016lx",
+ current->thread.gmap_int_code, current->thread.gmap_teid.val);
+ if (vcpu->arch.gmap->pfault_enabled) {
+ rc = gmap_fault(vcpu->arch.gmap, gaddr, flags | FAULT_FLAG_RETRY_NOWAIT);
+ if (rc == -EFAULT)
+ return vcpu_post_run_addressing_exception(vcpu);
+ if (rc == -EAGAIN) {
+ trace_kvm_s390_major_guest_pfault(vcpu);
+ if (kvm_arch_setup_async_pf(vcpu))
+ return 0;
+ vcpu->stat.pfault_sync++;
+ } else {
+ return rc;
+ }
+ }
+ rc = gmap_fault(vcpu->arch.gmap, gaddr, flags);
+ if (rc == -EFAULT) {
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
+ vcpu->run->s390_ucontrol.trans_exc_code = gaddr;
+ vcpu->run->s390_ucontrol.pgm_code = 0x10;
+ return -EREMOTE;
+ }
+ return vcpu_post_run_addressing_exception(vcpu);
+ }
+ break;
+ default:
+ KVM_BUG(1, vcpu->kvm, "Unexpected program interrupt 0x%x, TEID 0x%016lx",
+ current->thread.gmap_int_code, current->thread.gmap_teid.val);
+ send_sig(SIGSEGV, current, 0);
+ break;
+ }
+ return rc;
+}
+
static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
{
struct mcck_volatile_info *mcck_info;
struct sie_page *sie_page;
+ int rc;
VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
vcpu->arch.sie_block->icptcode);
@@ -4773,7 +4848,7 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
}
if (vcpu->arch.sie_block->icptcode > 0) {
- int rc = kvm_handle_sie_intercept(vcpu);
+ rc = kvm_handle_sie_intercept(vcpu);
if (rc != -EOPNOTSUPP)
return rc;
@@ -4782,24 +4857,9 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
vcpu->run->s390_sieic.ipa = vcpu->arch.sie_block->ipa;
vcpu->run->s390_sieic.ipb = vcpu->arch.sie_block->ipb;
return -EREMOTE;
- } else if (exit_reason != -EFAULT) {
- vcpu->stat.exit_null++;
- return 0;
- } else if (kvm_is_ucontrol(vcpu->kvm)) {
- vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL;
- vcpu->run->s390_ucontrol.trans_exc_code =
- current->thread.gmap_addr;
- vcpu->run->s390_ucontrol.pgm_code = 0x10;
- return -EREMOTE;
- } else if (current->thread.gmap_pfault) {
- trace_kvm_s390_major_guest_pfault(vcpu);
- current->thread.gmap_pfault = 0;
- if (kvm_arch_setup_async_pf(vcpu))
- return 0;
- vcpu->stat.pfault_sync++;
- return kvm_arch_fault_in_page(vcpu, current->thread.gmap_addr, 1);
}
- return vcpu_post_run_fault_in_sie(vcpu);
+
+ return vcpu_post_run_handle_fault(vcpu);
}
#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK)
@@ -4835,7 +4895,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
}
exit_reason = sie64a(vcpu->arch.sie_block,
vcpu->run->s.regs.gprs,
- gmap_get_enabled()->asce);
+ vcpu->arch.gmap->asce);
if (kvm_s390_pv_cpu_is_protected(vcpu)) {
memcpy(vcpu->run->s.regs.gprs,
sie_page->pv_grregs,
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index e680c6b..597d7a7 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -394,7 +394,6 @@ int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
/* implemented in kvm-s390.c */
int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod);
-long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
@@ -529,6 +528,13 @@ static inline int kvm_s390_use_sca_entries(void)
void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
struct mcck_volatile_info *mcck_info);
+static inline bool kvm_s390_cur_gmap_fault_is_write(void)
+{
+ if (current->thread.gmap_int_code == PGM_PROTECTION)
+ return true;
+ return test_facility(75) && (current->thread.gmap_teid.fsi == TEID_FSI_STORE);
+}
+
/**
* kvm_s390_vcpu_crypto_reset_all
*
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
index ffa7739..a61518b 100644
--- a/arch/s390/kvm/pci.c
+++ b/arch/s390/kvm/pci.c
@@ -103,7 +103,7 @@ static int zpci_reset_aipb(u8 nisc)
/*
* AEN registration can only happen once per system boot. If
* an aipb already exists then AEN was already registered and
- * we can re-use the aipb contents. This can only happen if
+ * we can reuse the aipb contents. This can only happen if
* the KVM module was removed and re-inserted. However, we must
* ensure that the same forwarding ISC is used as this is assigned
* during KVM module load.
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 89cafea..d3cdde1 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -922,19 +922,19 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
int rc;
- if (current->thread.gmap_int_code == PGM_PROTECTION)
+ if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION)
/* we can directly forward all protection exceptions */
return inject_fault(vcpu, PGM_PROTECTION,
- current->thread.gmap_addr, 1);
+ current->thread.gmap_teid.addr * PAGE_SIZE, 1);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
- current->thread.gmap_addr, NULL);
+ current->thread.gmap_teid.addr * PAGE_SIZE, NULL);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
- current->thread.gmap_addr,
- current->thread.gmap_write_flag);
+ current->thread.gmap_teid.addr * PAGE_SIZE,
+ kvm_s390_cur_gmap_fault_is_write());
if (rc >= 0)
- vsie_page->fault_addr = current->thread.gmap_addr;
+ vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE;
}
return rc;
}
@@ -1148,9 +1148,10 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
* also kick the vSIE.
*/
vcpu->arch.sie_block->prog0c |= PROG_IN_SIE;
+ current->thread.gmap_int_code = 0;
barrier();
if (!kvm_s390_vcpu_sie_inhibited(vcpu))
- rc = sie64a(scb_s, vcpu->run->s.regs.gprs, gmap_get_enabled()->asce);
+ rc = sie64a(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
barrier();
vcpu->arch.sie_block->prog0c &= ~PROG_IN_SIE;
@@ -1172,7 +1173,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (rc > 0)
rc = 0; /* we could still have an icpt */
- else if (rc == -EFAULT)
+ else if (current->thread.gmap_int_code)
return handle_fault(vcpu, vsie_page);
switch (scb_s->icptcode) {
@@ -1295,10 +1296,8 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
if (!rc)
rc = map_prefix(vcpu, vsie_page);
if (!rc) {
- gmap_enable(vsie_page->gmap);
update_intervention_requests(vsie_page);
rc = do_vsie_run(vcpu, vsie_page);
- gmap_enable(vcpu->arch.gmap);
}
atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index 9f86ad8..09d7350 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -127,8 +127,8 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp)
node_id = node->node_id;
/* Enqueue the node for this CPU in the spinlock wait queue */
+ old = READ_ONCE(lp->lock);
while (1) {
- old = READ_ONCE(lp->lock);
if ((old & _Q_LOCK_CPU_MASK) == 0 &&
(old & _Q_LOCK_STEAL_MASK) != _Q_LOCK_STEAL_MASK) {
/*
@@ -139,7 +139,7 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp)
* waiter will get the lock.
*/
new = (old ? (old + _Q_LOCK_STEAL_ADD) : 0) | lockval;
- if (__atomic_cmpxchg_bool(&lp->lock, old, new))
+ if (arch_try_cmpxchg(&lp->lock, &old, new))
/* Got the lock */
goto out;
/* lock passing in progress */
@@ -147,7 +147,7 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp)
}
/* Make the node of this CPU the new tail. */
new = node_id | (old & _Q_LOCK_MASK);
- if (__atomic_cmpxchg_bool(&lp->lock, old, new))
+ if (arch_try_cmpxchg(&lp->lock, &old, new))
break;
}
/* Set the 'next' pointer of the tail node in the queue */
@@ -184,7 +184,7 @@ static inline void arch_spin_lock_queued(arch_spinlock_t *lp)
if (!owner) {
tail_id = old & _Q_TAIL_MASK;
new = ((tail_id != node_id) ? tail_id : 0) | lockval;
- if (__atomic_cmpxchg_bool(&lp->lock, old, new))
+ if (arch_try_cmpxchg(&lp->lock, &old, new))
/* Got the lock */
break;
continue;
@@ -258,7 +258,7 @@ int arch_spin_trylock_retry(arch_spinlock_t *lp)
owner = READ_ONCE(lp->lock);
/* Try to get the lock if it is free. */
if (!owner) {
- if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu))
+ if (arch_try_cmpxchg(&lp->lock, &owner, cpu))
return 1;
}
}
@@ -300,7 +300,7 @@ void arch_write_lock_wait(arch_rwlock_t *rw)
while (1) {
old = READ_ONCE(rw->cnts);
if ((old & 0x1ffff) == 0 &&
- __atomic_cmpxchg_bool(&rw->cnts, old, old | 0x10000))
+ arch_try_cmpxchg(&rw->cnts, &old, old | 0x10000))
/* Got the lock */
break;
barrier();
diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c
index 7d87418..373fa1f 100644
--- a/arch/s390/lib/string.c
+++ b/arch/s390/lib/string.c
@@ -15,6 +15,7 @@
#include <linux/types.h>
#include <linux/string.h>
#include <linux/export.h>
+#include <asm/asm.h>
/*
* Helper functions to find the end of a string
@@ -238,12 +239,11 @@ static inline int clcle(const char *s1, unsigned long l1,
asm volatile(
"0: clcle %[r1],%[r3],0\n"
" jo 0b\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=&d" (cc), [r1] "+&d" (r1.pair), [r3] "+&d" (r3.pair)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [r1] "+d" (r1.pair), [r3] "+d" (r3.pair)
:
- : "cc", "memory");
- return cc;
+ : CC_CLOBBER_LIST("memory"));
+ return CC_TRANSFORM(cc);
}
/**
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index 282fefe..4692136 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -28,6 +28,7 @@
#include <asm/extmem.h>
#include <asm/cpcmd.h>
#include <asm/setup.h>
+#include <asm/asm.h>
#define DCSS_PURGESEG 0x08
#define DCSS_LOADSHRX 0x20
@@ -134,20 +135,21 @@ dcss_diag(int *func, void *parameter,
unsigned long *ret1, unsigned long *ret2)
{
unsigned long rx, ry;
- int rc;
+ int cc;
rx = virt_to_phys(parameter);
ry = (unsigned long) *func;
diag_stat_inc(DIAG_STAT_X064);
asm volatile(
- " diag %0,%1,0x64\n"
- " ipm %2\n"
- " srl %2,28\n"
- : "+d" (rx), "+d" (ry), "=d" (rc) : : "cc");
+ " diag %[rx],%[ry],0x64\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [rx] "+d" (rx), [ry] "+d" (ry)
+ :
+ : CC_CLOBBER);
*ret1 = rx;
*ret2 = ry;
- return rc;
+ return CC_TRANSFORM(cc);
}
static inline int
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index ad8b0d6..94cb2e0 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -46,12 +46,6 @@
#include <asm/uv.h>
#include "../kernel/entry.h"
-enum fault_type {
- KERNEL_FAULT,
- USER_FAULT,
- GMAP_FAULT,
-};
-
static DEFINE_STATIC_KEY_FALSE(have_store_indication);
static int __init fault_init(void)
@@ -65,28 +59,15 @@ early_initcall(fault_init);
/*
* Find out which address space caused the exception.
*/
-static enum fault_type get_fault_type(struct pt_regs *regs)
+static bool is_kernel_fault(struct pt_regs *regs)
{
union teid teid = { .val = regs->int_parm_long };
- struct gmap *gmap;
- if (likely(teid.as == PSW_BITS_AS_PRIMARY)) {
- if (user_mode(regs))
- return USER_FAULT;
- if (!IS_ENABLED(CONFIG_PGSTE))
- return KERNEL_FAULT;
- gmap = (struct gmap *)get_lowcore()->gmap;
- if (gmap && gmap->asce == regs->cr1)
- return GMAP_FAULT;
- return KERNEL_FAULT;
- }
+ if (user_mode(regs))
+ return false;
if (teid.as == PSW_BITS_AS_SECONDARY)
- return USER_FAULT;
- /* Access register mode, not used in the kernel */
- if (teid.as == PSW_BITS_AS_ACCREG)
- return USER_FAULT;
- /* Home space -> access via kernel ASCE */
- return KERNEL_FAULT;
+ return false;
+ return true;
}
static unsigned long get_fault_address(struct pt_regs *regs)
@@ -181,21 +162,12 @@ static void dump_fault_info(struct pt_regs *regs)
break;
}
pr_cont("mode while using ");
- switch (get_fault_type(regs)) {
- case USER_FAULT:
- asce = get_lowcore()->user_asce.val;
- pr_cont("user ");
- break;
- case GMAP_FAULT:
- asce = ((struct gmap *)get_lowcore()->gmap)->asce;
- pr_cont("gmap ");
- break;
- case KERNEL_FAULT:
+ if (is_kernel_fault(regs)) {
asce = get_lowcore()->kernel_asce.val;
pr_cont("kernel ");
- break;
- default:
- unreachable();
+ } else {
+ asce = get_lowcore()->user_asce.val;
+ pr_cont("user ");
}
pr_cont("ASCE.\n");
dump_pagetable(asce, get_fault_address(regs));
@@ -230,7 +202,6 @@ static void do_sigsegv(struct pt_regs *regs, int si_code)
static void handle_fault_error_nolock(struct pt_regs *regs, int si_code)
{
- enum fault_type fault_type;
unsigned long address;
bool is_write;
@@ -241,17 +212,15 @@ static void handle_fault_error_nolock(struct pt_regs *regs, int si_code)
}
if (fixup_exception(regs))
return;
- fault_type = get_fault_type(regs);
- if (fault_type == KERNEL_FAULT) {
+ if (is_kernel_fault(regs)) {
address = get_fault_address(regs);
is_write = fault_is_write(regs);
if (kfence_handle_page_fault(address, is_write, regs))
return;
- }
- if (fault_type == KERNEL_FAULT)
pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n");
- else
+ } else {
pr_alert("Unable to handle kernel paging request in virtual user address space\n");
+ }
dump_fault_info(regs);
die(regs, "Oops");
}
@@ -285,9 +254,7 @@ static void do_exception(struct pt_regs *regs, int access)
struct vm_area_struct *vma;
unsigned long address;
struct mm_struct *mm;
- enum fault_type type;
unsigned int flags;
- struct gmap *gmap;
vm_fault_t fault;
bool is_write;
@@ -301,16 +268,8 @@ static void do_exception(struct pt_regs *regs, int access)
mm = current->mm;
address = get_fault_address(regs);
is_write = fault_is_write(regs);
- type = get_fault_type(regs);
- switch (type) {
- case KERNEL_FAULT:
+ if (is_kernel_fault(regs) || faulthandler_disabled() || !mm)
return handle_fault_error_nolock(regs, 0);
- case USER_FAULT:
- case GMAP_FAULT:
- if (faulthandler_disabled() || !mm)
- return handle_fault_error_nolock(regs, 0);
- break;
- }
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
@@ -334,14 +293,11 @@ static void do_exception(struct pt_regs *regs, int access)
vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
- if (unlikely(fault & VM_FAULT_ERROR))
- goto error;
- return;
+ goto done;
}
count_vm_vma_lock_event(VMA_LOCK_RETRY);
if (fault & VM_FAULT_MAJOR)
flags |= FAULT_FLAG_TRIED;
-
/* Quick path to respond to signals */
if (fault_signal_pending(fault, regs)) {
if (!user_mode(regs))
@@ -349,81 +305,29 @@ static void do_exception(struct pt_regs *regs, int access)
return;
}
lock_mmap:
- mmap_read_lock(mm);
- gmap = NULL;
- if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
- gmap = (struct gmap *)get_lowcore()->gmap;
- current->thread.gmap_addr = address;
- current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
- current->thread.gmap_int_code = regs->int_code & 0xffff;
- address = __gmap_translate(gmap, address);
- if (address == -EFAULT)
- return handle_fault_error(regs, SEGV_MAPERR);
- if (gmap->pfault_enabled)
- flags |= FAULT_FLAG_RETRY_NOWAIT;
- }
retry:
- vma = find_vma(mm, address);
+ vma = lock_mm_and_find_vma(mm, address, regs);
if (!vma)
- return handle_fault_error(regs, SEGV_MAPERR);
- if (unlikely(vma->vm_start > address)) {
- if (!(vma->vm_flags & VM_GROWSDOWN))
- return handle_fault_error(regs, SEGV_MAPERR);
- vma = expand_stack(mm, address);
- if (!vma)
- return handle_fault_error_nolock(regs, SEGV_MAPERR);
- }
+ return handle_fault_error_nolock(regs, SEGV_MAPERR);
if (unlikely(!(vma->vm_flags & access)))
return handle_fault_error(regs, SEGV_ACCERR);
fault = handle_mm_fault(vma, address, flags, regs);
if (fault_signal_pending(fault, regs)) {
- if (flags & FAULT_FLAG_RETRY_NOWAIT)
- mmap_read_unlock(mm);
if (!user_mode(regs))
handle_fault_error_nolock(regs, 0);
return;
}
/* The fault is fully completed (including releasing mmap lock) */
- if (fault & VM_FAULT_COMPLETED) {
- if (gmap) {
- mmap_read_lock(mm);
- goto gmap;
- }
+ if (fault & VM_FAULT_COMPLETED)
return;
- }
- if (unlikely(fault & VM_FAULT_ERROR)) {
- mmap_read_unlock(mm);
- goto error;
- }
if (fault & VM_FAULT_RETRY) {
- if (IS_ENABLED(CONFIG_PGSTE) && gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /*
- * FAULT_FLAG_RETRY_NOWAIT has been set,
- * mmap_lock has not been released
- */
- current->thread.gmap_pfault = 1;
- return handle_fault_error(regs, 0);
- }
- flags &= ~FAULT_FLAG_RETRY_NOWAIT;
flags |= FAULT_FLAG_TRIED;
- mmap_read_lock(mm);
goto retry;
}
-gmap:
- if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
- address = __gmap_link(gmap, current->thread.gmap_addr,
- address);
- if (address == -EFAULT)
- return handle_fault_error(regs, SEGV_MAPERR);
- if (address == -ENOMEM) {
- fault = VM_FAULT_OOM;
- mmap_read_unlock(mm);
- goto error;
- }
- }
mmap_read_unlock(mm);
- return;
-error:
+done:
+ if (!(fault & VM_FAULT_ERROR))
+ return;
if (fault & VM_FAULT_OOM) {
if (!user_mode(regs))
handle_fault_error_nolock(regs, 0);
@@ -496,7 +400,6 @@ void do_secure_storage_access(struct pt_regs *regs)
struct folio_walk fw;
struct mm_struct *mm;
struct folio *folio;
- struct gmap *gmap;
int rc;
/*
@@ -521,17 +424,15 @@ void do_secure_storage_access(struct pt_regs *regs)
*/
panic("Unexpected PGM 0x3d with TEID bit 61=0");
}
- switch (get_fault_type(regs)) {
- case GMAP_FAULT:
- mm = current->mm;
- gmap = (struct gmap *)get_lowcore()->gmap;
- mmap_read_lock(mm);
- addr = __gmap_translate(gmap, addr);
- mmap_read_unlock(mm);
- if (IS_ERR_VALUE(addr))
- return handle_fault_error_nolock(regs, SEGV_MAPERR);
- fallthrough;
- case USER_FAULT:
+ if (is_kernel_fault(regs)) {
+ folio = phys_to_folio(addr);
+ if (unlikely(!folio_try_get(folio)))
+ return;
+ rc = arch_make_folio_accessible(folio);
+ folio_put(folio);
+ if (rc)
+ BUG();
+ } else {
mm = current->mm;
mmap_read_lock(mm);
vma = find_vma(mm, addr);
@@ -540,7 +441,7 @@ void do_secure_storage_access(struct pt_regs *regs)
folio = folio_walk_start(&fw, vma, addr, 0);
if (!folio) {
mmap_read_unlock(mm);
- break;
+ return;
}
/* arch_make_folio_accessible() needs a raised refcount. */
folio_get(folio);
@@ -550,56 +451,8 @@ void do_secure_storage_access(struct pt_regs *regs)
if (rc)
send_sig(SIGSEGV, current, 0);
mmap_read_unlock(mm);
- break;
- case KERNEL_FAULT:
- folio = phys_to_folio(addr);
- if (unlikely(!folio_try_get(folio)))
- break;
- rc = arch_make_folio_accessible(folio);
- folio_put(folio);
- if (rc)
- BUG();
- break;
- default:
- unreachable();
}
}
NOKPROBE_SYMBOL(do_secure_storage_access);
-void do_non_secure_storage_access(struct pt_regs *regs)
-{
- struct gmap *gmap = (struct gmap *)get_lowcore()->gmap;
- unsigned long gaddr = get_fault_address(regs);
-
- if (WARN_ON_ONCE(get_fault_type(regs) != GMAP_FAULT))
- return handle_fault_error_nolock(regs, SEGV_MAPERR);
- if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
- send_sig(SIGSEGV, current, 0);
-}
-NOKPROBE_SYMBOL(do_non_secure_storage_access);
-
-void do_secure_storage_violation(struct pt_regs *regs)
-{
- struct gmap *gmap = (struct gmap *)get_lowcore()->gmap;
- unsigned long gaddr = get_fault_address(regs);
-
- /*
- * If the VM has been rebooted, its address space might still contain
- * secure pages from the previous boot.
- * Clear the page so it can be reused.
- */
- if (!gmap_destroy_page(gmap, gaddr))
- return;
- /*
- * Either KVM messed up the secure guest mapping or the same
- * page is mapped into multiple secure guests.
- *
- * This exception is only triggered when a guest 2 is running
- * and can therefore never occur in kernel context.
- */
- pr_warn_ratelimited("Secure storage violation in task: %s, pid %d\n",
- current->comm, current->pid);
- send_sig(SIGSEGV, current, 0);
-}
-
#endif /* CONFIG_PGSTE */
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index eb0b51a..3296826 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -281,37 +281,6 @@ void gmap_remove(struct gmap *gmap)
}
EXPORT_SYMBOL_GPL(gmap_remove);
-/**
- * gmap_enable - switch primary space to the guest address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_enable(struct gmap *gmap)
-{
- get_lowcore()->gmap = (unsigned long)gmap;
-}
-EXPORT_SYMBOL_GPL(gmap_enable);
-
-/**
- * gmap_disable - switch back to the standard primary address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_disable(struct gmap *gmap)
-{
- get_lowcore()->gmap = 0UL;
-}
-EXPORT_SYMBOL_GPL(gmap_disable);
-
-/**
- * gmap_get_enabled - get a pointer to the currently enabled gmap
- *
- * Returns a pointer to the currently enabled gmap. 0 if none is enabled.
- */
-struct gmap *gmap_get_enabled(void)
-{
- return (struct gmap *)get_lowcore()->gmap;
-}
-EXPORT_SYMBOL_GPL(gmap_get_enabled);
-
/*
* gmap_alloc_table is assumed to be called with mmap_lock held
*/
@@ -637,44 +606,124 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
}
/**
+ * fixup_user_fault_nowait - manually resolve a user page fault without waiting
+ * @mm: mm_struct of target mm
+ * @address: user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ * @unlocked: did we unlock the mmap_lock while retrying
+ *
+ * This function behaves similarly to fixup_user_fault(), but it guarantees
+ * that the fault will be resolved without waiting. The function might drop
+ * and re-acquire the mm lock, in which case @unlocked will be set to true.
+ *
+ * The guarantee is that the fault is handled without waiting, but the
+ * function itself might sleep, due to the lock.
+ *
+ * Context: Needs to be called with mm->mmap_lock held in read mode, and will
+ * return with the lock held in read mode; @unlocked will indicate whether
+ * the lock has been dropped and re-acquired. This is the same behaviour as
+ * fixup_user_fault().
+ *
+ * Return: 0 on success, -EAGAIN if the fault cannot be resolved without
+ * waiting, -EFAULT if the fault cannot be resolved, -ENOMEM if out of
+ * memory.
+ */
+static int fixup_user_fault_nowait(struct mm_struct *mm, unsigned long address,
+ unsigned int fault_flags, bool *unlocked)
+{
+ struct vm_area_struct *vma;
+ unsigned int test_flags;
+ vm_fault_t fault;
+ int rc;
+
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+ test_flags = fault_flags & FAULT_FLAG_WRITE ? VM_WRITE : VM_READ;
+
+ vma = find_vma(mm, address);
+ if (unlikely(!vma || address < vma->vm_start))
+ return -EFAULT;
+ if (unlikely(!(vma->vm_flags & test_flags)))
+ return -EFAULT;
+
+ fault = handle_mm_fault(vma, address, fault_flags, NULL);
+ /* the mm lock has been dropped, take it again */
+ if (fault & VM_FAULT_COMPLETED) {
+ *unlocked = true;
+ mmap_read_lock(mm);
+ return 0;
+ }
+ /* the mm lock has not been dropped */
+ if (fault & VM_FAULT_ERROR) {
+ rc = vm_fault_to_errno(fault, 0);
+ BUG_ON(!rc);
+ return rc;
+ }
+ /* the mm lock has not been dropped because of FAULT_FLAG_RETRY_NOWAIT */
+ if (fault & VM_FAULT_RETRY)
+ return -EAGAIN;
+ /* nothing needed to be done and the mm lock has not been dropped */
+ return 0;
+}
+
+/**
+ * __gmap_fault - resolve a fault on a guest address
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: guest address
+ * @fault_flags: flags to pass down to handle_mm_fault()
+ *
+ * Context: Needs to be called with mm->mmap_lock held in read mode. Might
+ * drop and re-acquire the lock. Will always return with the lock held.
+ */
+static int __gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned int fault_flags)
+{
+ unsigned long vmaddr;
+ bool unlocked;
+ int rc = 0;
+
+retry:
+ unlocked = false;
+
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+
+ if (fault_flags & FAULT_FLAG_RETRY_NOWAIT)
+ rc = fixup_user_fault_nowait(gmap->mm, vmaddr, fault_flags, &unlocked);
+ else
+ rc = fixup_user_fault(gmap->mm, vmaddr, fault_flags, &unlocked);
+ if (rc)
+ return rc;
+ /*
+ * In the case that fixup_user_fault unlocked the mmap_lock during
+ * fault-in, redo __gmap_translate() to avoid racing with a
+ * map/unmap_segment.
+ * In particular, __gmap_translate(), fixup_user_fault{,_nowait}(),
+ * and __gmap_link() must all be called atomically in one go; if the
+ * lock had been dropped in between, a retry is needed.
+ */
+ if (unlocked)
+ goto retry;
+
+ return __gmap_link(gmap, gaddr, vmaddr);
+}
+
+/**
* gmap_fault - resolve a fault on a guest address
* @gmap: pointer to guest mapping meta data structure
* @gaddr: guest address
* @fault_flags: flags to pass down to handle_mm_fault()
*
- * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
- * if the vm address is already mapped to a different guest segment.
+ * Returns 0 on success, -ENOMEM for out of memory conditions, -EFAULT if the
+ * vm address is already mapped to a different guest segment, and -EAGAIN if
+ * FAULT_FLAG_RETRY_NOWAIT was specified and the fault could not be processed
+ * immediately.
*/
-int gmap_fault(struct gmap *gmap, unsigned long gaddr,
- unsigned int fault_flags)
+int gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned int fault_flags)
{
- unsigned long vmaddr;
int rc;
- bool unlocked;
mmap_read_lock(gmap->mm);
-
-retry:
- unlocked = false;
- vmaddr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(vmaddr)) {
- rc = vmaddr;
- goto out_up;
- }
- if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
- &unlocked)) {
- rc = -EFAULT;
- goto out_up;
- }
- /*
- * In the case that fixup_user_fault unlocked the mmap_lock during
- * faultin redo __gmap_translate to not race with a map/unmap_segment.
- */
- if (unlocked)
- goto retry;
-
- rc = __gmap_link(gmap, gaddr, vmaddr);
-out_up:
+ rc = __gmap_fault(gmap, gaddr, fault_flags);
mmap_read_unlock(gmap->mm);
return rc;
}
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index 5f805ad..4a0f422 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -12,6 +12,7 @@
#include <asm/pgalloc.h>
#include <asm/kfence.h>
#include <asm/page.h>
+#include <asm/asm.h>
#include <asm/set_memory.h>
static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
@@ -406,6 +407,21 @@ int set_direct_map_default_noflush(struct page *page)
return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF);
}
+bool kernel_page_present(struct page *page)
+{
+ unsigned long addr;
+ unsigned int cc;
+
+ addr = (unsigned long)page_address(page);
+ asm volatile(
+ " lra %[addr],0(%[addr])\n"
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [addr] "+a" (addr)
+ :
+ : CC_CLOBBER);
+ return CC_TRANSFORM(cc) == 0;
+}
+
#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
static void ipte_range(pte_t *pte, unsigned long address, int nr)
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 2c944ba..cea5dba 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -525,7 +525,7 @@ static inline void pudp_idte_global(struct mm_struct *mm,
else
/*
* Invalid bit position is the same for pmd and pud, so we can
- * re-use _pmd_csp() here
+ * reuse _pmd_csp() here
*/
__pmdp_csp((pmd_t *) pudp);
}
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index bd9624c..b7efa96 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -29,6 +29,7 @@
#include <linux/pci.h>
#include <linux/printk.h>
#include <linux/lockdep.h>
+#include <linux/list_sort.h>
#include <asm/isc.h>
#include <asm/airq.h>
@@ -785,7 +786,6 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
struct zpci_dev *zdev;
int rc;
- zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", fid, fh, state);
zdev = kzalloc(sizeof(*zdev), GFP_KERNEL);
if (!zdev)
return ERR_PTR(-ENOMEM);
@@ -805,6 +805,19 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
mutex_init(&zdev->fmb_lock);
mutex_init(&zdev->kzdev_lock);
+ return zdev;
+
+error:
+ zpci_dbg(0, "crt fid:%x, rc:%d\n", fid, rc);
+ kfree(zdev);
+ return ERR_PTR(rc);
+}
+
+int zpci_add_device(struct zpci_dev *zdev)
+{
+ int rc;
+
+ zpci_dbg(1, "add fid:%x, fh:%x, c:%d\n", zdev->fid, zdev->fh, zdev->state);
rc = zpci_init_iommu(zdev);
if (rc)
goto error;
@@ -816,15 +829,13 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state)
spin_lock(&zpci_list_lock);
list_add_tail(&zdev->entry, &zpci_list);
spin_unlock(&zpci_list_lock);
-
- return zdev;
+ return 0;
error_destroy_iommu:
zpci_destroy_iommu(zdev);
error:
- zpci_dbg(0, "add fid:%x, rc:%d\n", fid, rc);
- kfree(zdev);
- return ERR_PTR(rc);
+ zpci_dbg(0, "add fid:%x, rc:%d\n", zdev->fid, rc);
+ return rc;
}
bool zpci_is_device_configured(struct zpci_dev *zdev)
@@ -1082,6 +1093,49 @@ bool zpci_is_enabled(void)
return s390_pci_initialized;
}
+static int zpci_cmp_rid(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ struct zpci_dev *za = container_of(a, struct zpci_dev, entry);
+ struct zpci_dev *zb = container_of(b, struct zpci_dev, entry);
+
+ /*
+ * PCI functions without RID available maintain original order
+ * between themselves but sort before those with RID.
+ */
+ if (za->rid == zb->rid)
+ return za->rid_available > zb->rid_available;
+ /*
+ * PCI functions with RID sort by RID ascending.
+ */
+ return za->rid > zb->rid;
+}
+
+static void zpci_add_devices(struct list_head *scan_list)
+{
+ struct zpci_dev *zdev, *tmp;
+
+ list_sort(NULL, scan_list, &zpci_cmp_rid);
+ list_for_each_entry_safe(zdev, tmp, scan_list, entry) {
+ list_del_init(&zdev->entry);
+ zpci_add_device(zdev);
+ }
+}
+
+int zpci_scan_devices(void)
+{
+ LIST_HEAD(scan_list);
+ int rc;
+
+ rc = clp_scan_pci_devices(&scan_list);
+ if (rc)
+ return rc;
+
+ zpci_add_devices(&scan_list);
+ zpci_bus_scan_busses();
+ return 0;
+}
+
static int __init pci_base_init(void)
{
int rc;
@@ -1111,10 +1165,9 @@ static int __init pci_base_init(void)
if (rc)
goto out_irq;
- rc = clp_scan_pci_devices();
+ rc = zpci_scan_devices();
if (rc)
goto out_find;
- zpci_bus_scan_busses();
s390_pci_initialized = 1;
return 0;
diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
index daa5d74..1b74a00 100644
--- a/arch/s390/pci/pci_bus.c
+++ b/arch/s390/pci/pci_bus.c
@@ -168,9 +168,16 @@ void zpci_bus_scan_busses(void)
mutex_unlock(&zbus_list_lock);
}
+static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev)
+{
+ return !s390_pci_no_rid && zdev->rid_available &&
+ zpci_is_device_configured(zdev) &&
+ !zdev->vfn;
+}
+
/* zpci_bus_create_pci_bus - Create the PCI bus associated with this zbus
* @zbus: the zbus holding the zdevices
- * @fr: PCI root function that will determine the bus's domain, and bus speeed
+ * @fr: PCI root function that will determine the bus's domain, and bus speed
* @ops: the pci operations
*
* The PCI function @fr determines the domain (its UID), multifunction property
@@ -188,7 +195,7 @@ static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, s
return domain;
zbus->domain_nr = domain;
- zbus->multifunction = fr->rid_available;
+ zbus->multifunction = zpci_bus_is_multifunction_root(fr);
zbus->max_bus_speed = fr->max_bus_speed;
/*
@@ -232,13 +239,15 @@ static void zpci_bus_put(struct zpci_bus *zbus)
kref_put(&zbus->kref, zpci_bus_release);
}
-static struct zpci_bus *zpci_bus_get(int pchid)
+static struct zpci_bus *zpci_bus_get(int topo, bool topo_is_tid)
{
struct zpci_bus *zbus;
mutex_lock(&zbus_list_lock);
list_for_each_entry(zbus, &zbus_list, bus_next) {
- if (pchid == zbus->pchid) {
+ if (!zbus->multifunction)
+ continue;
+ if (topo_is_tid == zbus->topo_is_tid && topo == zbus->topo) {
kref_get(&zbus->kref);
goto out_unlock;
}
@@ -249,7 +258,7 @@ static struct zpci_bus *zpci_bus_get(int pchid)
return zbus;
}
-static struct zpci_bus *zpci_bus_alloc(int pchid)
+static struct zpci_bus *zpci_bus_alloc(int topo, bool topo_is_tid)
{
struct zpci_bus *zbus;
@@ -257,7 +266,8 @@ static struct zpci_bus *zpci_bus_alloc(int pchid)
if (!zbus)
return NULL;
- zbus->pchid = pchid;
+ zbus->topo = topo;
+ zbus->topo_is_tid = topo_is_tid;
INIT_LIST_HEAD(&zbus->bus_next);
mutex_lock(&zbus_list_lock);
list_add_tail(&zbus->bus_next, &zbus_list);
@@ -292,19 +302,22 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev)
{
int rc = -EINVAL;
+ if (zbus->multifunction) {
+ if (!zdev->rid_available) {
+ WARN_ONCE(1, "rid_available not set for multifunction\n");
+ return rc;
+ }
+ zdev->devfn = zdev->rid & ZPCI_RID_MASK_DEVFN;
+ }
+
if (zbus->function[zdev->devfn]) {
pr_err("devfn %04x is already assigned\n", zdev->devfn);
return rc;
}
-
zdev->zbus = zbus;
zbus->function[zdev->devfn] = zdev;
zpci_nb_devices++;
- if (zbus->multifunction && !zdev->rid_available) {
- WARN_ONCE(1, "rid_available not set for multifunction\n");
- goto error;
- }
rc = zpci_init_slot(zdev);
if (rc)
goto error;
@@ -321,8 +334,9 @@ static int zpci_bus_add_device(struct zpci_bus *zbus, struct zpci_dev *zdev)
int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops)
{
+ bool topo_is_tid = zdev->tid_avail;
struct zpci_bus *zbus = NULL;
- int rc = -EBADF;
+ int topo, rc = -EBADF;
if (zpci_nb_devices == ZPCI_NR_DEVICES) {
pr_warn("Adding PCI function %08x failed because the configured limit of %d is reached\n",
@@ -330,14 +344,10 @@ int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops)
return -ENOSPC;
}
- if (zdev->devfn >= ZPCI_FUNCTIONS_PER_BUS)
- return -EINVAL;
-
- if (!s390_pci_no_rid && zdev->rid_available)
- zbus = zpci_bus_get(zdev->pchid);
-
+ topo = topo_is_tid ? zdev->tid : zdev->pchid;
+ zbus = zpci_bus_get(topo, topo_is_tid);
if (!zbus) {
- zbus = zpci_bus_alloc(zdev->pchid);
+ zbus = zpci_bus_alloc(topo, topo_is_tid);
if (!zbus)
return -ENOMEM;
}
diff --git a/arch/s390/pci/pci_bus.h b/arch/s390/pci/pci_bus.h
index af9f0ac..e86a941 100644
--- a/arch/s390/pci/pci_bus.h
+++ b/arch/s390/pci/pci_bus.h
@@ -6,6 +6,10 @@
* Pierre Morel <pmorel@linux.ibm.com>
*
*/
+#ifndef __S390_PCI_BUS_H
+#define __S390_PCI_BUS_H
+
+#include <linux/pci.h>
int zpci_bus_device_register(struct zpci_dev *zdev, struct pci_ops *ops);
void zpci_bus_device_unregister(struct zpci_dev *zdev);
@@ -40,3 +44,4 @@ static inline struct zpci_dev *zdev_from_bus(struct pci_bus *bus,
return (devfn >= ZPCI_FUNCTIONS_PER_BUS) ? NULL : zbus->function[devfn];
}
+#endif /* __S390_PCI_BUS_H */
diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c
index 6f55a59..14bf7e8 100644
--- a/arch/s390/pci/pci_clp.c
+++ b/arch/s390/pci/pci_clp.c
@@ -20,6 +20,7 @@
#include <asm/asm-extable.h>
#include <asm/pci_debug.h>
#include <asm/pci_clp.h>
+#include <asm/asm.h>
#include <asm/clp.h>
#include <uapi/asm/clp.h>
@@ -52,18 +53,20 @@ static inline void zpci_err_clp(unsigned int rsp, int rc)
static inline int clp_get_ilp(unsigned long *ilp)
{
unsigned long mask;
- int cc = 3;
+ int cc, exception;
+ exception = 1;
asm volatile (
" .insn rrf,0xb9a00000,%[mask],%[cmd],8,0\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [mask] "=d" (mask) : [cmd] "a" (1)
- : "cc");
+ : CC_OUT(cc, cc), [mask] "=d" (mask), [exc] "+d" (exception)
+ : [cmd] "a" (1)
+ : CC_CLOBBER);
*ilp = mask;
- return cc;
+ return exception ? 3 : CC_TRANSFORM(cc);
}
/*
@@ -72,19 +75,20 @@ static inline int clp_get_ilp(unsigned long *ilp)
static __always_inline int clp_req(void *data, unsigned int lps)
{
struct { u8 _[CLP_BLK_SIZE]; } *req = data;
+ int cc, exception;
u64 ignored;
- int cc = 3;
+ exception = 1;
asm volatile (
" .insn rrf,0xb9a00000,%[ign],%[req],0,%[lps]\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [ign] "=d" (ignored), "+m" (*req)
+ : CC_OUT(cc, cc), [ign] "=d" (ignored), "+m" (*req), [exc] "+d" (exception)
: [req] "a" (req), [lps] "i" (lps)
- : "cc");
- return cc;
+ : CC_CLOBBER);
+ return exception ? 3 : CC_TRANSFORM(cc);
}
static void *clp_alloc_block(gfp_t gfp_mask)
@@ -162,12 +166,16 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev,
zdev->pft = response->pft;
zdev->vfn = response->vfn;
zdev->port = response->port;
+ zdev->fidparm = response->fidparm;
zdev->uid = response->uid;
zdev->fmb_length = sizeof(u32) * response->fmb_len;
- zdev->rid_available = response->rid_avail;
zdev->is_physfn = response->is_physfn;
- if (!s390_pci_no_rid && zdev->rid_available)
- zdev->devfn = response->rid & ZPCI_RID_MASK_DEVFN;
+ zdev->rid_available = response->rid_avail;
+ if (zdev->rid_available)
+ zdev->rid = response->rid;
+ zdev->tid_avail = response->tid_avail;
+ if (zdev->tid_avail)
+ zdev->tid = response->tid;
memcpy(zdev->pfip, response->pfip, sizeof(zdev->pfip));
if (response->util_str_avail) {
@@ -407,6 +415,7 @@ static int clp_find_pci(struct clp_req_rsp_list_pci *rrb, u32 fid,
static void __clp_add(struct clp_fh_list_entry *entry, void *data)
{
+ struct list_head *scan_list = data;
struct zpci_dev *zdev;
if (!entry->vendor_id)
@@ -417,10 +426,11 @@ static void __clp_add(struct clp_fh_list_entry *entry, void *data)
zpci_zdev_put(zdev);
return;
}
- zpci_create_device(entry->fid, entry->fh, entry->config_state);
+ zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state);
+ list_add_tail(&zdev->entry, scan_list);
}
-int clp_scan_pci_devices(void)
+int clp_scan_pci_devices(struct list_head *scan_list)
{
struct clp_req_rsp_list_pci *rrb;
int rc;
@@ -429,7 +439,7 @@ int clp_scan_pci_devices(void)
if (!rrb)
return -ENOMEM;
- rc = clp_list_pci(rrb, NULL, __clp_add);
+ rc = clp_list_pci(rrb, scan_list, __clp_add);
clp_free_block(rrb);
return rc;
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index d4f19d3..47f934f 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -340,6 +340,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED);
if (IS_ERR(zdev))
break;
+ zpci_add_device(zdev);
} else {
/* the configuration request may be stale */
if (zdev->state != ZPCI_FN_STATE_STANDBY)
@@ -349,10 +350,14 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
zpci_scan_configured_device(zdev, ccdf->fh);
break;
case 0x0302: /* Reserved -> Standby */
- if (!zdev)
- zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY);
- else
+ if (!zdev) {
+ zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY);
+ if (IS_ERR(zdev))
+ break;
+ zpci_add_device(zdev);
+ } else {
zpci_update_fh(zdev, ccdf->fh);
+ }
break;
case 0x0303: /* Deconfiguration requested */
if (zdev) {
@@ -381,7 +386,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
break;
case 0x0306: /* 0x308 or 0x302 for multiple devices */
zpci_remove_reserved_devices();
- clp_scan_pci_devices();
+ zpci_scan_devices();
break;
case 0x0308: /* Standby -> Reserved */
if (!zdev)
diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c
index 56480be..f5a75ea 100644
--- a/arch/s390/pci/pci_insn.c
+++ b/arch/s390/pci/pci_insn.c
@@ -15,6 +15,7 @@
#include <asm/pci_debug.h>
#include <asm/pci_io.h>
#include <asm/processor.h>
+#include <asm/asm.h>
#define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */
@@ -57,16 +58,16 @@ static inline void zpci_err_insn_addr(int lvl, u8 insn, u8 cc, u8 status,
/* Modify PCI Function Controls */
static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status)
{
- u8 cc;
+ int cc;
asm volatile (
" .insn rxy,0xe300000000d0,%[req],%[fib]\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=d" (cc), [req] "+d" (req), [fib] "+Q" (*fib)
- : : "cc");
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [req] "+d" (req), [fib] "+Q" (*fib)
+ :
+ : CC_CLOBBER);
*status = req >> 24 & 0xff;
- return cc;
+ return CC_TRANSFORM(cc);
}
u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status)
@@ -98,17 +99,16 @@ EXPORT_SYMBOL_GPL(zpci_mod_fc);
static inline u8 __rpcit(u64 fn, u64 addr, u64 range, u8 *status)
{
union register_pair addr_range = {.even = addr, .odd = range};
- u8 cc;
+ int cc;
asm volatile (
" .insn rre,0xb9d30000,%[fn],%[addr_range]\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=d" (cc), [fn] "+d" (fn)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [fn] "+d" (fn)
: [addr_range] "d" (addr_range.pair)
- : "cc");
+ : CC_CLOBBER);
*status = fn >> 24 & 0xff;
- return cc;
+ return CC_TRANSFORM(cc);
}
int zpci_refresh_trans(u64 fn, u64 addr, u64 range)
@@ -156,20 +156,23 @@ EXPORT_SYMBOL_GPL(zpci_set_irq_ctrl);
static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status)
{
union register_pair req_off = {.even = req, .odd = offset};
- int cc = -ENXIO;
+ int cc, exception;
u64 __data;
+ exception = 1;
asm volatile (
" .insn rre,0xb9d20000,%[data],%[req_off]\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [data] "=d" (__data),
- [req_off] "+&d" (req_off.pair) :: "cc");
+ : CC_OUT(cc, cc), [data] "=d" (__data),
+ [req_off] "+d" (req_off.pair), [exc] "+d" (exception)
+ :
+ : CC_CLOBBER);
*status = req_off.even >> 24 & 0xff;
*data = __data;
- return cc;
+ return exception ? -ENXIO : CC_TRANSFORM(cc);
}
static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status)
@@ -222,20 +225,23 @@ static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr,
static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status)
{
union register_pair ioaddr_len = {.even = ioaddr, .odd = len};
- int cc = -ENXIO;
+ int cc, exception;
u64 __data;
+ exception = 1;
asm volatile (
" .insn rre,0xb9d60000,%[data],%[ioaddr_len]\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [data] "=d" (__data),
- [ioaddr_len] "+&d" (ioaddr_len.pair) :: "cc");
+ : CC_OUT(cc, cc), [data] "=d" (__data),
+ [ioaddr_len] "+d" (ioaddr_len.pair), [exc] "+d" (exception)
+ :
+ : CC_CLOBBER);
*status = ioaddr_len.odd >> 24 & 0xff;
*data = __data;
- return cc;
+ return exception ? -ENXIO : CC_TRANSFORM(cc);
}
int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len)
@@ -258,19 +264,20 @@ EXPORT_SYMBOL_GPL(zpci_load);
static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status)
{
union register_pair req_off = {.even = req, .odd = offset};
- int cc = -ENXIO;
+ int cc, exception;
+ exception = 1;
asm volatile (
" .insn rre,0xb9d00000,%[data],%[req_off]\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [req_off] "+&d" (req_off.pair)
+ : CC_OUT(cc, cc), [req_off] "+d" (req_off.pair), [exc] "+d" (exception)
: [data] "d" (data)
- : "cc");
+ : CC_CLOBBER);
*status = req_off.even >> 24 & 0xff;
- return cc;
+ return exception ? -ENXIO : CC_TRANSFORM(cc);
}
int __zpci_store(u64 data, u64 req, u64 offset)
@@ -311,19 +318,20 @@ static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data,
static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status)
{
union register_pair ioaddr_len = {.even = ioaddr, .odd = len};
- int cc = -ENXIO;
+ int cc, exception;
+ exception = 1;
asm volatile (
" .insn rre,0xb9d40000,%[data],%[ioaddr_len]\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair)
+ : CC_OUT(cc, cc), [ioaddr_len] "+d" (ioaddr_len.pair), [exc] "+d" (exception)
: [data] "d" (data)
- : "cc", "memory");
+ : CC_CLOBBER_LIST("memory"));
*status = ioaddr_len.odd >> 24 & 0xff;
- return cc;
+ return exception ? -ENXIO : CC_TRANSFORM(cc);
}
int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len)
@@ -345,19 +353,20 @@ EXPORT_SYMBOL_GPL(zpci_store);
/* PCI Store Block */
static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status)
{
- int cc = -ENXIO;
+ int cc, exception;
+ exception = 1;
asm volatile (
" .insn rsy,0xeb00000000d0,%[req],%[offset],%[data]\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [req] "+d" (req)
+ : CC_OUT(cc, cc), [req] "+d" (req), [exc] "+d" (exception)
: [offset] "d" (offset), [data] "Q" (*data)
- : "cc");
+ : CC_CLOBBER);
*status = req >> 24 & 0xff;
- return cc;
+ return exception ? -ENXIO : CC_TRANSFORM(cc);
}
int __zpci_store_block(const u64 *data, u64 req, u64 offset)
@@ -398,19 +407,20 @@ static inline int zpci_write_block_fh(volatile void __iomem *dst,
static inline int __pcistb_mio(const u64 *data, u64 ioaddr, u64 len, u8 *status)
{
- int cc = -ENXIO;
+ int cc, exception;
+ exception = 1;
asm volatile (
" .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[data]\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+d" (cc), [len] "+d" (len)
+ : CC_OUT(cc, cc), [len] "+d" (len), [exc] "+d" (exception)
: [ioaddr] "d" (ioaddr), [data] "Q" (*data)
- : "cc");
+ : CC_CLOBBER);
*status = len >> 24 & 0xff;
- return cc;
+ return exception ? -ENXIO : CC_TRANSFORM(cc);
}
int zpci_write_block(volatile void __iomem *dst,
diff --git a/arch/s390/pci/pci_iov.h b/arch/s390/pci/pci_iov.h
index b2c8280..e3fa4e7 100644
--- a/arch/s390/pci/pci_iov.h
+++ b/arch/s390/pci/pci_iov.h
@@ -10,6 +10,8 @@
#ifndef __S390_PCI_IOV_H
#define __S390_PCI_IOV_H
+#include <linux/pci.h>
+
#ifdef CONFIG_PCI_IOV
void zpci_iov_remove_virtfn(struct pci_dev *pdev, int vfn);
diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index de5c0b3..46f99dc 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -14,6 +14,7 @@
#include <asm/asm-extable.h>
#include <asm/pci_io.h>
#include <asm/pci_debug.h>
+#include <asm/asm.h>
static inline void zpci_err_mmio(u8 cc, u8 status, u64 offset)
{
@@ -30,20 +31,21 @@ static inline int __pcistb_mio_inuser(
void __iomem *ioaddr, const void __user *src,
u64 len, u8 *status)
{
- int cc = -ENXIO;
+ int cc, exception;
+ exception = 1;
asm volatile (
- " sacf 256\n"
- "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n"
- "1: ipm %[cc]\n"
- " srl %[cc],28\n"
- "2: sacf 768\n"
+ " sacf 256\n"
+ "0: .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[src]\n"
+ "1: lhi %[exc],0\n"
+ "2: sacf 768\n"
+ CC_IPM(cc)
EX_TABLE(0b, 2b) EX_TABLE(1b, 2b)
- : [cc] "+d" (cc), [len] "+d" (len)
+ : CC_OUT(cc, cc), [len] "+d" (len), [exc] "+d" (exception)
: [ioaddr] "a" (ioaddr), [src] "Q" (*((u8 __force *)src))
- : "cc", "memory");
+ : CC_CLOBBER_LIST("memory"));
*status = len >> 24 & 0xff;
- return cc;
+ return exception ? -ENXIO : CC_TRANSFORM(cc);
}
static inline int __pcistg_mio_inuser(
@@ -51,7 +53,7 @@ static inline int __pcistg_mio_inuser(
u64 ulen, u8 *status)
{
union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen};
- int cc = -ENXIO;
+ int cc, exception;
u64 val = 0;
u64 cnt = ulen;
u8 tmp;
@@ -61,25 +63,27 @@ static inline int __pcistg_mio_inuser(
* a register, then store it to PCI at @ioaddr while in secondary
* address space. pcistg then uses the user mappings.
*/
+ exception = 1;
asm volatile (
- " sacf 256\n"
- "0: llgc %[tmp],0(%[src])\n"
+ " sacf 256\n"
+ "0: llgc %[tmp],0(%[src])\n"
"4: sllg %[val],%[val],8\n"
- " aghi %[src],1\n"
- " ogr %[val],%[tmp]\n"
- " brctg %[cnt],0b\n"
- "1: .insn rre,0xb9d40000,%[val],%[ioaddr_len]\n"
- "2: ipm %[cc]\n"
- " srl %[cc],28\n"
- "3: sacf 768\n"
+ " aghi %[src],1\n"
+ " ogr %[val],%[tmp]\n"
+ " brctg %[cnt],0b\n"
+ "1: .insn rre,0xb9d40000,%[val],%[ioaddr_len]\n"
+ "2: lhi %[exc],0\n"
+ "3: sacf 768\n"
+ CC_IPM(cc)
EX_TABLE(0b, 3b) EX_TABLE(4b, 3b) EX_TABLE(1b, 3b) EX_TABLE(2b, 3b)
+ : [src] "+a" (src), [cnt] "+d" (cnt),
+ [val] "+d" (val), [tmp] "=d" (tmp), [exc] "+d" (exception),
+ CC_OUT(cc, cc), [ioaddr_len] "+&d" (ioaddr_len.pair)
:
- [src] "+a" (src), [cnt] "+d" (cnt),
- [val] "+d" (val), [tmp] "=d" (tmp),
- [cc] "+d" (cc), [ioaddr_len] "+&d" (ioaddr_len.pair)
- :: "cc", "memory");
+ : CC_CLOBBER_LIST("memory"));
*status = ioaddr_len.odd >> 24 & 0xff;
+ cc = exception ? -ENXIO : CC_TRANSFORM(cc);
/* did we read everything from user memory? */
if (!cc && cnt != 0)
cc = -EFAULT;
@@ -198,7 +202,7 @@ static inline int __pcilg_mio_inuser(
union register_pair ioaddr_len = {.even = (u64 __force)ioaddr, .odd = ulen};
u64 cnt = ulen;
int shift = ulen * 8;
- int cc = -ENXIO;
+ int cc, exception;
u64 val, tmp;
/*
@@ -206,27 +210,33 @@ static inline int __pcilg_mio_inuser(
* user space) into a register using pcilg then store these bytes at
* user address @dst
*/
+ exception = 1;
asm volatile (
- " sacf 256\n"
- "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n"
- "1: ipm %[cc]\n"
- " srl %[cc],28\n"
- " ltr %[cc],%[cc]\n"
- " jne 4f\n"
- "2: ahi %[shift],-8\n"
- " srlg %[tmp],%[val],0(%[shift])\n"
- "3: stc %[tmp],0(%[dst])\n"
+ " sacf 256\n"
+ "0: .insn rre,0xb9d60000,%[val],%[ioaddr_len]\n"
+ "1: lhi %[exc],0\n"
+ " jne 4f\n"
+ "2: ahi %[shift],-8\n"
+ " srlg %[tmp],%[val],0(%[shift])\n"
+ "3: stc %[tmp],0(%[dst])\n"
"5: aghi %[dst],1\n"
- " brctg %[cnt],2b\n"
- "4: sacf 768\n"
+ " brctg %[cnt],2b\n"
+ /*
+ * Use xr to clear exc and set condition code to zero
+ * to ensure flag output is correct for this branch.
+ */
+ " xr %[exc],%[exc]\n"
+ "4: sacf 768\n"
+ CC_IPM(cc)
EX_TABLE(0b, 4b) EX_TABLE(1b, 4b) EX_TABLE(3b, 4b) EX_TABLE(5b, 4b)
+ : [ioaddr_len] "+&d" (ioaddr_len.pair), [exc] "+d" (exception),
+ CC_OUT(cc, cc), [val] "=d" (val),
+ [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp),
+ [shift] "+d" (shift)
:
- [ioaddr_len] "+&d" (ioaddr_len.pair),
- [cc] "+d" (cc), [val] "=d" (val),
- [dst] "+a" (dst), [cnt] "+d" (cnt), [tmp] "=d" (tmp),
- [shift] "+d" (shift)
- :: "cc", "memory");
+ : CC_CLOBBER_LIST("memory"));
+ cc = exception ? -ENXIO : CC_TRANSFORM(cc);
/* did we write everything to the user space buffer? */
if (!cc && cnt != 0)
cc = -EFAULT;
diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c
index 1f81f6f..5f46ad5 100644
--- a/arch/s390/pci/pci_sysfs.c
+++ b/arch/s390/pci/pci_sysfs.c
@@ -23,7 +23,7 @@ static ssize_t name##_show(struct device *dev, \
{ \
struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); \
\
- return sprintf(buf, fmt, zdev->member); \
+ return sysfs_emit(buf, fmt, zdev->member); \
} \
static DEVICE_ATTR_RO(name)
@@ -34,6 +34,7 @@ zpci_attr(pfgid, "0x%02x\n", pfgid);
zpci_attr(vfn, "0x%04x\n", vfn);
zpci_attr(pft, "0x%02x\n", pft);
zpci_attr(port, "%d\n", port);
+zpci_attr(fidparm, "0x%02x\n", fidparm);
zpci_attr(uid, "0x%x\n", uid);
zpci_attr(segment0, "0x%02x\n", pfip[0]);
zpci_attr(segment1, "0x%02x\n", pfip[1]);
@@ -45,7 +46,7 @@ static ssize_t mio_enabled_show(struct device *dev,
{
struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
- return sprintf(buf, zpci_use_mio(zdev) ? "1\n" : "0\n");
+ return sysfs_emit(buf, zpci_use_mio(zdev) ? "1\n" : "0\n");
}
static DEVICE_ATTR_RO(mio_enabled);
@@ -215,6 +216,7 @@ static struct attribute *zpci_dev_attrs[] = {
&dev_attr_pfgid.attr,
&dev_attr_pft.attr,
&dev_attr_port.attr,
+ &dev_attr_fidparm.attr,
&dev_attr_vfn.attr,
&dev_attr_uid.attr,
&dev_attr_recover.attr,
diff --git a/arch/s390/purgatory/head.S b/arch/s390/purgatory/head.S
index 0f93f2e..db3ab24 100644
--- a/arch/s390/purgatory/head.S
+++ b/arch/s390/purgatory/head.S
@@ -156,7 +156,7 @@
agr %r10,%r9
/* Buffer location (in crash memory) and size. As the purgatory is
- * behind the point of no return it can re-use the stack as buffer.
+ * behind the point of no return it can reuse the stack as buffer.
*/
larl %r11,purgatory_end
larl %r12,stack
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index e910399..04ff5fb 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -550,6 +550,9 @@
config ARCH_SUPPORTS_CRASH_DUMP
def_bool BROKEN_ON_SMP
+config ARCH_DEFAULT_CRASH_DUMP
+ def_bool y
+
config ARCH_SUPPORTS_KEXEC_JUMP
def_bool y
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index c55fd76..c8cad33 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -466,3 +466,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index cfdfb37..727f99d 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -508,3 +508,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 16354df..ffa2bd1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1954,6 +1954,7 @@
depends on AS_WRUSS
depends on X86_64
select ARCH_USES_HIGH_VMA_FLAGS
+ select ARCH_HAS_USER_SHADOW_STACK
select X86_CET
help
Shadow stack protection is a hardware feature that detects function
@@ -2084,6 +2085,9 @@
config ARCH_SUPPORTS_CRASH_DUMP
def_bool X86_64 || (X86_32 && HIGHMEM)
+config ARCH_DEFAULT_CRASH_DUMP
+ def_bool y
+
config ARCH_SUPPORTS_CRASH_HOTPLUG
def_bool y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index cd75e78..5b773b3 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -142,9 +142,10 @@
ifeq ($(CONFIG_STACKPROTECTOR),y)
ifeq ($(CONFIG_SMP),y)
- KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard
+ KBUILD_CFLAGS += -mstack-protector-guard-reg=fs \
+ -mstack-protector-guard-symbol=__ref_stack_chk_guard
else
- KBUILD_CFLAGS += -mstack-protector-guard=global
+ KBUILD_CFLAGS += -mstack-protector-guard=global
endif
endif
else
diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index 7b1bebe..3d2e38b 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -363,7 +363,7 @@
- AVX-512VL (Advanced Vector Extensions-512VL)
config CRYPTO_AEGIS128_AESNI_SSE2
- tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)"
+ tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)"
depends on X86 && 64BIT
select CRYPTO_AEAD
select CRYPTO_SIMD
@@ -372,7 +372,7 @@
Architecture: x86_64 using:
- AES-NI (AES New Instructions)
- - SSE2 (Streaming SIMD Extensions 2)
+ - SSE4.1 (Streaming SIMD Extensions 4.1)
config CRYPTO_NHPOLY1305_SSE2
tristate "Hash functions: NHPoly1305 (SSE2)"
diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S
index ad7f4c8..7294dc0 100644
--- a/arch/x86/crypto/aegis128-aesni-asm.S
+++ b/arch/x86/crypto/aegis128-aesni-asm.S
@@ -1,14 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * AES-NI + SSE2 implementation of AEGIS-128
+ * AES-NI + SSE4.1 implementation of AEGIS-128
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ * Copyright 2024 Google LLC
*/
#include <linux/linkage.h>
-#include <linux/cfi_types.h>
-#include <asm/frame.h>
#define STATE0 %xmm0
#define STATE1 %xmm1
@@ -20,11 +19,6 @@
#define T0 %xmm6
#define T1 %xmm7
-#define STATEP %rdi
-#define LEN %rsi
-#define SRC %rdx
-#define DST %rcx
-
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
@@ -34,11 +28,11 @@
.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
-.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
-.align 16
-.Laegis128_counter:
- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32
+.align 32
+.Lzeropad_mask:
+ .octa 0xffffffffffffffffffffffffffffffff
+ .octa 0
.text
@@ -61,140 +55,102 @@
.endm
/*
- * __load_partial: internal ABI
- * input:
- * LEN - bytes
- * SRC - src
- * output:
- * MSG - message block
- * changed:
- * T0
- * %r8
- * %r9
+ * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register
+ * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8.
*/
-SYM_FUNC_START_LOCAL(__load_partial)
- xor %r9d, %r9d
- pxor MSG, MSG
+.macro load_partial
+ sub $8, %ecx /* LEN - 8 */
+ jle .Lle8\@
- mov LEN, %r8
- and $0x1, %r8
- jz .Lld_partial_1
+ /* Load 9 <= LEN <= 15 bytes: */
+ movq (SRC), MSG /* Load first 8 bytes */
+ mov (SRC, %rcx), %rax /* Load last 8 bytes */
+ neg %ecx
+ shl $3, %ecx
+ shr %cl, %rax /* Discard overlapping bytes */
+ pinsrq $1, %rax, MSG
+ jmp .Ldone\@
- mov LEN, %r8
- and $0x1E, %r8
- add SRC, %r8
- mov (%r8), %r9b
+.Lle8\@:
+ add $4, %ecx /* LEN - 4 */
+ jl .Llt4\@
-.Lld_partial_1:
- mov LEN, %r8
- and $0x2, %r8
- jz .Lld_partial_2
+ /* Load 4 <= LEN <= 8 bytes: */
+ mov (SRC), %eax /* Load first 4 bytes */
+ mov (SRC, %rcx), %r8d /* Load last 4 bytes */
+ jmp .Lcombine\@
- mov LEN, %r8
- and $0x1C, %r8
- add SRC, %r8
- shl $0x10, %r9
- mov (%r8), %r9w
-
-.Lld_partial_2:
- mov LEN, %r8
- and $0x4, %r8
- jz .Lld_partial_4
-
- mov LEN, %r8
- and $0x18, %r8
- add SRC, %r8
- shl $32, %r9
- mov (%r8), %r8d
- xor %r8, %r9
-
-.Lld_partial_4:
- movq %r9, MSG
-
- mov LEN, %r8
- and $0x8, %r8
- jz .Lld_partial_8
-
- mov LEN, %r8
- and $0x10, %r8
- add SRC, %r8
- pslldq $8, MSG
- movq (%r8), T0
- pxor T0, MSG
-
-.Lld_partial_8:
- RET
-SYM_FUNC_END(__load_partial)
+.Llt4\@:
+ /* Load 1 <= LEN <= 3 bytes: */
+ add $2, %ecx /* LEN - 2 */
+ movzbl (SRC), %eax /* Load first byte */
+ jl .Lmovq\@
+ movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */
+.Lcombine\@:
+ shl $3, %ecx
+ shl %cl, %r8
+ or %r8, %rax /* Combine the two parts */
+.Lmovq\@:
+ movq %rax, MSG
+.Ldone\@:
+.endm
/*
- * __store_partial: internal ABI
- * input:
- * LEN - bytes
- * DST - dst
- * output:
- * T0 - message block
- * changed:
- * %r8
- * %r9
- * %r10
+ * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer
+ * DST. Clobbers %rax, %rcx, and %r8.
*/
-SYM_FUNC_START_LOCAL(__store_partial)
- mov LEN, %r8
- mov DST, %r9
+.macro store_partial msg
+ sub $8, %ecx /* LEN - 8 */
+ jl .Llt8\@
- movq T0, %r10
+ /* Store 8 <= LEN <= 15 bytes: */
+ pextrq $1, \msg, %rax
+ mov %ecx, %r8d
+ shl $3, %ecx
+ ror %cl, %rax
+ mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */
+ movq \msg, (DST) /* Store first 8 bytes */
+ jmp .Ldone\@
- cmp $8, %r8
- jl .Lst_partial_8
+.Llt8\@:
+ add $4, %ecx /* LEN - 4 */
+ jl .Llt4\@
- mov %r10, (%r9)
- psrldq $8, T0
- movq T0, %r10
+ /* Store 4 <= LEN <= 7 bytes: */
+ pextrd $1, \msg, %eax
+ mov %ecx, %r8d
+ shl $3, %ecx
+ ror %cl, %eax
+ mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */
+ movd \msg, (DST) /* Store first 4 bytes */
+ jmp .Ldone\@
- sub $8, %r8
- add $8, %r9
-
-.Lst_partial_8:
- cmp $4, %r8
- jl .Lst_partial_4
-
- mov %r10d, (%r9)
- shr $32, %r10
-
- sub $4, %r8
- add $4, %r9
-
-.Lst_partial_4:
- cmp $2, %r8
- jl .Lst_partial_2
-
- mov %r10w, (%r9)
- shr $0x10, %r10
-
- sub $2, %r8
- add $2, %r9
-
-.Lst_partial_2:
- cmp $1, %r8
- jl .Lst_partial_1
-
- mov %r10b, (%r9)
-
-.Lst_partial_1:
- RET
-SYM_FUNC_END(__store_partial)
+.Llt4\@:
+ /* Store 1 <= LEN <= 3 bytes: */
+ pextrb $0, \msg, 0(DST)
+ cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? */
+ jl .Ldone\@
+ pextrb $1, \msg, 1(DST)
+ je .Ldone\@
+ pextrb $2, \msg, 2(DST)
+.Ldone\@:
+.endm
/*
- * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
+ * void aegis128_aesni_init(struct aegis_state *state,
+ * const struct aegis_block *key,
+ * const u8 iv[AEGIS128_NONCE_SIZE]);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_init)
- FRAME_BEGIN
+SYM_FUNC_START(aegis128_aesni_init)
+ .set STATEP, %rdi
+ .set KEYP, %rsi
+ .set IVP, %rdx
/* load IV: */
- movdqu (%rdx), T1
+ movdqu (IVP), T1
/* load key: */
- movdqa (%rsi), KEY
+ movdqa (KEYP), KEY
pxor KEY, T1
movdqa T1, STATE0
movdqa KEY, STATE3
@@ -224,20 +180,22 @@
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
-
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_init)
+SYM_FUNC_END(aegis128_aesni_init)
/*
- * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
- * const void *data);
+ * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
+ * unsigned int len);
+ *
+ * len must be a multiple of 16.
*/
-SYM_FUNC_START(crypto_aegis128_aesni_ad)
- FRAME_BEGIN
+SYM_FUNC_START(aegis128_aesni_ad)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set LEN, %edx
- cmp $0x10, LEN
- jb .Lad_out
+ test LEN, LEN
+ jz .Lad_out
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -246,89 +204,40 @@
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
- mov SRC, %r8
- and $0xF, %r8
- jnz .Lad_u_loop
-
.align 8
-.Lad_a_loop:
- movdqa 0x00(SRC), MSG
- aegis128_update
- pxor MSG, STATE4
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_1
-
- movdqa 0x10(SRC), MSG
- aegis128_update
- pxor MSG, STATE3
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_2
-
- movdqa 0x20(SRC), MSG
- aegis128_update
- pxor MSG, STATE2
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_3
-
- movdqa 0x30(SRC), MSG
- aegis128_update
- pxor MSG, STATE1
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_4
-
- movdqa 0x40(SRC), MSG
- aegis128_update
- pxor MSG, STATE0
- sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_0
-
- add $0x50, SRC
- jmp .Lad_a_loop
-
-.align 8
-.Lad_u_loop:
+.Lad_loop:
movdqu 0x00(SRC), MSG
aegis128_update
pxor MSG, STATE4
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_1
+ jz .Lad_out_1
movdqu 0x10(SRC), MSG
aegis128_update
pxor MSG, STATE3
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_2
+ jz .Lad_out_2
movdqu 0x20(SRC), MSG
aegis128_update
pxor MSG, STATE2
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_3
+ jz .Lad_out_3
movdqu 0x30(SRC), MSG
aegis128_update
pxor MSG, STATE1
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_4
+ jz .Lad_out_4
movdqu 0x40(SRC), MSG
aegis128_update
pxor MSG, STATE0
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lad_out_0
+ jz .Lad_out_0
add $0x50, SRC
- jmp .Lad_u_loop
+ jmp .Lad_loop
/* store the state: */
.Lad_out_0:
@@ -337,7 +246,6 @@
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_1:
@@ -346,7 +254,6 @@
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_2:
@@ -355,7 +262,6 @@
movdqu STATE0, 0x20(STATEP)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_3:
@@ -364,7 +270,6 @@
movdqu STATE4, 0x20(STATEP)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
- FRAME_END
RET
.Lad_out_4:
@@ -373,41 +278,38 @@
movdqu STATE3, 0x20(STATEP)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
- FRAME_END
- RET
-
.Lad_out:
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_ad)
+SYM_FUNC_END(aegis128_aesni_ad)
-.macro encrypt_block a s0 s1 s2 s3 s4 i
- movdq\a (\i * 0x10)(SRC), MSG
+.macro encrypt_block s0 s1 s2 s3 s4 i
+ movdqu (\i * 0x10)(SRC), MSG
movdqa MSG, T0
pxor \s1, T0
pxor \s4, T0
movdqa \s2, T1
pand \s3, T1
pxor T1, T0
- movdq\a T0, (\i * 0x10)(DST)
+ movdqu T0, (\i * 0x10)(DST)
aegis128_update
pxor MSG, \s4
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Lenc_out_\i
+ jz .Lenc_out_\i
.endm
/*
- * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst,
+ * unsigned int len);
+ *
+ * len must be nonzero and a multiple of 16.
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc)
- FRAME_BEGIN
-
- cmp $0x10, LEN
- jb .Lenc_out
+SYM_FUNC_START(aegis128_aesni_enc)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -416,34 +318,17 @@
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
- mov SRC, %r8
- or DST, %r8
- and $0xF, %r8
- jnz .Lenc_u_loop
-
.align 8
-.Lenc_a_loop:
- encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
- encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
- encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
- encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
- encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+.Lenc_loop:
+ encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
+ encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
+ encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
+ encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
+ encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4
add $0x50, SRC
add $0x50, DST
- jmp .Lenc_a_loop
-
-.align 8
-.Lenc_u_loop:
- encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
- encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
- encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
- encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
- encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
-
- add $0x50, SRC
- add $0x50, DST
- jmp .Lenc_u_loop
+ jmp .Lenc_loop
/* store the state: */
.Lenc_out_0:
@@ -452,7 +337,6 @@
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_1:
@@ -461,7 +345,6 @@
movdqu STATE0, 0x20(STATEP)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_2:
@@ -470,7 +353,6 @@
movdqu STATE4, 0x20(STATEP)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_3:
@@ -479,7 +361,6 @@
movdqu STATE3, 0x20(STATEP)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
- FRAME_END
RET
.Lenc_out_4:
@@ -488,20 +369,19 @@
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- FRAME_END
- RET
-
.Lenc_out:
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_enc)
+SYM_FUNC_END(aegis128_aesni_enc)
/*
- * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src,
+ * u8 *dst, unsigned int len);
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail)
- FRAME_BEGIN
+SYM_FUNC_START(aegis128_aesni_enc_tail)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -511,7 +391,8 @@
movdqu 0x40(STATEP), STATE4
/* encrypt message: */
- call __load_partial
+ mov LEN, %r9d
+ load_partial
movdqa MSG, T0
pxor STATE1, T0
@@ -520,7 +401,8 @@
pand STATE3, T1
pxor T1, T0
- call __store_partial
+ mov %r9d, LEN
+ store_partial T0
aegis128_update
pxor MSG, STATE4
@@ -531,37 +413,36 @@
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
-
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_enc_tail)
+SYM_FUNC_END(aegis128_aesni_enc_tail)
-.macro decrypt_block a s0 s1 s2 s3 s4 i
- movdq\a (\i * 0x10)(SRC), MSG
+.macro decrypt_block s0 s1 s2 s3 s4 i
+ movdqu (\i * 0x10)(SRC), MSG
pxor \s1, MSG
pxor \s4, MSG
movdqa \s2, T1
pand \s3, T1
pxor T1, MSG
- movdq\a MSG, (\i * 0x10)(DST)
+ movdqu MSG, (\i * 0x10)(DST)
aegis128_update
pxor MSG, \s4
sub $0x10, LEN
- cmp $0x10, LEN
- jl .Ldec_out_\i
+ jz .Ldec_out_\i
.endm
/*
- * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst,
+ * unsigned int len);
+ *
+ * len must be nonzero and a multiple of 16.
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec)
- FRAME_BEGIN
-
- cmp $0x10, LEN
- jb .Ldec_out
+SYM_FUNC_START(aegis128_aesni_dec)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -570,34 +451,17 @@
movdqu 0x30(STATEP), STATE3
movdqu 0x40(STATEP), STATE4
- mov SRC, %r8
- or DST, %r8
- and $0xF, %r8
- jnz .Ldec_u_loop
-
.align 8
-.Ldec_a_loop:
- decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
- decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
- decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
- decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
- decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4
+.Ldec_loop:
+ decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
+ decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
+ decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
+ decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
+ decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4
add $0x50, SRC
add $0x50, DST
- jmp .Ldec_a_loop
-
-.align 8
-.Ldec_u_loop:
- decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
- decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
- decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
- decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
- decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4
-
- add $0x50, SRC
- add $0x50, DST
- jmp .Ldec_u_loop
+ jmp .Ldec_loop
/* store the state: */
.Ldec_out_0:
@@ -606,7 +470,6 @@
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_1:
@@ -615,7 +478,6 @@
movdqu STATE0, 0x20(STATEP)
movdqu STATE1, 0x30(STATEP)
movdqu STATE2, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_2:
@@ -624,7 +486,6 @@
movdqu STATE4, 0x20(STATEP)
movdqu STATE0, 0x30(STATEP)
movdqu STATE1, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_3:
@@ -633,7 +494,6 @@
movdqu STATE3, 0x20(STATEP)
movdqu STATE4, 0x30(STATEP)
movdqu STATE0, 0x40(STATEP)
- FRAME_END
RET
.Ldec_out_4:
@@ -642,20 +502,19 @@
movdqu STATE2, 0x20(STATEP)
movdqu STATE3, 0x30(STATEP)
movdqu STATE4, 0x40(STATEP)
- FRAME_END
- RET
-
.Ldec_out:
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_dec)
+SYM_FUNC_END(aegis128_aesni_dec)
/*
- * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
- * const void *src, void *dst);
+ * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src,
+ * u8 *dst, unsigned int len);
*/
-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail)
- FRAME_BEGIN
+SYM_FUNC_START(aegis128_aesni_dec_tail)
+ .set STATEP, %rdi
+ .set SRC, %rsi
+ .set DST, %rdx
+ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -665,7 +524,8 @@
movdqu 0x40(STATEP), STATE4
/* decrypt message: */
- call __load_partial
+ mov LEN, %r9d
+ load_partial
pxor STATE1, MSG
pxor STATE4, MSG
@@ -673,17 +533,13 @@
pand STATE3, T1
pxor T1, MSG
- movdqa MSG, T0
- call __store_partial
+ mov %r9d, LEN
+ store_partial MSG
/* mask with byte count: */
- movq LEN, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- punpcklbw T0, T0
- movdqa .Laegis128_counter(%rip), T1
- pcmpgtb T1, T0
+ lea .Lzeropad_mask+16(%rip), %rax
+ sub %r9, %rax
+ movdqu (%rax), T0
pand T0, MSG
aegis128_update
@@ -695,17 +551,19 @@
movdqu STATE1, 0x20(STATEP)
movdqu STATE2, 0x30(STATEP)
movdqu STATE3, 0x40(STATEP)
-
- FRAME_END
RET
-SYM_FUNC_END(crypto_aegis128_aesni_dec_tail)
+SYM_FUNC_END(aegis128_aesni_dec_tail)
/*
- * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
- * u64 assoclen, u64 cryptlen);
+ * void aegis128_aesni_final(struct aegis_state *state,
+ * struct aegis_block *tag_xor,
+ * unsigned int assoclen, unsigned int cryptlen);
*/
-SYM_FUNC_START(crypto_aegis128_aesni_final)
- FRAME_BEGIN
+SYM_FUNC_START(aegis128_aesni_final)
+ .set STATEP, %rdi
+ .set TAG_XOR, %rsi
+ .set ASSOCLEN, %edx
+ .set CRYPTLEN, %ecx
/* load the state: */
movdqu 0x00(STATEP), STATE0
@@ -715,10 +573,8 @@
movdqu 0x40(STATEP), STATE4
/* prepare length block: */
- movq %rdx, MSG
- movq %rcx, T0
- pslldq $8, T0
- pxor T0, MSG
+ movd ASSOCLEN, MSG
+ pinsrd $2, CRYPTLEN, MSG
psllq $3, MSG /* multiply by 8 (to get bit count) */
pxor STATE3, MSG
@@ -733,7 +589,7 @@
aegis128_update; pxor MSG, STATE3
/* xor tag: */
- movdqu (%rsi), MSG
+ movdqu (TAG_XOR), MSG
pxor STATE0, MSG
pxor STATE1, MSG
@@ -741,8 +597,6 @@
pxor STATE3, MSG
pxor STATE4, MSG
- movdqu MSG, (%rsi)
-
- FRAME_END
+ movdqu MSG, (TAG_XOR)
RET
-SYM_FUNC_END(crypto_aegis128_aesni_final)
+SYM_FUNC_END(aegis128_aesni_final)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 4623189..c19d8e3 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* The AEGIS-128 Authenticated-Encryption Algorithm
- * Glue for AES-NI + SSE2 implementation
+ * Glue for AES-NI + SSE4.1 implementation
*
* Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
* Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
@@ -23,27 +23,6 @@
#define AEGIS128_MIN_AUTH_SIZE 8
#define AEGIS128_MAX_AUTH_SIZE 16
-asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv);
-
-asmlinkage void crypto_aegis128_aesni_ad(
- void *state, unsigned int length, const void *data);
-
-asmlinkage void crypto_aegis128_aesni_enc(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_dec(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_enc_tail(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_dec_tail(
- void *state, unsigned int length, const void *src, void *dst);
-
-asmlinkage void crypto_aegis128_aesni_final(
- void *state, void *tag_xor, unsigned int cryptlen,
- unsigned int assoclen);
-
struct aegis_block {
u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN);
};
@@ -56,15 +35,31 @@ struct aegis_ctx {
struct aegis_block key;
};
-struct aegis_crypt_ops {
- int (*skcipher_walk_init)(struct skcipher_walk *walk,
- struct aead_request *req, bool atomic);
+asmlinkage void aegis128_aesni_init(struct aegis_state *state,
+ const struct aegis_block *key,
+ const u8 iv[AEGIS128_NONCE_SIZE]);
- void (*crypt_blocks)(void *state, unsigned int length, const void *src,
- void *dst);
- void (*crypt_tail)(void *state, unsigned int length, const void *src,
- void *dst);
-};
+asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src,
+ u8 *dst, unsigned int len);
+
+asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src,
+ u8 *dst, unsigned int len);
+
+asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state,
+ const u8 *src, u8 *dst,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state,
+ const u8 *src, u8 *dst,
+ unsigned int len);
+
+asmlinkage void aegis128_aesni_final(struct aegis_state *state,
+ struct aegis_block *tag_xor,
+ unsigned int assoclen,
+ unsigned int cryptlen);
static void crypto_aegis128_aesni_process_ad(
struct aegis_state *state, struct scatterlist *sg_src,
@@ -85,16 +80,15 @@ static void crypto_aegis128_aesni_process_ad(
if (pos > 0) {
unsigned int fill = AEGIS128_BLOCK_SIZE - pos;
memcpy(buf.bytes + pos, src, fill);
- crypto_aegis128_aesni_ad(state,
- AEGIS128_BLOCK_SIZE,
- buf.bytes);
+ aegis128_aesni_ad(state, buf.bytes,
+ AEGIS128_BLOCK_SIZE);
pos = 0;
left -= fill;
src += fill;
}
- crypto_aegis128_aesni_ad(state, left, src);
-
+ aegis128_aesni_ad(state, src,
+ left & ~(AEGIS128_BLOCK_SIZE - 1));
src += left & ~(AEGIS128_BLOCK_SIZE - 1);
left &= AEGIS128_BLOCK_SIZE - 1;
}
@@ -110,24 +104,37 @@ static void crypto_aegis128_aesni_process_ad(
if (pos > 0) {
memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos);
- crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes);
+ aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE);
}
}
-static void crypto_aegis128_aesni_process_crypt(
- struct aegis_state *state, struct skcipher_walk *walk,
- const struct aegis_crypt_ops *ops)
+static __always_inline void
+crypto_aegis128_aesni_process_crypt(struct aegis_state *state,
+ struct skcipher_walk *walk, bool enc)
{
while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
- ops->crypt_blocks(state,
- round_down(walk->nbytes, AEGIS128_BLOCK_SIZE),
- walk->src.virt.addr, walk->dst.virt.addr);
+ if (enc)
+ aegis128_aesni_enc(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes,
+ AEGIS128_BLOCK_SIZE));
+ else
+ aegis128_aesni_dec(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes,
+ AEGIS128_BLOCK_SIZE));
skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
}
if (walk->nbytes) {
- ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
- walk->dst.virt.addr);
+ if (enc)
+ aegis128_aesni_enc_tail(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ walk->nbytes);
+ else
+ aegis128_aesni_dec_tail(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ walk->nbytes);
skcipher_walk_done(walk, 0);
}
}
@@ -162,42 +169,39 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm,
return 0;
}
-static void crypto_aegis128_aesni_crypt(struct aead_request *req,
- struct aegis_block *tag_xor,
- unsigned int cryptlen,
- const struct aegis_crypt_ops *ops)
+static __always_inline void
+crypto_aegis128_aesni_crypt(struct aead_request *req,
+ struct aegis_block *tag_xor,
+ unsigned int cryptlen, bool enc)
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
struct skcipher_walk walk;
struct aegis_state state;
- ops->skcipher_walk_init(&walk, req, true);
+ if (enc)
+ skcipher_walk_aead_encrypt(&walk, req, true);
+ else
+ skcipher_walk_aead_decrypt(&walk, req, true);
kernel_fpu_begin();
- crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
+ aegis128_aesni_init(&state, &ctx->key, req->iv);
crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis128_aesni_process_crypt(&state, &walk, ops);
- crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
+ crypto_aegis128_aesni_process_crypt(&state, &walk, enc);
+ aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
}
static int crypto_aegis128_aesni_encrypt(struct aead_request *req)
{
- static const struct aegis_crypt_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_encrypt,
- .crypt_blocks = crypto_aegis128_aesni_enc,
- .crypt_tail = crypto_aegis128_aesni_enc_tail,
- };
-
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_block tag = {};
unsigned int authsize = crypto_aead_authsize(tfm);
unsigned int cryptlen = req->cryptlen;
- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true);
scatterwalk_map_and_copy(tag.bytes, req->dst,
req->assoclen + cryptlen, authsize, 1);
@@ -208,12 +212,6 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
{
static const struct aegis_block zeros = {};
- static const struct aegis_crypt_ops OPS = {
- .skcipher_walk_init = skcipher_walk_aead_decrypt,
- .crypt_blocks = crypto_aegis128_aesni_dec,
- .crypt_tail = crypto_aegis128_aesni_dec_tail,
- };
-
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_block tag;
unsigned int authsize = crypto_aead_authsize(tfm);
@@ -222,27 +220,16 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req)
scatterwalk_map_and_copy(tag.bytes, req->src,
req->assoclen + cryptlen, authsize, 0);
- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS);
+ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false);
return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0;
}
-static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead)
-{
- return 0;
-}
-
-static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead)
-{
-}
-
static struct aead_alg crypto_aegis128_aesni_alg = {
.setkey = crypto_aegis128_aesni_setkey,
.setauthsize = crypto_aegis128_aesni_setauthsize,
.encrypt = crypto_aegis128_aesni_encrypt,
.decrypt = crypto_aegis128_aesni_decrypt,
- .init = crypto_aegis128_aesni_init_tfm,
- .exit = crypto_aegis128_aesni_exit_tfm,
.ivsize = AEGIS128_NONCE_SIZE,
.maxauthsize = AEGIS128_MAX_AUTH_SIZE,
@@ -267,7 +254,7 @@ static struct simd_aead_alg *simd_alg;
static int __init crypto_aegis128_aesni_module_init(void)
{
- if (!boot_cpu_has(X86_FEATURE_XMM2) ||
+ if (!boot_cpu_has(X86_FEATURE_XMM4_1) ||
!boot_cpu_has(X86_FEATURE_AES) ||
!cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL))
return -ENODEV;
@@ -286,6 +273,6 @@ module_exit(crypto_aegis128_aesni_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
-MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation");
+MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation");
MODULE_ALIAS_CRYPTO("aegis128");
MODULE_ALIAS_CRYPTO("aegis128-aesni");
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index b0dd835..fbf4348 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1747,7 +1747,7 @@ static void __exit aesni_exit(void)
unregister_avx_algs();
}
-late_initcall(aesni_init);
+module_init(aesni_init);
module_exit(aesni_exit);
MODULE_DESCRIPTION("AES cipher and modes, optimized with AES-NI or VAES instructions");
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index b4e460a..fb95a61 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -487,79 +487,3 @@
FRAME_END
RET;
SYM_FUNC_END(cast5_cbc_dec_16way)
-
-SYM_FUNC_START(cast5_ctr_16way)
- /* input:
- * %rdi: ctx
- * %rsi: dst
- * %rdx: src
- * %rcx: iv (big endian, 64bit)
- */
- FRAME_BEGIN
- pushq %r12;
- pushq %r15;
-
- movq %rdi, CTX;
- movq %rsi, %r11;
- movq %rdx, %r12;
-
- vpcmpeqd RTMP, RTMP, RTMP;
- vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
-
- vpcmpeqd RKR, RKR, RKR;
- vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
- vmovdqa .Lbswap_iv_mask(%rip), R1ST;
- vmovdqa .Lbswap128_mask(%rip), RKM;
-
- /* load IV and byteswap */
- vmovq (%rcx), RX;
- vpshufb R1ST, RX, RX;
-
- /* construct IVs */
- vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
- vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
- vpsubq RKR, RX, RX;
- vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
-
- /* store last IV */
- vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
- vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
- vmovq RX, (%rcx);
-
- call __cast5_enc_blk16;
-
- /* dst = src ^ iv */
- vpxor (0*16)(%r12), RR1, RR1;
- vpxor (1*16)(%r12), RL1, RL1;
- vpxor (2*16)(%r12), RR2, RR2;
- vpxor (3*16)(%r12), RL2, RL2;
- vpxor (4*16)(%r12), RR3, RR3;
- vpxor (5*16)(%r12), RL3, RL3;
- vpxor (6*16)(%r12), RR4, RR4;
- vpxor (7*16)(%r12), RL4, RL4;
- vmovdqu RR1, (0*16)(%r11);
- vmovdqu RL1, (1*16)(%r11);
- vmovdqu RR2, (2*16)(%r11);
- vmovdqu RL2, (3*16)(%r11);
- vmovdqu RR3, (4*16)(%r11);
- vmovdqu RL3, (5*16)(%r11);
- vmovdqu RR4, (6*16)(%r11);
- vmovdqu RL4, (7*16)(%r11);
-
- popq %r15;
- popq %r12;
- FRAME_END
- RET;
-SYM_FUNC_END(cast5_ctr_16way)
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index feccb52..52c5d47 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -41,7 +41,7 @@
*/
#define CRC32C_PCL_BREAKEVEN 512
-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
+asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len,
unsigned int crc_init);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index bbcff1f..752812b 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -7,6 +7,7 @@
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
*
* Copyright (C) 2012 Intel Corporation.
+ * Copyright 2024 Google LLC
*
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
@@ -44,185 +45,129 @@
*/
#include <linux/linkage.h>
-#include <asm/nospec-branch.h>
## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
-.macro LABEL prefix n
-.L\prefix\n\():
-.endm
-
-.macro JMPTBL_ENTRY i
-.quad .Lcrc_\i
-.endm
-
-.macro JNC_LESS_THAN j
- jnc .Lless_than_\j
-.endm
-
-# Define threshold where buffers are considered "small" and routed to more
-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
-# SMALL_SIZE can be no larger than 255.
-
+# Define threshold below which buffers are considered "small" and routed to
+# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200
-.if (SMALL_SIZE > 255)
-.error "SMALL_ SIZE must be < 256"
-.endif
-
-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
+# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init);
.text
SYM_FUNC_START(crc_pcl)
-#define bufp rdi
-#define bufp_dw %edi
-#define bufp_w %di
-#define bufp_b %dil
-#define bufptmp %rcx
-#define block_0 %rcx
-#define block_1 %rdx
-#define block_2 %r11
-#define len %rsi
-#define len_dw %esi
-#define len_w %si
-#define len_b %sil
-#define crc_init_arg %rdx
-#define tmp %rbx
-#define crc_init %r8
-#define crc_init_dw %r8d
-#define crc1 %r9
-#define crc2 %r10
+#define bufp %rdi
+#define bufp_d %edi
+#define len %esi
+#define crc_init %edx
+#define crc_init_q %rdx
+#define n_misaligned %ecx /* overlaps chunk_bytes! */
+#define n_misaligned_q %rcx
+#define chunk_bytes %ecx /* overlaps n_misaligned! */
+#define chunk_bytes_q %rcx
+#define crc1 %r8
+#define crc2 %r9
- pushq %rbx
- pushq %rdi
- pushq %rsi
-
- ## Move crc_init for Linux to a different
- mov crc_init_arg, crc_init
+ cmp $SMALL_SIZE, len
+ jb .Lsmall
################################################################
## 1) ALIGN:
################################################################
-
- mov %bufp, bufptmp # rdi = *buf
- neg %bufp
- and $7, %bufp # calculate the unalignment amount of
+ mov bufp_d, n_misaligned
+ neg n_misaligned
+ and $7, n_misaligned # calculate the misalignment amount of
# the address
- je .Lproc_block # Skip if aligned
+ je .Laligned # Skip if aligned
- ## If len is less than 8 and we're unaligned, we need to jump
- ## to special code to avoid reading beyond the end of the buffer
- cmp $8, len
- jae .Ldo_align
- # less_than_8 expects length in upper 3 bits of len_dw
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
- shl $32-3+1, len_dw
- jmp .Lless_than_8_post_shl1
-
+ # Process 1 <= n_misaligned <= 7 bytes individually in order to align
+ # the remaining data to an 8-byte boundary.
.Ldo_align:
- #### Calculate CRC of unaligned bytes of the buffer (if any)
- movq (bufptmp), tmp # load a quadward from the buffer
- add %bufp, bufptmp # align buffer pointer for quadword
- # processing
- sub %bufp, len # update buffer length
+ movq (bufp), %rax
+ add n_misaligned_q, bufp
+ sub n_misaligned, len
.Lalign_loop:
- crc32b %bl, crc_init_dw # compute crc32 of 1-byte
- shr $8, tmp # get next byte
- dec %bufp
+ crc32b %al, crc_init # compute crc32 of 1-byte
+ shr $8, %rax # get next byte
+ dec n_misaligned
jne .Lalign_loop
-
-.Lproc_block:
+.Laligned:
################################################################
- ## 2) PROCESS BLOCKS:
+ ## 2) PROCESS BLOCK:
################################################################
- ## compute num of bytes to be processed
- movq len, tmp # save num bytes in tmp
-
- cmpq $128*24, len
+ cmp $128*24, len
jae .Lfull_block
-.Lcontinue_block:
- cmpq $SMALL_SIZE, len
- jb .Lsmall
+.Lpartial_block:
+ # Compute floor(len / 24) to get num qwords to process from each lane.
+ imul $2731, len, %eax # 2731 = ceil(2^16 / 24)
+ shr $16, %eax
+ jmp .Lcrc_3lanes
- ## len < 128*24
- movq $2731, %rax # 2731 = ceil(2^16 / 24)
- mul len_dw
- shrq $16, %rax
-
- ## eax contains floor(bytes / 24) = num 24-byte chunks to do
-
- ## process rax 24-byte chunks (128 >= rax >= 0)
-
- ## compute end address of each block
- ## block 0 (base addr + RAX * 8)
- ## block 1 (base addr + RAX * 16)
- ## block 2 (base addr + RAX * 24)
- lea (bufptmp, %rax, 8), block_0
- lea (block_0, %rax, 8), block_1
- lea (block_1, %rax, 8), block_2
-
- xor crc1, crc1
- xor crc2, crc2
-
- ## branch into array
- leaq jump_table(%rip), %bufp
- mov (%bufp,%rax,8), %bufp
- JMP_NOSPEC bufp
-
- ################################################################
- ## 2a) PROCESS FULL BLOCKS:
- ################################################################
.Lfull_block:
- movl $128,%eax
- lea 128*8*2(block_0), block_1
- lea 128*8*3(block_0), block_2
- add $128*8*1, block_0
+ # Processing 128 qwords from each lane.
+ mov $128, %eax
- xor crc1,crc1
+ ################################################################
+ ## 3) CRC each of three lanes:
+ ################################################################
+
+.Lcrc_3lanes:
+ xor crc1,crc1
xor crc2,crc2
+ mov %eax, chunk_bytes
+ shl $3, chunk_bytes # num bytes to process from each lane
+ sub $5, %eax # 4 for 4x_loop, 1 for special last iter
+ jl .Lcrc_3lanes_4x_done
- # Fall through into top of crc array (crc_128)
+ # Unroll the loop by a factor of 4 to reduce the overhead of the loop
+ # bookkeeping instructions, which can compete with crc32q for the ALUs.
+.Lcrc_3lanes_4x_loop:
+ crc32q (bufp), crc_init_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ crc32q 8(bufp), crc_init_q
+ crc32q 8(bufp,chunk_bytes_q), crc1
+ crc32q 8(bufp,chunk_bytes_q,2), crc2
+ crc32q 16(bufp), crc_init_q
+ crc32q 16(bufp,chunk_bytes_q), crc1
+ crc32q 16(bufp,chunk_bytes_q,2), crc2
+ crc32q 24(bufp), crc_init_q
+ crc32q 24(bufp,chunk_bytes_q), crc1
+ crc32q 24(bufp,chunk_bytes_q,2), crc2
+ add $32, bufp
+ sub $4, %eax
+ jge .Lcrc_3lanes_4x_loop
- ################################################################
- ## 3) CRC Array:
- ################################################################
+.Lcrc_3lanes_4x_done:
+ add $4, %eax
+ jz .Lcrc_3lanes_last_qword
- i=128
-.rept 128-1
-.altmacro
-LABEL crc_ %i
-.noaltmacro
- ENDBR
- crc32q -i*8(block_0), crc_init
- crc32q -i*8(block_1), crc1
- crc32q -i*8(block_2), crc2
- i=(i-1)
-.endr
+.Lcrc_3lanes_1x_loop:
+ crc32q (bufp), crc_init_q
+ crc32q (bufp,chunk_bytes_q), crc1
+ crc32q (bufp,chunk_bytes_q,2), crc2
+ add $8, bufp
+ dec %eax
+ jnz .Lcrc_3lanes_1x_loop
-.altmacro
-LABEL crc_ %i
-.noaltmacro
- ENDBR
- crc32q -i*8(block_0), crc_init
- crc32q -i*8(block_1), crc1
-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
-
- mov block_2, block_0
+.Lcrc_3lanes_last_qword:
+ crc32q (bufp), crc_init_q
+ crc32q (bufp,chunk_bytes_q), crc1
+# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet
################################################################
## 4) Combine three results:
################################################################
- lea (K_table-8)(%rip), %bufp # first entry is for idx 1
- shlq $3, %rax # rax *= 8
- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2
- leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
- subq %rax, tmp # tmp -= rax*24
+ lea (K_table-8)(%rip), %rax # first entry is for idx 1
+ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
+ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
+ sub %eax, len # len -= chunk_bytes * 3
- movq crc_init, %xmm1 # CRC for block 1
+ movq crc_init_q, %xmm1 # CRC for block 1
pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2
movq crc1, %xmm2 # CRC for block 2
@@ -230,103 +175,54 @@
pxor %xmm2,%xmm1
movq %xmm1, %rax
- xor -i*8(block_2), %rax
- mov crc2, crc_init
- crc32 %rax, crc_init
+ xor (bufp,chunk_bytes_q,2), %rax
+ mov crc2, crc_init_q
+ crc32 %rax, crc_init_q
+ lea 8(bufp,chunk_bytes_q,2), bufp
################################################################
- ## 5) Check for end:
+ ## 5) If more blocks remain, goto (2):
################################################################
-LABEL crc_ 0
- ENDBR
- mov tmp, len
- cmp $128*24, tmp
- jae .Lfull_block
- cmp $24, tmp
- jae .Lcontinue_block
-
-.Lless_than_24:
- shl $32-4, len_dw # less_than_16 expects length
- # in upper 4 bits of len_dw
- jnc .Lless_than_16
- crc32q (bufptmp), crc_init
- crc32q 8(bufptmp), crc_init
- jz .Ldo_return
- add $16, bufptmp
- # len is less than 8 if we got here
- # less_than_8 expects length in upper 3 bits of len_dw
- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
- shl $2, len_dw
- jmp .Lless_than_8_post_shl1
+ cmp $128*24, len
+ jae .Lfull_block
+ cmp $SMALL_SIZE, len
+ jae .Lpartial_block
#######################################################################
- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
+ ## 6) Process any remainder without interleaving:
#######################################################################
.Lsmall:
- shl $32-8, len_dw # Prepare len_dw for less_than_256
- j=256
-.rept 5 # j = {256, 128, 64, 32, 16}
-.altmacro
-LABEL less_than_ %j # less_than_j: Length should be in
- # upper lg(j) bits of len_dw
- j=(j/2)
- shl $1, len_dw # Get next MSB
- JNC_LESS_THAN %j
-.noaltmacro
- i=0
-.rept (j/8)
- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
- i=i+8
-.endr
- jz .Ldo_return # Return if remaining length is zero
- add $j, bufptmp # Advance buf
-.endr
-
-.Lless_than_8: # Length should be stored in
- # upper 3 bits of len_dw
- shl $1, len_dw
-.Lless_than_8_post_shl1:
- jnc .Lless_than_4
- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
- jz .Ldo_return # return if remaining data is zero
- add $4, bufptmp
-.Lless_than_4: # Length should be stored in
- # upper 2 bits of len_dw
- shl $1, len_dw
- jnc .Lless_than_2
- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
- jz .Ldo_return # return if remaining data is zero
- add $2, bufptmp
-.Lless_than_2: # Length should be stored in the MSB
- # of len_dw
- shl $1, len_dw
- jnc .Lless_than_1
- crc32b (bufptmp), crc_init_dw # CRC of 1 byte
-.Lless_than_1: # Length should be zero
-.Ldo_return:
- movq crc_init, %rax
- popq %rsi
- popq %rdi
- popq %rbx
+ test len, len
+ jz .Ldone
+ mov len, %eax
+ shr $3, %eax
+ jz .Ldo_dword
+.Ldo_qwords:
+ crc32q (bufp), crc_init_q
+ add $8, bufp
+ dec %eax
+ jnz .Ldo_qwords
+.Ldo_dword:
+ test $4, len
+ jz .Ldo_word
+ crc32l (bufp), crc_init
+ add $4, bufp
+.Ldo_word:
+ test $2, len
+ jz .Ldo_byte
+ crc32w (bufp), crc_init
+ add $2, bufp
+.Ldo_byte:
+ test $1, len
+ jz .Ldone
+ crc32b (bufp), crc_init
+.Ldone:
+ mov crc_init, %eax
RET
SYM_FUNC_END(crc_pcl)
.section .rodata, "a", @progbits
- ################################################################
- ## jump table Table is 129 entries x 2 bytes each
- ################################################################
-.align 4
-jump_table:
- i=0
-.rept 129
-.altmacro
-JMPTBL_ENTRY %i
-.noaltmacro
- i=i+1
-.endr
-
-
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S
index 324686b..b7ea3e8 100644
--- a/arch/x86/entry/entry.S
+++ b/arch/x86/entry/entry.S
@@ -51,3 +51,19 @@
.popsection
THUNK warn_thunk_thunk, __warn_thunk
+
+#ifndef CONFIG_X86_64
+/*
+ * Clang's implementation of TLS stack cookies requires the variable in
+ * question to be a TLS variable. If the variable happens to be defined as an
+ * ordinary variable with external linkage in the same compilation unit (which
+ * amounts to the whole of vmlinux with LTO enabled), Clang will drop the
+ * segment register prefix from the references, resulting in broken code. Work
+ * around this by avoiding the symbol used in -mstack-protector-guard-symbol=
+ * entirely in the C code, and use an alias emitted by the linker script
+ * instead.
+ */
+#ifdef CONFIG_STACKPROTECTOR
+EXPORT_SYMBOL(__ref_stack_chk_guard);
+#endif
+#endif
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 534c74b..4d0fb2f 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -468,3 +468,7 @@
460 i386 lsm_set_self_attr sys_lsm_set_self_attr
461 i386 lsm_list_modules sys_lsm_list_modules
462 i386 mseal sys_mseal
+463 i386 setxattrat sys_setxattrat
+464 i386 getxattrat sys_getxattrat
+465 i386 listxattrat sys_listxattrat
+466 i386 removexattrat sys_removexattrat
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7093ee2..5eb708b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -386,6 +386,10 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 25466c4..3674006 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -20,3 +20,6 @@
extern void cmpxchg8b_emu(void);
#endif
+#if defined(__GENKSYMS__) && defined(CONFIG_STACKPROTECTOR)
+extern unsigned long __ref_stack_chk_guard;
+#endif
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 46cdc94..ac1e627 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -5,9 +5,6 @@
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
#define MAP_ABOVE4G 0x80 /* only map above 4GB */
-/* Flags for map_shadow_stack(2) */
-#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */
-
#include <asm-generic/mman.h>
#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index fab5cae..823f44f 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -924,6 +924,17 @@ static void init_amd_zen4(struct cpuinfo_x86 *c)
{
if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+ /*
+ * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE
+ * in some BIOS versions but they can lead to random host reboots.
+ */
+ switch (c->x86_model) {
+ case 0x18 ... 0x1f:
+ case 0x60 ... 0x7f:
+ clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD);
+ break;
+ }
}
static void init_amd_zen5(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a5f221ea5..f43bb97 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2089,8 +2089,10 @@ void syscall_init(void)
#ifdef CONFIG_STACKPROTECTOR
DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
+#ifndef CONFIG_SMP
EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
#endif
+#endif
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 9ace844..eb5848d 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -901,19 +901,15 @@ static struct miscdevice sgx_dev_provision = {
int sgx_set_attribute(unsigned long *allowed_attributes,
unsigned int attribute_fd)
{
- struct fd f = fdget(attribute_fd);
+ CLASS(fd, f)(attribute_fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EINVAL;
- if (fd_file(f)->f_op != &sgx_provision_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &sgx_provision_fops)
return -EINVAL;
- }
*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
-
- fdput(f);
return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index b8c5741..6519984 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -491,6 +491,9 @@
. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE");
+/* needed for Clang - see arch/x86/entry/entry.S */
+PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
+
#ifdef CONFIG_X86_64
/*
* Per-cpu symbols which need to be offset from __per_cpu_load
@@ -528,3 +531,22 @@
#endif
#endif /* CONFIG_X86_64 */
+
+/*
+ * The symbols below are referenced using relative relocations in the
+ * respective ELF notes. This produces build time constants that the
+ * linker will never mark as relocatable. (Using just ABSOLUTE() is not
+ * sufficient for that).
+ */
+#ifdef CONFIG_XEN
+#ifdef CONFIG_XEN_PV
+xen_elfnote_entry_value =
+ ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen);
+#endif
+xen_elfnote_hypercall_page_value =
+ ABSOLUTE(xen_elfnote_hypercall_page) + ABSOLUTE(hypercall_page);
+#endif
+#ifdef CONFIG_PVH
+xen_elfnote_phys32_entry_value =
+ ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET);
+#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2098dc6..95c6beb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2629,19 +2629,26 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic->apicv_active) {
- /* irr_pending is always true when apicv is activated. */
- apic->irr_pending = true;
+ /*
+ * When APICv is enabled, KVM must always search the IRR for a pending
+ * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU
+ * isn't running. If APICv is disabled, KVM _should_ search the IRR
+ * for a pending IRQ. But KVM currently doesn't ensure *all* hardware,
+ * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching
+ * the IRR at this time could race with IRQ delivery from hardware that
+ * still sees APICv as being enabled.
+ *
+ * FIXME: Ensure other vCPUs and devices observe the change in APICv
+ * state prior to updating KVM's metadata caches, so that KVM
+ * can safely search the IRR and set irr_pending accordingly.
+ */
+ apic->irr_pending = true;
+
+ if (apic->apicv_active)
apic->isr_count = 1;
- } else {
- /*
- * Don't clear irr_pending, searching the IRR can race with
- * updates from the CPU as APICv is still active from hardware's
- * perspective. The flag will be cleared as appropriate when
- * KVM injects the interrupt.
- */
+ else
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
- }
+
apic->highest_isr_cache = -1;
}
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0b851ef..92d4711 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -450,8 +450,11 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
goto e_free;
/* This needs to happen after SEV/SNP firmware initialization. */
- if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm))
- goto e_free;
+ if (vm_type == KVM_X86_SNP_VM) {
+ ret = snp_guest_req_init(kvm);
+ if (ret)
+ goto e_free;
+ }
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
@@ -530,17 +533,12 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
static int __sev_issue_cmd(int fd, int id, void *data, int *error)
{
- struct fd f;
- int ret;
+ CLASS(fd, f)(fd);
- f = fdget(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- ret = sev_issue_cmd_external_user(fd_file(f), id, data, error);
-
- fdput(f);
- return ret;
+ return sev_issue_cmd_external_user(fd_file(f), id, data, error);
}
static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
@@ -2073,23 +2071,21 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_info *src_sev, *cg_cleanup_sev;
- struct fd f = fdget(source_fd);
+ CLASS(fd, f)(source_fd);
struct kvm *source_kvm;
bool charged = false;
int ret;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- if (!file_is_kvm(fd_file(f))) {
- ret = -EBADF;
- goto out_fput;
- }
+ if (!file_is_kvm(fd_file(f)))
+ return -EBADF;
source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
- goto out_fput;
+ return ret;
if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
sev_guest(kvm) || !sev_guest(source_kvm)) {
@@ -2136,8 +2132,6 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
cg_cleanup_sev->misc_cg = NULL;
out_unlock:
sev_unlock_two_vms(kvm, source_kvm);
-out_fput:
- fdput(f);
return ret;
}
@@ -2212,10 +2206,6 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (sev->snp_context)
return -EINVAL;
- sev->snp_context = snp_context_create(kvm, argp);
- if (!sev->snp_context)
- return -ENOTTY;
-
if (params.flags)
return -EINVAL;
@@ -2230,6 +2220,10 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET)
return -EINVAL;
+ sev->snp_context = snp_context_create(kvm, argp);
+ if (!sev->snp_context)
+ return -ENOTTY;
+
start.gctx_paddr = __psp_pa(sev->snp_context);
start.policy = params.policy;
memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw));
@@ -2798,23 +2792,21 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
{
- struct fd f = fdget(source_fd);
+ CLASS(fd, f)(source_fd);
struct kvm *source_kvm;
struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- if (!file_is_kvm(fd_file(f))) {
- ret = -EBADF;
- goto e_source_fput;
- }
+ if (!file_is_kvm(fd_file(f)))
+ return -EBADF;
source_kvm = fd_file(f)->private_data;
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
- goto e_source_fput;
+ return ret;
/*
* Mirrors of mirrors should work, but let's not get silly. Also
@@ -2857,8 +2849,6 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
e_unlock:
sev_unlock_two_vms(kvm, source_kvm);
-e_source_fput:
- fdput(f);
return ret;
}
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index a8e7bc0..931a736 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1197,11 +1197,14 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
/*
- * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
- * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
- * full TLB flush from the guest's perspective. This is required even
- * if VPID is disabled in the host as KVM may need to synchronize the
- * MMU in response to the guest TLB flush.
+ * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
+ * same VPID as the host, and so architecturally, linear and combined
+ * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM
+ * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
+ * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This
+ * is required if VPID is disabled in KVM, as a TLB flush (there are no
+ * VPIDs) still occurs from L1's perspective, and KVM may need to
+ * synchronize the MMU in response to the guest TLB flush.
*
* Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
* EPT is a special snowflake, as guest-physical mappings aren't
@@ -2315,6 +2318,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
+ /*
+ * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
+ * same VPID as the host. Emulate this behavior by using vpid01 for L2
+ * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter
+ * and VM-Exit are architecturally required to flush VPID=0, but *only*
+ * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the
+ * required flushes), but doing so would cause KVM to over-flush. E.g.
+ * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
+ * and then runs L2 X again, then KVM can and should retain TLB entries
+ * for VPID12=1.
+ */
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
@@ -5950,6 +5964,12 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ /*
+ * Always flush the effective vpid02, i.e. never flush the current VPID
+ * and never explicitly flush vpid01. INVVPID targets a VPID, not a
+ * VMCS, and so whether or not the current vmcs12 has VPID enabled is
+ * irrelevant (and there may not be a loaded vmcs12).
+ */
vpid02 = nested_get_vpid02(vcpu);
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 81ed596..d28618e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -217,9 +217,11 @@ module_param(ple_window_shrink, uint, 0444);
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
-/* Default is SYSTEM mode, 1 for host-guest mode */
+/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
+#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
+#endif
struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
@@ -3216,7 +3218,7 @@ void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu))
+ if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
return nested_get_vpid02(vcpu);
return to_vmx(vcpu)->vpid;
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 70b02fc..8d29163 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -656,7 +656,8 @@ static bool memremap_is_setup_data(resource_size_t phys_addr,
paddr_next = data->next;
len = data->len;
- if ((phys_addr > paddr) && (phys_addr < (paddr + len))) {
+ if ((phys_addr > paddr) &&
+ (phys_addr < (paddr + sizeof(struct setup_data) + len))) {
memunmap(data);
return true;
}
@@ -718,7 +719,8 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
paddr_next = data->next;
len = data->len;
- if ((phys_addr > paddr) && (phys_addr < (paddr + len))) {
+ if ((phys_addr > paddr) &&
+ (phys_addr < (paddr + sizeof(struct setup_data) + len))) {
early_memunmap(data, sizeof(*data));
return true;
}
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 230f1de..e17e6e2 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -22,7 +22,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/memblock.h>
#include <linux/pgtable.h>
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 64fca49..4733a5f 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -6,7 +6,9 @@
.code32
.text
+#ifdef CONFIG_X86_32
#define _pa(x) ((x) - __START_KERNEL_map)
+#endif
#define rva(x) ((x) - pvh_start_xen)
#include <linux/elfnote.h>
@@ -52,7 +54,7 @@
#define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8)
#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
-SYM_CODE_START_LOCAL(pvh_start_xen)
+SYM_CODE_START(pvh_start_xen)
UNWIND_HINT_END_OF_STACK
cld
@@ -72,8 +74,7 @@
movl $0, %esp
leal rva(gdt)(%ebp), %eax
- leal rva(gdt_start)(%ebp), %ecx
- movl %ecx, 2(%eax)
+ addl %eax, 2(%eax)
lgdt (%eax)
mov $PVH_DS_SEL,%eax
@@ -103,10 +104,23 @@
btsl $_EFER_LME, %eax
wrmsr
+ /*
+ * Reuse the non-relocatable symbol emitted for the ELF note to
+ * subtract the build time physical address of pvh_start_xen() from
+ * its actual runtime address, without relying on absolute 32-bit ELF
+ * relocations, as these are not supported by the linker when running
+ * in -pie mode, and should be avoided in .head.text in general.
+ */
mov %ebp, %ebx
- subl $_pa(pvh_start_xen), %ebx /* offset */
+ subl rva(xen_elfnote_phys32_entry)(%ebp), %ebx
jz .Lpagetable_done
+ /*
+ * Store the resulting load offset in phys_base. __pa() needs
+ * phys_base set to calculate the hypercall page in xen_pvh_init().
+ */
+ movl %ebx, rva(phys_base)(%ebp)
+
/* Fixup page-tables for relocation. */
leal rva(pvh_init_top_pgt)(%ebp), %edi
movl $PTRS_PER_PGD, %ecx
@@ -165,20 +179,12 @@
xor %edx, %edx
wrmsr
- /*
- * Calculate load offset and store in phys_base. __pa() needs
- * phys_base set to calculate the hypercall page in xen_pvh_init().
- */
- movq %rbp, %rbx
- subq $_pa(pvh_start_xen), %rbx
- movq %rbx, phys_base(%rip)
- call xen_prepare_pvh
- /*
- * Clear phys_base. __startup_64 will *add* to its value,
- * so reset to 0.
- */
- xor %rbx, %rbx
- movq %rbx, phys_base(%rip)
+ /* Call xen_prepare_pvh() via the kernel virtual mapping */
+ leaq xen_prepare_pvh(%rip), %rax
+ subq phys_base(%rip), %rax
+ addq $__START_KERNEL_map, %rax
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rax
/* startup_64 expects boot_params in %rsi. */
lea pvh_bootparams(%rip), %rsi
@@ -217,8 +223,8 @@
.section ".init.data","aw"
.balign 8
SYM_DATA_START_LOCAL(gdt)
- .word gdt_end - gdt_start
- .long _pa(gdt_start) /* x86-64 will overwrite if relocated. */
+ .word gdt_end - gdt_start - 1
+ .long gdt_start - gdt
.word 0
SYM_DATA_END(gdt)
SYM_DATA_START_LOCAL(gdt_start)
@@ -300,5 +306,5 @@
.long KERNEL_IMAGE_SIZE - 1)
#endif
- ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
- _ASM_PTR (pvh_start_xen - __START_KERNEL_map))
+ ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, .global xen_elfnote_phys32_entry;
+ xen_elfnote_phys32_entry: _ASM_PTR xen_elfnote_phys32_entry_value - .)
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index c101bed..3ede19c 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -56,6 +56,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
[S_ABS] =
"^(xen_irq_disable_direct_reloc$|"
"xen_save_fl_direct_reloc$|"
+ "xen_elfnote_.+_offset$|"
"VDSO|"
"__kcfi_typeid_|"
"__crc_)",
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 758bcd4..7f6c69db 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -94,7 +94,8 @@
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
/* Map the p2m table to a 512GB-aligned user address. */
ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD))
- ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .globl xen_elfnote_entry;
+ xen_elfnote_entry: _ASM_PTR xen_elfnote_entry_value - .)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
@@ -115,7 +116,8 @@
#else
# define FEATURES_DOM0 0
#endif
- ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .globl xen_elfnote_hypercall_page;
+ xen_elfnote_hypercall_page: _ASM_PTR xen_elfnote_hypercall_page_value - .)
ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
.long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0)
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 67083fc..37effc1 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -433,3 +433,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 88e3ad7..2a4bd66 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -199,7 +199,7 @@ EXPORT_SYMBOL(bio_integrity_add_page);
static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
int nr_vecs, unsigned int len,
- unsigned int direction, u32 seed)
+ unsigned int direction)
{
bool write = direction == ITER_SOURCE;
struct bio_integrity_payload *bip;
@@ -247,7 +247,6 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
}
bip->bip_flags |= BIP_COPY_USER;
- bip->bip_iter.bi_sector = seed;
bip->bip_vcnt = nr_vecs;
return 0;
free_bip:
@@ -258,7 +257,7 @@ static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
}
static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
- int nr_vecs, unsigned int len, u32 seed)
+ int nr_vecs, unsigned int len)
{
struct bio_integrity_payload *bip;
@@ -267,7 +266,6 @@ static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
return PTR_ERR(bip);
memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
- bip->bip_iter.bi_sector = seed;
bip->bip_iter.bi_size = len;
bip->bip_vcnt = nr_vecs;
return 0;
@@ -303,8 +301,7 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
return nr_bvecs;
}
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
- u32 seed)
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
@@ -350,9 +347,9 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
if (copy)
ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
- direction, seed);
+ direction);
else
- ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
+ ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes);
if (ret)
goto release_pages;
if (bvec != stack_vec)
diff --git a/block/bio.c b/block/bio.c
index ac4d77c8..699a78c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1065,39 +1065,6 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio,
EXPORT_SYMBOL(bio_add_pc_page);
/**
- * bio_add_zone_append_page - attempt to add page to zone-append bio
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
- * for a zone-append request. This can fail for a number of reasons, such as the
- * bio being full or the target block device is not a zoned block device or
- * other limitations of the target block device. The target block device must
- * allow bio's up to PAGE_SIZE, so it is always possible to add a single page
- * to an empty bio.
- *
- * Returns: number of bytes added to the bio, or 0 in case of a failure.
- */
-int bio_add_zone_append_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- bool same_page = false;
-
- if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
- return 0;
-
- if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev)))
- return 0;
-
- return bio_add_hw_page(q, bio, page, len, offset,
- queue_max_zone_append_sectors(q), &same_page);
-}
-EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
-
-/**
* __bio_add_page - add page(s) to a bio in a new segment
* @bio: destination bio
* @page: start page to add
@@ -1206,21 +1173,12 @@ EXPORT_SYMBOL_GPL(__bio_release_pages);
void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
- size_t size = iov_iter_count(iter);
-
WARN_ON_ONCE(bio->bi_max_vecs);
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- size_t max_sectors = queue_max_zone_append_sectors(q);
-
- size = min(size, max_sectors << SECTOR_SHIFT);
- }
-
bio->bi_vcnt = iter->nr_segs;
bio->bi_io_vec = (struct bio_vec *)iter->bvec;
bio->bi_iter.bi_bvec_done = iter->iov_offset;
- bio->bi_iter.bi_size = size;
+ bio->bi_iter.bi_size = iov_iter_count(iter);
bio_set_flag(bio, BIO_CLONED);
}
@@ -1245,20 +1203,6 @@ static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
return 0;
}
-static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
- size_t len, size_t offset)
-{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- bool same_page = false;
-
- if (bio_add_hw_folio(q, bio, folio, len, offset,
- queue_max_zone_append_sectors(q), &same_page) != len)
- return -EINVAL;
- if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
- unpin_user_folio(folio, 1);
- return 0;
-}
-
static unsigned int get_contig_folio_len(unsigned int *num_pages,
struct page **pages, unsigned int i,
struct folio *folio, size_t left,
@@ -1365,14 +1309,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
len = get_contig_folio_len(&num_pages, pages, i,
folio, left, offset);
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- ret = bio_iov_add_zone_append_folio(bio, folio, len,
- folio_offset);
- if (ret)
- break;
- } else
- bio_iov_add_folio(bio, folio, len, folio_offset);
-
+ bio_iov_add_folio(bio, folio, len, folio_offset);
offset = 0;
}
@@ -1728,16 +1665,22 @@ struct bio *bio_split(struct bio *bio, int sectors,
{
struct bio *split;
- BUG_ON(sectors <= 0);
- BUG_ON(sectors >= bio_sectors(bio));
+ if (WARN_ON_ONCE(sectors <= 0))
+ return ERR_PTR(-EINVAL);
+ if (WARN_ON_ONCE(sectors >= bio_sectors(bio)))
+ return ERR_PTR(-EINVAL);
/* Zone append commands cannot be split */
if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
- return NULL;
+ return ERR_PTR(-EINVAL);
+
+ /* atomic writes cannot be split */
+ if (bio->bi_opf & REQ_ATOMIC)
+ return ERR_PTR(-EINVAL);
split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
if (!split)
- return NULL;
+ return ERR_PTR(-ENOMEM);
split->bi_iter.bi_size = sectors << 9;
diff --git a/block/blk-core.c b/block/blk-core.c
index bc5e8c5..666efe8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -261,6 +261,8 @@ static void blk_free_queue(struct request_queue *q)
blk_mq_release(q);
ida_free(&blk_queue_ida, q->id);
+ lockdep_unregister_key(&q->io_lock_cls_key);
+ lockdep_unregister_key(&q->q_lock_cls_key);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
@@ -278,18 +280,20 @@ void blk_put_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_put_queue);
-void blk_queue_start_drain(struct request_queue *q)
+bool blk_queue_start_drain(struct request_queue *q)
{
/*
* When queue DYING flag is set, we need to block new req
* entering queue, so we call blk_freeze_queue_start() to
* prevent I/O from crossing blk_queue_enter().
*/
- blk_freeze_queue_start(q);
+ bool freeze = __blk_freeze_queue_start(q, current);
if (queue_is_mq(q))
blk_mq_wake_waiters(q);
/* Make blk_queue_enter() reexamine the DYING flag. */
wake_up_all(&q->mq_freeze_wq);
+
+ return freeze;
}
/**
@@ -321,6 +325,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
return -ENODEV;
}
+ rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->q_lockdep_map, _RET_IP_);
return 0;
}
@@ -352,6 +358,8 @@ int __bio_queue_enter(struct request_queue *q, struct bio *bio)
goto dead;
}
+ rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
return 0;
dead:
bio_io_error(bio);
@@ -441,6 +449,12 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
if (error)
goto fail_stats;
+ lockdep_register_key(&q->io_lock_cls_key);
+ lockdep_register_key(&q->q_lock_cls_key);
+ lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)",
+ &q->io_lock_cls_key, 0);
+ lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)",
+ &q->q_lock_cls_key, 0);
q->nr_requests = BLKDEV_DEFAULT_RQ;
@@ -593,7 +607,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_IOERR;
/* Make sure the BIO is small enough and will not get split */
- if (nr_sectors > queue_max_zone_append_sectors(q))
+ if (nr_sectors > q->limits.max_zone_append_sectors)
return BLK_STS_IOERR;
bio->bi_opf |= REQ_NOMERGE;
@@ -1106,8 +1120,8 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
return;
plug->cur_ktime = 0;
- plug->mq_list = NULL;
- plug->cached_rq = NULL;
+ rq_list_init(&plug->mq_list);
+ rq_list_init(&plug->cached_rqs);
plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
plug->rq_count = 0;
plug->multiple_queues = false;
@@ -1203,7 +1217,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
* queue for cached requests, we don't want a blocked task holding
* up a queue freeze/quiesce event.
*/
- if (unlikely(!rq_list_empty(plug->cached_rq)))
+ if (unlikely(!rq_list_empty(&plug->cached_rqs)))
blk_mq_free_plug_rqs(plug);
plug->cur_ktime = 0;
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index b1e7415..29a2054 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -226,7 +226,7 @@ static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
split_bio = bio_split(bio, num_sectors, GFP_NOIO,
&crypto_bio_split);
- if (!split_bio) {
+ if (IS_ERR(split_bio)) {
bio->bi_status = BLK_STS_RESOURCE;
return false;
}
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 83b696b..b180cac 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -113,9 +113,9 @@ int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
EXPORT_SYMBOL(blk_rq_map_integrity_sg);
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
- ssize_t bytes, u32 seed)
+ ssize_t bytes)
{
- int ret = bio_integrity_map_user(rq->bio, ubuf, bytes, seed);
+ int ret = bio_integrity_map_user(rq->bio, ubuf, bytes);
if (ret)
return ret;
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 25dd4db..ce82770 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -32,13 +32,6 @@ static void get_io_context(struct io_context *ioc)
atomic_long_inc(&ioc->refcount);
}
-static void icq_free_icq_rcu(struct rcu_head *head)
-{
- struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
-
- kmem_cache_free(icq->__rcu_icq_cache, icq);
-}
-
/*
* Exit an icq. Called with ioc locked for blk-mq, and with both ioc
* and queue locked for legacy.
@@ -102,7 +95,7 @@ static void ioc_destroy_icq(struct io_cq *icq)
*/
icq->__rcu_icq_cache = et->icq_cache;
icq->flags |= ICQ_DESTROYED;
- call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
+ kfree_rcu(icq, __rcu_head);
}
/*
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ad763ec3..e0b28e9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -107,17 +107,18 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
{
- if (unlikely(split_sectors < 0)) {
- bio->bi_status = errno_to_blk_status(split_sectors);
- bio_endio(bio);
- return NULL;
- }
+ if (unlikely(split_sectors < 0))
+ goto error;
if (split_sectors) {
struct bio *split;
split = bio_split(bio, split_sectors, GFP_NOIO,
&bio->bi_bdev->bd_disk->bio_split);
+ if (IS_ERR(split)) {
+ split_sectors = PTR_ERR(split);
+ goto error;
+ }
split->bi_opf |= REQ_NOMERGE;
blkcg_bio_issue_init(split);
bio_chain(split, bio);
@@ -128,6 +129,10 @@ static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
}
return bio;
+error:
+ bio->bi_status = errno_to_blk_status(split_sectors);
+ bio_endio(bio);
+ return NULL;
}
struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
@@ -166,17 +171,6 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
return bio_submit_split(bio, split_sectors);
}
-struct bio *bio_split_write_zeroes(struct bio *bio,
- const struct queue_limits *lim, unsigned *nsegs)
-{
- *nsegs = 0;
- if (!lim->max_write_zeroes_sectors)
- return bio;
- if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
- return bio;
- return bio_submit_split(bio, lim->max_write_zeroes_sectors);
-}
-
static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim,
bool is_atomic)
{
@@ -211,7 +205,9 @@ static inline unsigned get_max_io_size(struct bio *bio,
* We ignore lim->max_sectors for atomic writes because it may less
* than the actual bio size, which we cannot tolerate.
*/
- if (is_atomic)
+ if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
+ max_sectors = lim->max_write_zeroes_sectors;
+ else if (is_atomic)
max_sectors = lim->atomic_write_max_sectors;
else
max_sectors = lim->max_sectors;
@@ -296,6 +292,14 @@ static bool bvec_split_segs(const struct queue_limits *lim,
return len > 0 || bv->bv_len > max_len;
}
+static unsigned int bio_split_alignment(struct bio *bio,
+ const struct queue_limits *lim)
+{
+ if (op_is_write(bio_op(bio)) && lim->zone_write_granularity)
+ return lim->zone_write_granularity;
+ return lim->logical_block_size;
+}
+
/**
* bio_split_rw_at - check if and where to split a read/write bio
* @bio: [in] bio to be split
@@ -358,7 +362,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
* split size so that each bio is properly block size aligned, even if
* we do not use the full hardware limits.
*/
- bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
+ bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim));
/*
* Bio splitting may cause subtle trouble such as hang when doing sync
@@ -388,16 +392,35 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
struct bio *bio_split_zone_append(struct bio *bio,
const struct queue_limits *lim, unsigned *nr_segs)
{
- unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim);
int split_sectors;
split_sectors = bio_split_rw_at(bio, lim, nr_segs,
- max_sectors << SECTOR_SHIFT);
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
if (WARN_ON_ONCE(split_sectors > 0))
split_sectors = -EINVAL;
return bio_submit_split(bio, split_sectors);
}
+struct bio *bio_split_write_zeroes(struct bio *bio,
+ const struct queue_limits *lim, unsigned *nsegs)
+{
+ unsigned int max_sectors = get_max_io_size(bio, lim);
+
+ *nsegs = 0;
+
+ /*
+ * An unset limit should normally not happen, as bio submission is keyed
+ * off having a non-zero limit. But SCSI can clear the limit in the
+ * I/O completion handler, and we can race and see this. Splitting to a
+ * zero limit obviously doesn't make sense, so band-aid it here.
+ */
+ if (!max_sectors)
+ return bio;
+ if (bio_sectors(bio) <= max_sectors)
+ return bio;
+ return bio_submit_split(bio, max_sectors);
+}
+
/**
* bio_split_to_limits - split a bio to fit the queue limits
* @bio: bio to be split
@@ -411,10 +434,9 @@ struct bio *bio_split_zone_append(struct bio *bio,
*/
struct bio *bio_split_to_limits(struct bio *bio)
{
- const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
- return __bio_split_to_limits(bio, lim, &nr_segs);
+ return __bio_split_to_limits(bio, bdev_limits(bio->bi_bdev), &nr_segs);
}
EXPORT_SYMBOL(bio_split_to_limits);
@@ -797,7 +819,7 @@ static inline void blk_update_mixed_merge(struct request *req,
static void blk_account_io_merge_request(struct request *req)
{
- if (blk_do_io_stat(req)) {
+ if (req->rq_flags & RQF_IO_STAT) {
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
part_stat_local_dec(req->part,
@@ -845,12 +867,13 @@ static struct request *attempt_merge(struct request_queue *q,
if (rq_data_dir(req) != rq_data_dir(next))
return NULL;
- /* Don't merge requests with different write hints. */
- if (req->write_hint != next->write_hint)
- return NULL;
-
- if (req->ioprio != next->ioprio)
- return NULL;
+ if (req->bio && next->bio) {
+ /* Don't merge requests with different write hints. */
+ if (req->bio->bi_write_hint != next->bio->bi_write_hint)
+ return NULL;
+ if (req->bio->bi_ioprio != next->bio->bi_ioprio)
+ return NULL;
+ }
if (!blk_atomic_write_mergeable_rqs(req, next))
return NULL;
@@ -979,12 +1002,13 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (!bio_crypt_rq_ctx_compatible(rq, bio))
return false;
- /* Don't merge requests with different write hints. */
- if (rq->write_hint != bio->bi_write_hint)
- return false;
-
- if (rq->ioprio != bio_prio(bio))
- return false;
+ if (rq->bio) {
+ /* Don't merge requests with different write hints. */
+ if (rq->bio->bi_write_hint != bio->bi_write_hint)
+ return false;
+ if (rq->bio->bi_ioprio != bio->bi_ioprio)
+ return false;
+ }
if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false)
return false;
@@ -1005,12 +1029,11 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
static void blk_account_io_merge_bio(struct request *req)
{
- if (!blk_do_io_stat(req))
- return;
-
- part_stat_lock();
- part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
- part_stat_unlock();
+ if (req->rq_flags & RQF_IO_STAT) {
+ part_stat_lock();
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_unlock();
+ }
}
enum bio_merge_status bio_attempt_back_merge(struct request *req,
@@ -1156,7 +1179,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
struct blk_plug *plug = current->plug;
struct request *rq;
- if (!plug || rq_list_empty(plug->mq_list))
+ if (!plug || rq_list_empty(&plug->mq_list))
return false;
rq_list_for_each(&plug->mq_list, rq) {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cf626e0..270cfd9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -92,7 +92,7 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
{
struct mq_inflight *mi = priv;
- if (rq->part && blk_do_io_stat(rq) &&
+ if (rq->rq_flags & RQF_IO_STAT &&
(!bdev_is_partition(mi->part) || rq->part == mi->part) &&
blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
mi->inflight[rq_data_dir(rq)]++;
@@ -120,9 +120,59 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
inflight[1] = mi.inflight[1];
}
-void blk_freeze_queue_start(struct request_queue *q)
+#ifdef CONFIG_LOCKDEP
+static bool blk_freeze_set_owner(struct request_queue *q,
+ struct task_struct *owner)
{
+ if (!owner)
+ return false;
+
+ if (!q->mq_freeze_depth) {
+ q->mq_freeze_owner = owner;
+ q->mq_freeze_owner_depth = 1;
+ return true;
+ }
+
+ if (owner == q->mq_freeze_owner)
+ q->mq_freeze_owner_depth += 1;
+ return false;
+}
+
+/* verify the last unfreeze in owner context */
+static bool blk_unfreeze_check_owner(struct request_queue *q)
+{
+ if (!q->mq_freeze_owner)
+ return false;
+ if (q->mq_freeze_owner != current)
+ return false;
+ if (--q->mq_freeze_owner_depth == 0) {
+ q->mq_freeze_owner = NULL;
+ return true;
+ }
+ return false;
+}
+
+#else
+
+static bool blk_freeze_set_owner(struct request_queue *q,
+ struct task_struct *owner)
+{
+ return false;
+}
+
+static bool blk_unfreeze_check_owner(struct request_queue *q)
+{
+ return false;
+}
+#endif
+
+bool __blk_freeze_queue_start(struct request_queue *q,
+ struct task_struct *owner)
+{
+ bool freeze;
+
mutex_lock(&q->mq_freeze_lock);
+ freeze = blk_freeze_set_owner(q, owner);
if (++q->mq_freeze_depth == 1) {
percpu_ref_kill(&q->q_usage_counter);
mutex_unlock(&q->mq_freeze_lock);
@@ -131,6 +181,14 @@ void blk_freeze_queue_start(struct request_queue *q)
} else {
mutex_unlock(&q->mq_freeze_lock);
}
+
+ return freeze;
+}
+
+void blk_freeze_queue_start(struct request_queue *q)
+{
+ if (__blk_freeze_queue_start(q, current))
+ blk_freeze_acquire_lock(q, false, false);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -149,35 +207,17 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
-/*
- * Guarantee no request is in use, so we can change any data structure of
- * the queue afterward.
- */
-void blk_freeze_queue(struct request_queue *q)
+void blk_mq_freeze_queue(struct request_queue *q)
{
- /*
- * In the !blk_mq case we are only calling this to kill the
- * q_usage_counter, otherwise this increases the freeze depth
- * and waits for it to return to zero. For this reason there is
- * no blk_unfreeze_queue(), and blk_freeze_queue() is not
- * exported to drivers as the only user for unfreeze is blk_mq.
- */
blk_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
}
-
-void blk_mq_freeze_queue(struct request_queue *q)
-{
- /*
- * ...just an alias to keep freeze and unfreeze actions balanced
- * in the blk_mq_* namespace
- */
- blk_freeze_queue(q);
-}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
+ bool unfreeze;
+
mutex_lock(&q->mq_freeze_lock);
if (force_atomic)
q->q_usage_counter.data->force_atomic = true;
@@ -187,16 +227,40 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
percpu_ref_resurrect(&q->q_usage_counter);
wake_up_all(&q->mq_freeze_wq);
}
+ unfreeze = blk_unfreeze_check_owner(q);
mutex_unlock(&q->mq_freeze_lock);
+
+ return unfreeze;
}
void blk_mq_unfreeze_queue(struct request_queue *q)
{
- __blk_mq_unfreeze_queue(q, false);
+ if (__blk_mq_unfreeze_queue(q, false))
+ blk_unfreeze_release_lock(q, false, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
/*
+ * non_owner variant of blk_freeze_queue_start
+ *
+ * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
+ * by the same task. This is fragile and should not be used if at all
+ * possible.
+ */
+void blk_freeze_queue_start_non_owner(struct request_queue *q)
+{
+ __blk_freeze_queue_start(q, NULL);
+}
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
+
+/* non_owner variant of blk_mq_unfreeze_queue */
+void blk_mq_unfreeze_queue_non_owner(struct request_queue *q)
+{
+ __blk_mq_unfreeze_queue(q, false);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner);
+
+/*
* FIXME: replace the scsi_internal_device_*block_nowait() calls in the
* mpt3sas driver such that this function can be removed.
*/
@@ -283,8 +347,9 @@ void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
if (!blk_queue_skip_tagset_quiesce(q))
blk_mq_quiesce_queue_nowait(q);
}
- blk_mq_wait_quiesce_done(set);
mutex_unlock(&set->tag_list_lock);
+
+ blk_mq_wait_quiesce_done(set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
@@ -331,14 +396,9 @@ EXPORT_SYMBOL(blk_rq_init);
/* Set start and alloc time when the allocated request is actually used */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
- if (blk_mq_need_time_stamp(rq))
- rq->start_time_ns = blk_time_get_ns();
- else
- rq->start_time_ns = 0;
-
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
if (blk_queue_rq_alloc_time(rq->q))
- rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns;
+ rq->alloc_time_ns = alloc_time_ns;
else
rq->alloc_time_ns = 0;
#endif
@@ -359,8 +419,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
if (data->flags & BLK_MQ_REQ_PM)
data->rq_flags |= RQF_PM;
- if (blk_queue_io_stat(q))
- data->rq_flags |= RQF_IO_STAT;
rq->rq_flags = data->rq_flags;
if (data->rq_flags & RQF_SCHED_TAGS) {
@@ -420,7 +478,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
prefetch(tags->static_rqs[tag]);
tag_mask &= ~(1UL << i);
rq = blk_mq_rq_ctx_init(data, tags, tag);
- rq_list_add(data->cached_rq, rq);
+ rq_list_add_head(data->cached_rqs, rq);
nr++;
}
if (!(data->rq_flags & RQF_SCHED_TAGS))
@@ -429,7 +487,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
data->nr_tags -= nr;
- return rq_list_pop(data->cached_rq);
+ return rq_list_pop(data->cached_rqs);
}
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
@@ -526,7 +584,7 @@ static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
.flags = flags,
.cmd_flags = opf,
.nr_tags = plug->nr_ios,
- .cached_rq = &plug->cached_rq,
+ .cached_rqs = &plug->cached_rqs,
};
struct request *rq;
@@ -551,14 +609,14 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (!plug)
return NULL;
- if (rq_list_empty(plug->cached_rq)) {
+ if (rq_list_empty(&plug->cached_rqs)) {
if (plug->nr_ios == 1)
return NULL;
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
if (!rq)
return NULL;
} else {
- rq = rq_list_peek(&plug->cached_rq);
+ rq = rq_list_peek(&plug->cached_rqs);
if (!rq || rq->q != q)
return NULL;
@@ -567,8 +625,8 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
return NULL;
- plug->cached_rq = rq_list_next(rq);
- blk_mq_rq_time_init(rq, 0);
+ rq_list_pop(&plug->cached_rqs);
+ blk_mq_rq_time_init(rq, blk_time_get_ns());
}
rq->cmd_flags = opf;
@@ -744,7 +802,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
struct request *rq;
- while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
+ while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL)
blk_mq_free_request(rq);
}
@@ -764,7 +822,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags);
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
- if (req->part && blk_do_io_stat(req)) {
+ if (req->rq_flags & RQF_IO_STAT) {
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
@@ -784,7 +842,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status)
blk_op_str(req_op(req)),
(__force u32)(req->cmd_flags & ~REQ_OP_MASK),
req->nr_phys_segments,
- IOPRIO_PRIO_CLASS(req->ioprio));
+ IOPRIO_PRIO_CLASS(req_get_ioprio(req)));
}
/*
@@ -982,8 +1040,7 @@ static inline void blk_account_io_done(struct request *req, u64 now)
* normal IO on queueing nor completion. Accounting the
* containing request is enough.
*/
- if (blk_do_io_stat(req) && req->part &&
- !(req->rq_flags & RQF_FLUSH_SEQ)) {
+ if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) {
const int sgrp = op_stat_group(req_op(req));
part_stat_lock();
@@ -996,28 +1053,63 @@ static inline void blk_account_io_done(struct request *req, u64 now)
}
}
+static inline bool blk_rq_passthrough_stats(struct request *req)
+{
+ struct bio *bio = req->bio;
+
+ if (!blk_queue_passthrough_stat(req->q))
+ return false;
+
+ /* Requests without a bio do not transfer data. */
+ if (!bio)
+ return false;
+
+ /*
+ * Stats are accumulated in the bdev, so must have one attached to a
+ * bio to track stats. Most drivers do not set the bdev for passthrough
+ * requests, but nvme is one that will set it.
+ */
+ if (!bio->bi_bdev)
+ return false;
+
+ /*
+ * We don't know what a passthrough command does, but we know the
+ * payload size and data direction. Ensuring the size is aligned to the
+ * block size filters out most commands with payloads that don't
+ * represent sector access.
+ */
+ if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1))
+ return false;
+ return true;
+}
+
static inline void blk_account_io_start(struct request *req)
{
trace_block_io_start(req);
- if (blk_do_io_stat(req)) {
- /*
- * All non-passthrough requests are created from a bio with one
- * exception: when a flush command that is part of a flush sequence
- * generated by the state machine in blk-flush.c is cloned onto the
- * lower device by dm-multipath we can get here without a bio.
- */
- if (req->bio)
- req->part = req->bio->bi_bdev;
- else
- req->part = req->q->disk->part0;
+ if (!blk_queue_io_stat(req->q))
+ return;
+ if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req))
+ return;
- part_stat_lock();
- update_io_ticks(req->part, jiffies, false);
- part_stat_local_inc(req->part,
- in_flight[op_is_write(req_op(req))]);
- part_stat_unlock();
- }
+ req->rq_flags |= RQF_IO_STAT;
+ req->start_time_ns = blk_time_get_ns();
+
+ /*
+ * All non-passthrough requests are created from a bio with one
+ * exception: when a flush command that is part of a flush sequence
+ * generated by the state machine in blk-flush.c is cloned onto the
+ * lower device by dm-multipath we can get here without a bio.
+ */
+ if (req->bio)
+ req->part = req->bio->bi_bdev;
+ else
+ req->part = req->q->disk->part0;
+
+ part_stat_lock();
+ update_io_ticks(req->part, jiffies, false);
+ part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]);
+ part_stat_unlock();
}
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
@@ -1300,8 +1392,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*/
if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
plug->has_elevator = true;
- rq->rq_next = NULL;
- rq_list_add(&plug->mq_list, rq);
+ rq_list_add_tail(&plug->mq_list, rq);
plug->rq_count++;
}
@@ -1698,7 +1789,6 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}
-EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
struct dispatch_rq_data {
struct blk_mq_hw_ctx *hctx;
@@ -2200,6 +2290,24 @@ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
+static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx)
+{
+ bool need_run;
+
+ /*
+ * When queue is quiesced, we may be switching io scheduler, or
+ * updating nr_hw_queues, or other things, and we can't run queue
+ * any more, even blk_mq_hctx_has_pending() can't be called safely.
+ *
+ * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+ * quiesced.
+ */
+ __blk_mq_run_dispatch_ops(hctx->queue, false,
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx));
+ return need_run;
+}
+
/**
* blk_mq_run_hw_queue - Start to run a hardware queue.
* @hctx: Pointer to the hardware queue to run.
@@ -2220,20 +2328,23 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);
- /*
- * When queue is quiesced, we may be switching io scheduler, or
- * updating nr_hw_queues, or other things, and we can't run queue
- * any more, even __blk_mq_hctx_has_pending() can't be called safely.
- *
- * And queue will be rerun in blk_mq_unquiesce_queue() if it is
- * quiesced.
- */
- __blk_mq_run_dispatch_ops(hctx->queue, false,
- need_run = !blk_queue_quiesced(hctx->queue) &&
- blk_mq_hctx_has_pending(hctx));
+ need_run = blk_mq_hw_queue_need_run(hctx);
+ if (!need_run) {
+ unsigned long flags;
- if (!need_run)
- return;
+ /*
+ * Synchronize with blk_mq_unquiesce_queue(), because we check
+ * if hw queue is quiesced locklessly above, we need the use
+ * ->queue_lock to make sure we see the up-to-date status to
+ * not miss rerunning the hw queue.
+ */
+ spin_lock_irqsave(&hctx->queue->queue_lock, flags);
+ need_run = blk_mq_hw_queue_need_run(hctx);
+ spin_unlock_irqrestore(&hctx->queue->queue_lock, flags);
+
+ if (!need_run)
+ return;
+ }
if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
blk_mq_delay_run_hw_queue(hctx, 0);
@@ -2390,6 +2501,12 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
return;
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ /*
+ * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the
+ * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch
+ * list in the subsequent routine.
+ */
+ smp_mb__after_atomic();
blk_mq_run_hw_queue(hctx, async);
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
@@ -2542,7 +2659,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
rq->cmd_flags |= REQ_FAILFAST_MASK;
rq->__sector = bio->bi_iter.bi_sector;
- rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);
if (bio_integrity(bio))
rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
@@ -2620,6 +2736,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
return;
}
@@ -2650,6 +2767,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
return BLK_STS_OK;
}
@@ -2666,7 +2784,7 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug)
blk_status_t ret = BLK_STS_OK;
while ((rq = rq_list_pop(&plug->mq_list))) {
- bool last = rq_list_empty(plug->mq_list);
+ bool last = rq_list_empty(&plug->mq_list);
if (hctx != rq->mq_hctx) {
if (hctx) {
@@ -2709,8 +2827,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
{
struct blk_mq_hw_ctx *this_hctx = NULL;
struct blk_mq_ctx *this_ctx = NULL;
- struct request *requeue_list = NULL;
- struct request **requeue_lastp = &requeue_list;
+ struct rq_list requeue_list = {};
unsigned int depth = 0;
bool is_passthrough = false;
LIST_HEAD(list);
@@ -2724,12 +2841,12 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
is_passthrough = blk_rq_is_passthrough(rq);
} else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
is_passthrough != blk_rq_is_passthrough(rq)) {
- rq_list_add_tail(&requeue_lastp, rq);
+ rq_list_add_tail(&requeue_list, rq);
continue;
}
- list_add(&rq->queuelist, &list);
+ list_add_tail(&rq->queuelist, &list);
depth++;
- } while (!rq_list_empty(plug->mq_list));
+ } while (!rq_list_empty(&plug->mq_list));
plug->mq_list = requeue_list;
trace_block_unplug(this_hctx->queue, depth, !from_sched);
@@ -2784,19 +2901,19 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
if (q->mq_ops->queue_rqs) {
blk_mq_run_dispatch_ops(q,
__blk_mq_flush_plug_list(q, plug));
- if (rq_list_empty(plug->mq_list))
+ if (rq_list_empty(&plug->mq_list))
return;
}
blk_mq_run_dispatch_ops(q,
blk_mq_plug_issue_direct(plug));
- if (rq_list_empty(plug->mq_list))
+ if (rq_list_empty(&plug->mq_list))
return;
}
do {
blk_mq_dispatch_plug_list(plug, from_schedule);
- } while (!rq_list_empty(plug->mq_list));
+ } while (!rq_list_empty(&plug->mq_list));
}
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2861,7 +2978,7 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
if (plug) {
data.nr_tags = plug->nr_ios;
plug->nr_ios = 1;
- data.cached_rq = &plug->cached_rq;
+ data.cached_rqs = &plug->cached_rqs;
}
rq = __blk_mq_alloc_requests(&data);
@@ -2884,7 +3001,7 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
if (!plug)
return NULL;
- rq = rq_list_peek(&plug->cached_rq);
+ rq = rq_list_peek(&plug->cached_rqs);
if (!rq || rq->q != q)
return NULL;
if (type != rq->mq_hctx->type &&
@@ -2898,17 +3015,17 @@ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio)
{
- WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
+ if (rq_list_pop(&plug->cached_rqs) != rq)
+ WARN_ON_ONCE(1);
/*
* If any qos ->throttle() end up blocking, we will have flushed the
* plug and hence killed the cached_rq list as well. Pop this entry
* before we throttle.
*/
- plug->cached_rq = rq_list_next(rq);
rq_qos_throttle(rq->q, bio);
- blk_mq_rq_time_init(rq, 0);
+ blk_mq_rq_time_init(rq, blk_time_get_ns());
rq->cmd_flags = bio->bi_opf;
INIT_LIST_HEAD(&rq->queuelist);
}
@@ -3187,8 +3304,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
rq->special_vec = rq_src->special_vec;
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
- rq->ioprio = rq_src->ioprio;
- rq->write_hint = rq_src->write_hint;
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
goto free_and_out;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3bd43b1..89a20ff 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -155,7 +155,7 @@ struct blk_mq_alloc_data {
/* allocate multiple requests/tags in one go */
unsigned int nr_tags;
- struct request **cached_rq;
+ struct rq_list *cached_rqs;
/* input & output parameter */
struct blk_mq_ctx *ctx;
@@ -230,6 +230,19 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
{
+ /* Fast path: hardware queue is not stopped most of the time. */
+ if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+ return false;
+
+ /*
+ * This barrier is used to order adding of dispatch list before and
+ * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier
+ * in blk_mq_start_stopped_hw_queue() so that dispatch code could
+ * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not
+ * empty to avoid missing dispatching requests.
+ */
+ smp_mb();
+
return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 058f92c..eb9618c 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -218,7 +218,6 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
return -1;
data->got_token = true;
- smp_wmb();
wake_up_process(data->task);
list_del_init_careful(&curr->entry);
return 1;
@@ -274,10 +273,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
* which means we now have two. Put our local token
* and wake anyone else potentially waiting for one.
*/
- smp_rmb();
if (data.got_token)
cleanup_cb(rqw, private_data);
- break;
+ return;
}
io_schedule();
has_sleeper = true;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index a446654..f1d4dfd 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -50,7 +50,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
- lim->max_zone_append_sectors = UINT_MAX;
+ lim->max_hw_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -91,17 +91,16 @@ static int blk_validate_zoned_limits(struct queue_limits *lim)
if (lim->zone_write_granularity < lim->logical_block_size)
lim->zone_write_granularity = lim->logical_block_size;
- if (lim->max_zone_append_sectors) {
- /*
- * The Zone Append size is limited by the maximum I/O size
- * and the zone size given that it can't span zones.
- */
- lim->max_zone_append_sectors =
- min3(lim->max_hw_sectors,
- lim->max_zone_append_sectors,
- lim->chunk_sectors);
- }
-
+ /*
+ * The Zone Append size is limited by the maximum I/O size and the zone
+ * size given that it can't span zones.
+ *
+ * If no max_hw_zone_append_sectors limit is provided, the block layer
+ * will emulated it, else we're also bound by the hardware limit.
+ */
+ lim->max_zone_append_sectors =
+ min_not_zero(lim->max_hw_zone_append_sectors,
+ min(lim->chunk_sectors, lim->max_hw_sectors));
return 0;
}
@@ -223,7 +222,7 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim)
* Check that the limits in lim are valid, initialize defaults for unset
* values, and cap values based on others where needed.
*/
-static int blk_validate_limits(struct queue_limits *lim)
+int blk_validate_limits(struct queue_limits *lim)
{
unsigned int max_hw_sectors;
unsigned int logical_block_sectors;
@@ -366,6 +365,7 @@ static int blk_validate_limits(struct queue_limits *lim)
return err;
return blk_validate_zoned_limits(lim);
}
+EXPORT_SYMBOL_GPL(blk_validate_limits);
/*
* Set the default limits for a newly allocated queue. @lim contains the
@@ -508,10 +508,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->features |= (b->features & BLK_FEAT_INHERIT_MASK);
/*
- * BLK_FEAT_NOWAIT and BLK_FEAT_POLL need to be supported both by the
- * stacking driver and all underlying devices. The stacking driver sets
- * the flags before stacking the limits, and this will clear the flags
- * if any of the underlying devices does not support it.
+ * Some feaures need to be supported both by the stacking driver and all
+ * underlying devices. The stacking driver sets these flags before
+ * stacking the limits, and this will clear the flags if any of the
+ * underlying devices does not support it.
*/
if (!(b->features & BLK_FEAT_NOWAIT))
t->features &= ~BLK_FEAT_NOWAIT;
@@ -527,8 +527,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
- t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
- queue_limits_max_zone_append_sectors(b));
+ t->max_hw_zone_append_sectors = min(t->max_hw_zone_append_sectors,
+ b->max_hw_zone_append_sectors);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
@@ -661,7 +661,7 @@ EXPORT_SYMBOL(blk_stack_limits);
void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
sector_t offset, const char *pfx)
{
- if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits,
+ if (blk_stack_limits(t, bdev_limits(bdev),
get_start_sect(bdev) + offset))
pr_notice("%s: Warning: Device %pg is misaligned\n",
pfx, bdev);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e85941b..d80a202 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -23,14 +23,14 @@
struct queue_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct gendisk *disk, char *page);
- int (*load_module)(struct gendisk *disk, const char *page, size_t count);
ssize_t (*store)(struct gendisk *disk, const char *page, size_t count);
+ void (*load_module)(struct gendisk *disk, const char *page, size_t count);
};
static ssize_t
queue_var_show(unsigned long var, char *page)
{
- return sprintf(page, "%lu\n", var);
+ return sysfs_emit(page, "%lu\n", var);
}
static ssize_t
@@ -121,7 +121,7 @@ QUEUE_SYSFS_LIMIT_SHOW(atomic_write_unit_max)
#define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(_field) \
static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \
{ \
- return sprintf(page, "%llu\n", \
+ return sysfs_emit(page, "%llu\n", \
(unsigned long long)disk->queue->limits._field << \
SECTOR_SHIFT); \
}
@@ -131,6 +131,7 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_zone_append_sectors)
#define QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(_field) \
static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \
@@ -144,7 +145,7 @@ QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_KB(max_hw_sectors)
#define QUEUE_SYSFS_SHOW_CONST(_name, _val) \
static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
{ \
- return sprintf(page, "%d\n", _val); \
+ return sysfs_emit(page, "%d\n", _val); \
}
/* deprecated fields */
@@ -178,18 +179,6 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk,
return ret;
}
-/*
- * For zone append queue_max_zone_append_sectors does not just return the
- * underlying queue limits, but actually contains a calculation. Because of
- * that we can't simply use QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES here.
- */
-static ssize_t queue_zone_append_max_show(struct gendisk *disk, char *page)
-{
- return sprintf(page, "%llu\n",
- (u64)queue_max_zone_append_sectors(disk->queue) <<
- SECTOR_SHIFT);
-}
-
static ssize_t
queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count)
{
@@ -235,7 +224,7 @@ static ssize_t queue_feature_store(struct gendisk *disk, const char *page,
#define QUEUE_SYSFS_FEATURE(_name, _feature) \
static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
{ \
- return sprintf(page, "%u\n", \
+ return sysfs_emit(page, "%u\n", \
!!(disk->queue->limits.features & _feature)); \
} \
static ssize_t queue_##_name##_store(struct gendisk *disk, \
@@ -252,7 +241,7 @@ QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES);
#define QUEUE_SYSFS_FEATURE_SHOW(_name, _feature) \
static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \
{ \
- return sprintf(page, "%u\n", \
+ return sysfs_emit(page, "%u\n", \
!!(disk->queue->limits.features & _feature)); \
}
@@ -263,8 +252,8 @@ QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX);
static ssize_t queue_zoned_show(struct gendisk *disk, char *page)
{
if (blk_queue_is_zoned(disk->queue))
- return sprintf(page, "host-managed\n");
- return sprintf(page, "none\n");
+ return sysfs_emit(page, "host-managed\n");
+ return sysfs_emit(page, "none\n");
}
static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page)
@@ -272,6 +261,34 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page)
return queue_var_show(disk_nr_zones(disk), page);
}
+static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
+{
+ return queue_var_show(blk_queue_passthrough_stat(disk->queue), page);
+}
+
+static ssize_t queue_iostats_passthrough_store(struct gendisk *disk,
+ const char *page, size_t count)
+{
+ struct queue_limits lim;
+ unsigned long ios;
+ ssize_t ret;
+
+ ret = queue_var_store(&ios, page, count);
+ if (ret < 0)
+ return ret;
+
+ lim = queue_limits_start_update(disk->queue);
+ if (ios)
+ lim.flags |= BLK_FLAG_IOSTATS_PASSTHROUGH;
+ else
+ lim.flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH;
+
+ ret = queue_limits_commit_update(disk->queue, &lim);
+ if (ret)
+ return ret;
+
+ return count;
+}
static ssize_t queue_nomerges_show(struct gendisk *disk, char *page)
{
return queue_var_show((blk_queue_nomerges(disk->queue) << 1) |
@@ -349,7 +366,7 @@ static ssize_t queue_poll_store(struct gendisk *disk, const char *page,
static ssize_t queue_io_timeout_show(struct gendisk *disk, char *page)
{
- return sprintf(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout));
+ return sysfs_emit(page, "%u\n", jiffies_to_msecs(disk->queue->rq_timeout));
}
static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
@@ -370,8 +387,8 @@ static ssize_t queue_io_timeout_store(struct gendisk *disk, const char *page,
static ssize_t queue_wc_show(struct gendisk *disk, char *page)
{
if (blk_queue_write_cache(disk->queue))
- return sprintf(page, "write back\n");
- return sprintf(page, "write through\n");
+ return sysfs_emit(page, "write back\n");
+ return sysfs_emit(page, "write through\n");
}
static ssize_t queue_wc_store(struct gendisk *disk, const char *page,
@@ -451,7 +468,7 @@ QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes");
QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
QUEUE_RO_ENTRY(queue_max_write_zeroes_sectors, "write_zeroes_max_bytes");
-QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
QUEUE_RO_ENTRY(queue_zoned, "zoned");
@@ -460,6 +477,7 @@ QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
+QUEUE_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough");
QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
QUEUE_RW_ENTRY(queue_poll, "io_poll");
QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
@@ -501,9 +519,9 @@ static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
return -EINVAL;
if (wbt_disabled(disk->queue))
- return sprintf(page, "0\n");
+ return sysfs_emit(page, "0\n");
- return sprintf(page, "%llu\n",
+ return sysfs_emit(page, "%llu\n",
div_u64(wbt_get_min_lat(disk->queue), 1000));
}
@@ -578,7 +596,7 @@ static struct attribute *queue_attrs[] = {
&queue_atomic_write_unit_max_entry.attr,
&queue_write_same_max_entry.attr,
&queue_max_write_zeroes_sectors_entry.attr,
- &queue_zone_append_max_entry.attr,
+ &queue_max_zone_append_sectors_entry.attr,
&queue_zone_write_granularity_entry.attr,
&queue_rotational_entry.attr,
&queue_zoned_entry.attr,
@@ -586,6 +604,7 @@ static struct attribute *queue_attrs[] = {
&queue_max_open_zones_entry.attr,
&queue_max_active_zones_entry.attr,
&queue_nomerges_entry.attr,
+ &queue_iostats_passthrough_entry.attr,
&queue_iostats_entry.attr,
&queue_stable_writes_entry.attr,
&queue_add_random_entry.attr,
@@ -684,11 +703,8 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
* queue to ensure that the module file can be read when the request
* queue is the one for the device storing the module file.
*/
- if (entry->load_module) {
- res = entry->load_module(disk, page, length);
- if (res)
- return res;
- }
+ if (entry->load_module)
+ entry->load_module(disk, page, length);
blk_mq_freeze_queue(q);
mutex_lock(&q->sysfs_lock);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 2c4192e..82dbaef 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1485,13 +1485,13 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
goto out_finish;
ret = -EINVAL;
- if (!strcmp(tok, "rbps") && val > 1)
+ if (!strcmp(tok, "rbps"))
v[0] = val;
- else if (!strcmp(tok, "wbps") && val > 1)
+ else if (!strcmp(tok, "wbps"))
v[1] = val;
- else if (!strcmp(tok, "riops") && val > 1)
+ else if (!strcmp(tok, "riops"))
v[2] = min_t(u64, val, UINT_MAX);
- else if (!strcmp(tok, "wiops") && val > 1)
+ else if (!strcmp(tok, "wiops"))
v[3] = min_t(u64, val, UINT_MAX);
else
goto out_finish;
@@ -1526,6 +1526,42 @@ static void throtl_shutdown_wq(struct request_queue *q)
cancel_work_sync(&td->dispatch_work);
}
+static void tg_flush_bios(struct throtl_grp *tg)
+{
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ if (tg->flags & THROTL_TG_CANCELING)
+ return;
+ /*
+ * Set the flag to make sure throtl_pending_timer_fn() won't
+ * stop until all throttled bios are dispatched.
+ */
+ tg->flags |= THROTL_TG_CANCELING;
+
+ /*
+ * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
+ * will be inserted to service queue without THROTL_TG_PENDING
+ * set in tg_update_disptime below. Then IO dispatched from
+ * child in tg_dispatch_one_bio will trigger double insertion
+ * and corrupt the tree.
+ */
+ if (!(tg->flags & THROTL_TG_PENDING))
+ return;
+
+ /*
+ * Update disptime after setting the above flag to make sure
+ * throtl_select_dispatch() won't exit without dispatching.
+ */
+ tg_update_disptime(tg);
+
+ throtl_schedule_pending_timer(sq, jiffies + 1);
+}
+
+static void throtl_pd_offline(struct blkg_policy_data *pd)
+{
+ tg_flush_bios(pd_to_tg(pd));
+}
+
struct blkcg_policy blkcg_policy_throtl = {
.dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files,
@@ -1533,6 +1569,7 @@ struct blkcg_policy blkcg_policy_throtl = {
.pd_alloc_fn = throtl_pd_alloc,
.pd_init_fn = throtl_pd_init,
.pd_online_fn = throtl_pd_online,
+ .pd_offline_fn = throtl_pd_offline,
.pd_free_fn = throtl_pd_free,
};
@@ -1553,32 +1590,15 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
*/
rcu_read_lock();
blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
- struct throtl_grp *tg = blkg_to_tg(blkg);
- struct throtl_service_queue *sq = &tg->service_queue;
-
/*
- * Set the flag to make sure throtl_pending_timer_fn() won't
- * stop until all throttled bios are dispatched.
+ * disk_release will call pd_offline_fn to cancel bios.
+ * However, disk_release can't be called if someone get
+ * the refcount of device and issued bios which are
+ * inflight after del_gendisk.
+ * Cancel bios here to ensure no bios are inflight after
+ * del_gendisk.
*/
- tg->flags |= THROTL_TG_CANCELING;
-
- /*
- * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
- * will be inserted to service queue without THROTL_TG_PENDING
- * set in tg_update_disptime below. Then IO dispatched from
- * child in tg_dispatch_one_bio will trigger double insertion
- * and corrupt the tree.
- */
- if (!(tg->flags & THROTL_TG_PENDING))
- continue;
-
- /*
- * Update disptime after setting the above flag to make sure
- * throtl_select_dispatch() won't exit without dispatching.
- */
- tg_update_disptime(tg);
-
- throtl_schedule_pending_timer(sq, jiffies + 1);
+ tg_flush_bios(blkg_to_tg(blkg));
}
rcu_read_unlock();
spin_unlock_irq(&q->queue_lock);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index af19296..7021175 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -18,7 +18,7 @@
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
-#include <linux/atomic.h>
+#include <linux/refcount.h>
#include <linux/mempool.h>
#include "blk.h"
@@ -64,7 +64,7 @@ static const char *const zone_cond_name[] = {
struct blk_zone_wplug {
struct hlist_node node;
struct list_head link;
- atomic_t ref;
+ refcount_t ref;
spinlock_t lock;
unsigned int flags;
unsigned int zone_no;
@@ -348,13 +348,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
return ret;
}
-static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
-{
- if (!disk->conv_zones_bitmap)
- return false;
- return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
-}
-
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
return zone->start + zone->len >= get_capacity(disk);
@@ -411,7 +404,7 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
if (zwplug->zone_no == zno &&
- atomic_inc_not_zero(&zwplug->ref)) {
+ refcount_inc_not_zero(&zwplug->ref)) {
rcu_read_unlock();
return zwplug;
}
@@ -432,7 +425,7 @@ static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
- if (atomic_dec_and_test(&zwplug->ref)) {
+ if (refcount_dec_and_test(&zwplug->ref)) {
WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
WARN_ON_ONCE(!list_empty(&zwplug->link));
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
@@ -463,7 +456,7 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
* taken when the plug was allocated and another reference taken by the
* caller context).
*/
- if (atomic_read(&zwplug->ref) > 2)
+ if (refcount_read(&zwplug->ref) > 2)
return false;
/* We can remove zone write plugs for zones that are empty or full. */
@@ -533,7 +526,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
INIT_HLIST_NODE(&zwplug->node);
INIT_LIST_HEAD(&zwplug->link);
- atomic_set(&zwplug->ref, 2);
+ refcount_set(&zwplug->ref, 2);
spin_lock_init(&zwplug->lock);
zwplug->flags = 0;
zwplug->zone_no = zno;
@@ -624,7 +617,7 @@ static inline void disk_zone_wplug_set_error(struct gendisk *disk,
* finished.
*/
zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
- atomic_inc(&zwplug->ref);
+ refcount_inc(&zwplug->ref);
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
@@ -709,7 +702,7 @@ static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
struct blk_zone_wplug *zwplug;
/* Conventional zones cannot be reset nor finished. */
- if (disk_zone_is_conv(disk, sector)) {
+ if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
bio_io_error(bio);
return true;
}
@@ -963,7 +956,7 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
}
/* Conventional zones do not need write plugging. */
- if (disk_zone_is_conv(disk, sector)) {
+ if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
/* Zone append to conventional zones is not allowed. */
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
bio_io_error(bio);
@@ -1099,7 +1092,7 @@ static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
* reference we take here.
*/
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
- atomic_inc(&zwplug->ref);
+ refcount_inc(&zwplug->ref);
queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}
@@ -1444,7 +1437,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
struct blk_zone_wplug, node);
- atomic_inc(&zwplug->ref);
+ refcount_inc(&zwplug->ref);
disk_remove_zone_wplug(disk, zwplug);
disk_put_zone_wplug(zwplug);
}
@@ -1455,6 +1448,24 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
disk->zone_wplugs_hash_bits = 0;
}
+static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
+ unsigned long *bitmap)
+{
+ unsigned int nr_conv_zones = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+ if (bitmap)
+ nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
+ bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
+ lockdep_is_held(&disk->zone_wplugs_lock));
+ spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+ kfree_rcu_mightsleep(bitmap);
+
+ return nr_conv_zones;
+}
+
void disk_free_zone_resources(struct gendisk *disk)
{
if (!disk->zone_wplugs_pool)
@@ -1478,8 +1489,7 @@ void disk_free_zone_resources(struct gendisk *disk)
mempool_destroy(disk->zone_wplugs_pool);
disk->zone_wplugs_pool = NULL;
- bitmap_free(disk->conv_zones_bitmap);
- disk->conv_zones_bitmap = NULL;
+ disk_set_conv_zones_bitmap(disk, NULL);
disk->zone_capacity = 0;
disk->last_zone_capacity = 0;
disk->nr_zones = 0;
@@ -1538,17 +1548,15 @@ static int disk_update_zone_resources(struct gendisk *disk,
struct blk_revalidate_zone_args *args)
{
struct request_queue *q = disk->queue;
- unsigned int nr_seq_zones, nr_conv_zones = 0;
+ unsigned int nr_seq_zones, nr_conv_zones;
unsigned int pool_size;
struct queue_limits lim;
disk->nr_zones = args->nr_zones;
disk->zone_capacity = args->zone_capacity;
disk->last_zone_capacity = args->last_zone_capacity;
- swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
- if (disk->conv_zones_bitmap)
- nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
- disk->nr_zones);
+ nr_conv_zones =
+ disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
if (nr_conv_zones >= disk->nr_zones) {
pr_warn("%s: Invalid number of conventional zones %u / %u\n",
disk->disk_name, nr_conv_zones, disk->nr_zones);
@@ -1774,12 +1782,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
return -ENODEV;
}
- if (!queue_max_zone_append_sectors(q)) {
- pr_warn("%s: Invalid 0 maximum zone append limit\n",
- disk->disk_name);
- return -ENODEV;
- }
-
/*
* Ensure that all memory allocations in this context are done as if
* GFP_NOIO was specified.
@@ -1823,8 +1825,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
disk_free_zone_resources(disk);
blk_mq_unfreeze_queue(q);
- kfree(args.conv_zones_bitmap);
-
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
@@ -1851,7 +1851,7 @@ int queue_zone_wplugs_show(void *data, struct seq_file *m)
spin_lock_irqsave(&zwplug->lock, flags);
zwp_zone_no = zwplug->zone_no;
zwp_flags = zwplug->flags;
- zwp_ref = atomic_read(&zwplug->ref);
+ zwp_ref = refcount_read(&zwplug->ref);
zwp_wp_offset = zwplug->wp_offset;
zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
spin_unlock_irqrestore(&zwplug->lock, flags);
diff --git a/block/blk.h b/block/blk.h
index c718e42..2c26abf 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -4,6 +4,7 @@
#include <linux/bio-integrity.h>
#include <linux/blk-crypto.h>
+#include <linux/lockdep.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <linux/sched/sysctl.h>
#include <linux/timekeeping.h>
@@ -34,9 +35,10 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
gfp_t flags);
void blk_free_flush_queue(struct blk_flush_queue *q);
-void blk_freeze_queue(struct request_queue *q);
-void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
-void blk_queue_start_drain(struct request_queue *q);
+bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
+bool blk_queue_start_drain(struct request_queue *q);
+bool __blk_freeze_queue_start(struct request_queue *q,
+ struct task_struct *owner);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
void submit_bio_noacct_nocheck(struct bio *bio);
void bio_await_chain(struct bio *bio);
@@ -69,8 +71,11 @@ static inline int bio_queue_enter(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- if (blk_try_enter_queue(q, false))
+ if (blk_try_enter_queue(q, false)) {
+ rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
return 0;
+ }
return __bio_queue_enter(q, bio);
}
@@ -405,17 +410,6 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi,
struct queue_limits *lim);
int blk_dev_init(void);
-/*
- * Contribute to IO statistics IFF:
- *
- * a) it's attached to a gendisk, and
- * b) the queue had IO stats enabled when this request was started
- */
-static inline bool blk_do_io_stat(struct request *rq)
-{
- return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq);
-}
-
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
unsigned int part_in_flight(struct block_device *part);
@@ -463,11 +457,6 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
{
return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
}
-static inline bool bio_is_zone_append(struct bio *bio)
-{
- return bio_op(bio) == REQ_OP_ZONE_APPEND ||
- bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
-}
void blk_zone_write_plug_bio_merged(struct bio *bio);
void blk_zone_write_plug_init_request(struct request *rq);
static inline void blk_zone_update_request_bio(struct request *rq,
@@ -516,10 +505,6 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
{
return false;
}
-static inline bool bio_is_zone_append(struct bio *bio)
-{
- return false;
-}
static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
{
}
@@ -558,6 +543,7 @@ void blk_free_ext_minor(unsigned int minor);
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
+#define ADDPART_FLAG_READONLY 4
int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length);
int bdev_del_partition(struct gendisk *disk, int partno);
@@ -734,4 +720,22 @@ void blk_integrity_verify(struct bio *bio);
void blk_integrity_prepare(struct request *rq);
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes);
+static inline void blk_freeze_acquire_lock(struct request_queue *q, bool
+ disk_dead, bool queue_dying)
+{
+ if (!disk_dead)
+ rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_);
+ if (!queue_dying)
+ rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_);
+}
+
+static inline void blk_unfreeze_release_lock(struct request_queue *q, bool
+ disk_dead, bool queue_dying)
+{
+ if (!queue_dying)
+ rwsem_release(&q->q_lockdep_map, _RET_IP_);
+ if (!disk_dead)
+ rwsem_release(&q->io_lockdep_map, _RET_IP_);
+}
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/elevator.c b/block/elevator.c
index 9430cde..7c3ba80 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -598,13 +598,19 @@ void elevator_init_mq(struct request_queue *q)
* drain any dispatch activities originated from passthrough
* requests, then no need to quiesce queue which may add long boot
* latency, especially when lots of disks are involved.
+ *
+ * Disk isn't added yet, so verifying queue lock only manually.
*/
- blk_mq_freeze_queue(q);
+ blk_freeze_queue_start_non_owner(q);
+ blk_freeze_acquire_lock(q, true, false);
+ blk_mq_freeze_queue_wait(q);
+
blk_mq_cancel_work_sync(q);
err = blk_mq_init_sched(q, e);
- blk_mq_unfreeze_queue(q);
+ blk_unfreeze_release_lock(q, true, false);
+ blk_mq_unfreeze_queue_non_owner(q);
if (err) {
pr_warn("\"%s\" elevator initialization failed, "
@@ -704,15 +710,15 @@ static int elevator_change(struct request_queue *q, const char *elevator_name)
return ret;
}
-int elv_iosched_load_module(struct gendisk *disk, const char *buf,
- size_t count)
+void elv_iosched_load_module(struct gendisk *disk, const char *buf,
+ size_t count)
{
char elevator_name[ELV_NAME_MAX];
struct elevator_type *found;
const char *name;
if (!elv_support_iosched(disk->queue))
- return -EOPNOTSUPP;
+ return;
strscpy(elevator_name, buf, sizeof(elevator_name));
name = strstrip(elevator_name);
@@ -723,8 +729,6 @@ int elv_iosched_load_module(struct gendisk *disk, const char *buf,
if (!found)
request_module("%s-iosched", name);
-
- return 0;
}
ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
diff --git a/block/elevator.h b/block/elevator.h
index 2a78544..dbf357e 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -148,8 +148,8 @@ extern void elv_unregister(struct elevator_type *);
* io scheduler sysfs switching
*/
ssize_t elv_iosched_show(struct gendisk *disk, char *page);
-int elv_iosched_load_module(struct gendisk *disk, const char *page,
- size_t count);
+void elv_iosched_load_module(struct gendisk *disk, const char *page,
+ size_t count);
ssize_t elv_iosched_store(struct gendisk *disk, const char *page, size_t count);
extern bool elv_bio_merge_ok(struct request *, struct bio *);
diff --git a/block/fops.c b/block/fops.c
index e696ae5..2d01c90 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -35,13 +35,10 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
return opf;
}
-static bool blkdev_dio_invalid(struct block_device *bdev, loff_t pos,
- struct iov_iter *iter, bool is_atomic)
+static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb,
+ struct iov_iter *iter)
{
- if (is_atomic && !generic_atomic_write_valid(iter, pos))
- return true;
-
- return pos & (bdev_logical_block_size(bdev) - 1) ||
+ return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) ||
!bdev_iter_is_aligned(bdev, iter);
}
@@ -368,13 +365,12 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
- bool is_atomic = iocb->ki_flags & IOCB_ATOMIC;
unsigned int nr_pages;
if (!iov_iter_count(iter))
return 0;
- if (blkdev_dio_invalid(bdev, iocb->ki_pos, iter, is_atomic))
+ if (blkdev_dio_invalid(bdev, iocb, iter))
return -EINVAL;
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
@@ -383,7 +379,7 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return __blkdev_direct_IO_simple(iocb, iter, bdev,
nr_pages);
return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
- } else if (is_atomic) {
+ } else if (iocb->ki_flags & IOCB_ATOMIC) {
return -EINVAL;
}
return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
@@ -625,7 +621,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (!bdev)
return -ENXIO;
- if (bdev_can_atomic_write(bdev) && filp->f_flags & O_DIRECT)
+ if (bdev_can_atomic_write(bdev))
filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
@@ -700,6 +696,12 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
return -EOPNOTSUPP;
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
size -= iocb->ki_pos;
if (iov_iter_count(from) > size) {
shorted = iov_iter_count(from) - size;
diff --git a/block/genhd.c b/block/genhd.c
index 1c05dd4..9130e16 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -383,16 +383,18 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
}
/**
- * device_add_disk - add disk information to kernel list
+ * add_disk_fwnode - add disk information to kernel list with fwnode
* @parent: parent device for the disk
* @disk: per-device partitioning information
* @groups: Additional per-device sysfs groups
+ * @fwnode: attached disk fwnode
*
* This function registers the partitioning information in @disk
- * with the kernel.
+ * with the kernel. Also attach a fwnode to the disk device.
*/
-int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups)
+int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups,
+ struct fwnode_handle *fwnode)
{
struct device *ddev = disk_to_dev(disk);
@@ -452,6 +454,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
ddev->parent = parent;
ddev->groups = groups;
dev_set_name(ddev, "%s", disk->disk_name);
+ if (fwnode)
+ device_set_node(ddev, fwnode);
if (!(disk->flags & GENHD_FL_HIDDEN))
ddev->devt = MKDEV(disk->major, disk->first_minor);
ret = device_add(ddev);
@@ -553,6 +557,22 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
elevator_exit(disk->queue);
return ret;
}
+EXPORT_SYMBOL_GPL(add_disk_fwnode);
+
+/**
+ * device_add_disk - add disk information to kernel list
+ * @parent: parent device for the disk
+ * @disk: per-device partitioning information
+ * @groups: Additional per-device sysfs groups
+ *
+ * This function registers the partitioning information in @disk
+ * with the kernel.
+ */
+int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups)
+{
+ return add_disk_fwnode(parent, disk, groups, NULL);
+}
EXPORT_SYMBOL(device_add_disk);
static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
@@ -581,13 +601,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
rcu_read_unlock();
}
-static void __blk_mark_disk_dead(struct gendisk *disk)
+static bool __blk_mark_disk_dead(struct gendisk *disk)
{
/*
* Fail any new I/O.
*/
if (test_and_set_bit(GD_DEAD, &disk->state))
- return;
+ return false;
if (test_bit(GD_OWNS_QUEUE, &disk->state))
blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue);
@@ -600,7 +620,7 @@ static void __blk_mark_disk_dead(struct gendisk *disk)
/*
* Prevent new I/O from crossing bio_queue_enter().
*/
- blk_queue_start_drain(disk->queue);
+ return blk_queue_start_drain(disk->queue);
}
/**
@@ -641,6 +661,7 @@ void del_gendisk(struct gendisk *disk)
struct request_queue *q = disk->queue;
struct block_device *part;
unsigned long idx;
+ bool start_drain, queue_dying;
might_sleep();
@@ -668,7 +689,10 @@ void del_gendisk(struct gendisk *disk)
* Drop all partitions now that the disk is marked dead.
*/
mutex_lock(&disk->open_mutex);
- __blk_mark_disk_dead(disk);
+ start_drain = __blk_mark_disk_dead(disk);
+ queue_dying = blk_queue_dying(q);
+ if (start_drain)
+ blk_freeze_acquire_lock(q, true, queue_dying);
xa_for_each_start(&disk->part_tbl, idx, part, 1)
drop_partition(part);
mutex_unlock(&disk->open_mutex);
@@ -725,6 +749,9 @@ void del_gendisk(struct gendisk *disk)
if (queue_is_mq(q))
blk_mq_exit_queue(q);
}
+
+ if (start_drain)
+ blk_unfreeze_release_lock(q, true, queue_dying);
}
EXPORT_SYMBOL(del_gendisk);
@@ -756,7 +783,7 @@ static ssize_t disk_badblocks_show(struct device *dev,
struct gendisk *disk = dev_to_disk(dev);
if (!disk->bb)
- return sprintf(page, "\n");
+ return sysfs_emit(page, "\n");
return badblocks_show(disk->bb, page, 0);
}
@@ -904,7 +931,7 @@ static ssize_t disk_range_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", disk->minors);
+ return sysfs_emit(buf, "%d\n", disk->minors);
}
static ssize_t disk_ext_range_show(struct device *dev,
@@ -912,7 +939,7 @@ static ssize_t disk_ext_range_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n",
+ return sysfs_emit(buf, "%d\n",
(disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
}
@@ -921,7 +948,7 @@ static ssize_t disk_removable_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n",
+ return sysfs_emit(buf, "%d\n",
(disk->flags & GENHD_FL_REMOVABLE ? 1 : 0));
}
@@ -930,7 +957,7 @@ static ssize_t disk_hidden_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n",
+ return sysfs_emit(buf, "%d\n",
(disk->flags & GENHD_FL_HIDDEN ? 1 : 0));
}
@@ -939,13 +966,13 @@ static ssize_t disk_ro_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
+ return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 1 : 0);
}
ssize_t part_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
+ return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
}
ssize_t part_stat_show(struct device *dev,
@@ -962,7 +989,7 @@ ssize_t part_stat_show(struct device *dev,
part_stat_unlock();
}
part_stat_read_all(bdev, &stat);
- return sprintf(buf,
+ return sysfs_emit(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
"%8u %8u %8u "
@@ -1004,14 +1031,14 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
else
part_in_flight_rw(bdev, inflight);
- return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
+ return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]);
}
static ssize_t disk_capability_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
dev_warn_once(dev, "the capability attribute has been deprecated.\n");
- return sprintf(buf, "0\n");
+ return sysfs_emit(buf, "0\n");
}
static ssize_t disk_alignment_offset_show(struct device *dev,
@@ -1020,7 +1047,7 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0));
+ return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0));
}
static ssize_t disk_discard_alignment_show(struct device *dev,
@@ -1029,7 +1056,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0));
+ return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0));
}
static ssize_t diskseq_show(struct device *dev,
@@ -1037,13 +1064,13 @@ static ssize_t diskseq_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%llu\n", disk->diskseq);
+ return sysfs_emit(buf, "%llu\n", disk->diskseq);
}
static ssize_t partscan_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
+ return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
}
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
@@ -1065,7 +1092,7 @@ static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n",
+ return sysfs_emit(buf, "%d\n",
bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL));
}
@@ -1264,40 +1291,35 @@ static int diskstats_show(struct seq_file *seqf, void *v)
part_stat_unlock();
}
part_stat_read_all(hd, &stat);
- seq_printf(seqf, "%4d %7d %pg "
- "%lu %lu %lu %u "
- "%lu %lu %lu %u "
- "%u %u %u "
- "%lu %lu %lu %u "
- "%lu %u"
- "\n",
- MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
- stat.ios[STAT_READ],
- stat.merges[STAT_READ],
- stat.sectors[STAT_READ],
- (unsigned int)div_u64(stat.nsecs[STAT_READ],
- NSEC_PER_MSEC),
- stat.ios[STAT_WRITE],
- stat.merges[STAT_WRITE],
- stat.sectors[STAT_WRITE],
- (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
- NSEC_PER_MSEC),
- inflight,
- jiffies_to_msecs(stat.io_ticks),
- (unsigned int)div_u64(stat.nsecs[STAT_READ] +
- stat.nsecs[STAT_WRITE] +
- stat.nsecs[STAT_DISCARD] +
- stat.nsecs[STAT_FLUSH],
- NSEC_PER_MSEC),
- stat.ios[STAT_DISCARD],
- stat.merges[STAT_DISCARD],
- stat.sectors[STAT_DISCARD],
- (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
- NSEC_PER_MSEC),
- stat.ios[STAT_FLUSH],
- (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
- NSEC_PER_MSEC)
- );
+ seq_put_decimal_ull_width(seqf, "", MAJOR(hd->bd_dev), 4);
+ seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7);
+ seq_printf(seqf, " %pg", hd);
+ seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]);
+ seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]);
+ seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]);
+ seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ],
+ NSEC_PER_MSEC));
+ seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]);
+ seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]);
+ seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]);
+ seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_WRITE],
+ NSEC_PER_MSEC));
+ seq_put_decimal_ull(seqf, " ", inflight);
+ seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks));
+ seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_READ] +
+ stat.nsecs[STAT_WRITE] +
+ stat.nsecs[STAT_DISCARD] +
+ stat.nsecs[STAT_FLUSH],
+ NSEC_PER_MSEC));
+ seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]);
+ seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]);
+ seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]);
+ seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
+ NSEC_PER_MSEC));
+ seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]);
+ seq_put_decimal_ull(seqf, " ", (unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
+ NSEC_PER_MSEC));
+ seq_putc(seqf, '\n');
}
rcu_read_unlock();
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 7aff4eb..ce17e41 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -270,4 +270,13 @@
Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts.
+config OF_PARTITION
+ bool "Device Tree partition support" if PARTITION_ADVANCED
+ depends on OF
+ help
+ Say Y here if you want to enable support for partition table
+ defined in Device Tree. (mainly for eMMC)
+ The format for the device tree node is just like MTD fixed-partition
+ schema.
+
endmenu
diff --git a/block/partitions/Makefile b/block/partitions/Makefile
index a7f05cd..25d4249 100644
--- a/block/partitions/Makefile
+++ b/block/partitions/Makefile
@@ -12,6 +12,7 @@
obj-$(CONFIG_MAC_PARTITION) += mac.o
obj-$(CONFIG_LDM_PARTITION) += ldm.o
obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
+obj-$(CONFIG_OF_PARTITION) += of.o
obj-$(CONFIG_OSF_PARTITION) += osf.o
obj-$(CONFIG_SGI_PARTITION) += sgi.o
obj-$(CONFIG_SUN_PARTITION) += sun.o
diff --git a/block/partitions/check.h b/block/partitions/check.h
index 8d70a88..e5c1c61 100644
--- a/block/partitions/check.h
+++ b/block/partitions/check.h
@@ -62,6 +62,7 @@ int karma_partition(struct parsed_partitions *state);
int ldm_partition(struct parsed_partitions *state);
int mac_partition(struct parsed_partitions *state);
int msdos_partition(struct parsed_partitions *state);
+int of_partition(struct parsed_partitions *state);
int osf_partition(struct parsed_partitions *state);
int sgi_partition(struct parsed_partitions *state);
int sun_partition(struct parsed_partitions *state);
diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c
index 152c85d..da3e719 100644
--- a/block/partitions/cmdline.c
+++ b/block/partitions/cmdline.c
@@ -237,6 +237,9 @@ static int add_part(int slot, struct cmdline_subpart *subpart,
put_partition(state, slot, subpart->from >> 9,
subpart->size >> 9);
+ if (subpart->flags & PF_RDONLY)
+ state->parts[slot].flags |= ADDPART_FLAG_READONLY;
+
info = &state->parts[slot].info;
strscpy(info->volname, subpart->name, sizeof(info->volname));
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 5bd7a60..815ed33 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -43,6 +43,9 @@ static int (*const check_part[])(struct parsed_partitions *) = {
#ifdef CONFIG_CMDLINE_PARTITION
cmdline_partition,
#endif
+#ifdef CONFIG_OF_PARTITION
+ of_partition, /* cmdline have priority to OF */
+#endif
#ifdef CONFIG_EFI_PARTITION
efi_partition, /* this must come before msdos */
#endif
@@ -253,6 +256,8 @@ static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
add_uevent_var(env, "PARTN=%u", bdev_partno(part));
if (part->bd_meta_info && part->bd_meta_info->volname[0])
add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname);
+ if (part->bd_meta_info && part->bd_meta_info->uuid[0])
+ add_uevent_var(env, "PARTUUID=%s", part->bd_meta_info->uuid);
return 0;
}
@@ -373,6 +378,9 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
goto out_del;
}
+ if (flags & ADDPART_FLAG_READONLY)
+ bdev_set_flag(bdev, BD_READ_ONLY);
+
/* everything is up and running, commence */
err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL);
if (err)
diff --git a/block/partitions/of.c b/block/partitions/of.c
new file mode 100644
index 0000000..4e760fd
--- /dev/null
+++ b/block/partitions/of.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/blkdev.h>
+#include <linux/major.h>
+#include <linux/of.h>
+#include <linux/string.h>
+#include "check.h"
+
+static int validate_of_partition(struct device_node *np, int slot)
+{
+ u64 offset, size;
+ int len;
+
+ const __be32 *reg = of_get_property(np, "reg", &len);
+ int a_cells = of_n_addr_cells(np);
+ int s_cells = of_n_size_cells(np);
+
+ /* Make sure reg len match the expected addr and size cells */
+ if (len / sizeof(*reg) != a_cells + s_cells)
+ return -EINVAL;
+
+ /* Validate offset conversion from bytes to sectors */
+ offset = of_read_number(reg, a_cells);
+ if (offset % SECTOR_SIZE)
+ return -EINVAL;
+
+ /* Validate size conversion from bytes to sectors */
+ size = of_read_number(reg + a_cells, s_cells);
+ if (!size || size % SECTOR_SIZE)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void add_of_partition(struct parsed_partitions *state, int slot,
+ struct device_node *np)
+{
+ struct partition_meta_info *info;
+ char tmp[sizeof(info->volname) + 4];
+ const char *partname;
+ int len;
+
+ const __be32 *reg = of_get_property(np, "reg", &len);
+ int a_cells = of_n_addr_cells(np);
+ int s_cells = of_n_size_cells(np);
+
+ /* Convert bytes to sector size */
+ u64 offset = of_read_number(reg, a_cells) / SECTOR_SIZE;
+ u64 size = of_read_number(reg + a_cells, s_cells) / SECTOR_SIZE;
+
+ put_partition(state, slot, offset, size);
+
+ if (of_property_read_bool(np, "read-only"))
+ state->parts[slot].flags |= ADDPART_FLAG_READONLY;
+
+ /*
+ * Follow MTD label logic, search for label property,
+ * fallback to node name if not found.
+ */
+ info = &state->parts[slot].info;
+ partname = of_get_property(np, "label", &len);
+ if (!partname)
+ partname = of_get_property(np, "name", &len);
+ strscpy(info->volname, partname, sizeof(info->volname));
+
+ snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
+ strlcat(state->pp_buf, tmp, PAGE_SIZE);
+}
+
+int of_partition(struct parsed_partitions *state)
+{
+ struct device *ddev = disk_to_dev(state->disk);
+ struct device_node *np;
+ int slot;
+
+ struct device_node *partitions_np = of_node_get(ddev->of_node);
+
+ if (!partitions_np ||
+ !of_device_is_compatible(partitions_np, "fixed-partitions"))
+ return 0;
+
+ slot = 1;
+ /* Validate parition offset and size */
+ for_each_child_of_node(partitions_np, np) {
+ if (validate_of_partition(np, slot)) {
+ of_node_put(np);
+ of_node_put(partitions_np);
+
+ return -1;
+ }
+
+ slot++;
+ }
+
+ slot = 1;
+ for_each_child_of_node(partitions_np, np) {
+ if (slot >= state->limit) {
+ of_node_put(np);
+ break;
+ }
+
+ add_of_partition(state, slot, np);
+
+ slot++;
+ }
+
+ strlcat(state->pp_buf, "\n", PAGE_SIZE);
+
+ return 1;
+}
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 598fd3e..5a28f23 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -3037,6 +3037,29 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
return ret;
}
+static int opal_set_new_sid_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
+{
+ int ret;
+ struct opal_key *newkey = &opal_pw->new_user_pw.opal_key;
+ struct opal_key *oldkey = &opal_pw->session.opal_key;
+
+ const struct opal_step pw_steps[] = {
+ { start_SIDASP_opal_session, oldkey },
+ { set_sid_cpin_pin, newkey },
+ { end_opal_session, }
+ };
+
+ if (!dev)
+ return -ENODEV;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev);
+ ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps));
+ mutex_unlock(&dev->dev_lock);
+
+ return ret;
+}
+
static int opal_activate_user(struct opal_dev *dev,
struct opal_session_info *opal_session)
{
@@ -3286,6 +3309,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
case IOC_OPAL_DISCOVERY:
ret = opal_get_discv(dev, p);
break;
+ case IOC_OPAL_SET_SID_PW:
+ ret = opal_set_new_sid_pw(dev, p);
+ break;
default:
break;
diff --git a/crypto/Kconfig b/crypto/Kconfig
index a779cab..6b0bfbc 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -250,6 +250,7 @@
tristate "RSA (Rivest-Shamir-Adleman)"
select CRYPTO_AKCIPHER
select CRYPTO_MANAGER
+ select CRYPTO_SIG
select MPILIB
select ASN1
help
@@ -290,19 +291,19 @@
config CRYPTO_ECDSA
tristate "ECDSA (Elliptic Curve Digital Signature Algorithm)"
select CRYPTO_ECC
- select CRYPTO_AKCIPHER
+ select CRYPTO_SIG
select ASN1
help
ECDSA (Elliptic Curve Digital Signature Algorithm) (FIPS 186,
ISO/IEC 14888-3)
- using curves P-192, P-256, and P-384
+ using curves P-192, P-256, P-384 and P-521
Only signature verification is implemented.
config CRYPTO_ECRDSA
tristate "EC-RDSA (Elliptic Curve Russian Digital Signature Algorithm)"
select CRYPTO_ECC
- select CRYPTO_AKCIPHER
+ select CRYPTO_SIG
select CRYPTO_STREEBOG
select OID_REGISTRY
select ASN1
diff --git a/crypto/Makefile b/crypto/Makefile
index 4c99e5d..77abca7 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -48,11 +48,14 @@
rsa_generic-y += rsa.o
rsa_generic-y += rsa_helper.o
rsa_generic-y += rsa-pkcs1pad.o
+rsa_generic-y += rsassa-pkcs1.o
obj-$(CONFIG_CRYPTO_RSA) += rsa_generic.o
$(obj)/ecdsasignature.asn1.o: $(obj)/ecdsasignature.asn1.c $(obj)/ecdsasignature.asn1.h
-$(obj)/ecdsa.o: $(obj)/ecdsasignature.asn1.h
+$(obj)/ecdsa-x962.o: $(obj)/ecdsasignature.asn1.h
ecdsa_generic-y += ecdsa.o
+ecdsa_generic-y += ecdsa-x962.o
+ecdsa_generic-y += ecdsa-p1363.o
ecdsa_generic-y += ecdsasignature.asn1.o
obj-$(CONFIG_CRYPTO_ECDSA) += ecdsa_generic.o
@@ -152,6 +155,8 @@
obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o
obj-$(CONFIG_CRYPTO_CRC32C) += crc32c_generic.o
obj-$(CONFIG_CRYPTO_CRC32) += crc32_generic.o
+CFLAGS_crc32c_generic.o += -DARCH=$(ARCH)
+CFLAGS_crc32_generic.o += -DARCH=$(ARCH)
obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif_common.o crct10dif_generic.o
obj-$(CONFIG_CRYPTO_CRC64_ROCKSOFT) += crc64_rocksoft_generic.o
obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o authencesn.o
diff --git a/crypto/akcipher.c b/crypto/akcipher.c
index e0ff5f4..72c82d9 100644
--- a/crypto/akcipher.c
+++ b/crypto/akcipher.c
@@ -20,6 +20,19 @@
#define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000e
+struct crypto_akcipher_sync_data {
+ struct crypto_akcipher *tfm;
+ const void *src;
+ void *dst;
+ unsigned int slen;
+ unsigned int dlen;
+
+ struct akcipher_request *req;
+ struct crypto_wait cwait;
+ struct scatterlist sg;
+ u8 *buf;
+};
+
static int __maybe_unused crypto_akcipher_report(
struct sk_buff *skb, struct crypto_alg *alg)
{
@@ -126,10 +139,6 @@ int crypto_register_akcipher(struct akcipher_alg *alg)
{
struct crypto_alg *base = &alg->base;
- if (!alg->sign)
- alg->sign = akcipher_default_op;
- if (!alg->verify)
- alg->verify = akcipher_default_op;
if (!alg->encrypt)
alg->encrypt = akcipher_default_op;
if (!alg->decrypt)
@@ -158,7 +167,7 @@ int akcipher_register_instance(struct crypto_template *tmpl,
}
EXPORT_SYMBOL_GPL(akcipher_register_instance);
-int crypto_akcipher_sync_prep(struct crypto_akcipher_sync_data *data)
+static int crypto_akcipher_sync_prep(struct crypto_akcipher_sync_data *data)
{
unsigned int reqsize = crypto_akcipher_reqsize(data->tfm);
struct akcipher_request *req;
@@ -167,10 +176,7 @@ int crypto_akcipher_sync_prep(struct crypto_akcipher_sync_data *data)
unsigned int len;
u8 *buf;
- if (data->dst)
- mlen = max(data->slen, data->dlen);
- else
- mlen = data->slen + data->dlen;
+ mlen = max(data->slen, data->dlen);
len = sizeof(*req) + reqsize + mlen;
if (len < mlen)
@@ -189,8 +195,7 @@ int crypto_akcipher_sync_prep(struct crypto_akcipher_sync_data *data)
sg = &data->sg;
sg_init_one(sg, buf, mlen);
- akcipher_request_set_crypt(req, sg, data->dst ? sg : NULL,
- data->slen, data->dlen);
+ akcipher_request_set_crypt(req, sg, sg, data->slen, data->dlen);
crypto_init_wait(&data->cwait);
akcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
@@ -198,18 +203,16 @@ int crypto_akcipher_sync_prep(struct crypto_akcipher_sync_data *data)
return 0;
}
-EXPORT_SYMBOL_GPL(crypto_akcipher_sync_prep);
-int crypto_akcipher_sync_post(struct crypto_akcipher_sync_data *data, int err)
+static int crypto_akcipher_sync_post(struct crypto_akcipher_sync_data *data,
+ int err)
{
err = crypto_wait_req(err, &data->cwait);
- if (data->dst)
- memcpy(data->dst, data->buf, data->dlen);
+ memcpy(data->dst, data->buf, data->dlen);
data->dlen = data->req->dst_len;
kfree_sensitive(data->req);
return err;
}
-EXPORT_SYMBOL_GPL(crypto_akcipher_sync_post);
int crypto_akcipher_sync_encrypt(struct crypto_akcipher *tfm,
const void *src, unsigned int slen,
@@ -248,34 +251,5 @@ int crypto_akcipher_sync_decrypt(struct crypto_akcipher *tfm,
}
EXPORT_SYMBOL_GPL(crypto_akcipher_sync_decrypt);
-static void crypto_exit_akcipher_ops_sig(struct crypto_tfm *tfm)
-{
- struct crypto_akcipher **ctx = crypto_tfm_ctx(tfm);
-
- crypto_free_akcipher(*ctx);
-}
-
-int crypto_init_akcipher_ops_sig(struct crypto_tfm *tfm)
-{
- struct crypto_akcipher **ctx = crypto_tfm_ctx(tfm);
- struct crypto_alg *calg = tfm->__crt_alg;
- struct crypto_akcipher *akcipher;
-
- if (!crypto_mod_get(calg))
- return -EAGAIN;
-
- akcipher = crypto_create_tfm(calg, &crypto_akcipher_type);
- if (IS_ERR(akcipher)) {
- crypto_mod_put(calg);
- return PTR_ERR(akcipher);
- }
-
- *ctx = akcipher;
- tfm->exit = crypto_exit_akcipher_ops_sig;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(crypto_init_akcipher_ops_sig);
-
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Generic public key cipher type");
diff --git a/crypto/algapi.c b/crypto/algapi.c
index 004d27e..16f7c7a 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -6,7 +6,6 @@
*/
#include <crypto/algapi.h>
-#include <crypto/internal/simd.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/fips.h>
@@ -23,11 +22,6 @@
static LIST_HEAD(crypto_template_list);
-#ifdef CONFIG_CRYPTO_MANAGER_EXTRA_TESTS
-DEFINE_PER_CPU(bool, crypto_simd_disabled_for_test);
-EXPORT_PER_CPU_SYMBOL_GPL(crypto_simd_disabled_for_test);
-#endif
-
static inline void crypto_check_module_sig(struct module *mod)
{
if (fips_enabled && mod && !module_sig_ok(mod))
diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
index 422940a..bbd07a9 100644
--- a/crypto/asymmetric_keys/public_key.c
+++ b/crypto/asymmetric_keys/public_key.c
@@ -83,13 +83,19 @@ software_key_determine_akcipher(const struct public_key *pkey,
if (strcmp(encoding, "pkcs1") == 0) {
*sig = op == kernel_pkey_sign ||
op == kernel_pkey_verify;
- if (!hash_algo) {
+ if (!*sig) {
+ /*
+ * For encrypt/decrypt, hash_algo is not used
+ * but allowed to be set for historic reasons.
+ */
n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME,
"pkcs1pad(%s)",
pkey->pkey_algo);
} else {
+ if (!hash_algo)
+ hash_algo = "none";
n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME,
- "pkcs1pad(%s,%s)",
+ "pkcs1(%s,%s)",
pkey->pkey_algo, hash_algo);
}
return n >= CRYPTO_MAX_ALG_NAME ? -EINVAL : 0;
@@ -104,7 +110,8 @@ software_key_determine_akcipher(const struct public_key *pkey,
return -EINVAL;
*sig = false;
} else if (strncmp(pkey->pkey_algo, "ecdsa", 5) == 0) {
- if (strcmp(encoding, "x962") != 0)
+ if (strcmp(encoding, "x962") != 0 &&
+ strcmp(encoding, "p1363") != 0)
return -EINVAL;
/*
* ECDSA signatures are taken over a raw hash, so they don't
@@ -124,6 +131,9 @@ software_key_determine_akcipher(const struct public_key *pkey,
strcmp(hash_algo, "sha3-384") != 0 &&
strcmp(hash_algo, "sha3-512") != 0)
return -EINVAL;
+ n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
+ encoding, pkey->pkey_algo);
+ return n >= CRYPTO_MAX_ALG_NAME ? -EINVAL : 0;
} else if (strcmp(pkey->pkey_algo, "ecrdsa") == 0) {
if (strcmp(encoding, "raw") != 0)
return -EINVAL;
@@ -192,7 +202,9 @@ static int software_key_query(const struct kernel_pkey_params *params,
if (ret < 0)
goto error_free_tfm;
- len = crypto_sig_maxsize(sig);
+ len = crypto_sig_keysize(sig);
+ info->max_sig_size = crypto_sig_maxsize(sig);
+ info->max_data_size = crypto_sig_digestsize(sig);
info->supported_ops = KEYCTL_SUPPORTS_VERIFY;
if (pkey->key_is_private)
@@ -218,6 +230,8 @@ static int software_key_query(const struct kernel_pkey_params *params,
goto error_free_tfm;
len = crypto_akcipher_maxsize(tfm);
+ info->max_sig_size = len;
+ info->max_data_size = len;
info->supported_ops = KEYCTL_SUPPORTS_ENCRYPT;
if (pkey->key_is_private)
@@ -225,40 +239,6 @@ static int software_key_query(const struct kernel_pkey_params *params,
}
info->key_size = len * 8;
-
- if (strncmp(pkey->pkey_algo, "ecdsa", 5) == 0) {
- int slen = len;
- /*
- * ECDSA key sizes are much smaller than RSA, and thus could
- * operate on (hashed) inputs that are larger than key size.
- * For example SHA384-hashed input used with secp256r1
- * based keys. Set max_data_size to be at least as large as
- * the largest supported hash size (SHA512)
- */
- info->max_data_size = 64;
-
- /*
- * Verify takes ECDSA-Sig (described in RFC 5480) as input,
- * which is actually 2 'key_size'-bit integers encoded in
- * ASN.1. Account for the ASN.1 encoding overhead here.
- *
- * NIST P192/256/384 may prepend a '0' to a coordinate to
- * indicate a positive integer. NIST P521 never needs it.
- */
- if (strcmp(pkey->pkey_algo, "ecdsa-nist-p521") != 0)
- slen += 1;
- /* Length of encoding the x & y coordinates */
- slen = 2 * (slen + 2);
- /*
- * If coordinate encoding takes at least 128 bytes then an
- * additional byte for length encoding is needed.
- */
- info->max_sig_size = 1 + (slen >= 128) + 1 + slen;
- } else {
- info->max_data_size = len;
- info->max_sig_size = len;
- }
-
info->max_enc_size = len;
info->max_dec_size = len;
@@ -323,7 +303,7 @@ static int software_key_eds_op(struct kernel_pkey_params *params,
if (ret)
goto error_free_tfm;
- ksz = crypto_sig_maxsize(sig);
+ ksz = crypto_sig_keysize(sig);
} else {
tfm = crypto_alloc_akcipher(alg_name, 0, 0);
if (IS_ERR(tfm)) {
diff --git a/crypto/asymmetric_keys/signature.c b/crypto/asymmetric_keys/signature.c
index 2deff81..041d04b 100644
--- a/crypto/asymmetric_keys/signature.c
+++ b/crypto/asymmetric_keys/signature.c
@@ -65,69 +65,6 @@ int query_asymmetric_key(const struct kernel_pkey_params *params,
EXPORT_SYMBOL_GPL(query_asymmetric_key);
/**
- * encrypt_blob - Encrypt data using an asymmetric key
- * @params: Various parameters
- * @data: Data blob to be encrypted, length params->data_len
- * @enc: Encrypted data buffer, length params->enc_len
- *
- * Encrypt the specified data blob using the private key specified by
- * params->key. The encrypted data is wrapped in an encoding if
- * params->encoding is specified (eg. "pkcs1").
- *
- * Returns the length of the data placed in the encrypted data buffer or an
- * error.
- */
-int encrypt_blob(struct kernel_pkey_params *params,
- const void *data, void *enc)
-{
- params->op = kernel_pkey_encrypt;
- return asymmetric_key_eds_op(params, data, enc);
-}
-EXPORT_SYMBOL_GPL(encrypt_blob);
-
-/**
- * decrypt_blob - Decrypt data using an asymmetric key
- * @params: Various parameters
- * @enc: Encrypted data to be decrypted, length params->enc_len
- * @data: Decrypted data buffer, length params->data_len
- *
- * Decrypt the specified data blob using the private key specified by
- * params->key. The decrypted data is wrapped in an encoding if
- * params->encoding is specified (eg. "pkcs1").
- *
- * Returns the length of the data placed in the decrypted data buffer or an
- * error.
- */
-int decrypt_blob(struct kernel_pkey_params *params,
- const void *enc, void *data)
-{
- params->op = kernel_pkey_decrypt;
- return asymmetric_key_eds_op(params, enc, data);
-}
-EXPORT_SYMBOL_GPL(decrypt_blob);
-
-/**
- * create_signature - Sign some data using an asymmetric key
- * @params: Various parameters
- * @data: Data blob to be signed, length params->data_len
- * @enc: Signature buffer, length params->enc_len
- *
- * Sign the specified data blob using the private key specified by params->key.
- * The signature is wrapped in an encoding if params->encoding is specified
- * (eg. "pkcs1"). If the encoding needs to know the digest type, this can be
- * passed through params->hash_algo (eg. "sha1").
- *
- * Returns the length of the data placed in the signature buffer or an error.
- */
-int create_signature(struct kernel_pkey_params *params,
- const void *data, void *enc)
-{
- params->op = kernel_pkey_sign;
- return asymmetric_key_eds_op(params, data, enc);
-}
-EXPORT_SYMBOL_GPL(create_signature);
-
-/**
* verify_signature - Initiate the use of an asymmetric key to verify a signature
* @key: The asymmetric key to verify against
* @sig: The signature to check
diff --git a/crypto/crc32_generic.c b/crypto/crc32_generic.c
index d125166..6a55d20 100644
--- a/crypto/crc32_generic.c
+++ b/crypto/crc32_generic.c
@@ -59,6 +59,15 @@ static int crc32_update(struct shash_desc *desc, const u8 *data,
{
u32 *crcp = shash_desc_ctx(desc);
+ *crcp = crc32_le_base(*crcp, data, len);
+ return 0;
+}
+
+static int crc32_update_arch(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ u32 *crcp = shash_desc_ctx(desc);
+
*crcp = crc32_le(*crcp, data, len);
return 0;
}
@@ -67,6 +76,13 @@ static int crc32_update(struct shash_desc *desc, const u8 *data,
static int __crc32_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
+ put_unaligned_le32(crc32_le_base(*crcp, data, len), out);
+ return 0;
+}
+
+static int __crc32_finup_arch(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
put_unaligned_le32(crc32_le(*crcp, data, len), out);
return 0;
}
@@ -77,6 +93,12 @@ static int crc32_finup(struct shash_desc *desc, const u8 *data,
return __crc32_finup(shash_desc_ctx(desc), data, len, out);
}
+static int crc32_finup_arch(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32_finup_arch(shash_desc_ctx(desc), data, len, out);
+}
+
static int crc32_final(struct shash_desc *desc, u8 *out)
{
u32 *crcp = shash_desc_ctx(desc);
@@ -88,38 +110,62 @@ static int crc32_final(struct shash_desc *desc, u8 *out)
static int crc32_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len,
- out);
+ return __crc32_finup(crypto_shash_ctx(desc->tfm), data, len, out);
}
-static struct shash_alg alg = {
- .setkey = crc32_setkey,
- .init = crc32_init,
- .update = crc32_update,
- .final = crc32_final,
- .finup = crc32_finup,
- .digest = crc32_digest,
- .descsize = sizeof(u32),
- .digestsize = CHKSUM_DIGEST_SIZE,
- .base = {
- .cra_name = "crc32",
- .cra_driver_name = "crc32-generic",
- .cra_priority = 100,
- .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .cra_blocksize = CHKSUM_BLOCK_SIZE,
- .cra_ctxsize = sizeof(u32),
- .cra_module = THIS_MODULE,
- .cra_init = crc32_cra_init,
- }
-};
+
+static int crc32_digest_arch(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ return __crc32_finup_arch(crypto_shash_ctx(desc->tfm), data, len, out);
+}
+
+static struct shash_alg algs[] = {{
+ .setkey = crc32_setkey,
+ .init = crc32_init,
+ .update = crc32_update,
+ .final = crc32_final,
+ .finup = crc32_finup,
+ .digest = crc32_digest,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+
+ .base.cra_name = "crc32",
+ .base.cra_driver_name = "crc32-generic",
+ .base.cra_priority = 100,
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(u32),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_init = crc32_cra_init,
+}, {
+ .setkey = crc32_setkey,
+ .init = crc32_init,
+ .update = crc32_update_arch,
+ .final = crc32_final,
+ .finup = crc32_finup_arch,
+ .digest = crc32_digest_arch,
+ .descsize = sizeof(u32),
+ .digestsize = CHKSUM_DIGEST_SIZE,
+
+ .base.cra_name = "crc32",
+ .base.cra_driver_name = "crc32-" __stringify(ARCH),
+ .base.cra_priority = 150,
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(u32),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_init = crc32_cra_init,
+}};
static int __init crc32_mod_init(void)
{
- return crypto_register_shash(&alg);
+ /* register the arch flavor only if it differs from the generic one */
+ return crypto_register_shashes(algs, 1 + (&crc32_le != &crc32_le_base));
}
static void __exit crc32_mod_fini(void)
{
- crypto_unregister_shash(&alg);
+ crypto_unregister_shashes(algs, 1 + (&crc32_le != &crc32_le_base));
}
subsys_initcall(crc32_mod_init);
diff --git a/crypto/crc32c_generic.c b/crypto/crc32c_generic.c
index a8c90b3..7c2357c 100644
--- a/crypto/crc32c_generic.c
+++ b/crypto/crc32c_generic.c
@@ -85,6 +85,15 @@ static int chksum_update(struct shash_desc *desc, const u8 *data,
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+ ctx->crc = __crc32c_le_base(ctx->crc, data, length);
+ return 0;
+}
+
+static int chksum_update_arch(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
ctx->crc = __crc32c_le(ctx->crc, data, length);
return 0;
}
@@ -99,6 +108,13 @@ static int chksum_final(struct shash_desc *desc, u8 *out)
static int __chksum_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out)
{
+ put_unaligned_le32(~__crc32c_le_base(*crcp, data, len), out);
+ return 0;
+}
+
+static int __chksum_finup_arch(u32 *crcp, const u8 *data, unsigned int len,
+ u8 *out)
+{
put_unaligned_le32(~__crc32c_le(*crcp, data, len), out);
return 0;
}
@@ -111,6 +127,14 @@ static int chksum_finup(struct shash_desc *desc, const u8 *data,
return __chksum_finup(&ctx->crc, data, len, out);
}
+static int chksum_finup_arch(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ return __chksum_finup_arch(&ctx->crc, data, len, out);
+}
+
static int chksum_digest(struct shash_desc *desc, const u8 *data,
unsigned int length, u8 *out)
{
@@ -119,6 +143,14 @@ static int chksum_digest(struct shash_desc *desc, const u8 *data,
return __chksum_finup(&mctx->key, data, length, out);
}
+static int chksum_digest_arch(struct shash_desc *desc, const u8 *data,
+ unsigned int length, u8 *out)
+{
+ struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
+
+ return __chksum_finup_arch(&mctx->key, data, length, out);
+}
+
static int crc32c_cra_init(struct crypto_tfm *tfm)
{
struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
@@ -127,35 +159,53 @@ static int crc32c_cra_init(struct crypto_tfm *tfm)
return 0;
}
-static struct shash_alg alg = {
- .digestsize = CHKSUM_DIGEST_SIZE,
- .setkey = chksum_setkey,
- .init = chksum_init,
- .update = chksum_update,
- .final = chksum_final,
- .finup = chksum_finup,
- .digest = chksum_digest,
- .descsize = sizeof(struct chksum_desc_ctx),
- .base = {
- .cra_name = "crc32c",
- .cra_driver_name = "crc32c-generic",
- .cra_priority = 100,
- .cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
- .cra_blocksize = CHKSUM_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct chksum_ctx),
- .cra_module = THIS_MODULE,
- .cra_init = crc32c_cra_init,
- }
-};
+static struct shash_alg algs[] = {{
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .setkey = chksum_setkey,
+ .init = chksum_init,
+ .update = chksum_update,
+ .final = chksum_final,
+ .finup = chksum_finup,
+ .digest = chksum_digest,
+ .descsize = sizeof(struct chksum_desc_ctx),
+
+ .base.cra_name = "crc32c",
+ .base.cra_driver_name = "crc32c-generic",
+ .base.cra_priority = 100,
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct chksum_ctx),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_init = crc32c_cra_init,
+}, {
+ .digestsize = CHKSUM_DIGEST_SIZE,
+ .setkey = chksum_setkey,
+ .init = chksum_init,
+ .update = chksum_update_arch,
+ .final = chksum_final,
+ .finup = chksum_finup_arch,
+ .digest = chksum_digest_arch,
+ .descsize = sizeof(struct chksum_desc_ctx),
+
+ .base.cra_name = "crc32c",
+ .base.cra_driver_name = "crc32c-" __stringify(ARCH),
+ .base.cra_priority = 150,
+ .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
+ .base.cra_blocksize = CHKSUM_BLOCK_SIZE,
+ .base.cra_ctxsize = sizeof(struct chksum_ctx),
+ .base.cra_module = THIS_MODULE,
+ .base.cra_init = crc32c_cra_init,
+}};
static int __init crc32c_mod_init(void)
{
- return crypto_register_shash(&alg);
+ /* register the arch flavor only if it differs from the generic one */
+ return crypto_register_shashes(algs, 1 + (&__crc32c_le != &__crc32c_le_base));
}
static void __exit crc32c_mod_fini(void)
{
- crypto_unregister_shash(&alg);
+ crypto_unregister_shashes(algs, 1 + (&__crc32c_le != &__crc32c_le_base));
}
subsys_initcall(crc32c_mod_init);
diff --git a/crypto/drbg.c b/crypto/drbg.c
index 3addce9..c323f40 100644
--- a/crypto/drbg.c
+++ b/crypto/drbg.c
@@ -101,6 +101,7 @@
#include <crypto/internal/cipher.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
+#include <linux/string_choices.h>
/***************************************************************
* Backend cipher definitions available to DRBG
@@ -1412,7 +1413,7 @@ static int drbg_generate(struct drbg_state *drbg,
if (drbg->pr || drbg->seeded == DRBG_SEED_STATE_UNSEEDED) {
pr_devel("DRBG: reseeding before generation (prediction "
"resistance: %s, state %s)\n",
- drbg->pr ? "true" : "false",
+ str_true_false(drbg->pr),
(drbg->seeded == DRBG_SEED_STATE_FULL ?
"seeded" : "unseeded"));
/* 9.3.1 steps 7.1 through 7.3 */
@@ -1562,7 +1563,7 @@ static int drbg_instantiate(struct drbg_state *drbg, struct drbg_string *pers,
bool reseed = true;
pr_devel("DRBG: Initializing DRBG core %d with prediction resistance "
- "%s\n", coreref, pr ? "enabled" : "disabled");
+ "%s\n", coreref, str_enabled_disabled(pr));
mutex_lock(&drbg->drbg_mutex);
/* 9.1 step 1 is implicit with the selected DRBG type */
diff --git a/crypto/ecdsa-p1363.c b/crypto/ecdsa-p1363.c
new file mode 100644
index 0000000..eaae721
--- /dev/null
+++ b/crypto/ecdsa-p1363.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ECDSA P1363 signature encoding
+ *
+ * Copyright (c) 2024 Intel Corporation
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <crypto/algapi.h>
+#include <crypto/sig.h>
+#include <crypto/internal/ecc.h>
+#include <crypto/internal/sig.h>
+
+struct ecdsa_p1363_ctx {
+ struct crypto_sig *child;
+};
+
+static int ecdsa_p1363_verify(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ const void *digest, unsigned int dlen)
+{
+ struct ecdsa_p1363_ctx *ctx = crypto_sig_ctx(tfm);
+ unsigned int keylen = crypto_sig_keysize(ctx->child);
+ unsigned int ndigits = DIV_ROUND_UP(keylen, sizeof(u64));
+ struct ecdsa_raw_sig sig;
+
+ if (slen != 2 * keylen)
+ return -EINVAL;
+
+ ecc_digits_from_bytes(src, keylen, sig.r, ndigits);
+ ecc_digits_from_bytes(src + keylen, keylen, sig.s, ndigits);
+
+ return crypto_sig_verify(ctx->child, &sig, sizeof(sig), digest, dlen);
+}
+
+static unsigned int ecdsa_p1363_key_size(struct crypto_sig *tfm)
+{
+ struct ecdsa_p1363_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return crypto_sig_keysize(ctx->child);
+}
+
+static unsigned int ecdsa_p1363_max_size(struct crypto_sig *tfm)
+{
+ struct ecdsa_p1363_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return 2 * crypto_sig_keysize(ctx->child);
+}
+
+static unsigned int ecdsa_p1363_digest_size(struct crypto_sig *tfm)
+{
+ struct ecdsa_p1363_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return crypto_sig_digestsize(ctx->child);
+}
+
+static int ecdsa_p1363_set_pub_key(struct crypto_sig *tfm,
+ const void *key, unsigned int keylen)
+{
+ struct ecdsa_p1363_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return crypto_sig_set_pubkey(ctx->child, key, keylen);
+}
+
+static int ecdsa_p1363_init_tfm(struct crypto_sig *tfm)
+{
+ struct sig_instance *inst = sig_alg_instance(tfm);
+ struct crypto_sig_spawn *spawn = sig_instance_ctx(inst);
+ struct ecdsa_p1363_ctx *ctx = crypto_sig_ctx(tfm);
+ struct crypto_sig *child_tfm;
+
+ child_tfm = crypto_spawn_sig(spawn);
+ if (IS_ERR(child_tfm))
+ return PTR_ERR(child_tfm);
+
+ ctx->child = child_tfm;
+
+ return 0;
+}
+
+static void ecdsa_p1363_exit_tfm(struct crypto_sig *tfm)
+{
+ struct ecdsa_p1363_ctx *ctx = crypto_sig_ctx(tfm);
+
+ crypto_free_sig(ctx->child);
+}
+
+static void ecdsa_p1363_free(struct sig_instance *inst)
+{
+ struct crypto_sig_spawn *spawn = sig_instance_ctx(inst);
+
+ crypto_drop_sig(spawn);
+ kfree(inst);
+}
+
+static int ecdsa_p1363_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+ struct crypto_sig_spawn *spawn;
+ struct sig_instance *inst;
+ struct sig_alg *ecdsa_alg;
+ u32 mask;
+ int err;
+
+ err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SIG, &mask);
+ if (err)
+ return err;
+
+ inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
+ if (!inst)
+ return -ENOMEM;
+
+ spawn = sig_instance_ctx(inst);
+
+ err = crypto_grab_sig(spawn, sig_crypto_instance(inst),
+ crypto_attr_alg_name(tb[1]), 0, mask);
+ if (err)
+ goto err_free_inst;
+
+ ecdsa_alg = crypto_spawn_sig_alg(spawn);
+
+ err = -EINVAL;
+ if (strncmp(ecdsa_alg->base.cra_name, "ecdsa", 5) != 0)
+ goto err_free_inst;
+
+ err = crypto_inst_setname(sig_crypto_instance(inst), tmpl->name,
+ &ecdsa_alg->base);
+ if (err)
+ goto err_free_inst;
+
+ inst->alg.base.cra_priority = ecdsa_alg->base.cra_priority;
+ inst->alg.base.cra_ctxsize = sizeof(struct ecdsa_p1363_ctx);
+
+ inst->alg.init = ecdsa_p1363_init_tfm;
+ inst->alg.exit = ecdsa_p1363_exit_tfm;
+
+ inst->alg.verify = ecdsa_p1363_verify;
+ inst->alg.key_size = ecdsa_p1363_key_size;
+ inst->alg.max_size = ecdsa_p1363_max_size;
+ inst->alg.digest_size = ecdsa_p1363_digest_size;
+ inst->alg.set_pub_key = ecdsa_p1363_set_pub_key;
+
+ inst->free = ecdsa_p1363_free;
+
+ err = sig_register_instance(tmpl, inst);
+ if (err) {
+err_free_inst:
+ ecdsa_p1363_free(inst);
+ }
+ return err;
+}
+
+struct crypto_template ecdsa_p1363_tmpl = {
+ .name = "p1363",
+ .create = ecdsa_p1363_create,
+ .module = THIS_MODULE,
+};
+
+MODULE_ALIAS_CRYPTO("p1363");
diff --git a/crypto/ecdsa-x962.c b/crypto/ecdsa-x962.c
new file mode 100644
index 0000000..6a77c13
--- /dev/null
+++ b/crypto/ecdsa-x962.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * ECDSA X9.62 signature encoding
+ *
+ * Copyright (c) 2021 IBM Corporation
+ * Copyright (c) 2024 Intel Corporation
+ */
+
+#include <linux/asn1_decoder.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <crypto/algapi.h>
+#include <crypto/sig.h>
+#include <crypto/internal/ecc.h>
+#include <crypto/internal/sig.h>
+
+#include "ecdsasignature.asn1.h"
+
+struct ecdsa_x962_ctx {
+ struct crypto_sig *child;
+};
+
+struct ecdsa_x962_signature_ctx {
+ struct ecdsa_raw_sig sig;
+ unsigned int ndigits;
+};
+
+/* Get the r and s components of a signature from the X.509 certificate. */
+static int ecdsa_get_signature_rs(u64 *dest, size_t hdrlen, unsigned char tag,
+ const void *value, size_t vlen,
+ unsigned int ndigits)
+{
+ size_t bufsize = ndigits * sizeof(u64);
+ const char *d = value;
+
+ if (!value || !vlen || vlen > bufsize + 1)
+ return -EINVAL;
+
+ /*
+ * vlen may be 1 byte larger than bufsize due to a leading zero byte
+ * (necessary if the most significant bit of the integer is set).
+ */
+ if (vlen > bufsize) {
+ /* skip over leading zeros that make 'value' a positive int */
+ if (*d == 0) {
+ vlen -= 1;
+ d++;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ ecc_digits_from_bytes(d, vlen, dest, ndigits);
+
+ return 0;
+}
+
+int ecdsa_get_signature_r(void *context, size_t hdrlen, unsigned char tag,
+ const void *value, size_t vlen)
+{
+ struct ecdsa_x962_signature_ctx *sig_ctx = context;
+
+ return ecdsa_get_signature_rs(sig_ctx->sig.r, hdrlen, tag, value, vlen,
+ sig_ctx->ndigits);
+}
+
+int ecdsa_get_signature_s(void *context, size_t hdrlen, unsigned char tag,
+ const void *value, size_t vlen)
+{
+ struct ecdsa_x962_signature_ctx *sig_ctx = context;
+
+ return ecdsa_get_signature_rs(sig_ctx->sig.s, hdrlen, tag, value, vlen,
+ sig_ctx->ndigits);
+}
+
+static int ecdsa_x962_verify(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ const void *digest, unsigned int dlen)
+{
+ struct ecdsa_x962_ctx *ctx = crypto_sig_ctx(tfm);
+ struct ecdsa_x962_signature_ctx sig_ctx;
+ int err;
+
+ sig_ctx.ndigits = DIV_ROUND_UP(crypto_sig_keysize(ctx->child),
+ sizeof(u64));
+
+ err = asn1_ber_decoder(&ecdsasignature_decoder, &sig_ctx, src, slen);
+ if (err < 0)
+ return err;
+
+ return crypto_sig_verify(ctx->child, &sig_ctx.sig, sizeof(sig_ctx.sig),
+ digest, dlen);
+}
+
+static unsigned int ecdsa_x962_key_size(struct crypto_sig *tfm)
+{
+ struct ecdsa_x962_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return crypto_sig_keysize(ctx->child);
+}
+
+static unsigned int ecdsa_x962_max_size(struct crypto_sig *tfm)
+{
+ struct ecdsa_x962_ctx *ctx = crypto_sig_ctx(tfm);
+ struct sig_alg *alg = crypto_sig_alg(ctx->child);
+ int slen = crypto_sig_keysize(ctx->child);
+
+ /*
+ * Verify takes ECDSA-Sig-Value (described in RFC 5480) as input,
+ * which is actually 2 'key_size'-bit integers encoded in ASN.1.
+ * Account for the ASN.1 encoding overhead here.
+ *
+ * NIST P192/256/384 may prepend a '0' to a coordinate to indicate
+ * a positive integer. NIST P521 never needs it.
+ */
+ if (strcmp(alg->base.cra_name, "ecdsa-nist-p521") != 0)
+ slen += 1;
+
+ /* Length of encoding the x & y coordinates */
+ slen = 2 * (slen + 2);
+
+ /*
+ * If coordinate encoding takes at least 128 bytes then an
+ * additional byte for length encoding is needed.
+ */
+ return 1 + (slen >= 128) + 1 + slen;
+}
+
+static unsigned int ecdsa_x962_digest_size(struct crypto_sig *tfm)
+{
+ struct ecdsa_x962_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return crypto_sig_digestsize(ctx->child);
+}
+
+static int ecdsa_x962_set_pub_key(struct crypto_sig *tfm,
+ const void *key, unsigned int keylen)
+{
+ struct ecdsa_x962_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return crypto_sig_set_pubkey(ctx->child, key, keylen);
+}
+
+static int ecdsa_x962_init_tfm(struct crypto_sig *tfm)
+{
+ struct sig_instance *inst = sig_alg_instance(tfm);
+ struct crypto_sig_spawn *spawn = sig_instance_ctx(inst);
+ struct ecdsa_x962_ctx *ctx = crypto_sig_ctx(tfm);
+ struct crypto_sig *child_tfm;
+
+ child_tfm = crypto_spawn_sig(spawn);
+ if (IS_ERR(child_tfm))
+ return PTR_ERR(child_tfm);
+
+ ctx->child = child_tfm;
+
+ return 0;
+}
+
+static void ecdsa_x962_exit_tfm(struct crypto_sig *tfm)
+{
+ struct ecdsa_x962_ctx *ctx = crypto_sig_ctx(tfm);
+
+ crypto_free_sig(ctx->child);
+}
+
+static void ecdsa_x962_free(struct sig_instance *inst)
+{
+ struct crypto_sig_spawn *spawn = sig_instance_ctx(inst);
+
+ crypto_drop_sig(spawn);
+ kfree(inst);
+}
+
+static int ecdsa_x962_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+ struct crypto_sig_spawn *spawn;
+ struct sig_instance *inst;
+ struct sig_alg *ecdsa_alg;
+ u32 mask;
+ int err;
+
+ err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SIG, &mask);
+ if (err)
+ return err;
+
+ inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
+ if (!inst)
+ return -ENOMEM;
+
+ spawn = sig_instance_ctx(inst);
+
+ err = crypto_grab_sig(spawn, sig_crypto_instance(inst),
+ crypto_attr_alg_name(tb[1]), 0, mask);
+ if (err)
+ goto err_free_inst;
+
+ ecdsa_alg = crypto_spawn_sig_alg(spawn);
+
+ err = -EINVAL;
+ if (strncmp(ecdsa_alg->base.cra_name, "ecdsa", 5) != 0)
+ goto err_free_inst;
+
+ err = crypto_inst_setname(sig_crypto_instance(inst), tmpl->name,
+ &ecdsa_alg->base);
+ if (err)
+ goto err_free_inst;
+
+ inst->alg.base.cra_priority = ecdsa_alg->base.cra_priority;
+ inst->alg.base.cra_ctxsize = sizeof(struct ecdsa_x962_ctx);
+
+ inst->alg.init = ecdsa_x962_init_tfm;
+ inst->alg.exit = ecdsa_x962_exit_tfm;
+
+ inst->alg.verify = ecdsa_x962_verify;
+ inst->alg.key_size = ecdsa_x962_key_size;
+ inst->alg.max_size = ecdsa_x962_max_size;
+ inst->alg.digest_size = ecdsa_x962_digest_size;
+ inst->alg.set_pub_key = ecdsa_x962_set_pub_key;
+
+ inst->free = ecdsa_x962_free;
+
+ err = sig_register_instance(tmpl, inst);
+ if (err) {
+err_free_inst:
+ ecdsa_x962_free(inst);
+ }
+ return err;
+}
+
+struct crypto_template ecdsa_x962_tmpl = {
+ .name = "x962",
+ .create = ecdsa_x962_create,
+ .module = THIS_MODULE,
+};
+
+MODULE_ALIAS_CRYPTO("x962");
diff --git a/crypto/ecdsa.c b/crypto/ecdsa.c
index d5a1095..117526d 100644
--- a/crypto/ecdsa.c
+++ b/crypto/ecdsa.c
@@ -4,14 +4,11 @@
*/
#include <linux/module.h>
-#include <crypto/internal/akcipher.h>
#include <crypto/internal/ecc.h>
-#include <crypto/akcipher.h>
+#include <crypto/internal/sig.h>
#include <crypto/ecdh.h>
-#include <linux/asn1_decoder.h>
-#include <linux/scatterlist.h>
-
-#include "ecdsasignature.asn1.h"
+#include <crypto/sha2.h>
+#include <crypto/sig.h>
struct ecc_ctx {
unsigned int curve_id;
@@ -23,66 +20,6 @@ struct ecc_ctx {
struct ecc_point pub_key;
};
-struct ecdsa_signature_ctx {
- const struct ecc_curve *curve;
- u64 r[ECC_MAX_DIGITS];
- u64 s[ECC_MAX_DIGITS];
-};
-
-/*
- * Get the r and s components of a signature from the X509 certificate.
- */
-static int ecdsa_get_signature_rs(u64 *dest, size_t hdrlen, unsigned char tag,
- const void *value, size_t vlen, unsigned int ndigits)
-{
- size_t bufsize = ndigits * sizeof(u64);
- ssize_t diff = vlen - bufsize;
- const char *d = value;
-
- if (!value || !vlen)
- return -EINVAL;
-
- /* diff = 0: 'value' has exacly the right size
- * diff > 0: 'value' has too many bytes; one leading zero is allowed that
- * makes the value a positive integer; error on more
- * diff < 0: 'value' is missing leading zeros
- */
- if (diff > 0) {
- /* skip over leading zeros that make 'value' a positive int */
- if (*d == 0) {
- vlen -= 1;
- diff--;
- d++;
- }
- if (diff)
- return -EINVAL;
- }
- if (-diff >= bufsize)
- return -EINVAL;
-
- ecc_digits_from_bytes(d, vlen, dest, ndigits);
-
- return 0;
-}
-
-int ecdsa_get_signature_r(void *context, size_t hdrlen, unsigned char tag,
- const void *value, size_t vlen)
-{
- struct ecdsa_signature_ctx *sig = context;
-
- return ecdsa_get_signature_rs(sig->r, hdrlen, tag, value, vlen,
- sig->curve->g.ndigits);
-}
-
-int ecdsa_get_signature_s(void *context, size_t hdrlen, unsigned char tag,
- const void *value, size_t vlen)
-{
- struct ecdsa_signature_ctx *sig = context;
-
- return ecdsa_get_signature_rs(sig->s, hdrlen, tag, value, vlen,
- sig->curve->g.ndigits);
-}
-
static int _ecdsa_verify(struct ecc_ctx *ctx, const u64 *hash, const u64 *r, const u64 *s)
{
const struct ecc_curve *curve = ctx->curve;
@@ -126,46 +63,27 @@ static int _ecdsa_verify(struct ecc_ctx *ctx, const u64 *hash, const u64 *r, con
/*
* Verify an ECDSA signature.
*/
-static int ecdsa_verify(struct akcipher_request *req)
+static int ecdsa_verify(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ const void *digest, unsigned int dlen)
{
- struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
- struct ecc_ctx *ctx = akcipher_tfm_ctx(tfm);
+ struct ecc_ctx *ctx = crypto_sig_ctx(tfm);
size_t bufsize = ctx->curve->g.ndigits * sizeof(u64);
- struct ecdsa_signature_ctx sig_ctx = {
- .curve = ctx->curve,
- };
+ const struct ecdsa_raw_sig *sig = src;
u64 hash[ECC_MAX_DIGITS];
- unsigned char *buffer;
- int ret;
if (unlikely(!ctx->pub_key_set))
return -EINVAL;
- buffer = kmalloc(req->src_len + req->dst_len, GFP_KERNEL);
- if (!buffer)
- return -ENOMEM;
+ if (slen != sizeof(*sig))
+ return -EINVAL;
- sg_pcopy_to_buffer(req->src,
- sg_nents_for_len(req->src, req->src_len + req->dst_len),
- buffer, req->src_len + req->dst_len, 0);
+ if (bufsize > dlen)
+ bufsize = dlen;
- ret = asn1_ber_decoder(&ecdsasignature_decoder, &sig_ctx,
- buffer, req->src_len);
- if (ret < 0)
- goto error;
+ ecc_digits_from_bytes(digest, bufsize, hash, ctx->curve->g.ndigits);
- if (bufsize > req->dst_len)
- bufsize = req->dst_len;
-
- ecc_digits_from_bytes(buffer + req->src_len, bufsize,
- hash, ctx->curve->g.ndigits);
-
- ret = _ecdsa_verify(ctx, hash, sig_ctx.r, sig_ctx.s);
-
-error:
- kfree(buffer);
-
- return ret;
+ return _ecdsa_verify(ctx, hash, sig->r, sig->s);
}
static int ecdsa_ecc_ctx_init(struct ecc_ctx *ctx, unsigned int curve_id)
@@ -201,9 +119,10 @@ static int ecdsa_ecc_ctx_reset(struct ecc_ctx *ctx)
* Set the public ECC key as defined by RFC5480 section 2.2 "Subject Public
* Key". Only the uncompressed format is supported.
*/
-static int ecdsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen)
+static int ecdsa_set_pub_key(struct crypto_sig *tfm, const void *key,
+ unsigned int keylen)
{
- struct ecc_ctx *ctx = akcipher_tfm_ctx(tfm);
+ struct ecc_ctx *ctx = crypto_sig_ctx(tfm);
unsigned int digitlen, ndigits;
const unsigned char *d = key;
int ret;
@@ -237,31 +156,43 @@ static int ecdsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsig
return ret;
}
-static void ecdsa_exit_tfm(struct crypto_akcipher *tfm)
+static void ecdsa_exit_tfm(struct crypto_sig *tfm)
{
- struct ecc_ctx *ctx = akcipher_tfm_ctx(tfm);
+ struct ecc_ctx *ctx = crypto_sig_ctx(tfm);
ecdsa_ecc_ctx_deinit(ctx);
}
-static unsigned int ecdsa_max_size(struct crypto_akcipher *tfm)
+static unsigned int ecdsa_key_size(struct crypto_sig *tfm)
{
- struct ecc_ctx *ctx = akcipher_tfm_ctx(tfm);
+ struct ecc_ctx *ctx = crypto_sig_ctx(tfm);
return DIV_ROUND_UP(ctx->curve->nbits, 8);
}
-static int ecdsa_nist_p521_init_tfm(struct crypto_akcipher *tfm)
+static unsigned int ecdsa_digest_size(struct crypto_sig *tfm)
{
- struct ecc_ctx *ctx = akcipher_tfm_ctx(tfm);
+ /*
+ * ECDSA key sizes are much smaller than RSA, and thus could
+ * operate on (hashed) inputs that are larger than the key size.
+ * E.g. SHA384-hashed input used with secp256r1 based keys.
+ * Return the largest supported hash size (SHA512).
+ */
+ return SHA512_DIGEST_SIZE;
+}
+
+static int ecdsa_nist_p521_init_tfm(struct crypto_sig *tfm)
+{
+ struct ecc_ctx *ctx = crypto_sig_ctx(tfm);
return ecdsa_ecc_ctx_init(ctx, ECC_CURVE_NIST_P521);
}
-static struct akcipher_alg ecdsa_nist_p521 = {
+static struct sig_alg ecdsa_nist_p521 = {
.verify = ecdsa_verify,
.set_pub_key = ecdsa_set_pub_key,
- .max_size = ecdsa_max_size,
+ .key_size = ecdsa_key_size,
+ .digest_size = ecdsa_digest_size,
.init = ecdsa_nist_p521_init_tfm,
.exit = ecdsa_exit_tfm,
.base = {
@@ -273,17 +204,18 @@ static struct akcipher_alg ecdsa_nist_p521 = {
},
};
-static int ecdsa_nist_p384_init_tfm(struct crypto_akcipher *tfm)
+static int ecdsa_nist_p384_init_tfm(struct crypto_sig *tfm)
{
- struct ecc_ctx *ctx = akcipher_tfm_ctx(tfm);
+ struct ecc_ctx *ctx = crypto_sig_ctx(tfm);
return ecdsa_ecc_ctx_init(ctx, ECC_CURVE_NIST_P384);
}
-static struct akcipher_alg ecdsa_nist_p384 = {
+static struct sig_alg ecdsa_nist_p384 = {
.verify = ecdsa_verify,
.set_pub_key = ecdsa_set_pub_key,
- .max_size = ecdsa_max_size,
+ .key_size = ecdsa_key_size,
+ .digest_size = ecdsa_digest_size,
.init = ecdsa_nist_p384_init_tfm,
.exit = ecdsa_exit_tfm,
.base = {
@@ -295,17 +227,18 @@ static struct akcipher_alg ecdsa_nist_p384 = {
},
};
-static int ecdsa_nist_p256_init_tfm(struct crypto_akcipher *tfm)
+static int ecdsa_nist_p256_init_tfm(struct crypto_sig *tfm)
{
- struct ecc_ctx *ctx = akcipher_tfm_ctx(tfm);
+ struct ecc_ctx *ctx = crypto_sig_ctx(tfm);
return ecdsa_ecc_ctx_init(ctx, ECC_CURVE_NIST_P256);
}
-static struct akcipher_alg ecdsa_nist_p256 = {
+static struct sig_alg ecdsa_nist_p256 = {
.verify = ecdsa_verify,
.set_pub_key = ecdsa_set_pub_key,
- .max_size = ecdsa_max_size,
+ .key_size = ecdsa_key_size,
+ .digest_size = ecdsa_digest_size,
.init = ecdsa_nist_p256_init_tfm,
.exit = ecdsa_exit_tfm,
.base = {
@@ -317,17 +250,18 @@ static struct akcipher_alg ecdsa_nist_p256 = {
},
};
-static int ecdsa_nist_p192_init_tfm(struct crypto_akcipher *tfm)
+static int ecdsa_nist_p192_init_tfm(struct crypto_sig *tfm)
{
- struct ecc_ctx *ctx = akcipher_tfm_ctx(tfm);
+ struct ecc_ctx *ctx = crypto_sig_ctx(tfm);
return ecdsa_ecc_ctx_init(ctx, ECC_CURVE_NIST_P192);
}
-static struct akcipher_alg ecdsa_nist_p192 = {
+static struct sig_alg ecdsa_nist_p192 = {
.verify = ecdsa_verify,
.set_pub_key = ecdsa_set_pub_key,
- .max_size = ecdsa_max_size,
+ .key_size = ecdsa_key_size,
+ .digest_size = ecdsa_digest_size,
.init = ecdsa_nist_p192_init_tfm,
.exit = ecdsa_exit_tfm,
.base = {
@@ -345,42 +279,59 @@ static int __init ecdsa_init(void)
int ret;
/* NIST p192 may not be available in FIPS mode */
- ret = crypto_register_akcipher(&ecdsa_nist_p192);
+ ret = crypto_register_sig(&ecdsa_nist_p192);
ecdsa_nist_p192_registered = ret == 0;
- ret = crypto_register_akcipher(&ecdsa_nist_p256);
+ ret = crypto_register_sig(&ecdsa_nist_p256);
if (ret)
goto nist_p256_error;
- ret = crypto_register_akcipher(&ecdsa_nist_p384);
+ ret = crypto_register_sig(&ecdsa_nist_p384);
if (ret)
goto nist_p384_error;
- ret = crypto_register_akcipher(&ecdsa_nist_p521);
+ ret = crypto_register_sig(&ecdsa_nist_p521);
if (ret)
goto nist_p521_error;
+ ret = crypto_register_template(&ecdsa_x962_tmpl);
+ if (ret)
+ goto x962_tmpl_error;
+
+ ret = crypto_register_template(&ecdsa_p1363_tmpl);
+ if (ret)
+ goto p1363_tmpl_error;
+
return 0;
+p1363_tmpl_error:
+ crypto_unregister_template(&ecdsa_x962_tmpl);
+
+x962_tmpl_error:
+ crypto_unregister_sig(&ecdsa_nist_p521);
+
nist_p521_error:
- crypto_unregister_akcipher(&ecdsa_nist_p384);
+ crypto_unregister_sig(&ecdsa_nist_p384);
nist_p384_error:
- crypto_unregister_akcipher(&ecdsa_nist_p256);
+ crypto_unregister_sig(&ecdsa_nist_p256);
nist_p256_error:
if (ecdsa_nist_p192_registered)
- crypto_unregister_akcipher(&ecdsa_nist_p192);
+ crypto_unregister_sig(&ecdsa_nist_p192);
return ret;
}
static void __exit ecdsa_exit(void)
{
+ crypto_unregister_template(&ecdsa_x962_tmpl);
+ crypto_unregister_template(&ecdsa_p1363_tmpl);
+
if (ecdsa_nist_p192_registered)
- crypto_unregister_akcipher(&ecdsa_nist_p192);
- crypto_unregister_akcipher(&ecdsa_nist_p256);
- crypto_unregister_akcipher(&ecdsa_nist_p384);
- crypto_unregister_akcipher(&ecdsa_nist_p521);
+ crypto_unregister_sig(&ecdsa_nist_p192);
+ crypto_unregister_sig(&ecdsa_nist_p256);
+ crypto_unregister_sig(&ecdsa_nist_p384);
+ crypto_unregister_sig(&ecdsa_nist_p521);
}
subsys_initcall(ecdsa_init);
diff --git a/crypto/ecrdsa.c b/crypto/ecrdsa.c
index 3811f38..b3dd8a3 100644
--- a/crypto/ecrdsa.c
+++ b/crypto/ecrdsa.c
@@ -18,12 +18,11 @@
#include <linux/module.h>
#include <linux/crypto.h>
+#include <crypto/sig.h>
#include <crypto/streebog.h>
-#include <crypto/internal/akcipher.h>
#include <crypto/internal/ecc.h>
-#include <crypto/akcipher.h>
+#include <crypto/internal/sig.h>
#include <linux/oid_registry.h>
-#include <linux/scatterlist.h>
#include "ecrdsa_params.asn1.h"
#include "ecrdsa_pub_key.asn1.h"
#include "ecrdsa_defs.h"
@@ -68,13 +67,12 @@ static const struct ecc_curve *get_curve_by_oid(enum OID oid)
}
}
-static int ecrdsa_verify(struct akcipher_request *req)
+static int ecrdsa_verify(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ const void *digest, unsigned int dlen)
{
- struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
- struct ecrdsa_ctx *ctx = akcipher_tfm_ctx(tfm);
- unsigned char sig[ECRDSA_MAX_SIG_SIZE];
- unsigned char digest[STREEBOG512_DIGEST_SIZE];
- unsigned int ndigits = req->dst_len / sizeof(u64);
+ struct ecrdsa_ctx *ctx = crypto_sig_ctx(tfm);
+ unsigned int ndigits = dlen / sizeof(u64);
u64 r[ECRDSA_MAX_DIGITS]; /* witness (r) */
u64 _r[ECRDSA_MAX_DIGITS]; /* -r */
u64 s[ECRDSA_MAX_DIGITS]; /* second part of sig (s) */
@@ -91,25 +89,19 @@ static int ecrdsa_verify(struct akcipher_request *req)
*/
if (!ctx->curve ||
!ctx->digest ||
- !req->src ||
+ !src ||
+ !digest ||
!ctx->pub_key.x ||
- req->dst_len != ctx->digest_len ||
- req->dst_len != ctx->curve->g.ndigits * sizeof(u64) ||
+ dlen != ctx->digest_len ||
+ dlen != ctx->curve->g.ndigits * sizeof(u64) ||
ctx->pub_key.ndigits != ctx->curve->g.ndigits ||
- req->dst_len * 2 != req->src_len ||
- WARN_ON(req->src_len > sizeof(sig)) ||
- WARN_ON(req->dst_len > sizeof(digest)))
+ dlen * 2 != slen ||
+ WARN_ON(slen > ECRDSA_MAX_SIG_SIZE) ||
+ WARN_ON(dlen > STREEBOG512_DIGEST_SIZE))
return -EBADMSG;
- sg_copy_to_buffer(req->src, sg_nents_for_len(req->src, req->src_len),
- sig, req->src_len);
- sg_pcopy_to_buffer(req->src,
- sg_nents_for_len(req->src,
- req->src_len + req->dst_len),
- digest, req->dst_len, req->src_len);
-
- vli_from_be64(s, sig, ndigits);
- vli_from_be64(r, sig + ndigits * sizeof(u64), ndigits);
+ vli_from_be64(s, src, ndigits);
+ vli_from_be64(r, src + ndigits * sizeof(u64), ndigits);
/* Step 1: verify that 0 < r < q, 0 < s < q */
if (vli_is_zero(r, ndigits) ||
@@ -188,10 +180,10 @@ static u8 *ecrdsa_unpack_u32(u32 *dst, void *src)
}
/* Parse BER encoded subjectPublicKey. */
-static int ecrdsa_set_pub_key(struct crypto_akcipher *tfm, const void *key,
+static int ecrdsa_set_pub_key(struct crypto_sig *tfm, const void *key,
unsigned int keylen)
{
- struct ecrdsa_ctx *ctx = akcipher_tfm_ctx(tfm);
+ struct ecrdsa_ctx *ctx = crypto_sig_ctx(tfm);
unsigned int ndigits;
u32 algo, paramlen;
u8 *params;
@@ -249,9 +241,9 @@ static int ecrdsa_set_pub_key(struct crypto_akcipher *tfm, const void *key,
return 0;
}
-static unsigned int ecrdsa_max_size(struct crypto_akcipher *tfm)
+static unsigned int ecrdsa_key_size(struct crypto_sig *tfm)
{
- struct ecrdsa_ctx *ctx = akcipher_tfm_ctx(tfm);
+ struct ecrdsa_ctx *ctx = crypto_sig_ctx(tfm);
/*
* Verify doesn't need any output, so it's just informational
@@ -260,13 +252,21 @@ static unsigned int ecrdsa_max_size(struct crypto_akcipher *tfm)
return ctx->pub_key.ndigits * sizeof(u64);
}
-static void ecrdsa_exit_tfm(struct crypto_akcipher *tfm)
+static unsigned int ecrdsa_max_size(struct crypto_sig *tfm)
+{
+ struct ecrdsa_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return 2 * ctx->pub_key.ndigits * sizeof(u64);
+}
+
+static void ecrdsa_exit_tfm(struct crypto_sig *tfm)
{
}
-static struct akcipher_alg ecrdsa_alg = {
+static struct sig_alg ecrdsa_alg = {
.verify = ecrdsa_verify,
.set_pub_key = ecrdsa_set_pub_key,
+ .key_size = ecrdsa_key_size,
.max_size = ecrdsa_max_size,
.exit = ecrdsa_exit_tfm,
.base = {
@@ -280,12 +280,12 @@ static struct akcipher_alg ecrdsa_alg = {
static int __init ecrdsa_mod_init(void)
{
- return crypto_register_akcipher(&ecrdsa_alg);
+ return crypto_register_sig(&ecrdsa_alg);
}
static void __exit ecrdsa_mod_fini(void)
{
- crypto_unregister_akcipher(&ecrdsa_alg);
+ crypto_unregister_sig(&ecrdsa_alg);
}
module_init(ecrdsa_mod_init);
diff --git a/crypto/internal.h b/crypto/internal.h
index 711a6a5..46b661b 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -22,8 +22,6 @@
#include <linux/sched.h>
#include <linux/types.h>
-struct akcipher_request;
-struct crypto_akcipher;
struct crypto_instance;
struct crypto_template;
@@ -35,19 +33,6 @@ struct crypto_larval {
bool test_started;
};
-struct crypto_akcipher_sync_data {
- struct crypto_akcipher *tfm;
- const void *src;
- void *dst;
- unsigned int slen;
- unsigned int dlen;
-
- struct akcipher_request *req;
- struct crypto_wait cwait;
- struct scatterlist sg;
- u8 *buf;
-};
-
enum {
CRYPTOA_UNSPEC,
CRYPTOA_ALG,
@@ -129,10 +114,6 @@ void *crypto_create_tfm_node(struct crypto_alg *alg,
void *crypto_clone_tfm(const struct crypto_type *frontend,
struct crypto_tfm *otfm);
-int crypto_akcipher_sync_prep(struct crypto_akcipher_sync_data *data);
-int crypto_akcipher_sync_post(struct crypto_akcipher_sync_data *data, int err);
-int crypto_init_akcipher_ops_sig(struct crypto_tfm *tfm);
-
static inline void *crypto_create_tfm(struct crypto_alg *alg,
const struct crypto_type *frontend)
{
diff --git a/crypto/jitterentropy-testing.c b/crypto/jitterentropy-testing.c
index 5cb6a77..21c9d7c 100644
--- a/crypto/jitterentropy-testing.c
+++ b/crypto/jitterentropy-testing.c
@@ -15,7 +15,7 @@
#define JENT_TEST_RINGBUFFER_MASK (JENT_TEST_RINGBUFFER_SIZE - 1)
struct jent_testing {
- u32 jent_testing_rb[JENT_TEST_RINGBUFFER_SIZE];
+ u64 jent_testing_rb[JENT_TEST_RINGBUFFER_SIZE];
u32 rb_reader;
atomic_t rb_writer;
atomic_t jent_testing_enabled;
@@ -72,7 +72,7 @@ static void jent_testing_fini(struct jent_testing *data, u32 boot)
pr_warn("Disabling data collection\n");
}
-static bool jent_testing_store(struct jent_testing *data, u32 value,
+static bool jent_testing_store(struct jent_testing *data, u64 value,
u32 *boot)
{
unsigned long flags;
@@ -156,20 +156,20 @@ static int jent_testing_reader(struct jent_testing *data, u32 *boot,
}
/* We copy out word-wise */
- if (outbuflen < sizeof(u32)) {
+ if (outbuflen < sizeof(u64)) {
spin_unlock_irqrestore(&data->lock, flags);
goto out;
}
memcpy(outbuf, &data->jent_testing_rb[data->rb_reader],
- sizeof(u32));
+ sizeof(u64));
data->rb_reader++;
spin_unlock_irqrestore(&data->lock, flags);
- outbuf += sizeof(u32);
- outbuflen -= sizeof(u32);
- collected_data += sizeof(u32);
+ outbuf += sizeof(u64);
+ outbuflen -= sizeof(u64);
+ collected_data += sizeof(u64);
}
out:
@@ -189,16 +189,17 @@ static int jent_testing_extract_user(struct file *file, char __user *buf,
/*
* The intention of this interface is for collecting at least
- * 1000 samples due to the SP800-90B requirements. So, we make no
- * effort in avoiding allocating more memory that actually needed
- * by the user. Hence, we allocate sufficient memory to always hold
- * that amount of data.
+ * 1000 samples due to the SP800-90B requirements. However, due to
+ * memory and performance constraints, it is not desirable to allocate
+ * 8000 bytes of memory. Instead, we allocate space for only 125
+ * samples, which will allow the user to collect all 1000 samples using
+ * 8 calls to this interface.
*/
- tmp = kmalloc(JENT_TEST_RINGBUFFER_SIZE + sizeof(u32), GFP_KERNEL);
+ tmp = kmalloc(125 * sizeof(u64) + sizeof(u64), GFP_KERNEL);
if (!tmp)
return -ENOMEM;
- tmp_aligned = PTR_ALIGN(tmp, sizeof(u32));
+ tmp_aligned = PTR_ALIGN(tmp, sizeof(u64));
while (nbytes) {
int i;
@@ -212,7 +213,7 @@ static int jent_testing_extract_user(struct file *file, char __user *buf,
schedule();
}
- i = min_t(int, nbytes, JENT_TEST_RINGBUFFER_SIZE);
+ i = min_t(int, nbytes, 125 * sizeof(u64));
i = reader(tmp_aligned, i);
if (i <= 0) {
if (i < 0)
@@ -251,7 +252,7 @@ static struct jent_testing jent_raw_hires = {
.read_wait = __WAIT_QUEUE_HEAD_INITIALIZER(jent_raw_hires.read_wait)
};
-int jent_raw_hires_entropy_store(__u32 value)
+int jent_raw_hires_entropy_store(__u64 value)
{
return jent_testing_store(&jent_raw_hires, value, &boot_raw_hires_test);
}
diff --git a/crypto/jitterentropy.h b/crypto/jitterentropy.h
index aa47286..4c5dbf2 100644
--- a/crypto/jitterentropy.h
+++ b/crypto/jitterentropy.h
@@ -22,11 +22,11 @@ extern struct rand_data *jent_entropy_collector_alloc(unsigned int osr,
extern void jent_entropy_collector_free(struct rand_data *entropy_collector);
#ifdef CONFIG_CRYPTO_JITTERENTROPY_TESTINTERFACE
-int jent_raw_hires_entropy_store(__u32 value);
+int jent_raw_hires_entropy_store(__u64 value);
void jent_testing_init(void);
void jent_testing_exit(void);
#else /* CONFIG_CRYPTO_JITTERENTROPY_TESTINTERFACE */
-static inline int jent_raw_hires_entropy_store(__u32 value) { return 0; }
+static inline int jent_raw_hires_entropy_store(__u64 value) { return 0; }
static inline void jent_testing_init(void) { }
static inline void jent_testing_exit(void) { }
#endif /* CONFIG_CRYPTO_JITTERENTROPY_TESTINTERFACE */
diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index d0d954f..7fc79e7 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -117,8 +117,10 @@ static int pcrypt_aead_encrypt(struct aead_request *req)
err = padata_do_parallel(ictx->psenc, padata, &ctx->cb_cpu);
if (!err)
return -EINPROGRESS;
- if (err == -EBUSY)
- return -EAGAIN;
+ if (err == -EBUSY) {
+ /* try non-parallel mode */
+ return crypto_aead_encrypt(creq);
+ }
return err;
}
@@ -166,8 +168,10 @@ static int pcrypt_aead_decrypt(struct aead_request *req)
err = padata_do_parallel(ictx->psdec, padata, &ctx->cb_cpu);
if (!err)
return -EINPROGRESS;
- if (err == -EBUSY)
- return -EAGAIN;
+ if (err == -EBUSY) {
+ /* try non-parallel mode */
+ return crypto_aead_decrypt(creq);
+ }
return err;
}
diff --git a/crypto/rsa-pkcs1pad.c b/crypto/rsa-pkcs1pad.c
index cd50119..50bdb18 100644
--- a/crypto/rsa-pkcs1pad.c
+++ b/crypto/rsa-pkcs1pad.c
@@ -16,101 +16,6 @@
#include <linux/random.h>
#include <linux/scatterlist.h>
-/*
- * Hash algorithm OIDs plus ASN.1 DER wrappings [RFC4880 sec 5.2.2].
- */
-static const u8 rsa_digest_info_md5[] = {
- 0x30, 0x20, 0x30, 0x0c, 0x06, 0x08,
- 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x05, /* OID */
- 0x05, 0x00, 0x04, 0x10
-};
-
-static const u8 rsa_digest_info_sha1[] = {
- 0x30, 0x21, 0x30, 0x09, 0x06, 0x05,
- 0x2b, 0x0e, 0x03, 0x02, 0x1a,
- 0x05, 0x00, 0x04, 0x14
-};
-
-static const u8 rsa_digest_info_rmd160[] = {
- 0x30, 0x21, 0x30, 0x09, 0x06, 0x05,
- 0x2b, 0x24, 0x03, 0x02, 0x01,
- 0x05, 0x00, 0x04, 0x14
-};
-
-static const u8 rsa_digest_info_sha224[] = {
- 0x30, 0x2d, 0x30, 0x0d, 0x06, 0x09,
- 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04,
- 0x05, 0x00, 0x04, 0x1c
-};
-
-static const u8 rsa_digest_info_sha256[] = {
- 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09,
- 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01,
- 0x05, 0x00, 0x04, 0x20
-};
-
-static const u8 rsa_digest_info_sha384[] = {
- 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09,
- 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02,
- 0x05, 0x00, 0x04, 0x30
-};
-
-static const u8 rsa_digest_info_sha512[] = {
- 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09,
- 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03,
- 0x05, 0x00, 0x04, 0x40
-};
-
-static const u8 rsa_digest_info_sha3_256[] = {
- 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09,
- 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x08,
- 0x05, 0x00, 0x04, 0x20
-};
-
-static const u8 rsa_digest_info_sha3_384[] = {
- 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09,
- 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x09,
- 0x05, 0x00, 0x04, 0x30
-};
-
-static const u8 rsa_digest_info_sha3_512[] = {
- 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09,
- 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x0A,
- 0x05, 0x00, 0x04, 0x40
-};
-
-static const struct rsa_asn1_template {
- const char *name;
- const u8 *data;
- size_t size;
-} rsa_asn1_templates[] = {
-#define _(X) { #X, rsa_digest_info_##X, sizeof(rsa_digest_info_##X) }
- _(md5),
- _(sha1),
- _(rmd160),
- _(sha256),
- _(sha384),
- _(sha512),
- _(sha224),
-#undef _
-#define _(X) { "sha3-" #X, rsa_digest_info_sha3_##X, sizeof(rsa_digest_info_sha3_##X) }
- _(256),
- _(384),
- _(512),
-#undef _
- { NULL }
-};
-
-static const struct rsa_asn1_template *rsa_lookup_asn1(const char *name)
-{
- const struct rsa_asn1_template *p;
-
- for (p = rsa_asn1_templates; p->name; p++)
- if (strcmp(name, p->name) == 0)
- return p;
- return NULL;
-}
-
struct pkcs1pad_ctx {
struct crypto_akcipher *child;
unsigned int key_size;
@@ -118,7 +23,6 @@ struct pkcs1pad_ctx {
struct pkcs1pad_inst_ctx {
struct crypto_akcipher_spawn spawn;
- const struct rsa_asn1_template *digest_info;
};
struct pkcs1pad_request {
@@ -131,42 +35,16 @@ static int pkcs1pad_set_pub_key(struct crypto_akcipher *tfm, const void *key,
unsigned int keylen)
{
struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
- int err;
- ctx->key_size = 0;
-
- err = crypto_akcipher_set_pub_key(ctx->child, key, keylen);
- if (err)
- return err;
-
- /* Find out new modulus size from rsa implementation */
- err = crypto_akcipher_maxsize(ctx->child);
- if (err > PAGE_SIZE)
- return -ENOTSUPP;
-
- ctx->key_size = err;
- return 0;
+ return rsa_set_key(ctx->child, &ctx->key_size, RSA_PUB, key, keylen);
}
static int pkcs1pad_set_priv_key(struct crypto_akcipher *tfm, const void *key,
unsigned int keylen)
{
struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
- int err;
- ctx->key_size = 0;
-
- err = crypto_akcipher_set_priv_key(ctx->child, key, keylen);
- if (err)
- return err;
-
- /* Find out new modulus size from rsa implementation */
- err = crypto_akcipher_maxsize(ctx->child);
- if (err > PAGE_SIZE)
- return -ENOTSUPP;
-
- ctx->key_size = err;
- return 0;
+ return rsa_set_key(ctx->child, &ctx->key_size, RSA_PRIV, key, keylen);
}
static unsigned int pkcs1pad_get_max_size(struct crypto_akcipher *tfm)
@@ -174,9 +52,9 @@ static unsigned int pkcs1pad_get_max_size(struct crypto_akcipher *tfm)
struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
/*
- * The maximum destination buffer size for the encrypt/sign operations
+ * The maximum destination buffer size for the encrypt operation
* will be the same as for RSA, even though it's smaller for
- * decrypt/verify.
+ * decrypt.
*/
return ctx->key_size;
@@ -194,7 +72,7 @@ static void pkcs1pad_sg_set_buf(struct scatterlist *sg, void *buf, size_t len,
sg_chain(sg, nsegs, next);
}
-static int pkcs1pad_encrypt_sign_complete(struct akcipher_request *req, int err)
+static int pkcs1pad_encrypt_complete(struct akcipher_request *req, int err)
{
struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
@@ -233,14 +111,14 @@ static int pkcs1pad_encrypt_sign_complete(struct akcipher_request *req, int err)
return err;
}
-static void pkcs1pad_encrypt_sign_complete_cb(void *data, int err)
+static void pkcs1pad_encrypt_complete_cb(void *data, int err)
{
struct akcipher_request *req = data;
if (err == -EINPROGRESS)
goto out;
- err = pkcs1pad_encrypt_sign_complete(req, err);
+ err = pkcs1pad_encrypt_complete(req, err);
out:
akcipher_request_complete(req, err);
@@ -281,7 +159,7 @@ static int pkcs1pad_encrypt(struct akcipher_request *req)
akcipher_request_set_tfm(&req_ctx->child_req, ctx->child);
akcipher_request_set_callback(&req_ctx->child_req, req->base.flags,
- pkcs1pad_encrypt_sign_complete_cb, req);
+ pkcs1pad_encrypt_complete_cb, req);
/* Reuse output buffer */
akcipher_request_set_crypt(&req_ctx->child_req, req_ctx->in_sg,
@@ -289,7 +167,7 @@ static int pkcs1pad_encrypt(struct akcipher_request *req)
err = crypto_akcipher_encrypt(&req_ctx->child_req);
if (err != -EINPROGRESS && err != -EBUSY)
- return pkcs1pad_encrypt_sign_complete(req, err);
+ return pkcs1pad_encrypt_complete(req, err);
return err;
}
@@ -394,195 +272,6 @@ static int pkcs1pad_decrypt(struct akcipher_request *req)
return err;
}
-static int pkcs1pad_sign(struct akcipher_request *req)
-{
- struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
- struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
- struct pkcs1pad_request *req_ctx = akcipher_request_ctx(req);
- struct akcipher_instance *inst = akcipher_alg_instance(tfm);
- struct pkcs1pad_inst_ctx *ictx = akcipher_instance_ctx(inst);
- const struct rsa_asn1_template *digest_info = ictx->digest_info;
- int err;
- unsigned int ps_end, digest_info_size = 0;
-
- if (!ctx->key_size)
- return -EINVAL;
-
- if (digest_info)
- digest_info_size = digest_info->size;
-
- if (req->src_len + digest_info_size > ctx->key_size - 11)
- return -EOVERFLOW;
-
- if (req->dst_len < ctx->key_size) {
- req->dst_len = ctx->key_size;
- return -EOVERFLOW;
- }
-
- req_ctx->in_buf = kmalloc(ctx->key_size - 1 - req->src_len,
- GFP_KERNEL);
- if (!req_ctx->in_buf)
- return -ENOMEM;
-
- ps_end = ctx->key_size - digest_info_size - req->src_len - 2;
- req_ctx->in_buf[0] = 0x01;
- memset(req_ctx->in_buf + 1, 0xff, ps_end - 1);
- req_ctx->in_buf[ps_end] = 0x00;
-
- if (digest_info)
- memcpy(req_ctx->in_buf + ps_end + 1, digest_info->data,
- digest_info->size);
-
- pkcs1pad_sg_set_buf(req_ctx->in_sg, req_ctx->in_buf,
- ctx->key_size - 1 - req->src_len, req->src);
-
- akcipher_request_set_tfm(&req_ctx->child_req, ctx->child);
- akcipher_request_set_callback(&req_ctx->child_req, req->base.flags,
- pkcs1pad_encrypt_sign_complete_cb, req);
-
- /* Reuse output buffer */
- akcipher_request_set_crypt(&req_ctx->child_req, req_ctx->in_sg,
- req->dst, ctx->key_size - 1, req->dst_len);
-
- err = crypto_akcipher_decrypt(&req_ctx->child_req);
- if (err != -EINPROGRESS && err != -EBUSY)
- return pkcs1pad_encrypt_sign_complete(req, err);
-
- return err;
-}
-
-static int pkcs1pad_verify_complete(struct akcipher_request *req, int err)
-{
- struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
- struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
- struct pkcs1pad_request *req_ctx = akcipher_request_ctx(req);
- struct akcipher_instance *inst = akcipher_alg_instance(tfm);
- struct pkcs1pad_inst_ctx *ictx = akcipher_instance_ctx(inst);
- const struct rsa_asn1_template *digest_info = ictx->digest_info;
- const unsigned int sig_size = req->src_len;
- const unsigned int digest_size = req->dst_len;
- unsigned int dst_len;
- unsigned int pos;
- u8 *out_buf;
-
- if (err)
- goto done;
-
- err = -EINVAL;
- dst_len = req_ctx->child_req.dst_len;
- if (dst_len < ctx->key_size - 1)
- goto done;
-
- out_buf = req_ctx->out_buf;
- if (dst_len == ctx->key_size) {
- if (out_buf[0] != 0x00)
- /* Decrypted value had no leading 0 byte */
- goto done;
-
- dst_len--;
- out_buf++;
- }
-
- err = -EBADMSG;
- if (out_buf[0] != 0x01)
- goto done;
-
- for (pos = 1; pos < dst_len; pos++)
- if (out_buf[pos] != 0xff)
- break;
-
- if (pos < 9 || pos == dst_len || out_buf[pos] != 0x00)
- goto done;
- pos++;
-
- if (digest_info) {
- if (digest_info->size > dst_len - pos)
- goto done;
- if (crypto_memneq(out_buf + pos, digest_info->data,
- digest_info->size))
- goto done;
-
- pos += digest_info->size;
- }
-
- err = 0;
-
- if (digest_size != dst_len - pos) {
- err = -EKEYREJECTED;
- req->dst_len = dst_len - pos;
- goto done;
- }
- /* Extract appended digest. */
- sg_pcopy_to_buffer(req->src,
- sg_nents_for_len(req->src, sig_size + digest_size),
- req_ctx->out_buf + ctx->key_size,
- digest_size, sig_size);
- /* Do the actual verification step. */
- if (memcmp(req_ctx->out_buf + ctx->key_size, out_buf + pos,
- digest_size) != 0)
- err = -EKEYREJECTED;
-done:
- kfree_sensitive(req_ctx->out_buf);
-
- return err;
-}
-
-static void pkcs1pad_verify_complete_cb(void *data, int err)
-{
- struct akcipher_request *req = data;
-
- if (err == -EINPROGRESS)
- goto out;
-
- err = pkcs1pad_verify_complete(req, err);
-
-out:
- akcipher_request_complete(req, err);
-}
-
-/*
- * The verify operation is here for completeness similar to the verification
- * defined in RFC2313 section 10.2 except that block type 0 is not accepted,
- * as in RFC2437. RFC2437 section 9.2 doesn't define any operation to
- * retrieve the DigestInfo from a signature, instead the user is expected
- * to call the sign operation to generate the expected signature and compare
- * signatures instead of the message-digests.
- */
-static int pkcs1pad_verify(struct akcipher_request *req)
-{
- struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
- struct pkcs1pad_ctx *ctx = akcipher_tfm_ctx(tfm);
- struct pkcs1pad_request *req_ctx = akcipher_request_ctx(req);
- const unsigned int sig_size = req->src_len;
- const unsigned int digest_size = req->dst_len;
- int err;
-
- if (WARN_ON(req->dst) || WARN_ON(!digest_size) ||
- !ctx->key_size || sig_size != ctx->key_size)
- return -EINVAL;
-
- req_ctx->out_buf = kmalloc(ctx->key_size + digest_size, GFP_KERNEL);
- if (!req_ctx->out_buf)
- return -ENOMEM;
-
- pkcs1pad_sg_set_buf(req_ctx->out_sg, req_ctx->out_buf,
- ctx->key_size, NULL);
-
- akcipher_request_set_tfm(&req_ctx->child_req, ctx->child);
- akcipher_request_set_callback(&req_ctx->child_req, req->base.flags,
- pkcs1pad_verify_complete_cb, req);
-
- /* Reuse input buffer, output to a new buffer */
- akcipher_request_set_crypt(&req_ctx->child_req, req->src,
- req_ctx->out_sg, sig_size, ctx->key_size);
-
- err = crypto_akcipher_encrypt(&req_ctx->child_req);
- if (err != -EINPROGRESS && err != -EBUSY)
- return pkcs1pad_verify_complete(req, err);
-
- return err;
-}
-
static int pkcs1pad_init_tfm(struct crypto_akcipher *tfm)
{
struct akcipher_instance *inst = akcipher_alg_instance(tfm);
@@ -624,7 +313,6 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb)
struct akcipher_instance *inst;
struct pkcs1pad_inst_ctx *ctx;
struct akcipher_alg *rsa_alg;
- const char *hash_name;
int err;
err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AKCIPHER, &mask);
@@ -650,36 +338,15 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb)
}
err = -ENAMETOOLONG;
- hash_name = crypto_attr_alg_name(tb[2]);
- if (IS_ERR(hash_name)) {
- if (snprintf(inst->alg.base.cra_name,
- CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)",
- rsa_alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME)
- goto err_free_inst;
+ if (snprintf(inst->alg.base.cra_name,
+ CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)",
+ rsa_alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME)
+ goto err_free_inst;
- if (snprintf(inst->alg.base.cra_driver_name,
- CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)",
- rsa_alg->base.cra_driver_name) >=
- CRYPTO_MAX_ALG_NAME)
- goto err_free_inst;
- } else {
- ctx->digest_info = rsa_lookup_asn1(hash_name);
- if (!ctx->digest_info) {
- err = -EINVAL;
- goto err_free_inst;
- }
-
- if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
- "pkcs1pad(%s,%s)", rsa_alg->base.cra_name,
- hash_name) >= CRYPTO_MAX_ALG_NAME)
- goto err_free_inst;
-
- if (snprintf(inst->alg.base.cra_driver_name,
- CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s,%s)",
- rsa_alg->base.cra_driver_name,
- hash_name) >= CRYPTO_MAX_ALG_NAME)
- goto err_free_inst;
- }
+ if (snprintf(inst->alg.base.cra_driver_name,
+ CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)",
+ rsa_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
+ goto err_free_inst;
inst->alg.base.cra_priority = rsa_alg->base.cra_priority;
inst->alg.base.cra_ctxsize = sizeof(struct pkcs1pad_ctx);
@@ -689,8 +356,6 @@ static int pkcs1pad_create(struct crypto_template *tmpl, struct rtattr **tb)
inst->alg.encrypt = pkcs1pad_encrypt;
inst->alg.decrypt = pkcs1pad_decrypt;
- inst->alg.sign = pkcs1pad_sign;
- inst->alg.verify = pkcs1pad_verify;
inst->alg.set_pub_key = pkcs1pad_set_pub_key;
inst->alg.set_priv_key = pkcs1pad_set_priv_key;
inst->alg.max_size = pkcs1pad_get_max_size;
diff --git a/crypto/rsa.c b/crypto/rsa.c
index 78b28d1..b7d2152 100644
--- a/crypto/rsa.c
+++ b/crypto/rsa.c
@@ -407,16 +407,25 @@ static int __init rsa_init(void)
return err;
err = crypto_register_template(&rsa_pkcs1pad_tmpl);
- if (err) {
- crypto_unregister_akcipher(&rsa);
- return err;
- }
+ if (err)
+ goto err_unregister_rsa;
+
+ err = crypto_register_template(&rsassa_pkcs1_tmpl);
+ if (err)
+ goto err_unregister_rsa_pkcs1pad;
return 0;
+
+err_unregister_rsa_pkcs1pad:
+ crypto_unregister_template(&rsa_pkcs1pad_tmpl);
+err_unregister_rsa:
+ crypto_unregister_akcipher(&rsa);
+ return err;
}
static void __exit rsa_exit(void)
{
+ crypto_unregister_template(&rsassa_pkcs1_tmpl);
crypto_unregister_template(&rsa_pkcs1pad_tmpl);
crypto_unregister_akcipher(&rsa);
}
diff --git a/crypto/rsassa-pkcs1.c b/crypto/rsassa-pkcs1.c
new file mode 100644
index 0000000..4d077fc9
--- /dev/null
+++ b/crypto/rsassa-pkcs1.c
@@ -0,0 +1,454 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RSA Signature Scheme with Appendix - PKCS #1 v1.5 (RFC 8017 sec 8.2)
+ *
+ * https://www.rfc-editor.org/rfc/rfc8017#section-8.2
+ *
+ * Copyright (c) 2015 - 2024 Intel Corporation
+ */
+
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <crypto/akcipher.h>
+#include <crypto/algapi.h>
+#include <crypto/hash.h>
+#include <crypto/sig.h>
+#include <crypto/internal/akcipher.h>
+#include <crypto/internal/rsa.h>
+#include <crypto/internal/sig.h>
+
+/*
+ * Full Hash Prefix for EMSA-PKCS1-v1_5 encoding method (RFC 9580 table 24)
+ *
+ * RSA keys are usually much larger than the hash of the message to be signed.
+ * The hash is therefore prepended by the Full Hash Prefix and a 0xff padding.
+ * The Full Hash Prefix is an ASN.1 SEQUENCE containing the hash algorithm OID.
+ *
+ * https://www.rfc-editor.org/rfc/rfc9580#table-24
+ */
+
+static const u8 hash_prefix_none[] = { };
+
+static const u8 hash_prefix_md5[] = {
+ 0x30, 0x20, 0x30, 0x0c, 0x06, 0x08, /* SEQUENCE (SEQUENCE (OID */
+ 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x05, /* <algorithm>, */
+ 0x05, 0x00, 0x04, 0x10 /* NULL), OCTET STRING <hash>) */
+};
+
+static const u8 hash_prefix_sha1[] = {
+ 0x30, 0x21, 0x30, 0x09, 0x06, 0x05,
+ 0x2b, 0x0e, 0x03, 0x02, 0x1a,
+ 0x05, 0x00, 0x04, 0x14
+};
+
+static const u8 hash_prefix_rmd160[] = {
+ 0x30, 0x21, 0x30, 0x09, 0x06, 0x05,
+ 0x2b, 0x24, 0x03, 0x02, 0x01,
+ 0x05, 0x00, 0x04, 0x14
+};
+
+static const u8 hash_prefix_sha224[] = {
+ 0x30, 0x2d, 0x30, 0x0d, 0x06, 0x09,
+ 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04,
+ 0x05, 0x00, 0x04, 0x1c
+};
+
+static const u8 hash_prefix_sha256[] = {
+ 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09,
+ 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01,
+ 0x05, 0x00, 0x04, 0x20
+};
+
+static const u8 hash_prefix_sha384[] = {
+ 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09,
+ 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02,
+ 0x05, 0x00, 0x04, 0x30
+};
+
+static const u8 hash_prefix_sha512[] = {
+ 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09,
+ 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03,
+ 0x05, 0x00, 0x04, 0x40
+};
+
+static const u8 hash_prefix_sha3_256[] = {
+ 0x30, 0x31, 0x30, 0x0d, 0x06, 0x09,
+ 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x08,
+ 0x05, 0x00, 0x04, 0x20
+};
+
+static const u8 hash_prefix_sha3_384[] = {
+ 0x30, 0x41, 0x30, 0x0d, 0x06, 0x09,
+ 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x09,
+ 0x05, 0x00, 0x04, 0x30
+};
+
+static const u8 hash_prefix_sha3_512[] = {
+ 0x30, 0x51, 0x30, 0x0d, 0x06, 0x09,
+ 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x0a,
+ 0x05, 0x00, 0x04, 0x40
+};
+
+static const struct hash_prefix {
+ const char *name;
+ const u8 *data;
+ size_t size;
+} hash_prefixes[] = {
+#define _(X) { #X, hash_prefix_##X, sizeof(hash_prefix_##X) }
+ _(none),
+ _(md5),
+ _(sha1),
+ _(rmd160),
+ _(sha256),
+ _(sha384),
+ _(sha512),
+ _(sha224),
+#undef _
+#define _(X) { "sha3-" #X, hash_prefix_sha3_##X, sizeof(hash_prefix_sha3_##X) }
+ _(256),
+ _(384),
+ _(512),
+#undef _
+ { NULL }
+};
+
+static const struct hash_prefix *rsassa_pkcs1_find_hash_prefix(const char *name)
+{
+ const struct hash_prefix *p;
+
+ for (p = hash_prefixes; p->name; p++)
+ if (strcmp(name, p->name) == 0)
+ return p;
+ return NULL;
+}
+
+static bool rsassa_pkcs1_invalid_hash_len(unsigned int len,
+ const struct hash_prefix *p)
+{
+ /*
+ * Legacy protocols such as TLS 1.1 or earlier and IKE version 1
+ * do not prepend a Full Hash Prefix to the hash. In that case,
+ * the size of the Full Hash Prefix is zero.
+ */
+ if (p->data == hash_prefix_none)
+ return false;
+
+ /*
+ * The final byte of the Full Hash Prefix encodes the hash length.
+ *
+ * This needs to be revisited should hash algorithms with more than
+ * 1016 bits (127 bytes * 8) ever be added. The length would then
+ * be encoded into more than one byte by ASN.1.
+ */
+ static_assert(HASH_MAX_DIGESTSIZE <= 127);
+
+ return len != p->data[p->size - 1];
+}
+
+struct rsassa_pkcs1_ctx {
+ struct crypto_akcipher *child;
+ unsigned int key_size;
+};
+
+struct rsassa_pkcs1_inst_ctx {
+ struct crypto_akcipher_spawn spawn;
+ const struct hash_prefix *hash_prefix;
+};
+
+static int rsassa_pkcs1_sign(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ void *dst, unsigned int dlen)
+{
+ struct sig_instance *inst = sig_alg_instance(tfm);
+ struct rsassa_pkcs1_inst_ctx *ictx = sig_instance_ctx(inst);
+ const struct hash_prefix *hash_prefix = ictx->hash_prefix;
+ struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+ unsigned int child_reqsize = crypto_akcipher_reqsize(ctx->child);
+ struct akcipher_request *child_req __free(kfree_sensitive) = NULL;
+ struct scatterlist in_sg[3], out_sg;
+ struct crypto_wait cwait;
+ unsigned int pad_len;
+ unsigned int ps_end;
+ unsigned int len;
+ u8 *in_buf;
+ int err;
+
+ if (!ctx->key_size)
+ return -EINVAL;
+
+ if (dlen < ctx->key_size)
+ return -EOVERFLOW;
+
+ if (rsassa_pkcs1_invalid_hash_len(slen, hash_prefix))
+ return -EINVAL;
+
+ if (slen + hash_prefix->size > ctx->key_size - 11)
+ return -EOVERFLOW;
+
+ pad_len = ctx->key_size - slen - hash_prefix->size - 1;
+
+ child_req = kmalloc(sizeof(*child_req) + child_reqsize + pad_len,
+ GFP_KERNEL);
+ if (!child_req)
+ return -ENOMEM;
+
+ /* RFC 8017 sec 8.2.1 step 1 - EMSA-PKCS1-v1_5 encoding generation */
+ in_buf = (u8 *)(child_req + 1) + child_reqsize;
+ ps_end = pad_len - 1;
+ in_buf[0] = 0x01;
+ memset(in_buf + 1, 0xff, ps_end - 1);
+ in_buf[ps_end] = 0x00;
+
+ /* RFC 8017 sec 8.2.1 step 2 - RSA signature */
+ crypto_init_wait(&cwait);
+ sg_init_table(in_sg, 3);
+ sg_set_buf(&in_sg[0], in_buf, pad_len);
+ sg_set_buf(&in_sg[1], hash_prefix->data, hash_prefix->size);
+ sg_set_buf(&in_sg[2], src, slen);
+ sg_init_one(&out_sg, dst, dlen);
+ akcipher_request_set_tfm(child_req, ctx->child);
+ akcipher_request_set_crypt(child_req, in_sg, &out_sg,
+ ctx->key_size - 1, dlen);
+ akcipher_request_set_callback(child_req, CRYPTO_TFM_REQ_MAY_SLEEP,
+ crypto_req_done, &cwait);
+
+ err = crypto_akcipher_decrypt(child_req);
+ err = crypto_wait_req(err, &cwait);
+ if (err)
+ return err;
+
+ len = child_req->dst_len;
+ pad_len = ctx->key_size - len;
+
+ /* Four billion to one */
+ if (unlikely(pad_len)) {
+ memmove(dst + pad_len, dst, len);
+ memset(dst, 0, pad_len);
+ }
+
+ return 0;
+}
+
+static int rsassa_pkcs1_verify(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ const void *digest, unsigned int dlen)
+{
+ struct sig_instance *inst = sig_alg_instance(tfm);
+ struct rsassa_pkcs1_inst_ctx *ictx = sig_instance_ctx(inst);
+ const struct hash_prefix *hash_prefix = ictx->hash_prefix;
+ struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+ unsigned int child_reqsize = crypto_akcipher_reqsize(ctx->child);
+ struct akcipher_request *child_req __free(kfree_sensitive) = NULL;
+ struct scatterlist in_sg, out_sg;
+ struct crypto_wait cwait;
+ unsigned int dst_len;
+ unsigned int pos;
+ u8 *out_buf;
+ int err;
+
+ /* RFC 8017 sec 8.2.2 step 1 - length checking */
+ if (!ctx->key_size ||
+ slen != ctx->key_size ||
+ rsassa_pkcs1_invalid_hash_len(dlen, hash_prefix))
+ return -EINVAL;
+
+ /* RFC 8017 sec 8.2.2 step 2 - RSA verification */
+ child_req = kmalloc(sizeof(*child_req) + child_reqsize + ctx->key_size,
+ GFP_KERNEL);
+ if (!child_req)
+ return -ENOMEM;
+
+ out_buf = (u8 *)(child_req + 1) + child_reqsize;
+
+ crypto_init_wait(&cwait);
+ sg_init_one(&in_sg, src, slen);
+ sg_init_one(&out_sg, out_buf, ctx->key_size);
+ akcipher_request_set_tfm(child_req, ctx->child);
+ akcipher_request_set_crypt(child_req, &in_sg, &out_sg,
+ slen, ctx->key_size);
+ akcipher_request_set_callback(child_req, CRYPTO_TFM_REQ_MAY_SLEEP,
+ crypto_req_done, &cwait);
+
+ err = crypto_akcipher_encrypt(child_req);
+ err = crypto_wait_req(err, &cwait);
+ if (err)
+ return err;
+
+ /* RFC 8017 sec 8.2.2 step 3 - EMSA-PKCS1-v1_5 encoding verification */
+ dst_len = child_req->dst_len;
+ if (dst_len < ctx->key_size - 1)
+ return -EINVAL;
+
+ if (dst_len == ctx->key_size) {
+ if (out_buf[0] != 0x00)
+ /* Encrypted value had no leading 0 byte */
+ return -EINVAL;
+
+ dst_len--;
+ out_buf++;
+ }
+
+ if (out_buf[0] != 0x01)
+ return -EBADMSG;
+
+ for (pos = 1; pos < dst_len; pos++)
+ if (out_buf[pos] != 0xff)
+ break;
+
+ if (pos < 9 || pos == dst_len || out_buf[pos] != 0x00)
+ return -EBADMSG;
+ pos++;
+
+ if (hash_prefix->size > dst_len - pos)
+ return -EBADMSG;
+ if (crypto_memneq(out_buf + pos, hash_prefix->data, hash_prefix->size))
+ return -EBADMSG;
+ pos += hash_prefix->size;
+
+ /* RFC 8017 sec 8.2.2 step 4 - comparison of digest with out_buf */
+ if (dlen != dst_len - pos)
+ return -EKEYREJECTED;
+ if (memcmp(digest, out_buf + pos, dlen) != 0)
+ return -EKEYREJECTED;
+
+ return 0;
+}
+
+static unsigned int rsassa_pkcs1_key_size(struct crypto_sig *tfm)
+{
+ struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return ctx->key_size;
+}
+
+static int rsassa_pkcs1_set_pub_key(struct crypto_sig *tfm,
+ const void *key, unsigned int keylen)
+{
+ struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return rsa_set_key(ctx->child, &ctx->key_size, RSA_PUB, key, keylen);
+}
+
+static int rsassa_pkcs1_set_priv_key(struct crypto_sig *tfm,
+ const void *key, unsigned int keylen)
+{
+ struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+
+ return rsa_set_key(ctx->child, &ctx->key_size, RSA_PRIV, key, keylen);
+}
+
+static int rsassa_pkcs1_init_tfm(struct crypto_sig *tfm)
+{
+ struct sig_instance *inst = sig_alg_instance(tfm);
+ struct rsassa_pkcs1_inst_ctx *ictx = sig_instance_ctx(inst);
+ struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+ struct crypto_akcipher *child_tfm;
+
+ child_tfm = crypto_spawn_akcipher(&ictx->spawn);
+ if (IS_ERR(child_tfm))
+ return PTR_ERR(child_tfm);
+
+ ctx->child = child_tfm;
+
+ return 0;
+}
+
+static void rsassa_pkcs1_exit_tfm(struct crypto_sig *tfm)
+{
+ struct rsassa_pkcs1_ctx *ctx = crypto_sig_ctx(tfm);
+
+ crypto_free_akcipher(ctx->child);
+}
+
+static void rsassa_pkcs1_free(struct sig_instance *inst)
+{
+ struct rsassa_pkcs1_inst_ctx *ctx = sig_instance_ctx(inst);
+ struct crypto_akcipher_spawn *spawn = &ctx->spawn;
+
+ crypto_drop_akcipher(spawn);
+ kfree(inst);
+}
+
+static int rsassa_pkcs1_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+ struct rsassa_pkcs1_inst_ctx *ctx;
+ struct akcipher_alg *rsa_alg;
+ struct sig_instance *inst;
+ const char *hash_name;
+ u32 mask;
+ int err;
+
+ err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SIG, &mask);
+ if (err)
+ return err;
+
+ inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
+ if (!inst)
+ return -ENOMEM;
+
+ ctx = sig_instance_ctx(inst);
+
+ err = crypto_grab_akcipher(&ctx->spawn, sig_crypto_instance(inst),
+ crypto_attr_alg_name(tb[1]), 0, mask);
+ if (err)
+ goto err_free_inst;
+
+ rsa_alg = crypto_spawn_akcipher_alg(&ctx->spawn);
+
+ if (strcmp(rsa_alg->base.cra_name, "rsa") != 0) {
+ err = -EINVAL;
+ goto err_free_inst;
+ }
+
+ hash_name = crypto_attr_alg_name(tb[2]);
+ if (IS_ERR(hash_name)) {
+ err = PTR_ERR(hash_name);
+ goto err_free_inst;
+ }
+
+ ctx->hash_prefix = rsassa_pkcs1_find_hash_prefix(hash_name);
+ if (!ctx->hash_prefix) {
+ err = -EINVAL;
+ goto err_free_inst;
+ }
+
+ err = -ENAMETOOLONG;
+ if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME,
+ "pkcs1(%s,%s)", rsa_alg->base.cra_name,
+ hash_name) >= CRYPTO_MAX_ALG_NAME)
+ goto err_free_inst;
+
+ if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
+ "pkcs1(%s,%s)", rsa_alg->base.cra_driver_name,
+ hash_name) >= CRYPTO_MAX_ALG_NAME)
+ goto err_free_inst;
+
+ inst->alg.base.cra_priority = rsa_alg->base.cra_priority;
+ inst->alg.base.cra_ctxsize = sizeof(struct rsassa_pkcs1_ctx);
+
+ inst->alg.init = rsassa_pkcs1_init_tfm;
+ inst->alg.exit = rsassa_pkcs1_exit_tfm;
+
+ inst->alg.sign = rsassa_pkcs1_sign;
+ inst->alg.verify = rsassa_pkcs1_verify;
+ inst->alg.key_size = rsassa_pkcs1_key_size;
+ inst->alg.set_pub_key = rsassa_pkcs1_set_pub_key;
+ inst->alg.set_priv_key = rsassa_pkcs1_set_priv_key;
+
+ inst->free = rsassa_pkcs1_free;
+
+ err = sig_register_instance(tmpl, inst);
+ if (err) {
+err_free_inst:
+ rsassa_pkcs1_free(inst);
+ }
+ return err;
+}
+
+struct crypto_template rsassa_pkcs1_tmpl = {
+ .name = "pkcs1",
+ .create = rsassa_pkcs1_create,
+ .module = THIS_MODULE,
+};
+
+MODULE_ALIAS_CRYPTO("pkcs1");
diff --git a/crypto/sig.c b/crypto/sig.c
index 7645bedf..5e1f1f7 100644
--- a/crypto/sig.c
+++ b/crypto/sig.c
@@ -5,12 +5,10 @@
* Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au>
*/
-#include <crypto/akcipher.h>
#include <crypto/internal/sig.h>
#include <linux/cryptouser.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/scatterlist.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <net/netlink.h>
@@ -19,16 +17,35 @@
#define CRYPTO_ALG_TYPE_SIG_MASK 0x0000000e
-static const struct crypto_type crypto_sig_type;
+static void crypto_sig_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct crypto_sig *sig = __crypto_sig_tfm(tfm);
+ struct sig_alg *alg = crypto_sig_alg(sig);
+
+ alg->exit(sig);
+}
static int crypto_sig_init_tfm(struct crypto_tfm *tfm)
{
- if (tfm->__crt_alg->cra_type != &crypto_sig_type)
- return crypto_init_akcipher_ops_sig(tfm);
+ struct crypto_sig *sig = __crypto_sig_tfm(tfm);
+ struct sig_alg *alg = crypto_sig_alg(sig);
+
+ if (alg->exit)
+ sig->base.exit = crypto_sig_exit_tfm;
+
+ if (alg->init)
+ return alg->init(sig);
return 0;
}
+static void crypto_sig_free_instance(struct crypto_instance *inst)
+{
+ struct sig_instance *sig = sig_instance(inst);
+
+ sig->free(sig);
+}
+
static void __maybe_unused crypto_sig_show(struct seq_file *m,
struct crypto_alg *alg)
{
@@ -38,16 +55,17 @@ static void __maybe_unused crypto_sig_show(struct seq_file *m,
static int __maybe_unused crypto_sig_report(struct sk_buff *skb,
struct crypto_alg *alg)
{
- struct crypto_report_akcipher rsig = {};
+ struct crypto_report_sig rsig = {};
strscpy(rsig.type, "sig", sizeof(rsig.type));
- return nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER, sizeof(rsig), &rsig);
+ return nla_put(skb, CRYPTOCFGA_REPORT_SIG, sizeof(rsig), &rsig);
}
static const struct crypto_type crypto_sig_type = {
.extsize = crypto_alg_extsize,
.init_tfm = crypto_sig_init_tfm,
+ .free = crypto_sig_free_instance,
#ifdef CONFIG_PROC_FS
.show = crypto_sig_show,
#endif
@@ -66,74 +84,95 @@ struct crypto_sig *crypto_alloc_sig(const char *alg_name, u32 type, u32 mask)
}
EXPORT_SYMBOL_GPL(crypto_alloc_sig);
-int crypto_sig_maxsize(struct crypto_sig *tfm)
+static int sig_default_sign(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ void *dst, unsigned int dlen)
{
- struct crypto_akcipher **ctx = crypto_sig_ctx(tfm);
-
- return crypto_akcipher_maxsize(*ctx);
+ return -ENOSYS;
}
-EXPORT_SYMBOL_GPL(crypto_sig_maxsize);
-int crypto_sig_sign(struct crypto_sig *tfm,
- const void *src, unsigned int slen,
- void *dst, unsigned int dlen)
+static int sig_default_verify(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ const void *dst, unsigned int dlen)
{
- struct crypto_akcipher **ctx = crypto_sig_ctx(tfm);
- struct crypto_akcipher_sync_data data = {
- .tfm = *ctx,
- .src = src,
- .dst = dst,
- .slen = slen,
- .dlen = dlen,
- };
-
- return crypto_akcipher_sync_prep(&data) ?:
- crypto_akcipher_sync_post(&data,
- crypto_akcipher_sign(data.req));
+ return -ENOSYS;
}
-EXPORT_SYMBOL_GPL(crypto_sig_sign);
-int crypto_sig_verify(struct crypto_sig *tfm,
- const void *src, unsigned int slen,
- const void *digest, unsigned int dlen)
+static int sig_default_set_key(struct crypto_sig *tfm,
+ const void *key, unsigned int keylen)
{
- struct crypto_akcipher **ctx = crypto_sig_ctx(tfm);
- struct crypto_akcipher_sync_data data = {
- .tfm = *ctx,
- .src = src,
- .slen = slen,
- .dlen = dlen,
- };
+ return -ENOSYS;
+}
+
+static int sig_prepare_alg(struct sig_alg *alg)
+{
+ struct crypto_alg *base = &alg->base;
+
+ if (!alg->sign)
+ alg->sign = sig_default_sign;
+ if (!alg->verify)
+ alg->verify = sig_default_verify;
+ if (!alg->set_priv_key)
+ alg->set_priv_key = sig_default_set_key;
+ if (!alg->set_pub_key)
+ return -EINVAL;
+ if (!alg->key_size)
+ return -EINVAL;
+ if (!alg->max_size)
+ alg->max_size = alg->key_size;
+ if (!alg->digest_size)
+ alg->digest_size = alg->key_size;
+
+ base->cra_type = &crypto_sig_type;
+ base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
+ base->cra_flags |= CRYPTO_ALG_TYPE_SIG;
+
+ return 0;
+}
+
+int crypto_register_sig(struct sig_alg *alg)
+{
+ struct crypto_alg *base = &alg->base;
int err;
- err = crypto_akcipher_sync_prep(&data);
+ err = sig_prepare_alg(alg);
if (err)
return err;
- memcpy(data.buf + slen, digest, dlen);
-
- return crypto_akcipher_sync_post(&data,
- crypto_akcipher_verify(data.req));
+ return crypto_register_alg(base);
}
-EXPORT_SYMBOL_GPL(crypto_sig_verify);
+EXPORT_SYMBOL_GPL(crypto_register_sig);
-int crypto_sig_set_pubkey(struct crypto_sig *tfm,
- const void *key, unsigned int keylen)
+void crypto_unregister_sig(struct sig_alg *alg)
{
- struct crypto_akcipher **ctx = crypto_sig_ctx(tfm);
-
- return crypto_akcipher_set_pub_key(*ctx, key, keylen);
+ crypto_unregister_alg(&alg->base);
}
-EXPORT_SYMBOL_GPL(crypto_sig_set_pubkey);
+EXPORT_SYMBOL_GPL(crypto_unregister_sig);
-int crypto_sig_set_privkey(struct crypto_sig *tfm,
- const void *key, unsigned int keylen)
+int sig_register_instance(struct crypto_template *tmpl,
+ struct sig_instance *inst)
{
- struct crypto_akcipher **ctx = crypto_sig_ctx(tfm);
+ int err;
- return crypto_akcipher_set_priv_key(*ctx, key, keylen);
+ if (WARN_ON(!inst->free))
+ return -EINVAL;
+
+ err = sig_prepare_alg(&inst->alg);
+ if (err)
+ return err;
+
+ return crypto_register_instance(tmpl, sig_crypto_instance(inst));
}
-EXPORT_SYMBOL_GPL(crypto_sig_set_privkey);
+EXPORT_SYMBOL_GPL(sig_register_instance);
+
+int crypto_grab_sig(struct crypto_sig_spawn *spawn,
+ struct crypto_instance *inst,
+ const char *name, u32 type, u32 mask)
+{
+ spawn->base.frontend = &crypto_sig_type;
+ return crypto_grab_spawn(&spawn->base, inst, name, type, mask);
+}
+EXPORT_SYMBOL_GPL(crypto_grab_sig);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Public Key Signature Algorithms");
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 2f5f6b5..3fc908b 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -23,7 +23,7 @@
#include <linux/fips.h>
#include <linux/module.h>
#include <linux/once.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/string.h>
@@ -33,6 +33,7 @@
#include <crypto/akcipher.h>
#include <crypto/kpp.h>
#include <crypto/acompress.h>
+#include <crypto/sig.h>
#include <crypto/internal/cipher.h>
#include <crypto/internal/simd.h>
@@ -131,6 +132,11 @@ struct akcipher_test_suite {
unsigned int count;
};
+struct sig_test_suite {
+ const struct sig_testvec *vecs;
+ unsigned int count;
+};
+
struct kpp_test_suite {
const struct kpp_testvec *vecs;
unsigned int count;
@@ -151,6 +157,7 @@ struct alg_test_desc {
struct cprng_test_suite cprng;
struct drbg_test_suite drbg;
struct akcipher_test_suite akcipher;
+ struct sig_test_suite sig;
struct kpp_test_suite kpp;
} suite;
};
@@ -4123,11 +4130,9 @@ static int test_akcipher_one(struct crypto_akcipher *tfm,
struct crypto_wait wait;
unsigned int out_len_max, out_len = 0;
int err = -ENOMEM;
- struct scatterlist src, dst, src_tab[3];
- const char *m, *c;
- unsigned int m_size, c_size;
- const char *op;
- u8 *key, *ptr;
+ struct scatterlist src, dst, src_tab[2];
+ const char *c;
+ unsigned int c_size;
if (testmgr_alloc_buf(xbuf))
return err;
@@ -4138,92 +4143,53 @@ static int test_akcipher_one(struct crypto_akcipher *tfm,
crypto_init_wait(&wait);
- key = kmalloc(vecs->key_len + sizeof(u32) * 2 + vecs->param_len,
- GFP_KERNEL);
- if (!key)
- goto free_req;
- memcpy(key, vecs->key, vecs->key_len);
- ptr = key + vecs->key_len;
- ptr = test_pack_u32(ptr, vecs->algo);
- ptr = test_pack_u32(ptr, vecs->param_len);
- memcpy(ptr, vecs->params, vecs->param_len);
-
if (vecs->public_key_vec)
- err = crypto_akcipher_set_pub_key(tfm, key, vecs->key_len);
+ err = crypto_akcipher_set_pub_key(tfm, vecs->key,
+ vecs->key_len);
else
- err = crypto_akcipher_set_priv_key(tfm, key, vecs->key_len);
+ err = crypto_akcipher_set_priv_key(tfm, vecs->key,
+ vecs->key_len);
if (err)
- goto free_key;
+ goto free_req;
- /*
- * First run test which do not require a private key, such as
- * encrypt or verify.
- */
+ /* First run encrypt test which does not require a private key */
err = -ENOMEM;
out_len_max = crypto_akcipher_maxsize(tfm);
outbuf_enc = kzalloc(out_len_max, GFP_KERNEL);
if (!outbuf_enc)
- goto free_key;
+ goto free_req;
- if (!vecs->siggen_sigver_test) {
- m = vecs->m;
- m_size = vecs->m_size;
- c = vecs->c;
- c_size = vecs->c_size;
- op = "encrypt";
- } else {
- /* Swap args so we could keep plaintext (digest)
- * in vecs->m, and cooked signature in vecs->c.
- */
- m = vecs->c; /* signature */
- m_size = vecs->c_size;
- c = vecs->m; /* digest */
- c_size = vecs->m_size;
- op = "verify";
- }
+ c = vecs->c;
+ c_size = vecs->c_size;
err = -E2BIG;
- if (WARN_ON(m_size > PAGE_SIZE))
+ if (WARN_ON(vecs->m_size > PAGE_SIZE))
goto free_all;
- memcpy(xbuf[0], m, m_size);
+ memcpy(xbuf[0], vecs->m, vecs->m_size);
- sg_init_table(src_tab, 3);
+ sg_init_table(src_tab, 2);
sg_set_buf(&src_tab[0], xbuf[0], 8);
- sg_set_buf(&src_tab[1], xbuf[0] + 8, m_size - 8);
- if (vecs->siggen_sigver_test) {
- if (WARN_ON(c_size > PAGE_SIZE))
- goto free_all;
- memcpy(xbuf[1], c, c_size);
- sg_set_buf(&src_tab[2], xbuf[1], c_size);
- akcipher_request_set_crypt(req, src_tab, NULL, m_size, c_size);
- } else {
- sg_init_one(&dst, outbuf_enc, out_len_max);
- akcipher_request_set_crypt(req, src_tab, &dst, m_size,
- out_len_max);
- }
+ sg_set_buf(&src_tab[1], xbuf[0] + 8, vecs->m_size - 8);
+ sg_init_one(&dst, outbuf_enc, out_len_max);
+ akcipher_request_set_crypt(req, src_tab, &dst, vecs->m_size,
+ out_len_max);
akcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &wait);
- err = crypto_wait_req(vecs->siggen_sigver_test ?
- /* Run asymmetric signature verification */
- crypto_akcipher_verify(req) :
- /* Run asymmetric encrypt */
- crypto_akcipher_encrypt(req), &wait);
+ err = crypto_wait_req(crypto_akcipher_encrypt(req), &wait);
if (err) {
- pr_err("alg: akcipher: %s test failed. err %d\n", op, err);
+ pr_err("alg: akcipher: encrypt test failed. err %d\n", err);
goto free_all;
}
- if (!vecs->siggen_sigver_test && c) {
+ if (c) {
if (req->dst_len != c_size) {
- pr_err("alg: akcipher: %s test failed. Invalid output len\n",
- op);
+ pr_err("alg: akcipher: encrypt test failed. Invalid output len\n");
err = -EINVAL;
goto free_all;
}
/* verify that encrypted message is equal to expected */
if (memcmp(c, outbuf_enc, c_size) != 0) {
- pr_err("alg: akcipher: %s test failed. Invalid output\n",
- op);
+ pr_err("alg: akcipher: encrypt test failed. Invalid output\n");
hexdump(outbuf_enc, c_size);
err = -EINVAL;
goto free_all;
@@ -4231,7 +4197,7 @@ static int test_akcipher_one(struct crypto_akcipher *tfm,
}
/*
- * Don't invoke (decrypt or sign) test which require a private key
+ * Don't invoke decrypt test which requires a private key
* for vectors with only a public key.
*/
if (vecs->public_key_vec) {
@@ -4244,13 +4210,12 @@ static int test_akcipher_one(struct crypto_akcipher *tfm,
goto free_all;
}
- if (!vecs->siggen_sigver_test && !c) {
+ if (!c) {
c = outbuf_enc;
c_size = req->dst_len;
}
err = -E2BIG;
- op = vecs->siggen_sigver_test ? "sign" : "decrypt";
if (WARN_ON(c_size > PAGE_SIZE))
goto free_all;
memcpy(xbuf[0], c, c_size);
@@ -4260,34 +4225,29 @@ static int test_akcipher_one(struct crypto_akcipher *tfm,
crypto_init_wait(&wait);
akcipher_request_set_crypt(req, &src, &dst, c_size, out_len_max);
- err = crypto_wait_req(vecs->siggen_sigver_test ?
- /* Run asymmetric signature generation */
- crypto_akcipher_sign(req) :
- /* Run asymmetric decrypt */
- crypto_akcipher_decrypt(req), &wait);
+ err = crypto_wait_req(crypto_akcipher_decrypt(req), &wait);
if (err) {
- pr_err("alg: akcipher: %s test failed. err %d\n", op, err);
+ pr_err("alg: akcipher: decrypt test failed. err %d\n", err);
goto free_all;
}
out_len = req->dst_len;
- if (out_len < m_size) {
- pr_err("alg: akcipher: %s test failed. Invalid output len %u\n",
- op, out_len);
+ if (out_len < vecs->m_size) {
+ pr_err("alg: akcipher: decrypt test failed. Invalid output len %u\n",
+ out_len);
err = -EINVAL;
goto free_all;
}
/* verify that decrypted message is equal to the original msg */
- if (memchr_inv(outbuf_dec, 0, out_len - m_size) ||
- memcmp(m, outbuf_dec + out_len - m_size, m_size)) {
- pr_err("alg: akcipher: %s test failed. Invalid output\n", op);
+ if (memchr_inv(outbuf_dec, 0, out_len - vecs->m_size) ||
+ memcmp(vecs->m, outbuf_dec + out_len - vecs->m_size,
+ vecs->m_size)) {
+ pr_err("alg: akcipher: decrypt test failed. Invalid output\n");
hexdump(outbuf_dec, out_len);
err = -EINVAL;
}
free_all:
kfree(outbuf_dec);
kfree(outbuf_enc);
-free_key:
- kfree(key);
free_req:
akcipher_request_free(req);
free_xbuf:
@@ -4337,6 +4297,113 @@ static int alg_test_akcipher(const struct alg_test_desc *desc,
return err;
}
+static int test_sig_one(struct crypto_sig *tfm, const struct sig_testvec *vecs)
+{
+ u8 *ptr, *key __free(kfree);
+ int err, sig_size;
+
+ key = kmalloc(vecs->key_len + 2 * sizeof(u32) + vecs->param_len,
+ GFP_KERNEL);
+ if (!key)
+ return -ENOMEM;
+
+ /* ecrdsa expects additional parameters appended to the key */
+ memcpy(key, vecs->key, vecs->key_len);
+ ptr = key + vecs->key_len;
+ ptr = test_pack_u32(ptr, vecs->algo);
+ ptr = test_pack_u32(ptr, vecs->param_len);
+ memcpy(ptr, vecs->params, vecs->param_len);
+
+ if (vecs->public_key_vec)
+ err = crypto_sig_set_pubkey(tfm, key, vecs->key_len);
+ else
+ err = crypto_sig_set_privkey(tfm, key, vecs->key_len);
+ if (err)
+ return err;
+
+ /*
+ * Run asymmetric signature verification first
+ * (which does not require a private key)
+ */
+ err = crypto_sig_verify(tfm, vecs->c, vecs->c_size,
+ vecs->m, vecs->m_size);
+ if (err) {
+ pr_err("alg: sig: verify test failed: err %d\n", err);
+ return err;
+ }
+
+ /*
+ * Don't invoke sign test (which requires a private key)
+ * for vectors with only a public key.
+ */
+ if (vecs->public_key_vec)
+ return 0;
+
+ sig_size = crypto_sig_keysize(tfm);
+ if (sig_size < vecs->c_size) {
+ pr_err("alg: sig: invalid maxsize %u\n", sig_size);
+ return -EINVAL;
+ }
+
+ u8 *sig __free(kfree) = kzalloc(sig_size, GFP_KERNEL);
+ if (!sig)
+ return -ENOMEM;
+
+ /* Run asymmetric signature generation */
+ err = crypto_sig_sign(tfm, vecs->m, vecs->m_size, sig, sig_size);
+ if (err) {
+ pr_err("alg: sig: sign test failed: err %d\n", err);
+ return err;
+ }
+
+ /* Verify that generated signature equals cooked signature */
+ if (memcmp(sig, vecs->c, vecs->c_size) ||
+ memchr_inv(sig + vecs->c_size, 0, sig_size - vecs->c_size)) {
+ pr_err("alg: sig: sign test failed: invalid output\n");
+ hexdump(sig, sig_size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int test_sig(struct crypto_sig *tfm, const char *alg,
+ const struct sig_testvec *vecs, unsigned int tcount)
+{
+ const char *algo = crypto_tfm_alg_driver_name(crypto_sig_tfm(tfm));
+ int ret, i;
+
+ for (i = 0; i < tcount; i++) {
+ ret = test_sig_one(tfm, vecs++);
+ if (ret) {
+ pr_err("alg: sig: test %d failed for %s: err %d\n",
+ i + 1, algo, ret);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+static int alg_test_sig(const struct alg_test_desc *desc, const char *driver,
+ u32 type, u32 mask)
+{
+ struct crypto_sig *tfm;
+ int err = 0;
+
+ tfm = crypto_alloc_sig(driver, type, mask);
+ if (IS_ERR(tfm)) {
+ pr_err("alg: sig: Failed to load tfm for %s: %ld\n",
+ driver, PTR_ERR(tfm));
+ return PTR_ERR(tfm);
+ }
+ if (desc->suite.sig.vecs)
+ err = test_sig(tfm, desc->alg, desc->suite.sig.vecs,
+ desc->suite.sig.count);
+
+ crypto_free_sig(tfm);
+ return err;
+}
+
static int alg_test_null(const struct alg_test_desc *desc,
const char *driver, u32 type, u32 mask)
{
@@ -5126,36 +5193,36 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}, {
.alg = "ecdsa-nist-p192",
- .test = alg_test_akcipher,
+ .test = alg_test_sig,
.suite = {
- .akcipher = __VECS(ecdsa_nist_p192_tv_template)
+ .sig = __VECS(ecdsa_nist_p192_tv_template)
}
}, {
.alg = "ecdsa-nist-p256",
- .test = alg_test_akcipher,
+ .test = alg_test_sig,
.fips_allowed = 1,
.suite = {
- .akcipher = __VECS(ecdsa_nist_p256_tv_template)
+ .sig = __VECS(ecdsa_nist_p256_tv_template)
}
}, {
.alg = "ecdsa-nist-p384",
- .test = alg_test_akcipher,
+ .test = alg_test_sig,
.fips_allowed = 1,
.suite = {
- .akcipher = __VECS(ecdsa_nist_p384_tv_template)
+ .sig = __VECS(ecdsa_nist_p384_tv_template)
}
}, {
.alg = "ecdsa-nist-p521",
- .test = alg_test_akcipher,
+ .test = alg_test_sig,
.fips_allowed = 1,
.suite = {
- .akcipher = __VECS(ecdsa_nist_p521_tv_template)
+ .sig = __VECS(ecdsa_nist_p521_tv_template)
}
}, {
.alg = "ecrdsa",
- .test = alg_test_akcipher,
+ .test = alg_test_sig,
.suite = {
- .akcipher = __VECS(ecrdsa_tv_template)
+ .sig = __VECS(ecrdsa_tv_template)
}
}, {
.alg = "essiv(authenc(hmac(sha256),cbc(aes)),sha256)",
@@ -5448,40 +5515,68 @@ static const struct alg_test_desc alg_test_descs[] = {
.hash = __VECS(nhpoly1305_tv_template)
}
}, {
+ .alg = "p1363(ecdsa-nist-p192)",
+ .test = alg_test_null,
+ }, {
+ .alg = "p1363(ecdsa-nist-p256)",
+ .test = alg_test_sig,
+ .fips_allowed = 1,
+ .suite = {
+ .sig = __VECS(p1363_ecdsa_nist_p256_tv_template)
+ }
+ }, {
+ .alg = "p1363(ecdsa-nist-p384)",
+ .test = alg_test_null,
+ .fips_allowed = 1,
+ }, {
+ .alg = "p1363(ecdsa-nist-p521)",
+ .test = alg_test_null,
+ .fips_allowed = 1,
+ }, {
.alg = "pcbc(fcrypt)",
.test = alg_test_skcipher,
.suite = {
.cipher = __VECS(fcrypt_pcbc_tv_template)
}
}, {
- .alg = "pkcs1pad(rsa,sha224)",
- .test = alg_test_null,
- .fips_allowed = 1,
- }, {
- .alg = "pkcs1pad(rsa,sha256)",
- .test = alg_test_akcipher,
- .fips_allowed = 1,
+ .alg = "pkcs1(rsa,none)",
+ .test = alg_test_sig,
.suite = {
- .akcipher = __VECS(pkcs1pad_rsa_tv_template)
+ .sig = __VECS(pkcs1_rsa_none_tv_template)
}
}, {
- .alg = "pkcs1pad(rsa,sha3-256)",
+ .alg = "pkcs1(rsa,sha224)",
.test = alg_test_null,
.fips_allowed = 1,
}, {
- .alg = "pkcs1pad(rsa,sha3-384)",
+ .alg = "pkcs1(rsa,sha256)",
+ .test = alg_test_sig,
+ .fips_allowed = 1,
+ .suite = {
+ .sig = __VECS(pkcs1_rsa_tv_template)
+ }
+ }, {
+ .alg = "pkcs1(rsa,sha3-256)",
.test = alg_test_null,
.fips_allowed = 1,
}, {
- .alg = "pkcs1pad(rsa,sha3-512)",
+ .alg = "pkcs1(rsa,sha3-384)",
.test = alg_test_null,
.fips_allowed = 1,
}, {
- .alg = "pkcs1pad(rsa,sha384)",
+ .alg = "pkcs1(rsa,sha3-512)",
.test = alg_test_null,
.fips_allowed = 1,
}, {
- .alg = "pkcs1pad(rsa,sha512)",
+ .alg = "pkcs1(rsa,sha384)",
+ .test = alg_test_null,
+ .fips_allowed = 1,
+ }, {
+ .alg = "pkcs1(rsa,sha512)",
+ .test = alg_test_null,
+ .fips_allowed = 1,
+ }, {
+ .alg = "pkcs1pad(rsa)",
.test = alg_test_null,
.fips_allowed = 1,
}, {
@@ -5679,6 +5774,33 @@ static const struct alg_test_desc alg_test_descs[] = {
.hash = __VECS(wp512_tv_template)
}
}, {
+ .alg = "x962(ecdsa-nist-p192)",
+ .test = alg_test_sig,
+ .suite = {
+ .sig = __VECS(x962_ecdsa_nist_p192_tv_template)
+ }
+ }, {
+ .alg = "x962(ecdsa-nist-p256)",
+ .test = alg_test_sig,
+ .fips_allowed = 1,
+ .suite = {
+ .sig = __VECS(x962_ecdsa_nist_p256_tv_template)
+ }
+ }, {
+ .alg = "x962(ecdsa-nist-p384)",
+ .test = alg_test_sig,
+ .fips_allowed = 1,
+ .suite = {
+ .sig = __VECS(x962_ecdsa_nist_p384_tv_template)
+ }
+ }, {
+ .alg = "x962(ecdsa-nist-p521)",
+ .test = alg_test_sig,
+ .fips_allowed = 1,
+ .suite = {
+ .sig = __VECS(x962_ecdsa_nist_p521_tv_template)
+ }
+ }, {
.alg = "xcbc(aes)",
.test = alg_test_hash,
.suite = {
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 9b38501..430d33d 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -21,6 +21,7 @@
#define _CRYPTO_TESTMGR_H
#include <linux/oid_registry.h>
+#include <crypto/internal/ecc.h>
#define MAX_IVLEN 32
@@ -150,6 +151,16 @@ struct drbg_testvec {
struct akcipher_testvec {
const unsigned char *key;
+ const unsigned char *m;
+ const unsigned char *c;
+ unsigned int key_len;
+ unsigned int m_size;
+ unsigned int c_size;
+ bool public_key_vec;
+};
+
+struct sig_testvec {
+ const unsigned char *key;
const unsigned char *params;
const unsigned char *m;
const unsigned char *c;
@@ -158,7 +169,6 @@ struct akcipher_testvec {
unsigned int m_size;
unsigned int c_size;
bool public_key_vec;
- bool siggen_sigver_test;
enum OID algo;
};
@@ -647,274 +657,357 @@ static const struct akcipher_testvec rsa_tv_template[] = {
}
};
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define be64_to_cpua(b1, b2, b3, b4, b5, b6, b7, b8) \
+ 0x##b1, 0x##b2, 0x##b3, 0x##b4, 0x##b5, 0x##b6, 0x##b7, 0x##b8
+#else
+#define be64_to_cpua(b1, b2, b3, b4, b5, b6, b7, b8) \
+ 0x##b8, 0x##b7, 0x##b6, 0x##b5, 0x##b4, 0x##b3, 0x##b2, 0x##b1
+#endif
+
/*
* ECDSA test vectors.
*/
-static const struct akcipher_testvec ecdsa_nist_p192_tv_template[] = {
+static const struct sig_testvec ecdsa_nist_p192_tv_template[] = {
{
- .key =
+ .key = /* secp192r1(sha1) */
"\x04\xf7\x46\xf8\x2f\x15\xf6\x22\x8e\xd7\x57\x4f\xcc\xe7\xbb\xc1"
"\xd4\x09\x73\xcf\xea\xd0\x15\x07\x3d\xa5\x8a\x8a\x95\x43\xe4\x68"
"\xea\xc6\x25\xc1\xc1\x01\x25\x4c\x7e\xc3\x3c\xa6\x04\x0a\xe7\x08"
"\x98",
.key_len = 49,
- .params =
- "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
- "\xce\x3d\x03\x01\x01",
- .param_len = 21,
.m =
"\xcd\xb9\xd2\x1c\xb7\x6f\xcd\x44\xb3\xfd\x63\xea\xa3\x66\x7f\xae"
"\x63\x85\xe7\x82",
.m_size = 20,
- .algo = OID_id_ecdsa_with_sha1,
- .c =
- "\x30\x35\x02\x19\x00\xba\xe5\x93\x83\x6e\xb6\x3b\x63\xa0\x27\x91"
- "\xc6\xf6\x7f\xc3\x09\xad\x59\xad\x88\x27\xd6\x92\x6b\x02\x18\x10"
- "\x68\x01\x9d\xba\xce\x83\x08\xef\x95\x52\x7b\xa0\x0f\xe4\x18\x86"
- "\x80\x6f\xa5\x79\x77\xda\xd0",
- .c_size = 55,
+ .c = (const unsigned char[]){
+ be64_to_cpua(ad, 59, ad, 88, 27, d6, 92, 6b),
+ be64_to_cpua(a0, 27, 91, c6, f6, 7f, c3, 09),
+ be64_to_cpua(ba, e5, 93, 83, 6e, b6, 3b, 63),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(86, 80, 6f, a5, 79, 77, da, d0),
+ be64_to_cpua(ef, 95, 52, 7b, a0, 0f, e4, 18),
+ be64_to_cpua(10, 68, 01, 9d, ba, ce, 83, 08),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
- .key =
+ .key = /* secp192r1(sha224) */
"\x04\xb6\x4b\xb1\xd1\xac\xba\x24\x8f\x65\xb2\x60\x00\x90\xbf\xbd"
"\x78\x05\x73\xe9\x79\x1d\x6f\x7c\x0b\xd2\xc3\x93\xa7\x28\xe1\x75"
"\xf7\xd5\x95\x1d\x28\x10\xc0\x75\x50\x5c\x1a\x4f\x3f\x8f\xa5\xee"
"\xa3",
.key_len = 49,
- .params =
- "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
- "\xce\x3d\x03\x01\x01",
- .param_len = 21,
.m =
"\x8d\xd6\xb8\x3e\xe5\xff\x23\xf6\x25\xa2\x43\x42\x74\x45\xa7\x40"
"\x3a\xff\x2f\xe1\xd3\xf6\x9f\xe8\x33\xcb\x12\x11",
.m_size = 28,
- .algo = OID_id_ecdsa_with_sha224,
- .c =
- "\x30\x34\x02\x18\x5a\x8b\x82\x69\x7e\x8a\x0a\x09\x14\xf8\x11\x2b"
- "\x55\xdc\xae\x37\x83\x7b\x12\xe6\xb6\x5b\xcb\xd4\x02\x18\x6a\x14"
- "\x4f\x53\x75\xc8\x02\x48\xeb\xc3\x92\x0f\x1e\x72\xee\xc4\xa3\xe3"
- "\x5c\x99\xdb\x92\x5b\x36",
- .c_size = 54,
+ .c = (const unsigned char[]){
+ be64_to_cpua(83, 7b, 12, e6, b6, 5b, cb, d4),
+ be64_to_cpua(14, f8, 11, 2b, 55, dc, ae, 37),
+ be64_to_cpua(5a, 8b, 82, 69, 7e, 8a, 0a, 09),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(a3, e3, 5c, 99, db, 92, 5b, 36),
+ be64_to_cpua(eb, c3, 92, 0f, 1e, 72, ee, c4),
+ be64_to_cpua(6a, 14, 4f, 53, 75, c8, 02, 48),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
- .key =
+ .key = /* secp192r1(sha256) */
"\x04\xe2\x51\x24\x9b\xf7\xb6\x32\x82\x39\x66\x3d\x5b\xec\x3b\xae"
"\x0c\xd5\xf2\x67\xd1\xc7\xe1\x02\xe4\xbf\x90\x62\xb8\x55\x75\x56"
"\x69\x20\x5e\xcb\x4e\xca\x33\xd6\xcb\x62\x6b\x94\xa9\xa2\xe9\x58"
"\x91",
.key_len = 49,
- .params =
- "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
- "\xce\x3d\x03\x01\x01",
- .param_len = 21,
.m =
"\x35\xec\xa1\xa0\x9e\x14\xde\x33\x03\xb6\xf6\xbd\x0c\x2f\xb2\xfd"
"\x1f\x27\x82\xa5\xd7\x70\x3f\xef\xa0\x82\x69\x8e\x73\x31\x8e\xd7",
.m_size = 32,
- .algo = OID_id_ecdsa_with_sha256,
- .c =
- "\x30\x35\x02\x18\x3f\x72\x3f\x1f\x42\xd2\x3f\x1d\x6b\x1a\x58\x56"
- "\xf1\x8f\xf7\xfd\x01\x48\xfb\x5f\x72\x2a\xd4\x8f\x02\x19\x00\xb3"
- "\x69\x43\xfd\x48\x19\x86\xcf\x32\xdd\x41\x74\x6a\x51\xc7\xd9\x7d"
- "\x3a\x97\xd9\xcd\x1a\x6a\x49",
- .c_size = 55,
+ .c = (const unsigned char[]){
+ be64_to_cpua(01, 48, fb, 5f, 72, 2a, d4, 8f),
+ be64_to_cpua(6b, 1a, 58, 56, f1, 8f, f7, fd),
+ be64_to_cpua(3f, 72, 3f, 1f, 42, d2, 3f, 1d),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(7d, 3a, 97, d9, cd, 1a, 6a, 49),
+ be64_to_cpua(32, dd, 41, 74, 6a, 51, c7, d9),
+ be64_to_cpua(b3, 69, 43, fd, 48, 19, 86, cf),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
- .key =
+ .key = /* secp192r1(sha384) */
"\x04\x5a\x13\xfe\x68\x86\x4d\xf4\x17\xc7\xa4\xe5\x8c\x65\x57\xb7"
"\x03\x73\x26\x57\xfb\xe5\x58\x40\xd8\xfd\x49\x05\xab\xf1\x66\x1f"
"\xe2\x9d\x93\x9e\xc2\x22\x5a\x8b\x4f\xf3\x77\x22\x59\x7e\xa6\x4e"
"\x8b",
.key_len = 49,
- .params =
- "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
- "\xce\x3d\x03\x01\x01",
- .param_len = 21,
.m =
"\x9d\x2e\x1a\x8f\xed\x6c\x4b\x61\xae\xac\xd5\x19\x79\xce\x67\xf9"
"\xa0\x34\xeb\xb0\x81\xf9\xd9\xdc\x6e\xb3\x5c\xa8\x69\xfc\x8a\x61"
"\x39\x81\xfb\xfd\x5c\x30\x6b\xa8\xee\xed\x89\xaf\xa3\x05\xe4\x78",
.m_size = 48,
- .algo = OID_id_ecdsa_with_sha384,
- .c =
- "\x30\x35\x02\x19\x00\xf0\xa3\x38\xce\x2b\xf8\x9d\x1a\xcf\x7f\x34"
- "\xb4\xb4\xe5\xc5\x00\xdd\x15\xbb\xd6\x8c\xa7\x03\x78\x02\x18\x64"
- "\xbc\x5a\x1f\x82\x96\x61\xd7\xd1\x01\x77\x44\x5d\x53\xa4\x7c\x93"
- "\x12\x3b\x3b\x28\xfb\x6d\xe1",
- .c_size = 55,
+ .c = (const unsigned char[]){
+ be64_to_cpua(dd, 15, bb, d6, 8c, a7, 03, 78),
+ be64_to_cpua(cf, 7f, 34, b4, b4, e5, c5, 00),
+ be64_to_cpua(f0, a3, 38, ce, 2b, f8, 9d, 1a),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(93, 12, 3b, 3b, 28, fb, 6d, e1),
+ be64_to_cpua(d1, 01, 77, 44, 5d, 53, a4, 7c),
+ be64_to_cpua(64, bc, 5a, 1f, 82, 96, 61, d7),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
- .key =
+ .key = /* secp192r1(sha512) */
"\x04\xd5\xf2\x6e\xc3\x94\x5c\x52\xbc\xdf\x86\x6c\x14\xd1\xca\xea"
"\xcc\x72\x3a\x8a\xf6\x7a\x3a\x56\x36\x3b\xca\xc6\x94\x0e\x17\x1d"
"\x9e\xa0\x58\x28\xf9\x4b\xe6\xd1\xa5\x44\x91\x35\x0d\xe7\xf5\x11"
"\x57",
.key_len = 49,
- .params =
- "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
- "\xce\x3d\x03\x01\x01",
- .param_len = 21,
.m =
"\xd5\x4b\xe9\x36\xda\xd8\x6e\xc0\x50\x03\xbe\x00\x43\xff\xf0\x23"
"\xac\xa2\x42\xe7\x37\x77\x79\x52\x8f\x3e\xc0\x16\xc1\xfc\x8c\x67"
"\x16\xbc\x8a\x5d\x3b\xd3\x13\xbb\xb6\xc0\x26\x1b\xeb\x33\xcc\x70"
"\x4a\xf2\x11\x37\xe8\x1b\xba\x55\xac\x69\xe1\x74\x62\x7c\x6e\xb5",
.m_size = 64,
- .algo = OID_id_ecdsa_with_sha512,
- .c =
- "\x30\x35\x02\x19\x00\x88\x5b\x8f\x59\x43\xbf\xcf\xc6\xdd\x3f\x07"
- "\x87\x12\xa0\xd4\xac\x2b\x11\x2d\x1c\xb6\x06\xc9\x6c\x02\x18\x73"
- "\xb4\x22\x9a\x98\x73\x3c\x83\xa9\x14\x2a\x5e\xf5\xe5\xfb\x72\x28"
- "\x6a\xdf\x97\xfd\x82\x76\x24",
- .c_size = 55,
+ .c = (const unsigned char[]){
+ be64_to_cpua(2b, 11, 2d, 1c, b6, 06, c9, 6c),
+ be64_to_cpua(dd, 3f, 07, 87, 12, a0, d4, ac),
+ be64_to_cpua(88, 5b, 8f, 59, 43, bf, cf, c6),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(28, 6a, df, 97, fd, 82, 76, 24),
+ be64_to_cpua(a9, 14, 2a, 5e, f5, e5, fb, 72),
+ be64_to_cpua(73, b4, 22, 9a, 98, 73, 3c, 83),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
},
};
-static const struct akcipher_testvec ecdsa_nist_p256_tv_template[] = {
+static const struct sig_testvec ecdsa_nist_p256_tv_template[] = {
{
- .key =
+ .key = /* secp256r1(sha1) */
"\x04\xb9\x7b\xbb\xd7\x17\x64\xd2\x7e\xfc\x81\x5d\x87\x06\x83\x41"
"\x22\xd6\x9a\xaa\x87\x17\xec\x4f\x63\x55\x2f\x94\xba\xdd\x83\xe9"
"\x34\x4b\xf3\xe9\x91\x13\x50\xb6\xcb\xca\x62\x08\xe7\x3b\x09\xdc"
"\xc3\x63\x4b\x2d\xb9\x73\x53\xe4\x45\xe6\x7c\xad\xe7\x6b\xb0\xe8"
"\xaf",
.key_len = 65,
- .params =
- "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
- "\xce\x3d\x03\x01\x07",
- .param_len = 21,
.m =
"\xc2\x2b\x5f\x91\x78\x34\x26\x09\x42\x8d\x6f\x51\xb2\xc5\xaf\x4c"
"\x0b\xde\x6a\x42",
.m_size = 20,
- .algo = OID_id_ecdsa_with_sha1,
- .c =
- "\x30\x46\x02\x21\x00\xf9\x25\xce\x9f\x3a\xa6\x35\x81\xcf\xd4\xe7"
- "\xb7\xf0\x82\x56\x41\xf7\xd4\xad\x8d\x94\x5a\x69\x89\xee\xca\x6a"
- "\x52\x0e\x48\x4d\xcc\x02\x21\x00\xd7\xe4\xef\x52\x66\xd3\x5b\x9d"
- "\x8a\xfa\x54\x93\x29\xa7\x70\x86\xf1\x03\x03\xf3\x3b\xe2\x73\xf7"
- "\xfb\x9d\x8b\xde\xd4\x8d\x6f\xad",
- .c_size = 72,
+ .c = (const unsigned char[]){
+ be64_to_cpua(ee, ca, 6a, 52, 0e, 48, 4d, cc),
+ be64_to_cpua(f7, d4, ad, 8d, 94, 5a, 69, 89),
+ be64_to_cpua(cf, d4, e7, b7, f0, 82, 56, 41),
+ be64_to_cpua(f9, 25, ce, 9f, 3a, a6, 35, 81),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(fb, 9d, 8b, de, d4, 8d, 6f, ad),
+ be64_to_cpua(f1, 03, 03, f3, 3b, e2, 73, f7),
+ be64_to_cpua(8a, fa, 54, 93, 29, a7, 70, 86),
+ be64_to_cpua(d7, e4, ef, 52, 66, d3, 5b, 9d),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
- .key =
+ .key = /* secp256r1(sha224) */
"\x04\x8b\x6d\xc0\x33\x8e\x2d\x8b\x67\xf5\xeb\xc4\x7f\xa0\xf5\xd9"
"\x7b\x03\xa5\x78\x9a\xb5\xea\x14\xe4\x23\xd0\xaf\xd7\x0e\x2e\xa0"
"\xc9\x8b\xdb\x95\xf8\xb3\xaf\xac\x00\x2c\x2c\x1f\x7a\xfd\x95\x88"
"\x43\x13\xbf\xf3\x1c\x05\x1a\x14\x18\x09\x3f\xd6\x28\x3e\xc5\xa0"
"\xd4",
.key_len = 65,
- .params =
- "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
- "\xce\x3d\x03\x01\x07",
- .param_len = 21,
.m =
"\x1a\x15\xbc\xa3\xe4\xed\x3a\xb8\x23\x67\xc6\xc4\x34\xf8\x6c\x41"
"\x04\x0b\xda\xc5\x77\xfa\x1c\x2d\xe6\x2c\x3b\xe0",
.m_size = 28,
- .algo = OID_id_ecdsa_with_sha224,
- .c =
- "\x30\x44\x02\x20\x20\x43\xfa\xc0\x9f\x9d\x7b\xe7\xae\xce\x77\x59"
- "\x1a\xdb\x59\xd5\x34\x62\x79\xcb\x6a\x91\x67\x2e\x7d\x25\xd8\x25"
- "\xf5\x81\xd2\x1e\x02\x20\x5f\xf8\x74\xf8\x57\xd0\x5e\x54\x76\x20"
- "\x4a\x77\x22\xec\xc8\x66\xbf\x50\x05\x58\x39\x0e\x26\x92\xce\xd5"
- "\x2e\x8b\xde\x5a\x04\x0e",
- .c_size = 70,
+ .c = (const unsigned char[]){
+ be64_to_cpua(7d, 25, d8, 25, f5, 81, d2, 1e),
+ be64_to_cpua(34, 62, 79, cb, 6a, 91, 67, 2e),
+ be64_to_cpua(ae, ce, 77, 59, 1a, db, 59, d5),
+ be64_to_cpua(20, 43, fa, c0, 9f, 9d, 7b, e7),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(ce, d5, 2e, 8b, de, 5a, 04, 0e),
+ be64_to_cpua(bf, 50, 05, 58, 39, 0e, 26, 92),
+ be64_to_cpua(76, 20, 4a, 77, 22, ec, c8, 66),
+ be64_to_cpua(5f, f8, 74, f8, 57, d0, 5e, 54),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
- .key =
+ .key = /* secp256r1(sha256) */
"\x04\xf1\xea\xc4\x53\xf3\xb9\x0e\x9f\x7e\xad\xe3\xea\xd7\x0e\x0f"
"\xd6\x98\x9a\xca\x92\x4d\x0a\x80\xdb\x2d\x45\xc7\xec\x4b\x97\x00"
"\x2f\xe9\x42\x6c\x29\xdc\x55\x0e\x0b\x53\x12\x9b\x2b\xad\x2c\xe9"
"\x80\xe6\xc5\x43\xc2\x1d\x5e\xbb\x65\x21\x50\xb6\x37\xb0\x03\x8e"
"\xb8",
.key_len = 65,
- .params =
- "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
- "\xce\x3d\x03\x01\x07",
- .param_len = 21,
.m =
"\x8f\x43\x43\x46\x64\x8f\x6b\x96\xdf\x89\xdd\xa9\x01\xc5\x17\x6b"
"\x10\xa6\xd8\x39\x61\xdd\x3c\x1a\xc8\x8b\x59\xb2\xdc\x32\x7a\xa4",
.m_size = 32,
- .algo = OID_id_ecdsa_with_sha256,
- .c =
- "\x30\x45\x02\x20\x08\x31\xfa\x74\x0d\x1d\x21\x5d\x09\xdc\x29\x63"
- "\xa8\x1a\xad\xfc\xac\x44\xc3\xe8\x24\x11\x2d\xa4\x91\xdc\x02\x67"
- "\xdc\x0c\xd0\x82\x02\x21\x00\xbd\xff\xce\xee\x42\xc3\x97\xff\xf9"
- "\xa9\x81\xac\x4a\x50\xd0\x91\x0a\x6e\x1b\xc4\xaf\xe1\x83\xc3\x4f"
- "\x2a\x65\x35\x23\xe3\x1d\xfa",
- .c_size = 71,
+ .c = (const unsigned char[]){
+ be64_to_cpua(91, dc, 02, 67, dc, 0c, d0, 82),
+ be64_to_cpua(ac, 44, c3, e8, 24, 11, 2d, a4),
+ be64_to_cpua(09, dc, 29, 63, a8, 1a, ad, fc),
+ be64_to_cpua(08, 31, fa, 74, 0d, 1d, 21, 5d),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(4f, 2a, 65, 35, 23, e3, 1d, fa),
+ be64_to_cpua(0a, 6e, 1b, c4, af, e1, 83, c3),
+ be64_to_cpua(f9, a9, 81, ac, 4a, 50, d0, 91),
+ be64_to_cpua(bd, ff, ce, ee, 42, c3, 97, ff),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
- .key =
+ .key = /* secp256r1(sha384) */
"\x04\xc5\xc6\xea\x60\xc9\xce\xad\x02\x8d\xf5\x3e\x24\xe3\x52\x1d"
"\x28\x47\x3b\xc3\x6b\xa4\x99\x35\x99\x11\x88\x88\xc8\xf4\xee\x7e"
"\x8c\x33\x8f\x41\x03\x24\x46\x2b\x1a\x82\xf9\x9f\xe1\x97\x1b\x00"
"\xda\x3b\x24\x41\xf7\x66\x33\x58\x3d\x3a\x81\xad\xcf\x16\xe9\xe2"
"\x7c",
.key_len = 65,
- .params =
- "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
- "\xce\x3d\x03\x01\x07",
- .param_len = 21,
.m =
"\x3e\x78\x70\xfb\xcd\x66\xba\x91\xa1\x79\xff\x1e\x1c\x6b\x78\xe6"
"\xc0\x81\x3a\x65\x97\x14\x84\x36\x14\x1a\x9a\xb7\xc5\xab\x84\x94"
"\x5e\xbb\x1b\x34\x71\xcb\x41\xe1\xf6\xfc\x92\x7b\x34\xbb\x86\xbb",
.m_size = 48,
- .algo = OID_id_ecdsa_with_sha384,
- .c =
- "\x30\x46\x02\x21\x00\x8e\xf3\x6f\xdc\xf8\x69\xa6\x2e\xd0\x2e\x95"
- "\x54\xd1\x95\x64\x93\x08\xb2\x6b\x24\x94\x48\x46\x5e\xf2\xe4\x6c"
- "\xc7\x94\xb1\xd5\xfe\x02\x21\x00\xeb\xa7\x80\x26\xdc\xf9\x3a\x44"
- "\x19\xfb\x5f\x92\xf4\xc9\x23\x37\x69\xf4\x3b\x4f\x47\xcf\x9b\x16"
- "\xc0\x60\x11\x92\xdc\x17\x89\x12",
- .c_size = 72,
+ .c = (const unsigned char[]){
+ be64_to_cpua(f2, e4, 6c, c7, 94, b1, d5, fe),
+ be64_to_cpua(08, b2, 6b, 24, 94, 48, 46, 5e),
+ be64_to_cpua(d0, 2e, 95, 54, d1, 95, 64, 93),
+ be64_to_cpua(8e, f3, 6f, dc, f8, 69, a6, 2e),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(c0, 60, 11, 92, dc, 17, 89, 12),
+ be64_to_cpua(69, f4, 3b, 4f, 47, cf, 9b, 16),
+ be64_to_cpua(19, fb, 5f, 92, f4, c9, 23, 37),
+ be64_to_cpua(eb, a7, 80, 26, dc, f9, 3a, 44),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
- .key =
+ .key = /* secp256r1(sha512) */
"\x04\xd7\x27\x46\x49\xf6\x26\x85\x12\x40\x76\x8e\xe2\xe6\x2a\x7a"
"\x83\xb1\x4e\x7a\xeb\x3b\x5c\x67\x4a\xb5\xa4\x92\x8c\x69\xff\x38"
"\xee\xd9\x4e\x13\x29\x59\xad\xde\x6b\xbb\x45\x31\xee\xfd\xd1\x1b"
"\x64\xd3\xb5\xfc\xaf\x9b\x4b\x88\x3b\x0e\xb7\xd6\xdf\xf1\xd5\x92"
"\xbf",
.key_len = 65,
- .params =
- "\x30\x13\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x08\x2a\x86\x48"
- "\xce\x3d\x03\x01\x07",
- .param_len = 21,
.m =
"\x57\xb7\x9e\xe9\x05\x0a\x8c\x1b\xc9\x13\xe5\x4a\x24\xc7\xe2\xe9"
"\x43\xc3\xd1\x76\x62\xf4\x98\x1a\x9c\x13\xb0\x20\x1b\xe5\x39\xca"
"\x4f\xd9\x85\x34\x95\xa2\x31\xbc\xbb\xde\xdd\x76\xbb\x61\xe3\xcf"
"\x9d\xc0\x49\x7a\xf3\x7a\xc4\x7d\xa8\x04\x4b\x8d\xb4\x4d\x5b\xd6",
.m_size = 64,
- .algo = OID_id_ecdsa_with_sha512,
- .c =
- "\x30\x45\x02\x21\x00\xb8\x6d\x87\x81\x43\xdf\xfb\x9f\x40\xea\x44"
- "\x81\x00\x4e\x29\x08\xed\x8c\x73\x30\x6c\x22\xb3\x97\x76\xf6\x04"
- "\x99\x09\x37\x4d\xfa\x02\x20\x1e\xb9\x75\x31\xf6\x04\xa5\x4d\xf8"
- "\x00\xdd\xab\xd4\xc0\x2b\xe6\x5c\xad\xc3\x78\x1c\xc2\xc1\x19\x76"
- "\x31\x79\x4a\xe9\x81\x6a\xee",
- .c_size = 71,
+ .c = (const unsigned char[]){
+ be64_to_cpua(76, f6, 04, 99, 09, 37, 4d, fa),
+ be64_to_cpua(ed, 8c, 73, 30, 6c, 22, b3, 97),
+ be64_to_cpua(40, ea, 44, 81, 00, 4e, 29, 08),
+ be64_to_cpua(b8, 6d, 87, 81, 43, df, fb, 9f),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(76, 31, 79, 4a, e9, 81, 6a, ee),
+ be64_to_cpua(5c, ad, c3, 78, 1c, c2, c1, 19),
+ be64_to_cpua(f8, 00, dd, ab, d4, c0, 2b, e6),
+ be64_to_cpua(1e, b9, 75, 31, f6, 04, a5, 4d),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
},
};
-static const struct akcipher_testvec ecdsa_nist_p384_tv_template[] = {
+static const struct sig_testvec ecdsa_nist_p384_tv_template[] = {
{
.key = /* secp384r1(sha1) */
"\x04\x89\x25\xf3\x97\x88\xcb\xb0\x78\xc5\x72\x9a\x14\x6e\x7a\xb1"
@@ -925,26 +1018,31 @@ static const struct akcipher_testvec ecdsa_nist_p384_tv_template[] = {
"\x0b\x25\xd6\x80\x5c\x3b\xe6\x1a\x98\x48\x91\x45\x7a\x73\xb0\xc3"
"\xf1",
.key_len = 97,
- .params =
- "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
- "\x00\x22",
- .param_len = 18,
.m =
"\x12\x55\x28\xf0\x77\xd5\xb6\x21\x71\x32\x48\xcd\x28\xa8\x25\x22"
"\x3a\x69\xc1\x93",
.m_size = 20,
- .algo = OID_id_ecdsa_with_sha1,
- .c =
- "\x30\x66\x02\x31\x00\xf5\x0f\x24\x4c\x07\x93\x6f\x21\x57\x55\x07"
- "\x20\x43\x30\xde\xa0\x8d\x26\x8e\xae\x63\x3f\xbc\x20\x3a\xc6\xf1"
- "\x32\x3c\xce\x70\x2b\x78\xf1\x4c\x26\xe6\x5b\x86\xcf\xec\x7c\x7e"
- "\xd0\x87\xd7\xd7\x6e\x02\x31\x00\xcd\xbb\x7e\x81\x5d\x8f\x63\xc0"
- "\x5f\x63\xb1\xbe\x5e\x4c\x0e\xa1\xdf\x28\x8c\x1b\xfa\xf9\x95\x88"
- "\x74\xa0\x0f\xbf\xaf\xc3\x36\x76\x4a\xa1\x59\xf1\x1c\xa4\x58\x26"
- "\x79\x12\x2a\xb7\xc5\x15\x92\xc5",
- .c_size = 104,
+ .c = (const unsigned char[]){
+ be64_to_cpua(ec, 7c, 7e, d0, 87, d7, d7, 6e),
+ be64_to_cpua(78, f1, 4c, 26, e6, 5b, 86, cf),
+ be64_to_cpua(3a, c6, f1, 32, 3c, ce, 70, 2b),
+ be64_to_cpua(8d, 26, 8e, ae, 63, 3f, bc, 20),
+ be64_to_cpua(57, 55, 07, 20, 43, 30, de, a0),
+ be64_to_cpua(f5, 0f, 24, 4c, 07, 93, 6f, 21),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(79, 12, 2a, b7, c5, 15, 92, c5),
+ be64_to_cpua(4a, a1, 59, f1, 1c, a4, 58, 26),
+ be64_to_cpua(74, a0, 0f, bf, af, c3, 36, 76),
+ be64_to_cpua(df, 28, 8c, 1b, fa, f9, 95, 88),
+ be64_to_cpua(5f, 63, b1, be, 5e, 4c, 0e, a1),
+ be64_to_cpua(cd, bb, 7e, 81, 5d, 8f, 63, c0),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
.key = /* secp384r1(sha224) */
"\x04\x69\x6c\xcf\x62\xee\xd0\x0d\xe5\xb5\x2f\x70\x54\xcf\x26\xa0"
@@ -955,26 +1053,31 @@ static const struct akcipher_testvec ecdsa_nist_p384_tv_template[] = {
"\x6b\x93\x99\x6c\x66\x4c\x42\x3f\x65\x60\x6c\x1c\x0b\x93\x9b\x9d"
"\xe0",
.key_len = 97,
- .params =
- "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
- "\x00\x22",
- .param_len = 18,
.m =
"\x12\x80\xb6\xeb\x25\xe2\x3d\xf0\x21\x32\x96\x17\x3a\x38\x39\xfd"
"\x1f\x05\x34\x7b\xb8\xf9\x71\x66\x03\x4f\xd5\xe5",
.m_size = 28,
- .algo = OID_id_ecdsa_with_sha224,
- .c =
- "\x30\x66\x02\x31\x00\x8a\x51\x84\xce\x13\x1e\xd2\xdc\xec\xcb\xe4"
- "\x89\x47\xb2\xf7\xbc\x97\xf1\xc8\x72\x26\xcf\x5a\x5e\xc5\xda\xb4"
- "\xe3\x93\x07\xe0\x99\xc9\x9c\x11\xb8\x10\x01\xc5\x41\x3f\xdd\x15"
- "\x1b\x68\x2b\x9d\x8b\x02\x31\x00\x8b\x03\x2c\xfc\x1f\xd1\xa9\xa4"
- "\x4b\x00\x08\x31\x6c\xf5\xd5\xf6\xdf\xd8\x68\xa2\x64\x42\x65\xf3"
- "\x4d\xd0\xc6\x6e\xb0\xe9\xfc\x14\x9f\x19\xd0\x42\x8b\x93\xc2\x11"
- "\x88\x2b\x82\x26\x5e\x1c\xda\xfb",
- .c_size = 104,
+ .c = (const unsigned char[]){
+ be64_to_cpua(3f, dd, 15, 1b, 68, 2b, 9d, 8b),
+ be64_to_cpua(c9, 9c, 11, b8, 10, 01, c5, 41),
+ be64_to_cpua(c5, da, b4, e3, 93, 07, e0, 99),
+ be64_to_cpua(97, f1, c8, 72, 26, cf, 5a, 5e),
+ be64_to_cpua(ec, cb, e4, 89, 47, b2, f7, bc),
+ be64_to_cpua(8a, 51, 84, ce, 13, 1e, d2, dc),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(88, 2b, 82, 26, 5e, 1c, da, fb),
+ be64_to_cpua(9f, 19, d0, 42, 8b, 93, c2, 11),
+ be64_to_cpua(4d, d0, c6, 6e, b0, e9, fc, 14),
+ be64_to_cpua(df, d8, 68, a2, 64, 42, 65, f3),
+ be64_to_cpua(4b, 00, 08, 31, 6c, f5, d5, f6),
+ be64_to_cpua(8b, 03, 2c, fc, 1f, d1, a9, a4),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
.key = /* secp384r1(sha256) */
"\x04\xee\xd6\xda\x3e\x94\x90\x00\x27\xed\xf8\x64\x55\xd6\x51\x9a"
@@ -985,26 +1088,31 @@ static const struct akcipher_testvec ecdsa_nist_p384_tv_template[] = {
"\x17\xc3\x34\x29\xd6\x40\xea\x5c\xb9\x3f\xfb\x32\x2e\x12\x33\xbc"
"\xab",
.key_len = 97,
- .params =
- "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
- "\x00\x22",
- .param_len = 18,
.m =
"\xaa\xe7\xfd\x03\x26\xcb\x94\x71\xe4\xce\x0f\xc5\xff\xa6\x29\xa3"
"\xe1\xcc\x4c\x35\x4e\xde\xca\x80\xab\x26\x0c\x25\xe6\x68\x11\xc2",
.m_size = 32,
- .algo = OID_id_ecdsa_with_sha256,
- .c =
- "\x30\x64\x02\x30\x08\x09\x12\x9d\x6e\x96\x64\xa6\x8e\x3f\x7e\xce"
- "\x0a\x9b\xaa\x59\xcc\x47\x53\x87\xbc\xbd\x83\x3f\xaf\x06\x3f\x84"
- "\x04\xe2\xf9\x67\xb6\xc6\xfc\x70\x2e\x66\x3c\x77\xc8\x8d\x2c\x79"
- "\x3a\x8e\x32\xc4\x02\x30\x40\x34\xb8\x90\xa9\x80\xab\x47\x26\xa2"
- "\xb0\x89\x42\x0a\xda\xd9\xdd\xce\xbc\xb2\x97\xf4\x9c\xf3\x15\x68"
- "\xc0\x75\x3e\x23\x5e\x36\x4f\x8d\xde\x1e\x93\x8d\x95\xbb\x10\x0e"
- "\xf4\x1f\x39\xca\x4d\x43",
- .c_size = 102,
+ .c = (const unsigned char[]){
+ be64_to_cpua(c8, 8d, 2c, 79, 3a, 8e, 32, c4),
+ be64_to_cpua(b6, c6, fc, 70, 2e, 66, 3c, 77),
+ be64_to_cpua(af, 06, 3f, 84, 04, e2, f9, 67),
+ be64_to_cpua(cc, 47, 53, 87, bc, bd, 83, 3f),
+ be64_to_cpua(8e, 3f, 7e, ce, 0a, 9b, aa, 59),
+ be64_to_cpua(08, 09, 12, 9d, 6e, 96, 64, a6),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(10, 0e, f4, 1f, 39, ca, 4d, 43),
+ be64_to_cpua(4f, 8d, de, 1e, 93, 8d, 95, bb),
+ be64_to_cpua(15, 68, c0, 75, 3e, 23, 5e, 36),
+ be64_to_cpua(dd, ce, bc, b2, 97, f4, 9c, f3),
+ be64_to_cpua(26, a2, b0, 89, 42, 0a, da, d9),
+ be64_to_cpua(40, 34, b8, 90, a9, 80, ab, 47),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
.key = /* secp384r1(sha384) */
"\x04\x3a\x2f\x62\xe7\x1a\xcf\x24\xd0\x0b\x7c\xe0\xed\x46\x0a\x4f"
@@ -1015,27 +1123,32 @@ static const struct akcipher_testvec ecdsa_nist_p384_tv_template[] = {
"\x21\x67\xe5\x1b\x5a\x52\x31\x68\xd6\xee\xf0\x19\xb0\x55\xed\x89"
"\x9e",
.key_len = 97,
- .params =
- "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
- "\x00\x22",
- .param_len = 18,
.m =
"\x8d\xf2\xc0\xe9\xa8\xf3\x8e\x44\xc4\x8c\x1a\xa0\xb8\xd7\x17\xdf"
"\xf2\x37\x1b\xc6\xe3\xf5\x62\xcc\x68\xf5\xd5\x0b\xbf\x73\x2b\xb1"
"\xb0\x4c\x04\x00\x31\xab\xfe\xc8\xd6\x09\xc8\xf2\xea\xd3\x28\xff",
.m_size = 48,
- .algo = OID_id_ecdsa_with_sha384,
- .c =
- "\x30\x66\x02\x31\x00\x9b\x28\x68\xc0\xa1\xea\x8c\x50\xee\x2e\x62"
- "\x35\x46\xfa\x00\xd8\x2d\x7a\x91\x5f\x49\x2d\x22\x08\x29\xe6\xfb"
- "\xca\x8c\xd6\xb6\xb4\x3b\x1f\x07\x8f\x15\x02\xfe\x1d\xa2\xa4\xc8"
- "\xf2\xea\x9d\x11\x1f\x02\x31\x00\xfc\x50\xf6\x43\xbd\x50\x82\x0e"
- "\xbf\xe3\x75\x24\x49\xac\xfb\xc8\x71\xcd\x8f\x18\x99\xf0\x0f\x13"
- "\x44\x92\x8c\x86\x99\x65\xb3\x97\x96\x17\x04\xc9\x05\x77\xf1\x8e"
- "\xab\x8d\x4e\xde\xe6\x6d\x9b\x66",
- .c_size = 104,
+ .c = (const unsigned char[]){
+ be64_to_cpua(a2, a4, c8, f2, ea, 9d, 11, 1f),
+ be64_to_cpua(3b, 1f, 07, 8f, 15, 02, fe, 1d),
+ be64_to_cpua(29, e6, fb, ca, 8c, d6, b6, b4),
+ be64_to_cpua(2d, 7a, 91, 5f, 49, 2d, 22, 08),
+ be64_to_cpua(ee, 2e, 62, 35, 46, fa, 00, d8),
+ be64_to_cpua(9b, 28, 68, c0, a1, ea, 8c, 50),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(ab, 8d, 4e, de, e6, 6d, 9b, 66),
+ be64_to_cpua(96, 17, 04, c9, 05, 77, f1, 8e),
+ be64_to_cpua(44, 92, 8c, 86, 99, 65, b3, 97),
+ be64_to_cpua(71, cd, 8f, 18, 99, f0, 0f, 13),
+ be64_to_cpua(bf, e3, 75, 24, 49, ac, fb, c8),
+ be64_to_cpua(fc, 50, f6, 43, bd, 50, 82, 0e),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
}, {
.key = /* secp384r1(sha512) */
"\x04\xb4\xe7\xc1\xeb\x64\x25\x22\x46\xc3\x86\x61\x80\xbe\x1e\x46"
@@ -1046,32 +1159,37 @@ static const struct akcipher_testvec ecdsa_nist_p384_tv_template[] = {
"\xdf\x42\x5c\xc2\x5a\xc7\x0c\xf4\x15\xf7\x1b\xa3\x2e\xd7\x00\xac"
"\xa3",
.key_len = 97,
- .params =
- "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
- "\x00\x22",
- .param_len = 18,
.m =
"\xe8\xb7\x52\x7d\x1a\x44\x20\x05\x53\x6b\x3a\x68\xf2\xe7\x6c\xa1"
"\xae\x9d\x84\xbb\xba\x52\x43\x3e\x2c\x42\x78\x49\xbf\x78\xb2\x71"
"\xeb\xe1\xe0\xe8\x42\x7b\x11\xad\x2b\x99\x05\x1d\x36\xe6\xac\xfc"
"\x55\x73\xf0\x15\x63\x39\xb8\x6a\x6a\xc5\x91\x5b\xca\x6a\xa8\x0e",
.m_size = 64,
- .algo = OID_id_ecdsa_with_sha512,
- .c =
- "\x30\x63\x02\x2f\x1d\x20\x94\x77\xfe\x31\xfa\x4d\xc6\xef\xda\x02"
- "\xe7\x0f\x52\x9a\x02\xde\x93\xe8\x83\xe4\x84\x4c\xfc\x6f\x80\xe3"
- "\xaf\xb3\xd9\xdc\x2b\x43\x0e\x6a\xb3\x53\x6f\x3e\xb3\xc7\xa8\xb3"
- "\x17\x77\xd1\x02\x30\x63\xf6\xf0\x3d\x5f\x5f\x99\x3f\xde\x3a\x3d"
- "\x16\xaf\xb4\x52\x6a\xec\x63\xe3\x0c\xec\x50\xdc\xcc\xc4\x6a\x03"
- "\x5f\x8d\x7a\xf9\xfb\x34\xe4\x8b\x80\xa5\xb6\xda\x2c\x4e\x45\xcf"
- "\x3c\x93\xff\x50\x5d",
- .c_size = 101,
+ .c = (const unsigned char[]){
+ be64_to_cpua(3e, b3, c7, a8, b3, 17, 77, d1),
+ be64_to_cpua(dc, 2b, 43, 0e, 6a, b3, 53, 6f),
+ be64_to_cpua(4c, fc, 6f, 80, e3, af, b3, d9),
+ be64_to_cpua(9a, 02, de, 93, e8, 83, e4, 84),
+ be64_to_cpua(4d, c6, ef, da, 02, e7, 0f, 52),
+ be64_to_cpua(00, 1d, 20, 94, 77, fe, 31, fa),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(4e, 45, cf, 3c, 93, ff, 50, 5d),
+ be64_to_cpua(34, e4, 8b, 80, a5, b6, da, 2c),
+ be64_to_cpua(c4, 6a, 03, 5f, 8d, 7a, f9, fb),
+ be64_to_cpua(ec, 63, e3, 0c, ec, 50, dc, cc),
+ be64_to_cpua(de, 3a, 3d, 16, af, b4, 52, 6a),
+ be64_to_cpua(63, f6, f0, 3d, 5f, 5f, 99, 3f),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 00) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
},
};
-static const struct akcipher_testvec ecdsa_nist_p521_tv_template[] = {
+static const struct sig_testvec ecdsa_nist_p521_tv_template[] = {
{
.key = /* secp521r1(sha224) */
"\x04\x01\x4f\x43\x18\xb6\xa9\xc9\x5d\x68\xd3\xa9\x42\xf8\x98\xc0"
@@ -1084,28 +1202,31 @@ static const struct akcipher_testvec ecdsa_nist_p521_tv_template[] = {
"\xed\x37\x0f\x99\x3f\x26\xba\xa3\x8e\xff\x79\x34\x7c\x3a\xfe\x1f"
"\x3b\x83\x82\x2f\x14",
.key_len = 133,
- .params =
- "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
- "\x00\x23",
- .param_len = 18,
.m =
"\xa2\x3a\x6a\x8c\x7b\x3c\xf2\x51\xf8\xbe\x5f\x4f\x3b\x15\x05\xc4"
"\xb5\xbc\x19\xe7\x21\x85\xe9\x23\x06\x33\x62\xfb",
.m_size = 28,
- .algo = OID_id_ecdsa_with_sha224,
- .c =
- "\x30\x81\x86\x02\x41\x01\xd6\x43\xe7\xff\x42\xb2\xba\x74\x35\xf6"
- "\xdc\x6d\x02\x7b\x22\xac\xe2\xef\x07\x92\xee\x60\x94\x06\xf8\x3f"
- "\x59\x0f\x74\xf0\x3f\xd8\x18\xc6\x37\x8a\xcb\xa7\xd8\x7d\x98\x85"
- "\x29\x88\xff\x0b\x94\x94\x6c\xa6\x9b\x89\x8b\x1e\xfd\x09\x46\x6b"
- "\xc7\xaf\x7a\xb9\x19\x0a\x02\x41\x3a\x26\x0d\x55\xcd\x23\x1e\x7d"
- "\xa0\x5e\xf9\x88\xf3\xd2\x32\x90\x57\x0f\xf8\x65\x97\x6b\x09\x4d"
- "\x22\x26\x0b\x5f\x49\x32\x6b\x91\x99\x30\x90\x0f\x1c\x8f\x78\xd3"
- "\x9f\x0e\x64\xcc\xc4\xe8\x43\xd9\x0e\x1c\xad\x22\xda\x82\x00\x35"
- "\xa3\x50\xb1\xa5\x98\x92\x2a\xa5\x52",
- .c_size = 137,
+ .c = (const unsigned char[]){
+ be64_to_cpua(46, 6b, c7, af, 7a, b9, 19, 0a),
+ be64_to_cpua(6c, a6, 9b, 89, 8b, 1e, fd, 09),
+ be64_to_cpua(98, 85, 29, 88, ff, 0b, 94, 94),
+ be64_to_cpua(18, c6, 37, 8a, cb, a7, d8, 7d),
+ be64_to_cpua(f8, 3f, 59, 0f, 74, f0, 3f, d8),
+ be64_to_cpua(e2, ef, 07, 92, ee, 60, 94, 06),
+ be64_to_cpua(35, f6, dc, 6d, 02, 7b, 22, ac),
+ be64_to_cpua(d6, 43, e7, ff, 42, b2, ba, 74),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 01),
+ be64_to_cpua(50, b1, a5, 98, 92, 2a, a5, 52),
+ be64_to_cpua(1c, ad, 22, da, 82, 00, 35, a3),
+ be64_to_cpua(0e, 64, cc, c4, e8, 43, d9, 0e),
+ be64_to_cpua(30, 90, 0f, 1c, 8f, 78, d3, 9f),
+ be64_to_cpua(26, 0b, 5f, 49, 32, 6b, 91, 99),
+ be64_to_cpua(0f, f8, 65, 97, 6b, 09, 4d, 22),
+ be64_to_cpua(5e, f9, 88, f3, d2, 32, 90, 57),
+ be64_to_cpua(26, 0d, 55, cd, 23, 1e, 7d, a0),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 3a) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
},
{
.key = /* secp521r1(sha256) */
@@ -1119,28 +1240,31 @@ static const struct akcipher_testvec ecdsa_nist_p521_tv_template[] = {
"\x8a\xe9\x53\xa8\xcf\xce\x43\x0e\x82\x20\x86\xbc\x88\x9c\xb7\xe3"
"\xe6\x77\x1e\x1f\x8a",
.key_len = 133,
- .params =
- "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
- "\x00\x23",
- .param_len = 18,
.m =
"\xcc\x97\x73\x0c\x73\xa2\x53\x2b\xfa\xd7\x83\x1d\x0c\x72\x1b\x39"
"\x80\x71\x8d\xdd\xc5\x9b\xff\x55\x32\x98\x25\xa2\x58\x2e\xb7\x73",
.m_size = 32,
- .algo = OID_id_ecdsa_with_sha256,
- .c =
- "\x30\x81\x88\x02\x42\x00\xcd\xa5\x5f\x57\x52\x27\x78\x3a\xb5\x06"
- "\x0f\xfd\x83\xfc\x0e\xd9\xce\x50\x9f\x7d\x1f\xca\x8b\xa8\x2d\x56"
- "\x3c\xf6\xf0\xd8\xe1\xb7\x5d\x95\x35\x6f\x02\x0e\xaf\xe1\x4c\xae"
- "\xce\x54\x76\x9a\xc2\x8f\xb8\x38\x1f\x46\x0b\x04\x64\x34\x79\xde"
- "\x7e\xd7\x59\x10\xe9\xd9\xd5\x02\x42\x01\xcf\x50\x85\x38\xf9\x15"
- "\x83\x18\x04\x6b\x35\xae\x65\xb5\x99\x12\x0a\xa9\x79\x24\xb9\x37"
- "\x35\xdd\xa0\xe0\x87\x2c\x44\x4b\x5a\xee\xaf\xfa\x10\xdd\x9b\xfb"
- "\x36\x1a\x31\x03\x42\x02\x5f\x50\xf0\xa2\x0d\x1c\x57\x56\x8f\x12"
- "\xb7\x1d\x91\x55\x38\xb6\xf6\x34\x65\xc7\xbd",
- .c_size = 139,
+ .c = (const unsigned char[]){
+ be64_to_cpua(de, 7e, d7, 59, 10, e9, d9, d5),
+ be64_to_cpua(38, 1f, 46, 0b, 04, 64, 34, 79),
+ be64_to_cpua(ae, ce, 54, 76, 9a, c2, 8f, b8),
+ be64_to_cpua(95, 35, 6f, 02, 0e, af, e1, 4c),
+ be64_to_cpua(56, 3c, f6, f0, d8, e1, b7, 5d),
+ be64_to_cpua(50, 9f, 7d, 1f, ca, 8b, a8, 2d),
+ be64_to_cpua(06, 0f, fd, 83, fc, 0e, d9, ce),
+ be64_to_cpua(a5, 5f, 57, 52, 27, 78, 3a, b5),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, cd),
+ be64_to_cpua(55, 38, b6, f6, 34, 65, c7, bd),
+ be64_to_cpua(1c, 57, 56, 8f, 12, b7, 1d, 91),
+ be64_to_cpua(03, 42, 02, 5f, 50, f0, a2, 0d),
+ be64_to_cpua(fa, 10, dd, 9b, fb, 36, 1a, 31),
+ be64_to_cpua(e0, 87, 2c, 44, 4b, 5a, ee, af),
+ be64_to_cpua(a9, 79, 24, b9, 37, 35, dd, a0),
+ be64_to_cpua(6b, 35, ae, 65, b5, 99, 12, 0a),
+ be64_to_cpua(50, 85, 38, f9, 15, 83, 18, 04),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 01, cf) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
},
{
.key = /* secp521r1(sha384) */
@@ -1154,29 +1278,32 @@ static const struct akcipher_testvec ecdsa_nist_p521_tv_template[] = {
"\x22\x6e\xd7\x35\xc7\x23\xb7\x13\xae\xb6\x34\xff\xd7\x80\xe5\x39"
"\xb3\x3b\x5b\x1b\x94",
.key_len = 133,
- .params =
- "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
- "\x00\x23",
- .param_len = 18,
.m =
"\x36\x98\xd6\x82\xfa\xad\xed\x3c\xb9\x40\xb6\x4d\x9e\xb7\x04\x26"
"\xad\x72\x34\x44\xd2\x81\xb4\x9b\xbe\x01\x04\x7a\xd8\x50\xf8\x59"
"\xba\xad\x23\x85\x6b\x59\xbe\xfb\xf6\x86\xd4\x67\xa8\x43\x28\x76",
.m_size = 48,
- .algo = OID_id_ecdsa_with_sha384,
- .c =
- "\x30\x81\x88\x02\x42\x00\x93\x96\x76\x3c\x27\xea\xaa\x9c\x26\xec"
- "\x51\xdc\xe8\x35\x5e\xae\x16\xf2\x4b\x64\x98\xf7\xec\xda\xc7\x7e"
- "\x42\x71\x86\x57\x2d\xf1\x7d\xe4\xdf\x9b\x7d\x9e\x47\xca\x33\x32"
- "\x76\x06\xd0\xf9\xc0\xe4\xe6\x84\x59\xfd\x1a\xc4\x40\xdd\x43\xb8"
- "\x6a\xdd\xfb\xe6\x63\x4e\x28\x02\x42\x00\xff\xc3\x6a\x87\x6e\xb5"
- "\x13\x1f\x20\x55\xce\x37\x97\xc9\x05\x51\xe5\xe4\x3c\xbc\x93\x65"
- "\x57\x1c\x30\xda\xa7\xcd\x26\x28\x76\x3b\x52\xdf\xc4\xc0\xdb\x54"
- "\xdb\x8a\x0d\x6a\xc3\xf3\x7a\xd1\xfa\xe7\xa7\xe5\x5a\x94\x56\xcf"
- "\x8f\xb4\x22\xc6\x4f\xab\x2b\x62\xc1\x42\xb1",
- .c_size = 139,
+ .c = (const unsigned char[]){
+ be64_to_cpua(b8, 6a, dd, fb, e6, 63, 4e, 28),
+ be64_to_cpua(84, 59, fd, 1a, c4, 40, dd, 43),
+ be64_to_cpua(32, 76, 06, d0, f9, c0, e4, e6),
+ be64_to_cpua(e4, df, 9b, 7d, 9e, 47, ca, 33),
+ be64_to_cpua(7e, 42, 71, 86, 57, 2d, f1, 7d),
+ be64_to_cpua(f2, 4b, 64, 98, f7, ec, da, c7),
+ be64_to_cpua(ec, 51, dc, e8, 35, 5e, ae, 16),
+ be64_to_cpua(96, 76, 3c, 27, ea, aa, 9c, 26),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, 93),
+ be64_to_cpua(c6, 4f, ab, 2b, 62, c1, 42, b1),
+ be64_to_cpua(e5, 5a, 94, 56, cf, 8f, b4, 22),
+ be64_to_cpua(6a, c3, f3, 7a, d1, fa, e7, a7),
+ be64_to_cpua(df, c4, c0, db, 54, db, 8a, 0d),
+ be64_to_cpua(da, a7, cd, 26, 28, 76, 3b, 52),
+ be64_to_cpua(e4, 3c, bc, 93, 65, 57, 1c, 30),
+ be64_to_cpua(55, ce, 37, 97, c9, 05, 51, e5),
+ be64_to_cpua(c3, 6a, 87, 6e, b5, 13, 1f, 20),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 00, ff) },
+ .c_size = ECC_MAX_BYTES * 2,
.public_key_vec = true,
- .siggen_sigver_test = true,
},
{
.key = /* secp521r1(sha512) */
@@ -1190,17 +1317,479 @@ static const struct akcipher_testvec ecdsa_nist_p521_tv_template[] = {
"\xfe\x3a\x05\x1a\xdb\xa9\x0f\xc0\x6c\x76\x30\x8c\xd8\xde\x44\xae"
"\xd0\x17\xdf\x49\x6a",
.key_len = 133,
- .params =
- "\x30\x10\x06\x07\x2a\x86\x48\xce\x3d\x02\x01\x06\x05\x2b\x81\x04"
- "\x00\x23",
- .param_len = 18,
.m =
"\x5c\xa6\xbc\x79\xb8\xa0\x1e\x11\x83\xf7\xe9\x05\xdf\xba\xf7\x69"
"\x97\x22\x32\xe4\x94\x7c\x65\xbd\x74\xc6\x9a\x8b\xbd\x0d\xdc\xed"
"\xf5\x9c\xeb\xe1\xc5\x68\x40\xf2\xc7\x04\xde\x9e\x0d\x76\xc5\xa3"
"\xf9\x3c\x6c\x98\x08\x31\xbd\x39\xe8\x42\x7f\x80\x39\x6f\xfe\x68",
.m_size = 64,
- .algo = OID_id_ecdsa_with_sha512,
+ .c = (const unsigned char[]){
+ be64_to_cpua(28, b5, 04, b0, b6, 33, 1c, 7e),
+ be64_to_cpua(80, a6, 13, fc, b6, 90, f7, bb),
+ be64_to_cpua(27, 93, e8, 6c, 49, 7d, 28, fc),
+ be64_to_cpua(1f, 12, 3e, b7, 7e, 51, ff, 7f),
+ be64_to_cpua(fb, 62, 1e, 42, 03, 6c, 74, 8a),
+ be64_to_cpua(63, 0e, 02, cc, 94, a9, 05, b9),
+ be64_to_cpua(aa, 86, ec, a8, 05, 03, 52, 56),
+ be64_to_cpua(71, 86, 96, ac, 21, 33, 7e, 4e),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 01, 5c),
+ be64_to_cpua(46, 1e, 77, 44, 78, e0, d1, 04),
+ be64_to_cpua(72, 74, 13, 63, 39, a6, e5, 25),
+ be64_to_cpua(00, 55, bb, 6a, b4, 73, 00, d2),
+ be64_to_cpua(71, d0, e9, ca, a7, c0, cb, aa),
+ be64_to_cpua(7a, 76, 37, 51, 47, 49, 98, 12),
+ be64_to_cpua(88, 05, 3e, 43, 39, 01, bd, b7),
+ be64_to_cpua(95, 35, 89, 4f, 41, 5f, 9e, 19),
+ be64_to_cpua(43, 52, 1d, e3, c6, bd, 5a, 40),
+ be64_to_cpua(00, 00, 00, 00, 00, 00, 01, 70) },
+ .c_size = ECC_MAX_BYTES * 2,
+ .public_key_vec = true,
+ },
+};
+
+/*
+ * ECDSA X9.62 test vectors.
+ *
+ * Identical to ECDSA test vectors, except signature in "c" is X9.62 encoded.
+ */
+static const struct sig_testvec x962_ecdsa_nist_p192_tv_template[] = {
+ {
+ .key = /* secp192r1(sha1) */
+ "\x04\xf7\x46\xf8\x2f\x15\xf6\x22\x8e\xd7\x57\x4f\xcc\xe7\xbb\xc1"
+ "\xd4\x09\x73\xcf\xea\xd0\x15\x07\x3d\xa5\x8a\x8a\x95\x43\xe4\x68"
+ "\xea\xc6\x25\xc1\xc1\x01\x25\x4c\x7e\xc3\x3c\xa6\x04\x0a\xe7\x08"
+ "\x98",
+ .key_len = 49,
+ .m =
+ "\xcd\xb9\xd2\x1c\xb7\x6f\xcd\x44\xb3\xfd\x63\xea\xa3\x66\x7f\xae"
+ "\x63\x85\xe7\x82",
+ .m_size = 20,
+ .c =
+ "\x30\x35\x02\x19\x00\xba\xe5\x93\x83\x6e\xb6\x3b\x63\xa0\x27\x91"
+ "\xc6\xf6\x7f\xc3\x09\xad\x59\xad\x88\x27\xd6\x92\x6b\x02\x18\x10"
+ "\x68\x01\x9d\xba\xce\x83\x08\xef\x95\x52\x7b\xa0\x0f\xe4\x18\x86"
+ "\x80\x6f\xa5\x79\x77\xda\xd0",
+ .c_size = 55,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp192r1(sha224) */
+ "\x04\xb6\x4b\xb1\xd1\xac\xba\x24\x8f\x65\xb2\x60\x00\x90\xbf\xbd"
+ "\x78\x05\x73\xe9\x79\x1d\x6f\x7c\x0b\xd2\xc3\x93\xa7\x28\xe1\x75"
+ "\xf7\xd5\x95\x1d\x28\x10\xc0\x75\x50\x5c\x1a\x4f\x3f\x8f\xa5\xee"
+ "\xa3",
+ .key_len = 49,
+ .m =
+ "\x8d\xd6\xb8\x3e\xe5\xff\x23\xf6\x25\xa2\x43\x42\x74\x45\xa7\x40"
+ "\x3a\xff\x2f\xe1\xd3\xf6\x9f\xe8\x33\xcb\x12\x11",
+ .m_size = 28,
+ .c =
+ "\x30\x34\x02\x18\x5a\x8b\x82\x69\x7e\x8a\x0a\x09\x14\xf8\x11\x2b"
+ "\x55\xdc\xae\x37\x83\x7b\x12\xe6\xb6\x5b\xcb\xd4\x02\x18\x6a\x14"
+ "\x4f\x53\x75\xc8\x02\x48\xeb\xc3\x92\x0f\x1e\x72\xee\xc4\xa3\xe3"
+ "\x5c\x99\xdb\x92\x5b\x36",
+ .c_size = 54,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp192r1(sha256) */
+ "\x04\xe2\x51\x24\x9b\xf7\xb6\x32\x82\x39\x66\x3d\x5b\xec\x3b\xae"
+ "\x0c\xd5\xf2\x67\xd1\xc7\xe1\x02\xe4\xbf\x90\x62\xb8\x55\x75\x56"
+ "\x69\x20\x5e\xcb\x4e\xca\x33\xd6\xcb\x62\x6b\x94\xa9\xa2\xe9\x58"
+ "\x91",
+ .key_len = 49,
+ .m =
+ "\x35\xec\xa1\xa0\x9e\x14\xde\x33\x03\xb6\xf6\xbd\x0c\x2f\xb2\xfd"
+ "\x1f\x27\x82\xa5\xd7\x70\x3f\xef\xa0\x82\x69\x8e\x73\x31\x8e\xd7",
+ .m_size = 32,
+ .c =
+ "\x30\x35\x02\x18\x3f\x72\x3f\x1f\x42\xd2\x3f\x1d\x6b\x1a\x58\x56"
+ "\xf1\x8f\xf7\xfd\x01\x48\xfb\x5f\x72\x2a\xd4\x8f\x02\x19\x00\xb3"
+ "\x69\x43\xfd\x48\x19\x86\xcf\x32\xdd\x41\x74\x6a\x51\xc7\xd9\x7d"
+ "\x3a\x97\xd9\xcd\x1a\x6a\x49",
+ .c_size = 55,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp192r1(sha384) */
+ "\x04\x5a\x13\xfe\x68\x86\x4d\xf4\x17\xc7\xa4\xe5\x8c\x65\x57\xb7"
+ "\x03\x73\x26\x57\xfb\xe5\x58\x40\xd8\xfd\x49\x05\xab\xf1\x66\x1f"
+ "\xe2\x9d\x93\x9e\xc2\x22\x5a\x8b\x4f\xf3\x77\x22\x59\x7e\xa6\x4e"
+ "\x8b",
+ .key_len = 49,
+ .m =
+ "\x9d\x2e\x1a\x8f\xed\x6c\x4b\x61\xae\xac\xd5\x19\x79\xce\x67\xf9"
+ "\xa0\x34\xeb\xb0\x81\xf9\xd9\xdc\x6e\xb3\x5c\xa8\x69\xfc\x8a\x61"
+ "\x39\x81\xfb\xfd\x5c\x30\x6b\xa8\xee\xed\x89\xaf\xa3\x05\xe4\x78",
+ .m_size = 48,
+ .c =
+ "\x30\x35\x02\x19\x00\xf0\xa3\x38\xce\x2b\xf8\x9d\x1a\xcf\x7f\x34"
+ "\xb4\xb4\xe5\xc5\x00\xdd\x15\xbb\xd6\x8c\xa7\x03\x78\x02\x18\x64"
+ "\xbc\x5a\x1f\x82\x96\x61\xd7\xd1\x01\x77\x44\x5d\x53\xa4\x7c\x93"
+ "\x12\x3b\x3b\x28\xfb\x6d\xe1",
+ .c_size = 55,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp192r1(sha512) */
+ "\x04\xd5\xf2\x6e\xc3\x94\x5c\x52\xbc\xdf\x86\x6c\x14\xd1\xca\xea"
+ "\xcc\x72\x3a\x8a\xf6\x7a\x3a\x56\x36\x3b\xca\xc6\x94\x0e\x17\x1d"
+ "\x9e\xa0\x58\x28\xf9\x4b\xe6\xd1\xa5\x44\x91\x35\x0d\xe7\xf5\x11"
+ "\x57",
+ .key_len = 49,
+ .m =
+ "\xd5\x4b\xe9\x36\xda\xd8\x6e\xc0\x50\x03\xbe\x00\x43\xff\xf0\x23"
+ "\xac\xa2\x42\xe7\x37\x77\x79\x52\x8f\x3e\xc0\x16\xc1\xfc\x8c\x67"
+ "\x16\xbc\x8a\x5d\x3b\xd3\x13\xbb\xb6\xc0\x26\x1b\xeb\x33\xcc\x70"
+ "\x4a\xf2\x11\x37\xe8\x1b\xba\x55\xac\x69\xe1\x74\x62\x7c\x6e\xb5",
+ .m_size = 64,
+ .c =
+ "\x30\x35\x02\x19\x00\x88\x5b\x8f\x59\x43\xbf\xcf\xc6\xdd\x3f\x07"
+ "\x87\x12\xa0\xd4\xac\x2b\x11\x2d\x1c\xb6\x06\xc9\x6c\x02\x18\x73"
+ "\xb4\x22\x9a\x98\x73\x3c\x83\xa9\x14\x2a\x5e\xf5\xe5\xfb\x72\x28"
+ "\x6a\xdf\x97\xfd\x82\x76\x24",
+ .c_size = 55,
+ .public_key_vec = true,
+ },
+};
+
+static const struct sig_testvec x962_ecdsa_nist_p256_tv_template[] = {
+ {
+ .key = /* secp256r1(sha1) */
+ "\x04\xb9\x7b\xbb\xd7\x17\x64\xd2\x7e\xfc\x81\x5d\x87\x06\x83\x41"
+ "\x22\xd6\x9a\xaa\x87\x17\xec\x4f\x63\x55\x2f\x94\xba\xdd\x83\xe9"
+ "\x34\x4b\xf3\xe9\x91\x13\x50\xb6\xcb\xca\x62\x08\xe7\x3b\x09\xdc"
+ "\xc3\x63\x4b\x2d\xb9\x73\x53\xe4\x45\xe6\x7c\xad\xe7\x6b\xb0\xe8"
+ "\xaf",
+ .key_len = 65,
+ .m =
+ "\xc2\x2b\x5f\x91\x78\x34\x26\x09\x42\x8d\x6f\x51\xb2\xc5\xaf\x4c"
+ "\x0b\xde\x6a\x42",
+ .m_size = 20,
+ .c =
+ "\x30\x46\x02\x21\x00\xf9\x25\xce\x9f\x3a\xa6\x35\x81\xcf\xd4\xe7"
+ "\xb7\xf0\x82\x56\x41\xf7\xd4\xad\x8d\x94\x5a\x69\x89\xee\xca\x6a"
+ "\x52\x0e\x48\x4d\xcc\x02\x21\x00\xd7\xe4\xef\x52\x66\xd3\x5b\x9d"
+ "\x8a\xfa\x54\x93\x29\xa7\x70\x86\xf1\x03\x03\xf3\x3b\xe2\x73\xf7"
+ "\xfb\x9d\x8b\xde\xd4\x8d\x6f\xad",
+ .c_size = 72,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp256r1(sha224) */
+ "\x04\x8b\x6d\xc0\x33\x8e\x2d\x8b\x67\xf5\xeb\xc4\x7f\xa0\xf5\xd9"
+ "\x7b\x03\xa5\x78\x9a\xb5\xea\x14\xe4\x23\xd0\xaf\xd7\x0e\x2e\xa0"
+ "\xc9\x8b\xdb\x95\xf8\xb3\xaf\xac\x00\x2c\x2c\x1f\x7a\xfd\x95\x88"
+ "\x43\x13\xbf\xf3\x1c\x05\x1a\x14\x18\x09\x3f\xd6\x28\x3e\xc5\xa0"
+ "\xd4",
+ .key_len = 65,
+ .m =
+ "\x1a\x15\xbc\xa3\xe4\xed\x3a\xb8\x23\x67\xc6\xc4\x34\xf8\x6c\x41"
+ "\x04\x0b\xda\xc5\x77\xfa\x1c\x2d\xe6\x2c\x3b\xe0",
+ .m_size = 28,
+ .c =
+ "\x30\x44\x02\x20\x20\x43\xfa\xc0\x9f\x9d\x7b\xe7\xae\xce\x77\x59"
+ "\x1a\xdb\x59\xd5\x34\x62\x79\xcb\x6a\x91\x67\x2e\x7d\x25\xd8\x25"
+ "\xf5\x81\xd2\x1e\x02\x20\x5f\xf8\x74\xf8\x57\xd0\x5e\x54\x76\x20"
+ "\x4a\x77\x22\xec\xc8\x66\xbf\x50\x05\x58\x39\x0e\x26\x92\xce\xd5"
+ "\x2e\x8b\xde\x5a\x04\x0e",
+ .c_size = 70,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp256r1(sha256) */
+ "\x04\xf1\xea\xc4\x53\xf3\xb9\x0e\x9f\x7e\xad\xe3\xea\xd7\x0e\x0f"
+ "\xd6\x98\x9a\xca\x92\x4d\x0a\x80\xdb\x2d\x45\xc7\xec\x4b\x97\x00"
+ "\x2f\xe9\x42\x6c\x29\xdc\x55\x0e\x0b\x53\x12\x9b\x2b\xad\x2c\xe9"
+ "\x80\xe6\xc5\x43\xc2\x1d\x5e\xbb\x65\x21\x50\xb6\x37\xb0\x03\x8e"
+ "\xb8",
+ .key_len = 65,
+ .m =
+ "\x8f\x43\x43\x46\x64\x8f\x6b\x96\xdf\x89\xdd\xa9\x01\xc5\x17\x6b"
+ "\x10\xa6\xd8\x39\x61\xdd\x3c\x1a\xc8\x8b\x59\xb2\xdc\x32\x7a\xa4",
+ .m_size = 32,
+ .c =
+ "\x30\x45\x02\x20\x08\x31\xfa\x74\x0d\x1d\x21\x5d\x09\xdc\x29\x63"
+ "\xa8\x1a\xad\xfc\xac\x44\xc3\xe8\x24\x11\x2d\xa4\x91\xdc\x02\x67"
+ "\xdc\x0c\xd0\x82\x02\x21\x00\xbd\xff\xce\xee\x42\xc3\x97\xff\xf9"
+ "\xa9\x81\xac\x4a\x50\xd0\x91\x0a\x6e\x1b\xc4\xaf\xe1\x83\xc3\x4f"
+ "\x2a\x65\x35\x23\xe3\x1d\xfa",
+ .c_size = 71,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp256r1(sha384) */
+ "\x04\xc5\xc6\xea\x60\xc9\xce\xad\x02\x8d\xf5\x3e\x24\xe3\x52\x1d"
+ "\x28\x47\x3b\xc3\x6b\xa4\x99\x35\x99\x11\x88\x88\xc8\xf4\xee\x7e"
+ "\x8c\x33\x8f\x41\x03\x24\x46\x2b\x1a\x82\xf9\x9f\xe1\x97\x1b\x00"
+ "\xda\x3b\x24\x41\xf7\x66\x33\x58\x3d\x3a\x81\xad\xcf\x16\xe9\xe2"
+ "\x7c",
+ .key_len = 65,
+ .m =
+ "\x3e\x78\x70\xfb\xcd\x66\xba\x91\xa1\x79\xff\x1e\x1c\x6b\x78\xe6"
+ "\xc0\x81\x3a\x65\x97\x14\x84\x36\x14\x1a\x9a\xb7\xc5\xab\x84\x94"
+ "\x5e\xbb\x1b\x34\x71\xcb\x41\xe1\xf6\xfc\x92\x7b\x34\xbb\x86\xbb",
+ .m_size = 48,
+ .c =
+ "\x30\x46\x02\x21\x00\x8e\xf3\x6f\xdc\xf8\x69\xa6\x2e\xd0\x2e\x95"
+ "\x54\xd1\x95\x64\x93\x08\xb2\x6b\x24\x94\x48\x46\x5e\xf2\xe4\x6c"
+ "\xc7\x94\xb1\xd5\xfe\x02\x21\x00\xeb\xa7\x80\x26\xdc\xf9\x3a\x44"
+ "\x19\xfb\x5f\x92\xf4\xc9\x23\x37\x69\xf4\x3b\x4f\x47\xcf\x9b\x16"
+ "\xc0\x60\x11\x92\xdc\x17\x89\x12",
+ .c_size = 72,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp256r1(sha512) */
+ "\x04\xd7\x27\x46\x49\xf6\x26\x85\x12\x40\x76\x8e\xe2\xe6\x2a\x7a"
+ "\x83\xb1\x4e\x7a\xeb\x3b\x5c\x67\x4a\xb5\xa4\x92\x8c\x69\xff\x38"
+ "\xee\xd9\x4e\x13\x29\x59\xad\xde\x6b\xbb\x45\x31\xee\xfd\xd1\x1b"
+ "\x64\xd3\xb5\xfc\xaf\x9b\x4b\x88\x3b\x0e\xb7\xd6\xdf\xf1\xd5\x92"
+ "\xbf",
+ .key_len = 65,
+ .m =
+ "\x57\xb7\x9e\xe9\x05\x0a\x8c\x1b\xc9\x13\xe5\x4a\x24\xc7\xe2\xe9"
+ "\x43\xc3\xd1\x76\x62\xf4\x98\x1a\x9c\x13\xb0\x20\x1b\xe5\x39\xca"
+ "\x4f\xd9\x85\x34\x95\xa2\x31\xbc\xbb\xde\xdd\x76\xbb\x61\xe3\xcf"
+ "\x9d\xc0\x49\x7a\xf3\x7a\xc4\x7d\xa8\x04\x4b\x8d\xb4\x4d\x5b\xd6",
+ .m_size = 64,
+ .c =
+ "\x30\x45\x02\x21\x00\xb8\x6d\x87\x81\x43\xdf\xfb\x9f\x40\xea\x44"
+ "\x81\x00\x4e\x29\x08\xed\x8c\x73\x30\x6c\x22\xb3\x97\x76\xf6\x04"
+ "\x99\x09\x37\x4d\xfa\x02\x20\x1e\xb9\x75\x31\xf6\x04\xa5\x4d\xf8"
+ "\x00\xdd\xab\xd4\xc0\x2b\xe6\x5c\xad\xc3\x78\x1c\xc2\xc1\x19\x76"
+ "\x31\x79\x4a\xe9\x81\x6a\xee",
+ .c_size = 71,
+ .public_key_vec = true,
+ },
+};
+
+static const struct sig_testvec x962_ecdsa_nist_p384_tv_template[] = {
+ {
+ .key = /* secp384r1(sha1) */
+ "\x04\x89\x25\xf3\x97\x88\xcb\xb0\x78\xc5\x72\x9a\x14\x6e\x7a\xb1"
+ "\x5a\xa5\x24\xf1\x95\x06\x9e\x28\xfb\xc4\xb9\xbe\x5a\x0d\xd9\x9f"
+ "\xf3\xd1\x4d\x2d\x07\x99\xbd\xda\xa7\x66\xec\xbb\xea\xba\x79\x42"
+ "\xc9\x34\x89\x6a\xe7\x0b\xc3\xf2\xfe\x32\x30\xbe\xba\xf9\xdf\x7e"
+ "\x4b\x6a\x07\x8e\x26\x66\x3f\x1d\xec\xa2\x57\x91\x51\xdd\x17\x0e"
+ "\x0b\x25\xd6\x80\x5c\x3b\xe6\x1a\x98\x48\x91\x45\x7a\x73\xb0\xc3"
+ "\xf1",
+ .key_len = 97,
+ .m =
+ "\x12\x55\x28\xf0\x77\xd5\xb6\x21\x71\x32\x48\xcd\x28\xa8\x25\x22"
+ "\x3a\x69\xc1\x93",
+ .m_size = 20,
+ .c =
+ "\x30\x66\x02\x31\x00\xf5\x0f\x24\x4c\x07\x93\x6f\x21\x57\x55\x07"
+ "\x20\x43\x30\xde\xa0\x8d\x26\x8e\xae\x63\x3f\xbc\x20\x3a\xc6\xf1"
+ "\x32\x3c\xce\x70\x2b\x78\xf1\x4c\x26\xe6\x5b\x86\xcf\xec\x7c\x7e"
+ "\xd0\x87\xd7\xd7\x6e\x02\x31\x00\xcd\xbb\x7e\x81\x5d\x8f\x63\xc0"
+ "\x5f\x63\xb1\xbe\x5e\x4c\x0e\xa1\xdf\x28\x8c\x1b\xfa\xf9\x95\x88"
+ "\x74\xa0\x0f\xbf\xaf\xc3\x36\x76\x4a\xa1\x59\xf1\x1c\xa4\x58\x26"
+ "\x79\x12\x2a\xb7\xc5\x15\x92\xc5",
+ .c_size = 104,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp384r1(sha224) */
+ "\x04\x69\x6c\xcf\x62\xee\xd0\x0d\xe5\xb5\x2f\x70\x54\xcf\x26\xa0"
+ "\xd9\x98\x8d\x92\x2a\xab\x9b\x11\xcb\x48\x18\xa1\xa9\x0d\xd5\x18"
+ "\x3e\xe8\x29\x6e\xf6\xe4\xb5\x8e\xc7\x4a\xc2\x5f\x37\x13\x99\x05"
+ "\xb6\xa4\x9d\xf9\xfb\x79\x41\xe7\xd7\x96\x9f\x73\x3b\x39\x43\xdc"
+ "\xda\xf4\x06\xb9\xa5\x29\x01\x9d\x3b\xe1\xd8\x68\x77\x2a\xf4\x50"
+ "\x6b\x93\x99\x6c\x66\x4c\x42\x3f\x65\x60\x6c\x1c\x0b\x93\x9b\x9d"
+ "\xe0",
+ .key_len = 97,
+ .m =
+ "\x12\x80\xb6\xeb\x25\xe2\x3d\xf0\x21\x32\x96\x17\x3a\x38\x39\xfd"
+ "\x1f\x05\x34\x7b\xb8\xf9\x71\x66\x03\x4f\xd5\xe5",
+ .m_size = 28,
+ .c =
+ "\x30\x66\x02\x31\x00\x8a\x51\x84\xce\x13\x1e\xd2\xdc\xec\xcb\xe4"
+ "\x89\x47\xb2\xf7\xbc\x97\xf1\xc8\x72\x26\xcf\x5a\x5e\xc5\xda\xb4"
+ "\xe3\x93\x07\xe0\x99\xc9\x9c\x11\xb8\x10\x01\xc5\x41\x3f\xdd\x15"
+ "\x1b\x68\x2b\x9d\x8b\x02\x31\x00\x8b\x03\x2c\xfc\x1f\xd1\xa9\xa4"
+ "\x4b\x00\x08\x31\x6c\xf5\xd5\xf6\xdf\xd8\x68\xa2\x64\x42\x65\xf3"
+ "\x4d\xd0\xc6\x6e\xb0\xe9\xfc\x14\x9f\x19\xd0\x42\x8b\x93\xc2\x11"
+ "\x88\x2b\x82\x26\x5e\x1c\xda\xfb",
+ .c_size = 104,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp384r1(sha256) */
+ "\x04\xee\xd6\xda\x3e\x94\x90\x00\x27\xed\xf8\x64\x55\xd6\x51\x9a"
+ "\x1f\x52\x00\x63\x78\xf1\xa9\xfd\x75\x4c\x9e\xb2\x20\x1a\x91\x5a"
+ "\xba\x7a\xa3\xe5\x6c\xb6\x25\x68\x4b\xe8\x13\xa6\x54\x87\x2c\x0e"
+ "\xd0\x83\x95\xbc\xbf\xc5\x28\x4f\x77\x1c\x46\xa6\xf0\xbc\xd4\xa4"
+ "\x8d\xc2\x8f\xb3\x32\x37\x40\xd6\xca\xf8\xae\x07\x34\x52\x39\x52"
+ "\x17\xc3\x34\x29\xd6\x40\xea\x5c\xb9\x3f\xfb\x32\x2e\x12\x33\xbc"
+ "\xab",
+ .key_len = 97,
+ .m =
+ "\xaa\xe7\xfd\x03\x26\xcb\x94\x71\xe4\xce\x0f\xc5\xff\xa6\x29\xa3"
+ "\xe1\xcc\x4c\x35\x4e\xde\xca\x80\xab\x26\x0c\x25\xe6\x68\x11\xc2",
+ .m_size = 32,
+ .c =
+ "\x30\x64\x02\x30\x08\x09\x12\x9d\x6e\x96\x64\xa6\x8e\x3f\x7e\xce"
+ "\x0a\x9b\xaa\x59\xcc\x47\x53\x87\xbc\xbd\x83\x3f\xaf\x06\x3f\x84"
+ "\x04\xe2\xf9\x67\xb6\xc6\xfc\x70\x2e\x66\x3c\x77\xc8\x8d\x2c\x79"
+ "\x3a\x8e\x32\xc4\x02\x30\x40\x34\xb8\x90\xa9\x80\xab\x47\x26\xa2"
+ "\xb0\x89\x42\x0a\xda\xd9\xdd\xce\xbc\xb2\x97\xf4\x9c\xf3\x15\x68"
+ "\xc0\x75\x3e\x23\x5e\x36\x4f\x8d\xde\x1e\x93\x8d\x95\xbb\x10\x0e"
+ "\xf4\x1f\x39\xca\x4d\x43",
+ .c_size = 102,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp384r1(sha384) */
+ "\x04\x3a\x2f\x62\xe7\x1a\xcf\x24\xd0\x0b\x7c\xe0\xed\x46\x0a\x4f"
+ "\x74\x16\x43\xe9\x1a\x25\x7c\x55\xff\xf0\x29\x68\x66\x20\x91\xf9"
+ "\xdb\x2b\xf6\xb3\x6c\x54\x01\xca\xc7\x6a\x5c\x0d\xeb\x68\xd9\x3c"
+ "\xf1\x01\x74\x1f\xf9\x6c\xe5\x5b\x60\xe9\x7f\x5d\xb3\x12\x80\x2a"
+ "\xd8\x67\x92\xc9\x0e\x4c\x4c\x6b\xa1\xb2\xa8\x1e\xac\x1c\x97\xd9"
+ "\x21\x67\xe5\x1b\x5a\x52\x31\x68\xd6\xee\xf0\x19\xb0\x55\xed\x89"
+ "\x9e",
+ .key_len = 97,
+ .m =
+ "\x8d\xf2\xc0\xe9\xa8\xf3\x8e\x44\xc4\x8c\x1a\xa0\xb8\xd7\x17\xdf"
+ "\xf2\x37\x1b\xc6\xe3\xf5\x62\xcc\x68\xf5\xd5\x0b\xbf\x73\x2b\xb1"
+ "\xb0\x4c\x04\x00\x31\xab\xfe\xc8\xd6\x09\xc8\xf2\xea\xd3\x28\xff",
+ .m_size = 48,
+ .c =
+ "\x30\x66\x02\x31\x00\x9b\x28\x68\xc0\xa1\xea\x8c\x50\xee\x2e\x62"
+ "\x35\x46\xfa\x00\xd8\x2d\x7a\x91\x5f\x49\x2d\x22\x08\x29\xe6\xfb"
+ "\xca\x8c\xd6\xb6\xb4\x3b\x1f\x07\x8f\x15\x02\xfe\x1d\xa2\xa4\xc8"
+ "\xf2\xea\x9d\x11\x1f\x02\x31\x00\xfc\x50\xf6\x43\xbd\x50\x82\x0e"
+ "\xbf\xe3\x75\x24\x49\xac\xfb\xc8\x71\xcd\x8f\x18\x99\xf0\x0f\x13"
+ "\x44\x92\x8c\x86\x99\x65\xb3\x97\x96\x17\x04\xc9\x05\x77\xf1\x8e"
+ "\xab\x8d\x4e\xde\xe6\x6d\x9b\x66",
+ .c_size = 104,
+ .public_key_vec = true,
+ }, {
+ .key = /* secp384r1(sha512) */
+ "\x04\xb4\xe7\xc1\xeb\x64\x25\x22\x46\xc3\x86\x61\x80\xbe\x1e\x46"
+ "\xcb\xf6\x05\xc2\xee\x73\x83\xbc\xea\x30\x61\x4d\x40\x05\x41\xf4"
+ "\x8c\xe3\x0e\x5c\xf0\x50\xf2\x07\x19\xe8\x4f\x25\xbe\xee\x0c\x95"
+ "\x54\x36\x86\xec\xc2\x20\x75\xf3\x89\xb5\x11\xa1\xb7\xf5\xaf\xbe"
+ "\x81\xe4\xc3\x39\x06\xbd\xe4\xfe\x68\x1c\x6d\x99\x2b\x1b\x63\xfa"
+ "\xdf\x42\x5c\xc2\x5a\xc7\x0c\xf4\x15\xf7\x1b\xa3\x2e\xd7\x00\xac"
+ "\xa3",
+ .key_len = 97,
+ .m =
+ "\xe8\xb7\x52\x7d\x1a\x44\x20\x05\x53\x6b\x3a\x68\xf2\xe7\x6c\xa1"
+ "\xae\x9d\x84\xbb\xba\x52\x43\x3e\x2c\x42\x78\x49\xbf\x78\xb2\x71"
+ "\xeb\xe1\xe0\xe8\x42\x7b\x11\xad\x2b\x99\x05\x1d\x36\xe6\xac\xfc"
+ "\x55\x73\xf0\x15\x63\x39\xb8\x6a\x6a\xc5\x91\x5b\xca\x6a\xa8\x0e",
+ .m_size = 64,
+ .c =
+ "\x30\x63\x02\x2f\x1d\x20\x94\x77\xfe\x31\xfa\x4d\xc6\xef\xda\x02"
+ "\xe7\x0f\x52\x9a\x02\xde\x93\xe8\x83\xe4\x84\x4c\xfc\x6f\x80\xe3"
+ "\xaf\xb3\xd9\xdc\x2b\x43\x0e\x6a\xb3\x53\x6f\x3e\xb3\xc7\xa8\xb3"
+ "\x17\x77\xd1\x02\x30\x63\xf6\xf0\x3d\x5f\x5f\x99\x3f\xde\x3a\x3d"
+ "\x16\xaf\xb4\x52\x6a\xec\x63\xe3\x0c\xec\x50\xdc\xcc\xc4\x6a\x03"
+ "\x5f\x8d\x7a\xf9\xfb\x34\xe4\x8b\x80\xa5\xb6\xda\x2c\x4e\x45\xcf"
+ "\x3c\x93\xff\x50\x5d",
+ .c_size = 101,
+ .public_key_vec = true,
+ },
+};
+
+static const struct sig_testvec x962_ecdsa_nist_p521_tv_template[] = {
+ {
+ .key = /* secp521r1(sha224) */
+ "\x04\x01\x4f\x43\x18\xb6\xa9\xc9\x5d\x68\xd3\xa9\x42\xf8\x98\xc0"
+ "\xd2\xd1\xa9\x50\x3b\xe8\xc4\x40\xe6\x11\x78\x88\x4b\xbd\x76\xa7"
+ "\x9a\xe0\xdd\x31\xa4\x67\x78\x45\x33\x9e\x8c\xd1\xc7\x44\xac\x61"
+ "\x68\xc8\x04\xe7\x5c\x79\xb1\xf1\x41\x0c\x71\xc0\x53\xa8\xbc\xfb"
+ "\xf5\xca\xd4\x01\x40\xfd\xa3\x45\xda\x08\xe0\xb4\xcb\x28\x3b\x0a"
+ "\x02\x35\x5f\x02\x9f\x3f\xcd\xef\x08\x22\x40\x97\x74\x65\xb7\x76"
+ "\x85\xc7\xc0\x5c\xfb\x81\xe1\xa5\xde\x0c\x4e\x8b\x12\x31\xb6\x47"
+ "\xed\x37\x0f\x99\x3f\x26\xba\xa3\x8e\xff\x79\x34\x7c\x3a\xfe\x1f"
+ "\x3b\x83\x82\x2f\x14",
+ .key_len = 133,
+ .m =
+ "\xa2\x3a\x6a\x8c\x7b\x3c\xf2\x51\xf8\xbe\x5f\x4f\x3b\x15\x05\xc4"
+ "\xb5\xbc\x19\xe7\x21\x85\xe9\x23\x06\x33\x62\xfb",
+ .m_size = 28,
+ .c =
+ "\x30\x81\x86\x02\x41\x01\xd6\x43\xe7\xff\x42\xb2\xba\x74\x35\xf6"
+ "\xdc\x6d\x02\x7b\x22\xac\xe2\xef\x07\x92\xee\x60\x94\x06\xf8\x3f"
+ "\x59\x0f\x74\xf0\x3f\xd8\x18\xc6\x37\x8a\xcb\xa7\xd8\x7d\x98\x85"
+ "\x29\x88\xff\x0b\x94\x94\x6c\xa6\x9b\x89\x8b\x1e\xfd\x09\x46\x6b"
+ "\xc7\xaf\x7a\xb9\x19\x0a\x02\x41\x3a\x26\x0d\x55\xcd\x23\x1e\x7d"
+ "\xa0\x5e\xf9\x88\xf3\xd2\x32\x90\x57\x0f\xf8\x65\x97\x6b\x09\x4d"
+ "\x22\x26\x0b\x5f\x49\x32\x6b\x91\x99\x30\x90\x0f\x1c\x8f\x78\xd3"
+ "\x9f\x0e\x64\xcc\xc4\xe8\x43\xd9\x0e\x1c\xad\x22\xda\x82\x00\x35"
+ "\xa3\x50\xb1\xa5\x98\x92\x2a\xa5\x52",
+ .c_size = 137,
+ .public_key_vec = true,
+ },
+ {
+ .key = /* secp521r1(sha256) */
+ "\x04\x01\x05\x3a\x6b\x3b\x5a\x0f\xa7\xb9\xb7\x32\x53\x4e\xe2\xae"
+ "\x0a\x52\xc5\xda\xdd\x5a\x79\x1c\x30\x2d\x33\x07\x79\xd5\x70\x14"
+ "\x61\x0c\xec\x26\x4d\xd8\x35\x57\x04\x1d\x88\x33\x4d\xce\x05\x36"
+ "\xa5\xaf\x56\x84\xfa\x0b\x9e\xff\x7b\x30\x4b\x92\x1d\x06\xf8\x81"
+ "\x24\x1e\x51\x00\x09\x21\x51\xf7\x46\x0a\x77\xdb\xb5\x0c\xe7\x9c"
+ "\xff\x27\x3c\x02\x71\xd7\x85\x36\xf1\xaa\x11\x59\xd8\xb8\xdc\x09"
+ "\xdc\x6d\x5a\x6f\x63\x07\x6c\xe1\xe5\x4d\x6e\x0f\x6e\xfb\x7c\x05"
+ "\x8a\xe9\x53\xa8\xcf\xce\x43\x0e\x82\x20\x86\xbc\x88\x9c\xb7\xe3"
+ "\xe6\x77\x1e\x1f\x8a",
+ .key_len = 133,
+ .m =
+ "\xcc\x97\x73\x0c\x73\xa2\x53\x2b\xfa\xd7\x83\x1d\x0c\x72\x1b\x39"
+ "\x80\x71\x8d\xdd\xc5\x9b\xff\x55\x32\x98\x25\xa2\x58\x2e\xb7\x73",
+ .m_size = 32,
+ .c =
+ "\x30\x81\x88\x02\x42\x00\xcd\xa5\x5f\x57\x52\x27\x78\x3a\xb5\x06"
+ "\x0f\xfd\x83\xfc\x0e\xd9\xce\x50\x9f\x7d\x1f\xca\x8b\xa8\x2d\x56"
+ "\x3c\xf6\xf0\xd8\xe1\xb7\x5d\x95\x35\x6f\x02\x0e\xaf\xe1\x4c\xae"
+ "\xce\x54\x76\x9a\xc2\x8f\xb8\x38\x1f\x46\x0b\x04\x64\x34\x79\xde"
+ "\x7e\xd7\x59\x10\xe9\xd9\xd5\x02\x42\x01\xcf\x50\x85\x38\xf9\x15"
+ "\x83\x18\x04\x6b\x35\xae\x65\xb5\x99\x12\x0a\xa9\x79\x24\xb9\x37"
+ "\x35\xdd\xa0\xe0\x87\x2c\x44\x4b\x5a\xee\xaf\xfa\x10\xdd\x9b\xfb"
+ "\x36\x1a\x31\x03\x42\x02\x5f\x50\xf0\xa2\x0d\x1c\x57\x56\x8f\x12"
+ "\xb7\x1d\x91\x55\x38\xb6\xf6\x34\x65\xc7\xbd",
+ .c_size = 139,
+ .public_key_vec = true,
+ },
+ {
+ .key = /* secp521r1(sha384) */
+ "\x04\x00\x2e\xd6\x21\x04\x75\xc3\xdc\x7d\xff\x0e\xf3\x70\x25\x2b"
+ "\xad\x72\xfc\x5a\x91\xf1\xd5\x9c\x64\xf3\x1f\x47\x11\x10\x62\x33"
+ "\xfd\x2e\xe8\x32\xca\x9e\x6f\x0a\x4c\x5b\x35\x9a\x46\xc5\xe7\xd4"
+ "\x38\xda\xb2\xf0\xf4\x87\xf3\x86\xf4\xea\x70\xad\x1e\xd4\x78\x8c"
+ "\x36\x18\x17\x00\xa2\xa0\x34\x1b\x2e\x6a\xdf\x06\xd6\x99\x2d\x47"
+ "\x50\x92\x1a\x8a\x72\x9c\x23\x44\xfa\xa7\xa9\xed\xa6\xef\x26\x14"
+ "\xb3\x9d\xfe\x5e\xa3\x8c\xd8\x29\xf8\xdf\xad\xa6\xab\xfc\xdd\x46"
+ "\x22\x6e\xd7\x35\xc7\x23\xb7\x13\xae\xb6\x34\xff\xd7\x80\xe5\x39"
+ "\xb3\x3b\x5b\x1b\x94",
+ .key_len = 133,
+ .m =
+ "\x36\x98\xd6\x82\xfa\xad\xed\x3c\xb9\x40\xb6\x4d\x9e\xb7\x04\x26"
+ "\xad\x72\x34\x44\xd2\x81\xb4\x9b\xbe\x01\x04\x7a\xd8\x50\xf8\x59"
+ "\xba\xad\x23\x85\x6b\x59\xbe\xfb\xf6\x86\xd4\x67\xa8\x43\x28\x76",
+ .m_size = 48,
+ .c =
+ "\x30\x81\x88\x02\x42\x00\x93\x96\x76\x3c\x27\xea\xaa\x9c\x26\xec"
+ "\x51\xdc\xe8\x35\x5e\xae\x16\xf2\x4b\x64\x98\xf7\xec\xda\xc7\x7e"
+ "\x42\x71\x86\x57\x2d\xf1\x7d\xe4\xdf\x9b\x7d\x9e\x47\xca\x33\x32"
+ "\x76\x06\xd0\xf9\xc0\xe4\xe6\x84\x59\xfd\x1a\xc4\x40\xdd\x43\xb8"
+ "\x6a\xdd\xfb\xe6\x63\x4e\x28\x02\x42\x00\xff\xc3\x6a\x87\x6e\xb5"
+ "\x13\x1f\x20\x55\xce\x37\x97\xc9\x05\x51\xe5\xe4\x3c\xbc\x93\x65"
+ "\x57\x1c\x30\xda\xa7\xcd\x26\x28\x76\x3b\x52\xdf\xc4\xc0\xdb\x54"
+ "\xdb\x8a\x0d\x6a\xc3\xf3\x7a\xd1\xfa\xe7\xa7\xe5\x5a\x94\x56\xcf"
+ "\x8f\xb4\x22\xc6\x4f\xab\x2b\x62\xc1\x42\xb1",
+ .c_size = 139,
+ .public_key_vec = true,
+ },
+ {
+ .key = /* secp521r1(sha512) */
+ "\x04\x00\xc7\x65\xee\x0b\x86\x7d\x8f\x02\xf1\x74\x5b\xb0\x4c\x3f"
+ "\xa6\x35\x60\x9f\x55\x23\x11\xcc\xdf\xb8\x42\x99\xee\x6c\x96\x6a"
+ "\x27\xa2\x56\xb2\x2b\x03\xad\x0f\xe7\x97\xde\x09\x5d\xb4\xc5\x5f"
+ "\xbd\x87\x37\xbf\x5a\x16\x35\x56\x08\xfd\x6f\x06\x1a\x1c\x84\xee"
+ "\xc3\x64\xb3\x00\x9e\xbd\x6e\x60\x76\xee\x69\xfd\x3a\xb8\xcd\x7e"
+ "\x91\x68\x53\x57\x44\x13\x2e\x77\x09\x2a\xbe\x48\xbd\x91\xd8\xf6"
+ "\x21\x16\x53\x99\xd5\xf0\x40\xad\xa6\xf8\x58\x26\xb6\x9a\xf8\x77"
+ "\xfe\x3a\x05\x1a\xdb\xa9\x0f\xc0\x6c\x76\x30\x8c\xd8\xde\x44\xae"
+ "\xd0\x17\xdf\x49\x6a",
+ .key_len = 133,
+ .m =
+ "\x5c\xa6\xbc\x79\xb8\xa0\x1e\x11\x83\xf7\xe9\x05\xdf\xba\xf7\x69"
+ "\x97\x22\x32\xe4\x94\x7c\x65\xbd\x74\xc6\x9a\x8b\xbd\x0d\xdc\xed"
+ "\xf5\x9c\xeb\xe1\xc5\x68\x40\xf2\xc7\x04\xde\x9e\x0d\x76\xc5\xa3"
+ "\xf9\x3c\x6c\x98\x08\x31\xbd\x39\xe8\x42\x7f\x80\x39\x6f\xfe\x68",
+ .m_size = 64,
.c =
"\x30\x81\x88\x02\x42\x01\x5c\x71\x86\x96\xac\x21\x33\x7e\x4e\xaa"
"\x86\xec\xa8\x05\x03\x52\x56\x63\x0e\x02\xcc\x94\xa9\x05\xb9\xfb"
@@ -1213,14 +1802,41 @@ static const struct akcipher_testvec ecdsa_nist_p521_tv_template[] = {
"\xa6\xe5\x25\x46\x1e\x77\x44\x78\xe0\xd1\x04",
.c_size = 139,
.public_key_vec = true,
- .siggen_sigver_test = true,
+ },
+};
+
+/*
+ * ECDSA P1363 test vectors.
+ *
+ * Identical to ECDSA test vectors, except signature in "c" is P1363 encoded.
+ */
+static const struct sig_testvec p1363_ecdsa_nist_p256_tv_template[] = {
+ {
+ .key = /* secp256r1(sha256) */
+ "\x04\xf1\xea\xc4\x53\xf3\xb9\x0e\x9f\x7e\xad\xe3\xea\xd7\x0e\x0f"
+ "\xd6\x98\x9a\xca\x92\x4d\x0a\x80\xdb\x2d\x45\xc7\xec\x4b\x97\x00"
+ "\x2f\xe9\x42\x6c\x29\xdc\x55\x0e\x0b\x53\x12\x9b\x2b\xad\x2c\xe9"
+ "\x80\xe6\xc5\x43\xc2\x1d\x5e\xbb\x65\x21\x50\xb6\x37\xb0\x03\x8e"
+ "\xb8",
+ .key_len = 65,
+ .m =
+ "\x8f\x43\x43\x46\x64\x8f\x6b\x96\xdf\x89\xdd\xa9\x01\xc5\x17\x6b"
+ "\x10\xa6\xd8\x39\x61\xdd\x3c\x1a\xc8\x8b\x59\xb2\xdc\x32\x7a\xa4",
+ .m_size = 32,
+ .c =
+ "\x08\x31\xfa\x74\x0d\x1d\x21\x5d\x09\xdc\x29\x63\xa8\x1a\xad\xfc"
+ "\xac\x44\xc3\xe8\x24\x11\x2d\xa4\x91\xdc\x02\x67\xdc\x0c\xd0\x82"
+ "\xbd\xff\xce\xee\x42\xc3\x97\xff\xf9\xa9\x81\xac\x4a\x50\xd0\x91"
+ "\x0a\x6e\x1b\xc4\xaf\xe1\x83\xc3\x4f\x2a\x65\x35\x23\xe3\x1d\xfa",
+ .c_size = 64,
+ .public_key_vec = true,
},
};
/*
* EC-RDSA test vectors are generated by gost-engine.
*/
-static const struct akcipher_testvec ecrdsa_tv_template[] = {
+static const struct sig_testvec ecrdsa_tv_template[] = {
{
.key =
"\x04\x40\xd5\xa7\x77\xf9\x26\x2f\x8c\xbd\xcc\xe3\x1f\x01\x94\x05"
@@ -1245,7 +1861,6 @@ static const struct akcipher_testvec ecrdsa_tv_template[] = {
"\x79\xd2\x76\x64\xa3\xbd\x66\x10\x79\x05\x5a\x06\x42\xec\xb9\xc9",
.m_size = 32,
.public_key_vec = true,
- .siggen_sigver_test = true,
},
{
.key =
@@ -1271,7 +1886,6 @@ static const struct akcipher_testvec ecrdsa_tv_template[] = {
"\x11\x23\x4a\x70\x43\x52\x7a\x68\x11\x65\x45\x37\xbb\x25\xb7\x40",
.m_size = 32,
.public_key_vec = true,
- .siggen_sigver_test = true,
},
{
.key =
@@ -1297,7 +1911,6 @@ static const struct akcipher_testvec ecrdsa_tv_template[] = {
"\x9f\x16\xc6\x1c\xb1\x3f\x84\x41\x69\xec\x34\xfd\xf1\xf9\xa3\x39",
.m_size = 32,
.public_key_vec = true,
- .siggen_sigver_test = true,
},
{
.key =
@@ -1332,7 +1945,6 @@ static const struct akcipher_testvec ecrdsa_tv_template[] = {
"\xa8\xf6\x80\x01\xb9\x27\xac\xd8\x45\x96\x66\xa1\xee\x48\x08\x3f",
.m_size = 64,
.public_key_vec = true,
- .siggen_sigver_test = true,
},
{
.key =
@@ -1367,14 +1979,68 @@ static const struct akcipher_testvec ecrdsa_tv_template[] = {
"\x6d\xf4\xd2\x45\xc2\x83\xa0\x42\x95\x05\x9d\x89\x8e\x0a\xca\xcc",
.m_size = 64,
.public_key_vec = true,
- .siggen_sigver_test = true,
+ },
+};
+
+/*
+ * PKCS#1 RSA test vectors for hash algorithm "none"
+ * (i.e. the hash in "m" is not prepended by a Full Hash Prefix)
+ *
+ * Obtained from:
+ * https://vcsjones.dev/sometimes-valid-rsa-dotnet/
+ * https://gist.github.com/vcsjones/ab4c2327b53ed018eada76b75ef4fd99
+ */
+static const struct sig_testvec pkcs1_rsa_none_tv_template[] = {
+ {
+ .key =
+ "\x30\x82\x01\x0a\x02\x82\x01\x01\x00\xa2\x63\x0b\x39\x44\xb8\xbb"
+ "\x23\xa7\x44\x49\xbb\x0e\xff\xa1\xf0\x61\x0a\x53\x93\xb0\x98\xdb"
+ "\xad\x2c\x0f\x4a\xc5\x6e\xff\x86\x3c\x53\x55\x0f\x15\xce\x04\x3f"
+ "\x2b\xfd\xa9\x96\x96\xd9\xbe\x61\x79\x0b\x5b\xc9\x4c\x86\x76\xe5"
+ "\xe0\x43\x4b\x22\x95\xee\xc2\x2b\x43\xc1\x9f\xd8\x68\xb4\x8e\x40"
+ "\x4f\xee\x85\x38\xb9\x11\xc5\x23\xf2\x64\x58\xf0\x15\x32\x6f\x4e"
+ "\x57\xa1\xae\x88\xa4\x02\xd7\x2a\x1e\xcd\x4b\xe1\xdd\x63\xd5\x17"
+ "\x89\x32\x5b\xb0\x5e\x99\x5a\xa8\x9d\x28\x50\x0e\x17\xee\x96\xdb"
+ "\x61\x3b\x45\x51\x1d\xcf\x12\x56\x0b\x92\x47\xfc\xab\xae\xf6\x66"
+ "\x3d\x47\xac\x70\x72\xe7\x92\xe7\x5f\xcd\x10\xb9\xc4\x83\x64\x94"
+ "\x19\xbd\x25\x80\xe1\xe8\xd2\x22\xa5\xd0\xba\x02\x7a\xa1\x77\x93"
+ "\x5b\x65\xc3\xee\x17\x74\xbc\x41\x86\x2a\xdc\x08\x4c\x8c\x92\x8c"
+ "\x91\x2d\x9e\x77\x44\x1f\x68\xd6\xa8\x74\x77\xdb\x0e\x5b\x32\x8b"
+ "\x56\x8b\x33\xbd\xd9\x63\xc8\x49\x9d\x3a\xc5\xc5\xea\x33\x0b\xd2"
+ "\xf1\xa3\x1b\xf4\x8b\xbe\xd9\xb3\x57\x8b\x3b\xde\x04\xa7\x7a\x22"
+ "\xb2\x24\xae\x2e\xc7\x70\xc5\xbe\x4e\x83\x26\x08\xfb\x0b\xbd\xa9"
+ "\x4f\x99\x08\xe1\x10\x28\x72\xaa\xcd\x02\x03\x01\x00\x01",
+ .key_len = 270,
+ .m =
+ "\x68\xb4\xf9\x26\x34\x31\x25\xdd\x26\x50\x13\x68\xc1\x99\x26\x71"
+ "\x19\xa2\xde\x81",
+ .m_size = 20,
+ .c =
+ "\x6a\xdb\x39\xe5\x63\xb3\x25\xde\x58\xca\xc3\xf1\x36\x9c\x0b\x36"
+ "\xb7\xd6\x69\xf9\xba\xa6\x68\x14\x8c\x24\x52\xd3\x25\xa5\xf3\xad"
+ "\xc9\x47\x44\xde\x06\xd8\x0f\x56\xca\x2d\xfb\x0f\xe9\x99\xe2\x9d"
+ "\x8a\xe8\x7f\xfb\x9a\x99\x96\xf1\x2c\x4a\xe4\xc0\xae\x4d\x29\x47"
+ "\x38\x96\x51\x2f\x6d\x8e\xb8\x88\xbd\x1a\x0a\x70\xbc\x23\x38\x67"
+ "\x62\x22\x01\x23\x71\xe5\xbb\x95\xea\x6b\x8d\x31\x62\xbf\xf0\xc4"
+ "\xb9\x46\xd6\x67\xfc\x4c\xe6\x1f\xd6\x5d\xf7\xa9\xad\x3a\xf1\xbf"
+ "\xa2\xf9\x66\xde\xb6\x8e\xec\x8f\x81\x8d\x1e\x3a\x12\x27\x6a\xfc"
+ "\xae\x92\x9f\xc3\x87\xc3\xba\x8d\x04\xb8\x8f\x0f\x61\x68\x9a\x96"
+ "\x2c\x80\x2c\x32\x40\xde\x9d\xb9\x9b\xe2\xe4\x45\x2e\x91\x47\x5c"
+ "\x47\xa4\x9d\x02\x57\x59\xf7\x75\x5d\x5f\x32\x82\x75\x5d\xe5\x78"
+ "\xc9\x19\x61\x46\x06\x9d\xa5\x1d\xd6\x32\x48\x9a\xdb\x09\x29\x81"
+ "\x14\x2e\xf0\x27\xe9\x37\x13\x74\xec\xa5\xcd\x67\x6b\x19\xf6\x88"
+ "\xf0\xc2\x8b\xa8\x7f\x2f\x76\x5a\x3e\x0c\x47\x5d\xe8\x82\x50\x27"
+ "\x40\xce\x27\x41\x45\xa0\xcf\xaa\x2f\xd3\xad\x3c\xbf\x73\xff\x93"
+ "\xe3\x78\x49\xd9\xa9\x78\x22\x81\x9a\xe5\xe2\x94\xe9\x40\xab\xf1",
+ .c_size = 256,
+ .public_key_vec = true,
},
};
/*
* PKCS#1 RSA test vectors. Obtained from CAVS testing.
*/
-static const struct akcipher_testvec pkcs1pad_rsa_tv_template[] = {
+static const struct sig_testvec pkcs1_rsa_tv_template[] = {
{
.key =
"\x30\x82\x04\xa5\x02\x01\x00\x02\x82\x01\x01\x00\xd7\x1e\x77\x82"
@@ -1486,7 +2152,6 @@ static const struct akcipher_testvec pkcs1pad_rsa_tv_template[] = {
"\xda\x62\x8d\xe1\x2a\x71\x91\x43\x40\x61\x3c\x5a\xbe\x86\xfc\x5b"
"\xe6\xf9\xa9\x16\x31\x1f\xaf\x25\x6d\xc2\x4a\x23\x6e\x63\x02\xa2",
.c_size = 256,
- .siggen_sigver_test = true,
}
};
diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c
index c0e77c1..3561553 100644
--- a/drivers/acpi/arm64/gtdt.c
+++ b/drivers/acpi/arm64/gtdt.c
@@ -36,19 +36,25 @@ struct acpi_gtdt_descriptor {
static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata;
-static inline __init void *next_platform_timer(void *platform_timer)
+static __init bool platform_timer_valid(void *platform_timer)
{
struct acpi_gtdt_header *gh = platform_timer;
- platform_timer += gh->length;
- if (platform_timer < acpi_gtdt_desc.gtdt_end)
- return platform_timer;
+ return (platform_timer >= (void *)(acpi_gtdt_desc.gtdt + 1) &&
+ platform_timer < acpi_gtdt_desc.gtdt_end &&
+ gh->length != 0 &&
+ platform_timer + gh->length <= acpi_gtdt_desc.gtdt_end);
+}
- return NULL;
+static __init void *next_platform_timer(void *platform_timer)
+{
+ struct acpi_gtdt_header *gh = platform_timer;
+
+ return platform_timer + gh->length;
}
#define for_each_platform_timer(_g) \
- for (_g = acpi_gtdt_desc.platform_timer; _g; \
+ for (_g = acpi_gtdt_desc.platform_timer; platform_timer_valid(_g);\
_g = next_platform_timer(_g))
static inline bool is_timer_block(void *platform_timer)
@@ -157,6 +163,7 @@ int __init acpi_gtdt_init(struct acpi_table_header *table,
{
void *platform_timer;
struct acpi_table_gtdt *gtdt;
+ int cnt = 0;
gtdt = container_of(table, struct acpi_table_gtdt, header);
acpi_gtdt_desc.gtdt = gtdt;
@@ -176,12 +183,16 @@ int __init acpi_gtdt_init(struct acpi_table_header *table,
return 0;
}
- platform_timer = (void *)gtdt + gtdt->platform_timer_offset;
- if (platform_timer < (void *)table + sizeof(struct acpi_table_gtdt)) {
+ acpi_gtdt_desc.platform_timer = (void *)gtdt + gtdt->platform_timer_offset;
+ for_each_platform_timer(platform_timer)
+ cnt++;
+
+ if (cnt != gtdt->platform_timer_count) {
+ acpi_gtdt_desc.platform_timer = NULL;
pr_err(FW_BUG "invalid timer data.\n");
return -EINVAL;
}
- acpi_gtdt_desc.platform_timer = platform_timer;
+
if (platform_timer_count)
*platform_timer_count = gtdt->platform_timer_count;
@@ -283,7 +294,7 @@ static int __init gtdt_parse_timer_block(struct acpi_gtdt_timer_block *block,
if (frame->virt_irq > 0)
acpi_unregister_gsi(gtdt_frame->virtual_timer_interrupt);
frame->virt_irq = 0;
- } while (i-- >= 0 && gtdt_frame--);
+ } while (i-- > 0 && gtdt_frame--);
return -EINVAL;
}
@@ -352,7 +363,7 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd,
}
irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags);
- res[2] = (struct resource)DEFINE_RES_IRQ(irq);
+ res[2] = DEFINE_RES_IRQ(irq);
if (irq <= 0) {
pr_warn("failed to map the Watchdog interrupt.\n");
nr_res--;
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 45f63b0..2d3d3d6 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1676,7 +1676,7 @@ static int ahci_init_msi(struct pci_dev *pdev, unsigned int n_ports,
/*
* If number of MSIs is less than number of ports then Sharing Last
* Message mode could be enforced. In this case assume that advantage
- * of multipe MSIs is negated and use single MSI mode instead.
+ * of multiple MSIs is negated and use single MSI mode instead.
*/
if (n_ports > 1) {
nvec = pci_alloc_irq_vectors(pdev, n_ports, INT_MAX,
diff --git a/drivers/ata/ahci_brcm.c b/drivers/ata/ahci_brcm.c
index 2f16524..ef569ea 100644
--- a/drivers/ata/ahci_brcm.c
+++ b/drivers/ata/ahci_brcm.c
@@ -571,7 +571,7 @@ static SIMPLE_DEV_PM_OPS(ahci_brcm_pm_ops, brcm_ahci_suspend, brcm_ahci_resume);
static struct platform_driver brcm_ahci_driver = {
.probe = brcm_ahci_probe,
- .remove_new = brcm_ahci_remove,
+ .remove = brcm_ahci_remove,
.shutdown = brcm_ahci_shutdown,
.driver = {
.name = DRV_NAME,
diff --git a/drivers/ata/ahci_ceva.c b/drivers/ata/ahci_ceva.c
index 11a2c19..1ec3577 100644
--- a/drivers/ata/ahci_ceva.c
+++ b/drivers/ata/ahci_ceva.c
@@ -402,7 +402,7 @@ MODULE_DEVICE_TABLE(of, ceva_ahci_of_match);
static struct platform_driver ceva_ahci_driver = {
.probe = ceva_ahci_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = DRV_NAME,
.of_match_table = ceva_ahci_of_match,
diff --git a/drivers/ata/ahci_da850.c b/drivers/ata/ahci_da850.c
index 55a6627..ca0924d 100644
--- a/drivers/ata/ahci_da850.c
+++ b/drivers/ata/ahci_da850.c
@@ -238,7 +238,7 @@ MODULE_DEVICE_TABLE(of, ahci_da850_of_match);
static struct platform_driver ahci_da850_driver = {
.probe = ahci_da850_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = DRV_NAME,
.of_match_table = ahci_da850_of_match,
diff --git a/drivers/ata/ahci_dm816.c b/drivers/ata/ahci_dm816.c
index 4cb7006..b08547b 100644
--- a/drivers/ata/ahci_dm816.c
+++ b/drivers/ata/ahci_dm816.c
@@ -182,7 +182,7 @@ MODULE_DEVICE_TABLE(of, ahci_dm816_of_match);
static struct platform_driver ahci_dm816_driver = {
.probe = ahci_dm816_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = AHCI_DM816_DRV_NAME,
.of_match_table = ahci_dm816_of_match,
diff --git a/drivers/ata/ahci_dwc.c b/drivers/ata/ahci_dwc.c
index ed263de..aec6d79 100644
--- a/drivers/ata/ahci_dwc.c
+++ b/drivers/ata/ahci_dwc.c
@@ -478,7 +478,7 @@ MODULE_DEVICE_TABLE(of, ahci_dwc_of_match);
static struct platform_driver ahci_dwc_driver = {
.probe = ahci_dwc_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.shutdown = ahci_platform_shutdown,
.driver = {
.name = DRV_NAME,
diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c
index 6f955e9..f01f080 100644
--- a/drivers/ata/ahci_imx.c
+++ b/drivers/ata/ahci_imx.c
@@ -511,7 +511,7 @@ static int imx_sata_enable(struct ahci_host_priv *hpriv)
if (imxpriv->type == AHCI_IMX6Q || imxpriv->type == AHCI_IMX6QP) {
/*
- * set PHY Paremeters, two steps to configure the GPR13,
+ * set PHY Parameters, two steps to configure the GPR13,
* one write for rest of parameters, mask of first write
* is 0x07ffffff, and the other one write for setting
* the mpll_clk_en.
@@ -1027,7 +1027,7 @@ static SIMPLE_DEV_PM_OPS(ahci_imx_pm_ops, imx_ahci_suspend, imx_ahci_resume);
static struct platform_driver imx_ahci_driver = {
.probe = imx_ahci_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = DRV_NAME,
.of_match_table = imx_ahci_of_match,
diff --git a/drivers/ata/ahci_mtk.c b/drivers/ata/ahci_mtk.c
index adc851c..7295b90 100644
--- a/drivers/ata/ahci_mtk.c
+++ b/drivers/ata/ahci_mtk.c
@@ -174,7 +174,7 @@ MODULE_DEVICE_TABLE(of, ahci_of_match);
static struct platform_driver mtk_ahci_driver = {
.probe = mtk_ahci_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = DRV_NAME,
.of_match_table = ahci_of_match,
diff --git a/drivers/ata/ahci_mvebu.c b/drivers/ata/ahci_mvebu.c
index f318735..8744dae 100644
--- a/drivers/ata/ahci_mvebu.c
+++ b/drivers/ata/ahci_mvebu.c
@@ -245,7 +245,7 @@ MODULE_DEVICE_TABLE(of, ahci_mvebu_of_match);
static struct platform_driver ahci_mvebu_driver = {
.probe = ahci_mvebu_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.suspend = ahci_mvebu_suspend,
.resume = ahci_mvebu_resume,
.driver = {
diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c
index 81fc63f6..c180543 100644
--- a/drivers/ata/ahci_platform.c
+++ b/drivers/ata/ahci_platform.c
@@ -96,7 +96,7 @@ MODULE_DEVICE_TABLE(acpi, ahci_acpi_match);
static struct platform_driver ahci_driver = {
.probe = ahci_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.shutdown = ahci_platform_shutdown,
.driver = {
.name = DRV_NAME,
diff --git a/drivers/ata/ahci_qoriq.c b/drivers/ata/ahci_qoriq.c
index b1a4e57..30e3988 100644
--- a/drivers/ata/ahci_qoriq.c
+++ b/drivers/ata/ahci_qoriq.c
@@ -357,7 +357,7 @@ static SIMPLE_DEV_PM_OPS(ahci_qoriq_pm_ops, ahci_platform_suspend,
static struct platform_driver ahci_qoriq_driver = {
.probe = ahci_qoriq_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = DRV_NAME,
.of_match_table = ahci_qoriq_of_match,
diff --git a/drivers/ata/ahci_seattle.c b/drivers/ata/ahci_seattle.c
index 59f97aa..3f16c16 100644
--- a/drivers/ata/ahci_seattle.c
+++ b/drivers/ata/ahci_seattle.c
@@ -185,7 +185,7 @@ MODULE_DEVICE_TABLE(acpi, ahci_acpi_match);
static struct platform_driver ahci_seattle_driver = {
.probe = ahci_seattle_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = DRV_NAME,
.acpi_match_table = ahci_acpi_match,
diff --git a/drivers/ata/ahci_st.c b/drivers/ata/ahci_st.c
index 79a8b0a..6b9b4a1 100644
--- a/drivers/ata/ahci_st.c
+++ b/drivers/ata/ahci_st.c
@@ -238,7 +238,7 @@ static struct platform_driver st_ahci_driver = {
.of_match_table = st_ahci_match,
},
.probe = st_ahci_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
};
module_platform_driver(st_ahci_driver);
diff --git a/drivers/ata/ahci_sunxi.c b/drivers/ata/ahci_sunxi.c
index 58b2683..5d45845 100644
--- a/drivers/ata/ahci_sunxi.c
+++ b/drivers/ata/ahci_sunxi.c
@@ -292,7 +292,7 @@ MODULE_DEVICE_TABLE(of, ahci_sunxi_of_match);
static struct platform_driver ahci_sunxi_driver = {
.probe = ahci_sunxi_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = DRV_NAME,
.of_match_table = ahci_sunxi_of_match,
diff --git a/drivers/ata/ahci_tegra.c b/drivers/ata/ahci_tegra.c
index 8703c2a..44584ee 100644
--- a/drivers/ata/ahci_tegra.c
+++ b/drivers/ata/ahci_tegra.c
@@ -608,7 +608,7 @@ static int tegra_ahci_probe(struct platform_device *pdev)
static struct platform_driver tegra_ahci_driver = {
.probe = tegra_ahci_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = DRV_NAME,
.of_match_table = tegra_ahci_of_match,
diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c
index 81a1d83..dfbd8c5 100644
--- a/drivers/ata/ahci_xgene.c
+++ b/drivers/ata/ahci_xgene.c
@@ -534,7 +534,7 @@ static int xgene_ahci_softreset(struct ata_link *link, unsigned int *class,
/**
* xgene_ahci_handle_broken_edge_irq - Handle the broken irq.
- * @host: Host that recieved the irq
+ * @host: Host that received the irq
* @irq_masked: HOST_IRQ_STAT value
*
* For hardware with broken edge trigger latch
@@ -859,7 +859,7 @@ static int xgene_ahci_probe(struct platform_device *pdev)
static struct platform_driver xgene_ahci_driver = {
.probe = xgene_ahci_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = DRV_NAME,
.of_match_table = xgene_ahci_of_match,
diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index d36e71f..b7f0bf79 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -86,7 +86,7 @@ static void ata_acpi_detach_device(struct ata_port *ap, struct ata_device *dev)
* @dev: ATA device ACPI event occurred (can be NULL)
* @event: ACPI event which occurred
*
- * All ACPI bay / device realted events end up in this function. If
+ * All ACPI bay / device related events end up in this function. If
* the event is port-wide @dev is NULL. If the event is specific to a
* device, @dev points to it.
*
@@ -832,7 +832,7 @@ void ata_acpi_on_resume(struct ata_port *ap)
dev->flags |= ATA_DFLAG_ACPI_PENDING;
}
} else {
- /* SATA _GTF needs to be evaulated after _SDD and
+ /* SATA _GTF needs to be evaluated after _SDD and
* there's no reason to evaluate IDE _GTF early
* without _STM. Clear cache and schedule _GTF.
*/
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index f915e3d..2ce5bef 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1334,17 +1334,8 @@ static unsigned int ata_scsi_flush_xlat(struct ata_queued_cmd *qc)
*/
static void scsi_6_lba_len(const u8 *cdb, u64 *plba, u32 *plen)
{
- u64 lba = 0;
- u32 len;
-
- lba |= ((u64)(cdb[1] & 0x1f)) << 16;
- lba |= ((u64)cdb[2]) << 8;
- lba |= ((u64)cdb[3]);
-
- len = cdb[4];
-
- *plba = lba;
- *plen = len;
+ *plba = get_unaligned_be24(&cdb[1]) & 0x1fffff;
+ *plen = cdb[4];
}
/**
@@ -1781,15 +1772,10 @@ static int ata_scsi_translate(struct ata_device *dev, struct scsi_cmnd *cmd,
return SCSI_MLQUEUE_HOST_BUSY;
}
-struct ata_scsi_args {
- struct ata_device *dev;
- u16 *id;
- struct scsi_cmnd *cmd;
-};
-
/**
* ata_scsi_rbuf_fill - wrapper for SCSI command simulators
- * @args: device IDENTIFY data / SCSI command of interest.
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
* @actor: Callback hook for desired SCSI command simulator
*
* Takes care of the hard work of simulating a SCSI command...
@@ -1802,30 +1788,32 @@ struct ata_scsi_args {
* LOCKING:
* spin_lock_irqsave(host lock)
*/
-static void ata_scsi_rbuf_fill(struct ata_scsi_args *args,
- unsigned int (*actor)(struct ata_scsi_args *args, u8 *rbuf))
+static void ata_scsi_rbuf_fill(struct ata_device *dev, struct scsi_cmnd *cmd,
+ unsigned int (*actor)(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf))
{
- unsigned int rc;
- struct scsi_cmnd *cmd = args->cmd;
unsigned long flags;
+ unsigned int len;
spin_lock_irqsave(&ata_scsi_rbuf_lock, flags);
memset(ata_scsi_rbuf, 0, ATA_SCSI_RBUF_SIZE);
- rc = actor(args, ata_scsi_rbuf);
- if (rc == 0)
+ len = actor(dev, cmd, ata_scsi_rbuf);
+ if (len) {
sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd),
ata_scsi_rbuf, ATA_SCSI_RBUF_SIZE);
+ cmd->result = SAM_STAT_GOOD;
+ if (scsi_bufflen(cmd) > len)
+ scsi_set_resid(cmd, scsi_bufflen(cmd) - len);
+ }
spin_unlock_irqrestore(&ata_scsi_rbuf_lock, flags);
-
- if (rc == 0)
- cmd->result = SAM_STAT_GOOD;
}
/**
- * ata_scsiop_inq_std - Simulate INQUIRY command
- * @args: device IDENTIFY data / SCSI command of interest.
+ * ata_scsiop_inq_std - Simulate standard INQUIRY command
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
* @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
*
* Returns standard device identification data associated
@@ -1834,7 +1822,8 @@ static void ata_scsi_rbuf_fill(struct ata_scsi_args *args,
* LOCKING:
* spin_lock_irqsave(host lock)
*/
-static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
+static unsigned int ata_scsiop_inq_std(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
static const u8 versions[] = {
0x00,
@@ -1875,40 +1864,45 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
* Set the SCSI Removable Media Bit (RMB) if the ATA removable media
* device bit (obsolete since ATA-8 ACS) is set.
*/
- if (ata_id_removable(args->id))
+ if (ata_id_removable(dev->id))
hdr[1] |= (1 << 7);
- if (args->dev->class == ATA_DEV_ZAC) {
+ if (dev->class == ATA_DEV_ZAC) {
hdr[0] = TYPE_ZBC;
hdr[2] = 0x7; /* claim SPC-5 version compatibility */
}
- if (args->dev->flags & ATA_DFLAG_CDL)
+ if (dev->flags & ATA_DFLAG_CDL)
hdr[2] = 0xd; /* claim SPC-6 version compatibility */
memcpy(rbuf, hdr, sizeof(hdr));
memcpy(&rbuf[8], "ATA ", 8);
- ata_id_string(args->id, &rbuf[16], ATA_ID_PROD, 16);
+ ata_id_string(dev->id, &rbuf[16], ATA_ID_PROD, 16);
/* From SAT, use last 2 words from fw rev unless they are spaces */
- ata_id_string(args->id, &rbuf[32], ATA_ID_FW_REV + 2, 4);
+ ata_id_string(dev->id, &rbuf[32], ATA_ID_FW_REV + 2, 4);
if (strncmp(&rbuf[32], " ", 4) == 0)
- ata_id_string(args->id, &rbuf[32], ATA_ID_FW_REV, 4);
+ ata_id_string(dev->id, &rbuf[32], ATA_ID_FW_REV, 4);
if (rbuf[32] == 0 || rbuf[32] == ' ')
memcpy(&rbuf[32], "n/a ", 4);
- if (ata_id_zoned_cap(args->id) || args->dev->class == ATA_DEV_ZAC)
+ if (ata_id_zoned_cap(dev->id) || dev->class == ATA_DEV_ZAC)
memcpy(rbuf + 58, versions_zbc, sizeof(versions_zbc));
else
memcpy(rbuf + 58, versions, sizeof(versions));
- return 0;
+ /*
+ * Include all 8 possible version descriptors, even if not all of
+ * them are popoulated.
+ */
+ return 96;
}
/**
* ata_scsiop_inq_00 - Simulate INQUIRY VPD page 0, list of pages
- * @args: device IDENTIFY data / SCSI command of interest.
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
* @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
*
* Returns list of inquiry VPD pages available.
@@ -1916,7 +1910,8 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf)
* LOCKING:
* spin_lock_irqsave(host lock)
*/
-static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
+static unsigned int ata_scsiop_inq_00(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
int i, num_pages = 0;
static const u8 pages[] = {
@@ -1933,18 +1928,20 @@ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
for (i = 0; i < sizeof(pages); i++) {
if (pages[i] == 0xb6 &&
- !(args->dev->flags & ATA_DFLAG_ZAC))
+ !(dev->flags & ATA_DFLAG_ZAC))
continue;
rbuf[num_pages + 4] = pages[i];
num_pages++;
}
rbuf[3] = num_pages; /* number of supported VPD pages */
- return 0;
+
+ return get_unaligned_be16(&rbuf[2]) + 4;
}
/**
* ata_scsiop_inq_80 - Simulate INQUIRY VPD page 80, device serial number
- * @args: device IDENTIFY data / SCSI command of interest.
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
* @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
*
* Returns ATA device serial number.
@@ -1952,7 +1949,8 @@ static unsigned int ata_scsiop_inq_00(struct ata_scsi_args *args, u8 *rbuf)
* LOCKING:
* spin_lock_irqsave(host lock)
*/
-static unsigned int ata_scsiop_inq_80(struct ata_scsi_args *args, u8 *rbuf)
+static unsigned int ata_scsiop_inq_80(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
static const u8 hdr[] = {
0,
@@ -1962,14 +1960,16 @@ static unsigned int ata_scsiop_inq_80(struct ata_scsi_args *args, u8 *rbuf)
};
memcpy(rbuf, hdr, sizeof(hdr));
- ata_id_string(args->id, (unsigned char *) &rbuf[4],
+ ata_id_string(dev->id, (unsigned char *) &rbuf[4],
ATA_ID_SERNO, ATA_ID_SERNO_LEN);
- return 0;
+
+ return get_unaligned_be16(&rbuf[2]) + 4;
}
/**
* ata_scsiop_inq_83 - Simulate INQUIRY VPD page 83, device identity
- * @args: device IDENTIFY data / SCSI command of interest.
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
* @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
*
* Yields two logical unit device identification designators:
@@ -1980,7 +1980,8 @@ static unsigned int ata_scsiop_inq_80(struct ata_scsi_args *args, u8 *rbuf)
* LOCKING:
* spin_lock_irqsave(host lock)
*/
-static unsigned int ata_scsiop_inq_83(struct ata_scsi_args *args, u8 *rbuf)
+static unsigned int ata_scsiop_inq_83(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
const int sat_model_serial_desc_len = 68;
int num;
@@ -1992,7 +1993,7 @@ static unsigned int ata_scsiop_inq_83(struct ata_scsi_args *args, u8 *rbuf)
rbuf[num + 0] = 2;
rbuf[num + 3] = ATA_ID_SERNO_LEN;
num += 4;
- ata_id_string(args->id, (unsigned char *) rbuf + num,
+ ata_id_string(dev->id, (unsigned char *) rbuf + num,
ATA_ID_SERNO, ATA_ID_SERNO_LEN);
num += ATA_ID_SERNO_LEN;
@@ -2004,31 +2005,33 @@ static unsigned int ata_scsiop_inq_83(struct ata_scsi_args *args, u8 *rbuf)
num += 4;
memcpy(rbuf + num, "ATA ", 8);
num += 8;
- ata_id_string(args->id, (unsigned char *) rbuf + num, ATA_ID_PROD,
+ ata_id_string(dev->id, (unsigned char *) rbuf + num, ATA_ID_PROD,
ATA_ID_PROD_LEN);
num += ATA_ID_PROD_LEN;
- ata_id_string(args->id, (unsigned char *) rbuf + num, ATA_ID_SERNO,
+ ata_id_string(dev->id, (unsigned char *) rbuf + num, ATA_ID_SERNO,
ATA_ID_SERNO_LEN);
num += ATA_ID_SERNO_LEN;
- if (ata_id_has_wwn(args->id)) {
+ if (ata_id_has_wwn(dev->id)) {
/* SAT defined lu world wide name */
/* piv=0, assoc=lu, code_set=binary, designator=NAA */
rbuf[num + 0] = 1;
rbuf[num + 1] = 3;
rbuf[num + 3] = ATA_ID_WWN_LEN;
num += 4;
- ata_id_string(args->id, (unsigned char *) rbuf + num,
+ ata_id_string(dev->id, (unsigned char *) rbuf + num,
ATA_ID_WWN, ATA_ID_WWN_LEN);
num += ATA_ID_WWN_LEN;
}
rbuf[3] = num - 4; /* page len (assume less than 256 bytes) */
- return 0;
+
+ return get_unaligned_be16(&rbuf[2]) + 4;
}
/**
* ata_scsiop_inq_89 - Simulate INQUIRY VPD page 89, ATA info
- * @args: device IDENTIFY data / SCSI command of interest.
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
* @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
*
* Yields SAT-specified ATA VPD page.
@@ -2036,7 +2039,8 @@ static unsigned int ata_scsiop_inq_83(struct ata_scsi_args *args, u8 *rbuf)
* LOCKING:
* spin_lock_irqsave(host lock)
*/
-static unsigned int ata_scsiop_inq_89(struct ata_scsi_args *args, u8 *rbuf)
+static unsigned int ata_scsiop_inq_89(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
rbuf[1] = 0x89; /* our page code */
rbuf[2] = (0x238 >> 8); /* page size fixed at 238h */
@@ -2057,13 +2061,25 @@ static unsigned int ata_scsiop_inq_89(struct ata_scsi_args *args, u8 *rbuf)
rbuf[56] = ATA_CMD_ID_ATA;
- memcpy(&rbuf[60], &args->id[0], 512);
- return 0;
+ memcpy(&rbuf[60], &dev->id[0], 512);
+
+ return get_unaligned_be16(&rbuf[2]) + 4;
}
-static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf)
+/**
+ * ata_scsiop_inq_b0 - Simulate INQUIRY VPD page B0, Block Limits
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
+ * @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
+ *
+ * Return data for the VPD page B0h (Block Limits).
+ *
+ * LOCKING:
+ * spin_lock_irqsave(host lock)
+ */
+static unsigned int ata_scsiop_inq_b0(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
- struct ata_device *dev = args->dev;
u16 min_io_sectors;
rbuf[1] = 0xb0;
@@ -2076,7 +2092,7 @@ static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf)
* logical than physical sector size we need to figure out what the
* latter is.
*/
- min_io_sectors = 1 << ata_id_log2_per_physical_sector(args->id);
+ min_io_sectors = 1 << ata_id_log2_per_physical_sector(dev->id);
put_unaligned_be16(min_io_sectors, &rbuf[6]);
/*
@@ -2088,7 +2104,7 @@ static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf)
* that we support some form of unmap - in thise case via WRITE SAME
* with the unmap bit set.
*/
- if (ata_id_has_trim(args->id)) {
+ if (ata_id_has_trim(dev->id)) {
u64 max_blocks = 65535 * ATA_MAX_TRIM_RNUM;
if (dev->quirks & ATA_QUIRK_MAX_TRIM_128M)
@@ -2098,14 +2114,27 @@ static unsigned int ata_scsiop_inq_b0(struct ata_scsi_args *args, u8 *rbuf)
put_unaligned_be32(1, &rbuf[28]);
}
- return 0;
+ return get_unaligned_be16(&rbuf[2]) + 4;
}
-static unsigned int ata_scsiop_inq_b1(struct ata_scsi_args *args, u8 *rbuf)
+/**
+ * ata_scsiop_inq_b1 - Simulate INQUIRY VPD page B1, Block Device
+ * Characteristics
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
+ * @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
+ *
+ * Return data for the VPD page B1h (Block Device Characteristics).
+ *
+ * LOCKING:
+ * spin_lock_irqsave(host lock)
+ */
+static unsigned int ata_scsiop_inq_b1(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
- int form_factor = ata_id_form_factor(args->id);
- int media_rotation_rate = ata_id_rotation_rate(args->id);
- u8 zoned = ata_id_zoned_cap(args->id);
+ int form_factor = ata_id_form_factor(dev->id);
+ int media_rotation_rate = ata_id_rotation_rate(dev->id);
+ u8 zoned = ata_id_zoned_cap(dev->id);
rbuf[1] = 0xb1;
rbuf[3] = 0x3c;
@@ -2115,21 +2144,52 @@ static unsigned int ata_scsiop_inq_b1(struct ata_scsi_args *args, u8 *rbuf)
if (zoned)
rbuf[8] = (zoned << 4);
- return 0;
+ return get_unaligned_be16(&rbuf[2]) + 4;
}
-static unsigned int ata_scsiop_inq_b2(struct ata_scsi_args *args, u8 *rbuf)
+/**
+ * ata_scsiop_inq_b2 - Simulate INQUIRY VPD page B2, Logical Block
+ * Provisioning
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
+ * @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
+ *
+ * Return data for the VPD page B2h (Logical Block Provisioning).
+ *
+ * LOCKING:
+ * spin_lock_irqsave(host lock)
+ */
+static unsigned int ata_scsiop_inq_b2(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
/* SCSI Thin Provisioning VPD page: SBC-3 rev 22 or later */
rbuf[1] = 0xb2;
rbuf[3] = 0x4;
rbuf[5] = 1 << 6; /* TPWS */
- return 0;
+ return get_unaligned_be16(&rbuf[2]) + 4;
}
-static unsigned int ata_scsiop_inq_b6(struct ata_scsi_args *args, u8 *rbuf)
+/**
+ * ata_scsiop_inq_b6 - Simulate INQUIRY VPD page B6, Zoned Block Device
+ * Characteristics
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
+ * @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
+ *
+ * Return data for the VPD page B2h (Zoned Block Device Characteristics).
+ *
+ * LOCKING:
+ * spin_lock_irqsave(host lock)
+ */
+static unsigned int ata_scsiop_inq_b6(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
+ if (!(dev->flags & ATA_DFLAG_ZAC)) {
+ ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
+ return 0;
+ }
+
/*
* zbc-r05 SCSI Zoned Block device characteristics VPD page
*/
@@ -2139,21 +2199,39 @@ static unsigned int ata_scsiop_inq_b6(struct ata_scsi_args *args, u8 *rbuf)
/*
* URSWRZ bit is only meaningful for host-managed ZAC drives
*/
- if (args->dev->zac_zoned_cap & 1)
+ if (dev->zac_zoned_cap & 1)
rbuf[4] |= 1;
- put_unaligned_be32(args->dev->zac_zones_optimal_open, &rbuf[8]);
- put_unaligned_be32(args->dev->zac_zones_optimal_nonseq, &rbuf[12]);
- put_unaligned_be32(args->dev->zac_zones_max_open, &rbuf[16]);
+ put_unaligned_be32(dev->zac_zones_optimal_open, &rbuf[8]);
+ put_unaligned_be32(dev->zac_zones_optimal_nonseq, &rbuf[12]);
+ put_unaligned_be32(dev->zac_zones_max_open, &rbuf[16]);
- return 0;
+ return get_unaligned_be16(&rbuf[2]) + 4;
}
-static unsigned int ata_scsiop_inq_b9(struct ata_scsi_args *args, u8 *rbuf)
+/**
+ * ata_scsiop_inq_b9 - Simulate INQUIRY VPD page B9, Concurrent Positioning
+ * Ranges
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
+ * @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
+ *
+ * Return data for the VPD page B9h (Concurrent Positioning Ranges).
+ *
+ * LOCKING:
+ * spin_lock_irqsave(host lock)
+ */
+static unsigned int ata_scsiop_inq_b9(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
- struct ata_cpr_log *cpr_log = args->dev->cpr_log;
+ struct ata_cpr_log *cpr_log = dev->cpr_log;
u8 *desc = &rbuf[64];
int i;
+ if (!cpr_log) {
+ ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
+ return 0;
+ }
+
/* SCSI Concurrent Positioning Ranges VPD page: SBC-5 rev 1 or later */
rbuf[1] = 0xb9;
put_unaligned_be16(64 + (int)cpr_log->nr_cpr * 32 - 4, &rbuf[2]);
@@ -2165,7 +2243,58 @@ static unsigned int ata_scsiop_inq_b9(struct ata_scsi_args *args, u8 *rbuf)
put_unaligned_be64(cpr_log->cpr[i].num_lbas, &desc[16]);
}
- return 0;
+ return get_unaligned_be16(&rbuf[2]) + 4;
+}
+
+/**
+ * ata_scsiop_inquiry - Simulate INQUIRY command
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
+ * @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
+ *
+ * Returns data associated with an INQUIRY command output.
+ *
+ * LOCKING:
+ * spin_lock_irqsave(host lock)
+ */
+static unsigned int ata_scsiop_inquiry(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
+{
+ const u8 *scsicmd = cmd->cmnd;
+
+ /* is CmdDt set? */
+ if (scsicmd[1] & 2) {
+ ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
+ return 0;
+ }
+
+ /* Is EVPD clear? */
+ if ((scsicmd[1] & 1) == 0)
+ return ata_scsiop_inq_std(dev, cmd, rbuf);
+
+ switch (scsicmd[2]) {
+ case 0x00:
+ return ata_scsiop_inq_00(dev, cmd, rbuf);
+ case 0x80:
+ return ata_scsiop_inq_80(dev, cmd, rbuf);
+ case 0x83:
+ return ata_scsiop_inq_83(dev, cmd, rbuf);
+ case 0x89:
+ return ata_scsiop_inq_89(dev, cmd, rbuf);
+ case 0xb0:
+ return ata_scsiop_inq_b0(dev, cmd, rbuf);
+ case 0xb1:
+ return ata_scsiop_inq_b1(dev, cmd, rbuf);
+ case 0xb2:
+ return ata_scsiop_inq_b2(dev, cmd, rbuf);
+ case 0xb6:
+ return ata_scsiop_inq_b6(dev, cmd, rbuf);
+ case 0xb9:
+ return ata_scsiop_inq_b9(dev, cmd, rbuf);
+ default:
+ ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
+ return 0;
+ }
}
/**
@@ -2388,7 +2517,8 @@ static unsigned int ata_msense_rw_recovery(u8 *buf, bool changeable)
/**
* ata_scsiop_mode_sense - Simulate MODE SENSE 6, 10 commands
- * @args: device IDENTIFY data / SCSI command of interest.
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
* @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
*
* Simulate MODE SENSE commands. Assume this is invoked for direct
@@ -2398,10 +2528,10 @@ static unsigned int ata_msense_rw_recovery(u8 *buf, bool changeable)
* LOCKING:
* spin_lock_irqsave(host lock)
*/
-static unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf)
+static unsigned int ata_scsiop_mode_sense(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
- struct ata_device *dev = args->dev;
- u8 *scsicmd = args->cmd->cmnd, *p = rbuf;
+ u8 *scsicmd = cmd->cmnd, *p = rbuf;
static const u8 sat_blk_desc[] = {
0, 0, 0, 0, /* number of blocks: sat unspecified */
0,
@@ -2466,17 +2596,17 @@ static unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf)
break;
case CACHE_MPAGE:
- p += ata_msense_caching(args->id, p, page_control == 1);
+ p += ata_msense_caching(dev->id, p, page_control == 1);
break;
case CONTROL_MPAGE:
- p += ata_msense_control(args->dev, p, spg, page_control == 1);
+ p += ata_msense_control(dev, p, spg, page_control == 1);
break;
case ALL_MPAGES:
p += ata_msense_rw_recovery(p, page_control == 1);
- p += ata_msense_caching(args->id, p, page_control == 1);
- p += ata_msense_control(args->dev, p, spg, page_control == 1);
+ p += ata_msense_caching(dev->id, p, page_control == 1);
+ p += ata_msense_control(dev, p, spg, page_control == 1);
break;
default: /* invalid page code */
@@ -2494,29 +2624,33 @@ static unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf)
rbuf[3] = sizeof(sat_blk_desc);
memcpy(rbuf + 4, sat_blk_desc, sizeof(sat_blk_desc));
}
- } else {
- put_unaligned_be16(p - rbuf - 2, &rbuf[0]);
- rbuf[3] |= dpofua;
- if (ebd) {
- rbuf[7] = sizeof(sat_blk_desc);
- memcpy(rbuf + 8, sat_blk_desc, sizeof(sat_blk_desc));
- }
+
+ return rbuf[0] + 1;
}
- return 0;
+
+ put_unaligned_be16(p - rbuf - 2, &rbuf[0]);
+ rbuf[3] |= dpofua;
+ if (ebd) {
+ rbuf[7] = sizeof(sat_blk_desc);
+ memcpy(rbuf + 8, sat_blk_desc, sizeof(sat_blk_desc));
+ }
+
+ return get_unaligned_be16(&rbuf[0]) + 2;
invalid_fld:
- ata_scsi_set_invalid_field(dev, args->cmd, fp, bp);
- return 1;
+ ata_scsi_set_invalid_field(dev, cmd, fp, bp);
+ return 0;
saving_not_supp:
- ata_scsi_set_sense(dev, args->cmd, ILLEGAL_REQUEST, 0x39, 0x0);
+ ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x39, 0x0);
/* "Saving parameters not supported" */
- return 1;
+ return 0;
}
/**
* ata_scsiop_read_cap - Simulate READ CAPACITY[ 16] commands
- * @args: device IDENTIFY data / SCSI command of interest.
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
* @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
*
* Simulate READ CAPACITY commands.
@@ -2524,9 +2658,10 @@ static unsigned int ata_scsiop_mode_sense(struct ata_scsi_args *args, u8 *rbuf)
* LOCKING:
* None.
*/
-static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
+static unsigned int ata_scsiop_read_cap(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
- struct ata_device *dev = args->dev;
+ u8 *scsicmd = cmd->cmnd;
u64 last_lba = dev->n_sectors - 1; /* LBA of the last block */
u32 sector_size; /* physical sector size in bytes */
u8 log2_per_phys;
@@ -2536,7 +2671,7 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
log2_per_phys = ata_id_log2_per_physical_sector(dev->id);
lowest_aligned = ata_id_logical_sector_offset(dev->id, log2_per_phys);
- if (args->cmd->cmnd[0] == READ_CAPACITY) {
+ if (scsicmd[0] == READ_CAPACITY) {
if (last_lba >= 0xffffffffULL)
last_lba = 0xffffffff;
@@ -2551,48 +2686,59 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
rbuf[5] = sector_size >> (8 * 2);
rbuf[6] = sector_size >> (8 * 1);
rbuf[7] = sector_size;
- } else {
- /* sector count, 64-bit */
- rbuf[0] = last_lba >> (8 * 7);
- rbuf[1] = last_lba >> (8 * 6);
- rbuf[2] = last_lba >> (8 * 5);
- rbuf[3] = last_lba >> (8 * 4);
- rbuf[4] = last_lba >> (8 * 3);
- rbuf[5] = last_lba >> (8 * 2);
- rbuf[6] = last_lba >> (8 * 1);
- rbuf[7] = last_lba;
- /* sector size */
- rbuf[ 8] = sector_size >> (8 * 3);
- rbuf[ 9] = sector_size >> (8 * 2);
- rbuf[10] = sector_size >> (8 * 1);
- rbuf[11] = sector_size;
-
- rbuf[12] = 0;
- rbuf[13] = log2_per_phys;
- rbuf[14] = (lowest_aligned >> 8) & 0x3f;
- rbuf[15] = lowest_aligned;
-
- if (ata_id_has_trim(args->id) &&
- !(dev->quirks & ATA_QUIRK_NOTRIM)) {
- rbuf[14] |= 0x80; /* LBPME */
-
- if (ata_id_has_zero_after_trim(args->id) &&
- dev->quirks & ATA_QUIRK_ZERO_AFTER_TRIM) {
- ata_dev_info(dev, "Enabling discard_zeroes_data\n");
- rbuf[14] |= 0x40; /* LBPRZ */
- }
- }
- if (ata_id_zoned_cap(args->id) ||
- args->dev->class == ATA_DEV_ZAC)
- rbuf[12] = (1 << 4); /* RC_BASIS */
+ return 8;
}
- return 0;
+
+ /*
+ * READ CAPACITY 16 command is defined as a service action
+ * (SERVICE_ACTION_IN_16 command).
+ */
+ if (scsicmd[0] != SERVICE_ACTION_IN_16 ||
+ (scsicmd[1] & 0x1f) != SAI_READ_CAPACITY_16) {
+ ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
+ return 0;
+ }
+
+ /* sector count, 64-bit */
+ rbuf[0] = last_lba >> (8 * 7);
+ rbuf[1] = last_lba >> (8 * 6);
+ rbuf[2] = last_lba >> (8 * 5);
+ rbuf[3] = last_lba >> (8 * 4);
+ rbuf[4] = last_lba >> (8 * 3);
+ rbuf[5] = last_lba >> (8 * 2);
+ rbuf[6] = last_lba >> (8 * 1);
+ rbuf[7] = last_lba;
+
+ /* sector size */
+ rbuf[ 8] = sector_size >> (8 * 3);
+ rbuf[ 9] = sector_size >> (8 * 2);
+ rbuf[10] = sector_size >> (8 * 1);
+ rbuf[11] = sector_size;
+
+ if (ata_id_zoned_cap(dev->id) || dev->class == ATA_DEV_ZAC)
+ rbuf[12] = (1 << 4); /* RC_BASIS */
+ rbuf[13] = log2_per_phys;
+ rbuf[14] = (lowest_aligned >> 8) & 0x3f;
+ rbuf[15] = lowest_aligned;
+
+ if (ata_id_has_trim(dev->id) && !(dev->quirks & ATA_QUIRK_NOTRIM)) {
+ rbuf[14] |= 0x80; /* LBPME */
+
+ if (ata_id_has_zero_after_trim(dev->id) &&
+ dev->quirks & ATA_QUIRK_ZERO_AFTER_TRIM) {
+ ata_dev_info(dev, "Enabling discard_zeroes_data\n");
+ rbuf[14] |= 0x40; /* LBPRZ */
+ }
+ }
+
+ return 16;
}
/**
* ata_scsiop_report_luns - Simulate REPORT LUNS command
- * @args: device IDENTIFY data / SCSI command of interest.
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
* @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
*
* Simulate REPORT LUNS command.
@@ -2600,11 +2746,12 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf)
* LOCKING:
* spin_lock_irqsave(host lock)
*/
-static unsigned int ata_scsiop_report_luns(struct ata_scsi_args *args, u8 *rbuf)
+static unsigned int ata_scsiop_report_luns(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
rbuf[3] = 8; /* just one lun, LUN 0, size 8 bytes */
- return 0;
+ return 16;
}
/*
@@ -3312,7 +3459,8 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
/**
* ata_scsiop_maint_in - Simulate a subset of MAINTENANCE_IN
- * @args: device MAINTENANCE_IN data / SCSI command of interest.
+ * @dev: Target device.
+ * @cmd: SCSI command of interest.
* @rbuf: Response buffer, to which simulated SCSI cmd output is sent.
*
* Yields a subset to satisfy scsi_report_opcode()
@@ -3320,17 +3468,21 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
* LOCKING:
* spin_lock_irqsave(host lock)
*/
-static unsigned int ata_scsiop_maint_in(struct ata_scsi_args *args, u8 *rbuf)
+static unsigned int ata_scsiop_maint_in(struct ata_device *dev,
+ struct scsi_cmnd *cmd, u8 *rbuf)
{
- struct ata_device *dev = args->dev;
- u8 *cdb = args->cmd->cmnd;
+ u8 *cdb = cmd->cmnd;
u8 supported = 0, cdlp = 0, rwcdlp = 0;
- unsigned int err = 0;
+
+ if ((cdb[1] & 0x1f) != MI_REPORT_SUPPORTED_OPERATION_CODES) {
+ ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
+ return 0;
+ }
if (cdb[2] != 1 && cdb[2] != 3) {
ata_dev_warn(dev, "invalid command format %d\n", cdb[2]);
- err = 2;
- goto out;
+ ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
+ return 0;
}
switch (cdb[3]) {
@@ -3398,11 +3550,12 @@ static unsigned int ata_scsiop_maint_in(struct ata_scsi_args *args, u8 *rbuf)
default:
break;
}
-out:
+
/* One command format */
rbuf[0] = rwcdlp;
rbuf[1] = cdlp | supported;
- return err;
+
+ return 4;
}
/**
@@ -4262,78 +4415,26 @@ EXPORT_SYMBOL_GPL(ata_scsi_queuecmd);
void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
{
- struct ata_scsi_args args;
const u8 *scsicmd = cmd->cmnd;
u8 tmp8;
- args.dev = dev;
- args.id = dev->id;
- args.cmd = cmd;
-
switch(scsicmd[0]) {
case INQUIRY:
- if (scsicmd[1] & 2) /* is CmdDt set? */
- ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
- else if ((scsicmd[1] & 1) == 0) /* is EVPD clear? */
- ata_scsi_rbuf_fill(&args, ata_scsiop_inq_std);
- else switch (scsicmd[2]) {
- case 0x00:
- ata_scsi_rbuf_fill(&args, ata_scsiop_inq_00);
- break;
- case 0x80:
- ata_scsi_rbuf_fill(&args, ata_scsiop_inq_80);
- break;
- case 0x83:
- ata_scsi_rbuf_fill(&args, ata_scsiop_inq_83);
- break;
- case 0x89:
- ata_scsi_rbuf_fill(&args, ata_scsiop_inq_89);
- break;
- case 0xb0:
- ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b0);
- break;
- case 0xb1:
- ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b1);
- break;
- case 0xb2:
- ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b2);
- break;
- case 0xb6:
- if (dev->flags & ATA_DFLAG_ZAC)
- ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b6);
- else
- ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
- break;
- case 0xb9:
- if (dev->cpr_log)
- ata_scsi_rbuf_fill(&args, ata_scsiop_inq_b9);
- else
- ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
- break;
- default:
- ata_scsi_set_invalid_field(dev, cmd, 2, 0xff);
- break;
- }
+ ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_inquiry);
break;
case MODE_SENSE:
case MODE_SENSE_10:
- ata_scsi_rbuf_fill(&args, ata_scsiop_mode_sense);
+ ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_mode_sense);
break;
case READ_CAPACITY:
- ata_scsi_rbuf_fill(&args, ata_scsiop_read_cap);
- break;
-
case SERVICE_ACTION_IN_16:
- if ((scsicmd[1] & 0x1f) == SAI_READ_CAPACITY_16)
- ata_scsi_rbuf_fill(&args, ata_scsiop_read_cap);
- else
- ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
+ ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_read_cap);
break;
case REPORT_LUNS:
- ata_scsi_rbuf_fill(&args, ata_scsiop_report_luns);
+ ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_report_luns);
break;
case REQUEST_SENSE:
@@ -4361,10 +4462,7 @@ void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd)
break;
case MAINTENANCE_IN:
- if ((scsicmd[1] & 0x1f) == MI_REPORT_SUPPORTED_OPERATION_CODES)
- ata_scsi_rbuf_fill(&args, ata_scsiop_maint_in);
- else
- ata_scsi_set_invalid_field(dev, cmd, 1, 0xff);
+ ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_maint_in);
break;
/* all other commands */
diff --git a/drivers/ata/pata_arasan_cf.c b/drivers/ata/pata_arasan_cf.c
index d0c6924..514d549 100644
--- a/drivers/ata/pata_arasan_cf.c
+++ b/drivers/ata/pata_arasan_cf.c
@@ -964,7 +964,7 @@ MODULE_DEVICE_TABLE(of, arasan_cf_id_table);
static struct platform_driver arasan_cf_driver = {
.probe = arasan_cf_probe,
- .remove_new = arasan_cf_remove,
+ .remove = arasan_cf_remove,
.driver = {
.name = DRIVER_NAME,
.pm = &arasan_cf_pm_ops,
diff --git a/drivers/ata/pata_ep93xx.c b/drivers/ata/pata_ep93xx.c
index f3f5b2b..e8cda98 100644
--- a/drivers/ata/pata_ep93xx.c
+++ b/drivers/ata/pata_ep93xx.c
@@ -1015,7 +1015,7 @@ static struct platform_driver ep93xx_pata_platform_driver = {
.of_match_table = ep93xx_pata_of_ids,
},
.probe = ep93xx_pata_probe,
- .remove_new = ep93xx_pata_remove,
+ .remove = ep93xx_pata_remove,
};
module_platform_driver(ep93xx_pata_platform_driver);
diff --git a/drivers/ata/pata_falcon.c b/drivers/ata/pata_falcon.c
index 18ceefd..334c4ee 100644
--- a/drivers/ata/pata_falcon.c
+++ b/drivers/ata/pata_falcon.c
@@ -225,8 +225,8 @@ static void pata_falcon_remove_one(struct platform_device *pdev)
static struct platform_driver pata_falcon_driver = {
.probe = pata_falcon_init_one,
- .remove_new = pata_falcon_remove_one,
- .driver = {
+ .remove = pata_falcon_remove_one,
+ .driver = {
.name = "atari-falcon-ide",
},
};
diff --git a/drivers/ata/pata_ftide010.c b/drivers/ata/pata_ftide010.c
index 73a9a51..c3a8384 100644
--- a/drivers/ata/pata_ftide010.c
+++ b/drivers/ata/pata_ftide010.c
@@ -557,7 +557,7 @@ static struct platform_driver pata_ftide010_driver = {
.of_match_table = pata_ftide010_of_match,
},
.probe = pata_ftide010_probe,
- .remove_new = pata_ftide010_remove,
+ .remove = pata_ftide010_remove,
};
module_platform_driver(pata_ftide010_driver);
diff --git a/drivers/ata/pata_gayle.c b/drivers/ata/pata_gayle.c
index 94df60a..8602c38 100644
--- a/drivers/ata/pata_gayle.c
+++ b/drivers/ata/pata_gayle.c
@@ -202,9 +202,9 @@ static void pata_gayle_remove_one(struct platform_device *pdev)
static struct platform_driver pata_gayle_driver = {
.probe = pata_gayle_init_one,
- .remove_new = pata_gayle_remove_one,
- .driver = {
- .name = "amiga-gayle-ide",
+ .remove = pata_gayle_remove_one,
+ .driver = {
+ .name = "amiga-gayle-ide",
},
};
diff --git a/drivers/ata/pata_imx.c b/drivers/ata/pata_imx.c
index d0aa8fc..b37682b 100644
--- a/drivers/ata/pata_imx.c
+++ b/drivers/ata/pata_imx.c
@@ -249,7 +249,7 @@ MODULE_DEVICE_TABLE(of, imx_pata_dt_ids);
static struct platform_driver pata_imx_driver = {
.probe = pata_imx_probe,
- .remove_new = pata_imx_remove,
+ .remove = pata_imx_remove,
.driver = {
.name = DRV_NAME,
.of_match_table = imx_pata_dt_ids,
diff --git a/drivers/ata/pata_it8213.c b/drivers/ata/pata_it8213.c
index b7ac561..9cbe213 100644
--- a/drivers/ata/pata_it8213.c
+++ b/drivers/ata/pata_it8213.c
@@ -81,7 +81,7 @@ static void it8213_set_piomode (struct ata_port *ap, struct ata_device *adev)
int control = 0;
/*
- * See Intel Document 298600-004 for the timing programing rules
+ * See Intel Document 298600-004 for the timing programming rules
* for PIIX/ICH. The 8213 is a clone so very similar
*/
diff --git a/drivers/ata/pata_ixp4xx_cf.c b/drivers/ata/pata_ixp4xx_cf.c
index 8a9ee82..80f6a91 100644
--- a/drivers/ata/pata_ixp4xx_cf.c
+++ b/drivers/ata/pata_ixp4xx_cf.c
@@ -298,7 +298,7 @@ static struct platform_driver ixp4xx_pata_platform_driver = {
.of_match_table = ixp4xx_pata_of_match,
},
.probe = ixp4xx_pata_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
};
module_platform_driver(ixp4xx_pata_platform_driver);
diff --git a/drivers/ata/pata_mpc52xx.c b/drivers/ata/pata_mpc52xx.c
index 3f92586..210a632 100644
--- a/drivers/ata/pata_mpc52xx.c
+++ b/drivers/ata/pata_mpc52xx.c
@@ -854,7 +854,7 @@ static const struct of_device_id mpc52xx_ata_of_match[] = {
static struct platform_driver mpc52xx_ata_of_platform_driver = {
.probe = mpc52xx_ata_probe,
- .remove_new = mpc52xx_ata_remove,
+ .remove = mpc52xx_ata_remove,
#ifdef CONFIG_PM_SLEEP
.suspend = mpc52xx_ata_suspend,
.resume = mpc52xx_ata_resume,
diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c
index 0bb9607..dce2480 100644
--- a/drivers/ata/pata_octeon_cf.c
+++ b/drivers/ata/pata_octeon_cf.c
@@ -183,7 +183,7 @@ static void octeon_cf_set_piomode(struct ata_port *ap, struct ata_device *dev)
reg_tim.s.ale = 0;
/* Not used */
reg_tim.s.page = 0;
- /* Time after IORDY to coninue to assert the data */
+ /* Time after IORDY to continue to assert the data */
reg_tim.s.wait = 0;
/* Time to wait to complete the cycle. */
reg_tim.s.pause = pause;
diff --git a/drivers/ata/pata_of_platform.c b/drivers/ata/pata_of_platform.c
index 4956f0f..178b28e 100644
--- a/drivers/ata/pata_of_platform.c
+++ b/drivers/ata/pata_of_platform.c
@@ -89,7 +89,7 @@ static struct platform_driver pata_of_platform_driver = {
.of_match_table = pata_of_platform_match,
},
.probe = pata_of_platform_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
};
module_platform_driver(pata_of_platform_driver);
diff --git a/drivers/ata/pata_oldpiix.c b/drivers/ata/pata_oldpiix.c
index dca82d9..3d01b70 100644
--- a/drivers/ata/pata_oldpiix.c
+++ b/drivers/ata/pata_oldpiix.c
@@ -70,7 +70,7 @@ static void oldpiix_set_piomode (struct ata_port *ap, struct ata_device *adev)
int control = 0;
/*
- * See Intel Document 298600-004 for the timing programing rules
+ * See Intel Document 298600-004 for the timing programming rules
* for PIIX/ICH. Note that the early PIIX does not have the slave
* timing port at 0x44.
*/
diff --git a/drivers/ata/pata_platform.c b/drivers/ata/pata_platform.c
index 232c3da..87479bc 100644
--- a/drivers/ata/pata_platform.c
+++ b/drivers/ata/pata_platform.c
@@ -223,7 +223,7 @@ static int pata_platform_probe(struct platform_device *pdev)
static struct platform_driver pata_platform_driver = {
.probe = pata_platform_probe,
- .remove_new = ata_platform_remove_one,
+ .remove = ata_platform_remove_one,
.driver = {
.name = DRV_NAME,
},
diff --git a/drivers/ata/pata_pxa.c b/drivers/ata/pata_pxa.c
index 538bd34..434f380 100644
--- a/drivers/ata/pata_pxa.c
+++ b/drivers/ata/pata_pxa.c
@@ -306,7 +306,7 @@ static void pxa_ata_remove(struct platform_device *pdev)
static struct platform_driver pxa_ata_driver = {
.probe = pxa_ata_probe,
- .remove_new = pxa_ata_remove,
+ .remove = pxa_ata_remove,
.driver = {
.name = DRV_NAME,
},
diff --git a/drivers/ata/pata_radisys.c b/drivers/ata/pata_radisys.c
index 84b0010..40ef807 100644
--- a/drivers/ata/pata_radisys.c
+++ b/drivers/ata/pata_radisys.c
@@ -45,7 +45,7 @@ static void radisys_set_piomode (struct ata_port *ap, struct ata_device *adev)
int control = 0;
/*
- * See Intel Document 298600-004 for the timing programing rules
+ * See Intel Document 298600-004 for the timing programming rules
* for PIIX/ICH. Note that the early PIIX does not have the slave
* timing port at 0x44. The Radisys is a relative of the PIIX
* but not the same so be careful.
diff --git a/drivers/ata/pata_rb532_cf.c b/drivers/ata/pata_rb532_cf.c
index 0fa253a..fd81e75 100644
--- a/drivers/ata/pata_rb532_cf.c
+++ b/drivers/ata/pata_rb532_cf.c
@@ -164,7 +164,7 @@ static void rb532_pata_driver_remove(struct platform_device *pdev)
static struct platform_driver rb532_pata_platform_driver = {
.probe = rb532_pata_driver_probe,
- .remove_new = rb532_pata_driver_remove,
+ .remove = rb532_pata_driver_remove,
.driver = {
.name = DRV_NAME,
},
diff --git a/drivers/ata/sata_dwc_460ex.c b/drivers/ata/sata_dwc_460ex.c
index 52f5168..6e1dd0d 100644
--- a/drivers/ata/sata_dwc_460ex.c
+++ b/drivers/ata/sata_dwc_460ex.c
@@ -1240,7 +1240,7 @@ static struct platform_driver sata_dwc_driver = {
.of_match_table = sata_dwc_match,
},
.probe = sata_dwc_probe,
- .remove_new = sata_dwc_remove,
+ .remove = sata_dwc_remove,
};
module_platform_driver(sata_dwc_driver);
diff --git a/drivers/ata/sata_fsl.c b/drivers/ata/sata_fsl.c
index 01aa05f..87e91a9 100644
--- a/drivers/ata/sata_fsl.c
+++ b/drivers/ata/sata_fsl.c
@@ -1589,7 +1589,7 @@ static struct platform_driver fsl_sata_driver = {
.of_match_table = fsl_sata_match,
},
.probe = sata_fsl_probe,
- .remove_new = sata_fsl_remove,
+ .remove = sata_fsl_remove,
#ifdef CONFIG_PM_SLEEP
.suspend = sata_fsl_suspend,
.resume = sata_fsl_resume,
diff --git a/drivers/ata/sata_gemini.c b/drivers/ata/sata_gemini.c
index f574e3c..d040799 100644
--- a/drivers/ata/sata_gemini.c
+++ b/drivers/ata/sata_gemini.c
@@ -425,7 +425,7 @@ static struct platform_driver gemini_sata_driver = {
.of_match_table = gemini_sata_of_match,
},
.probe = gemini_sata_probe,
- .remove_new = gemini_sata_remove,
+ .remove = gemini_sata_remove,
};
module_platform_driver(gemini_sata_driver);
diff --git a/drivers/ata/sata_highbank.c b/drivers/ata/sata_highbank.c
index 63ef7bb..b1b40e9 100644
--- a/drivers/ata/sata_highbank.c
+++ b/drivers/ata/sata_highbank.c
@@ -614,12 +614,12 @@ static SIMPLE_DEV_PM_OPS(ahci_highbank_pm_ops,
ahci_highbank_suspend, ahci_highbank_resume);
static struct platform_driver ahci_highbank_driver = {
- .remove_new = ata_platform_remove_one,
- .driver = {
- .name = "highbank-ahci",
- .of_match_table = ahci_of_match,
- .pm = &ahci_highbank_pm_ops,
- },
+ .remove = ata_platform_remove_one,
+ .driver = {
+ .name = "highbank-ahci",
+ .of_match_table = ahci_of_match,
+ .pm = &ahci_highbank_pm_ops,
+ },
.probe = ahci_highbank_probe,
};
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index 05c9058..b8f3633 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -4255,7 +4255,7 @@ MODULE_DEVICE_TABLE(of, mv_sata_dt_ids);
static struct platform_driver mv_platform_driver = {
.probe = mv_platform_probe,
- .remove_new = mv_platform_remove,
+ .remove = mv_platform_remove,
.suspend = mv_platform_suspend,
.resume = mv_platform_resume,
.driver = {
diff --git a/drivers/ata/sata_rcar.c b/drivers/ata/sata_rcar.c
index c1469d0..22820a0 100644
--- a/drivers/ata/sata_rcar.c
+++ b/drivers/ata/sata_rcar.c
@@ -1009,7 +1009,7 @@ static const struct dev_pm_ops sata_rcar_pm_ops = {
static struct platform_driver sata_rcar_driver = {
.probe = sata_rcar_probe,
- .remove_new = sata_rcar_remove,
+ .remove = sata_rcar_remove,
.driver = {
.name = DRV_NAME,
.of_match_table = sata_rcar_match,
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 2fd1ed1..5a95671 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -316,8 +316,40 @@ __setup("ramdisk_size=", ramdisk_size);
* (should share code eventually).
*/
static LIST_HEAD(brd_devices);
+static DEFINE_MUTEX(brd_devices_mutex);
static struct dentry *brd_debugfs_dir;
+static struct brd_device *brd_find_or_alloc_device(int i)
+{
+ struct brd_device *brd;
+
+ mutex_lock(&brd_devices_mutex);
+ list_for_each_entry(brd, &brd_devices, brd_list) {
+ if (brd->brd_number == i) {
+ mutex_unlock(&brd_devices_mutex);
+ return ERR_PTR(-EEXIST);
+ }
+ }
+
+ brd = kzalloc(sizeof(*brd), GFP_KERNEL);
+ if (!brd) {
+ mutex_unlock(&brd_devices_mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+ brd->brd_number = i;
+ list_add_tail(&brd->brd_list, &brd_devices);
+ mutex_unlock(&brd_devices_mutex);
+ return brd;
+}
+
+static void brd_free_device(struct brd_device *brd)
+{
+ mutex_lock(&brd_devices_mutex);
+ list_del(&brd->brd_list);
+ mutex_unlock(&brd_devices_mutex);
+ kfree(brd);
+}
+
static int brd_alloc(int i)
{
struct brd_device *brd;
@@ -340,14 +372,9 @@ static int brd_alloc(int i)
BLK_FEAT_NOWAIT,
};
- list_for_each_entry(brd, &brd_devices, brd_list)
- if (brd->brd_number == i)
- return -EEXIST;
- brd = kzalloc(sizeof(*brd), GFP_KERNEL);
- if (!brd)
- return -ENOMEM;
- brd->brd_number = i;
- list_add_tail(&brd->brd_list, &brd_devices);
+ brd = brd_find_or_alloc_device(i);
+ if (IS_ERR(brd))
+ return PTR_ERR(brd);
xa_init(&brd->brd_pages);
@@ -378,8 +405,7 @@ static int brd_alloc(int i)
out_cleanup_disk:
put_disk(disk);
out_free_dev:
- list_del(&brd->brd_list);
- kfree(brd);
+ brd_free_device(brd);
return err;
}
@@ -398,8 +424,7 @@ static void brd_cleanup(void)
del_gendisk(brd->brd_disk);
put_disk(brd->brd_disk);
brd_free_pages(brd);
- list_del(&brd->brd_list);
- kfree(brd);
+ brd_free_device(brd);
}
}
@@ -426,16 +451,6 @@ static int __init brd_init(void)
{
int err, i;
- brd_check_and_reset_par();
-
- brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
-
- for (i = 0; i < rd_nr; i++) {
- err = brd_alloc(i);
- if (err)
- goto out_free;
- }
-
/*
* brd module now has a feature to instantiate underlying device
* structure on-demand, provided that there is an access dev node.
@@ -451,11 +466,18 @@ static int __init brd_init(void)
* dynamically.
*/
+ brd_check_and_reset_par();
+
+ brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
+
if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
err = -EIO;
goto out_free;
}
+ for (i = 0; i < rd_nr; i++)
+ brd_alloc(i);
+
pr_info("brd: module loaded\n");
return 0;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 78a7bb2..fe9bb4f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -173,7 +173,7 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file)
static bool lo_bdev_can_use_dio(struct loop_device *lo,
struct block_device *backing_bdev)
{
- unsigned short sb_bsize = bdev_logical_block_size(backing_bdev);
+ unsigned int sb_bsize = bdev_logical_block_size(backing_bdev);
if (queue_logical_block_size(lo->lo_queue) < sb_bsize)
return false;
@@ -786,11 +786,10 @@ static void loop_config_discard(struct loop_device *lo,
* file-backed loop devices: discarded regions read back as zero.
*/
if (S_ISBLK(inode->i_mode)) {
- struct request_queue *backingq = bdev_get_queue(I_BDEV(inode));
+ struct block_device *bdev = I_BDEV(inode);
- max_discard_sectors = backingq->limits.max_write_zeroes_sectors;
- granularity = bdev_discard_granularity(I_BDEV(inode)) ?:
- queue_physical_block_size(backingq);
+ max_discard_sectors = bdev_write_zeroes_sectors(bdev);
+ granularity = bdev_discard_granularity(bdev);
/*
* We use punch hole to reclaim the free space used by the
@@ -977,7 +976,7 @@ loop_set_status_from_info(struct loop_device *lo,
return 0;
}
-static unsigned short loop_default_blocksize(struct loop_device *lo,
+static unsigned int loop_default_blocksize(struct loop_device *lo,
struct block_device *backing_bdev)
{
/* In case of direct I/O, match underlying block size */
@@ -986,7 +985,7 @@ static unsigned short loop_default_blocksize(struct loop_device *lo,
return SECTOR_SIZE;
}
-static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize)
+static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize)
{
struct file *file = lo->lo_backing_file;
struct inode *inode = file->f_mapping->host;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 223faa9..43701b7 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2701,7 +2701,12 @@ static int mtip_hw_init(struct driver_data *dd)
int rv;
unsigned long timeout, timetaken;
- dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
+ dd->mmio = pcim_iomap_region(dd->pdev, MTIP_ABAR, MTIP_DRV_NAME);
+ if (IS_ERR(dd->mmio)) {
+ dev_err(&dd->pdev->dev, "Unable to request / ioremap PCI region\n");
+ return PTR_ERR(dd->mmio);
+ }
+
mtip_detect_product(dd);
if (dd->product_type == MTIP_PRODUCT_UNKNOWN) {
@@ -3710,13 +3715,6 @@ static int mtip_pci_probe(struct pci_dev *pdev,
goto iomap_err;
}
- /* Map BAR5 to memory. */
- rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME);
- if (rv < 0) {
- dev_err(&pdev->dev, "Unable to map regions\n");
- goto iomap_err;
- }
-
rv = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
if (rv) {
dev_warn(&pdev->dev, "64-bit DMA enable failed\n");
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 2f0431e..3c3d8d2 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1638,10 +1638,9 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
-static void null_queue_rqs(struct request **rqlist)
+static void null_queue_rqs(struct rq_list *rqlist)
{
- struct request *requeue_list = NULL;
- struct request **requeue_lastp = &requeue_list;
+ struct rq_list requeue_list = {};
struct blk_mq_queue_data bd = { };
blk_status_t ret;
@@ -1651,8 +1650,8 @@ static void null_queue_rqs(struct request **rqlist)
bd.rq = rq;
ret = null_queue_rq(rq->mq_hctx, &bd);
if (ret != BLK_STS_OK)
- rq_list_add_tail(&requeue_lastp, rq);
- } while (!rq_list_empty(*rqlist));
+ rq_list_add_tail(&requeue_list, rq);
+ } while (!rq_list_empty(rqlist));
*rqlist = requeue_list;
}
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 9bc768b..0d5f9bf 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -166,7 +166,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
lim->features |= BLK_FEAT_ZONED;
lim->chunk_sectors = dev->zone_size_sects;
- lim->max_zone_append_sectors = dev->zone_append_max_sectors;
+ lim->max_hw_zone_append_sectors = dev->zone_append_max_sectors;
lim->max_open_zones = dev->zone_max_open;
lim->max_active_zones = dev->zone_max_active;
return 0;
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9c8b19a..ac421db 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -7284,6 +7284,7 @@ static ssize_t do_rbd_remove(const char *buf, size_t count)
*/
blk_mq_freeze_queue(rbd_dev->disk->queue);
blk_mark_disk_dead(rbd_dev->disk);
+ blk_mq_unfreeze_queue(rbd_dev->disk->queue);
}
del_gendisk(rbd_dev->disk);
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 6ba2c1d..c6d18cd 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -60,7 +60,12 @@
| UBLK_F_UNPRIVILEGED_DEV \
| UBLK_F_CMD_IOCTL_ENCODE \
| UBLK_F_USER_COPY \
- | UBLK_F_ZONED)
+ | UBLK_F_ZONED \
+ | UBLK_F_USER_RECOVERY_FAIL_IO)
+
+#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
+ | UBLK_F_USER_RECOVERY_REISSUE \
+ | UBLK_F_USER_RECOVERY_FAIL_IO)
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL \
@@ -143,6 +148,7 @@ struct ublk_queue {
bool force_abort;
bool timeout;
bool canceling;
+ bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
unsigned short nr_io_ready; /* how many ios setup */
spinlock_t cancel_lock;
struct ublk_device *dev;
@@ -179,8 +185,7 @@ struct ublk_device {
unsigned int nr_queues_ready;
unsigned int nr_privileged_daemon;
- struct work_struct quiesce_work;
- struct work_struct stop_work;
+ struct work_struct nosrv_work;
};
/* header of ublk_params */
@@ -664,30 +669,69 @@ static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
return ublk_get_queue(ub, q_id)->io_cmd_buf;
}
+static inline int __ublk_queue_cmd_buf_size(int depth)
+{
+ return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
+}
+
static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
{
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
- return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
- PAGE_SIZE);
+ return __ublk_queue_cmd_buf_size(ubq->q_depth);
}
-static inline bool ublk_queue_can_use_recovery_reissue(
- struct ublk_queue *ubq)
+static int ublk_max_cmd_buf_size(void)
+{
+ return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
+}
+
+/*
+ * Should I/O outstanding to the ublk server when it exits be reissued?
+ * If not, outstanding I/O will get errors.
+ */
+static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
+{
+ return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
+ (ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
+}
+
+/*
+ * Should I/O issued while there is no ublk server queue? If not, I/O
+ * issued while there is no ublk server will get errors.
+ */
+static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
+{
+ return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
+ !(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
+}
+
+/*
+ * Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
+ * of the device flags for smaller cache footprint - better for fast
+ * paths.
+ */
+static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
{
return (ubq->flags & UBLK_F_USER_RECOVERY) &&
- (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE);
+ !(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
}
-static inline bool ublk_queue_can_use_recovery(
- struct ublk_queue *ubq)
+/*
+ * Should ublk devices be stopped (i.e. no recovery possible) when the
+ * ublk server exits? If not, devices can be used again by a future
+ * incarnation of a ublk server via the start_recovery/end_recovery
+ * commands.
+ */
+static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
{
- return ubq->flags & UBLK_F_USER_RECOVERY;
+ return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
}
-static inline bool ublk_can_use_recovery(struct ublk_device *ub)
+static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
{
- return ub->dev_info.flags & UBLK_F_USER_RECOVERY;
+ return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
+ ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
}
static void ublk_free_disk(struct gendisk *disk)
@@ -1063,7 +1107,7 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
{
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
- if (ublk_queue_can_use_recovery_reissue(ubq))
+ if (ublk_nosrv_should_reissue_outstanding(ubq->dev))
blk_mq_requeue_request(req, false);
else
ublk_put_req_ref(ubq, req);
@@ -1091,7 +1135,7 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
struct request *rq)
{
/* We cannot process this rq so just requeue it. */
- if (ublk_queue_can_use_recovery(ubq))
+ if (ublk_nosrv_dev_should_queue_io(ubq->dev))
blk_mq_requeue_request(rq, false);
else
blk_mq_end_request(rq, BLK_STS_IOERR);
@@ -1236,10 +1280,7 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
struct ublk_device *ub = ubq->dev;
if (ublk_abort_requests(ub, ubq)) {
- if (ublk_can_use_recovery(ub))
- schedule_work(&ub->quiesce_work);
- else
- schedule_work(&ub->stop_work);
+ schedule_work(&ub->nosrv_work);
}
return BLK_EH_DONE;
}
@@ -1254,6 +1295,10 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *rq = bd->rq;
blk_status_t res;
+ if (unlikely(ubq->fail_io)) {
+ return BLK_STS_TARGET;
+ }
+
/* fill iod to slot in io cmd buffer */
res = ublk_setup_iod(ubq, rq);
if (unlikely(res != BLK_STS_OK))
@@ -1268,7 +1313,7 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
* Note: force_abort is guaranteed to be seen because it is set
* before request queue is unqiuesced.
*/
- if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
+ if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
return BLK_STS_IOERR;
if (unlikely(ubq->canceling)) {
@@ -1322,7 +1367,7 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct ublk_device *ub = filp->private_data;
size_t sz = vma->vm_end - vma->vm_start;
- unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
+ unsigned max_sz = ublk_max_cmd_buf_size();
unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
int q_id, ret = 0;
@@ -1489,10 +1534,7 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
ublk_cancel_cmd(ubq, io, issue_flags);
if (need_schedule) {
- if (ublk_can_use_recovery(ub))
- schedule_work(&ub->quiesce_work);
- else
- schedule_work(&ub->stop_work);
+ schedule_work(&ub->nosrv_work);
}
}
@@ -1555,20 +1597,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub)
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
}
-static void ublk_quiesce_work_fn(struct work_struct *work)
-{
- struct ublk_device *ub =
- container_of(work, struct ublk_device, quiesce_work);
-
- mutex_lock(&ub->mutex);
- if (ub->dev_info.state != UBLK_S_DEV_LIVE)
- goto unlock;
- __ublk_quiesce_dev(ub);
- unlock:
- mutex_unlock(&ub->mutex);
- ublk_cancel_dev(ub);
-}
-
static void ublk_unquiesce_dev(struct ublk_device *ub)
{
int i;
@@ -1597,7 +1625,7 @@ static void ublk_stop_dev(struct ublk_device *ub)
mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
goto unlock;
- if (ublk_can_use_recovery(ub)) {
+ if (ublk_nosrv_dev_should_queue_io(ub)) {
if (ub->dev_info.state == UBLK_S_DEV_LIVE)
__ublk_quiesce_dev(ub);
ublk_unquiesce_dev(ub);
@@ -1617,6 +1645,37 @@ static void ublk_stop_dev(struct ublk_device *ub)
ublk_cancel_dev(ub);
}
+static void ublk_nosrv_work(struct work_struct *work)
+{
+ struct ublk_device *ub =
+ container_of(work, struct ublk_device, nosrv_work);
+ int i;
+
+ if (ublk_nosrv_should_stop_dev(ub)) {
+ ublk_stop_dev(ub);
+ return;
+ }
+
+ mutex_lock(&ub->mutex);
+ if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+ goto unlock;
+
+ if (ublk_nosrv_dev_should_queue_io(ub)) {
+ __ublk_quiesce_dev(ub);
+ } else {
+ blk_mq_quiesce_queue(ub->ub_disk->queue);
+ ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ ublk_get_queue(ub, i)->fail_io = true;
+ }
+ blk_mq_unquiesce_queue(ub->ub_disk->queue);
+ }
+
+ unlock:
+ mutex_unlock(&ub->mutex);
+ ublk_cancel_dev(ub);
+}
+
/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
{
@@ -2130,14 +2189,6 @@ static int ublk_add_chdev(struct ublk_device *ub)
return ret;
}
-static void ublk_stop_work_fn(struct work_struct *work)
-{
- struct ublk_device *ub =
- container_of(work, struct ublk_device, stop_work);
-
- ublk_stop_dev(ub);
-}
-
/* align max io buffer size with PAGE_SIZE */
static void ublk_align_max_io_size(struct ublk_device *ub)
{
@@ -2162,8 +2213,7 @@ static int ublk_add_tag_set(struct ublk_device *ub)
static void ublk_remove(struct ublk_device *ub)
{
ublk_stop_dev(ub);
- cancel_work_sync(&ub->stop_work);
- cancel_work_sync(&ub->quiesce_work);
+ cancel_work_sync(&ub->nosrv_work);
cdev_device_del(&ub->cdev, &ub->cdev_dev);
ublk_put_device(ub);
ublks_added--;
@@ -2229,7 +2279,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
lim.features |= BLK_FEAT_ZONED;
lim.max_active_zones = p->max_active_zones;
lim.max_open_zones = p->max_open_zones;
- lim.max_zone_append_sectors = p->max_zone_append_sectors;
+ lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
}
if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
@@ -2372,6 +2422,19 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
return -EPERM;
+ /* forbid nonsense combinations of recovery flags */
+ switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
+ case 0:
+ case UBLK_F_USER_RECOVERY:
+ case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
+ case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
+ break;
+ default:
+ pr_warn("%s: invalid recovery flags %llx\n", __func__,
+ info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
+ return -EINVAL;
+ }
+
/*
* unprivileged device can't be trusted, but RECOVERY and
* RECOVERY_REISSUE still may hang error handling, so can't
@@ -2424,8 +2487,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
goto out_unlock;
mutex_init(&ub->mutex);
spin_lock_init(&ub->lock);
- INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
- INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
+ INIT_WORK(&ub->nosrv_work, ublk_nosrv_work);
ret = ublk_alloc_dev_number(ub, header->dev_id);
if (ret < 0)
@@ -2560,9 +2622,7 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
static int ublk_ctrl_stop_dev(struct ublk_device *ub)
{
ublk_stop_dev(ub);
- cancel_work_sync(&ub->stop_work);
- cancel_work_sync(&ub->quiesce_work);
-
+ cancel_work_sync(&ub->nosrv_work);
return 0;
}
@@ -2699,7 +2759,7 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub,
int i;
mutex_lock(&ub->mutex);
- if (!ublk_can_use_recovery(ub))
+ if (ublk_nosrv_should_stop_dev(ub))
goto out_unlock;
if (!ub->nr_queues_ready)
goto out_unlock;
@@ -2710,14 +2770,18 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub,
* and related io_uring ctx is freed so file struct of /dev/ublkcX is
* released.
*
+ * and one of the following holds
+ *
* (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
* (a)has quiesced request queue
* (b)has requeued every inflight rqs whose io_flags is ACTIVE
* (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
* (d)has completed/camceled all ioucmds owned by ther dying process
+ *
+ * (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
+ * quiesced, but all I/O is being immediately errored
*/
- if (test_bit(UB_STATE_OPEN, &ub->state) ||
- ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+ if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
ret = -EBUSY;
goto out_unlock;
}
@@ -2741,6 +2805,7 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
int ublksrv_pid = (int)header->data[0];
int ret = -EINVAL;
+ int i;
pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
__func__, ub->dev_info.nr_hw_queues, header->dev_id);
@@ -2752,21 +2817,32 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
__func__, ub->dev_info.nr_hw_queues, header->dev_id);
mutex_lock(&ub->mutex);
- if (!ublk_can_use_recovery(ub))
+ if (ublk_nosrv_should_stop_dev(ub))
goto out_unlock;
- if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+ if (!ublk_dev_in_recoverable_state(ub)) {
ret = -EBUSY;
goto out_unlock;
}
ub->dev_info.ublksrv_pid = ublksrv_pid;
pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
__func__, ublksrv_pid, header->dev_id);
- blk_mq_unquiesce_queue(ub->ub_disk->queue);
- pr_devel("%s: queue unquiesced, dev id %d.\n",
- __func__, header->dev_id);
- blk_mq_kick_requeue_list(ub->ub_disk->queue);
- ub->dev_info.state = UBLK_S_DEV_LIVE;
+
+ if (ublk_nosrv_dev_should_queue_io(ub)) {
+ ub->dev_info.state = UBLK_S_DEV_LIVE;
+ blk_mq_unquiesce_queue(ub->ub_disk->queue);
+ pr_devel("%s: queue unquiesced, dev id %d.\n",
+ __func__, header->dev_id);
+ blk_mq_kick_requeue_list(ub->ub_disk->queue);
+ } else {
+ blk_mq_quiesce_queue(ub->ub_disk->queue);
+ ub->dev_info.state = UBLK_S_DEV_LIVE;
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ ublk_get_queue(ub, i)->fail_io = false;
+ }
+ blk_mq_unquiesce_queue(ub->ub_disk->queue);
+ }
+
ret = 0;
out_unlock:
mutex_unlock(&ub->mutex);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 194417a..c0cdba7 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -471,18 +471,18 @@ static bool virtblk_prep_rq_batch(struct request *req)
return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK;
}
-static bool virtblk_add_req_batch(struct virtio_blk_vq *vq,
- struct request **rqlist)
+static void virtblk_add_req_batch(struct virtio_blk_vq *vq,
+ struct rq_list *rqlist)
{
+ struct request *req;
unsigned long flags;
- int err;
bool kick;
spin_lock_irqsave(&vq->lock, flags);
- while (!rq_list_empty(*rqlist)) {
- struct request *req = rq_list_pop(rqlist);
+ while ((req = rq_list_pop(rqlist))) {
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
+ int err;
err = virtblk_add_req(vq->vq, vbr);
if (err) {
@@ -495,37 +495,32 @@ static bool virtblk_add_req_batch(struct virtio_blk_vq *vq,
kick = virtqueue_kick_prepare(vq->vq);
spin_unlock_irqrestore(&vq->lock, flags);
- return kick;
+ if (kick)
+ virtqueue_notify(vq->vq);
}
-static void virtio_queue_rqs(struct request **rqlist)
+static void virtio_queue_rqs(struct rq_list *rqlist)
{
- struct request *req, *next, *prev = NULL;
- struct request *requeue_list = NULL;
+ struct rq_list submit_list = { };
+ struct rq_list requeue_list = { };
+ struct virtio_blk_vq *vq = NULL;
+ struct request *req;
- rq_list_for_each_safe(rqlist, req, next) {
- struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx);
- bool kick;
+ while ((req = rq_list_pop(rqlist))) {
+ struct virtio_blk_vq *this_vq = get_virtio_blk_vq(req->mq_hctx);
- if (!virtblk_prep_rq_batch(req)) {
- rq_list_move(rqlist, &requeue_list, req, prev);
- req = prev;
- if (!req)
- continue;
- }
+ if (vq && vq != this_vq)
+ virtblk_add_req_batch(vq, &submit_list);
+ vq = this_vq;
- if (!next || req->mq_hctx != next->mq_hctx) {
- req->rq_next = NULL;
- kick = virtblk_add_req_batch(vq, rqlist);
- if (kick)
- virtqueue_notify(vq->vq);
-
- *rqlist = next;
- prev = NULL;
- } else
- prev = req;
+ if (virtblk_prep_rq_batch(req))
+ rq_list_add_tail(&submit_list, req);
+ else
+ rq_list_add_tail(&requeue_list, req);
}
+ if (vq)
+ virtblk_add_req_batch(vq, &submit_list);
*rqlist = requeue_list;
}
@@ -784,7 +779,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk *vblk,
wg, v);
return -ENODEV;
}
- lim->max_zone_append_sectors = v;
+ lim->max_hw_zone_append_sectors = v;
dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
return 0;
diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c
index 438b929..30a32eb 100644
--- a/drivers/bluetooth/btintel.c
+++ b/drivers/bluetooth/btintel.c
@@ -3288,13 +3288,12 @@ static int btintel_diagnostics(struct hci_dev *hdev, struct sk_buff *skb)
case INTEL_TLV_TEST_EXCEPTION:
/* Generate devcoredump from exception */
if (!hci_devcd_init(hdev, skb->len)) {
- hci_devcd_append(hdev, skb);
+ hci_devcd_append(hdev, skb_clone(skb, GFP_ATOMIC));
hci_devcd_complete(hdev);
} else {
bt_dev_err(hdev, "Failed to generate devcoredump");
- kfree_skb(skb);
}
- return 0;
+ break;
default:
bt_dev_err(hdev, "Invalid exception type %02X", tlv->val[0]);
}
diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig
index b51d9e2..17854f0 100644
--- a/drivers/char/hw_random/Kconfig
+++ b/drivers/char/hw_random/Kconfig
@@ -50,7 +50,7 @@
config HW_RANDOM_AMD
tristate "AMD HW Random Number Generator support"
- depends on (X86 || PPC_MAPLE || COMPILE_TEST)
+ depends on (X86 || COMPILE_TEST)
depends on PCI && HAS_IOPORT_MAP
default HW_RANDOM
help
@@ -62,6 +62,19 @@
If unsure, say Y.
+config HW_RANDOM_AIROHA
+ tristate "Airoha True HW Random Number Generator support"
+ depends on ARCH_AIROHA || COMPILE_TEST
+ default HW_RANDOM
+ help
+ This driver provides kernel-side support for the True Random Number
+ Generator hardware found on Airoha SoC.
+
+ To compile this driver as a module, choose M here: the
+ module will be called airoha-rng.
+
+ If unsure, say Y.
+
config HW_RANDOM_ATMEL
tristate "Atmel Random Number Generator support"
depends on (ARCH_AT91 || COMPILE_TEST)
@@ -99,9 +112,22 @@
If unsure, say Y.
+config HW_RANDOM_BCM74110
+ tristate "Broadcom BCM74110 Random Number Generator support"
+ depends on ARCH_BRCMSTB || COMPILE_TEST
+ default HW_RANDOM
+ help
+ This driver provides kernel-side support for the Random Number
+ Generator hardware found on the Broadcom BCM74110 SoCs.
+
+ To compile this driver as a module, choose M here: the
+ module will be called bcm74110-rng
+
+ If unsure, say Y.
+
config HW_RANDOM_IPROC_RNG200
tristate "Broadcom iProc/STB RNG200 support"
- depends on ARCH_BCM_IPROC || ARCH_BCM2835 || ARCH_BRCMSTB || COMPILE_TEST
+ depends on ARCH_BCM_IPROC || ARCH_BCM2835 || ARCH_BCMBCA || ARCH_BRCMSTB || COMPILE_TEST
default HW_RANDOM
help
This driver provides kernel-side support for the RNG200
diff --git a/drivers/char/hw_random/Makefile b/drivers/char/hw_random/Makefile
index 01f012e..b9132b3 100644
--- a/drivers/char/hw_random/Makefile
+++ b/drivers/char/hw_random/Makefile
@@ -8,6 +8,7 @@
obj-$(CONFIG_HW_RANDOM_TIMERIOMEM) += timeriomem-rng.o
obj-$(CONFIG_HW_RANDOM_INTEL) += intel-rng.o
obj-$(CONFIG_HW_RANDOM_AMD) += amd-rng.o
+obj-$(CONFIG_HW_RANDOM_AIROHA) += airoha-trng.o
obj-$(CONFIG_HW_RANDOM_ATMEL) += atmel-rng.o
obj-$(CONFIG_HW_RANDOM_BA431) += ba431-rng.o
obj-$(CONFIG_HW_RANDOM_GEODE) += geode-rng.o
@@ -31,6 +32,7 @@
obj-$(CONFIG_HW_RANDOM_HISI) += hisi-rng.o
obj-$(CONFIG_HW_RANDOM_HISTB) += histb-rng.o
obj-$(CONFIG_HW_RANDOM_BCM2835) += bcm2835-rng.o
+obj-$(CONFIG_HW_RANDOM_BCM74110) += bcm74110-rng.o
obj-$(CONFIG_HW_RANDOM_IPROC_RNG200) += iproc-rng200.o
obj-$(CONFIG_HW_RANDOM_ST) += st-rng.o
obj-$(CONFIG_HW_RANDOM_XGENE) += xgene-rng.o
diff --git a/drivers/char/hw_random/airoha-trng.c b/drivers/char/hw_random/airoha-trng.c
new file mode 100644
index 0000000..1dbfa95
--- /dev/null
+++ b/drivers/char/hw_random/airoha-trng.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2024 Christian Marangi */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/hw_random.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/platform_device.h>
+
+#define TRNG_IP_RDY 0x800
+#define CNT_TRANS GENMASK(15, 8)
+#define SAMPLE_RDY BIT(0)
+#define TRNG_NS_SEK_AND_DAT_EN 0x804
+#define RNG_EN BIT(31) /* referenced as ring_en */
+#define RAW_DATA_EN BIT(16)
+#define TRNG_HEALTH_TEST_SW_RST 0x808
+#define SW_RST BIT(0) /* Active High */
+#define TRNG_INTR_EN 0x818
+#define INTR_MASK BIT(16)
+#define CONTINUOUS_HEALTH_INITR_EN BIT(2)
+#define SW_STARTUP_INITR_EN BIT(1)
+#define RST_STARTUP_INITR_EN BIT(0)
+/* Notice that Health Test are done only out of Reset and with RNG_EN */
+#define TRNG_HEALTH_TEST_STATUS 0x824
+#define CONTINUOUS_HEALTH_AP_TEST_FAIL BIT(23)
+#define CONTINUOUS_HEALTH_RC_TEST_FAIL BIT(22)
+#define SW_STARTUP_TEST_DONE BIT(21)
+#define SW_STARTUP_AP_TEST_FAIL BIT(20)
+#define SW_STARTUP_RC_TEST_FAIL BIT(19)
+#define RST_STARTUP_TEST_DONE BIT(18)
+#define RST_STARTUP_AP_TEST_FAIL BIT(17)
+#define RST_STARTUP_RC_TEST_FAIL BIT(16)
+#define RAW_DATA_VALID BIT(7)
+
+#define TRNG_RAW_DATA_OUT 0x828
+
+#define TRNG_CNT_TRANS_VALID 0x80
+#define BUSY_LOOP_SLEEP 10
+#define BUSY_LOOP_TIMEOUT (BUSY_LOOP_SLEEP * 10000)
+
+struct airoha_trng {
+ void __iomem *base;
+ struct hwrng rng;
+ struct device *dev;
+
+ struct completion rng_op_done;
+};
+
+static int airoha_trng_irq_mask(struct airoha_trng *trng)
+{
+ u32 val;
+
+ val = readl(trng->base + TRNG_INTR_EN);
+ val |= INTR_MASK;
+ writel(val, trng->base + TRNG_INTR_EN);
+
+ return 0;
+}
+
+static int airoha_trng_irq_unmask(struct airoha_trng *trng)
+{
+ u32 val;
+
+ val = readl(trng->base + TRNG_INTR_EN);
+ val &= ~INTR_MASK;
+ writel(val, trng->base + TRNG_INTR_EN);
+
+ return 0;
+}
+
+static int airoha_trng_init(struct hwrng *rng)
+{
+ struct airoha_trng *trng = container_of(rng, struct airoha_trng, rng);
+ int ret;
+ u32 val;
+
+ val = readl(trng->base + TRNG_NS_SEK_AND_DAT_EN);
+ val |= RNG_EN;
+ writel(val, trng->base + TRNG_NS_SEK_AND_DAT_EN);
+
+ /* Set out of SW Reset */
+ airoha_trng_irq_unmask(trng);
+ writel(0, trng->base + TRNG_HEALTH_TEST_SW_RST);
+
+ ret = wait_for_completion_timeout(&trng->rng_op_done, BUSY_LOOP_TIMEOUT);
+ if (ret <= 0) {
+ dev_err(trng->dev, "Timeout waiting for Health Check\n");
+ airoha_trng_irq_mask(trng);
+ return -ENODEV;
+ }
+
+ /* Check if Health Test Failed */
+ val = readl(trng->base + TRNG_HEALTH_TEST_STATUS);
+ if (val & (RST_STARTUP_AP_TEST_FAIL | RST_STARTUP_RC_TEST_FAIL)) {
+ dev_err(trng->dev, "Health Check fail: %s test fail\n",
+ val & RST_STARTUP_AP_TEST_FAIL ? "AP" : "RC");
+ return -ENODEV;
+ }
+
+ /* Check if IP is ready */
+ ret = readl_poll_timeout(trng->base + TRNG_IP_RDY, val,
+ val & SAMPLE_RDY, 10, 1000);
+ if (ret < 0) {
+ dev_err(trng->dev, "Timeout waiting for IP ready");
+ return -ENODEV;
+ }
+
+ /* CNT_TRANS must be 0x80 for IP to be considered ready */
+ ret = readl_poll_timeout(trng->base + TRNG_IP_RDY, val,
+ FIELD_GET(CNT_TRANS, val) == TRNG_CNT_TRANS_VALID,
+ 10, 1000);
+ if (ret < 0) {
+ dev_err(trng->dev, "Timeout waiting for IP ready");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static void airoha_trng_cleanup(struct hwrng *rng)
+{
+ struct airoha_trng *trng = container_of(rng, struct airoha_trng, rng);
+ u32 val;
+
+ val = readl(trng->base + TRNG_NS_SEK_AND_DAT_EN);
+ val &= ~RNG_EN;
+ writel(val, trng->base + TRNG_NS_SEK_AND_DAT_EN);
+
+ /* Put it in SW Reset */
+ writel(SW_RST, trng->base + TRNG_HEALTH_TEST_SW_RST);
+}
+
+static int airoha_trng_read(struct hwrng *rng, void *buf, size_t max, bool wait)
+{
+ struct airoha_trng *trng = container_of(rng, struct airoha_trng, rng);
+ u32 *data = buf;
+ u32 status;
+ int ret;
+
+ ret = readl_poll_timeout(trng->base + TRNG_HEALTH_TEST_STATUS, status,
+ status & RAW_DATA_VALID, 10, 1000);
+ if (ret < 0) {
+ dev_err(trng->dev, "Timeout waiting for TRNG RAW Data valid\n");
+ return ret;
+ }
+
+ *data = readl(trng->base + TRNG_RAW_DATA_OUT);
+
+ return 4;
+}
+
+static irqreturn_t airoha_trng_irq(int irq, void *priv)
+{
+ struct airoha_trng *trng = (struct airoha_trng *)priv;
+
+ airoha_trng_irq_mask(trng);
+ /* Just complete the task, we will read the value later */
+ complete(&trng->rng_op_done);
+
+ return IRQ_HANDLED;
+}
+
+static int airoha_trng_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct airoha_trng *trng;
+ int irq, ret;
+ u32 val;
+
+ trng = devm_kzalloc(dev, sizeof(*trng), GFP_KERNEL);
+ if (!trng)
+ return -ENOMEM;
+
+ trng->base = devm_platform_ioremap_resource(pdev, 0);
+ if (IS_ERR(trng->base))
+ return PTR_ERR(trng->base);
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ return irq;
+
+ airoha_trng_irq_mask(trng);
+ ret = devm_request_irq(&pdev->dev, irq, airoha_trng_irq, 0,
+ pdev->name, (void *)trng);
+ if (ret) {
+ dev_err(dev, "Can't get interrupt working.\n");
+ return ret;
+ }
+
+ init_completion(&trng->rng_op_done);
+
+ /* Enable interrupt for SW reset Health Check */
+ val = readl(trng->base + TRNG_INTR_EN);
+ val |= RST_STARTUP_INITR_EN;
+ writel(val, trng->base + TRNG_INTR_EN);
+
+ /* Set output to raw data */
+ val = readl(trng->base + TRNG_NS_SEK_AND_DAT_EN);
+ val |= RAW_DATA_EN;
+ writel(val, trng->base + TRNG_NS_SEK_AND_DAT_EN);
+
+ /* Put it in SW Reset */
+ writel(SW_RST, trng->base + TRNG_HEALTH_TEST_SW_RST);
+
+ trng->dev = dev;
+ trng->rng.name = pdev->name;
+ trng->rng.init = airoha_trng_init;
+ trng->rng.cleanup = airoha_trng_cleanup;
+ trng->rng.read = airoha_trng_read;
+
+ ret = devm_hwrng_register(dev, &trng->rng);
+ if (ret) {
+ dev_err(dev, "failed to register rng device: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static const struct of_device_id airoha_trng_of_match[] = {
+ { .compatible = "airoha,en7581-trng", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, airoha_trng_of_match);
+
+static struct platform_driver airoha_trng_driver = {
+ .driver = {
+ .name = "airoha-trng",
+ .of_match_table = airoha_trng_of_match,
+ },
+ .probe = airoha_trng_probe,
+};
+
+module_platform_driver(airoha_trng_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Christian Marangi <ansuelsmth@gmail.com>");
+MODULE_DESCRIPTION("Airoha True Random Number Generator driver");
diff --git a/drivers/char/hw_random/atmel-rng.c b/drivers/char/hw_random/atmel-rng.c
index e915725..143406b 100644
--- a/drivers/char/hw_random/atmel-rng.c
+++ b/drivers/char/hw_random/atmel-rng.c
@@ -216,7 +216,7 @@ MODULE_DEVICE_TABLE(of, atmel_trng_dt_ids);
static struct platform_driver atmel_trng_driver = {
.probe = atmel_trng_probe,
- .remove_new = atmel_trng_remove,
+ .remove = atmel_trng_remove,
.driver = {
.name = "atmel-trng",
.pm = pm_ptr(&atmel_trng_pm_ops),
diff --git a/drivers/char/hw_random/bcm74110-rng.c b/drivers/char/hw_random/bcm74110-rng.c
new file mode 100644
index 0000000..5c64148
--- /dev/null
+++ b/drivers/char/hw_random/bcm74110-rng.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024 Broadcom
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+#include <linux/random.h>
+#include <linux/hw_random.h>
+
+#define HOST_REV_ID 0x00
+#define HOST_FIFO_DEPTH 0x04
+#define HOST_FIFO_COUNT 0x08
+#define HOST_FIFO_THRESHOLD 0x0c
+#define HOST_FIFO_DATA 0x10
+
+#define HOST_FIFO_COUNT_MASK 0xffff
+
+/* Delay range in microseconds */
+#define FIFO_DELAY_MIN_US 3
+#define FIFO_DELAY_MAX_US 7
+#define FIFO_DELAY_MAX_COUNT 10
+
+struct bcm74110_priv {
+ void __iomem *base;
+};
+
+static inline int bcm74110_rng_fifo_count(void __iomem *mem)
+{
+ return readl_relaxed(mem) & HOST_FIFO_COUNT_MASK;
+}
+
+static int bcm74110_rng_read(struct hwrng *rng, void *buf, size_t max,
+ bool wait)
+{
+ struct bcm74110_priv *priv = (struct bcm74110_priv *)rng->priv;
+ void __iomem *fc_addr = priv->base + HOST_FIFO_COUNT;
+ void __iomem *fd_addr = priv->base + HOST_FIFO_DATA;
+ unsigned underrun_count = 0;
+ u32 max_words = max / sizeof(u32);
+ u32 num_words;
+ unsigned i;
+
+ /*
+ * We need to check how many words are available in the RNG FIFO. If
+ * there aren't any, we need to wait for some to become available.
+ */
+ while ((num_words = bcm74110_rng_fifo_count(fc_addr)) == 0) {
+ if (!wait)
+ return 0;
+ /*
+ * As a precaution, limit how long we wait. If the FIFO doesn't
+ * refill within the allotted time, return 0 (=no data) to the
+ * caller.
+ */
+ if (likely(underrun_count < FIFO_DELAY_MAX_COUNT))
+ usleep_range(FIFO_DELAY_MIN_US, FIFO_DELAY_MAX_US);
+ else
+ return 0;
+ underrun_count++;
+ }
+ if (num_words > max_words)
+ num_words = max_words;
+
+ /* Bail early if we run out of random numbers unexpectedly */
+ for (i = 0; i < num_words && bcm74110_rng_fifo_count(fc_addr) > 0; i++)
+ ((u32 *)buf)[i] = readl_relaxed(fd_addr);
+
+ return i * sizeof(u32);
+}
+
+static struct hwrng bcm74110_hwrng = {
+ .read = bcm74110_rng_read,
+};
+
+static int bcm74110_rng_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct bcm74110_priv *priv;
+ int rc;
+
+ priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ bcm74110_hwrng.name = pdev->name;
+ bcm74110_hwrng.priv = (unsigned long)priv;
+
+ priv->base = devm_platform_ioremap_resource(pdev, 0);
+ if (IS_ERR(priv->base))
+ return PTR_ERR(priv->base);
+
+ rc = devm_hwrng_register(dev, &bcm74110_hwrng);
+ if (rc)
+ dev_err(dev, "hwrng registration failed (%d)\n", rc);
+ else
+ dev_info(dev, "hwrng registered\n");
+
+ return rc;
+}
+
+static const struct of_device_id bcm74110_rng_match[] = {
+ { .compatible = "brcm,bcm74110-rng", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, bcm74110_rng_match);
+
+static struct platform_driver bcm74110_rng_driver = {
+ .driver = {
+ .name = KBUILD_MODNAME,
+ .of_match_table = bcm74110_rng_match,
+ },
+ .probe = bcm74110_rng_probe,
+};
+module_platform_driver(bcm74110_rng_driver);
+
+MODULE_AUTHOR("Markus Mayer <mmayer@broadcom.com>");
+MODULE_DESCRIPTION("BCM 74110 Random Number Generator (RNG) driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/char/hw_random/cctrng.c b/drivers/char/hw_random/cctrng.c
index 4c50efc4..4db1988 100644
--- a/drivers/char/hw_random/cctrng.c
+++ b/drivers/char/hw_random/cctrng.c
@@ -653,7 +653,7 @@ static struct platform_driver cctrng_driver = {
.pm = &cctrng_pm,
},
.probe = cctrng_probe,
- .remove_new = cctrng_remove,
+ .remove = cctrng_remove,
};
module_platform_driver(cctrng_driver);
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index 57c51ef..018316f 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -181,8 +181,15 @@ static inline int rng_get_data(struct hwrng *rng, u8 *buffer, size_t size,
int present;
BUG_ON(!mutex_is_locked(&reading_mutex));
- if (rng->read)
- return rng->read(rng, (void *)buffer, size, wait);
+ if (rng->read) {
+ int err;
+
+ err = rng->read(rng, buffer, size, wait);
+ if (WARN_ON_ONCE(err > 0 && err > size))
+ err = size;
+
+ return err;
+ }
if (rng->data_present)
present = rng->data_present(rng, wait);
diff --git a/drivers/char/hw_random/exynos-trng.c b/drivers/char/hw_random/exynos-trng.c
index 9f039fd..02e207c 100644
--- a/drivers/char/hw_random/exynos-trng.c
+++ b/drivers/char/hw_random/exynos-trng.c
@@ -335,7 +335,7 @@ static struct platform_driver exynos_trng_driver = {
.of_match_table = exynos_trng_dt_match,
},
.probe = exynos_trng_probe,
- .remove_new = exynos_trng_remove,
+ .remove = exynos_trng_remove,
};
module_platform_driver(exynos_trng_driver);
diff --git a/drivers/char/hw_random/histb-rng.c b/drivers/char/hw_random/histb-rng.c
index f652e11..1b91e88 100644
--- a/drivers/char/hw_random/histb-rng.c
+++ b/drivers/char/hw_random/histb-rng.c
@@ -89,7 +89,7 @@ depth_show(struct device *dev, struct device_attribute *attr, char *buf)
struct histb_rng_priv *priv = dev_get_drvdata(dev);
void __iomem *base = priv->base;
- return sprintf(buf, "%d\n", histb_rng_get_depth(base));
+ return sprintf(buf, "%u\n", histb_rng_get_depth(base));
}
static ssize_t
diff --git a/drivers/char/hw_random/ingenic-rng.c b/drivers/char/hw_random/ingenic-rng.c
index 2f9b648..bbfd662 100644
--- a/drivers/char/hw_random/ingenic-rng.c
+++ b/drivers/char/hw_random/ingenic-rng.c
@@ -132,7 +132,7 @@ MODULE_DEVICE_TABLE(of, ingenic_rng_of_match);
static struct platform_driver ingenic_rng_driver = {
.probe = ingenic_rng_probe,
- .remove_new = ingenic_rng_remove,
+ .remove = ingenic_rng_remove,
.driver = {
.name = "ingenic-rng",
.of_match_table = ingenic_rng_of_match,
diff --git a/drivers/char/hw_random/ks-sa-rng.c b/drivers/char/hw_random/ks-sa-rng.c
index 36c3425..d8fd8a3 100644
--- a/drivers/char/hw_random/ks-sa-rng.c
+++ b/drivers/char/hw_random/ks-sa-rng.c
@@ -261,7 +261,7 @@ static struct platform_driver ks_sa_rng_driver = {
.of_match_table = ks_sa_rng_dt_match,
},
.probe = ks_sa_rng_probe,
- .remove_new = ks_sa_rng_remove,
+ .remove = ks_sa_rng_remove,
};
module_platform_driver(ks_sa_rng_driver);
diff --git a/drivers/char/hw_random/mxc-rnga.c b/drivers/char/hw_random/mxc-rnga.c
index f01eb95..e3fcb8bc 100644
--- a/drivers/char/hw_random/mxc-rnga.c
+++ b/drivers/char/hw_random/mxc-rnga.c
@@ -188,7 +188,7 @@ static struct platform_driver mxc_rnga_driver = {
.of_match_table = mxc_rnga_of_match,
},
.probe = mxc_rnga_probe,
- .remove_new = mxc_rnga_remove,
+ .remove = mxc_rnga_remove,
};
module_platform_driver(mxc_rnga_driver);
diff --git a/drivers/char/hw_random/n2-drv.c b/drivers/char/hw_random/n2-drv.c
index 1b49e3a..ea6d5599 100644
--- a/drivers/char/hw_random/n2-drv.c
+++ b/drivers/char/hw_random/n2-drv.c
@@ -858,7 +858,7 @@ static struct platform_driver n2rng_driver = {
.of_match_table = n2rng_match,
},
.probe = n2rng_probe,
- .remove_new = n2rng_remove,
+ .remove = n2rng_remove,
};
module_platform_driver(n2rng_driver);
diff --git a/drivers/char/hw_random/npcm-rng.c b/drivers/char/hw_random/npcm-rng.c
index bce8c48..9ff00f0 100644
--- a/drivers/char/hw_random/npcm-rng.c
+++ b/drivers/char/hw_random/npcm-rng.c
@@ -176,7 +176,7 @@ static struct platform_driver npcm_rng_driver = {
.of_match_table = of_match_ptr(rng_dt_id),
},
.probe = npcm_rng_probe,
- .remove_new = npcm_rng_remove,
+ .remove = npcm_rng_remove,
};
module_platform_driver(npcm_rng_driver);
diff --git a/drivers/char/hw_random/omap-rng.c b/drivers/char/hw_random/omap-rng.c
index 4914a87..5e8b50f 100644
--- a/drivers/char/hw_random/omap-rng.c
+++ b/drivers/char/hw_random/omap-rng.c
@@ -558,7 +558,7 @@ static struct platform_driver omap_rng_driver = {
.of_match_table = of_match_ptr(omap_rng_of_match),
},
.probe = omap_rng_probe,
- .remove_new = omap_rng_remove,
+ .remove = omap_rng_remove,
};
module_platform_driver(omap_rng_driver);
diff --git a/drivers/char/hw_random/stm32-rng.c b/drivers/char/hw_random/stm32-rng.c
index 9d041a6..98edbe7 100644
--- a/drivers/char/hw_random/stm32-rng.c
+++ b/drivers/char/hw_random/stm32-rng.c
@@ -4,6 +4,7 @@
*/
#include <linux/clk.h>
+#include <linux/clk-provider.h>
#include <linux/delay.h>
#include <linux/hw_random.h>
#include <linux/io.h>
@@ -49,6 +50,7 @@
struct stm32_rng_data {
uint max_clock_rate;
+ uint nb_clock;
u32 cr;
u32 nscr;
u32 htcr;
@@ -72,7 +74,7 @@ struct stm32_rng_private {
struct hwrng rng;
struct device *dev;
void __iomem *base;
- struct clk *clk;
+ struct clk_bulk_data *clk_bulk;
struct reset_control *rst;
struct stm32_rng_config pm_conf;
const struct stm32_rng_data *data;
@@ -266,7 +268,7 @@ static uint stm32_rng_clock_freq_restrain(struct hwrng *rng)
unsigned long clock_rate = 0;
uint clock_div = 0;
- clock_rate = clk_get_rate(priv->clk);
+ clock_rate = clk_get_rate(priv->clk_bulk[0].clk);
/*
* Get the exponent to apply on the CLKDIV field in RNG_CR register
@@ -276,7 +278,7 @@ static uint stm32_rng_clock_freq_restrain(struct hwrng *rng)
while ((clock_rate >> clock_div) > priv->data->max_clock_rate)
clock_div++;
- pr_debug("RNG clk rate : %lu\n", clk_get_rate(priv->clk) >> clock_div);
+ pr_debug("RNG clk rate : %lu\n", clk_get_rate(priv->clk_bulk[0].clk) >> clock_div);
return clock_div;
}
@@ -288,7 +290,7 @@ static int stm32_rng_init(struct hwrng *rng)
int err;
u32 reg;
- err = clk_prepare_enable(priv->clk);
+ err = clk_bulk_prepare_enable(priv->data->nb_clock, priv->clk_bulk);
if (err)
return err;
@@ -328,7 +330,7 @@ static int stm32_rng_init(struct hwrng *rng)
(!(reg & RNG_CR_CONDRST)),
10, 50000);
if (err) {
- clk_disable_unprepare(priv->clk);
+ clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk);
dev_err(priv->dev, "%s: timeout %x!\n", __func__, reg);
return -EINVAL;
}
@@ -356,12 +358,13 @@ static int stm32_rng_init(struct hwrng *rng)
reg & RNG_SR_DRDY,
10, 100000);
if (err || (reg & ~RNG_SR_DRDY)) {
- clk_disable_unprepare(priv->clk);
+ clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk);
dev_err(priv->dev, "%s: timeout:%x SR: %x!\n", __func__, err, reg);
+
return -EINVAL;
}
- clk_disable_unprepare(priv->clk);
+ clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk);
return 0;
}
@@ -379,7 +382,8 @@ static int __maybe_unused stm32_rng_runtime_suspend(struct device *dev)
reg = readl_relaxed(priv->base + RNG_CR);
reg &= ~RNG_CR_RNGEN;
writel_relaxed(reg, priv->base + RNG_CR);
- clk_disable_unprepare(priv->clk);
+
+ clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk);
return 0;
}
@@ -389,7 +393,7 @@ static int __maybe_unused stm32_rng_suspend(struct device *dev)
struct stm32_rng_private *priv = dev_get_drvdata(dev);
int err;
- err = clk_prepare_enable(priv->clk);
+ err = clk_bulk_prepare_enable(priv->data->nb_clock, priv->clk_bulk);
if (err)
return err;
@@ -403,7 +407,7 @@ static int __maybe_unused stm32_rng_suspend(struct device *dev)
writel_relaxed(priv->pm_conf.cr, priv->base + RNG_CR);
- clk_disable_unprepare(priv->clk);
+ clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk);
return 0;
}
@@ -414,7 +418,7 @@ static int __maybe_unused stm32_rng_runtime_resume(struct device *dev)
int err;
u32 reg;
- err = clk_prepare_enable(priv->clk);
+ err = clk_bulk_prepare_enable(priv->data->nb_clock, priv->clk_bulk);
if (err)
return err;
@@ -434,7 +438,7 @@ static int __maybe_unused stm32_rng_resume(struct device *dev)
int err;
u32 reg;
- err = clk_prepare_enable(priv->clk);
+ err = clk_bulk_prepare_enable(priv->data->nb_clock, priv->clk_bulk);
if (err)
return err;
@@ -462,7 +466,7 @@ static int __maybe_unused stm32_rng_resume(struct device *dev)
reg & ~RNG_CR_CONDRST, 10, 100000);
if (err) {
- clk_disable_unprepare(priv->clk);
+ clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk);
dev_err(priv->dev, "%s: timeout:%x CR: %x!\n", __func__, err, reg);
return -EINVAL;
}
@@ -472,7 +476,7 @@ static int __maybe_unused stm32_rng_resume(struct device *dev)
writel_relaxed(reg, priv->base + RNG_CR);
}
- clk_disable_unprepare(priv->clk);
+ clk_bulk_disable_unprepare(priv->data->nb_clock, priv->clk_bulk);
return 0;
}
@@ -484,9 +488,19 @@ static const struct dev_pm_ops __maybe_unused stm32_rng_pm_ops = {
stm32_rng_resume)
};
+static const struct stm32_rng_data stm32mp25_rng_data = {
+ .has_cond_reset = true,
+ .max_clock_rate = 48000000,
+ .nb_clock = 2,
+ .cr = 0x00F00D00,
+ .nscr = 0x2B5BB,
+ .htcr = 0x969D,
+};
+
static const struct stm32_rng_data stm32mp13_rng_data = {
.has_cond_reset = true,
.max_clock_rate = 48000000,
+ .nb_clock = 1,
.cr = 0x00F00D00,
.nscr = 0x2B5BB,
.htcr = 0x969D,
@@ -494,11 +508,16 @@ static const struct stm32_rng_data stm32mp13_rng_data = {
static const struct stm32_rng_data stm32_rng_data = {
.has_cond_reset = false,
- .max_clock_rate = 3000000,
+ .max_clock_rate = 48000000,
+ .nb_clock = 1,
};
static const struct of_device_id stm32_rng_match[] = {
{
+ .compatible = "st,stm32mp25-rng",
+ .data = &stm32mp25_rng_data,
+ },
+ {
.compatible = "st,stm32mp13-rng",
.data = &stm32mp13_rng_data,
},
@@ -516,6 +535,7 @@ static int stm32_rng_probe(struct platform_device *ofdev)
struct device_node *np = ofdev->dev.of_node;
struct stm32_rng_private *priv;
struct resource *res;
+ int ret;
priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
if (!priv)
@@ -525,10 +545,6 @@ static int stm32_rng_probe(struct platform_device *ofdev)
if (IS_ERR(priv->base))
return PTR_ERR(priv->base);
- priv->clk = devm_clk_get(&ofdev->dev, NULL);
- if (IS_ERR(priv->clk))
- return PTR_ERR(priv->clk);
-
priv->rst = devm_reset_control_get(&ofdev->dev, NULL);
if (!IS_ERR(priv->rst)) {
reset_control_assert(priv->rst);
@@ -551,6 +567,28 @@ static int stm32_rng_probe(struct platform_device *ofdev)
priv->rng.read = stm32_rng_read;
priv->rng.quality = 900;
+ if (!priv->data->nb_clock || priv->data->nb_clock > 2)
+ return -EINVAL;
+
+ ret = devm_clk_bulk_get_all(dev, &priv->clk_bulk);
+ if (ret != priv->data->nb_clock)
+ return dev_err_probe(dev, -EINVAL, "Failed to get clocks: %d\n", ret);
+
+ if (priv->data->nb_clock == 2) {
+ const char *id = priv->clk_bulk[1].id;
+ struct clk *clk = priv->clk_bulk[1].clk;
+
+ if (!priv->clk_bulk[0].id || !priv->clk_bulk[1].id)
+ return dev_err_probe(dev, -EINVAL, "Missing clock name\n");
+
+ if (strcmp(priv->clk_bulk[0].id, "core")) {
+ priv->clk_bulk[1].id = priv->clk_bulk[0].id;
+ priv->clk_bulk[1].clk = priv->clk_bulk[0].clk;
+ priv->clk_bulk[0].id = id;
+ priv->clk_bulk[0].clk = clk;
+ }
+ }
+
pm_runtime_set_autosuspend_delay(dev, 100);
pm_runtime_use_autosuspend(dev);
pm_runtime_enable(dev);
@@ -565,7 +603,7 @@ static struct platform_driver stm32_rng_driver = {
.of_match_table = stm32_rng_match,
},
.probe = stm32_rng_probe,
- .remove_new = stm32_rng_remove,
+ .remove = stm32_rng_remove,
};
module_platform_driver(stm32_rng_driver);
diff --git a/drivers/char/hw_random/timeriomem-rng.c b/drivers/char/hw_random/timeriomem-rng.c
index 65b8260..7174bfc 100644
--- a/drivers/char/hw_random/timeriomem-rng.c
+++ b/drivers/char/hw_random/timeriomem-rng.c
@@ -193,7 +193,7 @@ static struct platform_driver timeriomem_rng_driver = {
.of_match_table = timeriomem_rng_match,
},
.probe = timeriomem_rng_probe,
- .remove_new = timeriomem_rng_remove,
+ .remove = timeriomem_rng_remove,
};
module_platform_driver(timeriomem_rng_driver);
diff --git a/drivers/char/hw_random/xgene-rng.c b/drivers/char/hw_random/xgene-rng.c
index 642d135..39acaa5 100644
--- a/drivers/char/hw_random/xgene-rng.c
+++ b/drivers/char/hw_random/xgene-rng.c
@@ -375,7 +375,7 @@ MODULE_DEVICE_TABLE(of, xgene_rng_of_match);
static struct platform_driver xgene_rng_driver = {
.probe = xgene_rng_probe,
- .remove_new = xgene_rng_remove,
+ .remove = xgene_rng_remove,
.driver = {
.name = "xgene-rng",
.of_match_table = xgene_rng_of_match,
diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c
index cad0048..e49a19f 100644
--- a/drivers/char/tpm/tpm-buf.c
+++ b/drivers/char/tpm/tpm-buf.c
@@ -147,6 +147,26 @@ void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value)
EXPORT_SYMBOL_GPL(tpm_buf_append_u32);
/**
+ * tpm_buf_append_handle() - Add a handle
+ * @chip: &tpm_chip instance
+ * @buf: &tpm_buf instance
+ * @handle: a TPM object handle
+ *
+ * Add a handle to the buffer, and increase the count tracking the number of
+ * handles in the command buffer. Works only for command buffers.
+ */
+void tpm_buf_append_handle(struct tpm_chip *chip, struct tpm_buf *buf, u32 handle)
+{
+ if (buf->flags & TPM_BUF_TPM2B) {
+ dev_err(&chip->dev, "Invalid buffer type (TPM2B)\n");
+ return;
+ }
+
+ tpm_buf_append_u32(buf, handle);
+ buf->handles++;
+}
+
+/**
* tpm_buf_read() - Read from a TPM buffer
* @buf: &tpm_buf instance
* @offset: offset within the buffer
diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index 1e85625..dfdcbd0 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -14,6 +14,10 @@
#include "tpm.h"
#include <crypto/hash_info.h>
+static bool disable_pcr_integrity;
+module_param(disable_pcr_integrity, bool, 0444);
+MODULE_PARM_DESC(disable_pcr_integrity, "Disable integrity protection of TPM2_PCR_Extend");
+
static struct tpm2_hash tpm2_hash_map[] = {
{HASH_ALGO_SHA1, TPM_ALG_SHA1},
{HASH_ALGO_SHA256, TPM_ALG_SHA256},
@@ -232,18 +236,26 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
int rc;
int i;
- rc = tpm2_start_auth_session(chip);
- if (rc)
- return rc;
+ if (!disable_pcr_integrity) {
+ rc = tpm2_start_auth_session(chip);
+ if (rc)
+ return rc;
+ }
rc = tpm_buf_init(&buf, TPM2_ST_SESSIONS, TPM2_CC_PCR_EXTEND);
if (rc) {
- tpm2_end_auth_session(chip);
+ if (!disable_pcr_integrity)
+ tpm2_end_auth_session(chip);
return rc;
}
- tpm_buf_append_name(chip, &buf, pcr_idx, NULL);
- tpm_buf_append_hmac_session(chip, &buf, 0, NULL, 0);
+ if (!disable_pcr_integrity) {
+ tpm_buf_append_name(chip, &buf, pcr_idx, NULL);
+ tpm_buf_append_hmac_session(chip, &buf, 0, NULL, 0);
+ } else {
+ tpm_buf_append_handle(chip, &buf, pcr_idx);
+ tpm_buf_append_auth(chip, &buf, 0, NULL, 0);
+ }
tpm_buf_append_u32(&buf, chip->nr_allocated_banks);
@@ -253,9 +265,11 @@ int tpm2_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
chip->allocated_banks[i].digest_size);
}
- tpm_buf_fill_hmac_session(chip, &buf);
+ if (!disable_pcr_integrity)
+ tpm_buf_fill_hmac_session(chip, &buf);
rc = tpm_transmit_cmd(chip, &buf, 0, "attempting extend a PCR value");
- rc = tpm_buf_check_hmac_response(chip, &buf, rc);
+ if (!disable_pcr_integrity)
+ rc = tpm_buf_check_hmac_response(chip, &buf, rc);
tpm_buf_destroy(&buf);
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index 0739830..b0f13c8 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -237,9 +237,7 @@ void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
#endif
if (!tpm2_chip_auth(chip)) {
- tpm_buf_append_u32(buf, handle);
- /* count the number of handles in the upper bits of flags */
- buf->handles++;
+ tpm_buf_append_handle(chip, buf, handle);
return;
}
@@ -272,6 +270,31 @@ void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
}
EXPORT_SYMBOL_GPL(tpm_buf_append_name);
+void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf,
+ u8 attributes, u8 *passphrase, int passphrase_len)
+{
+ /* offset tells us where the sessions area begins */
+ int offset = buf->handles * 4 + TPM_HEADER_SIZE;
+ u32 len = 9 + passphrase_len;
+
+ if (tpm_buf_length(buf) != offset) {
+ /* not the first session so update the existing length */
+ len += get_unaligned_be32(&buf->data[offset]);
+ put_unaligned_be32(len, &buf->data[offset]);
+ } else {
+ tpm_buf_append_u32(buf, len);
+ }
+ /* auth handle */
+ tpm_buf_append_u32(buf, TPM2_RS_PW);
+ /* nonce */
+ tpm_buf_append_u16(buf, 0);
+ /* attributes */
+ tpm_buf_append_u8(buf, 0);
+ /* passphrase */
+ tpm_buf_append_u16(buf, passphrase_len);
+ tpm_buf_append(buf, passphrase, passphrase_len);
+}
+
/**
* tpm_buf_append_hmac_session() - Append a TPM session element
* @chip: the TPM chip structure
@@ -309,26 +332,8 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
#endif
if (!tpm2_chip_auth(chip)) {
- /* offset tells us where the sessions area begins */
- int offset = buf->handles * 4 + TPM_HEADER_SIZE;
- u32 len = 9 + passphrase_len;
-
- if (tpm_buf_length(buf) != offset) {
- /* not the first session so update the existing length */
- len += get_unaligned_be32(&buf->data[offset]);
- put_unaligned_be32(len, &buf->data[offset]);
- } else {
- tpm_buf_append_u32(buf, len);
- }
- /* auth handle */
- tpm_buf_append_u32(buf, TPM2_RS_PW);
- /* nonce */
- tpm_buf_append_u16(buf, 0);
- /* attributes */
- tpm_buf_append_u8(buf, 0);
- /* passphrase */
- tpm_buf_append_u16(buf, passphrase_len);
- tpm_buf_append(buf, passphrase, passphrase_len);
+ tpm_buf_append_auth(chip, buf, attributes, passphrase,
+ passphrase_len);
return;
}
@@ -948,10 +953,13 @@ static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
/* Deduce from the name change TPM interference: */
dev_err(&chip->dev, "null key integrity check failed\n");
tpm2_flush_context(chip, tmp_null_key);
- chip->flags |= TPM_CHIP_FLAG_DISABLE;
err:
- return rc ? -ENODEV : 0;
+ if (rc) {
+ chip->flags |= TPM_CHIP_FLAG_DISABLE;
+ rc = -ENODEV;
+ }
+ return rc;
}
/**
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 0f04feb..c9ebacf 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -73,20 +73,17 @@ static unsigned int acpi_pstate_strict;
static bool boost_state(unsigned int cpu)
{
- u32 lo, hi;
u64 msr;
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
case X86_VENDOR_CENTAUR:
case X86_VENDOR_ZHAOXIN:
- rdmsr_on_cpu(cpu, MSR_IA32_MISC_ENABLE, &lo, &hi);
- msr = lo | ((u64)hi << 32);
+ rdmsrl_on_cpu(cpu, MSR_IA32_MISC_ENABLE, &msr);
return !(msr & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
case X86_VENDOR_HYGON:
case X86_VENDOR_AMD:
- rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
- msr = lo | ((u64)hi << 32);
+ rdmsrl_on_cpu(cpu, MSR_K7_HWCR, &msr);
return !(msr & MSR_K7_HWCR_CPB_DIS);
}
return false;
@@ -1028,7 +1025,7 @@ static struct platform_driver acpi_cpufreq_platdrv = {
.driver = {
.name = "acpi-cpufreq",
},
- .remove_new = acpi_cpufreq_remove,
+ .remove = acpi_cpufreq_remove,
};
static int __init acpi_cpufreq_init(void)
diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c
index f667015..a261d73 100644
--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -227,10 +227,10 @@ static void amd_pstate_ut_check_freq(u32 index)
goto skip_test;
}
- if (cpudata->min_freq != policy->min) {
+ if (cpudata->lowest_nonlinear_freq != policy->min) {
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL;
- pr_err("%s cpu%d cpudata_min_freq=%d policy_min=%d, they should be equal!\n",
- __func__, cpu, cpudata->min_freq, policy->min);
+ pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n",
+ __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min);
goto skip_test;
}
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index b63863f..f834cc8 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -233,7 +233,7 @@ static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata)
return index;
}
-static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
+static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
u32 des_perf, u32 max_perf, bool fast_switch)
{
if (fast_switch)
@@ -243,7 +243,7 @@ static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf,
READ_ONCE(cpudata->cppc_req_cached));
}
-DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf);
+DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf);
static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata,
u32 min_perf, u32 des_perf,
@@ -306,11 +306,17 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata,
return ret;
}
-static inline int pstate_enable(bool enable)
+static inline int msr_cppc_enable(bool enable)
{
int ret, cpu;
unsigned long logical_proc_id_mask = 0;
+ /*
+ * MSR_AMD_CPPC_ENABLE is write-once, once set it cannot be cleared.
+ */
+ if (!enable)
+ return 0;
+
if (enable == cppc_enabled)
return 0;
@@ -332,7 +338,7 @@ static inline int pstate_enable(bool enable)
return 0;
}
-static int cppc_enable(bool enable)
+static int shmem_cppc_enable(bool enable)
{
int cpu, ret = 0;
struct cppc_perf_ctrls perf_ctrls;
@@ -359,14 +365,14 @@ static int cppc_enable(bool enable)
return ret;
}
-DEFINE_STATIC_CALL(amd_pstate_enable, pstate_enable);
+DEFINE_STATIC_CALL(amd_pstate_cppc_enable, msr_cppc_enable);
-static inline int amd_pstate_enable(bool enable)
+static inline int amd_pstate_cppc_enable(bool enable)
{
- return static_call(amd_pstate_enable)(enable);
+ return static_call(amd_pstate_cppc_enable)(enable);
}
-static int pstate_init_perf(struct amd_cpudata *cpudata)
+static int msr_init_perf(struct amd_cpudata *cpudata)
{
u64 cap1;
@@ -385,7 +391,7 @@ static int pstate_init_perf(struct amd_cpudata *cpudata)
return 0;
}
-static int cppc_init_perf(struct amd_cpudata *cpudata)
+static int shmem_init_perf(struct amd_cpudata *cpudata)
{
struct cppc_perf_caps cppc_perf;
@@ -420,14 +426,14 @@ static int cppc_init_perf(struct amd_cpudata *cpudata)
return ret;
}
-DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf);
+DEFINE_STATIC_CALL(amd_pstate_init_perf, msr_init_perf);
static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata)
{
return static_call(amd_pstate_init_perf)(cpudata);
}
-static void cppc_update_perf(struct amd_cpudata *cpudata,
+static void shmem_update_perf(struct amd_cpudata *cpudata,
u32 min_perf, u32 des_perf,
u32 max_perf, bool fast_switch)
{
@@ -527,9 +533,28 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
cpufreq_cpu_put(policy);
}
-static int amd_pstate_verify(struct cpufreq_policy_data *policy)
+static int amd_pstate_verify(struct cpufreq_policy_data *policy_data)
{
- cpufreq_verify_within_cpu_limits(policy);
+ /*
+ * Initialize lower frequency limit (i.e.policy->min) with
+ * lowest_nonlinear_frequency which is the most energy efficient
+ * frequency. Override the initial value set by cpufreq core and
+ * amd-pstate qos_requests.
+ */
+ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) {
+ struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu);
+ struct amd_cpudata *cpudata;
+
+ if (!policy)
+ return -EINVAL;
+
+ cpudata = policy->driver_data;
+ policy_data->min = cpudata->lowest_nonlinear_freq;
+ cpufreq_cpu_put(policy);
+ }
+
+ cpufreq_verify_within_cpu_limits(policy_data);
+ pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min);
return 0;
}
@@ -665,34 +690,12 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on)
{
struct amd_cpudata *cpudata = policy->driver_data;
- struct cppc_perf_ctrls perf_ctrls;
- u32 highest_perf, nominal_perf, nominal_freq, max_freq;
+ u32 nominal_freq, max_freq;
int ret = 0;
- highest_perf = READ_ONCE(cpudata->highest_perf);
- nominal_perf = READ_ONCE(cpudata->nominal_perf);
nominal_freq = READ_ONCE(cpudata->nominal_freq);
max_freq = READ_ONCE(cpudata->max_freq);
- if (boot_cpu_has(X86_FEATURE_CPPC)) {
- u64 value = READ_ONCE(cpudata->cppc_req_cached);
-
- value &= ~GENMASK_ULL(7, 0);
- value |= on ? highest_perf : nominal_perf;
- WRITE_ONCE(cpudata->cppc_req_cached, value);
-
- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
- } else {
- perf_ctrls.max_perf = on ? highest_perf : nominal_perf;
- ret = cppc_set_perf(cpudata->cpu, &perf_ctrls);
- if (ret) {
- cpufreq_cpu_release(policy);
- pr_debug("Failed to set max perf on CPU:%d. ret:%d\n",
- cpudata->cpu, ret);
- return ret;
- }
- }
-
if (on)
policy->cpuinfo.max_freq = max_freq;
else if (policy->cpuinfo.max_freq > nominal_freq * 1000)
@@ -1001,7 +1004,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
policy->fast_switch_possible = true;
ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0],
- FREQ_QOS_MIN, policy->cpuinfo.min_freq);
+ FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE);
if (ret < 0) {
dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
goto free_cpudata1;
@@ -1045,7 +1048,7 @@ static int amd_pstate_cpu_resume(struct cpufreq_policy *policy)
{
int ret;
- ret = amd_pstate_enable(true);
+ ret = amd_pstate_cppc_enable(true);
if (ret)
pr_err("failed to enable amd-pstate during resume, return %d\n", ret);
@@ -1056,7 +1059,7 @@ static int amd_pstate_cpu_suspend(struct cpufreq_policy *policy)
{
int ret;
- ret = amd_pstate_enable(false);
+ ret = amd_pstate_cppc_enable(false);
if (ret)
pr_err("failed to disable amd-pstate during suspend, return %d\n", ret);
@@ -1189,25 +1192,41 @@ static ssize_t show_energy_performance_preference(
static void amd_pstate_driver_cleanup(void)
{
- amd_pstate_enable(false);
+ amd_pstate_cppc_enable(false);
cppc_state = AMD_PSTATE_DISABLE;
current_pstate_driver = NULL;
}
+static int amd_pstate_set_driver(int mode_idx)
+{
+ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
+ cppc_state = mode_idx;
+ if (cppc_state == AMD_PSTATE_DISABLE)
+ pr_info("driver is explicitly disabled\n");
+
+ if (cppc_state == AMD_PSTATE_ACTIVE)
+ current_pstate_driver = &amd_pstate_epp_driver;
+
+ if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
+ current_pstate_driver = &amd_pstate_driver;
+
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
static int amd_pstate_register_driver(int mode)
{
int ret;
- if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED)
- current_pstate_driver = &amd_pstate_driver;
- else if (mode == AMD_PSTATE_ACTIVE)
- current_pstate_driver = &amd_pstate_epp_driver;
- else
- return -EINVAL;
+ ret = amd_pstate_set_driver(mode);
+ if (ret)
+ return ret;
cppc_state = mode;
- ret = amd_pstate_enable(true);
+ ret = amd_pstate_cppc_enable(true);
if (ret) {
pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n",
ret);
@@ -1485,6 +1504,8 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
WRITE_ONCE(cpudata->cppc_cap1_cached, value);
}
+ current_pstate_driver->adjust_perf = NULL;
+
return 0;
free_cpudata1:
@@ -1507,26 +1528,13 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy)
static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
- u32 max_perf, min_perf, min_limit_perf, max_limit_perf;
+ u32 max_perf, min_perf;
u64 value;
s16 epp;
- if (cpudata->boost_supported && !policy->boost_enabled)
- max_perf = READ_ONCE(cpudata->nominal_perf);
- else
- max_perf = READ_ONCE(cpudata->highest_perf);
+ max_perf = READ_ONCE(cpudata->highest_perf);
min_perf = READ_ONCE(cpudata->lowest_perf);
- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq);
- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq);
-
- if (min_limit_perf < min_perf)
- min_limit_perf = min_perf;
-
- if (max_limit_perf < min_limit_perf)
- max_limit_perf = min_limit_perf;
-
- WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf);
- WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf);
+ amd_pstate_update_min_max_limit(policy);
max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf,
cpudata->max_limit_perf);
@@ -1535,7 +1543,7 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
value = READ_ONCE(cpudata->cppc_req_cached);
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
- min_perf = max_perf;
+ min_perf = min(cpudata->nominal_perf, max_perf);
/* Initial min/max values for CPPC Performance Controls Register */
value &= ~AMD_CPPC_MIN_PERF(~0L);
@@ -1563,12 +1571,6 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
epp = 0;
- /* Set initial EPP value */
- if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
- value &= ~GENMASK_ULL(31, 24);
- value |= (u64)epp << 24;
- }
-
WRITE_ONCE(cpudata->cppc_req_cached, value);
return amd_pstate_set_epp(cpudata, epp);
}
@@ -1605,7 +1607,7 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata)
u64 value, max_perf;
int ret;
- ret = amd_pstate_enable(true);
+ ret = amd_pstate_cppc_enable(true);
if (ret)
pr_err("failed to enable amd pstate during resume, return %d\n", ret);
@@ -1616,8 +1618,9 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata)
wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
} else {
perf_ctrls.max_perf = max_perf;
- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached);
cppc_set_perf(cpudata->cpu, &perf_ctrls);
+ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached);
+ cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1);
}
}
@@ -1657,9 +1660,11 @@ static void amd_pstate_epp_offline(struct cpufreq_policy *policy)
wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
} else {
perf_ctrls.desired_perf = 0;
+ perf_ctrls.min_perf = min_perf;
perf_ctrls.max_perf = min_perf;
- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE);
cppc_set_perf(cpudata->cpu, &perf_ctrls);
+ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE);
+ cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1);
}
mutex_unlock(&amd_pstate_limits_lock);
}
@@ -1679,13 +1684,6 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy)
return 0;
}
-static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy)
-{
- cpufreq_verify_within_cpu_limits(policy);
- pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min);
- return 0;
-}
-
static int amd_pstate_epp_suspend(struct cpufreq_policy *policy)
{
struct amd_cpudata *cpudata = policy->driver_data;
@@ -1699,7 +1697,7 @@ static int amd_pstate_epp_suspend(struct cpufreq_policy *policy)
cpudata->suspended = true;
/* disable CPPC in lowlevel firmware */
- ret = amd_pstate_enable(false);
+ ret = amd_pstate_cppc_enable(false);
if (ret)
pr_err("failed to suspend, return %d\n", ret);
@@ -1741,7 +1739,7 @@ static struct cpufreq_driver amd_pstate_driver = {
static struct cpufreq_driver amd_pstate_epp_driver = {
.flags = CPUFREQ_CONST_LOOPS,
- .verify = amd_pstate_epp_verify_policy,
+ .verify = amd_pstate_verify,
.setpolicy = amd_pstate_epp_set_policy,
.init = amd_pstate_epp_cpu_init,
.exit = amd_pstate_epp_cpu_exit,
@@ -1755,26 +1753,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
.attr = amd_pstate_epp_attr,
};
-static int __init amd_pstate_set_driver(int mode_idx)
-{
- if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
- cppc_state = mode_idx;
- if (cppc_state == AMD_PSTATE_DISABLE)
- pr_info("driver is explicitly disabled\n");
-
- if (cppc_state == AMD_PSTATE_ACTIVE)
- current_pstate_driver = &amd_pstate_epp_driver;
-
- if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
- current_pstate_driver = &amd_pstate_driver;
-
- return 0;
- }
-
- return -EINVAL;
-}
-
-/**
+/*
* CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F.
* show the debug message that helps to check if the CPU has CPPC support for loading issue.
*/
@@ -1864,10 +1843,10 @@ static int __init amd_pstate_init(void)
if (cppc_state == AMD_PSTATE_UNDEFINED) {
/* Disable on the following configs by default:
* 1. Undefined platforms
- * 2. Server platforms
+ * 2. Server platforms with CPUs older than Family 0x1A.
*/
if (amd_pstate_acpi_pm_profile_undefined() ||
- amd_pstate_acpi_pm_profile_server()) {
+ (amd_pstate_acpi_pm_profile_server() && boot_cpu_data.x86 < 0x1A)) {
pr_info("driver load is disabled, boot with specific mode to enable this\n");
return -ENODEV;
}
@@ -1875,31 +1854,25 @@ static int __init amd_pstate_init(void)
cppc_state = CONFIG_X86_AMD_PSTATE_DEFAULT_MODE;
}
- switch (cppc_state) {
- case AMD_PSTATE_DISABLE:
+ if (cppc_state == AMD_PSTATE_DISABLE) {
pr_info("driver load is disabled, boot with specific mode to enable this\n");
return -ENODEV;
- case AMD_PSTATE_PASSIVE:
- case AMD_PSTATE_ACTIVE:
- case AMD_PSTATE_GUIDED:
- ret = amd_pstate_set_driver(cppc_state);
- if (ret)
- return ret;
- break;
- default:
- return -EINVAL;
}
/* capability check */
if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
pr_debug("AMD CPPC MSR based functionality is supported\n");
- if (cppc_state != AMD_PSTATE_ACTIVE)
- current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
} else {
pr_debug("AMD CPPC shared memory based functionality is supported\n");
- static_call_update(amd_pstate_enable, cppc_enable);
- static_call_update(amd_pstate_init_perf, cppc_init_perf);
- static_call_update(amd_pstate_update_perf, cppc_update_perf);
+ static_call_update(amd_pstate_cppc_enable, shmem_cppc_enable);
+ static_call_update(amd_pstate_init_perf, shmem_init_perf);
+ static_call_update(amd_pstate_update_perf, shmem_update_perf);
+ }
+
+ ret = amd_pstate_register_driver(cppc_state);
+ if (ret) {
+ pr_err("failed to register with return %d\n", ret);
+ return ret;
}
if (amd_pstate_prefcore) {
@@ -1908,19 +1881,6 @@ static int __init amd_pstate_init(void)
return ret;
}
- /* enable amd pstate feature */
- ret = amd_pstate_enable(true);
- if (ret) {
- pr_err("failed to enable driver mode(%d)\n", cppc_state);
- return ret;
- }
-
- ret = cpufreq_register_driver(current_pstate_driver);
- if (ret) {
- pr_err("failed to register with return %d\n", ret);
- goto disable_driver;
- }
-
dev_root = bus_get_dev_root(&cpu_subsys);
if (dev_root) {
ret = sysfs_create_group(&dev_root->kobj, &amd_pstate_global_attr_group);
@@ -1935,8 +1895,7 @@ static int __init amd_pstate_init(void)
global_attr_free:
cpufreq_unregister_driver(current_pstate_driver);
-disable_driver:
- amd_pstate_enable(false);
+ amd_pstate_cppc_enable(false);
return ret;
}
device_initcall(amd_pstate_init);
diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c
index ea84385..5d03a29 100644
--- a/drivers/cpufreq/brcmstb-avs-cpufreq.c
+++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c
@@ -777,7 +777,7 @@ static struct platform_driver brcm_avs_cpufreq_platdrv = {
.of_match_table = brcm_avs_cpufreq_match,
},
.probe = brcm_avs_cpufreq_probe,
- .remove_new = brcm_avs_cpufreq_remove,
+ .remove = brcm_avs_cpufreq_remove,
};
module_platform_driver(brcm_avs_cpufreq_platdrv);
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 9834433..3a7c337 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -345,7 +345,7 @@ static struct platform_driver dt_cpufreq_platdrv = {
.name = "cpufreq-dt",
},
.probe = dt_cpufreq_probe,
- .remove_new = dt_cpufreq_remove,
+ .remove = dt_cpufreq_remove,
};
module_platform_driver(dt_cpufreq_platdrv);
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index f98c943..1a4cae54 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1520,7 +1520,7 @@ static int cpufreq_online(unsigned int cpu)
* frequency for longer duration. Hence, a BUG_ON().
*/
BUG_ON(ret);
- pr_info("%s: CPU%d: Running at unlisted initial frequency: %u KHz, changing to: %u KHz\n",
+ pr_info("%s: CPU%d: Running at unlisted initial frequency: %u kHz, changing to: %u kHz\n",
__func__, policy->cpu, old_freq, policy->cur);
}
}
diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c
index 7d27544..8736be3 100644
--- a/drivers/cpufreq/davinci-cpufreq.c
+++ b/drivers/cpufreq/davinci-cpufreq.c
@@ -145,7 +145,7 @@ static struct platform_driver davinci_cpufreq_driver = {
.driver = {
.name = "cpufreq-davinci",
},
- .remove_new = __exit_p(davinci_cpufreq_remove),
+ .remove = __exit_p(davinci_cpufreq_remove),
};
int __init davinci_cpufreq_init(void)
diff --git a/drivers/cpufreq/imx-cpufreq-dt.c b/drivers/cpufreq/imx-cpufreq-dt.c
index 577bb9e..1492c92 100644
--- a/drivers/cpufreq/imx-cpufreq-dt.c
+++ b/drivers/cpufreq/imx-cpufreq-dt.c
@@ -183,7 +183,7 @@ static void imx_cpufreq_dt_remove(struct platform_device *pdev)
static struct platform_driver imx_cpufreq_dt_driver = {
.probe = imx_cpufreq_dt_probe,
- .remove_new = imx_cpufreq_dt_remove,
+ .remove = imx_cpufreq_dt_remove,
.driver = {
.name = "imx-cpufreq-dt",
},
diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c
index c20d3ec..f3c99f3 100644
--- a/drivers/cpufreq/imx6q-cpufreq.c
+++ b/drivers/cpufreq/imx6q-cpufreq.c
@@ -522,7 +522,7 @@ static struct platform_driver imx6q_cpufreq_platdrv = {
.name = "imx6q-cpufreq",
},
.probe = imx6q_cpufreq_probe,
- .remove_new = imx6q_cpufreq_remove,
+ .remove = imx6q_cpufreq_remove,
};
module_platform_driver(imx6q_cpufreq_platdrv);
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index cd2ac1b..b8e2396 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1028,26 +1028,29 @@ static void hybrid_update_cpu_capacity_scaling(void)
}
}
-static void __hybrid_init_cpu_capacity_scaling(void)
+static void __hybrid_refresh_cpu_capacity_scaling(void)
{
hybrid_max_perf_cpu = NULL;
hybrid_update_cpu_capacity_scaling();
}
+static void hybrid_refresh_cpu_capacity_scaling(void)
+{
+ guard(mutex)(&hybrid_capacity_lock);
+
+ __hybrid_refresh_cpu_capacity_scaling();
+}
+
static void hybrid_init_cpu_capacity_scaling(bool refresh)
{
- bool disable_itmt = false;
-
- mutex_lock(&hybrid_capacity_lock);
-
/*
* If hybrid_max_perf_cpu is set at this point, the hybrid CPU capacity
* scaling has been enabled already and the driver is just changing the
* operation mode.
*/
if (refresh) {
- __hybrid_init_cpu_capacity_scaling();
- goto unlock;
+ hybrid_refresh_cpu_capacity_scaling();
+ return;
}
/*
@@ -1056,19 +1059,13 @@ static void hybrid_init_cpu_capacity_scaling(bool refresh)
* do not do that when SMT is in use.
*/
if (hwp_is_hybrid && !sched_smt_active() && arch_enable_hybrid_capacity_scale()) {
- __hybrid_init_cpu_capacity_scaling();
- disable_itmt = true;
- }
-
-unlock:
- mutex_unlock(&hybrid_capacity_lock);
-
- /*
- * Disabling ITMT causes sched domains to be rebuilt to disable asym
- * packing and enable asym capacity.
- */
- if (disable_itmt)
+ hybrid_refresh_cpu_capacity_scaling();
+ /*
+ * Disabling ITMT causes sched domains to be rebuilt to disable asym
+ * packing and enable asym capacity.
+ */
sched_clear_itmt_support();
+ }
}
static bool hybrid_clear_max_perf_cpu(void)
@@ -1404,7 +1401,7 @@ static void intel_pstate_update_limits_for_all(void)
mutex_lock(&hybrid_capacity_lock);
if (hybrid_max_perf_cpu)
- __hybrid_init_cpu_capacity_scaling();
+ __hybrid_refresh_cpu_capacity_scaling();
mutex_unlock(&hybrid_capacity_lock);
}
@@ -3658,6 +3655,8 @@ static const struct x86_cpu_id intel_epp_default[] = {
X86_MATCH_VFM(INTEL_ALDERLAKE_L, HWP_SET_DEF_BALANCE_PERF_EPP(102)),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, HWP_SET_DEF_BALANCE_PERF_EPP(32)),
X86_MATCH_VFM(INTEL_METEORLAKE_L, HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE,
179, 64, 16)),
X86_MATCH_VFM(INTEL_ARROWLAKE, HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE,
diff --git a/drivers/cpufreq/kirkwood-cpufreq.c b/drivers/cpufreq/kirkwood-cpufreq.c
index fd20b98..312f265 100644
--- a/drivers/cpufreq/kirkwood-cpufreq.c
+++ b/drivers/cpufreq/kirkwood-cpufreq.c
@@ -189,7 +189,7 @@ static void kirkwood_cpufreq_remove(struct platform_device *pdev)
static struct platform_driver kirkwood_cpufreq_platform_driver = {
.probe = kirkwood_cpufreq_probe,
- .remove_new = kirkwood_cpufreq_remove,
+ .remove = kirkwood_cpufreq_remove,
.driver = {
.name = "kirkwood-cpufreq",
},
diff --git a/drivers/cpufreq/loongson3_cpufreq.c b/drivers/cpufreq/loongson3_cpufreq.c
index 6b5e679..61ebebf 100644
--- a/drivers/cpufreq/loongson3_cpufreq.c
+++ b/drivers/cpufreq/loongson3_cpufreq.c
@@ -386,7 +386,7 @@ static struct platform_driver loongson3_platform_driver = {
},
.id_table = cpufreq_id_table,
.probe = loongson3_cpufreq_probe,
- .remove_new = loongson3_cpufreq_remove,
+ .remove = loongson3_cpufreq_remove,
};
module_platform_driver(loongson3_platform_driver);
diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c
index 8925e09..f7db5f4 100644
--- a/drivers/cpufreq/mediatek-cpufreq-hw.c
+++ b/drivers/cpufreq/mediatek-cpufreq-hw.c
@@ -344,7 +344,7 @@ MODULE_DEVICE_TABLE(of, mtk_cpufreq_hw_match);
static struct platform_driver mtk_cpufreq_hw_driver = {
.probe = mtk_cpufreq_hw_driver_probe,
- .remove_new = mtk_cpufreq_hw_driver_remove,
+ .remove = mtk_cpufreq_hw_driver_remove,
.driver = {
.name = "mtk-cpufreq-hw",
.of_match_table = mtk_cpufreq_hw_match,
diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c
index de8be0a..106220c 100644
--- a/drivers/cpufreq/omap-cpufreq.c
+++ b/drivers/cpufreq/omap-cpufreq.c
@@ -188,7 +188,7 @@ static struct platform_driver omap_cpufreq_platdrv = {
.name = "omap-cpufreq",
},
.probe = omap_cpufreq_probe,
- .remove_new = omap_cpufreq_remove,
+ .remove = omap_cpufreq_remove,
};
module_platform_driver(omap_cpufreq_platdrv);
diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c
index 771efbf..ac2e90a 100644
--- a/drivers/cpufreq/pcc-cpufreq.c
+++ b/drivers/cpufreq/pcc-cpufreq.c
@@ -615,7 +615,7 @@ static struct platform_driver pcc_cpufreq_platdrv = {
.driver = {
.name = "pcc-cpufreq",
},
- .remove_new = pcc_cpufreq_remove,
+ .remove = pcc_cpufreq_remove,
};
static int __init pcc_cpufreq_init(void)
diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index 900d684..9812956 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -736,7 +736,7 @@ static void qcom_cpufreq_hw_driver_remove(struct platform_device *pdev)
static struct platform_driver qcom_cpufreq_hw_driver = {
.probe = qcom_cpufreq_hw_driver_probe,
- .remove_new = qcom_cpufreq_hw_driver_remove,
+ .remove = qcom_cpufreq_hw_driver_remove,
.driver = {
.name = "qcom-cpufreq-hw",
.of_match_table = qcom_cpufreq_hw_match,
diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c
index 703308f..08e518c 100644
--- a/drivers/cpufreq/qcom-cpufreq-nvmem.c
+++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c
@@ -604,7 +604,7 @@ static DEFINE_SIMPLE_DEV_PM_OPS(qcom_cpufreq_pm_ops, qcom_cpufreq_suspend, NULL)
static struct platform_driver qcom_cpufreq_driver = {
.probe = qcom_cpufreq_probe,
- .remove_new = qcom_cpufreq_remove,
+ .remove = qcom_cpufreq_remove,
.driver = {
.name = "qcom-cpufreq-nvmem",
.pm = pm_sleep_ptr(&qcom_cpufreq_pm_ops),
diff --git a/drivers/cpufreq/qoriq-cpufreq.c b/drivers/cpufreq/qoriq-cpufreq.c
index 3519bf3..a37ce05 100644
--- a/drivers/cpufreq/qoriq-cpufreq.c
+++ b/drivers/cpufreq/qoriq-cpufreq.c
@@ -296,7 +296,7 @@ static struct platform_driver qoriq_cpufreq_platform_driver = {
.name = "qoriq-cpufreq",
},
.probe = qoriq_cpufreq_probe,
- .remove_new = qoriq_cpufreq_remove,
+ .remove = qoriq_cpufreq_remove,
};
module_platform_driver(qoriq_cpufreq_platform_driver);
diff --git a/drivers/cpufreq/raspberrypi-cpufreq.c b/drivers/cpufreq/raspberrypi-cpufreq.c
index e0705cc..5050932 100644
--- a/drivers/cpufreq/raspberrypi-cpufreq.c
+++ b/drivers/cpufreq/raspberrypi-cpufreq.c
@@ -85,7 +85,7 @@ static struct platform_driver raspberrypi_cpufreq_driver = {
.name = "raspberrypi-cpufreq",
},
.probe = raspberrypi_cpufreq_probe,
- .remove_new = raspberrypi_cpufreq_remove,
+ .remove = raspberrypi_cpufreq_remove,
};
module_platform_driver(raspberrypi_cpufreq_driver);
diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c
index 8d73e6e..cd89c1b 100644
--- a/drivers/cpufreq/scpi-cpufreq.c
+++ b/drivers/cpufreq/scpi-cpufreq.c
@@ -217,7 +217,7 @@ static struct platform_driver scpi_cpufreq_platdrv = {
.name = "scpi-cpufreq",
},
.probe = scpi_cpufreq_probe,
- .remove_new = scpi_cpufreq_remove,
+ .remove = scpi_cpufreq_remove,
};
module_platform_driver(scpi_cpufreq_platdrv);
diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c
index 293921a..352e1a6 100644
--- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c
+++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c
@@ -283,7 +283,7 @@ static void sun50i_cpufreq_nvmem_remove(struct platform_device *pdev)
static struct platform_driver sun50i_cpufreq_driver = {
.probe = sun50i_cpufreq_nvmem_probe,
- .remove_new = sun50i_cpufreq_nvmem_remove,
+ .remove = sun50i_cpufreq_nvmem_remove,
.driver = {
.name = "sun50i-cpufreq-nvmem",
},
diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c
index 7b8fcfa..c7761eb 100644
--- a/drivers/cpufreq/tegra186-cpufreq.c
+++ b/drivers/cpufreq/tegra186-cpufreq.c
@@ -276,7 +276,7 @@ static struct platform_driver tegra186_cpufreq_platform_driver = {
.of_match_table = tegra186_cpufreq_of_match,
},
.probe = tegra186_cpufreq_probe,
- .remove_new = tegra186_cpufreq_remove,
+ .remove = tegra186_cpufreq_remove,
};
module_platform_driver(tegra186_cpufreq_platform_driver);
diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c
index 07ea7ed..9055dd3 100644
--- a/drivers/cpufreq/tegra194-cpufreq.c
+++ b/drivers/cpufreq/tegra194-cpufreq.c
@@ -818,7 +818,7 @@ static struct platform_driver tegra194_ccplex_driver = {
.of_match_table = tegra194_cpufreq_of_match,
},
.probe = tegra194_cpufreq_probe,
- .remove_new = tegra194_cpufreq_remove,
+ .remove = tegra194_cpufreq_remove,
};
module_platform_driver(tegra194_ccplex_driver);
diff --git a/drivers/cpufreq/vexpress-spc-cpufreq.c b/drivers/cpufreq/vexpress-spc-cpufreq.c
index 3fadf53..0f86cdb 100644
--- a/drivers/cpufreq/vexpress-spc-cpufreq.c
+++ b/drivers/cpufreq/vexpress-spc-cpufreq.c
@@ -565,7 +565,7 @@ static struct platform_driver ve_spc_cpufreq_platdrv = {
.name = "vexpress-spc-cpufreq",
},
.probe = ve_spc_cpufreq_probe,
- .remove_new = ve_spc_cpufreq_remove,
+ .remove = ve_spc_cpufreq_remove,
};
module_platform_driver(ve_spc_cpufreq_platdrv);
diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c
index 7cfb980..caba6f4 100644
--- a/drivers/cpuidle/cpuidle-arm.c
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -139,7 +139,7 @@ static int __init arm_idle_init_cpu(int cpu)
*
* Initializes arm cpuidle driver for all CPUs, if any CPU fails
* to register cpuidle driver then rollback to cancel all CPUs
- * registeration.
+ * registration.
*/
static int __init arm_idle_init(void)
{
diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c
index 1fc9968..3ab240e 100644
--- a/drivers/cpuidle/cpuidle-qcom-spm.c
+++ b/drivers/cpuidle/cpuidle-qcom-spm.c
@@ -48,7 +48,7 @@ static int qcom_cpu_spc(struct spm_driver_data *drv)
ret = cpu_suspend(0, qcom_pm_collapse);
/*
* ARM common code executes WFI without calling into our driver and
- * if the SPM mode is not reset, then we may accidently power down the
+ * if the SPM mode is not reset, then we may accidentally power down the
* cpu when we intended only to gate the cpu clock.
* Ensure the state is set to standby before returning.
*/
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 9e418ae..06ace16 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -406,7 +406,7 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
* Min polling interval of 10usec is a guess. It is assuming that
* for most users, the time for a single ping-pong workload like
* perf bench pipe would generally complete within 10usec but
- * this is hardware dependant. Actual time can be estimated with
+ * this is hardware dependent. Actual time can be estimated with
*
* perf bench sched pipe -l 10000
*
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index cf5873c..9bbfa59 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -261,7 +261,7 @@ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv)
* @drv: a pointer to a valid struct cpuidle_driver
*
* Register the driver under a lock to prevent concurrent attempts to
- * [un]register the driver from occuring at the same time.
+ * [un]register the driver from occurring at the same time.
*
* Returns 0 on success, a negative error code (returned by
* __cpuidle_register_driver()) otherwise.
@@ -296,7 +296,7 @@ EXPORT_SYMBOL_GPL(cpuidle_register_driver);
* @drv: a pointer to a valid struct cpuidle_driver
*
* Unregisters the cpuidle driver under a lock to prevent concurrent attempts
- * to [un]register the driver from occuring at the same time. @drv has to
+ * to [un]register the driver from occurring at the same time. @drv has to
* match the currently registered driver.
*/
void cpuidle_unregister_driver(struct cpuidle_driver *drv)
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index f3c9d49..28363bfa 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -19,7 +19,7 @@
#include "gov.h"
-#define BUCKETS 12
+#define BUCKETS 6
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
@@ -29,12 +29,11 @@
/*
* Concepts and ideas behind the menu governor
*
- * For the menu governor, there are 3 decision factors for picking a C
+ * For the menu governor, there are 2 decision factors for picking a C
* state:
* 1) Energy break even point
- * 2) Performance impact
- * 3) Latency tolerance (from pmqos infrastructure)
- * These three factors are treated independently.
+ * 2) Latency tolerance (from pmqos infrastructure)
+ * These two factors are treated independently.
*
* Energy break even point
* -----------------------
@@ -75,30 +74,6 @@
* intervals and if the stand deviation of these 8 intervals is below a
* threshold value, we use the average of these intervals as prediction.
*
- * Limiting Performance Impact
- * ---------------------------
- * C states, especially those with large exit latencies, can have a real
- * noticeable impact on workloads, which is not acceptable for most sysadmins,
- * and in addition, less performance has a power price of its own.
- *
- * As a general rule of thumb, menu assumes that the following heuristic
- * holds:
- * The busier the system, the less impact of C states is acceptable
- *
- * This rule-of-thumb is implemented using a performance-multiplier:
- * If the exit latency times the performance multiplier is longer than
- * the predicted duration, the C state is not considered a candidate
- * for selection due to a too high performance impact. So the higher
- * this multiplier is, the longer we need to be idle to pick a deep C
- * state, and thus the less likely a busy CPU will hit such a deep
- * C state.
- *
- * Currently there is only one value determining the factor:
- * 10 points are added for each process that is waiting for IO on this CPU.
- * (This value was experimentally determined.)
- * Utilization is no longer a factor as it was shown that it never contributed
- * significantly to the performance multiplier in the first place.
- *
*/
struct menu_device {
@@ -112,19 +87,10 @@ struct menu_device {
int interval_ptr;
};
-static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
+static inline int which_bucket(u64 duration_ns)
{
int bucket = 0;
- /*
- * We keep two groups of stats; one with no
- * IO pending, one without.
- * This allows us to calculate
- * E(duration)|iowait
- */
- if (nr_iowaiters)
- bucket = BUCKETS/2;
-
if (duration_ns < 10ULL * NSEC_PER_USEC)
return bucket;
if (duration_ns < 100ULL * NSEC_PER_USEC)
@@ -138,19 +104,6 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
return bucket + 5;
}
-/*
- * Return a multiplier for the exit latency that is intended
- * to take performance requirements into account.
- * The more performance critical we estimate the system
- * to be, the higher this multiplier, and thus the higher
- * the barrier to go to an expensive C state.
- */
-static inline int performance_multiplier(unsigned int nr_iowaiters)
-{
- /* for IO wait tasks (per cpu!) we add 10x each */
- return 1 + 10 * nr_iowaiters;
-}
-
static DEFINE_PER_CPU(struct menu_device, menu_devices);
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
@@ -258,8 +211,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
struct menu_device *data = this_cpu_ptr(&menu_devices);
s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
u64 predicted_ns;
- u64 interactivity_req;
- unsigned int nr_iowaiters;
ktime_t delta, delta_tick;
int i, idx;
@@ -268,8 +219,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
data->needs_update = 0;
}
- nr_iowaiters = nr_iowait_cpu(dev->cpu);
-
/* Find the shortest expected idle interval. */
predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
@@ -283,7 +232,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
}
data->next_timer_ns = delta;
- data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
+ data->bucket = which_bucket(data->next_timer_ns);
/* Round up the result for half microseconds. */
timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
@@ -301,7 +250,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
data->next_timer_ns = KTIME_MAX;
delta_tick = TICK_NSEC / 2;
- data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
+ data->bucket = which_bucket(KTIME_MAX);
}
if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
@@ -328,15 +277,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
if (predicted_ns < TICK_NSEC)
predicted_ns = data->next_timer_ns;
- } else {
- /*
- * Use the performance multiplier and the user-configurable
- * latency_req to determine the maximum exit latency.
- */
- interactivity_req = div64_u64(predicted_ns,
- performance_multiplier(nr_iowaiters));
- if (latency_req > interactivity_req)
- latency_req = interactivity_req;
+ } else if (latency_req > predicted_ns) {
+ latency_req = predicted_ns;
}
/*
diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
index 08b1238..0a9cdd3 100644
--- a/drivers/crypto/Kconfig
+++ b/drivers/crypto/Kconfig
@@ -95,6 +95,9 @@
loaded when a CEX crypto card is available.
- A pkey EP11 kernel module (pkey-ep11.ko) which is automatically
loaded when a CEX crypto card is available.
+ - A pkey UV kernel module (pkey-uv.ko) which is automatically
+ loaded when the Ultravisor feature is available within a
+ protected execution environment.
Select this option if you want to enable the kernel and userspace
API for protected key handling.
@@ -152,6 +155,24 @@
this option unless you are sure you never need to derive protected
keys from clear key values directly via PCKMO.
+config PKEY_UV
+ tristate "PKEY UV support handler"
+ depends on PKEY
+ depends on S390_UV_UAPI
+ help
+ This is the PKEY Ultravisor support handler for deriving protected
+ keys from secrets stored within the Ultravisor (UV).
+
+ This module works together with the UV device and supports the
+ retrieval of protected keys from secrets stored within the
+ UV firmware layer. This service is only available within
+ a protected execution guest and thus this module will fail upon
+ modprobe if no protected execution environment is detected.
+
+ Enable this option if you intend to run this kernel with an KVM
+ guest with protected execution and you want to use UV retrievable
+ secrets via PKEY API.
+
config CRYPTO_PAES_S390
tristate "PAES cipher algorithms"
depends on S390
diff --git a/drivers/crypto/allwinner/sun4i-ss/sun4i-ss-core.c b/drivers/crypto/allwinner/sun4i-ss/sun4i-ss-core.c
index 890664b..58a76e2 100644
--- a/drivers/crypto/allwinner/sun4i-ss/sun4i-ss-core.c
+++ b/drivers/crypto/allwinner/sun4i-ss/sun4i-ss-core.c
@@ -542,7 +542,7 @@ MODULE_DEVICE_TABLE(of, a20ss_crypto_of_match_table);
static struct platform_driver sun4i_ss_driver = {
.probe = sun4i_ss_probe,
- .remove_new = sun4i_ss_remove,
+ .remove = sun4i_ss_remove,
.driver = {
.name = "sun4i-ss",
.pm = &sun4i_ss_pm_ops,
diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c
index e55e58e..ec1ffda 100644
--- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c
+++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-core.c
@@ -1129,7 +1129,7 @@ MODULE_DEVICE_TABLE(of, sun8i_ce_crypto_of_match_table);
static struct platform_driver sun8i_ce_driver = {
.probe = sun8i_ce_probe,
- .remove_new = sun8i_ce_remove,
+ .remove = sun8i_ce_remove,
.driver = {
.name = "sun8i-ce",
.pm = &sun8i_ce_pm_ops,
diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c
index 0dbc022..f456857 100644
--- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c
+++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-core.c
@@ -929,7 +929,7 @@ MODULE_DEVICE_TABLE(of, sun8i_ss_crypto_of_match_table);
static struct platform_driver sun8i_ss_driver = {
.probe = sun8i_ss_probe,
- .remove_new = sun8i_ss_remove,
+ .remove = sun8i_ss_remove,
.driver = {
.name = "sun8i-ss",
.pm = &sun8i_ss_pm_ops,
diff --git a/drivers/crypto/amcc/crypto4xx_core.c b/drivers/crypto/amcc/crypto4xx_core.c
index 6006703..ec3ccfa 100644
--- a/drivers/crypto/amcc/crypto4xx_core.c
+++ b/drivers/crypto/amcc/crypto4xx_core.c
@@ -653,9 +653,6 @@ static void crypto4xx_stop_all(struct crypto4xx_core_device *core_dev)
crypto4xx_destroy_pdr(core_dev->dev);
crypto4xx_destroy_gdr(core_dev->dev);
crypto4xx_destroy_sdr(core_dev->dev);
- iounmap(core_dev->dev->ce_base);
- kfree(core_dev->dev);
- kfree(core_dev);
}
static u32 get_next_gd(u32 current)
@@ -1333,17 +1330,12 @@ static struct crypto4xx_alg_common crypto4xx_alg[] = {
static int crypto4xx_probe(struct platform_device *ofdev)
{
int rc;
- struct resource res;
struct device *dev = &ofdev->dev;
struct crypto4xx_core_device *core_dev;
struct device_node *np;
u32 pvr;
bool is_revb = true;
- rc = of_address_to_resource(ofdev->dev.of_node, 0, &res);
- if (rc)
- return -ENODEV;
-
np = of_find_compatible_node(NULL, NULL, "amcc,ppc460ex-crypto");
if (np) {
mtdcri(SDR0, PPC460EX_SDR0_SRST,
@@ -1374,16 +1366,17 @@ static int crypto4xx_probe(struct platform_device *ofdev)
of_node_put(np);
- core_dev = kzalloc(sizeof(struct crypto4xx_core_device), GFP_KERNEL);
+ core_dev = devm_kzalloc(
+ &ofdev->dev, sizeof(struct crypto4xx_core_device), GFP_KERNEL);
if (!core_dev)
return -ENOMEM;
dev_set_drvdata(dev, core_dev);
core_dev->ofdev = ofdev;
- core_dev->dev = kzalloc(sizeof(struct crypto4xx_device), GFP_KERNEL);
- rc = -ENOMEM;
+ core_dev->dev = devm_kzalloc(
+ &ofdev->dev, sizeof(struct crypto4xx_device), GFP_KERNEL);
if (!core_dev->dev)
- goto err_alloc_dev;
+ return -ENOMEM;
/*
* Older version of 460EX/GT have a hardware bug.
@@ -1402,7 +1395,9 @@ static int crypto4xx_probe(struct platform_device *ofdev)
core_dev->dev->core_dev = core_dev;
core_dev->dev->is_revb = is_revb;
core_dev->device = dev;
- mutex_init(&core_dev->rng_lock);
+ rc = devm_mutex_init(&ofdev->dev, &core_dev->rng_lock);
+ if (rc)
+ return rc;
spin_lock_init(&core_dev->lock);
INIT_LIST_HEAD(&core_dev->dev->alg_list);
ratelimit_default_init(&core_dev->dev->aead_ratelimit);
@@ -1421,21 +1416,21 @@ static int crypto4xx_probe(struct platform_device *ofdev)
tasklet_init(&core_dev->tasklet, crypto4xx_bh_tasklet_cb,
(unsigned long) dev);
- core_dev->dev->ce_base = of_iomap(ofdev->dev.of_node, 0);
- if (!core_dev->dev->ce_base) {
- dev_err(dev, "failed to of_iomap\n");
- rc = -ENOMEM;
- goto err_iomap;
+ core_dev->dev->ce_base = devm_platform_ioremap_resource(ofdev, 0);
+ if (IS_ERR(core_dev->dev->ce_base)) {
+ dev_err(&ofdev->dev, "failed to ioremap resource");
+ rc = PTR_ERR(core_dev->dev->ce_base);
+ goto err_build_sdr;
}
/* Register for Crypto isr, Crypto Engine IRQ */
core_dev->irq = irq_of_parse_and_map(ofdev->dev.of_node, 0);
- rc = request_irq(core_dev->irq, is_revb ?
- crypto4xx_ce_interrupt_handler_revb :
- crypto4xx_ce_interrupt_handler, 0,
- KBUILD_MODNAME, dev);
+ rc = devm_request_irq(&ofdev->dev, core_dev->irq,
+ is_revb ? crypto4xx_ce_interrupt_handler_revb :
+ crypto4xx_ce_interrupt_handler,
+ 0, KBUILD_MODNAME, dev);
if (rc)
- goto err_request_irq;
+ goto err_iomap;
/* need to setup pdr, rdr, gdr and sdr before this */
crypto4xx_hw_init(core_dev->dev);
@@ -1444,26 +1439,17 @@ static int crypto4xx_probe(struct platform_device *ofdev)
rc = crypto4xx_register_alg(core_dev->dev, crypto4xx_alg,
ARRAY_SIZE(crypto4xx_alg));
if (rc)
- goto err_start_dev;
+ goto err_iomap;
ppc4xx_trng_probe(core_dev);
return 0;
-err_start_dev:
- free_irq(core_dev->irq, dev);
-err_request_irq:
- irq_dispose_mapping(core_dev->irq);
- iounmap(core_dev->dev->ce_base);
err_iomap:
tasklet_kill(&core_dev->tasklet);
err_build_sdr:
crypto4xx_destroy_sdr(core_dev->dev);
crypto4xx_destroy_gdr(core_dev->dev);
crypto4xx_destroy_pdr(core_dev->dev);
- kfree(core_dev->dev);
-err_alloc_dev:
- kfree(core_dev);
-
return rc;
}
@@ -1474,13 +1460,9 @@ static void crypto4xx_remove(struct platform_device *ofdev)
ppc4xx_trng_remove(core_dev);
- free_irq(core_dev->irq, dev);
- irq_dispose_mapping(core_dev->irq);
-
tasklet_kill(&core_dev->tasklet);
/* Un-register with Linux CryptoAPI */
crypto4xx_unregister_alg(core_dev->dev);
- mutex_destroy(&core_dev->rng_lock);
/* Free all allocated memory */
crypto4xx_stop_all(core_dev);
}
@@ -1497,7 +1479,7 @@ static struct platform_driver crypto4xx_driver = {
.of_match_table = crypto4xx_match,
},
.probe = crypto4xx_probe,
- .remove_new = crypto4xx_remove,
+ .remove = crypto4xx_remove,
};
module_platform_driver(crypto4xx_driver);
diff --git a/drivers/crypto/amlogic/amlogic-gxl-core.c b/drivers/crypto/amlogic/amlogic-gxl-core.c
index f54ab0d..1c18a5b 100644
--- a/drivers/crypto/amlogic/amlogic-gxl-core.c
+++ b/drivers/crypto/amlogic/amlogic-gxl-core.c
@@ -240,11 +240,9 @@ static int meson_crypto_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, mc);
mc->base = devm_platform_ioremap_resource(pdev, 0);
- if (IS_ERR(mc->base)) {
- err = PTR_ERR(mc->base);
- dev_err(&pdev->dev, "Cannot request MMIO err=%d\n", err);
- return err;
- }
+ if (IS_ERR(mc->base))
+ return PTR_ERR(mc->base);
+
mc->busclk = devm_clk_get(&pdev->dev, "blkmv");
if (IS_ERR(mc->busclk)) {
err = PTR_ERR(mc->busclk);
@@ -322,7 +320,7 @@ MODULE_DEVICE_TABLE(of, meson_crypto_of_match_table);
static struct platform_driver meson_crypto_driver = {
.probe = meson_crypto_probe,
- .remove_new = meson_crypto_remove,
+ .remove = meson_crypto_remove,
.driver = {
.name = "gxl-crypto",
.of_match_table = meson_crypto_of_match_table,
diff --git a/drivers/crypto/aspeed/aspeed-acry.c b/drivers/crypto/aspeed/aspeed-acry.c
index b4613bd..8d1c79a 100644
--- a/drivers/crypto/aspeed/aspeed-acry.c
+++ b/drivers/crypto/aspeed/aspeed-acry.c
@@ -601,8 +601,6 @@ static struct aspeed_acry_alg aspeed_acry_akcipher_algs[] = {
.akcipher.base = {
.encrypt = aspeed_acry_rsa_enc,
.decrypt = aspeed_acry_rsa_dec,
- .sign = aspeed_acry_rsa_dec,
- .verify = aspeed_acry_rsa_enc,
.set_pub_key = aspeed_acry_rsa_set_pub_key,
.set_priv_key = aspeed_acry_rsa_set_priv_key,
.max_size = aspeed_acry_rsa_max_size,
@@ -808,7 +806,7 @@ MODULE_DEVICE_TABLE(of, aspeed_acry_of_matches);
static struct platform_driver aspeed_acry_driver = {
.probe = aspeed_acry_probe,
- .remove_new = aspeed_acry_remove,
+ .remove = aspeed_acry_remove,
.driver = {
.name = KBUILD_MODNAME,
.of_match_table = aspeed_acry_of_matches,
diff --git a/drivers/crypto/aspeed/aspeed-hace.c b/drivers/crypto/aspeed/aspeed-hace.c
index 062f2a6..3fe644b 100644
--- a/drivers/crypto/aspeed/aspeed-hace.c
+++ b/drivers/crypto/aspeed/aspeed-hace.c
@@ -266,7 +266,7 @@ MODULE_DEVICE_TABLE(of, aspeed_hace_of_matches);
static struct platform_driver aspeed_hace_driver = {
.probe = aspeed_hace_probe,
- .remove_new = aspeed_hace_remove,
+ .remove = aspeed_hace_remove,
.driver = {
.name = KBUILD_MODNAME,
.of_match_table = aspeed_hace_of_matches,
diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c
index 0dd9078..14bf869 100644
--- a/drivers/crypto/atmel-aes.c
+++ b/drivers/crypto/atmel-aes.c
@@ -2453,7 +2453,7 @@ static void atmel_aes_remove(struct platform_device *pdev)
static struct platform_driver atmel_aes_driver = {
.probe = atmel_aes_probe,
- .remove_new = atmel_aes_remove,
+ .remove = atmel_aes_remove,
.driver = {
.name = "atmel_aes",
.of_match_table = atmel_aes_dt_ids,
diff --git a/drivers/crypto/atmel-ecc.c b/drivers/crypto/atmel-ecc.c
index 590ea98..0d48e64 100644
--- a/drivers/crypto/atmel-ecc.c
+++ b/drivers/crypto/atmel-ecc.c
@@ -379,7 +379,7 @@ MODULE_DEVICE_TABLE(of, atmel_ecc_dt_ids);
#endif
static const struct i2c_device_id atmel_ecc_id[] = {
- { "atecc508a", 0 },
+ { "atecc508a" },
{ }
};
MODULE_DEVICE_TABLE(i2c, atmel_ecc_id);
diff --git a/drivers/crypto/atmel-sha.c b/drivers/crypto/atmel-sha.c
index 8cc57df..67a1706 100644
--- a/drivers/crypto/atmel-sha.c
+++ b/drivers/crypto/atmel-sha.c
@@ -2691,7 +2691,7 @@ static void atmel_sha_remove(struct platform_device *pdev)
static struct platform_driver atmel_sha_driver = {
.probe = atmel_sha_probe,
- .remove_new = atmel_sha_remove,
+ .remove = atmel_sha_remove,
.driver = {
.name = "atmel_sha",
.of_match_table = atmel_sha_dt_ids,
diff --git a/drivers/crypto/atmel-sha204a.c b/drivers/crypto/atmel-sha204a.c
index a02d496..75bebec 100644
--- a/drivers/crypto/atmel-sha204a.c
+++ b/drivers/crypto/atmel-sha204a.c
@@ -202,8 +202,8 @@ static const struct of_device_id atmel_sha204a_dt_ids[] __maybe_unused = {
MODULE_DEVICE_TABLE(of, atmel_sha204a_dt_ids);
static const struct i2c_device_id atmel_sha204a_id[] = {
- { "atsha204", 0 },
- { "atsha204a", 0 },
+ { "atsha204" },
+ { "atsha204a" },
{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(i2c, atmel_sha204a_id);
diff --git a/drivers/crypto/atmel-tdes.c b/drivers/crypto/atmel-tdes.c
index dcc2380..de9717e 100644
--- a/drivers/crypto/atmel-tdes.c
+++ b/drivers/crypto/atmel-tdes.c
@@ -872,7 +872,7 @@ static void atmel_tdes_done_task(unsigned long data)
if (!err)
err = atmel_tdes_crypt_start(dd);
if (!err)
- return; /* DMA started. Not fininishing. */
+ return; /* DMA started. Not finishing. */
}
atmel_tdes_finish_req(dd, err);
@@ -1074,7 +1074,7 @@ static void atmel_tdes_remove(struct platform_device *pdev)
static struct platform_driver atmel_tdes_driver = {
.probe = atmel_tdes_probe,
- .remove_new = atmel_tdes_remove,
+ .remove = atmel_tdes_remove,
.driver = {
.name = "atmel_tdes",
.of_match_table = atmel_tdes_dt_ids,
diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c
index 75440ea..1c1f57b 100644
--- a/drivers/crypto/axis/artpec6_crypto.c
+++ b/drivers/crypto/axis/artpec6_crypto.c
@@ -2975,7 +2975,7 @@ static void artpec6_crypto_remove(struct platform_device *pdev)
static struct platform_driver artpec6_crypto_driver = {
.probe = artpec6_crypto_probe,
- .remove_new = artpec6_crypto_remove,
+ .remove = artpec6_crypto_remove,
.driver = {
.name = "artpec6-crypto",
.of_match_table = artpec6_crypto_of_match,
diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c
index 1a3ecd4..9e6798e 100644
--- a/drivers/crypto/bcm/cipher.c
+++ b/drivers/crypto/bcm/cipher.c
@@ -2415,6 +2415,7 @@ static int ahash_hmac_setkey(struct crypto_ahash *ahash, const u8 *key,
static int ahash_hmac_init(struct ahash_request *req)
{
+ int ret;
struct iproc_reqctx_s *rctx = ahash_request_ctx(req);
struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
struct iproc_ctx_s *ctx = crypto_ahash_ctx(tfm);
@@ -2424,7 +2425,9 @@ static int ahash_hmac_init(struct ahash_request *req)
flow_log("ahash_hmac_init()\n");
/* init the context as a hash */
- ahash_init(req);
+ ret = ahash_init(req);
+ if (ret)
+ return ret;
if (!spu_no_incr_hash(ctx)) {
/* SPU-M can do incr hashing but needs sw for outer HMAC */
@@ -4704,7 +4707,7 @@ static struct platform_driver bcm_spu_pdriver = {
.of_match_table = of_match_ptr(bcm_spu_dt_ids),
},
.probe = bcm_spu_probe,
- .remove_new = bcm_spu_remove,
+ .remove = bcm_spu_remove,
};
module_platform_driver(bcm_spu_pdriver);
diff --git a/drivers/crypto/caam/caampkc.c b/drivers/crypto/caam/caampkc.c
index 887a5f2f..cb001aa 100644
--- a/drivers/crypto/caam/caampkc.c
+++ b/drivers/crypto/caam/caampkc.c
@@ -984,7 +984,7 @@ static int caam_rsa_set_pub_key(struct crypto_akcipher *tfm, const void *key,
return -ENOMEM;
}
-static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx,
+static int caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx,
struct rsa_key *raw_key)
{
struct caam_rsa_key *rsa_key = &ctx->key;
@@ -994,7 +994,7 @@ static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx,
rsa_key->p = caam_read_raw_data(raw_key->p, &p_sz);
if (!rsa_key->p)
- return;
+ return -ENOMEM;
rsa_key->p_sz = p_sz;
rsa_key->q = caam_read_raw_data(raw_key->q, &q_sz);
@@ -1029,7 +1029,7 @@ static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx,
rsa_key->priv_form = FORM3;
- return;
+ return 0;
free_dq:
kfree_sensitive(rsa_key->dq);
@@ -1043,6 +1043,7 @@ static void caam_rsa_set_priv_key_form(struct caam_rsa_ctx *ctx,
kfree_sensitive(rsa_key->q);
free_p:
kfree_sensitive(rsa_key->p);
+ return -ENOMEM;
}
static int caam_rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key,
@@ -1088,7 +1089,9 @@ static int caam_rsa_set_priv_key(struct crypto_akcipher *tfm, const void *key,
rsa_key->e_sz = raw_key.e_sz;
rsa_key->n_sz = raw_key.n_sz;
- caam_rsa_set_priv_key_form(ctx, &raw_key);
+ ret = caam_rsa_set_priv_key_form(ctx, &raw_key);
+ if (ret)
+ goto err;
return 0;
diff --git a/drivers/crypto/caam/jr.c b/drivers/crypto/caam/jr.c
index 26eba7d..9fcdb64 100644
--- a/drivers/crypto/caam/jr.c
+++ b/drivers/crypto/caam/jr.c
@@ -819,7 +819,7 @@ static struct platform_driver caam_jr_driver = {
.pm = pm_ptr(&caam_jr_pm_ops),
},
.probe = caam_jr_probe,
- .remove_new = caam_jr_remove,
+ .remove = caam_jr_remove,
.shutdown = caam_jr_remove,
};
diff --git a/drivers/crypto/caam/qi.c b/drivers/crypto/caam/qi.c
index f6111ee..7701d00 100644
--- a/drivers/crypto/caam/qi.c
+++ b/drivers/crypto/caam/qi.c
@@ -733,7 +733,7 @@ static void free_caam_qi_pcpu_netdev(const cpumask_t *cpus)
int caam_qi_init(struct platform_device *caam_pdev)
{
int err, i;
- struct device *ctrldev = &caam_pdev->dev, *qidev;
+ struct device *qidev = &caam_pdev->dev;
struct caam_drv_private *ctrlpriv;
const cpumask_t *cpus = qman_affine_cpus();
cpumask_var_t clean_mask;
@@ -742,8 +742,7 @@ int caam_qi_init(struct platform_device *caam_pdev)
if (!zalloc_cpumask_var(&clean_mask, GFP_KERNEL))
goto fail_cpumask;
- ctrlpriv = dev_get_drvdata(ctrldev);
- qidev = ctrldev;
+ ctrlpriv = dev_get_drvdata(qidev);
/* Initialize the congestion detection */
err = init_cgr(qidev);
@@ -794,7 +793,7 @@ int caam_qi_init(struct platform_device *caam_pdev)
caam_debugfs_qi_init(ctrlpriv);
- err = devm_add_action_or_reset(qidev, caam_qi_shutdown, ctrlpriv);
+ err = devm_add_action_or_reset(qidev, caam_qi_shutdown, qidev);
if (err)
goto fail2;
diff --git a/drivers/crypto/cavium/cpt/cptpf_main.c b/drivers/crypto/cavium/cpt/cptpf_main.c
index 6872ac3..54de869 100644
--- a/drivers/crypto/cavium/cpt/cptpf_main.c
+++ b/drivers/crypto/cavium/cpt/cptpf_main.c
@@ -44,7 +44,7 @@ static void cpt_disable_cores(struct cpt_device *cpt, u64 coremask,
dev_err(dev, "Cores still busy %llx", coremask);
grp = cpt_read_csr64(cpt->reg_base,
CPTX_PF_EXEC_BUSY(0));
- if (timeout--)
+ if (!timeout--)
break;
udelay(CSR_DELAY);
@@ -302,6 +302,8 @@ static int cpt_ucode_load_fw(struct cpt_device *cpt, const u8 *fw, bool is_ae)
ret = do_cpt_init(cpt, mcode);
if (ret) {
+ dma_free_coherent(&cpt->pdev->dev, mcode->code_size,
+ mcode->code, mcode->phys_base);
dev_err(dev, "do_cpt_init failed with ret: %d\n", ret);
goto fw_release;
}
@@ -394,7 +396,7 @@ static void cpt_disable_all_cores(struct cpt_device *cpt)
dev_err(dev, "Cores still busy");
grp = cpt_read_csr64(cpt->reg_base,
CPTX_PF_EXEC_BUSY(0));
- if (timeout--)
+ if (!timeout--)
break;
udelay(CSR_DELAY);
diff --git a/drivers/crypto/cavium/cpt/cptvf_reqmanager.c b/drivers/crypto/cavium/cpt/cptvf_reqmanager.c
index 153004b..fb59bb2 100644
--- a/drivers/crypto/cavium/cpt/cptvf_reqmanager.c
+++ b/drivers/crypto/cavium/cpt/cptvf_reqmanager.c
@@ -238,7 +238,7 @@ static int send_cpt_command(struct cpt_vf *cptvf, union cpt_inst_s *cmd,
qinfo = &cptvf->cqinfo;
queue = &qinfo->queue[qno];
- /* lock commad queue */
+ /* lock command queue */
spin_lock(&queue->lock);
ent = &queue->qhead->head[queue->idx * qinfo->cmd_size];
memcpy(ent, (void *)cmd, qinfo->cmd_size);
@@ -510,7 +510,7 @@ int process_request(struct cpt_vf *cptvf, struct cpt_request_info *req)
info->time_in = jiffies;
info->req = req;
- /* Create the CPT_INST_S type command for HW intrepretation */
+ /* Create the CPT_INST_S type command for HW interpretation */
cptinst.s.doneint = true;
cptinst.s.res_addr = (u64)info->comp_baddr;
cptinst.s.tag = 0;
diff --git a/drivers/crypto/cavium/nitrox/nitrox_lib.c b/drivers/crypto/cavium/nitrox/nitrox_lib.c
index a5cdc2b..0682652 100644
--- a/drivers/crypto/cavium/nitrox/nitrox_lib.c
+++ b/drivers/crypto/cavium/nitrox/nitrox_lib.c
@@ -17,7 +17,7 @@
#define CRYPTO_CTX_SIZE 256
-/* packet inuput ring alignments */
+/* packet input ring alignments */
#define PKTIN_Q_ALIGN_BYTES 16
/* AQM Queue input alignments */
#define AQM_Q_ALIGN_BYTES 32
diff --git a/drivers/crypto/ccp/sp-platform.c b/drivers/crypto/ccp/sp-platform.c
index ff6ceb4..3933cac 100644
--- a/drivers/crypto/ccp/sp-platform.c
+++ b/drivers/crypto/ccp/sp-platform.c
@@ -210,7 +210,7 @@ static struct platform_driver sp_platform_driver = {
.of_match_table = sp_of_match,
},
.probe = sp_platform_probe,
- .remove_new = sp_platform_remove,
+ .remove = sp_platform_remove,
#ifdef CONFIG_PM
.suspend = sp_platform_suspend,
.resume = sp_platform_resume,
diff --git a/drivers/crypto/ccree/cc_aead.c b/drivers/crypto/ccree/cc_aead.c
index 5ef39d6..8153368 100644
--- a/drivers/crypto/ccree/cc_aead.c
+++ b/drivers/crypto/ccree/cc_aead.c
@@ -2226,7 +2226,7 @@ static int cc_rfc4543_gcm_encrypt(struct aead_request *req)
memset(areq_ctx, 0, sizeof(*areq_ctx));
- //plaintext is not encryped with rfc4543
+ //plaintext is not encrypted with rfc4543
areq_ctx->plaintext_authenticate_only = true;
/* No generated IV required */
@@ -2277,7 +2277,7 @@ static int cc_rfc4543_gcm_decrypt(struct aead_request *req)
memset(areq_ctx, 0, sizeof(*areq_ctx));
- //plaintext is not decryped with rfc4543
+ //plaintext is not decrypted with rfc4543
areq_ctx->plaintext_authenticate_only = true;
/* No generated IV required */
diff --git a/drivers/crypto/ccree/cc_cipher.c b/drivers/crypto/ccree/cc_cipher.c
index 3fb667a..d39c067 100644
--- a/drivers/crypto/ccree/cc_cipher.c
+++ b/drivers/crypto/ccree/cc_cipher.c
@@ -179,7 +179,7 @@ static int cc_cipher_init(struct crypto_tfm *tfm)
}
max_key_buf_size <<= 1;
- /* Alloc fallabck tfm or essiv when key size != 256 bit */
+ /* Alloc fallback tfm or essiv when key size != 256 bit */
ctx_p->fallback_tfm =
crypto_alloc_skcipher(name, 0, CRYPTO_ALG_NEED_FALLBACK | CRYPTO_ALG_ASYNC);
diff --git a/drivers/crypto/ccree/cc_driver.c b/drivers/crypto/ccree/cc_driver.c
index 9177b54..061e68a 100644
--- a/drivers/crypto/ccree/cc_driver.c
+++ b/drivers/crypto/ccree/cc_driver.c
@@ -643,7 +643,7 @@ static struct platform_driver ccree_driver = {
#endif
},
.probe = ccree_probe,
- .remove_new = ccree_remove,
+ .remove = ccree_remove,
};
static int __init ccree_init(void)
diff --git a/drivers/crypto/ccree/cc_hash.c b/drivers/crypto/ccree/cc_hash.c
index f418162..d0612be 100644
--- a/drivers/crypto/ccree/cc_hash.c
+++ b/drivers/crypto/ccree/cc_hash.c
@@ -1577,7 +1577,7 @@ struct cc_hash_template {
/* hash descriptors */
static struct cc_hash_template driver_hash[] = {
- //Asynchronize hash template
+ //Asynchronous hash template
{
.name = "sha1",
.driver_name = "sha1-ccree",
diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index 1774284..af37477f 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -1186,7 +1186,7 @@ static int chcr_handle_cipher_resp(struct skcipher_request *req,
else
bytes = rounddown(bytes, 16);
} else {
- /*CTR mode counter overfloa*/
+ /*CTR mode counter overflow*/
bytes = req->cryptlen - reqctx->processed;
}
err = chcr_update_cipher_iv(req, fw6_pld, reqctx->iv);
diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
index 0dd8baf..2aaa98f 100644
--- a/drivers/crypto/exynos-rng.c
+++ b/drivers/crypto/exynos-rng.c
@@ -389,7 +389,7 @@ static struct platform_driver exynos_rng_driver = {
.of_match_table = exynos_rng_dt_match,
},
.probe = exynos_rng_probe,
- .remove_new = exynos_rng_remove,
+ .remove = exynos_rng_remove,
};
module_platform_driver(exynos_rng_driver);
diff --git a/drivers/crypto/gemini/sl3516-ce-core.c b/drivers/crypto/gemini/sl3516-ce-core.c
index 1d1a889..f7e0e3f 100644
--- a/drivers/crypto/gemini/sl3516-ce-core.c
+++ b/drivers/crypto/gemini/sl3516-ce-core.c
@@ -528,7 +528,7 @@ MODULE_DEVICE_TABLE(of, sl3516_ce_crypto_of_match_table);
static struct platform_driver sl3516_ce_driver = {
.probe = sl3516_ce_probe,
- .remove_new = sl3516_ce_remove,
+ .remove = sl3516_ce_remove,
.driver = {
.name = "sl3516-crypto",
.pm = &sl3516_ce_pm_ops,
diff --git a/drivers/crypto/hisilicon/hpre/hpre.h b/drivers/crypto/hisilicon/hpre/hpre.h
index 9f0b94c..0f3ddba 100644
--- a/drivers/crypto/hisilicon/hpre/hpre.h
+++ b/drivers/crypto/hisilicon/hpre/hpre.h
@@ -100,6 +100,29 @@ struct hpre_sqe {
__le32 rsvd1[_HPRE_SQE_ALIGN_EXT];
};
+enum hpre_cap_table_type {
+ QM_RAS_NFE_TYPE = 0x0,
+ QM_RAS_NFE_RESET,
+ QM_RAS_CE_TYPE,
+ HPRE_RAS_NFE_TYPE,
+ HPRE_RAS_NFE_RESET,
+ HPRE_RAS_CE_TYPE,
+ HPRE_CORE_INFO,
+ HPRE_CORE_EN,
+ HPRE_DRV_ALG_BITMAP,
+ HPRE_ALG_BITMAP,
+ HPRE_CORE1_BITMAP_CAP,
+ HPRE_CORE2_BITMAP_CAP,
+ HPRE_CORE3_BITMAP_CAP,
+ HPRE_CORE4_BITMAP_CAP,
+ HPRE_CORE5_BITMAP_CAP,
+ HPRE_CORE6_BITMAP_CAP,
+ HPRE_CORE7_BITMAP_CAP,
+ HPRE_CORE8_BITMAP_CAP,
+ HPRE_CORE9_BITMAP_CAP,
+ HPRE_CORE10_BITMAP_CAP,
+};
+
struct hisi_qp *hpre_create_qp(u8 type);
int hpre_algs_register(struct hisi_qm *qm);
void hpre_algs_unregister(struct hisi_qm *qm);
diff --git a/drivers/crypto/hisilicon/hpre/hpre_crypto.c b/drivers/crypto/hisilicon/hpre/hpre_crypto.c
index c167dbd..2a29102 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_crypto.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_crypto.c
@@ -2006,8 +2006,6 @@ static void hpre_curve25519_exit_tfm(struct crypto_kpp *tfm)
}
static struct akcipher_alg rsa = {
- .sign = hpre_rsa_dec,
- .verify = hpre_rsa_enc,
.encrypt = hpre_rsa_enc,
.decrypt = hpre_rsa_dec,
.set_pub_key = hpre_rsa_setpubkey,
diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c
index 6b536ad..96fde94 100644
--- a/drivers/crypto/hisilicon/hpre/hpre_main.c
+++ b/drivers/crypto/hisilicon/hpre/hpre_main.c
@@ -13,6 +13,7 @@
#include <linux/uacce.h>
#include "hpre.h"
+#define CAP_FILE_PERMISSION 0444
#define HPRE_CTRL_CNT_CLR_CE_BIT BIT(0)
#define HPRE_CTRL_CNT_CLR_CE 0x301000
#define HPRE_FSM_MAX_CNT 0x301008
@@ -203,7 +204,7 @@ static const struct hisi_qm_cap_info hpre_basic_info[] = {
{HPRE_RESET_MASK_CAP, 0x3134, 0, GENMASK(31, 0), 0x0, 0x3FFFFE, 0xBFFC3E},
{HPRE_OOO_SHUTDOWN_MASK_CAP, 0x3134, 0, GENMASK(31, 0), 0x0, 0x22, 0xBFFC3E},
{HPRE_CE_MASK_CAP, 0x3138, 0, GENMASK(31, 0), 0x0, 0x1, 0x1},
- {HPRE_CLUSTER_NUM_CAP, 0x313c, 20, GENMASK(3, 0), 0x0, 0x4, 0x1},
+ {HPRE_CLUSTER_NUM_CAP, 0x313c, 20, GENMASK(3, 0), 0x0, 0x4, 0x1},
{HPRE_CORE_TYPE_NUM_CAP, 0x313c, 16, GENMASK(3, 0), 0x0, 0x2, 0x2},
{HPRE_CORE_NUM_CAP, 0x313c, 8, GENMASK(7, 0), 0x0, 0x8, 0xA},
{HPRE_CLUSTER_CORE_NUM_CAP, 0x313c, 0, GENMASK(7, 0), 0x0, 0x2, 0xA},
@@ -222,18 +223,27 @@ static const struct hisi_qm_cap_info hpre_basic_info[] = {
{HPRE_CORE10_ALG_BITMAP_CAP, 0x3170, 0, GENMASK(31, 0), 0x0, 0x10, 0x10}
};
-enum hpre_pre_store_cap_idx {
- HPRE_CLUSTER_NUM_CAP_IDX = 0x0,
- HPRE_CORE_ENABLE_BITMAP_CAP_IDX,
- HPRE_DRV_ALG_BITMAP_CAP_IDX,
- HPRE_DEV_ALG_BITMAP_CAP_IDX,
-};
-
-static const u32 hpre_pre_store_caps[] = {
- HPRE_CLUSTER_NUM_CAP,
- HPRE_CORE_ENABLE_BITMAP_CAP,
- HPRE_DRV_ALG_BITMAP_CAP,
- HPRE_DEV_ALG_BITMAP_CAP,
+static const struct hisi_qm_cap_query_info hpre_cap_query_info[] = {
+ {QM_RAS_NFE_TYPE, "QM_RAS_NFE_TYPE ", 0x3124, 0x0, 0x1C37, 0x7C37},
+ {QM_RAS_NFE_RESET, "QM_RAS_NFE_RESET ", 0x3128, 0x0, 0xC77, 0x6C77},
+ {QM_RAS_CE_TYPE, "QM_RAS_CE_TYPE ", 0x312C, 0x0, 0x8, 0x8},
+ {HPRE_RAS_NFE_TYPE, "HPRE_RAS_NFE_TYPE ", 0x3130, 0x0, 0x3FFFFE, 0x1FFFC3E},
+ {HPRE_RAS_NFE_RESET, "HPRE_RAS_NFE_RESET ", 0x3134, 0x0, 0x3FFFFE, 0xBFFC3E},
+ {HPRE_RAS_CE_TYPE, "HPRE_RAS_CE_TYPE ", 0x3138, 0x0, 0x1, 0x1},
+ {HPRE_CORE_INFO, "HPRE_CORE_INFO ", 0x313c, 0x0, 0x420802, 0x120A0A},
+ {HPRE_CORE_EN, "HPRE_CORE_EN ", 0x3140, 0x0, 0xF, 0x3FF},
+ {HPRE_DRV_ALG_BITMAP, "HPRE_DRV_ALG_BITMAP ", 0x3144, 0x0, 0x03, 0x27},
+ {HPRE_ALG_BITMAP, "HPRE_ALG_BITMAP ", 0x3148, 0x0, 0x03, 0x7F},
+ {HPRE_CORE1_BITMAP_CAP, "HPRE_CORE1_BITMAP_CAP ", 0x314c, 0x0, 0x7F, 0x7F},
+ {HPRE_CORE2_BITMAP_CAP, "HPRE_CORE2_BITMAP_CAP ", 0x3150, 0x0, 0x7F, 0x7F},
+ {HPRE_CORE3_BITMAP_CAP, "HPRE_CORE3_BITMAP_CAP ", 0x3154, 0x0, 0x7F, 0x7F},
+ {HPRE_CORE4_BITMAP_CAP, "HPRE_CORE4_BITMAP_CAP ", 0x3158, 0x0, 0x7F, 0x7F},
+ {HPRE_CORE5_BITMAP_CAP, "HPRE_CORE5_BITMAP_CAP ", 0x315c, 0x0, 0x7F, 0x7F},
+ {HPRE_CORE6_BITMAP_CAP, "HPRE_CORE6_BITMAP_CAP ", 0x3160, 0x0, 0x7F, 0x7F},
+ {HPRE_CORE7_BITMAP_CAP, "HPRE_CORE7_BITMAP_CAP ", 0x3164, 0x0, 0x7F, 0x7F},
+ {HPRE_CORE8_BITMAP_CAP, "HPRE_CORE8_BITMAP_CAP ", 0x3168, 0x0, 0x7F, 0x7F},
+ {HPRE_CORE9_BITMAP_CAP, "HPRE_CORE9_BITMAP_CAP ", 0x316c, 0x0, 0x10, 0x10},
+ {HPRE_CORE10_BITMAP_CAP, "HPRE_CORE10_BITMAP_CAP ", 0x3170, 0x0, 0x10, 0x10},
};
static const struct hpre_hw_error hpre_hw_errors[] = {
@@ -360,7 +370,7 @@ bool hpre_check_alg_support(struct hisi_qm *qm, u32 alg)
{
u32 cap_val;
- cap_val = qm->cap_tables.dev_cap_table[HPRE_DRV_ALG_BITMAP_CAP_IDX].cap_val;
+ cap_val = qm->cap_tables.dev_cap_table[HPRE_DRV_ALG_BITMAP].cap_val;
if (alg & cap_val)
return true;
@@ -415,7 +425,7 @@ static int pf_q_num_set(const char *val, const struct kernel_param *kp)
{
pf_q_num_flag = true;
- return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_HPRE_PF);
+ return hisi_qm_q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_HPRE_PF);
}
static const struct kernel_param_ops hpre_pf_q_num_ops = {
@@ -503,14 +513,17 @@ static int hpre_cfg_by_dsm(struct hisi_qm *qm)
static int hpre_set_cluster(struct hisi_qm *qm)
{
struct device *dev = &qm->pdev->dev;
- unsigned long offset;
u32 cluster_core_mask;
+ unsigned long offset;
+ u32 hpre_core_info;
u8 clusters_num;
u32 val = 0;
int ret, i;
- cluster_core_mask = qm->cap_tables.dev_cap_table[HPRE_CORE_ENABLE_BITMAP_CAP_IDX].cap_val;
- clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+ cluster_core_mask = qm->cap_tables.dev_cap_table[HPRE_CORE_EN].cap_val;
+ hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+ clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+ hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
for (i = 0; i < clusters_num; i++) {
offset = i * HPRE_CLSTR_ADDR_INTRVL;
@@ -593,6 +606,9 @@ static void hpre_close_sva_prefetch(struct hisi_qm *qm)
static void hpre_enable_clock_gate(struct hisi_qm *qm)
{
+ unsigned long offset;
+ u8 clusters_num, i;
+ u32 hpre_core_info;
u32 val;
if (qm->ver < QM_HW_V3)
@@ -606,17 +622,26 @@ static void hpre_enable_clock_gate(struct hisi_qm *qm)
val |= HPRE_PEH_CFG_AUTO_GATE_EN;
writel(val, qm->io_base + HPRE_PEH_CFG_AUTO_GATE);
- val = readl(qm->io_base + HPRE_CLUSTER_DYN_CTL);
- val |= HPRE_CLUSTER_DYN_CTL_EN;
- writel(val, qm->io_base + HPRE_CLUSTER_DYN_CTL);
+ hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+ clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+ hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
+ for (i = 0; i < clusters_num; i++) {
+ offset = (unsigned long)i * HPRE_CLSTR_ADDR_INTRVL;
+ val = readl(qm->io_base + offset + HPRE_CLUSTER_DYN_CTL);
+ val |= HPRE_CLUSTER_DYN_CTL_EN;
+ writel(val, qm->io_base + offset + HPRE_CLUSTER_DYN_CTL);
- val = readl_relaxed(qm->io_base + HPRE_CORE_SHB_CFG);
- val |= HPRE_CORE_GATE_EN;
- writel(val, qm->io_base + HPRE_CORE_SHB_CFG);
+ val = readl(qm->io_base + offset + HPRE_CORE_SHB_CFG);
+ val |= HPRE_CORE_GATE_EN;
+ writel(val, qm->io_base + offset + HPRE_CORE_SHB_CFG);
+ }
}
static void hpre_disable_clock_gate(struct hisi_qm *qm)
{
+ unsigned long offset;
+ u8 clusters_num, i;
+ u32 hpre_core_info;
u32 val;
if (qm->ver < QM_HW_V3)
@@ -630,13 +655,19 @@ static void hpre_disable_clock_gate(struct hisi_qm *qm)
val &= ~HPRE_PEH_CFG_AUTO_GATE_EN;
writel(val, qm->io_base + HPRE_PEH_CFG_AUTO_GATE);
- val = readl(qm->io_base + HPRE_CLUSTER_DYN_CTL);
- val &= ~HPRE_CLUSTER_DYN_CTL_EN;
- writel(val, qm->io_base + HPRE_CLUSTER_DYN_CTL);
+ hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+ clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+ hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
+ for (i = 0; i < clusters_num; i++) {
+ offset = (unsigned long)i * HPRE_CLSTR_ADDR_INTRVL;
+ val = readl(qm->io_base + offset + HPRE_CLUSTER_DYN_CTL);
+ val &= ~HPRE_CLUSTER_DYN_CTL_EN;
+ writel(val, qm->io_base + offset + HPRE_CLUSTER_DYN_CTL);
- val = readl_relaxed(qm->io_base + HPRE_CORE_SHB_CFG);
- val &= ~HPRE_CORE_GATE_EN;
- writel(val, qm->io_base + HPRE_CORE_SHB_CFG);
+ val = readl(qm->io_base + offset + HPRE_CORE_SHB_CFG);
+ val &= ~HPRE_CORE_GATE_EN;
+ writel(val, qm->io_base + offset + HPRE_CORE_SHB_CFG);
+ }
}
static int hpre_set_user_domain_and_cache(struct hisi_qm *qm)
@@ -699,11 +730,14 @@ static int hpre_set_user_domain_and_cache(struct hisi_qm *qm)
static void hpre_cnt_regs_clear(struct hisi_qm *qm)
{
unsigned long offset;
+ u32 hpre_core_info;
u8 clusters_num;
int i;
/* clear clusterX/cluster_ctrl */
- clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+ hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+ clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+ hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
for (i = 0; i < clusters_num; i++) {
offset = HPRE_CLSTR_BASE + i * HPRE_CLSTR_ADDR_INTRVL;
writel(0x0, qm->io_base + offset + HPRE_CLUSTER_INQURY);
@@ -995,10 +1029,13 @@ static int hpre_cluster_debugfs_init(struct hisi_qm *qm)
char buf[HPRE_DBGFS_VAL_MAX_LEN];
struct debugfs_regset32 *regset;
struct dentry *tmp_d;
+ u32 hpre_core_info;
u8 clusters_num;
int i, ret;
- clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+ hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+ clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+ hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
for (i = 0; i < clusters_num; i++) {
ret = snprintf(buf, HPRE_DBGFS_VAL_MAX_LEN, "cluster%d", i);
if (ret >= HPRE_DBGFS_VAL_MAX_LEN)
@@ -1041,6 +1078,26 @@ static int hpre_ctrl_debug_init(struct hisi_qm *qm)
return hpre_cluster_debugfs_init(qm);
}
+static int hpre_cap_regs_show(struct seq_file *s, void *unused)
+{
+ struct hisi_qm *qm = s->private;
+ u32 i, size;
+
+ size = qm->cap_tables.qm_cap_size;
+ for (i = 0; i < size; i++)
+ seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.qm_cap_table[i].name,
+ qm->cap_tables.qm_cap_table[i].cap_val);
+
+ size = qm->cap_tables.dev_cap_size;
+ for (i = 0; i < size; i++)
+ seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.dev_cap_table[i].name,
+ qm->cap_tables.dev_cap_table[i].cap_val);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hpre_cap_regs);
+
static void hpre_dfx_debug_init(struct hisi_qm *qm)
{
struct dfx_diff_registers *hpre_regs = qm->debug.acc_diff_regs;
@@ -1059,6 +1116,9 @@ static void hpre_dfx_debug_init(struct hisi_qm *qm)
if (qm->fun_type == QM_HW_PF && hpre_regs)
debugfs_create_file("diff_regs", 0444, parent,
qm, &hpre_diff_regs_fops);
+
+ debugfs_create_file("cap_regs", CAP_FILE_PERMISSION,
+ qm->debug.debug_root, qm, &hpre_cap_regs_fops);
}
static int hpre_debugfs_init(struct hisi_qm *qm)
@@ -1106,26 +1166,33 @@ static int hpre_pre_store_cap_reg(struct hisi_qm *qm)
{
struct hisi_qm_cap_record *hpre_cap;
struct device *dev = &qm->pdev->dev;
+ u32 hpre_core_info;
+ u8 clusters_num;
size_t i, size;
- size = ARRAY_SIZE(hpre_pre_store_caps);
+ size = ARRAY_SIZE(hpre_cap_query_info);
hpre_cap = devm_kzalloc(dev, sizeof(*hpre_cap) * size, GFP_KERNEL);
if (!hpre_cap)
return -ENOMEM;
for (i = 0; i < size; i++) {
- hpre_cap[i].type = hpre_pre_store_caps[i];
- hpre_cap[i].cap_val = hisi_qm_get_hw_info(qm, hpre_basic_info,
- hpre_pre_store_caps[i], qm->cap_ver);
+ hpre_cap[i].type = hpre_cap_query_info[i].type;
+ hpre_cap[i].name = hpre_cap_query_info[i].name;
+ hpre_cap[i].cap_val = hisi_qm_get_cap_value(qm, hpre_cap_query_info,
+ i, qm->cap_ver);
}
- if (hpre_cap[HPRE_CLUSTER_NUM_CAP_IDX].cap_val > HPRE_CLUSTERS_NUM_MAX) {
+ hpre_core_info = hpre_cap[HPRE_CORE_INFO].cap_val;
+ clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+ hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
+ if (clusters_num > HPRE_CLUSTERS_NUM_MAX) {
dev_err(dev, "Device cluster num %u is out of range for driver supports %d!\n",
- hpre_cap[HPRE_CLUSTER_NUM_CAP_IDX].cap_val, HPRE_CLUSTERS_NUM_MAX);
+ clusters_num, HPRE_CLUSTERS_NUM_MAX);
return -EINVAL;
}
qm->cap_tables.dev_cap_table = hpre_cap;
+ qm->cap_tables.dev_cap_size = size;
return 0;
}
@@ -1172,7 +1239,7 @@ static int hpre_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
return ret;
}
- alg_msk = qm->cap_tables.dev_cap_table[HPRE_DEV_ALG_BITMAP_CAP_IDX].cap_val;
+ alg_msk = qm->cap_tables.dev_cap_table[HPRE_ALG_BITMAP].cap_val;
ret = hisi_qm_set_algs(qm, alg_msk, hpre_dev_algs, ARRAY_SIZE(hpre_dev_algs));
if (ret) {
pci_err(pdev, "Failed to set hpre algs!\n");
@@ -1188,10 +1255,13 @@ static int hpre_show_last_regs_init(struct hisi_qm *qm)
int com_dfx_regs_num = ARRAY_SIZE(hpre_com_dfx_regs);
struct qm_debug *debug = &qm->debug;
void __iomem *io_base;
+ u32 hpre_core_info;
u8 clusters_num;
int i, j, idx;
- clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+ hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+ clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+ hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
debug->last_words = kcalloc(cluster_dfx_regs_num * clusters_num +
com_dfx_regs_num, sizeof(unsigned int), GFP_KERNEL);
if (!debug->last_words)
@@ -1231,6 +1301,7 @@ static void hpre_show_last_dfx_regs(struct hisi_qm *qm)
struct qm_debug *debug = &qm->debug;
struct pci_dev *pdev = qm->pdev;
void __iomem *io_base;
+ u32 hpre_core_info;
u8 clusters_num;
int i, j, idx;
u32 val;
@@ -1246,7 +1317,9 @@ static void hpre_show_last_dfx_regs(struct hisi_qm *qm)
hpre_com_dfx_regs[i].name, debug->last_words[i], val);
}
- clusters_num = qm->cap_tables.dev_cap_table[HPRE_CLUSTER_NUM_CAP_IDX].cap_val;
+ hpre_core_info = qm->cap_tables.dev_cap_table[HPRE_CORE_INFO].cap_val;
+ clusters_num = (hpre_core_info >> hpre_basic_info[HPRE_CLUSTER_NUM_CAP].shift) &
+ hpre_basic_info[HPRE_CLUSTER_NUM_CAP].mask;
for (i = 0; i < clusters_num; i++) {
io_base = qm->io_base + hpre_cluster_offsets[i];
for (j = 0; j < cluster_dfx_regs_num; j++) {
@@ -1280,11 +1353,15 @@ static u32 hpre_get_hw_err_status(struct hisi_qm *qm)
static void hpre_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts)
{
- u32 nfe;
-
writel(err_sts, qm->io_base + HPRE_HAC_SOURCE_INT);
- nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver);
- writel(nfe, qm->io_base + HPRE_RAS_NFE_ENB);
+}
+
+static void hpre_disable_error_report(struct hisi_qm *qm, u32 err_type)
+{
+ u32 nfe_mask;
+
+ nfe_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver);
+ writel(nfe_mask & (~err_type), qm->io_base + HPRE_RAS_NFE_ENB);
}
static void hpre_open_axi_master_ooo(struct hisi_qm *qm)
@@ -1298,6 +1375,27 @@ static void hpre_open_axi_master_ooo(struct hisi_qm *qm)
qm->io_base + HPRE_AM_OOO_SHUTDOWN_ENB);
}
+static enum acc_err_result hpre_get_err_result(struct hisi_qm *qm)
+{
+ u32 err_status;
+
+ err_status = hpre_get_hw_err_status(qm);
+ if (err_status) {
+ if (err_status & qm->err_info.ecc_2bits_mask)
+ qm->err_status.is_dev_ecc_mbit = true;
+ hpre_log_hw_error(qm, err_status);
+
+ if (err_status & qm->err_info.dev_reset_mask) {
+ /* Disable the same error reporting until device is recovered. */
+ hpre_disable_error_report(qm, err_status);
+ return ACC_ERR_NEED_RESET;
+ }
+ hpre_clear_hw_err_status(qm, err_status);
+ }
+
+ return ACC_ERR_RECOVERED;
+}
+
static void hpre_err_info_init(struct hisi_qm *qm)
{
struct hisi_qm_err_info *err_info = &qm->err_info;
@@ -1324,12 +1422,12 @@ static const struct hisi_qm_err_ini hpre_err_ini = {
.hw_err_disable = hpre_hw_error_disable,
.get_dev_hw_err_status = hpre_get_hw_err_status,
.clear_dev_hw_err_status = hpre_clear_hw_err_status,
- .log_dev_hw_err = hpre_log_hw_error,
.open_axi_master_ooo = hpre_open_axi_master_ooo,
.open_sva_prefetch = hpre_open_sva_prefetch,
.close_sva_prefetch = hpre_close_sva_prefetch,
.show_last_dfx_regs = hpre_show_last_dfx_regs,
.err_info_init = hpre_err_info_init,
+ .get_err_result = hpre_get_err_result,
};
static int hpre_pf_probe_init(struct hpre *hpre)
diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
index 07983af..19c1b5d 100644
--- a/drivers/crypto/hisilicon/qm.c
+++ b/drivers/crypto/hisilicon/qm.c
@@ -271,12 +271,6 @@ enum vft_type {
SHAPER_VFT,
};
-enum acc_err_result {
- ACC_ERR_NONE,
- ACC_ERR_NEED_RESET,
- ACC_ERR_RECOVERED,
-};
-
enum qm_alg_type {
ALG_TYPE_0,
ALG_TYPE_1,
@@ -307,11 +301,29 @@ enum qm_basic_type {
QM_VF_IRQ_NUM_CAP,
};
-enum qm_pre_store_cap_idx {
- QM_EQ_IRQ_TYPE_CAP_IDX = 0x0,
- QM_AEQ_IRQ_TYPE_CAP_IDX,
- QM_ABN_IRQ_TYPE_CAP_IDX,
- QM_PF2VF_IRQ_TYPE_CAP_IDX,
+enum qm_cap_table_type {
+ QM_CAP_VF = 0x0,
+ QM_AEQE_NUM,
+ QM_SCQE_NUM,
+ QM_EQ_IRQ,
+ QM_AEQ_IRQ,
+ QM_ABNORMAL_IRQ,
+ QM_MB_IRQ,
+ MAX_IRQ_NUM,
+ EXT_BAR_INDEX,
+};
+
+static const struct hisi_qm_cap_query_info qm_cap_query_info[] = {
+ {QM_CAP_VF, "QM_CAP_VF ", 0x3100, 0x0, 0x0, 0x6F01},
+ {QM_AEQE_NUM, "QM_AEQE_NUM ", 0x3104, 0x800, 0x4000800, 0x4000800},
+ {QM_SCQE_NUM, "QM_SCQE_NUM ",
+ 0x3108, 0x4000400, 0x4000400, 0x4000400},
+ {QM_EQ_IRQ, "QM_EQ_IRQ ", 0x310c, 0x10000, 0x10000, 0x10000},
+ {QM_AEQ_IRQ, "QM_AEQ_IRQ ", 0x3110, 0x0, 0x10001, 0x10001},
+ {QM_ABNORMAL_IRQ, "QM_ABNORMAL_IRQ ", 0x3114, 0x0, 0x10003, 0x10003},
+ {QM_MB_IRQ, "QM_MB_IRQ ", 0x3118, 0x0, 0x0, 0x10002},
+ {MAX_IRQ_NUM, "MAX_IRQ_NUM ", 0x311c, 0x10001, 0x40002, 0x40003},
+ {EXT_BAR_INDEX, "EXT_BAR_INDEX ", 0x3120, 0x0, 0x0, 0x14},
};
static const struct hisi_qm_cap_info qm_cap_info_comm[] = {
@@ -344,13 +356,6 @@ static const struct hisi_qm_cap_info qm_basic_info[] = {
{QM_VF_IRQ_NUM_CAP, 0x311c, 0, GENMASK(15, 0), 0x1, 0x2, 0x3},
};
-static const u32 qm_pre_store_caps[] = {
- QM_EQ_IRQ_TYPE_CAP,
- QM_AEQ_IRQ_TYPE_CAP,
- QM_ABN_IRQ_TYPE_CAP,
- QM_PF2VF_IRQ_TYPE_CAP,
-};
-
struct qm_mailbox {
__le16 w0;
__le16 queue_num;
@@ -451,6 +456,37 @@ static struct qm_typical_qos_table shaper_cbs_s[] = {
static void qm_irqs_unregister(struct hisi_qm *qm);
static int qm_reset_device(struct hisi_qm *qm);
+int hisi_qm_q_num_set(const char *val, const struct kernel_param *kp,
+ unsigned int device)
+{
+ struct pci_dev *pdev;
+ u32 n, q_num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+
+ pdev = pci_get_device(PCI_VENDOR_ID_HUAWEI, device, NULL);
+ if (!pdev) {
+ q_num = min_t(u32, QM_QNUM_V1, QM_QNUM_V2);
+ pr_info("No device found currently, suppose queue number is %u\n",
+ q_num);
+ } else {
+ if (pdev->revision == QM_HW_V1)
+ q_num = QM_QNUM_V1;
+ else
+ q_num = QM_QNUM_V2;
+
+ pci_dev_put(pdev);
+ }
+
+ ret = kstrtou32(val, 10, &n);
+ if (ret || n < QM_MIN_QNUM || n > q_num)
+ return -EINVAL;
+
+ return param_set_int(val, kp);
+}
+EXPORT_SYMBOL_GPL(hisi_qm_q_num_set);
static u32 qm_get_hw_error_status(struct hisi_qm *qm)
{
@@ -763,6 +799,27 @@ u32 hisi_qm_get_hw_info(struct hisi_qm *qm,
}
EXPORT_SYMBOL_GPL(hisi_qm_get_hw_info);
+u32 hisi_qm_get_cap_value(struct hisi_qm *qm,
+ const struct hisi_qm_cap_query_info *info_table,
+ u32 index, bool is_read)
+{
+ u32 val;
+
+ switch (qm->ver) {
+ case QM_HW_V1:
+ return info_table[index].v1_val;
+ case QM_HW_V2:
+ return info_table[index].v2_val;
+ default:
+ if (!is_read)
+ return info_table[index].v3_val;
+
+ val = readl(qm->io_base + info_table[index].offset);
+ return val;
+ }
+}
+EXPORT_SYMBOL_GPL(hisi_qm_get_cap_value);
+
static void qm_get_xqc_depth(struct hisi_qm *qm, u16 *low_bits,
u16 *high_bits, enum qm_basic_type type)
{
@@ -1425,22 +1482,25 @@ static void qm_log_hw_error(struct hisi_qm *qm, u32 error_status)
static enum acc_err_result qm_hw_error_handle_v2(struct hisi_qm *qm)
{
- u32 error_status, tmp;
+ u32 error_status;
- /* read err sts */
- tmp = readl(qm->io_base + QM_ABNORMAL_INT_STATUS);
- error_status = qm->error_mask & tmp;
-
- if (error_status) {
+ error_status = qm_get_hw_error_status(qm);
+ if (error_status & qm->error_mask) {
if (error_status & QM_ECC_MBIT)
qm->err_status.is_qm_ecc_mbit = true;
qm_log_hw_error(qm, error_status);
- if (error_status & qm->err_info.qm_reset_mask)
+ if (error_status & qm->err_info.qm_reset_mask) {
+ /* Disable the same error reporting until device is recovered. */
+ writel(qm->err_info.nfe & (~error_status),
+ qm->io_base + QM_RAS_NFE_ENABLE);
return ACC_ERR_NEED_RESET;
+ }
+ /* Clear error source if not need reset. */
writel(error_status, qm->io_base + QM_ABNORMAL_INT_SOURCE);
writel(qm->err_info.nfe, qm->io_base + QM_RAS_NFE_ENABLE);
+ writel(qm->err_info.ce, qm->io_base + QM_RAS_CE_ENABLE);
}
return ACC_ERR_RECOVERED;
@@ -3861,30 +3921,12 @@ EXPORT_SYMBOL_GPL(hisi_qm_sriov_configure);
static enum acc_err_result qm_dev_err_handle(struct hisi_qm *qm)
{
- u32 err_sts;
-
- if (!qm->err_ini->get_dev_hw_err_status) {
- dev_err(&qm->pdev->dev, "Device doesn't support get hw error status!\n");
+ if (!qm->err_ini->get_err_result) {
+ dev_err(&qm->pdev->dev, "Device doesn't support reset!\n");
return ACC_ERR_NONE;
}
- /* get device hardware error status */
- err_sts = qm->err_ini->get_dev_hw_err_status(qm);
- if (err_sts) {
- if (err_sts & qm->err_info.ecc_2bits_mask)
- qm->err_status.is_dev_ecc_mbit = true;
-
- if (qm->err_ini->log_dev_hw_err)
- qm->err_ini->log_dev_hw_err(qm, err_sts);
-
- if (err_sts & qm->err_info.dev_reset_mask)
- return ACC_ERR_NEED_RESET;
-
- if (qm->err_ini->clear_dev_hw_err_status)
- qm->err_ini->clear_dev_hw_err_status(qm, err_sts);
- }
-
- return ACC_ERR_RECOVERED;
+ return qm->err_ini->get_err_result(qm);
}
static enum acc_err_result qm_process_dev_error(struct hisi_qm *qm)
@@ -4866,7 +4908,7 @@ static void qm_unregister_abnormal_irq(struct hisi_qm *qm)
if (qm->fun_type == QM_HW_VF)
return;
- val = qm->cap_tables.qm_cap_table[QM_ABN_IRQ_TYPE_CAP_IDX].cap_val;
+ val = qm->cap_tables.qm_cap_table[QM_ABNORMAL_IRQ].cap_val;
if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_ABN_IRQ_TYPE_MASK))
return;
@@ -4883,7 +4925,7 @@ static int qm_register_abnormal_irq(struct hisi_qm *qm)
if (qm->fun_type == QM_HW_VF)
return 0;
- val = qm->cap_tables.qm_cap_table[QM_ABN_IRQ_TYPE_CAP_IDX].cap_val;
+ val = qm->cap_tables.qm_cap_table[QM_ABNORMAL_IRQ].cap_val;
if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_ABN_IRQ_TYPE_MASK))
return 0;
@@ -4900,7 +4942,7 @@ static void qm_unregister_mb_cmd_irq(struct hisi_qm *qm)
struct pci_dev *pdev = qm->pdev;
u32 irq_vector, val;
- val = qm->cap_tables.qm_cap_table[QM_PF2VF_IRQ_TYPE_CAP_IDX].cap_val;
+ val = qm->cap_tables.qm_cap_table[QM_MB_IRQ].cap_val;
if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
return;
@@ -4914,7 +4956,7 @@ static int qm_register_mb_cmd_irq(struct hisi_qm *qm)
u32 irq_vector, val;
int ret;
- val = qm->cap_tables.qm_cap_table[QM_PF2VF_IRQ_TYPE_CAP_IDX].cap_val;
+ val = qm->cap_tables.qm_cap_table[QM_MB_IRQ].cap_val;
if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
return 0;
@@ -4931,7 +4973,7 @@ static void qm_unregister_aeq_irq(struct hisi_qm *qm)
struct pci_dev *pdev = qm->pdev;
u32 irq_vector, val;
- val = qm->cap_tables.qm_cap_table[QM_AEQ_IRQ_TYPE_CAP_IDX].cap_val;
+ val = qm->cap_tables.qm_cap_table[QM_AEQ_IRQ].cap_val;
if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
return;
@@ -4945,7 +4987,7 @@ static int qm_register_aeq_irq(struct hisi_qm *qm)
u32 irq_vector, val;
int ret;
- val = qm->cap_tables.qm_cap_table[QM_AEQ_IRQ_TYPE_CAP_IDX].cap_val;
+ val = qm->cap_tables.qm_cap_table[QM_AEQ_IRQ].cap_val;
if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
return 0;
@@ -4963,7 +5005,7 @@ static void qm_unregister_eq_irq(struct hisi_qm *qm)
struct pci_dev *pdev = qm->pdev;
u32 irq_vector, val;
- val = qm->cap_tables.qm_cap_table[QM_EQ_IRQ_TYPE_CAP_IDX].cap_val;
+ val = qm->cap_tables.qm_cap_table[QM_EQ_IRQ].cap_val;
if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
return;
@@ -4977,7 +5019,7 @@ static int qm_register_eq_irq(struct hisi_qm *qm)
u32 irq_vector, val;
int ret;
- val = qm->cap_tables.qm_cap_table[QM_EQ_IRQ_TYPE_CAP_IDX].cap_val;
+ val = qm->cap_tables.qm_cap_table[QM_EQ_IRQ].cap_val;
if (!((val >> QM_IRQ_TYPE_SHIFT) & QM_IRQ_TYPE_MASK))
return 0;
@@ -5065,24 +5107,26 @@ static int qm_get_qp_num(struct hisi_qm *qm)
return 0;
}
-static int qm_pre_store_irq_type_caps(struct hisi_qm *qm)
+static int qm_pre_store_caps(struct hisi_qm *qm)
{
struct hisi_qm_cap_record *qm_cap;
struct pci_dev *pdev = qm->pdev;
size_t i, size;
- size = ARRAY_SIZE(qm_pre_store_caps);
+ size = ARRAY_SIZE(qm_cap_query_info);
qm_cap = devm_kzalloc(&pdev->dev, sizeof(*qm_cap) * size, GFP_KERNEL);
if (!qm_cap)
return -ENOMEM;
for (i = 0; i < size; i++) {
- qm_cap[i].type = qm_pre_store_caps[i];
- qm_cap[i].cap_val = hisi_qm_get_hw_info(qm, qm_basic_info,
- qm_pre_store_caps[i], qm->cap_ver);
+ qm_cap[i].type = qm_cap_query_info[i].type;
+ qm_cap[i].name = qm_cap_query_info[i].name;
+ qm_cap[i].cap_val = hisi_qm_get_cap_value(qm, qm_cap_query_info,
+ i, qm->cap_ver);
}
qm->cap_tables.qm_cap_table = qm_cap;
+ qm->cap_tables.qm_cap_size = size;
return 0;
}
@@ -5119,8 +5163,8 @@ static int qm_get_hw_caps(struct hisi_qm *qm)
set_bit(cap_info[i].type, &qm->caps);
}
- /* Fetch and save the value of irq type related capability registers */
- return qm_pre_store_irq_type_caps(qm);
+ /* Fetch and save the value of qm capability registers */
+ return qm_pre_store_caps(qm);
}
static int qm_get_pci_res(struct hisi_qm *qm)
diff --git a/drivers/crypto/hisilicon/sec/sec_drv.c b/drivers/crypto/hisilicon/sec/sec_drv.c
index 9bafcc5..ef0cb73 100644
--- a/drivers/crypto/hisilicon/sec/sec_drv.c
+++ b/drivers/crypto/hisilicon/sec/sec_drv.c
@@ -1304,7 +1304,7 @@ MODULE_DEVICE_TABLE(acpi, sec_acpi_match);
static struct platform_driver sec_driver = {
.probe = sec_probe,
- .remove_new = sec_remove,
+ .remove = sec_remove,
.driver = {
.name = "hisi_sec_platform_driver",
.of_match_table = sec_match,
diff --git a/drivers/crypto/hisilicon/sec2/sec.h b/drivers/crypto/hisilicon/sec2/sec.h
index 410c837..356188b 100644
--- a/drivers/crypto/hisilicon/sec2/sec.h
+++ b/drivers/crypto/hisilicon/sec2/sec.h
@@ -220,11 +220,27 @@ enum sec_cap_type {
SEC_CORE4_ALG_BITMAP_HIGH,
};
-enum sec_cap_reg_record_idx {
- SEC_DRV_ALG_BITMAP_LOW_IDX = 0x0,
- SEC_DRV_ALG_BITMAP_HIGH_IDX,
- SEC_DEV_ALG_BITMAP_LOW_IDX,
- SEC_DEV_ALG_BITMAP_HIGH_IDX,
+enum sec_cap_table_type {
+ QM_RAS_NFE_TYPE = 0x0,
+ QM_RAS_NFE_RESET,
+ QM_RAS_CE_TYPE,
+ SEC_RAS_NFE_TYPE,
+ SEC_RAS_NFE_RESET,
+ SEC_RAS_CE_TYPE,
+ SEC_CORE_INFO,
+ SEC_CORE_EN,
+ SEC_DRV_ALG_BITMAP_LOW_TB,
+ SEC_DRV_ALG_BITMAP_HIGH_TB,
+ SEC_ALG_BITMAP_LOW,
+ SEC_ALG_BITMAP_HIGH,
+ SEC_CORE1_BITMAP_LOW,
+ SEC_CORE1_BITMAP_HIGH,
+ SEC_CORE2_BITMAP_LOW,
+ SEC_CORE2_BITMAP_HIGH,
+ SEC_CORE3_BITMAP_LOW,
+ SEC_CORE3_BITMAP_HIGH,
+ SEC_CORE4_BITMAP_LOW,
+ SEC_CORE4_BITMAP_HIGH,
};
void sec_destroy_qps(struct hisi_qp **qps, int qp_num);
diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c
index 0558f98..ae9ebbb 100644
--- a/drivers/crypto/hisilicon/sec2/sec_crypto.c
+++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c
@@ -2520,8 +2520,8 @@ int sec_register_to_crypto(struct hisi_qm *qm)
u64 alg_mask;
int ret = 0;
- alg_mask = sec_get_alg_bitmap(qm, SEC_DRV_ALG_BITMAP_HIGH_IDX,
- SEC_DRV_ALG_BITMAP_LOW_IDX);
+ alg_mask = sec_get_alg_bitmap(qm, SEC_DRV_ALG_BITMAP_HIGH_TB,
+ SEC_DRV_ALG_BITMAP_LOW_TB);
mutex_lock(&sec_algs_lock);
if (sec_available_devs) {
@@ -2553,8 +2553,8 @@ void sec_unregister_from_crypto(struct hisi_qm *qm)
{
u64 alg_mask;
- alg_mask = sec_get_alg_bitmap(qm, SEC_DRV_ALG_BITMAP_HIGH_IDX,
- SEC_DRV_ALG_BITMAP_LOW_IDX);
+ alg_mask = sec_get_alg_bitmap(qm, SEC_DRV_ALG_BITMAP_HIGH_TB,
+ SEC_DRV_ALG_BITMAP_LOW_TB);
mutex_lock(&sec_algs_lock);
if (--sec_available_devs)
diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c
index c35533d..8ec5333 100644
--- a/drivers/crypto/hisilicon/sec2/sec_main.c
+++ b/drivers/crypto/hisilicon/sec2/sec_main.c
@@ -14,9 +14,9 @@
#include <linux/seq_file.h>
#include <linux/topology.h>
#include <linux/uacce.h>
-
#include "sec.h"
+#define CAP_FILE_PERMISSION 0444
#define SEC_VF_NUM 63
#define SEC_QUEUE_NUM_V1 4096
#define PCI_DEVICE_ID_HUAWEI_SEC_PF 0xa255
@@ -167,11 +167,34 @@ static const struct hisi_qm_cap_info sec_basic_info[] = {
{SEC_CORE4_ALG_BITMAP_HIGH, 0x3170, 0, GENMASK(31, 0), 0x3FFF, 0x3FFF, 0x3FFF},
};
-static const u32 sec_pre_store_caps[] = {
- SEC_DRV_ALG_BITMAP_LOW,
- SEC_DRV_ALG_BITMAP_HIGH,
- SEC_DEV_ALG_BITMAP_LOW,
- SEC_DEV_ALG_BITMAP_HIGH,
+static const struct hisi_qm_cap_query_info sec_cap_query_info[] = {
+ {QM_RAS_NFE_TYPE, "QM_RAS_NFE_TYPE ", 0x3124, 0x0, 0x1C77, 0x7C77},
+ {QM_RAS_NFE_RESET, "QM_RAS_NFE_RESET ", 0x3128, 0x0, 0xC77, 0x6C77},
+ {QM_RAS_CE_TYPE, "QM_RAS_CE_TYPE ", 0x312C, 0x0, 0x8, 0x8},
+ {SEC_RAS_NFE_TYPE, "SEC_RAS_NFE_TYPE ", 0x3130, 0x0, 0x177, 0x60177},
+ {SEC_RAS_NFE_RESET, "SEC_RAS_NFE_RESET ", 0x3134, 0x0, 0x177, 0x177},
+ {SEC_RAS_CE_TYPE, "SEC_RAS_CE_TYPE ", 0x3138, 0x0, 0x88, 0xC088},
+ {SEC_CORE_INFO, "SEC_CORE_INFO ", 0x313c, 0x110404, 0x110404, 0x110404},
+ {SEC_CORE_EN, "SEC_CORE_EN ", 0x3140, 0x17F, 0x17F, 0xF},
+ {SEC_DRV_ALG_BITMAP_LOW_TB, "SEC_DRV_ALG_BITMAP_LOW ",
+ 0x3144, 0x18050CB, 0x18050CB, 0x18670CF},
+ {SEC_DRV_ALG_BITMAP_HIGH_TB, "SEC_DRV_ALG_BITMAP_HIGH ",
+ 0x3148, 0x395C, 0x395C, 0x395C},
+ {SEC_ALG_BITMAP_LOW, "SEC_ALG_BITMAP_LOW ",
+ 0x314c, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF},
+ {SEC_ALG_BITMAP_HIGH, "SEC_ALG_BITMAP_HIGH ", 0x3150, 0x3FFF, 0x3FFF, 0x3FFF},
+ {SEC_CORE1_BITMAP_LOW, "SEC_CORE1_BITMAP_LOW ",
+ 0x3154, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF},
+ {SEC_CORE1_BITMAP_HIGH, "SEC_CORE1_BITMAP_HIGH ", 0x3158, 0x3FFF, 0x3FFF, 0x3FFF},
+ {SEC_CORE2_BITMAP_LOW, "SEC_CORE2_BITMAP_LOW ",
+ 0x315c, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF},
+ {SEC_CORE2_BITMAP_HIGH, "SEC_CORE2_BITMAP_HIGH ", 0x3160, 0x3FFF, 0x3FFF, 0x3FFF},
+ {SEC_CORE3_BITMAP_LOW, "SEC_CORE3_BITMAP_LOW ",
+ 0x3164, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF},
+ {SEC_CORE3_BITMAP_HIGH, "SEC_CORE3_BITMAP_HIGH ", 0x3168, 0x3FFF, 0x3FFF, 0x3FFF},
+ {SEC_CORE4_BITMAP_LOW, "SEC_CORE4_BITMAP_LOW ",
+ 0x316c, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF},
+ {SEC_CORE4_BITMAP_HIGH, "SEC_CORE4_BITMAP_HIGH ", 0x3170, 0x3FFF, 0x3FFF, 0x3FFF},
};
static const struct qm_dev_alg sec_dev_algs[] = { {
@@ -322,7 +345,7 @@ static int sec_pf_q_num_set(const char *val, const struct kernel_param *kp)
{
pf_q_num_flag = true;
- return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_SEC_PF);
+ return hisi_qm_q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_SEC_PF);
}
static const struct kernel_param_ops sec_pf_q_num_ops = {
@@ -838,6 +861,26 @@ static int sec_regs_show(struct seq_file *s, void *unused)
DEFINE_SHOW_ATTRIBUTE(sec_regs);
+static int sec_cap_regs_show(struct seq_file *s, void *unused)
+{
+ struct hisi_qm *qm = s->private;
+ u32 i, size;
+
+ size = qm->cap_tables.qm_cap_size;
+ for (i = 0; i < size; i++)
+ seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.qm_cap_table[i].name,
+ qm->cap_tables.qm_cap_table[i].cap_val);
+
+ size = qm->cap_tables.dev_cap_size;
+ for (i = 0; i < size; i++)
+ seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.dev_cap_table[i].name,
+ qm->cap_tables.dev_cap_table[i].cap_val);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(sec_cap_regs);
+
static int sec_core_debug_init(struct hisi_qm *qm)
{
struct dfx_diff_registers *sec_regs = qm->debug.acc_diff_regs;
@@ -872,6 +915,9 @@ static int sec_core_debug_init(struct hisi_qm *qm)
tmp_d, data, &sec_atomic64_ops);
}
+ debugfs_create_file("cap_regs", CAP_FILE_PERMISSION,
+ qm->debug.debug_root, qm, &sec_cap_regs_fops);
+
return 0;
}
@@ -1010,11 +1056,15 @@ static u32 sec_get_hw_err_status(struct hisi_qm *qm)
static void sec_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts)
{
- u32 nfe;
-
writel(err_sts, qm->io_base + SEC_CORE_INT_SOURCE);
- nfe = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_NFE_MASK_CAP, qm->cap_ver);
- writel(nfe, qm->io_base + SEC_RAS_NFE_REG);
+}
+
+static void sec_disable_error_report(struct hisi_qm *qm, u32 err_type)
+{
+ u32 nfe_mask;
+
+ nfe_mask = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_NFE_MASK_CAP, qm->cap_ver);
+ writel(nfe_mask & (~err_type), qm->io_base + SEC_RAS_NFE_REG);
}
static void sec_open_axi_master_ooo(struct hisi_qm *qm)
@@ -1026,6 +1076,27 @@ static void sec_open_axi_master_ooo(struct hisi_qm *qm)
writel(val | SEC_AXI_SHUTDOWN_ENABLE, qm->io_base + SEC_CONTROL_REG);
}
+static enum acc_err_result sec_get_err_result(struct hisi_qm *qm)
+{
+ u32 err_status;
+
+ err_status = sec_get_hw_err_status(qm);
+ if (err_status) {
+ if (err_status & qm->err_info.ecc_2bits_mask)
+ qm->err_status.is_dev_ecc_mbit = true;
+ sec_log_hw_error(qm, err_status);
+
+ if (err_status & qm->err_info.dev_reset_mask) {
+ /* Disable the same error reporting until device is recovered. */
+ sec_disable_error_report(qm, err_status);
+ return ACC_ERR_NEED_RESET;
+ }
+ sec_clear_hw_err_status(qm, err_status);
+ }
+
+ return ACC_ERR_RECOVERED;
+}
+
static void sec_err_info_init(struct hisi_qm *qm)
{
struct hisi_qm_err_info *err_info = &qm->err_info;
@@ -1052,12 +1123,12 @@ static const struct hisi_qm_err_ini sec_err_ini = {
.hw_err_disable = sec_hw_error_disable,
.get_dev_hw_err_status = sec_get_hw_err_status,
.clear_dev_hw_err_status = sec_clear_hw_err_status,
- .log_dev_hw_err = sec_log_hw_error,
.open_axi_master_ooo = sec_open_axi_master_ooo,
.open_sva_prefetch = sec_open_sva_prefetch,
.close_sva_prefetch = sec_close_sva_prefetch,
.show_last_dfx_regs = sec_show_last_dfx_regs,
.err_info_init = sec_err_info_init,
+ .get_err_result = sec_get_err_result,
};
static int sec_pf_probe_init(struct sec_dev *sec)
@@ -1085,18 +1156,20 @@ static int sec_pre_store_cap_reg(struct hisi_qm *qm)
struct pci_dev *pdev = qm->pdev;
size_t i, size;
- size = ARRAY_SIZE(sec_pre_store_caps);
+ size = ARRAY_SIZE(sec_cap_query_info);
sec_cap = devm_kzalloc(&pdev->dev, sizeof(*sec_cap) * size, GFP_KERNEL);
if (!sec_cap)
return -ENOMEM;
for (i = 0; i < size; i++) {
- sec_cap[i].type = sec_pre_store_caps[i];
- sec_cap[i].cap_val = hisi_qm_get_hw_info(qm, sec_basic_info,
- sec_pre_store_caps[i], qm->cap_ver);
+ sec_cap[i].type = sec_cap_query_info[i].type;
+ sec_cap[i].name = sec_cap_query_info[i].name;
+ sec_cap[i].cap_val = hisi_qm_get_cap_value(qm, sec_cap_query_info,
+ i, qm->cap_ver);
}
qm->cap_tables.dev_cap_table = sec_cap;
+ qm->cap_tables.dev_cap_size = size;
return 0;
}
@@ -1146,8 +1219,7 @@ static int sec_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
hisi_qm_uninit(qm);
return ret;
}
-
- alg_msk = sec_get_alg_bitmap(qm, SEC_DEV_ALG_BITMAP_HIGH_IDX, SEC_DEV_ALG_BITMAP_LOW_IDX);
+ alg_msk = sec_get_alg_bitmap(qm, SEC_ALG_BITMAP_HIGH, SEC_ALG_BITMAP_LOW);
ret = hisi_qm_set_algs(qm, alg_msk, sec_dev_algs, ARRAY_SIZE(sec_dev_algs));
if (ret) {
pci_err(qm->pdev, "Failed to set sec algs!\n");
diff --git a/drivers/crypto/hisilicon/trng/trng.c b/drivers/crypto/hisilicon/trng/trng.c
index 66c551e..ac74df4 100644
--- a/drivers/crypto/hisilicon/trng/trng.c
+++ b/drivers/crypto/hisilicon/trng/trng.c
@@ -324,7 +324,7 @@ MODULE_DEVICE_TABLE(acpi, hisi_trng_acpi_match);
static struct platform_driver hisi_trng_driver = {
.probe = hisi_trng_probe,
- .remove_new = hisi_trng_remove,
+ .remove = hisi_trng_remove,
.driver = {
.name = "hisi-trng-v2",
.acpi_match_table = ACPI_PTR(hisi_trng_acpi_match),
diff --git a/drivers/crypto/hisilicon/zip/zip.h b/drivers/crypto/hisilicon/zip/zip.h
index f2e6da3..2fecf34 100644
--- a/drivers/crypto/hisilicon/zip/zip.h
+++ b/drivers/crypto/hisilicon/zip/zip.h
@@ -81,6 +81,24 @@ struct hisi_zip_sqe {
u32 rsvd1[4];
};
+enum zip_cap_table_type {
+ QM_RAS_NFE_TYPE,
+ QM_RAS_NFE_RESET,
+ QM_RAS_CE_TYPE,
+ ZIP_RAS_NFE_TYPE,
+ ZIP_RAS_NFE_RESET,
+ ZIP_RAS_CE_TYPE,
+ ZIP_CORE_INFO,
+ ZIP_CORE_EN,
+ ZIP_DRV_ALG_BITMAP_TB,
+ ZIP_ALG_BITMAP,
+ ZIP_CORE1_BITMAP,
+ ZIP_CORE2_BITMAP,
+ ZIP_CORE3_BITMAP,
+ ZIP_CORE4_BITMAP,
+ ZIP_CORE5_BITMAP,
+};
+
int zip_create_qps(struct hisi_qp **qps, int qp_num, int node);
int hisi_zip_register_to_crypto(struct hisi_qm *qm);
void hisi_zip_unregister_from_crypto(struct hisi_qm *qm);
diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c
index d07e47b..9239b25 100644
--- a/drivers/crypto/hisilicon/zip/zip_main.c
+++ b/drivers/crypto/hisilicon/zip/zip_main.c
@@ -14,6 +14,7 @@
#include <linux/uacce.h>
#include "zip.h"
+#define CAP_FILE_PERMISSION 0444
#define PCI_DEVICE_ID_HUAWEI_ZIP_PF 0xa250
#define HZIP_QUEUE_NUM_V1 4096
@@ -250,24 +251,22 @@ static struct hisi_qm_cap_info zip_basic_cap_info[] = {
{ZIP_CAP_MAX, 0x317c, 0, GENMASK(0, 0), 0x0, 0x0, 0x0}
};
-enum zip_pre_store_cap_idx {
- ZIP_CORE_NUM_CAP_IDX = 0x0,
- ZIP_CLUSTER_COMP_NUM_CAP_IDX,
- ZIP_CLUSTER_DECOMP_NUM_CAP_IDX,
- ZIP_DECOMP_ENABLE_BITMAP_IDX,
- ZIP_COMP_ENABLE_BITMAP_IDX,
- ZIP_DRV_ALG_BITMAP_IDX,
- ZIP_DEV_ALG_BITMAP_IDX,
-};
-
-static const u32 zip_pre_store_caps[] = {
- ZIP_CORE_NUM_CAP,
- ZIP_CLUSTER_COMP_NUM_CAP,
- ZIP_CLUSTER_DECOMP_NUM_CAP,
- ZIP_DECOMP_ENABLE_BITMAP,
- ZIP_COMP_ENABLE_BITMAP,
- ZIP_DRV_ALG_BITMAP,
- ZIP_DEV_ALG_BITMAP,
+static const struct hisi_qm_cap_query_info zip_cap_query_info[] = {
+ {QM_RAS_NFE_TYPE, "QM_RAS_NFE_TYPE ", 0x3124, 0x0, 0x1C57, 0x7C77},
+ {QM_RAS_NFE_RESET, "QM_RAS_NFE_RESET ", 0x3128, 0x0, 0xC57, 0x6C77},
+ {QM_RAS_CE_TYPE, "QM_RAS_CE_TYPE ", 0x312C, 0x0, 0x8, 0x8},
+ {ZIP_RAS_NFE_TYPE, "ZIP_RAS_NFE_TYPE ", 0x3130, 0x0, 0x7FE, 0x1FFE},
+ {ZIP_RAS_NFE_RESET, "ZIP_RAS_NFE_RESET ", 0x3134, 0x0, 0x7FE, 0x7FE},
+ {ZIP_RAS_CE_TYPE, "ZIP_RAS_CE_TYPE ", 0x3138, 0x0, 0x1, 0x1},
+ {ZIP_CORE_INFO, "ZIP_CORE_INFO ", 0x313C, 0x12080206, 0x12080206, 0x12050203},
+ {ZIP_CORE_EN, "ZIP_CORE_EN ", 0x3140, 0xFC0003, 0xFC0003, 0x1C0003},
+ {ZIP_DRV_ALG_BITMAP_TB, "ZIP_DRV_ALG_BITMAP ", 0x3144, 0x0, 0x0, 0x30},
+ {ZIP_ALG_BITMAP, "ZIP_ALG_BITMAP ", 0x3148, 0xF, 0xF, 0x3F},
+ {ZIP_CORE1_BITMAP, "ZIP_CORE1_BITMAP ", 0x314C, 0x5, 0x5, 0xD5},
+ {ZIP_CORE2_BITMAP, "ZIP_CORE2_BITMAP ", 0x3150, 0x5, 0x5, 0xD5},
+ {ZIP_CORE3_BITMAP, "ZIP_CORE3_BITMAP ", 0x3154, 0xA, 0xA, 0x2A},
+ {ZIP_CORE4_BITMAP, "ZIP_CORE4_BITMAP ", 0x3158, 0xA, 0xA, 0x2A},
+ {ZIP_CORE5_BITMAP, "ZIP_CORE5_BITMAP ", 0x315C, 0xA, 0xA, 0x2A},
};
static const struct debugfs_reg32 hzip_dfx_regs[] = {
@@ -402,7 +401,7 @@ static int pf_q_num_set(const char *val, const struct kernel_param *kp)
{
pf_q_num_flag = true;
- return q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_ZIP_PF);
+ return hisi_qm_q_num_set(val, kp, PCI_DEVICE_ID_HUAWEI_ZIP_PF);
}
static const struct kernel_param_ops pf_q_num_ops = {
@@ -442,7 +441,7 @@ bool hisi_zip_alg_support(struct hisi_qm *qm, u32 alg)
{
u32 cap_val;
- cap_val = qm->cap_tables.dev_cap_table[ZIP_DRV_ALG_BITMAP_IDX].cap_val;
+ cap_val = qm->cap_tables.dev_cap_table[ZIP_DRV_ALG_BITMAP_TB].cap_val;
if ((alg & cap_val) == alg)
return true;
@@ -530,6 +529,7 @@ static int hisi_zip_set_user_domain_and_cache(struct hisi_qm *qm)
{
void __iomem *base = qm->io_base;
u32 dcomp_bm, comp_bm;
+ u32 zip_core_en;
/* qm user domain */
writel(AXUSER_BASE, base + QM_ARUSER_M_CFG_1);
@@ -567,8 +567,12 @@ static int hisi_zip_set_user_domain_and_cache(struct hisi_qm *qm)
}
/* let's open all compression/decompression cores */
- dcomp_bm = qm->cap_tables.dev_cap_table[ZIP_DECOMP_ENABLE_BITMAP_IDX].cap_val;
- comp_bm = qm->cap_tables.dev_cap_table[ZIP_COMP_ENABLE_BITMAP_IDX].cap_val;
+
+ zip_core_en = qm->cap_tables.dev_cap_table[ZIP_CORE_EN].cap_val;
+ dcomp_bm = (zip_core_en >> zip_basic_cap_info[ZIP_DECOMP_ENABLE_BITMAP].shift) &
+ zip_basic_cap_info[ZIP_DECOMP_ENABLE_BITMAP].mask;
+ comp_bm = (zip_core_en >> zip_basic_cap_info[ZIP_COMP_ENABLE_BITMAP].shift) &
+ zip_basic_cap_info[ZIP_COMP_ENABLE_BITMAP].mask;
writel(HZIP_DECOMP_CHECK_ENABLE | dcomp_bm | comp_bm, base + HZIP_CLOCK_GATE_CTRL);
/* enable sqc,cqc writeback */
@@ -788,7 +792,12 @@ DEFINE_SHOW_ATTRIBUTE(hisi_zip_regs);
static void __iomem *get_zip_core_addr(struct hisi_qm *qm, int core_num)
{
- u32 zip_comp_core_num = qm->cap_tables.dev_cap_table[ZIP_CLUSTER_COMP_NUM_CAP_IDX].cap_val;
+ u8 zip_comp_core_num;
+ u32 zip_core_info;
+
+ zip_core_info = qm->cap_tables.dev_cap_table[ZIP_CORE_INFO].cap_val;
+ zip_comp_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].shift) &
+ zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].mask;
if (core_num < zip_comp_core_num)
return qm->io_base + HZIP_CORE_DFX_BASE +
@@ -803,12 +812,16 @@ static int hisi_zip_core_debug_init(struct hisi_qm *qm)
u32 zip_core_num, zip_comp_core_num;
struct device *dev = &qm->pdev->dev;
struct debugfs_regset32 *regset;
+ u32 zip_core_info;
struct dentry *tmp_d;
char buf[HZIP_BUF_SIZE];
int i;
- zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val;
- zip_comp_core_num = qm->cap_tables.dev_cap_table[ZIP_CLUSTER_COMP_NUM_CAP_IDX].cap_val;
+ zip_core_info = qm->cap_tables.dev_cap_table[ZIP_CORE_INFO].cap_val;
+ zip_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CORE_NUM_CAP].shift) &
+ zip_basic_cap_info[ZIP_CORE_NUM_CAP].mask;
+ zip_comp_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].shift) &
+ zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].mask;
for (i = 0; i < zip_core_num; i++) {
if (i < zip_comp_core_num)
@@ -834,6 +847,26 @@ static int hisi_zip_core_debug_init(struct hisi_qm *qm)
return 0;
}
+static int zip_cap_regs_show(struct seq_file *s, void *unused)
+{
+ struct hisi_qm *qm = s->private;
+ u32 i, size;
+
+ size = qm->cap_tables.qm_cap_size;
+ for (i = 0; i < size; i++)
+ seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.qm_cap_table[i].name,
+ qm->cap_tables.qm_cap_table[i].cap_val);
+
+ size = qm->cap_tables.dev_cap_size;
+ for (i = 0; i < size; i++)
+ seq_printf(s, "%s= 0x%08x\n", qm->cap_tables.dev_cap_table[i].name,
+ qm->cap_tables.dev_cap_table[i].cap_val);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(zip_cap_regs);
+
static void hisi_zip_dfx_debug_init(struct hisi_qm *qm)
{
struct dfx_diff_registers *hzip_regs = qm->debug.acc_diff_regs;
@@ -854,6 +887,9 @@ static void hisi_zip_dfx_debug_init(struct hisi_qm *qm)
if (qm->fun_type == QM_HW_PF && hzip_regs)
debugfs_create_file("diff_regs", 0444, tmp_dir,
qm, &hzip_diff_regs_fops);
+
+ debugfs_create_file("cap_regs", CAP_FILE_PERMISSION,
+ qm->debug.debug_root, qm, &zip_cap_regs_fops);
}
static int hisi_zip_ctrl_debug_init(struct hisi_qm *qm)
@@ -912,9 +948,14 @@ static int hisi_zip_debugfs_init(struct hisi_qm *qm)
/* hisi_zip_debug_regs_clear() - clear the zip debug regs */
static void hisi_zip_debug_regs_clear(struct hisi_qm *qm)
{
- u32 zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val;
+ u32 zip_core_info;
+ u8 zip_core_num;
int i, j;
+ zip_core_info = qm->cap_tables.dev_cap_table[ZIP_CORE_INFO].cap_val;
+ zip_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CORE_NUM_CAP].shift) &
+ zip_basic_cap_info[ZIP_CORE_NUM_CAP].mask;
+
/* enable register read_clear bit */
writel(HZIP_RD_CNT_CLR_CE_EN, qm->io_base + HZIP_SOFT_CTRL_CNT_CLR_CE);
for (i = 0; i < zip_core_num; i++)
@@ -946,10 +987,13 @@ static int hisi_zip_show_last_regs_init(struct hisi_qm *qm)
int com_dfx_regs_num = ARRAY_SIZE(hzip_com_dfx_regs);
struct qm_debug *debug = &qm->debug;
void __iomem *io_base;
+ u32 zip_core_info;
u32 zip_core_num;
int i, j, idx;
- zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val;
+ zip_core_info = qm->cap_tables.dev_cap_table[ZIP_CORE_INFO].cap_val;
+ zip_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CORE_NUM_CAP].shift) &
+ zip_basic_cap_info[ZIP_CORE_NUM_CAP].mask;
debug->last_words = kcalloc(core_dfx_regs_num * zip_core_num + com_dfx_regs_num,
sizeof(unsigned int), GFP_KERNEL);
@@ -991,6 +1035,7 @@ static void hisi_zip_show_last_dfx_regs(struct hisi_qm *qm)
u32 zip_core_num, zip_comp_core_num;
struct qm_debug *debug = &qm->debug;
char buf[HZIP_BUF_SIZE];
+ u32 zip_core_info;
void __iomem *base;
int i, j, idx;
u32 val;
@@ -1005,8 +1050,11 @@ static void hisi_zip_show_last_dfx_regs(struct hisi_qm *qm)
hzip_com_dfx_regs[i].name, debug->last_words[i], val);
}
- zip_core_num = qm->cap_tables.dev_cap_table[ZIP_CORE_NUM_CAP_IDX].cap_val;
- zip_comp_core_num = qm->cap_tables.dev_cap_table[ZIP_CLUSTER_COMP_NUM_CAP_IDX].cap_val;
+ zip_core_info = qm->cap_tables.dev_cap_table[ZIP_CORE_INFO].cap_val;
+ zip_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CORE_NUM_CAP].shift) &
+ zip_basic_cap_info[ZIP_CORE_NUM_CAP].mask;
+ zip_comp_core_num = (zip_core_info >> zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].shift) &
+ zip_basic_cap_info[ZIP_CLUSTER_COMP_NUM_CAP].mask;
for (i = 0; i < zip_core_num; i++) {
if (i < zip_comp_core_num)
@@ -1059,11 +1107,15 @@ static u32 hisi_zip_get_hw_err_status(struct hisi_qm *qm)
static void hisi_zip_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts)
{
- u32 nfe;
-
writel(err_sts, qm->io_base + HZIP_CORE_INT_SOURCE);
- nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver);
- writel(nfe, qm->io_base + HZIP_CORE_INT_RAS_NFE_ENB);
+}
+
+static void hisi_zip_disable_error_report(struct hisi_qm *qm, u32 err_type)
+{
+ u32 nfe_mask;
+
+ nfe_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver);
+ writel(nfe_mask & (~err_type), qm->io_base + HZIP_CORE_INT_RAS_NFE_ENB);
}
static void hisi_zip_open_axi_master_ooo(struct hisi_qm *qm)
@@ -1093,6 +1145,27 @@ static void hisi_zip_close_axi_master_ooo(struct hisi_qm *qm)
qm->io_base + HZIP_CORE_INT_SET);
}
+static enum acc_err_result hisi_zip_get_err_result(struct hisi_qm *qm)
+{
+ u32 err_status;
+
+ err_status = hisi_zip_get_hw_err_status(qm);
+ if (err_status) {
+ if (err_status & qm->err_info.ecc_2bits_mask)
+ qm->err_status.is_dev_ecc_mbit = true;
+ hisi_zip_log_hw_error(qm, err_status);
+
+ if (err_status & qm->err_info.dev_reset_mask) {
+ /* Disable the same error reporting until device is recovered. */
+ hisi_zip_disable_error_report(qm, err_status);
+ return ACC_ERR_NEED_RESET;
+ }
+ hisi_zip_clear_hw_err_status(qm, err_status);
+ }
+
+ return ACC_ERR_RECOVERED;
+}
+
static void hisi_zip_err_info_init(struct hisi_qm *qm)
{
struct hisi_qm_err_info *err_info = &qm->err_info;
@@ -1120,13 +1193,13 @@ static const struct hisi_qm_err_ini hisi_zip_err_ini = {
.hw_err_disable = hisi_zip_hw_error_disable,
.get_dev_hw_err_status = hisi_zip_get_hw_err_status,
.clear_dev_hw_err_status = hisi_zip_clear_hw_err_status,
- .log_dev_hw_err = hisi_zip_log_hw_error,
.open_axi_master_ooo = hisi_zip_open_axi_master_ooo,
.close_axi_master_ooo = hisi_zip_close_axi_master_ooo,
.open_sva_prefetch = hisi_zip_open_sva_prefetch,
.close_sva_prefetch = hisi_zip_close_sva_prefetch,
.show_last_dfx_regs = hisi_zip_show_last_dfx_regs,
.err_info_init = hisi_zip_err_info_init,
+ .get_err_result = hisi_zip_get_err_result,
};
static int hisi_zip_pf_probe_init(struct hisi_zip *hisi_zip)
@@ -1167,18 +1240,20 @@ static int zip_pre_store_cap_reg(struct hisi_qm *qm)
struct pci_dev *pdev = qm->pdev;
size_t i, size;
- size = ARRAY_SIZE(zip_pre_store_caps);
+ size = ARRAY_SIZE(zip_cap_query_info);
zip_cap = devm_kzalloc(&pdev->dev, sizeof(*zip_cap) * size, GFP_KERNEL);
if (!zip_cap)
return -ENOMEM;
for (i = 0; i < size; i++) {
- zip_cap[i].type = zip_pre_store_caps[i];
- zip_cap[i].cap_val = hisi_qm_get_hw_info(qm, zip_basic_cap_info,
- zip_pre_store_caps[i], qm->cap_ver);
+ zip_cap[i].type = zip_cap_query_info[i].type;
+ zip_cap[i].name = zip_cap_query_info[i].name;
+ zip_cap[i].cap_val = hisi_qm_get_cap_value(qm, zip_cap_query_info,
+ i, qm->cap_ver);
}
qm->cap_tables.dev_cap_table = zip_cap;
+ qm->cap_tables.dev_cap_size = size;
return 0;
}
@@ -1230,7 +1305,7 @@ static int hisi_zip_qm_init(struct hisi_qm *qm, struct pci_dev *pdev)
return ret;
}
- alg_msk = qm->cap_tables.dev_cap_table[ZIP_DEV_ALG_BITMAP_IDX].cap_val;
+ alg_msk = qm->cap_tables.dev_cap_table[ZIP_ALG_BITMAP].cap_val;
ret = hisi_qm_set_algs(qm, alg_msk, zip_dev_algs, ARRAY_SIZE(zip_dev_algs));
if (ret) {
pci_err(qm->pdev, "Failed to set zip algs!\n");
diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c
index 7e93159..1dc2378 100644
--- a/drivers/crypto/img-hash.c
+++ b/drivers/crypto/img-hash.c
@@ -1084,7 +1084,7 @@ static const struct dev_pm_ops img_hash_pm_ops = {
static struct platform_driver img_hash_driver = {
.probe = img_hash_probe,
- .remove_new = img_hash_remove,
+ .remove = img_hash_remove,
.driver = {
.name = "img-hash-accelerator",
.pm = &img_hash_pm_ops,
diff --git a/drivers/crypto/inside-secure/safexcel.c b/drivers/crypto/inside-secure/safexcel.c
index f5c1912..45758c7 100644
--- a/drivers/crypto/inside-secure/safexcel.c
+++ b/drivers/crypto/inside-secure/safexcel.c
@@ -1868,7 +1868,7 @@ MODULE_DEVICE_TABLE(of, safexcel_of_match_table);
static struct platform_driver crypto_safexcel = {
.probe = safexcel_probe,
- .remove_new = safexcel_remove,
+ .remove = safexcel_remove,
.driver = {
.name = "crypto-safexcel",
.of_match_table = safexcel_of_match_table,
diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c
index e17577b..f44c08f 100644
--- a/drivers/crypto/inside-secure/safexcel_hash.c
+++ b/drivers/crypto/inside-secure/safexcel_hash.c
@@ -2093,7 +2093,7 @@ static int safexcel_xcbcmac_cra_init(struct crypto_tfm *tfm)
safexcel_ahash_cra_init(tfm);
ctx->aes = kmalloc(sizeof(*ctx->aes), GFP_KERNEL);
- return PTR_ERR_OR_ZERO(ctx->aes);
+ return ctx->aes == NULL ? -ENOMEM : 0;
}
static void safexcel_xcbcmac_cra_exit(struct crypto_tfm *tfm)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 237f870..8fced88 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -945,12 +945,22 @@ static inline int check_completion(struct device *dev,
bool only_once)
{
char *op_str = compress ? "compress" : "decompress";
+ int status_checks = 0;
int ret = 0;
while (!comp->status) {
if (only_once)
return -EAGAIN;
cpu_relax();
+ if (status_checks++ >= IAA_COMPLETION_TIMEOUT) {
+ /* Something is wrong with the hw, disable it. */
+ dev_err(dev, "%s completion timed out - "
+ "assuming broken hw, iaa_crypto now DISABLED\n",
+ op_str);
+ iaa_crypto_enabled = false;
+ ret = -ETIMEDOUT;
+ goto out;
+ }
}
if (comp->status != IAX_COMP_SUCCESS) {
diff --git a/drivers/crypto/intel/ixp4xx/ixp4xx_crypto.c b/drivers/crypto/intel/ixp4xx/ixp4xx_crypto.c
index f8a77bf..449c6d3a 100644
--- a/drivers/crypto/intel/ixp4xx/ixp4xx_crypto.c
+++ b/drivers/crypto/intel/ixp4xx/ixp4xx_crypto.c
@@ -1588,7 +1588,7 @@ static const struct of_device_id ixp4xx_crypto_of_match[] = {
static struct platform_driver ixp_crypto_driver = {
.probe = ixp_crypto_probe,
- .remove_new = ixp_crypto_remove,
+ .remove = ixp_crypto_remove,
.driver = {
.name = "ixp4xx_crypto",
.of_match_table = ixp4xx_crypto_of_match,
diff --git a/drivers/crypto/intel/keembay/keembay-ocs-aes-core.c b/drivers/crypto/intel/keembay/keembay-ocs-aes-core.c
index 9b2d098..8a8f6c8 100644
--- a/drivers/crypto/intel/keembay/keembay-ocs-aes-core.c
+++ b/drivers/crypto/intel/keembay/keembay-ocs-aes-core.c
@@ -1656,7 +1656,7 @@ static int kmb_ocs_aes_probe(struct platform_device *pdev)
/* The OCS driver is a platform device. */
static struct platform_driver kmb_ocs_aes_driver = {
.probe = kmb_ocs_aes_probe,
- .remove_new = kmb_ocs_aes_remove,
+ .remove = kmb_ocs_aes_remove,
.driver = {
.name = DRV_NAME,
.of_match_table = kmb_ocs_aes_of_match,
diff --git a/drivers/crypto/intel/keembay/keembay-ocs-ecc.c b/drivers/crypto/intel/keembay/keembay-ocs-ecc.c
index 5e24f2d..5930892 100644
--- a/drivers/crypto/intel/keembay/keembay-ocs-ecc.c
+++ b/drivers/crypto/intel/keembay/keembay-ocs-ecc.c
@@ -991,7 +991,7 @@ static const struct of_device_id kmb_ocs_ecc_of_match[] = {
/* The OCS driver is a platform device. */
static struct platform_driver kmb_ocs_ecc_driver = {
.probe = kmb_ocs_ecc_probe,
- .remove_new = kmb_ocs_ecc_remove,
+ .remove = kmb_ocs_ecc_remove,
.driver = {
.name = DRV_NAME,
.of_match_table = kmb_ocs_ecc_of_match,
diff --git a/drivers/crypto/intel/keembay/keembay-ocs-hcu-core.c b/drivers/crypto/intel/keembay/keembay-ocs-hcu-core.c
index e54c798..95dc897 100644
--- a/drivers/crypto/intel/keembay/keembay-ocs-hcu-core.c
+++ b/drivers/crypto/intel/keembay/keembay-ocs-hcu-core.c
@@ -1243,7 +1243,7 @@ static int kmb_ocs_hcu_probe(struct platform_device *pdev)
/* The OCS driver is a platform device. */
static struct platform_driver kmb_ocs_hcu_driver = {
.probe = kmb_ocs_hcu_probe,
- .remove_new = kmb_ocs_hcu_remove,
+ .remove = kmb_ocs_hcu_remove,
.driver = {
.name = DRV_NAME,
.of_match_table = kmb_ocs_hcu_of_match,
diff --git a/drivers/crypto/intel/qat/qat_420xx/adf_420xx_hw_data.c b/drivers/crypto/intel/qat/qat_420xx/adf_420xx_hw_data.c
index 78f0ea4..9faef33 100644
--- a/drivers/crypto/intel/qat/qat_420xx/adf_420xx_hw_data.c
+++ b/drivers/crypto/intel/qat/qat_420xx/adf_420xx_hw_data.c
@@ -375,7 +375,7 @@ static const char *uof_get_name(struct adf_accel_dev *accel_dev, u32 obj_num,
else
id = -EINVAL;
- if (id < 0 || id > num_objs)
+ if (id < 0 || id >= num_objs)
return NULL;
return fw_objs[id];
diff --git a/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c b/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c
index 9fd7ec5..bbd92c0 100644
--- a/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c
+++ b/drivers/crypto/intel/qat/qat_4xxx/adf_4xxx_hw_data.c
@@ -334,7 +334,7 @@ static const char *uof_get_name(struct adf_accel_dev *accel_dev, u32 obj_num,
else
id = -EINVAL;
- if (id < 0 || id > num_objs)
+ if (id < 0 || id >= num_objs)
return NULL;
return fw_objs[id];
diff --git a/drivers/crypto/intel/qat/qat_common/adf_aer.c b/drivers/crypto/intel/qat/qat_common/adf_aer.c
index ec7913a..4cb8bd8 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_aer.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_aer.c
@@ -281,8 +281,11 @@ int adf_init_aer(void)
return -EFAULT;
device_sriov_wq = alloc_workqueue("qat_device_sriov_wq", 0, 0);
- if (!device_sriov_wq)
+ if (!device_sriov_wq) {
+ destroy_workqueue(device_reset_wq);
+ device_reset_wq = NULL;
return -EFAULT;
+ }
return 0;
}
diff --git a/drivers/crypto/intel/qat/qat_common/adf_common_drv.h b/drivers/crypto/intel/qat/qat_common/adf_common_drv.h
index f7ecabd..eaa6388 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_common_drv.h
+++ b/drivers/crypto/intel/qat/qat_common/adf_common_drv.h
@@ -69,7 +69,6 @@ void adf_devmgr_rm_dev(struct adf_accel_dev *accel_dev,
struct adf_accel_dev *pf);
struct list_head *adf_devmgr_get_head(void);
struct adf_accel_dev *adf_devmgr_get_dev_by_id(u32 id);
-struct adf_accel_dev *adf_devmgr_get_first(void);
struct adf_accel_dev *adf_devmgr_pci_to_accel_dev(struct pci_dev *pci_dev);
int adf_devmgr_verify_id(u32 id);
void adf_devmgr_get_num_dev(u32 *num);
diff --git a/drivers/crypto/intel/qat/qat_common/adf_dbgfs.c b/drivers/crypto/intel/qat/qat_common/adf_dbgfs.c
index c42f5c2..4c11ad1 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_dbgfs.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_dbgfs.c
@@ -22,18 +22,13 @@
void adf_dbgfs_init(struct adf_accel_dev *accel_dev)
{
char name[ADF_DEVICE_NAME_LENGTH];
- void *ret;
/* Create dev top level debugfs entry */
snprintf(name, sizeof(name), "%s%s_%s", ADF_DEVICE_NAME_PREFIX,
accel_dev->hw_device->dev_class->name,
pci_name(accel_dev->accel_pci_dev.pci_dev));
- ret = debugfs_create_dir(name, NULL);
- if (IS_ERR_OR_NULL(ret))
- return;
-
- accel_dev->debugfs_dir = ret;
+ accel_dev->debugfs_dir = debugfs_create_dir(name, NULL);
adf_cfg_dev_dbgfs_add(accel_dev);
}
@@ -59,9 +54,6 @@ EXPORT_SYMBOL_GPL(adf_dbgfs_exit);
*/
void adf_dbgfs_add(struct adf_accel_dev *accel_dev)
{
- if (!accel_dev->debugfs_dir)
- return;
-
if (!accel_dev->is_vf) {
adf_fw_counters_dbgfs_add(accel_dev);
adf_heartbeat_dbgfs_add(accel_dev);
@@ -77,9 +69,6 @@ void adf_dbgfs_add(struct adf_accel_dev *accel_dev)
*/
void adf_dbgfs_rm(struct adf_accel_dev *accel_dev)
{
- if (!accel_dev->debugfs_dir)
- return;
-
if (!accel_dev->is_vf) {
adf_tl_dbgfs_rm(accel_dev);
adf_cnv_dbgfs_rm(accel_dev);
diff --git a/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c b/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c
index 96ddd1c..34b9f77 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_dev_mgr.c
@@ -276,16 +276,6 @@ void adf_devmgr_rm_dev(struct adf_accel_dev *accel_dev,
}
EXPORT_SYMBOL_GPL(adf_devmgr_rm_dev);
-struct adf_accel_dev *adf_devmgr_get_first(void)
-{
- struct adf_accel_dev *dev = NULL;
-
- if (!list_empty(&accel_table))
- dev = list_first_entry(&accel_table, struct adf_accel_dev,
- list);
- return dev;
-}
-
/**
* adf_devmgr_pci_to_accel_dev() - Get accel_dev associated with the pci_dev.
* @pci_dev: Pointer to PCI device.
diff --git a/drivers/crypto/intel/qat/qat_common/adf_gen4_pm_debugfs.c b/drivers/crypto/intel/qat/qat_common/adf_gen4_pm_debugfs.c
index ee0b507..2e4095c4 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_gen4_pm_debugfs.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_gen4_pm_debugfs.c
@@ -42,13 +42,13 @@ struct pm_status_row {
const char *key;
};
-static struct pm_status_row pm_fuse_rows[] = {
+static const struct pm_status_row pm_fuse_rows[] = {
PM_INFO_REGSET_ENTRY(fusectl0, ENABLE_PM),
PM_INFO_REGSET_ENTRY(fusectl0, ENABLE_PM_IDLE),
PM_INFO_REGSET_ENTRY(fusectl0, ENABLE_DEEP_PM_IDLE),
};
-static struct pm_status_row pm_info_rows[] = {
+static const struct pm_status_row pm_info_rows[] = {
PM_INFO_REGSET_ENTRY(pm.status, CPM_PM_STATE),
PM_INFO_REGSET_ENTRY(pm.status, PENDING_WP),
PM_INFO_REGSET_ENTRY(pm.status, CURRENT_WP),
@@ -59,7 +59,7 @@ static struct pm_status_row pm_info_rows[] = {
PM_INFO_REGSET_ENTRY(pm.main, THR_VALUE),
};
-static struct pm_status_row pm_ssm_rows[] = {
+static const struct pm_status_row pm_ssm_rows[] = {
PM_INFO_REGSET_ENTRY(ssm.pm_enable, SSM_PM_ENABLE),
PM_INFO_REGSET_ENTRY32(ssm.active_constraint, ACTIVE_CONSTRAINT),
PM_INFO_REGSET_ENTRY(ssm.pm_domain_status, DOMAIN_POWER_GATED),
@@ -83,7 +83,7 @@ static struct pm_status_row pm_ssm_rows[] = {
PM_INFO_REGSET_ENTRY(ssm.pm_managed_status, WCP_MANAGED_COUNT),
};
-static struct pm_status_row pm_log_rows[] = {
+static const struct pm_status_row pm_log_rows[] = {
PM_INFO_REGSET_ENTRY32(event_counters.host_msg, HOST_MSG_EVENT_COUNT),
PM_INFO_REGSET_ENTRY32(event_counters.sys_pm, SYS_PM_EVENT_COUNT),
PM_INFO_REGSET_ENTRY32(event_counters.local_ssm, SSM_EVENT_COUNT),
@@ -91,7 +91,7 @@ static struct pm_status_row pm_log_rows[] = {
PM_INFO_REGSET_ENTRY32(event_counters.unknown, UNKNOWN_EVENT_COUNT),
};
-static struct pm_status_row pm_event_rows[ICP_QAT_NUMBER_OF_PM_EVENTS] = {
+static const struct pm_status_row pm_event_rows[ICP_QAT_NUMBER_OF_PM_EVENTS] = {
PM_INFO_REGSET_ENTRY32(event_log[0], EVENT0),
PM_INFO_REGSET_ENTRY32(event_log[1], EVENT1),
PM_INFO_REGSET_ENTRY32(event_log[2], EVENT2),
@@ -102,14 +102,14 @@ static struct pm_status_row pm_event_rows[ICP_QAT_NUMBER_OF_PM_EVENTS] = {
PM_INFO_REGSET_ENTRY32(event_log[7], EVENT7),
};
-static struct pm_status_row pm_csrs_rows[] = {
+static const struct pm_status_row pm_csrs_rows[] = {
PM_INFO_REGSET_ENTRY32(pm.fw_init, CPM_PM_FW_INIT),
PM_INFO_REGSET_ENTRY32(pm.status, CPM_PM_STATUS),
PM_INFO_REGSET_ENTRY32(pm.main, CPM_PM_MASTER_FW),
PM_INFO_REGSET_ENTRY32(pm.pwrreq, CPM_PM_PWRREQ),
};
-static int pm_scnprint_table(char *buff, struct pm_status_row *table,
+static int pm_scnprint_table(char *buff, const struct pm_status_row *table,
u32 *pm_info_regs, size_t buff_size, int table_len,
bool lowercase)
{
@@ -131,7 +131,7 @@ static int pm_scnprint_table(char *buff, struct pm_status_row *table,
return wr;
}
-static int pm_scnprint_table_upper_keys(char *buff, struct pm_status_row *table,
+static int pm_scnprint_table_upper_keys(char *buff, const struct pm_status_row *table,
u32 *pm_info_regs, size_t buff_size,
int table_len)
{
@@ -139,7 +139,7 @@ static int pm_scnprint_table_upper_keys(char *buff, struct pm_status_row *table,
table_len, false);
}
-static int pm_scnprint_table_lower_keys(char *buff, struct pm_status_row *table,
+static int pm_scnprint_table_lower_keys(char *buff, const struct pm_status_row *table,
u32 *pm_info_regs, size_t buff_size,
int table_len)
{
diff --git a/drivers/crypto/intel/qat/qat_common/adf_hw_arbiter.c b/drivers/crypto/intel/qat/qat_common/adf_hw_arbiter.c
index 65bd26b..f93d9cc 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_hw_arbiter.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_hw_arbiter.c
@@ -90,10 +90,6 @@ void adf_exit_arb(struct adf_accel_dev *accel_dev)
hw_data->get_arb_info(&info);
- /* Reset arbiter configuration */
- for (i = 0; i < ADF_ARB_NUM; i++)
- WRITE_CSR_ARB_SARCONFIG(csr, arb_off, i, 0);
-
/* Unmap worker threads to service arbiters */
for (i = 0; i < hw_data->num_engines; i++)
WRITE_CSR_ARB_WT2SAM(csr, arb_off, wt_off, i, 0);
diff --git a/drivers/crypto/intel/qat/qat_common/qat_hal.c b/drivers/crypto/intel/qat/qat_common/qat_hal.c
index 317cafa..ef8a9cf 100644
--- a/drivers/crypto/intel/qat/qat_common/qat_hal.c
+++ b/drivers/crypto/intel/qat/qat_common/qat_hal.c
@@ -163,7 +163,7 @@ int qat_hal_set_ae_ctx_mode(struct icp_qat_fw_loader_handle *handle,
return -EINVAL;
}
- /* Sets the accelaration engine context mode to either four or eight */
+ /* Sets the acceleration engine context mode to either four or eight */
csr = qat_hal_rd_ae_csr(handle, ae, CTX_ENABLES);
csr = IGNORE_W1C_MASK & csr;
new_csr = (mode == 4) ?
diff --git a/drivers/crypto/marvell/Kconfig b/drivers/crypto/marvell/Kconfig
index 7821757..4c25a78 100644
--- a/drivers/crypto/marvell/Kconfig
+++ b/drivers/crypto/marvell/Kconfig
@@ -7,7 +7,7 @@
config CRYPTO_DEV_MARVELL_CESA
tristate "Marvell's Cryptographic Engine driver"
- depends on PLAT_ORION || ARCH_MVEBU
+ depends on PLAT_ORION || ARCH_MVEBU || COMPILE_TEST
select CRYPTO_LIB_AES
select CRYPTO_LIB_DES
select CRYPTO_SKCIPHER
diff --git a/drivers/crypto/marvell/cesa/cesa.c b/drivers/crypto/marvell/cesa/cesa.c
index 5fd31ba7..fa08f10 100644
--- a/drivers/crypto/marvell/cesa/cesa.c
+++ b/drivers/crypto/marvell/cesa/cesa.c
@@ -375,7 +375,6 @@ static int mv_cesa_get_sram(struct platform_device *pdev, int idx)
{
struct mv_cesa_dev *cesa = platform_get_drvdata(pdev);
struct mv_cesa_engine *engine = &cesa->engines[idx];
- const char *res_name = "sram";
struct resource *res;
engine->pool = of_gen_pool_get(cesa->dev->of_node,
@@ -391,19 +390,7 @@ static int mv_cesa_get_sram(struct platform_device *pdev, int idx)
return -ENOMEM;
}
- if (cesa->caps->nengines > 1) {
- if (!idx)
- res_name = "sram0";
- else
- res_name = "sram1";
- }
-
- res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
- res_name);
- if (!res || resource_size(res) < cesa->sram_size)
- return -EINVAL;
-
- engine->sram = devm_ioremap_resource(cesa->dev, res);
+ engine->sram = devm_platform_get_and_ioremap_resource(pdev, idx, &res);
if (IS_ERR(engine->sram))
return PTR_ERR(engine->sram);
@@ -510,25 +497,21 @@ static int mv_cesa_probe(struct platform_device *pdev)
* if the clock does not exist.
*/
snprintf(res_name, sizeof(res_name), "cesa%u", i);
- engine->clk = devm_clk_get(dev, res_name);
+ engine->clk = devm_clk_get_optional_enabled(dev, res_name);
if (IS_ERR(engine->clk)) {
- engine->clk = devm_clk_get(dev, NULL);
- if (IS_ERR(engine->clk))
- engine->clk = NULL;
+ engine->clk = devm_clk_get_optional_enabled(dev, NULL);
+ if (IS_ERR(engine->clk)) {
+ ret = PTR_ERR(engine->clk);
+ goto err_cleanup;
+ }
}
snprintf(res_name, sizeof(res_name), "cesaz%u", i);
- engine->zclk = devm_clk_get(dev, res_name);
- if (IS_ERR(engine->zclk))
- engine->zclk = NULL;
-
- ret = clk_prepare_enable(engine->clk);
- if (ret)
+ engine->zclk = devm_clk_get_optional_enabled(dev, res_name);
+ if (IS_ERR(engine->zclk)) {
+ ret = PTR_ERR(engine->zclk);
goto err_cleanup;
-
- ret = clk_prepare_enable(engine->zclk);
- if (ret)
- goto err_cleanup;
+ }
engine->regs = cesa->regs + CESA_ENGINE_OFF(i);
@@ -570,13 +553,8 @@ static int mv_cesa_probe(struct platform_device *pdev)
return 0;
err_cleanup:
- for (i = 0; i < caps->nengines; i++) {
- clk_disable_unprepare(cesa->engines[i].zclk);
- clk_disable_unprepare(cesa->engines[i].clk);
+ for (i = 0; i < caps->nengines; i++)
mv_cesa_put_sram(pdev, i);
- if (cesa->engines[i].irq > 0)
- irq_set_affinity_hint(cesa->engines[i].irq, NULL);
- }
return ret;
}
@@ -588,12 +566,8 @@ static void mv_cesa_remove(struct platform_device *pdev)
mv_cesa_remove_algs(cesa);
- for (i = 0; i < cesa->caps->nengines; i++) {
- clk_disable_unprepare(cesa->engines[i].zclk);
- clk_disable_unprepare(cesa->engines[i].clk);
+ for (i = 0; i < cesa->caps->nengines; i++)
mv_cesa_put_sram(pdev, i);
- irq_set_affinity_hint(cesa->engines[i].irq, NULL);
- }
}
static const struct platform_device_id mv_cesa_plat_id_table[] = {
@@ -604,7 +578,7 @@ MODULE_DEVICE_TABLE(platform, mv_cesa_plat_id_table);
static struct platform_driver marvell_cesa = {
.probe = mv_cesa_probe,
- .remove_new = mv_cesa_remove,
+ .remove = mv_cesa_remove,
.id_table = mv_cesa_plat_id_table,
.driver = {
.name = "marvell-cesa",
diff --git a/drivers/crypto/marvell/cesa/cipher.c b/drivers/crypto/marvell/cesa/cipher.c
index 0f37dfd..cf62db5 100644
--- a/drivers/crypto/marvell/cesa/cipher.c
+++ b/drivers/crypto/marvell/cesa/cipher.c
@@ -489,7 +489,7 @@ static int mv_cesa_des_op(struct skcipher_request *req,
static int mv_cesa_ecb_des_encrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl,
CESA_SA_DESC_CFG_CRYPTCM_ECB |
@@ -500,7 +500,7 @@ static int mv_cesa_ecb_des_encrypt(struct skcipher_request *req)
static int mv_cesa_ecb_des_decrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl,
CESA_SA_DESC_CFG_CRYPTCM_ECB |
@@ -543,7 +543,7 @@ static int mv_cesa_cbc_des_op(struct skcipher_request *req,
static int mv_cesa_cbc_des_encrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_DIR_ENC);
@@ -552,7 +552,7 @@ static int mv_cesa_cbc_des_encrypt(struct skcipher_request *req)
static int mv_cesa_cbc_des_decrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_DIR_DEC);
@@ -596,7 +596,7 @@ static int mv_cesa_des3_op(struct skcipher_request *req,
static int mv_cesa_ecb_des3_ede_encrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl,
CESA_SA_DESC_CFG_CRYPTCM_ECB |
@@ -608,7 +608,7 @@ static int mv_cesa_ecb_des3_ede_encrypt(struct skcipher_request *req)
static int mv_cesa_ecb_des3_ede_decrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl,
CESA_SA_DESC_CFG_CRYPTCM_ECB |
@@ -649,7 +649,7 @@ static int mv_cesa_cbc_des3_op(struct skcipher_request *req,
static int mv_cesa_cbc_des3_ede_encrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl,
CESA_SA_DESC_CFG_CRYPTCM_CBC |
@@ -661,7 +661,7 @@ static int mv_cesa_cbc_des3_ede_encrypt(struct skcipher_request *req)
static int mv_cesa_cbc_des3_ede_decrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl,
CESA_SA_DESC_CFG_CRYPTCM_CBC |
@@ -725,7 +725,7 @@ static int mv_cesa_aes_op(struct skcipher_request *req,
static int mv_cesa_ecb_aes_encrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl,
CESA_SA_DESC_CFG_CRYPTCM_ECB |
@@ -736,7 +736,7 @@ static int mv_cesa_ecb_aes_encrypt(struct skcipher_request *req)
static int mv_cesa_ecb_aes_decrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl,
CESA_SA_DESC_CFG_CRYPTCM_ECB |
@@ -778,7 +778,7 @@ static int mv_cesa_cbc_aes_op(struct skcipher_request *req,
static int mv_cesa_cbc_aes_encrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_DIR_ENC);
@@ -787,7 +787,7 @@ static int mv_cesa_cbc_aes_encrypt(struct skcipher_request *req)
static int mv_cesa_cbc_aes_decrypt(struct skcipher_request *req)
{
- struct mv_cesa_op_ctx tmpl;
+ struct mv_cesa_op_ctx tmpl = { };
mv_cesa_set_op_cfg(&tmpl, CESA_SA_DESC_CFG_DIR_DEC);
diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c
index c82775d..d94a26c 100644
--- a/drivers/crypto/mxs-dcp.c
+++ b/drivers/crypto/mxs-dcp.c
@@ -225,21 +225,22 @@ static int mxs_dcp_start_dma(struct dcp_async_ctx *actx)
static int mxs_dcp_run_aes(struct dcp_async_ctx *actx,
struct skcipher_request *req, int init)
{
- dma_addr_t key_phys = 0;
- dma_addr_t src_phys, dst_phys;
+ dma_addr_t key_phys, src_phys, dst_phys;
struct dcp *sdcp = global_sdcp;
struct dcp_dma_desc *desc = &sdcp->coh->desc[actx->chan];
struct dcp_aes_req_ctx *rctx = skcipher_request_ctx(req);
bool key_referenced = actx->key_referenced;
int ret;
- if (!key_referenced) {
+ if (key_referenced)
+ key_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_key + AES_KEYSIZE_128,
+ AES_KEYSIZE_128, DMA_TO_DEVICE);
+ else
key_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_key,
2 * AES_KEYSIZE_128, DMA_TO_DEVICE);
- ret = dma_mapping_error(sdcp->dev, key_phys);
- if (ret)
- return ret;
- }
+ ret = dma_mapping_error(sdcp->dev, key_phys);
+ if (ret)
+ return ret;
src_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_in_buf,
DCP_BUF_SZ, DMA_TO_DEVICE);
@@ -300,7 +301,10 @@ static int mxs_dcp_run_aes(struct dcp_async_ctx *actx,
err_dst:
dma_unmap_single(sdcp->dev, src_phys, DCP_BUF_SZ, DMA_TO_DEVICE);
err_src:
- if (!key_referenced)
+ if (key_referenced)
+ dma_unmap_single(sdcp->dev, key_phys, AES_KEYSIZE_128,
+ DMA_TO_DEVICE);
+ else
dma_unmap_single(sdcp->dev, key_phys, 2 * AES_KEYSIZE_128,
DMA_TO_DEVICE);
return ret;
@@ -1243,7 +1247,7 @@ MODULE_DEVICE_TABLE(of, mxs_dcp_dt_ids);
static struct platform_driver mxs_dcp_driver = {
.probe = mxs_dcp_probe,
- .remove_new = mxs_dcp_remove,
+ .remove = mxs_dcp_remove,
.driver = {
.name = "mxs-dcp",
.of_match_table = mxs_dcp_dt_ids,
diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c
index b11545c..14c302d 100644
--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
@@ -2119,7 +2119,7 @@ static struct platform_driver n2_crypto_driver = {
.of_match_table = n2_crypto_match,
},
.probe = n2_crypto_probe,
- .remove_new = n2_crypto_remove,
+ .remove = n2_crypto_remove,
};
static const struct of_device_id n2_mau_match[] = {
@@ -2146,7 +2146,7 @@ static struct platform_driver n2_mau_driver = {
.of_match_table = n2_mau_match,
},
.probe = n2_mau_probe,
- .remove_new = n2_mau_remove,
+ .remove = n2_mau_remove,
};
static struct platform_driver * const drivers[] = {
diff --git a/drivers/crypto/nx/nx-common-pseries.c b/drivers/crypto/nx/nx-common-pseries.c
index 35f2d0d..1660c5c 100644
--- a/drivers/crypto/nx/nx-common-pseries.c
+++ b/drivers/crypto/nx/nx-common-pseries.c
@@ -133,7 +133,7 @@ struct nx842_devdata {
};
static struct nx842_devdata __rcu *devdata;
-static DEFINE_SPINLOCK(devdata_mutex);
+static DEFINE_SPINLOCK(devdata_spinlock);
#define NX842_COUNTER_INC(_x) \
static inline void nx842_inc_##_x( \
@@ -750,15 +750,15 @@ static int nx842_OF_upd(struct property *new_prop)
if (!new_devdata)
return -ENOMEM;
- spin_lock_irqsave(&devdata_mutex, flags);
+ spin_lock_irqsave(&devdata_spinlock, flags);
old_devdata = rcu_dereference_check(devdata,
- lockdep_is_held(&devdata_mutex));
+ lockdep_is_held(&devdata_spinlock));
if (old_devdata)
of_node = old_devdata->dev->of_node;
if (!old_devdata || !of_node) {
pr_err("%s: device is not available\n", __func__);
- spin_unlock_irqrestore(&devdata_mutex, flags);
+ spin_unlock_irqrestore(&devdata_spinlock, flags);
kfree(new_devdata);
return -ENODEV;
}
@@ -810,7 +810,7 @@ static int nx842_OF_upd(struct property *new_prop)
old_devdata->max_sg_len);
rcu_assign_pointer(devdata, new_devdata);
- spin_unlock_irqrestore(&devdata_mutex, flags);
+ spin_unlock_irqrestore(&devdata_spinlock, flags);
synchronize_rcu();
dev_set_drvdata(new_devdata->dev, new_devdata);
kfree(old_devdata);
@@ -821,13 +821,13 @@ static int nx842_OF_upd(struct property *new_prop)
dev_info(old_devdata->dev, "%s: device disabled\n", __func__);
nx842_OF_set_defaults(new_devdata);
rcu_assign_pointer(devdata, new_devdata);
- spin_unlock_irqrestore(&devdata_mutex, flags);
+ spin_unlock_irqrestore(&devdata_spinlock, flags);
synchronize_rcu();
dev_set_drvdata(new_devdata->dev, new_devdata);
kfree(old_devdata);
} else {
dev_err(old_devdata->dev, "%s: could not update driver from hardware\n", __func__);
- spin_unlock_irqrestore(&devdata_mutex, flags);
+ spin_unlock_irqrestore(&devdata_spinlock, flags);
}
if (!ret)
@@ -1045,9 +1045,9 @@ static int nx842_probe(struct vio_dev *viodev,
return -ENOMEM;
}
- spin_lock_irqsave(&devdata_mutex, flags);
+ spin_lock_irqsave(&devdata_spinlock, flags);
old_devdata = rcu_dereference_check(devdata,
- lockdep_is_held(&devdata_mutex));
+ lockdep_is_held(&devdata_spinlock));
if (old_devdata && old_devdata->vdev != NULL) {
dev_err(&viodev->dev, "%s: Attempt to register more than one instance of the hardware\n", __func__);
@@ -1062,7 +1062,7 @@ static int nx842_probe(struct vio_dev *viodev,
nx842_OF_set_defaults(new_devdata);
rcu_assign_pointer(devdata, new_devdata);
- spin_unlock_irqrestore(&devdata_mutex, flags);
+ spin_unlock_irqrestore(&devdata_spinlock, flags);
synchronize_rcu();
kfree(old_devdata);
@@ -1101,7 +1101,7 @@ static int nx842_probe(struct vio_dev *viodev,
return 0;
error_unlock:
- spin_unlock_irqrestore(&devdata_mutex, flags);
+ spin_unlock_irqrestore(&devdata_spinlock, flags);
if (new_devdata)
kfree(new_devdata->counters);
kfree(new_devdata);
@@ -1122,12 +1122,13 @@ static void nx842_remove(struct vio_dev *viodev)
crypto_unregister_alg(&nx842_pseries_alg);
- spin_lock_irqsave(&devdata_mutex, flags);
- old_devdata = rcu_dereference_check(devdata,
- lockdep_is_held(&devdata_mutex));
of_reconfig_notifier_unregister(&nx842_of_nb);
+
+ spin_lock_irqsave(&devdata_spinlock, flags);
+ old_devdata = rcu_dereference_check(devdata,
+ lockdep_is_held(&devdata_spinlock));
RCU_INIT_POINTER(devdata, NULL);
- spin_unlock_irqrestore(&devdata_mutex, flags);
+ spin_unlock_irqrestore(&devdata_spinlock, flags);
synchronize_rcu();
dev_set_drvdata(&viodev->dev, NULL);
if (old_devdata)
@@ -1257,11 +1258,11 @@ static void __exit nx842_pseries_exit(void)
crypto_unregister_alg(&nx842_pseries_alg);
- spin_lock_irqsave(&devdata_mutex, flags);
+ spin_lock_irqsave(&devdata_spinlock, flags);
old_devdata = rcu_dereference_check(devdata,
- lockdep_is_held(&devdata_mutex));
+ lockdep_is_held(&devdata_spinlock));
RCU_INIT_POINTER(devdata, NULL);
- spin_unlock_irqrestore(&devdata_mutex, flags);
+ spin_unlock_irqrestore(&devdata_spinlock, flags);
synchronize_rcu();
if (old_devdata && old_devdata->dev)
dev_set_drvdata(old_devdata->dev, NULL);
diff --git a/drivers/crypto/omap-aes.c b/drivers/crypto/omap-aes.c
index bad1ada..e27b846 100644
--- a/drivers/crypto/omap-aes.c
+++ b/drivers/crypto/omap-aes.c
@@ -1305,7 +1305,7 @@ static SIMPLE_DEV_PM_OPS(omap_aes_pm_ops, omap_aes_suspend, omap_aes_resume);
static struct platform_driver omap_aes_driver = {
.probe = omap_aes_probe,
- .remove_new = omap_aes_remove,
+ .remove = omap_aes_remove,
.driver = {
.name = "omap-aes",
.pm = &omap_aes_pm_ops,
diff --git a/drivers/crypto/omap-des.c b/drivers/crypto/omap-des.c
index 209d3dc..498cbd5 100644
--- a/drivers/crypto/omap-des.c
+++ b/drivers/crypto/omap-des.c
@@ -1115,7 +1115,7 @@ static SIMPLE_DEV_PM_OPS(omap_des_pm_ops, omap_des_suspend, omap_des_resume);
static struct platform_driver omap_des_driver = {
.probe = omap_des_probe,
- .remove_new = omap_des_remove,
+ .remove = omap_des_remove,
.driver = {
.name = "omap-des",
.pm = &omap_des_pm_ops,
diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c
index 5bcd9ab..7021481 100644
--- a/drivers/crypto/omap-sham.c
+++ b/drivers/crypto/omap-sham.c
@@ -2216,7 +2216,7 @@ static void omap_sham_remove(struct platform_device *pdev)
static struct platform_driver omap_sham_driver = {
.probe = omap_sham_probe,
- .remove_new = omap_sham_remove,
+ .remove = omap_sham_remove,
.driver = {
.name = "omap-sham",
.of_match_table = omap_sham_of_match,
diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
index 28b5fd8..e228a31 100644
--- a/drivers/crypto/qce/core.c
+++ b/drivers/crypto/qce/core.c
@@ -299,7 +299,7 @@ MODULE_DEVICE_TABLE(of, qce_crypto_of_match);
static struct platform_driver qce_crypto_driver = {
.probe = qce_crypto_probe,
- .remove_new = qce_crypto_remove,
+ .remove = qce_crypto_remove,
.driver = {
.name = KBUILD_MODNAME,
.of_match_table = qce_crypto_of_match,
diff --git a/drivers/crypto/qcom-rng.c b/drivers/crypto/qcom-rng.c
index 09419e7..0685ba1 100644
--- a/drivers/crypto/qcom-rng.c
+++ b/drivers/crypto/qcom-rng.c
@@ -262,7 +262,7 @@ MODULE_DEVICE_TABLE(of, qcom_rng_of_match);
static struct platform_driver qcom_rng_driver = {
.probe = qcom_rng_probe,
- .remove_new = qcom_rng_remove,
+ .remove = qcom_rng_remove,
.driver = {
.name = KBUILD_MODNAME,
.of_match_table = of_match_ptr(qcom_rng_of_match),
diff --git a/drivers/crypto/rockchip/rk3288_crypto.c b/drivers/crypto/rockchip/rk3288_crypto.c
index f74b3c8..b77bdce 100644
--- a/drivers/crypto/rockchip/rk3288_crypto.c
+++ b/drivers/crypto/rockchip/rk3288_crypto.c
@@ -433,7 +433,7 @@ static void rk_crypto_remove(struct platform_device *pdev)
static struct platform_driver crypto_driver = {
.probe = rk_crypto_probe,
- .remove_new = rk_crypto_remove,
+ .remove = rk_crypto_remove,
.driver = {
.name = "rk3288-crypto",
.pm = &rk_crypto_pm_ops,
diff --git a/drivers/crypto/s5p-sss.c b/drivers/crypto/s5p-sss.c
index 8b6e3f5..57ab237 100644
--- a/drivers/crypto/s5p-sss.c
+++ b/drivers/crypto/s5p-sss.c
@@ -2335,7 +2335,7 @@ static void s5p_aes_remove(struct platform_device *pdev)
static struct platform_driver s5p_aes_crypto = {
.probe = s5p_aes_probe,
- .remove_new = s5p_aes_remove,
+ .remove = s5p_aes_remove,
.driver = {
.name = "s5p-secss",
.of_match_table = s5p_sss_dt_match,
diff --git a/drivers/crypto/sa2ul.c b/drivers/crypto/sa2ul.c
index 461eca4..091612b 100644
--- a/drivers/crypto/sa2ul.c
+++ b/drivers/crypto/sa2ul.c
@@ -574,7 +574,7 @@ static int sa_format_cmdl_gen(struct sa_cmdl_cfg *cfg, u8 *cmdl,
/* Clear the command label */
memzero_explicit(cmdl, (SA_MAX_CMDL_WORDS * sizeof(u32)));
- /* Iniialize the command update structure */
+ /* Initialize the command update structure */
memzero_explicit(upd_info, sizeof(*upd_info));
if (cfg->enc_eng_id && cfg->auth_eng_id) {
@@ -2489,7 +2489,7 @@ static void sa_ul_remove(struct platform_device *pdev)
static struct platform_driver sa_ul_driver = {
.probe = sa_ul_probe,
- .remove_new = sa_ul_remove,
+ .remove = sa_ul_remove,
.driver = {
.name = "saul-crypto",
.of_match_table = of_match,
diff --git a/drivers/crypto/sahara.c b/drivers/crypto/sahara.c
index 96d4af5..533080b0 100644
--- a/drivers/crypto/sahara.c
+++ b/drivers/crypto/sahara.c
@@ -1421,7 +1421,7 @@ static void sahara_remove(struct platform_device *pdev)
static struct platform_driver sahara_driver = {
.probe = sahara_probe,
- .remove_new = sahara_remove,
+ .remove = sahara_remove,
.driver = {
.name = SAHARA_NAME,
.of_match_table = sahara_dt_ids,
diff --git a/drivers/crypto/starfive/jh7110-cryp.c b/drivers/crypto/starfive/jh7110-cryp.c
index e4dfed7..42114e9 100644
--- a/drivers/crypto/starfive/jh7110-cryp.c
+++ b/drivers/crypto/starfive/jh7110-cryp.c
@@ -151,7 +151,7 @@ static int starfive_cryp_probe(struct platform_device *pdev)
ret = starfive_aes_register_algs();
if (ret)
- goto err_algs_aes;
+ goto err_engine_start;
ret = starfive_hash_register_algs();
if (ret)
@@ -167,8 +167,6 @@ static int starfive_cryp_probe(struct platform_device *pdev)
starfive_hash_unregister_algs();
err_algs_hash:
starfive_aes_unregister_algs();
-err_algs_aes:
- crypto_engine_stop(cryp->engine);
err_engine_start:
crypto_engine_exit(cryp->engine);
err_engine:
@@ -193,7 +191,6 @@ static void starfive_cryp_remove(struct platform_device *pdev)
starfive_hash_unregister_algs();
starfive_rsa_unregister_algs();
- crypto_engine_stop(cryp->engine);
crypto_engine_exit(cryp->engine);
starfive_dma_cleanup(cryp);
@@ -215,7 +212,7 @@ MODULE_DEVICE_TABLE(of, starfive_dt_ids);
static struct platform_driver starfive_cryp_driver = {
.probe = starfive_cryp_probe,
- .remove_new = starfive_cryp_remove,
+ .remove = starfive_cryp_remove,
.driver = {
.name = DRIVER_NAME,
.of_match_table = starfive_dt_ids,
diff --git a/drivers/crypto/starfive/jh7110-rsa.c b/drivers/crypto/starfive/jh7110-rsa.c
index a778c48..d109c74 100644
--- a/drivers/crypto/starfive/jh7110-rsa.c
+++ b/drivers/crypto/starfive/jh7110-rsa.c
@@ -565,8 +565,6 @@ static void starfive_rsa_exit_tfm(struct crypto_akcipher *tfm)
static struct akcipher_alg starfive_rsa = {
.encrypt = starfive_rsa_enc,
.decrypt = starfive_rsa_dec,
- .sign = starfive_rsa_dec,
- .verify = starfive_rsa_enc,
.set_pub_key = starfive_rsa_set_pub_key,
.set_priv_key = starfive_rsa_set_priv_key,
.max_size = starfive_rsa_max_size,
diff --git a/drivers/crypto/stm32/stm32-crc32.c b/drivers/crypto/stm32/stm32-crc32.c
index e0faddb..de4d040 100644
--- a/drivers/crypto/stm32/stm32-crc32.c
+++ b/drivers/crypto/stm32/stm32-crc32.c
@@ -465,7 +465,7 @@ MODULE_DEVICE_TABLE(of, stm32_dt_ids);
static struct platform_driver stm32_crc_driver = {
.probe = stm32_crc_probe,
- .remove_new = stm32_crc_remove,
+ .remove = stm32_crc_remove,
.driver = {
.name = DRIVER_NAME,
.pm = &stm32_crc_pm_ops,
diff --git a/drivers/crypto/stm32/stm32-cryp.c b/drivers/crypto/stm32/stm32-cryp.c
index 937f6dab..14c6339 100644
--- a/drivers/crypto/stm32/stm32-cryp.c
+++ b/drivers/crypto/stm32/stm32-cryp.c
@@ -2771,7 +2771,7 @@ static const struct dev_pm_ops stm32_cryp_pm_ops = {
static struct platform_driver stm32_cryp_driver = {
.probe = stm32_cryp_probe,
- .remove_new = stm32_cryp_remove,
+ .remove = stm32_cryp_remove,
.driver = {
.name = DRIVER_NAME,
.pm = &stm32_cryp_pm_ops,
diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c
index 3518273..768b27d 100644
--- a/drivers/crypto/stm32/stm32-hash.c
+++ b/drivers/crypto/stm32/stm32-hash.c
@@ -2532,7 +2532,7 @@ static const struct dev_pm_ops stm32_hash_pm_ops = {
static struct platform_driver stm32_hash_driver = {
.probe = stm32_hash_probe,
- .remove_new = stm32_hash_remove,
+ .remove = stm32_hash_remove,
.driver = {
.name = "stm32-hash",
.pm = &stm32_hash_pm_ops,
diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c
index 511ddcb..e8c0db6 100644
--- a/drivers/crypto/talitos.c
+++ b/drivers/crypto/talitos.c
@@ -3560,7 +3560,7 @@ static struct platform_driver talitos_driver = {
.of_match_table = talitos_match,
},
.probe = talitos_probe,
- .remove_new = talitos_remove,
+ .remove = talitos_remove,
};
module_platform_driver(talitos_driver);
diff --git a/drivers/crypto/tegra/tegra-se-aes.c b/drivers/crypto/tegra/tegra-se-aes.c
index ae7a0f8..9d13059 100644
--- a/drivers/crypto/tegra/tegra-se-aes.c
+++ b/drivers/crypto/tegra/tegra-se-aes.c
@@ -1180,8 +1180,6 @@ static int tegra_ccm_do_one_req(struct crypto_engine *engine, void *areq)
goto out;
} else {
rctx->cryptlen = req->cryptlen - ctx->authsize;
- if (ret)
- goto out;
/* CTR operation */
ret = tegra_ccm_do_ctr(ctx, rctx);
diff --git a/drivers/crypto/tegra/tegra-se-main.c b/drivers/crypto/tegra/tegra-se-main.c
index f94c033..918c0b1 100644
--- a/drivers/crypto/tegra/tegra-se-main.c
+++ b/drivers/crypto/tegra/tegra-se-main.c
@@ -312,7 +312,6 @@ static int tegra_se_probe(struct platform_device *pdev)
ret = tegra_se_host1x_register(se);
if (ret) {
- crypto_engine_stop(se->engine);
crypto_engine_exit(se->engine);
return dev_err_probe(dev, ret, "failed to init host1x params\n");
}
@@ -324,7 +323,6 @@ static void tegra_se_remove(struct platform_device *pdev)
{
struct tegra_se *se = platform_get_drvdata(pdev);
- crypto_engine_stop(se->engine);
crypto_engine_exit(se->engine);
host1x_client_unregister(&se->client);
}
@@ -387,7 +385,7 @@ static struct platform_driver tegra_se_driver = {
.of_match_table = tegra_se_of_match,
},
.probe = tegra_se_probe,
- .remove_new = tegra_se_remove,
+ .remove = tegra_se_remove,
};
static int tegra_se_host1x_probe(struct host1x_device *dev)
diff --git a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
index cb92b7f..48fee07 100644
--- a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
+++ b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
@@ -83,23 +83,16 @@ static void virtio_crypto_dataq_akcipher_callback(struct virtio_crypto_request *
case VIRTIO_CRYPTO_BADMSG:
error = -EBADMSG;
break;
-
- case VIRTIO_CRYPTO_KEY_REJECTED:
- error = -EKEYREJECTED;
- break;
-
default:
error = -EIO;
break;
}
akcipher_req = vc_akcipher_req->akcipher_req;
- if (vc_akcipher_req->opcode != VIRTIO_CRYPTO_AKCIPHER_VERIFY) {
- /* actuall length maybe less than dst buffer */
- akcipher_req->dst_len = len - sizeof(vc_req->status);
- sg_copy_from_buffer(akcipher_req->dst, sg_nents(akcipher_req->dst),
- vc_akcipher_req->dst_buf, akcipher_req->dst_len);
- }
+ /* actual length maybe less than dst buffer */
+ akcipher_req->dst_len = len - sizeof(vc_req->status);
+ sg_copy_from_buffer(akcipher_req->dst, sg_nents(akcipher_req->dst),
+ vc_akcipher_req->dst_buf, akcipher_req->dst_len);
virtio_crypto_akcipher_finalize_req(vc_akcipher_req, akcipher_req, error);
}
@@ -230,36 +223,27 @@ static int __virtio_crypto_akcipher_do_req(struct virtio_crypto_akcipher_request
int node = dev_to_node(&vcrypto->vdev->dev);
unsigned long flags;
int ret;
- bool verify = vc_akcipher_req->opcode == VIRTIO_CRYPTO_AKCIPHER_VERIFY;
- unsigned int src_len = verify ? req->src_len + req->dst_len : req->src_len;
/* out header */
sg_init_one(&outhdr_sg, req_data, sizeof(*req_data));
sgs[num_out++] = &outhdr_sg;
/* src data */
- src_buf = kcalloc_node(src_len, 1, GFP_KERNEL, node);
+ src_buf = kcalloc_node(req->src_len, 1, GFP_KERNEL, node);
if (!src_buf)
return -ENOMEM;
- if (verify) {
- /* for verify operation, both src and dst data work as OUT direction */
- sg_copy_to_buffer(req->src, sg_nents(req->src), src_buf, src_len);
- sg_init_one(&srcdata_sg, src_buf, src_len);
- sgs[num_out++] = &srcdata_sg;
- } else {
- sg_copy_to_buffer(req->src, sg_nents(req->src), src_buf, src_len);
- sg_init_one(&srcdata_sg, src_buf, src_len);
- sgs[num_out++] = &srcdata_sg;
+ sg_copy_to_buffer(req->src, sg_nents(req->src), src_buf, req->src_len);
+ sg_init_one(&srcdata_sg, src_buf, req->src_len);
+ sgs[num_out++] = &srcdata_sg;
- /* dst data */
- dst_buf = kcalloc_node(req->dst_len, 1, GFP_KERNEL, node);
- if (!dst_buf)
- goto free_src;
+ /* dst data */
+ dst_buf = kcalloc_node(req->dst_len, 1, GFP_KERNEL, node);
+ if (!dst_buf)
+ goto free_src;
- sg_init_one(&dstdata_sg, dst_buf, req->dst_len);
- sgs[num_out + num_in++] = &dstdata_sg;
- }
+ sg_init_one(&dstdata_sg, dst_buf, req->dst_len);
+ sgs[num_out + num_in++] = &dstdata_sg;
vc_akcipher_req->src_buf = src_buf;
vc_akcipher_req->dst_buf = dst_buf;
@@ -352,16 +336,6 @@ static int virtio_crypto_rsa_decrypt(struct akcipher_request *req)
return virtio_crypto_rsa_req(req, VIRTIO_CRYPTO_AKCIPHER_DECRYPT);
}
-static int virtio_crypto_rsa_sign(struct akcipher_request *req)
-{
- return virtio_crypto_rsa_req(req, VIRTIO_CRYPTO_AKCIPHER_SIGN);
-}
-
-static int virtio_crypto_rsa_verify(struct akcipher_request *req)
-{
- return virtio_crypto_rsa_req(req, VIRTIO_CRYPTO_AKCIPHER_VERIFY);
-}
-
static int virtio_crypto_rsa_set_key(struct crypto_akcipher *tfm,
const void *key,
unsigned int keylen,
@@ -524,16 +498,19 @@ static struct virtio_crypto_akcipher_algo virtio_crypto_akcipher_algs[] = {
.algo.base = {
.encrypt = virtio_crypto_rsa_encrypt,
.decrypt = virtio_crypto_rsa_decrypt,
- .sign = virtio_crypto_rsa_sign,
- .verify = virtio_crypto_rsa_verify,
+ /*
+ * Must specify an arbitrary hash algorithm upon
+ * set_{pub,priv}_key (even though it's not used
+ * by encrypt/decrypt) because qemu checks for it.
+ */
.set_pub_key = virtio_crypto_p1pad_rsa_sha1_set_pub_key,
.set_priv_key = virtio_crypto_p1pad_rsa_sha1_set_priv_key,
.max_size = virtio_crypto_rsa_max_size,
.init = virtio_crypto_rsa_init_tfm,
.exit = virtio_crypto_rsa_exit_tfm,
.base = {
- .cra_name = "pkcs1pad(rsa,sha1)",
- .cra_driver_name = "virtio-pkcs1-rsa-with-sha1",
+ .cra_name = "pkcs1pad(rsa)",
+ .cra_driver_name = "virtio-pkcs1-rsa",
.cra_priority = 150,
.cra_module = THIS_MODULE,
.cra_ctxsize = sizeof(struct virtio_crypto_akcipher_ctx),
diff --git a/drivers/crypto/xilinx/zynqmp-aes-gcm.c b/drivers/crypto/xilinx/zynqmp-aes-gcm.c
index 7f0ec68..6e72d92 100644
--- a/drivers/crypto/xilinx/zynqmp-aes-gcm.c
+++ b/drivers/crypto/xilinx/zynqmp-aes-gcm.c
@@ -438,7 +438,7 @@ MODULE_DEVICE_TABLE(of, zynqmp_aes_dt_ids);
static struct platform_driver zynqmp_aes_driver = {
.probe = zynqmp_aes_aead_probe,
- .remove_new = zynqmp_aes_aead_remove,
+ .remove = zynqmp_aes_aead_remove,
.driver = {
.name = "zynqmp-aes",
.of_match_table = zynqmp_aes_dt_ids,
diff --git a/drivers/crypto/xilinx/zynqmp-sha.c b/drivers/crypto/xilinx/zynqmp-sha.c
index 1bcec6f..580649f 100644
--- a/drivers/crypto/xilinx/zynqmp-sha.c
+++ b/drivers/crypto/xilinx/zynqmp-sha.c
@@ -248,7 +248,7 @@ static void zynqmp_sha_remove(struct platform_device *pdev)
static struct platform_driver zynqmp_sha_driver = {
.probe = zynqmp_sha_probe,
- .remove_new = zynqmp_sha_remove,
+ .remove = zynqmp_sha_remove,
.driver = {
.name = "zynqmp-sha3-384",
},
diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index 2d77b5f..c7e5a34 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -373,7 +373,7 @@ static int iter_perf_levels_update_state(struct scmi_iterator_state *st,
return 0;
}
-static inline void
+static inline int
process_response_opp(struct device *dev, struct perf_dom_info *dom,
struct scmi_opp *opp, unsigned int loop_idx,
const struct scmi_msg_resp_perf_describe_levels *r)
@@ -386,12 +386,16 @@ process_response_opp(struct device *dev, struct perf_dom_info *dom,
le16_to_cpu(r->opp[loop_idx].transition_latency_us);
ret = xa_insert(&dom->opps_by_lvl, opp->perf, opp, GFP_KERNEL);
- if (ret)
- dev_warn(dev, "Failed to add opps_by_lvl at %d for %s - ret:%d\n",
+ if (ret) {
+ dev_info(dev, FW_BUG "Failed to add opps_by_lvl at %d for %s - ret:%d\n",
opp->perf, dom->info.name, ret);
+ return ret;
+ }
+
+ return 0;
}
-static inline void
+static inline int
process_response_opp_v4(struct device *dev, struct perf_dom_info *dom,
struct scmi_opp *opp, unsigned int loop_idx,
const struct scmi_msg_resp_perf_describe_levels_v4 *r)
@@ -404,9 +408,11 @@ process_response_opp_v4(struct device *dev, struct perf_dom_info *dom,
le16_to_cpu(r->opp[loop_idx].transition_latency_us);
ret = xa_insert(&dom->opps_by_lvl, opp->perf, opp, GFP_KERNEL);
- if (ret)
- dev_warn(dev, "Failed to add opps_by_lvl at %d for %s - ret:%d\n",
+ if (ret) {
+ dev_info(dev, FW_BUG "Failed to add opps_by_lvl at %d for %s - ret:%d\n",
opp->perf, dom->info.name, ret);
+ return ret;
+ }
/* Note that PERF v4 reports always five 32-bit words */
opp->indicative_freq = le32_to_cpu(r->opp[loop_idx].indicative_freq);
@@ -415,13 +421,21 @@ process_response_opp_v4(struct device *dev, struct perf_dom_info *dom,
ret = xa_insert(&dom->opps_by_idx, opp->level_index, opp,
GFP_KERNEL);
- if (ret)
+ if (ret) {
dev_warn(dev,
"Failed to add opps_by_idx at %d for %s - ret:%d\n",
opp->level_index, dom->info.name, ret);
+ /* Cleanup by_lvl too */
+ xa_erase(&dom->opps_by_lvl, opp->perf);
+
+ return ret;
+ }
+
hash_add(dom->opps_by_freq, &opp->hash, opp->indicative_freq);
}
+
+ return 0;
}
static int
@@ -429,16 +443,22 @@ iter_perf_levels_process_response(const struct scmi_protocol_handle *ph,
const void *response,
struct scmi_iterator_state *st, void *priv)
{
+ int ret;
struct scmi_opp *opp;
struct scmi_perf_ipriv *p = priv;
- opp = &p->perf_dom->opp[st->desc_index + st->loop_idx];
+ opp = &p->perf_dom->opp[p->perf_dom->opp_count];
if (PROTOCOL_REV_MAJOR(p->version) <= 0x3)
- process_response_opp(ph->dev, p->perf_dom, opp, st->loop_idx,
- response);
+ ret = process_response_opp(ph->dev, p->perf_dom, opp,
+ st->loop_idx, response);
else
- process_response_opp_v4(ph->dev, p->perf_dom, opp, st->loop_idx,
- response);
+ ret = process_response_opp_v4(ph->dev, p->perf_dom, opp,
+ st->loop_idx, response);
+
+ /* Skip BAD duplicates received from firmware */
+ if (ret)
+ return ret == -EBUSY ? 0 : ret;
+
p->perf_dom->opp_count++;
dev_dbg(ph->dev, "Level %d Power %d Latency %dus Ifreq %d Index %d\n",
diff --git a/drivers/firmware/google/framebuffer-coreboot.c b/drivers/firmware/google/framebuffer-coreboot.c
index daadd71..c68c9f5 100644
--- a/drivers/firmware/google/framebuffer-coreboot.c
+++ b/drivers/firmware/google/framebuffer-coreboot.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/platform_data/simplefb.h>
#include <linux/platform_device.h>
+#include <linux/sysfb.h>
#include "coreboot_table.h"
@@ -36,6 +37,19 @@ static int framebuffer_probe(struct coreboot_device *dev)
.format = NULL,
};
+ /*
+ * On coreboot systems, the advertised LB_TAG_FRAMEBUFFER entry
+ * in the coreboot table should only be used if the payload did
+ * not pass a framebuffer information to the Linux kernel.
+ *
+ * If the global screen_info data has been filled, the Generic
+ * System Framebuffers (sysfb) will already register a platform
+ * device and pass that screen_info as platform_data to a driver
+ * that can scan-out using the system provided framebuffer.
+ */
+ if (sysfb_handles_screen_info())
+ return -ENODEV;
+
if (!fb->physical_address)
return -ENODEV;
diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c
index d304913..24e666d 100644
--- a/drivers/firmware/google/gsmi.c
+++ b/drivers/firmware/google/gsmi.c
@@ -918,7 +918,8 @@ static __init int gsmi_init(void)
gsmi_dev.pdev = platform_device_register_full(&gsmi_dev_info);
if (IS_ERR(gsmi_dev.pdev)) {
printk(KERN_ERR "gsmi: unable to register platform device\n");
- return PTR_ERR(gsmi_dev.pdev);
+ ret = PTR_ERR(gsmi_dev.pdev);
+ goto out_unregister;
}
/* SMI access needs to be serialized */
@@ -1056,10 +1057,11 @@ static __init int gsmi_init(void)
gsmi_buf_free(gsmi_dev.name_buf);
kmem_cache_destroy(gsmi_dev.mem_pool);
platform_device_unregister(gsmi_dev.pdev);
- pr_info("gsmi: failed to load: %d\n", ret);
+out_unregister:
#ifdef CONFIG_PM
platform_driver_unregister(&gsmi_driver_info);
#endif
+ pr_info("gsmi: failed to load: %d\n", ret);
return ret;
}
diff --git a/drivers/firmware/sysfb.c b/drivers/firmware/sysfb.c
index a3df782..7c5c03f 100644
--- a/drivers/firmware/sysfb.c
+++ b/drivers/firmware/sysfb.c
@@ -79,6 +79,25 @@ void sysfb_disable(struct device *dev)
}
EXPORT_SYMBOL_GPL(sysfb_disable);
+/**
+ * sysfb_handles_screen_info() - reports if sysfb handles the global screen_info
+ *
+ * Callers can use sysfb_handles_screen_info() to determine whether the Generic
+ * System Framebuffers (sysfb) can handle the global screen_info data structure
+ * or not. Drivers might need this information to know if they have to setup the
+ * system framebuffer, or if they have to delegate this action to sysfb instead.
+ *
+ * Returns:
+ * True if sysfb handles the global screen_info data structure.
+ */
+bool sysfb_handles_screen_info(void)
+{
+ const struct screen_info *si = &screen_info;
+
+ return !!screen_info_video_type(si);
+}
+EXPORT_SYMBOL_GPL(sysfb_handles_screen_info);
+
#if defined(CONFIG_PCI)
static bool sysfb_pci_dev_is_enabled(struct pci_dev *pdev)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 44819cd..971419e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -161,7 +161,8 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
* When GTT is just an alternative to VRAM make sure that we
* only use it as fallback and still try to fill up VRAM first.
*/
- if (domain & abo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)
+ if (domain & abo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM &&
+ !(adev->flags & AMD_IS_APU))
places[c].flags |= TTM_PL_FLAG_FALLBACK;
c++;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
index b0a8abc..341beec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c
@@ -35,21 +35,19 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev,
int fd,
int32_t priority)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
struct amdgpu_fpriv *fpriv;
struct amdgpu_ctx_mgr *mgr;
struct amdgpu_ctx *ctx;
uint32_t id;
int r;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EINVAL;
r = amdgpu_file_to_fpriv(fd_file(f), &fpriv);
- if (r) {
- fdput(f);
+ if (r)
return r;
- }
mgr = &fpriv->ctx_mgr;
mutex_lock(&mgr->lock);
@@ -57,7 +55,6 @@ static int amdgpu_sched_process_priority_override(struct amdgpu_device *adev,
amdgpu_ctx_priority_override(ctx, priority);
mutex_unlock(&mgr->lock);
- fdput(f);
return 0;
}
@@ -66,31 +63,25 @@ static int amdgpu_sched_context_priority_override(struct amdgpu_device *adev,
unsigned ctx_id,
int32_t priority)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
struct amdgpu_fpriv *fpriv;
struct amdgpu_ctx *ctx;
int r;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EINVAL;
r = amdgpu_file_to_fpriv(fd_file(f), &fpriv);
- if (r) {
- fdput(f);
+ if (r)
return r;
- }
ctx = amdgpu_ctx_get(fpriv, ctx_id);
- if (!ctx) {
- fdput(f);
+ if (!ctx)
return -EINVAL;
- }
amdgpu_ctx_priority_override(ctx, priority);
amdgpu_ctx_put(ctx);
- fdput(f);
-
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index c76ac0d..7a45f3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1124,8 +1124,10 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
uint64_t *flags)
{
struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
- bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM;
- bool coherent = bo->flags & (AMDGPU_GEM_CREATE_COHERENT | AMDGPU_GEM_CREATE_EXT_COHERENT);
+ bool is_vram = bo->tbo.resource &&
+ bo->tbo.resource->mem_type == TTM_PL_VRAM;
+ bool coherent = bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
+ AMDGPU_GEM_CREATE_EXT_COHERENT);
bool ext_coherent = bo->flags & AMDGPU_GEM_CREATE_EXT_COHERENT;
bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED;
struct amdgpu_vm *vm = mapping->bo_va->base.vm;
@@ -1133,6 +1135,8 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
bool snoop = false;
bool is_local;
+ dma_resv_assert_held(bo->tbo.base.resv);
+
switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
case IP_VERSION(9, 4, 1):
case IP_VERSION(9, 4, 2):
@@ -1251,9 +1255,8 @@ static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev,
*flags &= ~AMDGPU_PTE_VALID;
}
- if (bo && bo->tbo.resource)
- gmc_v9_0_get_coherence_flags(adev, mapping->bo_va->base.bo,
- mapping, flags);
+ if ((*flags & AMDGPU_PTE_VALID) && bo)
+ gmc_v9_0_get_coherence_flags(adev, bo, mapping, flags);
}
static void gmc_v9_0_override_vm_pte_flags(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index a37a680..b3175ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -550,7 +550,7 @@ static int mes_v12_0_set_hw_resources_1(struct amdgpu_mes *mes, int pipe)
mes_set_hw_res_1_pkt.header.type = MES_API_TYPE_SCHEDULER;
mes_set_hw_res_1_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC_1;
mes_set_hw_res_1_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS;
- mes_set_hw_res_1_pkt.mes_kiq_unmap_timeout = 100;
+ mes_set_hw_res_1_pkt.mes_kiq_unmap_timeout = 0xa;
return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
&mes_set_hw_res_1_pkt, sizeof(mes_set_hw_res_1_pkt),
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c
index fb37e35..1ac7303 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c
@@ -247,6 +247,12 @@ static void nbio_v7_7_init_registers(struct amdgpu_device *adev)
if (def != data)
WREG32_SOC15(NBIO, 0, regBIF0_PCIE_MST_CTRL_3, data);
+ switch (adev->ip_versions[NBIO_HWIP][0]) {
+ case IP_VERSION(7, 7, 0):
+ data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4) & ~BIT(23);
+ WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4, data);
+ break;
+ }
}
static void nbio_v7_7_update_medium_grain_clock_gating(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 4938e6b..73065a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -67,8 +67,8 @@ static const struct amd_ip_funcs nv_common_ip_funcs;
/* Navi */
static const struct amdgpu_video_codec_info nv_video_codecs_encode_array[] = {
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)},
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 2304, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 4096, 0)},
};
static const struct amdgpu_video_codecs nv_video_codecs_encode = {
@@ -94,8 +94,8 @@ static const struct amdgpu_video_codecs nv_video_codecs_decode = {
/* Sienna Cichlid */
static const struct amdgpu_video_codec_info sc_video_codecs_encode_array[] = {
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2160, 0)},
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 7680, 4352, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)},
};
static const struct amdgpu_video_codecs sc_video_codecs_encode = {
@@ -136,8 +136,8 @@ static const struct amdgpu_video_codecs sc_video_codecs_decode_vcn1 = {
/* SRIOV Sienna Cichlid, not const since data is controlled by host */
static struct amdgpu_video_codec_info sriov_sc_video_codecs_encode_array[] = {
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2160, 0)},
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 7680, 4352, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)},
};
static struct amdgpu_video_codec_info sriov_sc_video_codecs_decode_array_vcn0[] = {
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 8d16dac..307185c 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -90,8 +90,8 @@ static const struct amd_ip_funcs soc15_common_ip_funcs;
/* Vega, Raven, Arcturus */
static const struct amdgpu_video_codec_info vega_video_codecs_encode_array[] =
{
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)},
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 2304, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 4096, 0)},
};
static const struct amdgpu_video_codecs vega_video_codecs_encode =
diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c
index d30ad7d..bba3588 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -49,13 +49,13 @@ static const struct amd_ip_funcs soc21_common_ip_funcs;
/* SOC21 */
static const struct amdgpu_video_codec_info vcn_4_0_0_video_codecs_encode_array_vcn0[] = {
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)},
{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)},
{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1, 8192, 4352, 0)},
};
static const struct amdgpu_video_codec_info vcn_4_0_0_video_codecs_encode_array_vcn1[] = {
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)},
{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)},
};
@@ -96,14 +96,14 @@ static const struct amdgpu_video_codecs vcn_4_0_0_video_codecs_decode_vcn1 = {
/* SRIOV SOC21, not const since data is controlled by host */
static struct amdgpu_video_codec_info sriov_vcn_4_0_0_video_codecs_encode_array_vcn0[] = {
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)},
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 2304, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)},
{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1, 8192, 4352, 0)},
};
static struct amdgpu_video_codec_info sriov_vcn_4_0_0_video_codecs_encode_array_vcn1[] = {
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)},
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 4096, 2304, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)},
};
static struct amdgpu_video_codecs sriov_vcn_4_0_0_video_codecs_encode_vcn0 = {
diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c
index fd4c3d4..29a848f 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc24.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc24.c
@@ -48,7 +48,7 @@
static const struct amd_ip_funcs soc24_common_ip_funcs;
static const struct amdgpu_video_codec_info vcn_5_0_0_video_codecs_encode_array_vcn0[] = {
- {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 2304, 0)},
+ {codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 0)},
{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 0)},
{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1, 8192, 4352, 0)},
};
diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
index d39c670..792b2eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/vi.c
@@ -136,15 +136,15 @@ static const struct amdgpu_video_codec_info polaris_video_codecs_encode_array[]
{
.codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC,
.max_width = 4096,
- .max_height = 2304,
- .max_pixels_per_frame = 4096 * 2304,
+ .max_height = 4096,
+ .max_pixels_per_frame = 4096 * 4096,
.max_level = 0,
},
{
.codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC,
.max_width = 4096,
- .max_height = 2304,
- .max_pixels_per_frame = 4096 * 2304,
+ .max_height = 4096,
+ .max_pixels_per_frame = 4096 * 4096,
.max_level = 0,
},
};
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 07e9ce9..8d97f17 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -6762,7 +6762,7 @@ create_stream_for_sink(struct drm_connector *connector,
if (stream->out_transfer_func.tf == TRANSFER_FUNCTION_GAMMA22)
tf = TRANSFER_FUNC_GAMMA_22;
mod_build_vsc_infopacket(stream, &stream->vsc_infopacket, stream->output_color_space, tf);
- aconnector->psr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY;
+ aconnector->sr_skip_count = AMDGPU_DM_PSR_ENTRY_DELAY;
}
finish:
@@ -8875,6 +8875,56 @@ static void amdgpu_dm_update_cursor(struct drm_plane *plane,
}
}
+static void amdgpu_dm_enable_self_refresh(struct amdgpu_crtc *acrtc_attach,
+ const struct dm_crtc_state *acrtc_state,
+ const u64 current_ts)
+{
+ struct psr_settings *psr = &acrtc_state->stream->link->psr_settings;
+ struct replay_settings *pr = &acrtc_state->stream->link->replay_settings;
+ struct amdgpu_dm_connector *aconn =
+ (struct amdgpu_dm_connector *)acrtc_state->stream->dm_stream_context;
+
+ if (acrtc_state->update_type > UPDATE_TYPE_FAST) {
+ if (pr->config.replay_supported && !pr->replay_feature_enabled)
+ amdgpu_dm_link_setup_replay(acrtc_state->stream->link, aconn);
+ else if (psr->psr_version != DC_PSR_VERSION_UNSUPPORTED &&
+ !psr->psr_feature_enabled)
+ if (!aconn->disallow_edp_enter_psr)
+ amdgpu_dm_link_setup_psr(acrtc_state->stream);
+ }
+
+ /* Decrement skip count when SR is enabled and we're doing fast updates. */
+ if (acrtc_state->update_type == UPDATE_TYPE_FAST &&
+ (psr->psr_feature_enabled || pr->config.replay_supported)) {
+ if (aconn->sr_skip_count > 0)
+ aconn->sr_skip_count--;
+
+ /* Allow SR when skip count is 0. */
+ acrtc_attach->dm_irq_params.allow_sr_entry = !aconn->sr_skip_count;
+
+ /*
+ * If sink supports PSR SU/Panel Replay, there is no need to rely on
+ * a vblank event disable request to enable PSR/RP. PSR SU/RP
+ * can be enabled immediately once OS demonstrates an
+ * adequate number of fast atomic commits to notify KMD
+ * of update events. See `vblank_control_worker()`.
+ */
+ if (acrtc_attach->dm_irq_params.allow_sr_entry &&
+#ifdef CONFIG_DRM_AMD_SECURE_DISPLAY
+ !amdgpu_dm_crc_window_is_activated(acrtc_state->base.crtc) &&
+#endif
+ (current_ts - psr->psr_dirty_rects_change_timestamp_ns) > 500000000) {
+ if (pr->replay_feature_enabled && !pr->replay_allow_active)
+ amdgpu_dm_replay_enable(acrtc_state->stream, true);
+ if (psr->psr_version >= DC_PSR_VERSION_SU_1 &&
+ !psr->psr_allow_active && !aconn->disallow_edp_enter_psr)
+ amdgpu_dm_psr_enable(acrtc_state->stream);
+ }
+ } else {
+ acrtc_attach->dm_irq_params.allow_sr_entry = false;
+ }
+}
+
static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
struct drm_device *dev,
struct amdgpu_display_manager *dm,
@@ -9028,7 +9078,7 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
* during the PSR-SU was disabled.
*/
if (acrtc_state->stream->link->psr_settings.psr_version >= DC_PSR_VERSION_SU_1 &&
- acrtc_attach->dm_irq_params.allow_psr_entry &&
+ acrtc_attach->dm_irq_params.allow_sr_entry &&
#ifdef CONFIG_DRM_AMD_SECURE_DISPLAY
!amdgpu_dm_crc_window_is_activated(acrtc_state->base.crtc) &&
#endif
@@ -9203,9 +9253,12 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
bundle->stream_update.abm_level = &acrtc_state->abm_level;
mutex_lock(&dm->dc_lock);
- if ((acrtc_state->update_type > UPDATE_TYPE_FAST) &&
- acrtc_state->stream->link->psr_settings.psr_allow_active)
- amdgpu_dm_psr_disable(acrtc_state->stream);
+ if (acrtc_state->update_type > UPDATE_TYPE_FAST) {
+ if (acrtc_state->stream->link->replay_settings.replay_allow_active)
+ amdgpu_dm_replay_disable(acrtc_state->stream);
+ if (acrtc_state->stream->link->psr_settings.psr_allow_active)
+ amdgpu_dm_psr_disable(acrtc_state->stream);
+ }
mutex_unlock(&dm->dc_lock);
/*
@@ -9246,57 +9299,7 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state,
dm_update_pflip_irq_state(drm_to_adev(dev),
acrtc_attach);
- if (acrtc_state->update_type > UPDATE_TYPE_FAST) {
- if (acrtc_state->stream->link->replay_settings.config.replay_supported &&
- !acrtc_state->stream->link->replay_settings.replay_feature_enabled) {
- struct amdgpu_dm_connector *aconn =
- (struct amdgpu_dm_connector *)acrtc_state->stream->dm_stream_context;
- amdgpu_dm_link_setup_replay(acrtc_state->stream->link, aconn);
- } else if (acrtc_state->stream->link->psr_settings.psr_version != DC_PSR_VERSION_UNSUPPORTED &&
- !acrtc_state->stream->link->psr_settings.psr_feature_enabled) {
-
- struct amdgpu_dm_connector *aconn = (struct amdgpu_dm_connector *)
- acrtc_state->stream->dm_stream_context;
-
- if (!aconn->disallow_edp_enter_psr)
- amdgpu_dm_link_setup_psr(acrtc_state->stream);
- }
- }
-
- /* Decrement skip count when PSR is enabled and we're doing fast updates. */
- if (acrtc_state->update_type == UPDATE_TYPE_FAST &&
- acrtc_state->stream->link->psr_settings.psr_feature_enabled) {
- struct amdgpu_dm_connector *aconn =
- (struct amdgpu_dm_connector *)acrtc_state->stream->dm_stream_context;
-
- if (aconn->psr_skip_count > 0)
- aconn->psr_skip_count--;
-
- /* Allow PSR when skip count is 0. */
- acrtc_attach->dm_irq_params.allow_psr_entry = !aconn->psr_skip_count;
-
- /*
- * If sink supports PSR SU, there is no need to rely on
- * a vblank event disable request to enable PSR. PSR SU
- * can be enabled immediately once OS demonstrates an
- * adequate number of fast atomic commits to notify KMD
- * of update events. See `vblank_control_worker()`.
- */
- if (acrtc_state->stream->link->psr_settings.psr_version >= DC_PSR_VERSION_SU_1 &&
- acrtc_attach->dm_irq_params.allow_psr_entry &&
-#ifdef CONFIG_DRM_AMD_SECURE_DISPLAY
- !amdgpu_dm_crc_window_is_activated(acrtc_state->base.crtc) &&
-#endif
- !acrtc_state->stream->link->psr_settings.psr_allow_active &&
- !aconn->disallow_edp_enter_psr &&
- (timestamp_ns -
- acrtc_state->stream->link->psr_settings.psr_dirty_rects_change_timestamp_ns) >
- 500000000)
- amdgpu_dm_psr_enable(acrtc_state->stream);
- } else {
- acrtc_attach->dm_irq_params.allow_psr_entry = false;
- }
-
+ amdgpu_dm_enable_self_refresh(acrtc_attach, acrtc_state, timestamp_ns);
mutex_unlock(&dm->dc_lock);
}
@@ -12080,7 +12083,7 @@ static int parse_amd_vsdb(struct amdgpu_dm_connector *aconnector,
break;
}
- while (j < EDID_LENGTH) {
+ while (j < EDID_LENGTH - sizeof(struct amd_vsdb_block)) {
struct amd_vsdb_block *amd_vsdb = (struct amd_vsdb_block *)&edid_ext[j];
unsigned int ieeeId = (amd_vsdb->ieee_id[2] << 16) | (amd_vsdb->ieee_id[1] << 8) | (amd_vsdb->ieee_id[0]);
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 15d4690c..90dfffe 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -727,7 +727,7 @@ struct amdgpu_dm_connector {
/* Cached display modes */
struct drm_display_mode freesync_vid_base;
- int psr_skip_count;
+ int sr_skip_count;
bool disallow_edp_enter_psr;
/* Record progress status of mst*/
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
index a2cf2c0..288be19 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
@@ -266,11 +266,10 @@ static void amdgpu_dm_crtc_vblank_control_worker(struct work_struct *work)
* where the SU region is the full hactive*vactive region. See
* fill_dc_dirty_rects().
*/
- if (vblank_work->stream && vblank_work->stream->link) {
+ if (vblank_work->stream && vblank_work->stream->link && vblank_work->acrtc) {
amdgpu_dm_crtc_set_panel_sr_feature(
vblank_work, vblank_work->enable,
- vblank_work->acrtc->dm_irq_params.allow_psr_entry ||
- vblank_work->stream->link->replay_settings.replay_feature_enabled);
+ vblank_work->acrtc->dm_irq_params.allow_sr_entry);
}
if (dm->active_vblank_irq_count == 0) {
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq_params.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq_params.h
index 5c93032..6a7ecc1 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq_params.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq_params.h
@@ -33,7 +33,7 @@ struct dm_irq_params {
struct mod_vrr_params vrr_params;
struct dc_stream_state *stream;
int active_planes;
- bool allow_psr_entry;
+ bool allow_sr_entry;
struct mod_freesync_config freesync_config;
#ifdef CONFIG_DEBUG_FS
diff --git a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c
index be8fbb0..c9a6de1 100644
--- a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c
+++ b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c
@@ -3122,14 +3122,12 @@ static enum bp_result bios_parser_get_vram_info(
struct dc_vram_info *info)
{
struct bios_parser *bp = BP_FROM_DCB(dcb);
- static enum bp_result result = BP_RESULT_BADBIOSTABLE;
+ enum bp_result result = BP_RESULT_BADBIOSTABLE;
struct atom_common_table_header *header;
struct atom_data_revision revision;
// vram info moved to umc_info for DCN4x
- if (dcb->ctx->dce_version >= DCN_VERSION_4_01 &&
- dcb->ctx->dce_version < DCN_VERSION_MAX &&
- info && DATA_TABLES(umc_info)) {
+ if (info && DATA_TABLES(umc_info)) {
header = GET_IMAGE(struct atom_common_table_header,
DATA_TABLES(umc_info));
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
index 2597e3f..e006f81 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
@@ -265,6 +265,9 @@ struct dc_state *dc_state_create_copy(struct dc_state *src_state)
dc_state_copy_internal(new_state, src_state);
#ifdef CONFIG_DRM_AMD_DC_FP
+ new_state->bw_ctx.dml2 = NULL;
+ new_state->bw_ctx.dml2_dc_power_source = NULL;
+
if (src_state->bw_ctx.dml2 &&
!dml2_create_copy(&new_state->bw_ctx.dml2, src_state->bw_ctx.dml2)) {
dc_state_release(new_state);
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
index 1cf9015..dd99718 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
@@ -8,6 +8,7 @@
#include "dml2_pmo_dcn4_fams2.h"
static const double MIN_VACTIVE_MARGIN_PCT = 0.25; // We need more than non-zero margin because DET buffer granularity can alter vactive latency hiding
+static const double MIN_BLANK_STUTTER_FACTOR = 3.0;
static const struct dml2_pmo_pstate_strategy base_strategy_list_1_display[] = {
// VActive Preferred
@@ -2139,6 +2140,7 @@ bool pmo_dcn4_fams2_init_for_stutter(struct dml2_pmo_init_for_stutter_in_out *in
struct dml2_pmo_instance *pmo = in_out->instance;
bool stutter_period_meets_z8_eco = true;
bool z8_stutter_optimization_too_expensive = false;
+ bool stutter_optimization_too_expensive = false;
double line_time_us, vblank_nom_time_us;
unsigned int i;
@@ -2160,10 +2162,15 @@ bool pmo_dcn4_fams2_init_for_stutter(struct dml2_pmo_init_for_stutter_in_out *in
line_time_us = (double)in_out->base_display_config->display_config.stream_descriptors[i].timing.h_total / (in_out->base_display_config->display_config.stream_descriptors[i].timing.pixel_clock_khz * 1000) * 1000000;
vblank_nom_time_us = line_time_us * in_out->base_display_config->display_config.stream_descriptors[i].timing.vblank_nom;
- if (vblank_nom_time_us < pmo->soc_bb->power_management_parameters.z8_stutter_exit_latency_us) {
+ if (vblank_nom_time_us < pmo->soc_bb->power_management_parameters.z8_stutter_exit_latency_us * MIN_BLANK_STUTTER_FACTOR) {
z8_stutter_optimization_too_expensive = true;
break;
}
+
+ if (vblank_nom_time_us < pmo->soc_bb->power_management_parameters.stutter_enter_plus_exit_latency_us * MIN_BLANK_STUTTER_FACTOR) {
+ stutter_optimization_too_expensive = true;
+ break;
+ }
}
pmo->scratch.pmo_dcn4.num_stutter_candidates = 0;
@@ -2179,7 +2186,7 @@ bool pmo_dcn4_fams2_init_for_stutter(struct dml2_pmo_init_for_stutter_in_out *in
pmo->scratch.pmo_dcn4.z8_vblank_optimizable = false;
}
- if (pmo->soc_bb->power_management_parameters.stutter_enter_plus_exit_latency_us > 0) {
+ if (!stutter_optimization_too_expensive && pmo->soc_bb->power_management_parameters.stutter_enter_plus_exit_latency_us > 0) {
pmo->scratch.pmo_dcn4.optimal_vblank_reserved_time_for_stutter_us[pmo->scratch.pmo_dcn4.num_stutter_candidates] = (unsigned int)pmo->soc_bb->power_management_parameters.stutter_enter_plus_exit_latency_us;
pmo->scratch.pmo_dcn4.num_stutter_candidates++;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index ee1bcfa..80e60ea2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -1259,33 +1259,26 @@ static int smu_sw_init(void *handle)
smu->watermarks_bitmap = 0;
smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
- smu->user_dpm_profile.user_workload_mask = 0;
atomic_set(&smu->smu_power.power_gate.vcn_gated, 1);
atomic_set(&smu->smu_power.power_gate.jpeg_gated, 1);
atomic_set(&smu->smu_power.power_gate.vpe_gated, 1);
atomic_set(&smu->smu_power.power_gate.umsch_mm_gated, 1);
- smu->workload_priority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT] = 0;
- smu->workload_priority[PP_SMC_POWER_PROFILE_FULLSCREEN3D] = 1;
- smu->workload_priority[PP_SMC_POWER_PROFILE_POWERSAVING] = 2;
- smu->workload_priority[PP_SMC_POWER_PROFILE_VIDEO] = 3;
- smu->workload_priority[PP_SMC_POWER_PROFILE_VR] = 4;
- smu->workload_priority[PP_SMC_POWER_PROFILE_COMPUTE] = 5;
- smu->workload_priority[PP_SMC_POWER_PROFILE_CUSTOM] = 6;
+ smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT] = 0;
+ smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D] = 1;
+ smu->workload_prority[PP_SMC_POWER_PROFILE_POWERSAVING] = 2;
+ smu->workload_prority[PP_SMC_POWER_PROFILE_VIDEO] = 3;
+ smu->workload_prority[PP_SMC_POWER_PROFILE_VR] = 4;
+ smu->workload_prority[PP_SMC_POWER_PROFILE_COMPUTE] = 5;
+ smu->workload_prority[PP_SMC_POWER_PROFILE_CUSTOM] = 6;
if (smu->is_apu ||
- !smu_is_workload_profile_available(smu, PP_SMC_POWER_PROFILE_FULLSCREEN3D)) {
- smu->driver_workload_mask =
- 1 << smu->workload_priority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT];
- } else {
- smu->driver_workload_mask =
- 1 << smu->workload_priority[PP_SMC_POWER_PROFILE_FULLSCREEN3D];
- smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_FULLSCREEN3D;
- }
+ !smu_is_workload_profile_available(smu, PP_SMC_POWER_PROFILE_FULLSCREEN3D))
+ smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT];
+ else
+ smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D];
- smu->workload_mask = smu->driver_workload_mask |
- smu->user_dpm_profile.user_workload_mask;
smu->workload_setting[0] = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
smu->workload_setting[1] = PP_SMC_POWER_PROFILE_FULLSCREEN3D;
smu->workload_setting[2] = PP_SMC_POWER_PROFILE_POWERSAVING;
@@ -2355,20 +2348,17 @@ static int smu_switch_power_profile(void *handle,
return -EINVAL;
if (!en) {
- smu->driver_workload_mask &= ~(1 << smu->workload_priority[type]);
+ smu->workload_mask &= ~(1 << smu->workload_prority[type]);
index = fls(smu->workload_mask);
index = index > 0 && index <= WORKLOAD_POLICY_MAX ? index - 1 : 0;
workload[0] = smu->workload_setting[index];
} else {
- smu->driver_workload_mask |= (1 << smu->workload_priority[type]);
+ smu->workload_mask |= (1 << smu->workload_prority[type]);
index = fls(smu->workload_mask);
index = index <= WORKLOAD_POLICY_MAX ? index - 1 : 0;
workload[0] = smu->workload_setting[index];
}
- smu->workload_mask = smu->driver_workload_mask |
- smu->user_dpm_profile.user_workload_mask;
-
if (smu_dpm_ctx->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL &&
smu_dpm_ctx->dpm_level != AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM)
smu_bump_power_profile_mode(smu, workload, 0);
@@ -3059,23 +3049,12 @@ static int smu_set_power_profile_mode(void *handle,
uint32_t param_size)
{
struct smu_context *smu = handle;
- int ret;
if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled ||
!smu->ppt_funcs->set_power_profile_mode)
return -EOPNOTSUPP;
- if (smu->user_dpm_profile.user_workload_mask &
- (1 << smu->workload_priority[param[param_size]]))
- return 0;
-
- smu->user_dpm_profile.user_workload_mask =
- (1 << smu->workload_priority[param[param_size]]);
- smu->workload_mask = smu->user_dpm_profile.user_workload_mask |
- smu->driver_workload_mask;
- ret = smu_bump_power_profile_mode(smu, param, param_size);
-
- return ret;
+ return smu_bump_power_profile_mode(smu, param, param_size);
}
static int smu_get_fan_control_mode(void *handle, u32 *fan_mode)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index d60d9a1..b44a185 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -240,7 +240,6 @@ struct smu_user_dpm_profile {
/* user clock state information */
uint32_t clk_mask[SMU_CLK_COUNT];
uint32_t clk_dependency;
- uint32_t user_workload_mask;
};
#define SMU_TABLE_INIT(tables, table_id, s, a, d) \
@@ -558,8 +557,7 @@ struct smu_context {
bool disable_uclk_switch;
uint32_t workload_mask;
- uint32_t driver_workload_mask;
- uint32_t workload_priority[WORKLOAD_POLICY_MAX];
+ uint32_t workload_prority[WORKLOAD_POLICY_MAX];
uint32_t workload_setting[WORKLOAD_POLICY_MAX];
uint32_t power_profile_mode;
uint32_t default_power_profile_mode;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
index 31fe512..c0f6b59 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
@@ -1455,6 +1455,7 @@ static int arcturus_set_power_profile_mode(struct smu_context *smu,
return -EINVAL;
}
+
if ((profile_mode == PP_SMC_POWER_PROFILE_CUSTOM) &&
(smu->smc_fw_version >= 0x360d00)) {
if (size != 10)
@@ -1522,14 +1523,14 @@ static int arcturus_set_power_profile_mode(struct smu_context *smu,
ret = smu_cmn_send_smc_msg_with_param(smu,
SMU_MSG_SetWorkloadMask,
- smu->workload_mask,
+ 1 << workload_type,
NULL);
if (ret) {
dev_err(smu->adev->dev, "Fail to set workload type %d\n", workload_type);
return ret;
}
- smu_cmn_assign_power_profile(smu);
+ smu->power_profile_mode = profile_mode;
return 0;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
index 12223f5..16af1a3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
@@ -2081,13 +2081,10 @@ static int navi10_set_power_profile_mode(struct smu_context *smu, long *input, u
smu->power_profile_mode);
if (workload_type < 0)
return -EINVAL;
-
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask,
- smu->workload_mask, NULL);
+ 1 << workload_type, NULL);
if (ret)
dev_err(smu->adev->dev, "[%s] Failed to set work load mask!", __func__);
- else
- smu_cmn_assign_power_profile(smu);
return ret;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 3b7b2ec..9c3c482 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -1786,13 +1786,10 @@ static int sienna_cichlid_set_power_profile_mode(struct smu_context *smu, long *
smu->power_profile_mode);
if (workload_type < 0)
return -EINVAL;
-
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask,
- smu->workload_mask, NULL);
+ 1 << workload_type, NULL);
if (ret)
dev_err(smu->adev->dev, "[%s] Failed to set work load mask!", __func__);
- else
- smu_cmn_assign_power_profile(smu);
return ret;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c
index 952ee22..1fe020f 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c
@@ -1079,7 +1079,7 @@ static int vangogh_set_power_profile_mode(struct smu_context *smu, long *input,
}
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ActiveProcessNotify,
- smu->workload_mask,
+ 1 << workload_type,
NULL);
if (ret) {
dev_err_once(smu->adev->dev, "Fail to set workload type %d\n",
@@ -1087,7 +1087,7 @@ static int vangogh_set_power_profile_mode(struct smu_context *smu, long *input,
return ret;
}
- smu_cmn_assign_power_profile(smu);
+ smu->power_profile_mode = profile_mode;
return 0;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
index 62316a6..cc0504b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
@@ -890,14 +890,14 @@ static int renoir_set_power_profile_mode(struct smu_context *smu, long *input, u
}
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ActiveProcessNotify,
- smu->workload_mask,
+ 1 << workload_type,
NULL);
if (ret) {
dev_err_once(smu->adev->dev, "Fail to set workload type %d\n", workload_type);
return ret;
}
- smu_cmn_assign_power_profile(smu);
+ smu->power_profile_mode = profile_mode;
return 0;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 5dd7cec..d53e162 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2485,7 +2485,7 @@ static int smu_v13_0_0_set_power_profile_mode(struct smu_context *smu,
DpmActivityMonitorCoeffInt_t *activity_monitor =
&(activity_monitor_external.DpmActivityMonitorCoeffInt);
int workload_type, ret = 0;
- u32 workload_mask;
+ u32 workload_mask, selected_workload_mask;
smu->power_profile_mode = input[size];
@@ -2552,7 +2552,7 @@ static int smu_v13_0_0_set_power_profile_mode(struct smu_context *smu,
if (workload_type < 0)
return -EINVAL;
- workload_mask = 1 << workload_type;
+ selected_workload_mask = workload_mask = 1 << workload_type;
/* Add optimizations for SMU13.0.0/10. Reuse the power saving profile */
if ((amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 0) &&
@@ -2567,22 +2567,12 @@ static int smu_v13_0_0_set_power_profile_mode(struct smu_context *smu,
workload_mask |= 1 << workload_type;
}
- smu->workload_mask |= workload_mask;
ret = smu_cmn_send_smc_msg_with_param(smu,
SMU_MSG_SetWorkloadMask,
- smu->workload_mask,
+ workload_mask,
NULL);
- if (!ret) {
- smu_cmn_assign_power_profile(smu);
- if (smu->power_profile_mode == PP_SMC_POWER_PROFILE_POWERSAVING) {
- workload_type = smu_cmn_to_asic_specific_index(smu,
- CMN2ASIC_MAPPING_WORKLOAD,
- PP_SMC_POWER_PROFILE_FULLSCREEN3D);
- smu->power_profile_mode = smu->workload_mask & (1 << workload_type)
- ? PP_SMC_POWER_PROFILE_FULLSCREEN3D
- : PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
- }
- }
+ if (!ret)
+ smu->workload_mask = selected_workload_mask;
return ret;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
index 9d0b194..b891a5e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
@@ -2499,14 +2499,13 @@ static int smu_v13_0_7_set_power_profile_mode(struct smu_context *smu, long *inp
smu->power_profile_mode);
if (workload_type < 0)
return -EINVAL;
-
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask,
- smu->workload_mask, NULL);
+ 1 << workload_type, NULL);
if (ret)
dev_err(smu->adev->dev, "[%s] Failed to set work load mask!", __func__);
else
- smu_cmn_assign_power_profile(smu);
+ smu->workload_mask = (1 << workload_type);
return ret;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c
index 8798ebf..84f9b00 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c
@@ -1132,7 +1132,7 @@ static int smu_v14_0_common_get_dpm_level_count(struct smu_context *smu,
static int smu_v14_0_0_print_clk_levels(struct smu_context *smu,
enum smu_clk_type clk_type, char *buf)
{
- int i, size = 0, ret = 0;
+ int i, idx, ret = 0, size = 0;
uint32_t cur_value = 0, value = 0, count = 0;
uint32_t min, max;
@@ -1168,7 +1168,8 @@ static int smu_v14_0_0_print_clk_levels(struct smu_context *smu,
break;
for (i = 0; i < count; i++) {
- ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, i, &value);
+ idx = (clk_type == SMU_MCLK) ? (count - i - 1) : i;
+ ret = smu_v14_0_common_get_dpm_freq_by_index(smu, clk_type, idx, &value);
if (ret)
break;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
index 1aa13d3..1e16a28 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
@@ -1807,11 +1807,12 @@ static int smu_v14_0_2_set_power_profile_mode(struct smu_context *smu,
if (workload_type < 0)
return -EINVAL;
- ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask,
- smu->workload_mask, NULL);
-
+ ret = smu_cmn_send_smc_msg_with_param(smu,
+ SMU_MSG_SetWorkloadMask,
+ 1 << workload_type,
+ NULL);
if (!ret)
- smu_cmn_assign_power_profile(smu);
+ smu->workload_mask = 1 << workload_type;
return ret;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index bdfc5e6..91ad434 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -1138,14 +1138,6 @@ int smu_cmn_set_mp1_state(struct smu_context *smu,
return ret;
}
-void smu_cmn_assign_power_profile(struct smu_context *smu)
-{
- uint32_t index;
- index = fls(smu->workload_mask);
- index = index > 0 && index <= WORKLOAD_POLICY_MAX ? index - 1 : 0;
- smu->power_profile_mode = smu->workload_setting[index];
-}
-
bool smu_cmn_is_audio_func_enabled(struct amdgpu_device *adev)
{
struct pci_dev *p = NULL;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
index 8a801e3..1de685d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
@@ -130,8 +130,6 @@ void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t crev);
int smu_cmn_set_mp1_state(struct smu_context *smu,
enum pp_mp1_state mp1_state);
-void smu_cmn_assign_power_profile(struct smu_context *smu);
-
/*
* Helper function to make sysfs_emit_at() happy. Align buf to
* the current page boundary and record the offset.
diff --git a/drivers/gpu/drm/bridge/tc358768.c b/drivers/gpu/drm/bridge/tc358768.c
index 0e88132..bb1750a3 100644
--- a/drivers/gpu/drm/bridge/tc358768.c
+++ b/drivers/gpu/drm/bridge/tc358768.c
@@ -125,6 +125,9 @@
#define TC358768_DSI_CONFW_MODE_CLR (6 << 29)
#define TC358768_DSI_CONFW_ADDR_DSI_CONTROL (0x3 << 24)
+/* TC358768_DSICMD_TX (0x0600) register */
+#define TC358768_DSI_CMDTX_DC_START BIT(0)
+
static const char * const tc358768_supplies[] = {
"vddc", "vddmipi", "vddio"
};
@@ -229,6 +232,21 @@ static void tc358768_update_bits(struct tc358768_priv *priv, u32 reg, u32 mask,
tc358768_write(priv, reg, tmp);
}
+static void tc358768_dsicmd_tx(struct tc358768_priv *priv)
+{
+ u32 val;
+
+ /* start transfer */
+ tc358768_write(priv, TC358768_DSICMD_TX, TC358768_DSI_CMDTX_DC_START);
+ if (priv->error)
+ return;
+
+ /* wait transfer completion */
+ priv->error = regmap_read_poll_timeout(priv->regmap, TC358768_DSICMD_TX, val,
+ (val & TC358768_DSI_CMDTX_DC_START) == 0,
+ 100, 100000);
+}
+
static int tc358768_sw_reset(struct tc358768_priv *priv)
{
/* Assert Reset */
@@ -516,8 +534,7 @@ static ssize_t tc358768_dsi_host_transfer(struct mipi_dsi_host *host,
}
}
- /* start transfer */
- tc358768_write(priv, TC358768_DSICMD_TX, 1);
+ tc358768_dsicmd_tx(priv);
ret = tc358768_clear_error(priv);
if (ret)
diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
index 8e3d2d70..4f2ab8a 100644
--- a/drivers/gpu/drm/drm_syncobj.c
+++ b/drivers/gpu/drm/drm_syncobj.c
@@ -712,16 +712,14 @@ static int drm_syncobj_fd_to_handle(struct drm_file *file_private,
int fd, u32 *handle)
{
struct drm_syncobj *syncobj;
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int ret;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EINVAL;
- if (fd_file(f)->f_op != &drm_syncobj_file_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &drm_syncobj_file_fops)
return -EINVAL;
- }
/* take a reference to put in the idr */
syncobj = fd_file(f)->private_data;
@@ -739,7 +737,6 @@ static int drm_syncobj_fd_to_handle(struct drm_file *file_private,
} else
drm_syncobj_put(syncobj);
- fdput(f);
return ret;
}
diff --git a/drivers/gpu/drm/i915/display/intel_tv.c b/drivers/gpu/drm/i915/display/intel_tv.c
index 581844d..5fee4be 100644
--- a/drivers/gpu/drm/i915/display/intel_tv.c
+++ b/drivers/gpu/drm/i915/display/intel_tv.c
@@ -928,7 +928,7 @@ intel_enable_tv(struct intel_atomic_state *state,
const struct intel_crtc_state *pipe_config,
const struct drm_connector_state *conn_state)
{
- struct intel_display *display = to_intel_display(state);
+ struct intel_display *display = to_intel_display(encoder);
/* Prevents vblank waits from timing out in intel_tv_detect_type() */
intel_crtc_wait_for_next_vblank(to_intel_crtc(pipe_config->uapi.crtc));
@@ -942,7 +942,7 @@ intel_disable_tv(struct intel_atomic_state *state,
const struct intel_crtc_state *old_crtc_state,
const struct drm_connector_state *old_conn_state)
{
- struct intel_display *display = to_intel_display(state);
+ struct intel_display *display = to_intel_display(encoder);
intel_de_rmw(display, TV_CTL, TV_ENC_ENABLE, 0);
}
diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c
index 1fb6ff7..bb696b2 100644
--- a/drivers/gpu/drm/i915/gt/shmem_utils.c
+++ b/drivers/gpu/drm/i915/gt/shmem_utils.c
@@ -40,7 +40,7 @@ struct file *shmem_create_from_object(struct drm_i915_gem_object *obj)
if (i915_gem_object_is_shmem(obj)) {
file = obj->base.filp;
- atomic_long_inc(&file->f_count);
+ get_file(file);
return file;
}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c
index 551b0d7..5dc0ccd 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c
@@ -80,6 +80,7 @@ int intel_gsc_fw_get_binary_info(struct intel_uc_fw *gsc_fw, const void *data, s
const struct intel_gsc_cpd_header_v2 *cpd_header = NULL;
const struct intel_gsc_cpd_entry *cpd_entry = NULL;
const struct intel_gsc_manifest_header *manifest;
+ struct intel_uc_fw_ver min_ver = { 0 };
size_t min_size = sizeof(*layout);
int i;
@@ -212,33 +213,46 @@ int intel_gsc_fw_get_binary_info(struct intel_uc_fw *gsc_fw, const void *data, s
}
}
- if (IS_ARROWLAKE(gt->i915)) {
+ /*
+ * ARL SKUs require newer firmwares, but the blob is actually common
+ * across all MTL and ARL SKUs, so we need to do an explicit version check
+ * here rather than using a separate table entry. If a too old version
+ * is found, then just don't use GSC rather than aborting the driver load.
+ * Note that the major number in the GSC FW version is used to indicate
+ * the platform, so we expect it to always be 102 for MTL/ARL binaries.
+ */
+ if (IS_ARROWLAKE_S(gt->i915))
+ min_ver = (struct intel_uc_fw_ver){ 102, 0, 10, 1878 };
+ else if (IS_ARROWLAKE_H(gt->i915) || IS_ARROWLAKE_U(gt->i915))
+ min_ver = (struct intel_uc_fw_ver){ 102, 1, 15, 1926 };
+
+ if (IS_METEORLAKE(gt->i915) && gsc->release.major != 102) {
+ gt_info(gt, "Invalid GSC firmware for MTL/ARL, got %d.%d.%d.%d but need 102.x.x.x",
+ gsc->release.major, gsc->release.minor,
+ gsc->release.patch, gsc->release.build);
+ return -EINVAL;
+ }
+
+ if (min_ver.major) {
bool too_old = false;
- /*
- * ARL requires a newer firmware than MTL did (102.0.10.1878) but the
- * firmware is actually common. So, need to do an explicit version check
- * here rather than using a separate table entry. And if the older
- * MTL-only version is found, then just don't use GSC rather than aborting
- * the driver load.
- */
- if (gsc->release.major < 102) {
+ if (gsc->release.minor < min_ver.minor) {
too_old = true;
- } else if (gsc->release.major == 102) {
- if (gsc->release.minor == 0) {
- if (gsc->release.patch < 10) {
+ } else if (gsc->release.minor == min_ver.minor) {
+ if (gsc->release.patch < min_ver.patch) {
+ too_old = true;
+ } else if (gsc->release.patch == min_ver.patch) {
+ if (gsc->release.build < min_ver.build)
too_old = true;
- } else if (gsc->release.patch == 10) {
- if (gsc->release.build < 1878)
- too_old = true;
- }
}
}
if (too_old) {
- gt_info(gt, "GSC firmware too old for ARL, got %d.%d.%d.%d but need at least 102.0.10.1878",
+ gt_info(gt, "GSC firmware too old for ARL, got %d.%d.%d.%d but need at least %d.%d.%d.%d",
gsc->release.major, gsc->release.minor,
- gsc->release.patch, gsc->release.build);
+ gsc->release.patch, gsc->release.build,
+ min_ver.major, min_ver.minor,
+ min_ver.patch, min_ver.build);
return -EINVAL;
}
}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 39f6614..aa0b1bf 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -540,8 +540,12 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915,
#define IS_LUNARLAKE(i915) (0 && i915)
#define IS_BATTLEMAGE(i915) (0 && i915)
-#define IS_ARROWLAKE(i915) \
- IS_SUBPLATFORM(i915, INTEL_METEORLAKE, INTEL_SUBPLATFORM_ARL)
+#define IS_ARROWLAKE_H(i915) \
+ IS_SUBPLATFORM(i915, INTEL_METEORLAKE, INTEL_SUBPLATFORM_ARL_H)
+#define IS_ARROWLAKE_U(i915) \
+ IS_SUBPLATFORM(i915, INTEL_METEORLAKE, INTEL_SUBPLATFORM_ARL_U)
+#define IS_ARROWLAKE_S(i915) \
+ IS_SUBPLATFORM(i915, INTEL_METEORLAKE, INTEL_SUBPLATFORM_ARL_S)
#define IS_DG2_G10(i915) \
IS_SUBPLATFORM(i915, INTEL_DG2, INTEL_SUBPLATFORM_G10)
#define IS_DG2_G11(i915) \
diff --git a/drivers/gpu/drm/i915/intel_device_info.c b/drivers/gpu/drm/i915/intel_device_info.c
index 3c47c62..4679992 100644
--- a/drivers/gpu/drm/i915/intel_device_info.c
+++ b/drivers/gpu/drm/i915/intel_device_info.c
@@ -200,8 +200,16 @@ static const u16 subplatform_g12_ids[] = {
INTEL_DG2_G12_IDS(ID),
};
-static const u16 subplatform_arl_ids[] = {
- INTEL_ARL_IDS(ID),
+static const u16 subplatform_arl_h_ids[] = {
+ INTEL_ARL_H_IDS(ID),
+};
+
+static const u16 subplatform_arl_u_ids[] = {
+ INTEL_ARL_U_IDS(ID),
+};
+
+static const u16 subplatform_arl_s_ids[] = {
+ INTEL_ARL_S_IDS(ID),
};
static bool find_devid(u16 id, const u16 *p, unsigned int num)
@@ -261,9 +269,15 @@ static void intel_device_info_subplatform_init(struct drm_i915_private *i915)
} else if (find_devid(devid, subplatform_g12_ids,
ARRAY_SIZE(subplatform_g12_ids))) {
mask = BIT(INTEL_SUBPLATFORM_G12);
- } else if (find_devid(devid, subplatform_arl_ids,
- ARRAY_SIZE(subplatform_arl_ids))) {
- mask = BIT(INTEL_SUBPLATFORM_ARL);
+ } else if (find_devid(devid, subplatform_arl_h_ids,
+ ARRAY_SIZE(subplatform_arl_h_ids))) {
+ mask = BIT(INTEL_SUBPLATFORM_ARL_H);
+ } else if (find_devid(devid, subplatform_arl_u_ids,
+ ARRAY_SIZE(subplatform_arl_u_ids))) {
+ mask = BIT(INTEL_SUBPLATFORM_ARL_U);
+ } else if (find_devid(devid, subplatform_arl_s_ids,
+ ARRAY_SIZE(subplatform_arl_s_ids))) {
+ mask = BIT(INTEL_SUBPLATFORM_ARL_S);
}
GEM_BUG_ON(mask & ~INTEL_SUBPLATFORM_MASK);
diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h
index 643ff1b..a9fcaf3 100644
--- a/drivers/gpu/drm/i915/intel_device_info.h
+++ b/drivers/gpu/drm/i915/intel_device_info.h
@@ -128,7 +128,9 @@ enum intel_platform {
#define INTEL_SUBPLATFORM_RPLU 2
/* MTL */
-#define INTEL_SUBPLATFORM_ARL 0
+#define INTEL_SUBPLATFORM_ARL_H 0
+#define INTEL_SUBPLATFORM_ARL_U 1
+#define INTEL_SUBPLATFORM_ARL_S 2
enum intel_ppgtt_type {
INTEL_PPGTT_NONE = I915_GEM_PPGTT_NONE,
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem.c b/drivers/gpu/drm/i915/selftests/i915_gem.c
index 61da4ed..0727492 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem.c
@@ -4,7 +4,7 @@
* Copyright © 2018 Intel Corporation
*/
-#include <linux/random.h>
+#include <linux/prandom.h>
#include "gem/i915_gem_internal.h"
#include "gem/i915_gem_pm.h"
diff --git a/drivers/gpu/drm/i915/selftests/i915_random.h b/drivers/gpu/drm/i915/selftests/i915_random.h
index 05364ec..70330a2 100644
--- a/drivers/gpu/drm/i915/selftests/i915_random.h
+++ b/drivers/gpu/drm/i915/selftests/i915_random.h
@@ -26,7 +26,7 @@
#define __I915_SELFTESTS_RANDOM_H__
#include <linux/math64.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include "../i915_selftest.h"
diff --git a/drivers/gpu/drm/i915/selftests/scatterlist.c b/drivers/gpu/drm/i915/selftests/scatterlist.c
index 805c4bf..7e59591 100644
--- a/drivers/gpu/drm/i915/selftests/scatterlist.c
+++ b/drivers/gpu/drm/i915/selftests/scatterlist.c
@@ -22,7 +22,7 @@
*/
#include <linux/prime_numbers.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include "i915_selftest.h"
#include "i915_utils.h"
diff --git a/drivers/gpu/drm/lib/drm_random.h b/drivers/gpu/drm/lib/drm_random.h
index 5543bf0..9f82726 100644
--- a/drivers/gpu/drm/lib/drm_random.h
+++ b/drivers/gpu/drm/lib/drm_random.h
@@ -6,7 +6,7 @@
* be transposed to lib/ at the earliest convenience.
*/
-#include <linux/random.h>
+#include <linux/prandom.h>
#define DRM_RND_STATE_INITIALIZER(seed__) ({ \
struct rnd_state state__; \
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/r535.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/r535.c
index 027867c..99110ab 100644
--- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/r535.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/r535.c
@@ -992,7 +992,7 @@ r535_dp_train_target(struct nvkm_outp *outp, u8 target, bool mst, u8 link_nr, u8
ctrl->data = data;
ret = nvkm_gsp_rm_ctrl_push(&disp->rm.objcom, &ctrl, sizeof(*ctrl));
- if (ret == -EAGAIN && ctrl->retryTimeMs) {
+ if ((ret == -EAGAIN || ret == -EBUSY) && ctrl->retryTimeMs) {
/*
* Device (likely an eDP panel) isn't ready yet, wait for the time specified
* by GSP before retrying again
@@ -1060,33 +1060,44 @@ r535_dp_aux_xfer(struct nvkm_outp *outp, u8 type, u32 addr, u8 *data, u8 *psize)
NV0073_CTRL_DP_AUXCH_CTRL_PARAMS *ctrl;
u8 size = *psize;
int ret;
+ int retries;
- ctrl = nvkm_gsp_rm_ctrl_get(&disp->rm.objcom, NV0073_CTRL_CMD_DP_AUXCH_CTRL, sizeof(*ctrl));
- if (IS_ERR(ctrl))
- return PTR_ERR(ctrl);
+ for (retries = 0; retries < 3; ++retries) {
+ ctrl = nvkm_gsp_rm_ctrl_get(&disp->rm.objcom, NV0073_CTRL_CMD_DP_AUXCH_CTRL, sizeof(*ctrl));
+ if (IS_ERR(ctrl))
+ return PTR_ERR(ctrl);
- ctrl->subDeviceInstance = 0;
- ctrl->displayId = BIT(outp->index);
- ctrl->bAddrOnly = !size;
- ctrl->cmd = type;
- if (ctrl->bAddrOnly) {
- ctrl->cmd = NVDEF_SET(ctrl->cmd, NV0073_CTRL, DP_AUXCH_CMD, REQ_TYPE, WRITE);
- ctrl->cmd = NVDEF_SET(ctrl->cmd, NV0073_CTRL, DP_AUXCH_CMD, I2C_MOT, FALSE);
+ ctrl->subDeviceInstance = 0;
+ ctrl->displayId = BIT(outp->index);
+ ctrl->bAddrOnly = !size;
+ ctrl->cmd = type;
+ if (ctrl->bAddrOnly) {
+ ctrl->cmd = NVDEF_SET(ctrl->cmd, NV0073_CTRL, DP_AUXCH_CMD, REQ_TYPE, WRITE);
+ ctrl->cmd = NVDEF_SET(ctrl->cmd, NV0073_CTRL, DP_AUXCH_CMD, I2C_MOT, FALSE);
+ }
+ ctrl->addr = addr;
+ ctrl->size = !ctrl->bAddrOnly ? (size - 1) : 0;
+ memcpy(ctrl->data, data, size);
+
+ ret = nvkm_gsp_rm_ctrl_push(&disp->rm.objcom, &ctrl, sizeof(*ctrl));
+ if ((ret == -EAGAIN || ret == -EBUSY) && ctrl->retryTimeMs) {
+ /*
+ * Device (likely an eDP panel) isn't ready yet, wait for the time specified
+ * by GSP before retrying again
+ */
+ nvkm_debug(&disp->engine.subdev,
+ "Waiting %dms for GSP LT panel delay before retrying in AUX\n",
+ ctrl->retryTimeMs);
+ msleep(ctrl->retryTimeMs);
+ nvkm_gsp_rm_ctrl_done(&disp->rm.objcom, ctrl);
+ } else {
+ memcpy(data, ctrl->data, size);
+ *psize = ctrl->size;
+ ret = ctrl->replyType;
+ nvkm_gsp_rm_ctrl_done(&disp->rm.objcom, ctrl);
+ break;
+ }
}
- ctrl->addr = addr;
- ctrl->size = !ctrl->bAddrOnly ? (size - 1) : 0;
- memcpy(ctrl->data, data, size);
-
- ret = nvkm_gsp_rm_ctrl_push(&disp->rm.objcom, &ctrl, sizeof(*ctrl));
- if (ret) {
- nvkm_gsp_rm_ctrl_done(&disp->rm.objcom, ctrl);
- return ret;
- }
-
- memcpy(data, ctrl->data, size);
- *psize = ctrl->size;
- ret = ctrl->replyType;
- nvkm_gsp_rm_ctrl_done(&disp->rm.objcom, ctrl);
return ret;
}
diff --git a/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c b/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c
index a1c8545f..cac6d64 100644
--- a/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c
+++ b/drivers/gpu/drm/nouveau/nvkm/falcon/fw.c
@@ -89,11 +89,6 @@ nvkm_falcon_fw_boot(struct nvkm_falcon_fw *fw, struct nvkm_subdev *user,
nvkm_falcon_fw_dtor_sigs(fw);
}
- /* after last write to the img, sync dma mappings */
- dma_sync_single_for_device(fw->fw.device->dev,
- fw->fw.phys,
- sg_dma_len(&fw->fw.mem.sgl),
- DMA_TO_DEVICE);
FLCNFW_DBG(fw, "resetting");
fw->func->reset(fw);
@@ -105,6 +100,12 @@ nvkm_falcon_fw_boot(struct nvkm_falcon_fw *fw, struct nvkm_subdev *user,
goto done;
}
+ /* after last write to the img, sync dma mappings */
+ dma_sync_single_for_device(fw->fw.device->dev,
+ fw->fw.phys,
+ sg_dma_len(&fw->fw.mem.sgl),
+ DMA_TO_DEVICE);
+
ret = fw->func->load(fw);
if (ret)
goto done;
diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c
index cf58f9d..d586aea 100644
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c
@@ -78,7 +78,7 @@ r535_rpc_status_to_errno(uint32_t rpc_status)
switch (rpc_status) {
case 0x55: /* NV_ERR_NOT_READY */
case 0x66: /* NV_ERR_TIMEOUT_RETRY */
- return -EAGAIN;
+ return -EBUSY;
case 0x51: /* NV_ERR_NO_MEMORY */
return -ENOMEM;
default:
@@ -601,7 +601,7 @@ r535_gsp_rpc_rm_alloc_push(struct nvkm_gsp_object *object, void *argv, u32 repc)
if (rpc->status) {
ret = ERR_PTR(r535_rpc_status_to_errno(rpc->status));
- if (PTR_ERR(ret) != -EAGAIN)
+ if (PTR_ERR(ret) != -EAGAIN && PTR_ERR(ret) != -EBUSY)
nvkm_error(&gsp->subdev, "RM_ALLOC: 0x%x\n", rpc->status);
} else {
ret = repc ? rpc->params : NULL;
@@ -660,7 +660,7 @@ r535_gsp_rpc_rm_ctrl_push(struct nvkm_gsp_object *object, void **argv, u32 repc)
if (rpc->status) {
ret = r535_rpc_status_to_errno(rpc->status);
- if (ret != -EAGAIN)
+ if (ret != -EAGAIN && ret != -EBUSY)
nvkm_error(&gsp->subdev, "cli:0x%08x obj:0x%08x ctrl cmd:0x%08x failed: 0x%08x\n",
object->client->object.handle, object->handle, rpc->cmd, rpc->status);
}
diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c
index 7db2edb..0e6f94d 100644
--- a/drivers/gpu/drm/panthor/panthor_mmu.c
+++ b/drivers/gpu/drm/panthor/panthor_mmu.c
@@ -990,6 +990,8 @@ panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot,
if (!size)
break;
+
+ offset = 0;
}
return panthor_vm_flush_range(vm, start_iova, iova - start_iova);
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
index f161f40..6990013 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c
@@ -1093,10 +1093,10 @@ static int vop_plane_atomic_async_check(struct drm_plane *plane,
if (!plane->state->fb)
return -EINVAL;
- if (state)
- crtc_state = drm_atomic_get_existing_crtc_state(state,
- new_plane_state->crtc);
- else /* Special case for asynchronous cursor updates. */
+ crtc_state = drm_atomic_get_existing_crtc_state(state, new_plane_state->crtc);
+
+ /* Special case for asynchronous cursor updates. */
+ if (!crtc_state)
crtc_state = plane->crtc->state;
return drm_atomic_helper_check_plane_state(plane->state, crtc_state,
diff --git a/drivers/gpu/drm/vmwgfx/ttm_object.c b/drivers/gpu/drm/vmwgfx/ttm_object.c
index 3353e97..a17e628 100644
--- a/drivers/gpu/drm/vmwgfx/ttm_object.c
+++ b/drivers/gpu/drm/vmwgfx/ttm_object.c
@@ -471,7 +471,7 @@ void ttm_object_device_release(struct ttm_object_device **p_tdev)
*/
static bool __must_check get_dma_buf_unless_doomed(struct dma_buf *dmabuf)
{
- return atomic_long_inc_not_zero(&dmabuf->file->f_count) != 0L;
+ return file_ref_get(&dmabuf->file->f_ref);
}
/**
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
index 63b8d75..10d596c 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
@@ -1265,6 +1265,8 @@ static int vmw_framebuffer_surface_create_handle(struct drm_framebuffer *fb,
struct vmw_framebuffer_surface *vfbs = vmw_framebuffer_to_vfbs(fb);
struct vmw_bo *bo = vmw_user_object_buffer(&vfbs->uo);
+ if (WARN_ON(!bo))
+ return -EINVAL;
return drm_gem_handle_create(file_priv, &bo->tbo.base, handle);
}
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index e5f51fd..2a093540 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -886,8 +886,8 @@ int xe_bo_evict_pinned(struct xe_bo *bo)
if (WARN_ON(!xe_bo_is_pinned(bo)))
return -EINVAL;
- if (WARN_ON(!xe_bo_is_vram(bo)))
- return -EINVAL;
+ if (!xe_bo_is_vram(bo))
+ return 0;
ret = ttm_bo_mem_space(&bo->ttm, &placement, &new_mem, &ctx);
if (ret)
@@ -937,6 +937,7 @@ int xe_bo_restore_pinned(struct xe_bo *bo)
.interruptible = false,
};
struct ttm_resource *new_mem;
+ struct ttm_place *place = &bo->placements[0];
int ret;
xe_bo_assert_held(bo);
@@ -947,9 +948,15 @@ int xe_bo_restore_pinned(struct xe_bo *bo)
if (WARN_ON(!xe_bo_is_pinned(bo)))
return -EINVAL;
- if (WARN_ON(xe_bo_is_vram(bo) || !bo->ttm.ttm))
+ if (WARN_ON(xe_bo_is_vram(bo)))
return -EINVAL;
+ if (WARN_ON(!bo->ttm.ttm && !xe_bo_is_stolen(bo)))
+ return -EINVAL;
+
+ if (!mem_type_is_vram(place->mem_type))
+ return 0;
+
ret = ttm_bo_mem_space(&bo->ttm, &bo->placement, &new_mem, &ctx);
if (ret)
return ret;
@@ -1719,6 +1726,7 @@ int xe_bo_pin_external(struct xe_bo *bo)
int xe_bo_pin(struct xe_bo *bo)
{
+ struct ttm_place *place = &bo->placements[0];
struct xe_device *xe = xe_bo_device(bo);
int err;
@@ -1749,21 +1757,21 @@ int xe_bo_pin(struct xe_bo *bo)
*/
if (IS_DGFX(xe) && !(IS_ENABLED(CONFIG_DRM_XE_DEBUG) &&
bo->flags & XE_BO_FLAG_INTERNAL_TEST)) {
- struct ttm_place *place = &(bo->placements[0]);
-
if (mem_type_is_vram(place->mem_type)) {
xe_assert(xe, place->flags & TTM_PL_FLAG_CONTIGUOUS);
place->fpfn = (xe_bo_addr(bo, 0, PAGE_SIZE) -
vram_region_gpu_offset(bo->ttm.resource)) >> PAGE_SHIFT;
place->lpfn = place->fpfn + (bo->size >> PAGE_SHIFT);
-
- spin_lock(&xe->pinned.lock);
- list_add_tail(&bo->pinned_link, &xe->pinned.kernel_bo_present);
- spin_unlock(&xe->pinned.lock);
}
}
+ if (mem_type_is_vram(place->mem_type) || bo->flags & XE_BO_FLAG_GGTT) {
+ spin_lock(&xe->pinned.lock);
+ list_add_tail(&bo->pinned_link, &xe->pinned.kernel_bo_present);
+ spin_unlock(&xe->pinned.lock);
+ }
+
ttm_bo_pin(&bo->ttm);
/*
@@ -1809,23 +1817,18 @@ void xe_bo_unpin_external(struct xe_bo *bo)
void xe_bo_unpin(struct xe_bo *bo)
{
+ struct ttm_place *place = &bo->placements[0];
struct xe_device *xe = xe_bo_device(bo);
xe_assert(xe, !bo->ttm.base.import_attach);
xe_assert(xe, xe_bo_is_pinned(bo));
- if (IS_DGFX(xe) && !(IS_ENABLED(CONFIG_DRM_XE_DEBUG) &&
- bo->flags & XE_BO_FLAG_INTERNAL_TEST)) {
- struct ttm_place *place = &(bo->placements[0]);
-
- if (mem_type_is_vram(place->mem_type)) {
- spin_lock(&xe->pinned.lock);
- xe_assert(xe, !list_empty(&bo->pinned_link));
- list_del_init(&bo->pinned_link);
- spin_unlock(&xe->pinned.lock);
- }
+ if (mem_type_is_vram(place->mem_type) || bo->flags & XE_BO_FLAG_GGTT) {
+ spin_lock(&xe->pinned.lock);
+ xe_assert(xe, !list_empty(&bo->pinned_link));
+ list_del_init(&bo->pinned_link);
+ spin_unlock(&xe->pinned.lock);
}
-
ttm_bo_unpin(&bo->ttm);
}
diff --git a/drivers/gpu/drm/xe/xe_bo_evict.c b/drivers/gpu/drm/xe/xe_bo_evict.c
index 541b490..8fb2be0 100644
--- a/drivers/gpu/drm/xe/xe_bo_evict.c
+++ b/drivers/gpu/drm/xe/xe_bo_evict.c
@@ -34,14 +34,22 @@ int xe_bo_evict_all(struct xe_device *xe)
u8 id;
int ret;
- if (!IS_DGFX(xe))
- return 0;
-
/* User memory */
- for (mem_type = XE_PL_VRAM0; mem_type <= XE_PL_VRAM1; ++mem_type) {
+ for (mem_type = XE_PL_TT; mem_type <= XE_PL_VRAM1; ++mem_type) {
struct ttm_resource_manager *man =
ttm_manager_type(bdev, mem_type);
+ /*
+ * On igpu platforms with flat CCS we need to ensure we save and restore any CCS
+ * state since this state lives inside graphics stolen memory which doesn't survive
+ * hibernation.
+ *
+ * This can be further improved by only evicting objects that we know have actually
+ * used a compression enabled PAT index.
+ */
+ if (mem_type == XE_PL_TT && (IS_DGFX(xe) || !xe_device_has_flat_ccs(xe)))
+ continue;
+
if (man) {
ret = ttm_resource_manager_evict_all(bdev, man);
if (ret)
@@ -125,9 +133,6 @@ int xe_bo_restore_kernel(struct xe_device *xe)
struct xe_bo *bo;
int ret;
- if (!IS_DGFX(xe))
- return 0;
-
spin_lock(&xe->pinned.lock);
for (;;) {
bo = list_first_entry_or_null(&xe->pinned.evicted,
@@ -159,7 +164,6 @@ int xe_bo_restore_kernel(struct xe_device *xe)
* should setup the iosys map.
*/
xe_assert(xe, !iosys_map_is_null(&bo->vmap));
- xe_assert(xe, xe_bo_is_vram(bo));
xe_bo_put(bo);
diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 756b492..31cca93 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -203,14 +203,14 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
write_locked = false;
}
if (err)
- goto err_syncs;
+ goto err_hw_exec_mode;
if (write_locked) {
err = xe_vm_userptr_pin(vm);
downgrade_write(&vm->lock);
write_locked = false;
if (err)
- goto err_hw_exec_mode;
+ goto err_unlock_list;
}
if (!args->num_batch_buffer) {
diff --git a/drivers/gpu/drm/xe/xe_oa.c b/drivers/gpu/drm/xe/xe_oa.c
index 2804f14f..78823f5 100644
--- a/drivers/gpu/drm/xe/xe_oa.c
+++ b/drivers/gpu/drm/xe/xe_oa.c
@@ -1206,9 +1206,11 @@ static int xe_oa_release(struct inode *inode, struct file *file)
struct xe_oa_stream *stream = file->private_data;
struct xe_gt *gt = stream->gt;
+ xe_pm_runtime_get(gt_to_xe(gt));
mutex_lock(>->oa.gt_lock);
xe_oa_destroy_locked(stream);
mutex_unlock(>->oa.gt_lock);
+ xe_pm_runtime_put(gt_to_xe(gt));
/* Release the reference the OA stream kept on the driver */
drm_dev_put(>_to_xe(gt)->drm);
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 67aebfe..ac4d8fa 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1069,6 +1069,47 @@ static struct cpuidle_state gnr_cstates[] __initdata = {
.enter = NULL }
};
+static struct cpuidle_state gnrd_cstates[] __initdata = {
+ {
+ .name = "C1",
+ .desc = "MWAIT 0x00",
+ .flags = MWAIT2flg(0x00),
+ .exit_latency = 1,
+ .target_residency = 1,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .name = "C1E",
+ .desc = "MWAIT 0x01",
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
+ .exit_latency = 4,
+ .target_residency = 4,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .name = "C6",
+ .desc = "MWAIT 0x20",
+ .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED |
+ CPUIDLE_FLAG_INIT_XSTATE |
+ CPUIDLE_FLAG_PARTIAL_HINT_MATCH,
+ .exit_latency = 220,
+ .target_residency = 650,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .name = "C6P",
+ .desc = "MWAIT 0x21",
+ .flags = MWAIT2flg(0x21) | CPUIDLE_FLAG_TLB_FLUSHED |
+ CPUIDLE_FLAG_INIT_XSTATE |
+ CPUIDLE_FLAG_PARTIAL_HINT_MATCH,
+ .exit_latency = 240,
+ .target_residency = 750,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .enter = NULL }
+};
+
static struct cpuidle_state atom_cstates[] __initdata = {
{
.name = "C1E",
@@ -1508,6 +1549,12 @@ static const struct idle_cpu idle_cpu_gnr __initconst = {
.use_acpi = true,
};
+static const struct idle_cpu idle_cpu_gnrd __initconst = {
+ .state_table = gnrd_cstates,
+ .disable_promotion_to_c1e = true,
+ .use_acpi = true,
+};
+
static const struct idle_cpu idle_cpu_avn __initconst = {
.state_table = avn_cstates,
.disable_promotion_to_c1e = true,
@@ -1593,6 +1640,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &idle_cpu_spr),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &idle_cpu_spr),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &idle_cpu_gnr),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &idle_cpu_gnrd),
X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &idle_cpu_knl),
X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &idle_cpu_knl),
X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &idle_cpu_bxt),
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index c4cf26f..be0743d 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -269,8 +269,6 @@ rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in)
break;
#endif
}
- if (!ret && dev && is_vlan_dev(dev))
- dev = vlan_dev_real_dev(dev);
return ret ? ERR_PTR(ret) : dev;
}
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 5dbb248..02f1666 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1615,7 +1615,6 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
struct ucma_event *uevent, *tmp;
struct ucma_context *ctx;
LIST_HEAD(event_list);
- struct fd f;
struct ucma_file *cur_file;
int ret = 0;
@@ -1623,21 +1622,17 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
return -EFAULT;
/* Get current fd to protect against it being closed */
- f = fdget(cmd.fd);
- if (!fd_file(f))
+ CLASS(fd, f)(cmd.fd);
+ if (fd_empty(f))
return -ENOENT;
- if (fd_file(f)->f_op != &ucma_fops) {
- ret = -EINVAL;
- goto file_put;
- }
+ if (fd_file(f)->f_op != &ucma_fops)
+ return -EINVAL;
cur_file = fd_file(f)->private_data;
/* Validate current fd and prevent destruction of id. */
ctx = ucma_get_ctx(cur_file, cmd.id);
- if (IS_ERR(ctx)) {
- ret = PTR_ERR(ctx);
- goto file_put;
- }
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
rdma_lock_handler(ctx->cm_id);
/*
@@ -1678,8 +1673,6 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
err_unlock:
rdma_unlock_handler(ctx->cm_id);
ucma_put_ctx(ctx);
-file_put:
- fdput(f);
return ret;
}
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index a4cce36..66b02fb 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -584,7 +584,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
if (cmd.fd != -1) {
/* search for file descriptor */
f = fdget(cmd.fd);
- if (!fd_file(f)) {
+ if (fd_empty(f)) {
ret = -EBADF;
goto err_tree_mutex_unlock;
}
@@ -632,8 +632,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
atomic_inc(&xrcd->usecnt);
}
- if (fd_file(f))
- fdput(f);
+ fdput(f);
mutex_unlock(&ibudev->xrcd_tree_mutex);
uobj_finalize_uobj_create(&obj->uobject, attrs);
@@ -648,8 +647,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
uobj_alloc_abort(&obj->uobject, attrs);
err_tree_mutex_unlock:
- if (fd_file(f))
- fdput(f);
+ fdput(f);
mutex_unlock(&ibudev->xrcd_tree_mutex);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 6715c96..9eb290e 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -300,9 +300,6 @@ static void bnxt_re_shutdown(struct auxiliary_device *adev)
struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
struct bnxt_re_dev *rdev;
- if (!en_info)
- return;
-
rdev = en_info->rdev;
ib_unregister_device(&rdev->ibdev);
bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE);
@@ -316,9 +313,6 @@ static void bnxt_re_stop_irq(void *handle)
struct bnxt_qplib_nq *nq;
int indx;
- if (!en_info)
- return;
-
rdev = en_info->rdev;
rcfw = &rdev->rcfw;
@@ -339,9 +333,6 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent)
struct bnxt_qplib_nq *nq;
int indx, rc;
- if (!en_info)
- return;
-
rdev = en_info->rdev;
msix_ent = rdev->en_dev->msix_entries;
rcfw = &rdev->rcfw;
@@ -1991,10 +1982,6 @@ static void bnxt_re_remove(struct auxiliary_device *adev)
struct bnxt_re_dev *rdev;
mutex_lock(&bnxt_re_mutex);
- if (!en_info) {
- mutex_unlock(&bnxt_re_mutex);
- return;
- }
rdev = en_info->rdev;
if (rdev)
@@ -2025,7 +2012,15 @@ static int bnxt_re_probe(struct auxiliary_device *adev,
auxiliary_set_drvdata(adev, en_info);
rc = bnxt_re_add_device(adev, BNXT_RE_COMPLETE_INIT);
+ if (rc)
+ goto err;
mutex_unlock(&bnxt_re_mutex);
+ return 0;
+
+err:
+ mutex_unlock(&bnxt_re_mutex);
+ kfree(en_info);
+
return rc;
}
@@ -2035,9 +2030,6 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state)
struct bnxt_en_dev *en_dev;
struct bnxt_re_dev *rdev;
- if (!en_info)
- return 0;
-
rdev = en_info->rdev;
en_dev = en_info->en_dev;
mutex_lock(&bnxt_re_mutex);
@@ -2082,9 +2074,6 @@ static int bnxt_re_resume(struct auxiliary_device *adev)
struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev);
struct bnxt_re_dev *rdev;
- if (!en_info)
- return 0;
-
mutex_lock(&bnxt_re_mutex);
/* L2 driver may invoke this callback during device recovery, resume.
* reset. Current RoCE driver doesn't recover the device in case of
diff --git a/drivers/mailbox/qcom-cpucp-mbox.c b/drivers/mailbox/qcom-cpucp-mbox.c
index e5437c2..44f4ed1 100644
--- a/drivers/mailbox/qcom-cpucp-mbox.c
+++ b/drivers/mailbox/qcom-cpucp-mbox.c
@@ -138,7 +138,7 @@ static int qcom_cpucp_mbox_probe(struct platform_device *pdev)
return irq;
ret = devm_request_irq(dev, irq, qcom_cpucp_mbox_irq_fn,
- IRQF_TRIGGER_HIGH, "apss_cpucp_mbox", cpucp);
+ IRQF_TRIGGER_HIGH | IRQF_NO_SUSPEND, "apss_cpucp_mbox", cpucp);
if (ret < 0)
return dev_err_probe(dev, ret, "Failed to register irq: %d\n", irq);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index d478aaf..23e0b71 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -2471,7 +2471,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
int r;
unsigned int num_locks;
struct dm_bufio_client *c;
- char slab_name[27];
+ char slab_name[64];
+ static atomic_t seqno = ATOMIC_INIT(0);
if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
DMERR("%s: block size not specified or is not multiple of 512b", __func__);
@@ -2522,7 +2523,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
(block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
unsigned int align = min(1U << __ffs(block_size), (unsigned int)PAGE_SIZE);
- snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u", block_size);
+ snprintf(slab_name, sizeof(slab_name), "dm_bufio_cache-%u-%u",
+ block_size, atomic_inc_return(&seqno));
c->slab_cache = kmem_cache_create(slab_name, block_size, align,
SLAB_RECLAIM_ACCOUNT, NULL);
if (!c->slab_cache) {
@@ -2531,9 +2533,11 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
}
}
if (aux_size)
- snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u", aux_size);
+ snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u-%u",
+ aux_size, atomic_inc_return(&seqno));
else
- snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer");
+ snprintf(slab_name, sizeof(slab_name), "dm_bufio_buffer-%u",
+ atomic_inc_return(&seqno));
c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
0, SLAB_RECLAIM_ACCOUNT, NULL);
if (!c->slab_buffer) {
diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c
index 9c53082..f3051bd 100644
--- a/drivers/md/dm-cache-background-tracker.c
+++ b/drivers/md/dm-cache-background-tracker.c
@@ -11,12 +11,6 @@
#define DM_MSG_PREFIX "dm-background-tracker"
-struct bt_work {
- struct list_head list;
- struct rb_node node;
- struct policy_work work;
-};
-
struct background_tracker {
unsigned int max_work;
atomic_t pending_promotes;
@@ -26,10 +20,10 @@ struct background_tracker {
struct list_head issued;
struct list_head queued;
struct rb_root pending;
-
- struct kmem_cache *work_cache;
};
+struct kmem_cache *btracker_work_cache = NULL;
+
struct background_tracker *btracker_create(unsigned int max_work)
{
struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
@@ -48,12 +42,6 @@ struct background_tracker *btracker_create(unsigned int max_work)
INIT_LIST_HEAD(&b->queued);
b->pending = RB_ROOT;
- b->work_cache = KMEM_CACHE(bt_work, 0);
- if (!b->work_cache) {
- DMERR("couldn't create mempool for background work items");
- kfree(b);
- b = NULL;
- }
return b;
}
@@ -66,10 +54,9 @@ void btracker_destroy(struct background_tracker *b)
BUG_ON(!list_empty(&b->issued));
list_for_each_entry_safe (w, tmp, &b->queued, list) {
list_del(&w->list);
- kmem_cache_free(b->work_cache, w);
+ kmem_cache_free(btracker_work_cache, w);
}
- kmem_cache_destroy(b->work_cache);
kfree(b);
}
EXPORT_SYMBOL_GPL(btracker_destroy);
@@ -180,7 +167,7 @@ static struct bt_work *alloc_work(struct background_tracker *b)
if (max_work_reached(b))
return NULL;
- return kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
+ return kmem_cache_alloc(btracker_work_cache, GFP_NOWAIT);
}
int btracker_queue(struct background_tracker *b,
@@ -203,7 +190,7 @@ int btracker_queue(struct background_tracker *b,
* There was a race, we'll just ignore this second
* bit of work for the same oblock.
*/
- kmem_cache_free(b->work_cache, w);
+ kmem_cache_free(btracker_work_cache, w);
return -EINVAL;
}
@@ -244,7 +231,7 @@ void btracker_complete(struct background_tracker *b,
update_stats(b, &w->work, -1);
rb_erase(&w->node, &b->pending);
list_del(&w->list);
- kmem_cache_free(b->work_cache, w);
+ kmem_cache_free(btracker_work_cache, w);
}
EXPORT_SYMBOL_GPL(btracker_complete);
diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h
index 5b8f5c6..09c8fc5 100644
--- a/drivers/md/dm-cache-background-tracker.h
+++ b/drivers/md/dm-cache-background-tracker.h
@@ -26,6 +26,14 @@
* protected with a spinlock.
*/
+struct bt_work {
+ struct list_head list;
+ struct rb_node node;
+ struct policy_work work;
+};
+
+extern struct kmem_cache *btracker_work_cache;
+
struct background_work;
struct background_tracker;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 4070931..9cb797a 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -10,6 +10,7 @@
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include "dm-io-tracker.h"
+#include "dm-cache-background-tracker.h"
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
@@ -2263,7 +2264,7 @@ static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
/*----------------------------------------------------------------*/
-static struct kmem_cache *migration_cache;
+static struct kmem_cache *migration_cache = NULL;
#define NOT_CORE_OPTION 1
@@ -3361,7 +3362,7 @@ static int cache_iterate_devices(struct dm_target *ti,
static void disable_passdown_if_not_supported(struct cache *cache)
{
struct block_device *origin_bdev = cache->origin_dev->bdev;
- struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
+ struct queue_limits *origin_limits = bdev_limits(origin_bdev);
const char *reason = NULL;
if (!cache->features.discard_passdown)
@@ -3383,7 +3384,7 @@ static void disable_passdown_if_not_supported(struct cache *cache)
static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
struct block_device *origin_bdev = cache->origin_dev->bdev;
- struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
+ struct queue_limits *origin_limits = bdev_limits(origin_bdev);
if (!cache->features.discard_passdown) {
/* No passdown is done so setting own virtual limits */
@@ -3445,22 +3446,36 @@ static int __init dm_cache_init(void)
int r;
migration_cache = KMEM_CACHE(dm_cache_migration, 0);
- if (!migration_cache)
- return -ENOMEM;
+ if (!migration_cache) {
+ r = -ENOMEM;
+ goto err;
+ }
+
+ btracker_work_cache = kmem_cache_create("dm_cache_bt_work",
+ sizeof(struct bt_work), __alignof__(struct bt_work), 0, NULL);
+ if (!btracker_work_cache) {
+ r = -ENOMEM;
+ goto err;
+ }
r = dm_register_target(&cache_target);
if (r) {
- kmem_cache_destroy(migration_cache);
- return r;
+ goto err;
}
return 0;
+
+err:
+ kmem_cache_destroy(migration_cache);
+ kmem_cache_destroy(btracker_work_cache);
+ return r;
}
static void __exit dm_cache_exit(void)
{
dm_unregister_target(&cache_target);
kmem_cache_destroy(migration_cache);
+ kmem_cache_destroy(btracker_work_cache);
}
module_init(dm_cache_init);
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index 12bbe48..e956d98 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -2020,7 +2020,7 @@ static void clone_resume(struct dm_target *ti)
static void disable_passdown_if_not_supported(struct clone *clone)
{
struct block_device *dest_dev = clone->dest_dev->bdev;
- struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
+ struct queue_limits *dest_limits = bdev_limits(dest_dev);
const char *reason = NULL;
if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
@@ -2041,7 +2041,7 @@ static void disable_passdown_if_not_supported(struct clone *clone)
static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
{
struct block_device *dest_bdev = clone->dest_dev->bdev;
- struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;
+ struct queue_limits *dest_limits = bdev_limits(dest_bdev);
if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
/* No passdown is done so we set our own virtual limits */
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 89632ce..9095f19 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2842,7 +2842,7 @@ static void disable_discard_passdown_if_not_supported(struct pool_c *pt)
{
struct pool *pool = pt->pool;
struct block_device *data_bdev = pt->data_dev->bdev;
- struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
+ struct queue_limits *data_limits = bdev_limits(data_bdev);
const char *reason = NULL;
if (!pt->adjusted_pf.discard_passdown)
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index c0d41c3..20edd3f 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -344,7 +344,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
} else {
set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
- lim->max_zone_append_sectors = 0;
+ lim->max_hw_zone_append_sectors = 0;
}
/*
@@ -379,7 +379,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
if (!zlim.mapped_nr_seq_zones) {
lim->max_open_zones = 0;
lim->max_active_zones = 0;
- lim->max_zone_append_sectors = 0;
+ lim->max_hw_zone_append_sectors = 0;
lim->zone_write_granularity = 0;
lim->chunk_sectors = 0;
lim->features &= ~BLK_FEAT_ZONED;
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 29da10e..c3a42dd 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1285,6 +1285,7 @@ static void bitmap_unplug_async(struct bitmap *bitmap)
queue_work(md_bitmap_wq, &unplug_work.work);
wait_for_completion(&done);
+ destroy_work_on_stack(&unplug_work.work);
}
static void bitmap_unplug(struct mddev *mddev, bool sync)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 67108c3..aebe12b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9784,9 +9784,7 @@ EXPORT_SYMBOL(md_reap_sync_thread);
void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
sysfs_notify_dirent_safe(rdev->sysfs_state);
- wait_event_timeout(rdev->blocked_wait,
- !test_bit(Blocked, &rdev->flags) &&
- !test_bit(BlockedBadBlocks, &rdev->flags),
+ wait_event_timeout(rdev->blocked_wait, !rdev_blocked(rdev),
msecs_to_jiffies(5000));
rdev_dec_pending(rdev, mddev);
}
@@ -9815,6 +9813,17 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
{
struct mddev *mddev = rdev->mddev;
int rv;
+
+ /*
+ * Recording new badblocks for faulty rdev will force unnecessary
+ * super block updating. This is fragile for external management because
+ * userspace daemon may trying to remove this device and deadlock may
+ * occur. This will be probably solved in the mdadm, but it is safer to
+ * avoid it.
+ */
+ if (test_bit(Faulty, &rdev->flags))
+ return 1;
+
if (is_new)
s += rdev->new_data_offset;
else
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 5d2e6bd..4ba93af 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -1002,6 +1002,30 @@ static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector);
}
+static inline bool rdev_blocked(struct md_rdev *rdev)
+{
+ /*
+ * Blocked will be set by error handler and cleared by daemon after
+ * updating superblock, meanwhile write IO should be blocked to prevent
+ * reading old data after power failure.
+ */
+ if (test_bit(Blocked, &rdev->flags))
+ return true;
+
+ /*
+ * Faulty device should not be accessed anymore, there is no need to
+ * wait for bad block to be acknowledged.
+ */
+ if (test_bit(Faulty, &rdev->flags))
+ return false;
+
+ /* rdev is blocked by badblocks. */
+ if (test_bit(BlockedBadBlocks, &rdev->flags))
+ return true;
+
+ return false;
+}
+
#define mddev_add_trace_msg(mddev, fmt, args...) \
do { \
if (!mddev_is_dm(mddev)) \
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 32d5875..baaf5f8 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -466,6 +466,12 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
struct bio *split = bio_split(bio,
zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
&mddev->bio_set);
+
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return;
+ }
bio_chain(split, bio);
submit_bio_noacct(bio);
bio = split;
@@ -608,6 +614,12 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
if (sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, sectors, GFP_NOIO,
&mddev->bio_set);
+
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return true;
+ }
bio_chain(split, bio);
raid0_map_submit_bio(mddev, bio);
bio = split;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6c9d242..a5adf08 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1322,7 +1322,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
const enum req_op op = bio_op(bio);
const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
- int rdisk;
+ int rdisk, error;
bool r1bio_existed = !!r1_bio;
/*
@@ -1383,6 +1383,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
gfp, &conf->bio_split);
+
+ if (IS_ERR(split)) {
+ error = PTR_ERR(split);
+ goto err_handle;
+ }
bio_chain(split, bio);
submit_bio_noacct(bio);
bio = split;
@@ -1410,6 +1415,47 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_private = r1_bio;
mddev_trace_remap(mddev, read_bio, r1_bio->sector);
submit_bio_noacct(read_bio);
+ return;
+
+err_handle:
+ atomic_dec(&mirror->rdev->nr_pending);
+ bio->bi_status = errno_to_blk_status(error);
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ raid_end_bio_io(r1_bio);
+}
+
+static bool wait_blocked_rdev(struct mddev *mddev, struct bio *bio)
+{
+ struct r1conf *conf = mddev->private;
+ int disks = conf->raid_disks * 2;
+ int i;
+
+retry:
+ for (i = 0; i < disks; i++) {
+ struct md_rdev *rdev = conf->mirrors[i].rdev;
+
+ if (!rdev)
+ continue;
+
+ /* don't write here until the bad block is acknowledged */
+ if (test_bit(WriteErrorSeen, &rdev->flags) &&
+ rdev_has_badblock(rdev, bio->bi_iter.bi_sector,
+ bio_sectors(bio)) < 0)
+ set_bit(BlockedBadBlocks, &rdev->flags);
+
+ if (rdev_blocked(rdev)) {
+ if (bio->bi_opf & REQ_NOWAIT)
+ return false;
+
+ mddev_add_trace_msg(rdev->mddev, "raid1 wait rdev %d blocked",
+ rdev->raid_disk);
+ atomic_inc(&rdev->nr_pending);
+ md_wait_for_blocked_rdev(rdev, rdev->mddev);
+ goto retry;
+ }
+ }
+
+ return true;
}
static void raid1_write_request(struct mddev *mddev, struct bio *bio,
@@ -1417,9 +1463,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
- int i, disks;
+ int i, disks, k, error;
unsigned long flags;
- struct md_rdev *blocked_rdev;
int first_clone;
int max_sectors;
bool write_behind = false;
@@ -1457,7 +1502,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
return;
}
- retry_write:
+ if (!wait_blocked_rdev(mddev, bio)) {
+ bio_wouldblock_error(bio);
+ return;
+ }
+
r1_bio = alloc_r1bio(mddev, bio);
r1_bio->sectors = max_write_sectors;
@@ -1473,7 +1522,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
*/
disks = conf->raid_disks * 2;
- blocked_rdev = NULL;
max_sectors = r1_bio->sectors;
for (i = 0; i < disks; i++) {
struct md_rdev *rdev = conf->mirrors[i].rdev;
@@ -1486,11 +1534,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags))
write_behind = true;
- if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
- atomic_inc(&rdev->nr_pending);
- blocked_rdev = rdev;
- break;
- }
r1_bio->bios[i] = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
@@ -1506,13 +1549,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
&first_bad, &bad_sectors);
- if (is_bad < 0) {
- /* mustn't write here until the bad block is
- * acknowledged*/
- set_bit(BlockedBadBlocks, &rdev->flags);
- blocked_rdev = rdev;
- break;
- }
if (is_bad && first_bad <= r1_bio->sector) {
/* Cannot write here at all */
bad_sectors -= (r1_bio->sector - first_bad);
@@ -1543,27 +1579,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio->bios[i] = bio;
}
- if (unlikely(blocked_rdev)) {
- /* Wait for this device to become unblocked */
- int j;
-
- for (j = 0; j < i; j++)
- if (r1_bio->bios[j])
- rdev_dec_pending(conf->mirrors[j].rdev, mddev);
- mempool_free(r1_bio, &conf->r1bio_pool);
- allow_barrier(conf, bio->bi_iter.bi_sector);
-
- if (bio->bi_opf & REQ_NOWAIT) {
- bio_wouldblock_error(bio);
- return;
- }
- mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked",
- blocked_rdev->raid_disk);
- md_wait_for_blocked_rdev(blocked_rdev, mddev);
- wait_barrier(conf, bio->bi_iter.bi_sector, false);
- goto retry_write;
- }
-
/*
* When using a bitmap, we may call alloc_behind_master_bio below.
* alloc_behind_master_bio allocates a copy of the data payload a page
@@ -1576,6 +1591,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
GFP_NOIO, &conf->bio_split);
+
+ if (IS_ERR(split)) {
+ error = PTR_ERR(split);
+ goto err_handle;
+ }
bio_chain(split, bio);
submit_bio_noacct(bio);
bio = split;
@@ -1660,6 +1680,18 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
/* In case raid1d snuck in to freeze_array */
wake_up_barrier(conf);
+ return;
+err_handle:
+ for (k = 0; k < i; k++) {
+ if (r1_bio->bios[k]) {
+ rdev_dec_pending(conf->mirrors[k].rdev, mddev);
+ r1_bio->bios[k] = NULL;
+ }
+ }
+
+ bio->bi_status = errno_to_blk_status(error);
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ raid_end_bio_io(r1_bio);
}
static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 862b1fb..1898923 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1159,6 +1159,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
int slot = r10_bio->read_slot;
struct md_rdev *err_rdev = NULL;
gfp_t gfp = GFP_NOIO;
+ int error;
if (slot >= 0 && r10_bio->devs[slot].rdev) {
/*
@@ -1206,6 +1207,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
if (max_sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, max_sectors,
gfp, &conf->bio_split);
+ if (IS_ERR(split)) {
+ error = PTR_ERR(split);
+ goto err_handle;
+ }
bio_chain(split, bio);
allow_barrier(conf);
submit_bio_noacct(bio);
@@ -1236,6 +1241,11 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
mddev_trace_remap(mddev, read_bio, r10_bio->sector);
submit_bio_noacct(read_bio);
return;
+err_handle:
+ atomic_dec(&rdev->nr_pending);
+ bio->bi_status = errno_to_blk_status(error);
+ set_bit(R10BIO_Uptodate, &r10_bio->state);
+ raid_end_bio_io(r10_bio);
}
static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
@@ -1285,9 +1295,9 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
{
- int i;
struct r10conf *conf = mddev->private;
struct md_rdev *blocked_rdev;
+ int i;
retry_wait:
blocked_rdev = NULL;
@@ -1295,40 +1305,36 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
struct md_rdev *rdev, *rrdev;
rdev = conf->mirrors[i].rdev;
- rrdev = conf->mirrors[i].replacement;
- if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
- atomic_inc(&rdev->nr_pending);
- blocked_rdev = rdev;
- break;
- }
- if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
- atomic_inc(&rrdev->nr_pending);
- blocked_rdev = rrdev;
- break;
- }
-
- if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
+ if (rdev) {
sector_t dev_sector = r10_bio->devs[i].addr;
/*
* Discard request doesn't care the write result
* so it doesn't need to wait blocked disk here.
*/
- if (!r10_bio->sectors)
- continue;
-
- if (rdev_has_badblock(rdev, dev_sector,
- r10_bio->sectors) < 0) {
+ if (test_bit(WriteErrorSeen, &rdev->flags) &&
+ r10_bio->sectors &&
+ rdev_has_badblock(rdev, dev_sector,
+ r10_bio->sectors) < 0)
/*
- * Mustn't write here until the bad block
- * is acknowledged
+ * Mustn't write here until the bad
+ * block is acknowledged
*/
- atomic_inc(&rdev->nr_pending);
set_bit(BlockedBadBlocks, &rdev->flags);
+
+ if (rdev_blocked(rdev)) {
blocked_rdev = rdev;
+ atomic_inc(&rdev->nr_pending);
break;
}
}
+
+ rrdev = conf->mirrors[i].replacement;
+ if (rrdev && rdev_blocked(rrdev)) {
+ atomic_inc(&rrdev->nr_pending);
+ blocked_rdev = rrdev;
+ break;
+ }
}
if (unlikely(blocked_rdev)) {
@@ -1347,9 +1353,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
- int i;
+ int i, k;
sector_t sectors;
int max_sectors;
+ int error;
if ((mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1482,6 +1489,10 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
if (r10_bio->sectors < bio_sectors(bio)) {
struct bio *split = bio_split(bio, r10_bio->sectors,
GFP_NOIO, &conf->bio_split);
+ if (IS_ERR(split)) {
+ error = PTR_ERR(split);
+ goto err_handle;
+ }
bio_chain(split, bio);
allow_barrier(conf);
submit_bio_noacct(bio);
@@ -1503,6 +1514,26 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
raid10_write_one_disk(mddev, r10_bio, bio, true, i);
}
one_write_done(r10_bio);
+ return;
+err_handle:
+ for (k = 0; k < i; k++) {
+ int d = r10_bio->devs[k].devnum;
+ struct md_rdev *rdev = conf->mirrors[d].rdev;
+ struct md_rdev *rrdev = conf->mirrors[d].replacement;
+
+ if (r10_bio->devs[k].bio) {
+ rdev_dec_pending(rdev, mddev);
+ r10_bio->devs[k].bio = NULL;
+ }
+ if (r10_bio->devs[k].repl_bio) {
+ rdev_dec_pending(rrdev, mddev);
+ r10_bio->devs[k].repl_bio = NULL;
+ }
+ }
+
+ bio->bi_status = errno_to_blk_status(error);
+ set_bit(R10BIO_Uptodate, &r10_bio->state);
+ raid_end_bio_io(r10_bio);
}
static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
@@ -1644,6 +1675,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
if (remainder) {
split_size = stripe_size - remainder;
split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return 0;
+ }
bio_chain(split, bio);
allow_barrier(conf);
/* Resend the fist split part */
@@ -1654,6 +1690,11 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
if (remainder) {
split_size = bio_sectors(bio) - remainder;
split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
+ if (IS_ERR(split)) {
+ bio->bi_status = errno_to_blk_status(PTR_ERR(split));
+ bio_endio(bio);
+ return 0;
+ }
bio_chain(split, bio);
allow_barrier(conf);
/* Resend the second split part */
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index a70cbec..37c4da53 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -258,7 +258,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
pplhdr->signature = cpu_to_le32(ppl_conf->signature);
- io->seq = atomic64_add_return(1, &ppl_conf->seq);
+ io->seq = atomic64_inc_return(&ppl_conf->seq);
pplhdr->generation = cpu_to_le64(io->seq);
return io;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dc2ea63..f09e767 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4724,14 +4724,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev) {
is_bad = rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf));
- if (s->blocked_rdev == NULL
- && (test_bit(Blocked, &rdev->flags)
- || is_bad < 0)) {
+ if (s->blocked_rdev == NULL) {
if (is_bad < 0)
- set_bit(BlockedBadBlocks,
- &rdev->flags);
- s->blocked_rdev = rdev;
- atomic_inc(&rdev->nr_pending);
+ set_bit(BlockedBadBlocks, &rdev->flags);
+ if (rdev_blocked(rdev)) {
+ s->blocked_rdev = rdev;
+ atomic_inc(&rdev->nr_pending);
+ }
}
}
clear_bit(R5_Insync, &dev->flags);
@@ -7177,6 +7176,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
err = mddev_suspend_and_lock(mddev);
if (err)
return err;
+ raid5_quiesce(mddev, true);
+
conf = mddev->private;
if (!conf)
err = -ENODEV;
@@ -7198,6 +7199,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
kfree(old_groups);
}
}
+
+ raid5_quiesce(mddev, false);
mddev_unlock_and_resume(mddev);
return err ?: len;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 896ecfc..d174e58 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -633,7 +633,7 @@ struct r5conf {
* two caches.
*/
int active_name;
- char cache_name[2][32];
+ char cache_name[2][48];
struct kmem_cache *slab_cache; /* for allocating stripes */
struct mutex cache_size_mutex; /* Protect changes to cache size */
diff --git a/drivers/media/mc/mc-request.c b/drivers/media/mc/mc-request.c
index e064914..df39c8c 100644
--- a/drivers/media/mc/mc-request.c
+++ b/drivers/media/mc/mc-request.c
@@ -246,22 +246,21 @@ static const struct file_operations request_fops = {
struct media_request *
media_request_get_by_fd(struct media_device *mdev, int request_fd)
{
- struct fd f;
struct media_request *req;
if (!mdev || !mdev->ops ||
!mdev->ops->req_validate || !mdev->ops->req_queue)
return ERR_PTR(-EBADR);
- f = fdget(request_fd);
- if (!fd_file(f))
- goto err_no_req_fd;
+ CLASS(fd, f)(request_fd);
+ if (fd_empty(f))
+ goto err;
if (fd_file(f)->f_op != &request_fops)
- goto err_fput;
+ goto err;
req = fd_file(f)->private_data;
if (req->mdev != mdev)
- goto err_fput;
+ goto err;
/*
* Note: as long as someone has an open filehandle of the request,
@@ -272,14 +271,9 @@ media_request_get_by_fd(struct media_device *mdev, int request_fd)
* before media_request_get() is called.
*/
media_request_get(req);
- fdput(f);
-
return req;
-err_fput:
- fdput(f);
-
-err_no_req_fd:
+err:
dev_dbg(mdev->dev, "cannot find request_fd %d\n", request_fd);
return ERR_PTR(-EINVAL);
}
diff --git a/drivers/media/rc/lirc_dev.c b/drivers/media/rc/lirc_dev.c
index f042f3f..a2257dc 100644
--- a/drivers/media/rc/lirc_dev.c
+++ b/drivers/media/rc/lirc_dev.c
@@ -815,28 +815,23 @@ void __exit lirc_dev_exit(void)
struct rc_dev *rc_dev_get_from_fd(int fd, bool write)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
struct lirc_fh *fh;
struct rc_dev *dev;
- if (!fd_file(f))
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- if (fd_file(f)->f_op != &lirc_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &lirc_fops)
return ERR_PTR(-EINVAL);
- }
- if (write && !(fd_file(f)->f_mode & FMODE_WRITE)) {
- fdput(f);
+ if (write && !(fd_file(f)->f_mode & FMODE_WRITE))
return ERR_PTR(-EPERM);
- }
fh = fd_file(f)->private_data;
dev = fh->rc;
get_device(&dev->dev);
- fdput(f);
return dev;
}
diff --git a/drivers/media/test-drivers/vivid/vivid-vid-cap.c b/drivers/media/test-drivers/vivid/vivid-vid-cap.c
index 6a790ac..d8b34c1 100644
--- a/drivers/media/test-drivers/vivid/vivid-vid-cap.c
+++ b/drivers/media/test-drivers/vivid/vivid-vid-cap.c
@@ -10,6 +10,7 @@
#include <linux/sched.h>
#include <linux/vmalloc.h>
#include <linux/videodev2.h>
+#include <linux/prandom.h>
#include <linux/v4l2-dv-timings.h>
#include <media/v4l2-common.h>
#include <media/v4l2-event.h>
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index ef06a4d..79f6fad 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -2501,6 +2501,56 @@ static inline int mmc_blk_readonly(struct mmc_card *card)
!(card->csd.cmdclass & CCC_BLOCK_WRITE);
}
+/*
+ * Search for a declared partitions node for the disk in mmc-card related node.
+ *
+ * This is to permit support for partition table defined in DT in special case
+ * where a partition table is not written in the disk and is expected to be
+ * passed from the running system.
+ *
+ * For the user disk, "partitions" node is searched.
+ * For the special HW disk, "partitions-" node with the appended name is used
+ * following this conversion table (to adhere to JEDEC naming)
+ * - boot0 -> partitions-boot1
+ * - boot1 -> partitions-boot2
+ * - gp0 -> partitions-gp1
+ * - gp1 -> partitions-gp2
+ * - gp2 -> partitions-gp3
+ * - gp3 -> partitions-gp4
+ */
+static struct fwnode_handle *mmc_blk_get_partitions_node(struct device *mmc_dev,
+ const char *subname)
+{
+ const char *node_name = "partitions";
+
+ if (subname) {
+ mmc_dev = mmc_dev->parent;
+
+ /*
+ * Check if we are allocating a BOOT disk boot0/1 disk.
+ * In DT we use the JEDEC naming boot1/2.
+ */
+ if (!strcmp(subname, "boot0"))
+ node_name = "partitions-boot1";
+ if (!strcmp(subname, "boot1"))
+ node_name = "partitions-boot2";
+ /*
+ * Check if we are allocating a GP disk gp0/1/2/3 disk.
+ * In DT we use the JEDEC naming gp1/2/3/4.
+ */
+ if (!strcmp(subname, "gp0"))
+ node_name = "partitions-gp1";
+ if (!strcmp(subname, "gp1"))
+ node_name = "partitions-gp2";
+ if (!strcmp(subname, "gp2"))
+ node_name = "partitions-gp3";
+ if (!strcmp(subname, "gp3"))
+ node_name = "partitions-gp4";
+ }
+
+ return device_get_named_child_node(mmc_dev, node_name);
+}
+
static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
struct device *parent,
sector_t size,
@@ -2509,6 +2559,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
int area_type,
unsigned int part_type)
{
+ struct fwnode_handle *disk_fwnode;
struct mmc_blk_data *md;
int devidx, ret;
char cap_str[10];
@@ -2610,7 +2661,9 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
/* used in ->open, must be set before add_disk: */
if (area_type == MMC_BLK_DATA_AREA_MAIN)
dev_set_drvdata(&card->dev, md);
- ret = device_add_disk(md->parent, md->disk, mmc_disk_attr_groups);
+ disk_fwnode = mmc_blk_get_partitions_node(parent, subname);
+ ret = add_disk_fwnode(md->parent, md->disk, mmc_disk_attr_groups,
+ disk_fwnode);
if (ret)
goto err_put_disk;
return md;
diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 41e4512..e9f6e4e 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -2957,8 +2957,8 @@ static int dw_mci_init_slot(struct dw_mci *host)
if (host->use_dma == TRANS_MODE_IDMAC) {
mmc->max_segs = host->ring_size;
mmc->max_blk_size = 65535;
- mmc->max_req_size = DW_MCI_DESC_DATA_LENGTH * host->ring_size;
- mmc->max_seg_size = mmc->max_req_size;
+ mmc->max_seg_size = 0x1000;
+ mmc->max_req_size = mmc->max_seg_size * host->ring_size;
mmc->max_blk_count = mmc->max_req_size / 512;
} else if (host->use_dma == TRANS_MODE_EDMAC) {
mmc->max_segs = 64;
diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c
index d3bd0ac..e0ab5fd 100644
--- a/drivers/mmc/host/sunxi-mmc.c
+++ b/drivers/mmc/host/sunxi-mmc.c
@@ -1191,10 +1191,9 @@ static const struct sunxi_mmc_cfg sun50i_a64_emmc_cfg = {
.needs_new_timings = true,
};
-static const struct sunxi_mmc_cfg sun50i_a100_cfg = {
+static const struct sunxi_mmc_cfg sun50i_h616_cfg = {
.idma_des_size_bits = 16,
.idma_des_shift = 2,
- .clk_delays = NULL,
.can_calibrate = true,
.mask_data0 = true,
.needs_new_timings = true,
@@ -1217,8 +1216,9 @@ static const struct of_device_id sunxi_mmc_of_match[] = {
{ .compatible = "allwinner,sun20i-d1-mmc", .data = &sun20i_d1_cfg },
{ .compatible = "allwinner,sun50i-a64-mmc", .data = &sun50i_a64_cfg },
{ .compatible = "allwinner,sun50i-a64-emmc", .data = &sun50i_a64_emmc_cfg },
- { .compatible = "allwinner,sun50i-a100-mmc", .data = &sun50i_a100_cfg },
+ { .compatible = "allwinner,sun50i-a100-mmc", .data = &sun20i_d1_cfg },
{ .compatible = "allwinner,sun50i-a100-emmc", .data = &sun50i_a100_emmc_cfg },
+ { .compatible = "allwinner,sun50i-h616-mmc", .data = &sun50i_h616_cfg },
{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, sunxi_mmc_of_match);
diff --git a/drivers/mtd/tests/oobtest.c b/drivers/mtd/tests/oobtest.c
index 13fed39..e1ee68f 100644
--- a/drivers/mtd/tests/oobtest.c
+++ b/drivers/mtd/tests/oobtest.c
@@ -17,7 +17,7 @@
#include <linux/mtd/mtd.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include "mtd_test.h"
diff --git a/drivers/mtd/tests/pagetest.c b/drivers/mtd/tests/pagetest.c
index 8eb40b6..6878700 100644
--- a/drivers/mtd/tests/pagetest.c
+++ b/drivers/mtd/tests/pagetest.c
@@ -17,7 +17,7 @@
#include <linux/mtd/mtd.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include "mtd_test.h"
diff --git a/drivers/mtd/tests/subpagetest.c b/drivers/mtd/tests/subpagetest.c
index 05250a0..f34bbf0 100644
--- a/drivers/mtd/tests/subpagetest.c
+++ b/drivers/mtd/tests/subpagetest.c
@@ -15,7 +15,7 @@
#include <linux/mtd/mtd.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include "mtd_test.h"
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b1bffd8..15e0f14 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1008,6 +1008,8 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
if (bond->dev->flags & IFF_UP)
bond_hw_addr_flush(bond->dev, old_active->dev);
+
+ bond_slave_ns_maddrs_add(bond, old_active);
}
if (new_active) {
@@ -1024,6 +1026,8 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
dev_mc_sync(new_active->dev, bond->dev);
netif_addr_unlock_bh(bond->dev);
}
+
+ bond_slave_ns_maddrs_del(bond, new_active);
}
}
@@ -2341,6 +2345,11 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
bond_compute_features(bond);
bond_set_carrier(bond);
+ /* Needs to be called before bond_select_active_slave(), which will
+ * remove the maddrs if the slave is selected as active slave.
+ */
+ bond_slave_ns_maddrs_add(bond, new_slave);
+
if (bond_uses_primary(bond)) {
block_netpoll_tx();
bond_select_active_slave(bond);
@@ -2350,7 +2359,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
if (bond_mode_can_use_xmit_hash(bond))
bond_update_slave_arr(bond, NULL);
-
if (!slave_dev->netdev_ops->ndo_bpf ||
!slave_dev->netdev_ops->ndo_xdp_xmit) {
if (bond->xdp_prog) {
@@ -2548,6 +2556,12 @@ static int __bond_release_one(struct net_device *bond_dev,
if (oldcurrent == slave)
bond_change_active_slave(bond, NULL);
+ /* Must be called after bond_change_active_slave () as the slave
+ * might change from an active slave to a backup slave. Then it is
+ * necessary to clear the maddrs on the backup slave.
+ */
+ bond_slave_ns_maddrs_del(bond, slave);
+
if (bond_is_lb(bond)) {
/* Must be called only after the slave has been
* detached from the list and the curr_active_slave
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 95d59a1..327b6ec 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -15,6 +15,7 @@
#include <linux/sched/signal.h>
#include <net/bonding.h>
+#include <net/ndisc.h>
static int bond_option_active_slave_set(struct bonding *bond,
const struct bond_opt_value *newval);
@@ -1234,6 +1235,68 @@ static int bond_option_arp_ip_targets_set(struct bonding *bond,
}
#if IS_ENABLED(CONFIG_IPV6)
+static bool slave_can_set_ns_maddr(const struct bonding *bond, struct slave *slave)
+{
+ return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP &&
+ !bond_is_active_slave(slave) &&
+ slave->dev->flags & IFF_MULTICAST;
+}
+
+static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add)
+{
+ struct in6_addr *targets = bond->params.ns_targets;
+ char slot_maddr[MAX_ADDR_LEN];
+ int i;
+
+ if (!slave_can_set_ns_maddr(bond, slave))
+ return;
+
+ for (i = 0; i < BOND_MAX_NS_TARGETS; i++) {
+ if (ipv6_addr_any(&targets[i]))
+ break;
+
+ if (!ndisc_mc_map(&targets[i], slot_maddr, slave->dev, 0)) {
+ if (add)
+ dev_mc_add(slave->dev, slot_maddr);
+ else
+ dev_mc_del(slave->dev, slot_maddr);
+ }
+ }
+}
+
+void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave)
+{
+ if (!bond->params.arp_validate)
+ return;
+ slave_set_ns_maddrs(bond, slave, true);
+}
+
+void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave)
+{
+ if (!bond->params.arp_validate)
+ return;
+ slave_set_ns_maddrs(bond, slave, false);
+}
+
+static void slave_set_ns_maddr(struct bonding *bond, struct slave *slave,
+ struct in6_addr *target, struct in6_addr *slot)
+{
+ char target_maddr[MAX_ADDR_LEN], slot_maddr[MAX_ADDR_LEN];
+
+ if (!bond->params.arp_validate || !slave_can_set_ns_maddr(bond, slave))
+ return;
+
+ /* remove the previous maddr from slave */
+ if (!ipv6_addr_any(slot) &&
+ !ndisc_mc_map(slot, slot_maddr, slave->dev, 0))
+ dev_mc_del(slave->dev, slot_maddr);
+
+ /* add new maddr on slave if target is set */
+ if (!ipv6_addr_any(target) &&
+ !ndisc_mc_map(target, target_maddr, slave->dev, 0))
+ dev_mc_add(slave->dev, target_maddr);
+}
+
static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot,
struct in6_addr *target,
unsigned long last_rx)
@@ -1243,8 +1306,10 @@ static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot,
struct slave *slave;
if (slot >= 0 && slot < BOND_MAX_NS_TARGETS) {
- bond_for_each_slave(bond, slave, iter)
+ bond_for_each_slave(bond, slave, iter) {
slave->target_last_arp_rx[slot] = last_rx;
+ slave_set_ns_maddr(bond, slave, target, &targets[slot]);
+ }
targets[slot] = *target;
}
}
@@ -1296,15 +1361,30 @@ static int bond_option_ns_ip6_targets_set(struct bonding *bond,
{
return -EPERM;
}
+
+static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) {}
+
+void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) {}
+
+void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) {}
#endif
static int bond_option_arp_validate_set(struct bonding *bond,
const struct bond_opt_value *newval)
{
+ bool changed = !!bond->params.arp_validate != !!newval->value;
+ struct list_head *iter;
+ struct slave *slave;
+
netdev_dbg(bond->dev, "Setting arp_validate to %s (%llu)\n",
newval->string, newval->value);
bond->params.arp_validate = newval->value;
+ if (changed) {
+ bond_for_each_slave(bond, slave, iter)
+ slave_set_ns_maddrs(bond, slave, !!bond->params.arp_validate);
+ }
+
return 0;
}
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index b83df5f..f1d08816 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -907,7 +907,7 @@ static int igb_request_msix(struct igb_adapter *adapter)
int i, err = 0, vector = 0, free_vector = 0;
err = request_irq(adapter->msix_entries[vector].vector,
- igb_msix_other, IRQF_NO_THREAD, netdev->name, adapter);
+ igb_msix_other, 0, netdev->name, adapter);
if (err)
goto err_out;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
index dcfccaa..92d5cfe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
@@ -866,7 +866,7 @@ mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv,
return 0;
err_rule:
- mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, zone_rule->attr, zone_rule->mh);
+ mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, attr, zone_rule->mh);
mlx5_put_label_mapping(ct_priv, attr->ct_attr.ct_labels_id);
err_mod_hdr:
kfree(attr);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
index d61be26..3db31cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c
@@ -660,7 +660,7 @@ tx_sync_info_get(struct mlx5e_ktls_offload_context_tx *priv_tx,
while (remaining > 0) {
skb_frag_t *frag = &record->frags[i];
- get_page(skb_frag_page(frag));
+ page_ref_inc(skb_frag_page(frag));
remaining -= skb_frag_size(frag);
info->frags[i++] = *frag;
}
@@ -763,7 +763,7 @@ void mlx5e_ktls_tx_handle_resync_dump_comp(struct mlx5e_txqsq *sq,
stats = sq->stats;
mlx5e_tx_dma_unmap(sq->pdev, dma);
- put_page(wi->resync_dump_frag_page);
+ page_ref_dec(wi->resync_dump_frag_page);
stats->tls_dump_packets++;
stats->tls_dump_bytes += wi->num_bytes;
}
@@ -816,12 +816,12 @@ mlx5e_ktls_tx_handle_ooo(struct mlx5e_ktls_offload_context_tx *priv_tx,
err_out:
for (; i < info.nr_frags; i++)
- /* The put_page() here undoes the page ref obtained in tx_sync_info_get().
+ /* The page_ref_dec() here undoes the page ref obtained in tx_sync_info_get().
* Page refs obtained for the DUMP WQEs above (by page_ref_add) will be
* released only upon their completions (or in mlx5e_free_txqsq_descs,
* if channel closes).
*/
- put_page(skb_frag_page(&info.frags[i]));
+ page_ref_dec(skb_frag_page(&info.frags[i]));
return MLX5E_KTLS_SYNC_FAIL;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index e601324..13a3fa8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4267,7 +4267,8 @@ void mlx5e_set_xdp_feature(struct net_device *netdev)
struct mlx5e_params *params = &priv->channels.params;
xdp_features_t val;
- if (params->packet_merge.type != MLX5E_PACKET_MERGE_NONE) {
+ if (!netdev->netdev_ops->ndo_bpf ||
+ params->packet_merge.type != MLX5E_PACKET_MERGE_NONE) {
xdp_clear_features_flag(netdev);
return;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
index 5bf8318..1d60465 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
@@ -36,6 +36,7 @@
#include "en.h"
#include "en/port.h"
#include "eswitch.h"
+#include "lib/mlx5.h"
static int mlx5e_test_health_info(struct mlx5e_priv *priv)
{
@@ -247,6 +248,9 @@ static int mlx5e_cond_loopback(struct mlx5e_priv *priv)
if (is_mdev_switchdev_mode(priv->mdev))
return -EOPNOTSUPP;
+ if (mlx5_get_sd(priv->mdev))
+ return -EOPNOTSUPP;
+
return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index f24f91d..8cf61ae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2527,8 +2527,11 @@ static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep, u8 rep_type)
{
if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
- REP_LOADED, REP_REGISTERED) == REP_LOADED)
+ REP_LOADED, REP_REGISTERED) == REP_LOADED) {
+ if (rep_type == REP_ETH)
+ __esw_offloads_unload_rep(esw, rep, REP_IB);
esw->offloads.rep_ops[rep_type]->unload(rep);
+ }
}
static void __unload_reps_all_vport(struct mlx5_eswitch *esw, u8 rep_type)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 8505d5e..6e4f8aa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -2105,13 +2105,22 @@ lookup_fte_locked(struct mlx5_flow_group *g,
fte_tmp = NULL;
goto out;
}
- if (!fte_tmp->node.active) {
- tree_put_node(&fte_tmp->node, false);
- fte_tmp = NULL;
- goto out;
- }
nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD);
+
+ if (!fte_tmp->node.active) {
+ up_write_ref_node(&fte_tmp->node, false);
+
+ if (take_write)
+ up_write_ref_node(&g->node, false);
+ else
+ up_read_ref_node(&g->node);
+
+ tree_put_node(&fte_tmp->node, false);
+
+ return NULL;
+ }
+
out:
if (take_write)
up_write_ref_node(&g->node, false);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 81a9232..7db9cab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -593,9 +593,11 @@ static void irq_pool_free(struct mlx5_irq_pool *pool)
kvfree(pool);
}
-static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec)
+static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec,
+ bool dynamic_vec)
{
struct mlx5_irq_table *table = dev->priv.irq_table;
+ int sf_vec_available = sf_vec;
int num_sf_ctrl;
int err;
@@ -616,6 +618,13 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec)
num_sf_ctrl = DIV_ROUND_UP(mlx5_sf_max_functions(dev),
MLX5_SFS_PER_CTRL_IRQ);
num_sf_ctrl = min_t(int, MLX5_IRQ_CTRL_SF_MAX, num_sf_ctrl);
+ if (!dynamic_vec && (num_sf_ctrl + 1) > sf_vec_available) {
+ mlx5_core_dbg(dev,
+ "Not enough IRQs for SFs control and completion pool, required=%d avail=%d\n",
+ num_sf_ctrl + 1, sf_vec_available);
+ return 0;
+ }
+
table->sf_ctrl_pool = irq_pool_alloc(dev, pcif_vec, num_sf_ctrl,
"mlx5_sf_ctrl",
MLX5_EQ_SHARE_IRQ_MIN_CTRL,
@@ -624,9 +633,11 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec)
err = PTR_ERR(table->sf_ctrl_pool);
goto err_pf;
}
- /* init sf_comp_pool */
+ sf_vec_available -= num_sf_ctrl;
+
+ /* init sf_comp_pool, remaining vectors are for the SF completions */
table->sf_comp_pool = irq_pool_alloc(dev, pcif_vec + num_sf_ctrl,
- sf_vec - num_sf_ctrl, "mlx5_sf_comp",
+ sf_vec_available, "mlx5_sf_comp",
MLX5_EQ_SHARE_IRQ_MIN_COMP,
MLX5_EQ_SHARE_IRQ_MAX_COMP);
if (IS_ERR(table->sf_comp_pool)) {
@@ -715,6 +726,7 @@ int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
int mlx5_irq_table_create(struct mlx5_core_dev *dev)
{
int num_eqs = mlx5_max_eq_cap_get(dev);
+ bool dynamic_vec;
int total_vec;
int pcif_vec;
int req_vec;
@@ -724,21 +736,31 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev)
if (mlx5_core_is_sf(dev))
return 0;
+ /* PCI PF vectors usage is limited by online cpus, device EQs and
+ * PCI MSI-X capability.
+ */
pcif_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + 1;
pcif_vec = min_t(int, pcif_vec, num_eqs);
+ pcif_vec = min_t(int, pcif_vec, pci_msix_vec_count(dev->pdev));
total_vec = pcif_vec;
if (mlx5_sf_max_functions(dev))
total_vec += MLX5_MAX_MSIX_PER_SF * mlx5_sf_max_functions(dev);
total_vec = min_t(int, total_vec, pci_msix_vec_count(dev->pdev));
- pcif_vec = min_t(int, pcif_vec, pci_msix_vec_count(dev->pdev));
req_vec = pci_msix_can_alloc_dyn(dev->pdev) ? 1 : total_vec;
n = pci_alloc_irq_vectors(dev->pdev, 1, req_vec, PCI_IRQ_MSIX);
if (n < 0)
return n;
- err = irq_pools_init(dev, total_vec - pcif_vec, pcif_vec);
+ /* Further limit vectors of the pools based on platform for non dynamic case */
+ dynamic_vec = pci_msix_can_alloc_dyn(dev->pdev);
+ if (!dynamic_vec) {
+ pcif_vec = min_t(int, n, pcif_vec);
+ total_vec = min_t(int, n, total_vec);
+ }
+
+ err = irq_pools_init(dev, total_vec - pcif_vec, pcif_vec, dynamic_vec);
if (err)
pci_free_irq_vectors(dev->pdev);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c
index d68f0c4..9739bc9 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c
@@ -108,7 +108,12 @@ static int intel_eth_plat_probe(struct platform_device *pdev)
if (IS_ERR(dwmac->tx_clk))
return PTR_ERR(dwmac->tx_clk);
- clk_prepare_enable(dwmac->tx_clk);
+ ret = clk_prepare_enable(dwmac->tx_clk);
+ if (ret) {
+ dev_err(&pdev->dev,
+ "Failed to enable tx_clk\n");
+ return ret;
+ }
/* Check and configure TX clock rate */
rate = clk_get_rate(dwmac->tx_clk);
@@ -119,7 +124,7 @@ static int intel_eth_plat_probe(struct platform_device *pdev)
if (ret) {
dev_err(&pdev->dev,
"Failed to set tx_clk\n");
- return ret;
+ goto err_tx_clk_disable;
}
}
}
@@ -133,7 +138,7 @@ static int intel_eth_plat_probe(struct platform_device *pdev)
if (ret) {
dev_err(&pdev->dev,
"Failed to set clk_ptp_ref\n");
- return ret;
+ goto err_tx_clk_disable;
}
}
}
@@ -149,12 +154,15 @@ static int intel_eth_plat_probe(struct platform_device *pdev)
}
ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
- if (ret) {
- clk_disable_unprepare(dwmac->tx_clk);
- return ret;
- }
+ if (ret)
+ goto err_tx_clk_disable;
return 0;
+
+err_tx_clk_disable:
+ if (dwmac->data->tx_clk_en)
+ clk_disable_unprepare(dwmac->tx_clk);
+ return ret;
}
static void intel_eth_plat_remove(struct platform_device *pdev)
@@ -162,7 +170,8 @@ static void intel_eth_plat_remove(struct platform_device *pdev)
struct intel_dwmac *dwmac = get_stmmac_bsp_priv(&pdev->dev);
stmmac_pltfr_remove(pdev);
- clk_disable_unprepare(dwmac->tx_clk);
+ if (dwmac->data->tx_clk_en)
+ clk_disable_unprepare(dwmac->tx_clk);
}
static struct platform_driver intel_eth_plat_driver = {
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
index 2a9132d..001857c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
@@ -589,9 +589,9 @@ static int mediatek_dwmac_common_data(struct platform_device *pdev,
plat->mac_interface = priv_plat->phy_mode;
if (priv_plat->mac_wol)
- plat->flags |= STMMAC_FLAG_USE_PHY_WOL;
- else
plat->flags &= ~STMMAC_FLAG_USE_PHY_WOL;
+ else
+ plat->flags |= STMMAC_FLAG_USE_PHY_WOL;
plat->riwt_off = 1;
plat->maxmtu = ETH_DATA_LEN;
plat->host_dma_width = priv_plat->variant->dma_bit_mask;
diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
index 5c20ceb..fe2fd1b 100644
--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c
+++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c
@@ -16,6 +16,7 @@
#include <linux/if_hsr.h>
#include <linux/if_vlan.h>
#include <linux/interrupt.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
#include <linux/kernel.h>
#include <linux/mfd/syscon.h>
#include <linux/module.h>
@@ -411,6 +412,8 @@ static int prueth_perout_enable(void *clockops_data,
struct prueth_emac *emac = clockops_data;
u32 reduction_factor = 0, offset = 0;
struct timespec64 ts;
+ u64 current_cycle;
+ u64 start_offset;
u64 ns_period;
if (!on)
@@ -449,8 +452,14 @@ static int prueth_perout_enable(void *clockops_data,
writel(reduction_factor, emac->prueth->shram.va +
TIMESYNC_FW_WC_SYNCOUT_REDUCTION_FACTOR_OFFSET);
- writel(0, emac->prueth->shram.va +
- TIMESYNC_FW_WC_SYNCOUT_START_TIME_CYCLECOUNT_OFFSET);
+ current_cycle = icssg_read_time(emac->prueth->shram.va +
+ TIMESYNC_FW_WC_CYCLECOUNT_OFFSET);
+
+ /* Rounding of current_cycle count to next second */
+ start_offset = roundup(current_cycle, MSEC_PER_SEC);
+
+ hi_lo_writeq(start_offset, emac->prueth->shram.va +
+ TIMESYNC_FW_WC_SYNCOUT_START_TIME_CYCLECOUNT_OFFSET);
return 0;
}
diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h
index 8722bb4..f5c1d47 100644
--- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h
+++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h
@@ -330,6 +330,18 @@ static inline int prueth_emac_slice(struct prueth_emac *emac)
extern const struct ethtool_ops icssg_ethtool_ops;
extern const struct dev_pm_ops prueth_dev_pm_ops;
+static inline u64 icssg_read_time(const void __iomem *addr)
+{
+ u32 low, high;
+
+ do {
+ high = readl(addr + 4);
+ low = readl(addr);
+ } while (high != readl(addr + 4));
+
+ return low + ((u64)high << 32);
+}
+
/* Classifier helpers */
void icssg_class_set_mac_addr(struct regmap *miig_rt, int slice, u8 *mac);
void icssg_class_set_host_mac_addr(struct regmap *miig_rt, const u8 *mac);
diff --git a/drivers/net/ethernet/vertexcom/mse102x.c b/drivers/net/ethernet/vertexcom/mse102x.c
index 2c37957..89dc4c4 100644
--- a/drivers/net/ethernet/vertexcom/mse102x.c
+++ b/drivers/net/ethernet/vertexcom/mse102x.c
@@ -437,13 +437,15 @@ static void mse102x_tx_work(struct work_struct *work)
mse = &mses->mse102x;
while ((txb = skb_dequeue(&mse->txq))) {
+ unsigned int len = max_t(unsigned int, txb->len, ETH_ZLEN);
+
mutex_lock(&mses->lock);
ret = mse102x_tx_pkt_spi(mse, txb, work_timeout);
mutex_unlock(&mses->lock);
if (ret) {
mse->ndev->stats.tx_dropped++;
} else {
- mse->ndev->stats.tx_bytes += txb->len;
+ mse->ndev->stats.tx_bytes += len;
mse->ndev->stats.tx_packets++;
}
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 4309317..3e9957b6 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -78,7 +78,7 @@ struct phylink {
unsigned int pcs_neg_mode;
unsigned int pcs_state;
- bool mac_link_dropped;
+ bool link_failed;
bool using_mac_select_pcs;
struct sfp_bus *sfp_bus;
@@ -1475,9 +1475,9 @@ static void phylink_resolve(struct work_struct *w)
cur_link_state = pl->old_link_state;
if (pl->phylink_disable_state) {
- pl->mac_link_dropped = false;
+ pl->link_failed = false;
link_state.link = false;
- } else if (pl->mac_link_dropped) {
+ } else if (pl->link_failed) {
link_state.link = false;
retrigger = true;
} else {
@@ -1572,7 +1572,7 @@ static void phylink_resolve(struct work_struct *w)
phylink_link_up(pl, link_state);
}
if (!link_state.link && retrigger) {
- pl->mac_link_dropped = false;
+ pl->link_failed = false;
queue_work(system_power_efficient_wq, &pl->resolve);
}
mutex_unlock(&pl->state_mutex);
@@ -1835,6 +1835,8 @@ static void phylink_phy_change(struct phy_device *phydev, bool up)
pl->phy_state.pause |= MLO_PAUSE_RX;
pl->phy_state.interface = phydev->interface;
pl->phy_state.link = up;
+ if (!up)
+ pl->link_failed = true;
mutex_unlock(&pl->state_mutex);
phylink_run_resolve(pl);
@@ -2158,7 +2160,7 @@ EXPORT_SYMBOL_GPL(phylink_disconnect_phy);
static void phylink_link_changed(struct phylink *pl, bool up, const char *what)
{
if (!up)
- pl->mac_link_dropped = true;
+ pl->link_failed = true;
phylink_run_resolve(pl);
phylink_dbg(pl, "%s link %s\n", what, up ? "up" : "down");
}
@@ -2792,7 +2794,7 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl,
* link will cycle.
*/
if (manual_changed) {
- pl->mac_link_dropped = true;
+ pl->link_failed = true;
phylink_run_resolve(pl);
}
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index b1387dc..7cd1102 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -649,7 +649,7 @@ static bool apple_nvme_handle_cq(struct apple_nvme_queue *q, bool force)
found = apple_nvme_poll_cq(q, &iob);
- if (!rq_list_empty(iob.req_list))
+ if (!rq_list_empty(&iob.req_list))
apple_nvme_complete_batch(&iob);
return found;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 855b42c..1a8d32a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -42,6 +42,8 @@ struct nvme_ns_info {
bool is_readonly;
bool is_ready;
bool is_removed;
+ bool is_rotational;
+ bool no_vwc;
};
unsigned int admin_timeout = 60;
@@ -1639,6 +1641,8 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
+ info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
+ info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
}
kfree(id);
return ret;
@@ -2185,11 +2189,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
ns->head->ids.csi == NVME_CSI_ZNS)
nvme_update_zone_info(ns, &lim, &zi);
- if (ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+ if ((ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT) && !info->no_vwc)
lim.features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
else
lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA);
+ if (info->is_rotational)
+ lim.features |= BLK_FEAT_ROTATIONAL;
+
/*
* Register a metadata profile for PI, or the plain non-integrity NVMe
* metadata masquerading as Type 0 if supported, otherwise reject block
@@ -3636,6 +3643,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->ns_id = info->nsid;
head->ids = info->ids;
head->shared = info->is_shared;
+ head->rotational = info->is_rotational;
ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
kref_init(&head->ref);
@@ -4017,7 +4025,7 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns_info info = { .nsid = nsid };
struct nvme_ns *ns;
- int ret;
+ int ret = 1;
if (nvme_identify_ns_descs(ctrl, &info))
return;
@@ -4034,9 +4042,10 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
* set up a namespace. If not fall back to the legacy version.
*/
if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) ||
- (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS))
+ (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS) ||
+ ctrl->vs >= NVME_VS(2, 0, 0))
ret = nvme_ns_info_from_id_cs_indep(ctrl, &info);
- else
+ if (ret > 0)
ret = nvme_ns_info_from_identify(ctrl, &info);
if (info.is_removed)
@@ -4895,7 +4904,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl)
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
srcu_read_lock_held(&ctrl->srcu))
- blk_mq_unfreeze_queue(ns->queue);
+ blk_mq_unfreeze_queue_non_owner(ns->queue);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
}
@@ -4940,7 +4949,12 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
srcu_idx = srcu_read_lock(&ctrl->srcu);
list_for_each_entry_srcu(ns, &ctrl->namespaces, list,
srcu_read_lock_held(&ctrl->srcu))
- blk_freeze_queue_start(ns->queue);
+ /*
+ * Typical non_owner use case is from pci driver, in which
+ * start_freeze is called from timeout work function, but
+ * unfreeze is done in reset work context
+ */
+ blk_freeze_queue_start_non_owner(ns->queue);
srcu_read_unlock(&ctrl->srcu, srcu_idx);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
@@ -5036,6 +5050,8 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_endurance_group_log) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_rotational_media_log) != 512);
BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
BUILD_BUG_ON(sizeof(struct nvme_feat_host_behavior) != 512);
@@ -5044,22 +5060,20 @@ static inline void _nvme_check_size(void)
static int __init nvme_core_init(void)
{
+ unsigned int wq_flags = WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS;
int result = -ENOMEM;
_nvme_check_size();
- nvme_wq = alloc_workqueue("nvme-wq",
- WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ nvme_wq = alloc_workqueue("nvme-wq", wq_flags, 0);
if (!nvme_wq)
goto out;
- nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
- WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ nvme_reset_wq = alloc_workqueue("nvme-reset-wq", wq_flags, 0);
if (!nvme_reset_wq)
goto destroy_wq;
- nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
- WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ nvme_delete_wq = alloc_workqueue("nvme-delete-wq", wq_flags, 0);
if (!nvme_delete_wq)
goto destroy_reset_wq;
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a96976b..6522ae1 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -114,7 +114,7 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
static int nvme_map_user_request(struct request *req, u64 ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
- u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags)
+ struct io_uring_cmd *ioucmd, unsigned int flags)
{
struct request_queue *q = req->q;
struct nvme_ns *ns = q->queuedata;
@@ -152,8 +152,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
bio_set_dev(bio, bdev);
if (has_metadata) {
- ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len,
- meta_seed);
+ ret = blk_rq_integrity_map_user(req, meta_buffer, meta_len);
if (ret)
goto out_unmap;
}
@@ -170,7 +169,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, u64 ubuffer, unsigned bufflen,
- void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
+ void __user *meta_buffer, unsigned meta_len,
u64 *result, unsigned timeout, unsigned int flags)
{
struct nvme_ns *ns = q->queuedata;
@@ -187,7 +186,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
req->timeout = timeout;
if (ubuffer && bufflen) {
ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
- meta_len, meta_seed, NULL, flags);
+ meta_len, NULL, flags);
if (ret)
return ret;
}
@@ -268,7 +267,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
c.rw.lbatm = cpu_to_le16(io.appmask);
return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata,
- meta_len, lower_32_bits(io.slba), NULL, 0, 0);
+ meta_len, NULL, 0, 0);
}
static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
@@ -323,7 +322,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
- cmd.metadata_len, 0, &result, timeout, 0);
+ cmd.metadata_len, &result, timeout, 0);
if (status >= 0) {
if (put_user(result, &ucmd->result))
@@ -370,7 +369,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
cmd.addr, cmd.data_len, nvme_to_user_ptr(cmd.metadata),
- cmd.metadata_len, 0, &cmd.result, timeout, flags);
+ cmd.metadata_len, &cmd.result, timeout, flags);
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
@@ -402,7 +401,7 @@ struct nvme_uring_cmd_pdu {
static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
struct io_uring_cmd *ioucmd)
{
- return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
+ return io_uring_cmd_to_pdu(ioucmd, struct nvme_uring_cmd_pdu);
}
static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
@@ -507,7 +506,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (d.addr && d.data_len) {
ret = nvme_map_user_request(req, d.addr,
d.data_len, nvme_to_user_ptr(d.metadata),
- d.metadata_len, 0, ioucmd, vec);
+ d.metadata_len, ioucmd, vec);
if (ret)
return ret;
}
@@ -635,8 +634,6 @@ static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
struct nvme_ctrl *ctrl = ns->ctrl;
int ret;
- BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
-
ret = nvme_uring_cmd_checks(issue_flags);
if (ret)
return ret;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 6a15873..f04cfe3 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -635,8 +635,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
lim.features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT | BLK_FEAT_POLL;
if (head->ids.csi == NVME_CSI_ZNS)
lim.features |= BLK_FEAT_ZONED;
- else
- lim.max_zone_append_sectors = 0;
head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
if (IS_ERR(head->disk))
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 093cb423..900719c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -474,6 +474,7 @@ struct nvme_ns_head {
struct list_head entry;
struct kref ref;
bool shared;
+ bool rotational;
bool passthru_err_log_enabled;
struct nvme_effects_log *effects;
u64 nuse;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4b9fda0..5f2e3ad 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -141,6 +141,7 @@ struct nvme_dev {
struct nvme_ctrl ctrl;
u32 last_ps;
bool hmb;
+ struct sg_table *hmb_sgt;
mempool_t *iod_mempool;
@@ -153,6 +154,7 @@ struct nvme_dev {
/* host memory buffer support: */
u64 host_mem_size;
u32 nr_host_mem_descs;
+ u32 host_mem_descs_size;
dma_addr_t host_mem_descs_dma;
struct nvme_host_mem_buf_desc *host_mem_descs;
void **host_mem_desc_bufs;
@@ -902,11 +904,12 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK;
}
-static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct request **rqlist)
+static void nvme_submit_cmds(struct nvme_queue *nvmeq, struct rq_list *rqlist)
{
+ struct request *req;
+
spin_lock(&nvmeq->sq_lock);
- while (!rq_list_empty(*rqlist)) {
- struct request *req = rq_list_pop(rqlist);
+ while ((req = rq_list_pop(rqlist))) {
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
nvme_sq_copy_cmd(nvmeq, &iod->cmd);
@@ -929,33 +932,26 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
}
-static void nvme_queue_rqs(struct request **rqlist)
+static void nvme_queue_rqs(struct rq_list *rqlist)
{
- struct request *req, *next, *prev = NULL;
- struct request *requeue_list = NULL;
+ struct rq_list submit_list = { };
+ struct rq_list requeue_list = { };
+ struct nvme_queue *nvmeq = NULL;
+ struct request *req;
- rq_list_for_each_safe(rqlist, req, next) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ while ((req = rq_list_pop(rqlist))) {
+ if (nvmeq && nvmeq != req->mq_hctx->driver_data)
+ nvme_submit_cmds(nvmeq, &submit_list);
+ nvmeq = req->mq_hctx->driver_data;
- if (!nvme_prep_rq_batch(nvmeq, req)) {
- /* detach 'req' and add to remainder list */
- rq_list_move(rqlist, &requeue_list, req, prev);
-
- req = prev;
- if (!req)
- continue;
- }
-
- if (!next || req->mq_hctx != next->mq_hctx) {
- /* detach rest of list, and submit */
- req->rq_next = NULL;
- nvme_submit_cmds(nvmeq, rqlist);
- *rqlist = next;
- prev = NULL;
- } else
- prev = req;
+ if (nvme_prep_rq_batch(nvmeq, req))
+ rq_list_add_tail(&submit_list, req);
+ else
+ rq_list_add_tail(&requeue_list, req);
}
+ if (nvmeq)
+ nvme_submit_cmds(nvmeq, &submit_list);
*rqlist = requeue_list;
}
@@ -1083,7 +1079,7 @@ static irqreturn_t nvme_irq(int irq, void *data)
DEFINE_IO_COMP_BATCH(iob);
if (nvme_poll_cq(nvmeq, &iob)) {
- if (!rq_list_empty(iob.req_list))
+ if (!rq_list_empty(&iob.req_list))
nvme_pci_complete_batch(&iob);
return IRQ_HANDLED;
}
@@ -1951,7 +1947,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
return ret;
}
-static void nvme_free_host_mem(struct nvme_dev *dev)
+static void nvme_free_host_mem_multi(struct nvme_dev *dev)
{
int i;
@@ -1966,18 +1962,54 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
kfree(dev->host_mem_desc_bufs);
dev->host_mem_desc_bufs = NULL;
- dma_free_coherent(dev->dev,
- dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
+}
+
+static void nvme_free_host_mem(struct nvme_dev *dev)
+{
+ if (dev->hmb_sgt)
+ dma_free_noncontiguous(dev->dev, dev->host_mem_size,
+ dev->hmb_sgt, DMA_BIDIRECTIONAL);
+ else
+ nvme_free_host_mem_multi(dev);
+
+ dma_free_coherent(dev->dev, dev->host_mem_descs_size,
dev->host_mem_descs, dev->host_mem_descs_dma);
dev->host_mem_descs = NULL;
+ dev->host_mem_descs_size = 0;
dev->nr_host_mem_descs = 0;
}
-static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
+static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
+{
+ dev->hmb_sgt = dma_alloc_noncontiguous(dev->dev, size,
+ DMA_BIDIRECTIONAL, GFP_KERNEL, 0);
+ if (!dev->hmb_sgt)
+ return -ENOMEM;
+
+ dev->host_mem_descs = dma_alloc_coherent(dev->dev,
+ sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma,
+ GFP_KERNEL);
+ if (!dev->host_mem_descs) {
+ dma_free_noncontiguous(dev->dev, dev->host_mem_size,
+ dev->hmb_sgt, DMA_BIDIRECTIONAL);
+ dev->hmb_sgt = NULL;
+ return -ENOMEM;
+ }
+ dev->host_mem_size = size;
+ dev->host_mem_descs_size = sizeof(*dev->host_mem_descs);
+ dev->nr_host_mem_descs = 1;
+
+ dev->host_mem_descs[0].addr =
+ cpu_to_le64(dev->hmb_sgt->sgl->dma_address);
+ dev->host_mem_descs[0].size = cpu_to_le32(size / NVME_CTRL_PAGE_SIZE);
+ return 0;
+}
+
+static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
u32 chunk_size)
{
struct nvme_host_mem_buf_desc *descs;
- u32 max_entries, len;
+ u32 max_entries, len, descs_size;
dma_addr_t descs_dma;
int i = 0;
void **bufs;
@@ -1990,8 +2022,9 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
max_entries = dev->ctrl.hmmaxd;
- descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
- &descs_dma, GFP_KERNEL);
+ descs_size = max_entries * sizeof(*descs);
+ descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma,
+ GFP_KERNEL);
if (!descs)
goto out;
@@ -2020,6 +2053,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
dev->host_mem_size = size;
dev->host_mem_descs = descs;
dev->host_mem_descs_dma = descs_dma;
+ dev->host_mem_descs_size = descs_size;
dev->host_mem_desc_bufs = bufs;
return 0;
@@ -2034,8 +2068,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
kfree(bufs);
out_free_descs:
- dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
- descs_dma);
+ dma_free_coherent(dev->dev, descs_size, descs, descs_dma);
out:
dev->host_mem_descs = NULL;
return -ENOMEM;
@@ -2047,9 +2080,18 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
u64 chunk_size;
+ /*
+ * If there is an IOMMU that can merge pages, try a virtually
+ * non-contiguous allocation for a single segment first.
+ */
+ if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
+ if (!nvme_alloc_host_mem_single(dev, preferred))
+ return 0;
+ }
+
/* start big and work our way down */
for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
- if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
+ if (!nvme_alloc_host_mem_multi(dev, preferred, chunk_size)) {
if (!min || dev->host_mem_size >= min)
return 0;
nvme_free_host_mem(dev);
@@ -2097,8 +2139,10 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
}
dev_info(dev->ctrl.device,
- "allocated %lld MiB host memory buffer.\n",
- dev->host_mem_size >> ilog2(SZ_1M));
+ "allocated %lld MiB host memory buffer (%u segment%s).\n",
+ dev->host_mem_size >> ilog2(SZ_1M),
+ dev->nr_host_mem_descs,
+ str_plural(dev->nr_host_mem_descs));
}
ret = nvme_set_host_mem(dev, enable_bits);
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 87c437f..ad25ad1 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -228,27 +228,61 @@ static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10)
static const char *nvme_trace_resv_reg(struct trace_seq *p, u8 *cdw10)
{
+ static const char * const rrega_strs[] = {
+ [0x00] = "register",
+ [0x01] = "unregister",
+ [0x02] = "replace",
+ };
const char *ret = trace_seq_buffer_ptr(p);
u8 rrega = cdw10[0] & 0x7;
u8 iekey = (cdw10[0] >> 3) & 0x1;
u8 ptpl = (cdw10[3] >> 6) & 0x3;
+ const char *rrega_str;
- trace_seq_printf(p, "rrega=%u, iekey=%u, ptpl=%u",
- rrega, iekey, ptpl);
+ if (rrega < ARRAY_SIZE(rrega_strs) && rrega_strs[rrega])
+ rrega_str = rrega_strs[rrega];
+ else
+ rrega_str = "reserved";
+
+ trace_seq_printf(p, "rrega=%u:%s, iekey=%u, ptpl=%u",
+ rrega, rrega_str, iekey, ptpl);
trace_seq_putc(p, 0);
return ret;
}
+static const char * const rtype_strs[] = {
+ [0x00] = "reserved",
+ [0x01] = "write exclusive",
+ [0x02] = "exclusive access",
+ [0x03] = "write exclusive registrants only",
+ [0x04] = "exclusive access registrants only",
+ [0x05] = "write exclusive all registrants",
+ [0x06] = "exclusive access all registrants",
+};
+
static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10)
{
+ static const char * const racqa_strs[] = {
+ [0x00] = "acquire",
+ [0x01] = "preempt",
+ [0x02] = "preempt and abort",
+ };
const char *ret = trace_seq_buffer_ptr(p);
u8 racqa = cdw10[0] & 0x7;
u8 iekey = (cdw10[0] >> 3) & 0x1;
u8 rtype = cdw10[1];
+ const char *racqa_str = "reserved";
+ const char *rtype_str = "reserved";
- trace_seq_printf(p, "racqa=%u, iekey=%u, rtype=%u",
- racqa, iekey, rtype);
+ if (racqa < ARRAY_SIZE(racqa_strs) && racqa_strs[racqa])
+ racqa_str = racqa_strs[racqa];
+
+ if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype])
+ rtype_str = rtype_strs[rtype];
+
+ trace_seq_printf(p, "racqa=%u:%s, iekey=%u, rtype=%u:%s",
+ racqa, racqa_str, iekey, rtype, rtype_str);
trace_seq_putc(p, 0);
return ret;
@@ -256,13 +290,25 @@ static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10)
static const char *nvme_trace_resv_rel(struct trace_seq *p, u8 *cdw10)
{
+ static const char * const rrela_strs[] = {
+ [0x00] = "release",
+ [0x01] = "clear",
+ };
const char *ret = trace_seq_buffer_ptr(p);
u8 rrela = cdw10[0] & 0x7;
u8 iekey = (cdw10[0] >> 3) & 0x1;
u8 rtype = cdw10[1];
+ const char *rrela_str = "reserved";
+ const char *rtype_str = "reserved";
- trace_seq_printf(p, "rrela=%u, iekey=%u, rtype=%u",
- rrela, iekey, rtype);
+ if (rrela < ARRAY_SIZE(rrela_strs) && rrela_strs[rrela])
+ rrela_str = rrela_strs[rrela];
+
+ if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype])
+ rtype_str = rtype_strs[rtype];
+
+ trace_seq_printf(p, "rrela=%u:%s, iekey=%u, rtype=%u:%s",
+ rrela, rrela_str, iekey, rtype, rtype_str);
trace_seq_putc(p, 0);
return ret;
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 9a06f9d..382949e 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -111,7 +111,7 @@ void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
lim->features |= BLK_FEAT_ZONED;
lim->max_open_zones = zi->max_open_zones;
lim->max_active_zones = zi->max_active_zones;
- lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
+ lim->max_hw_zone_append_sectors = ns->ctrl->max_zone_append;
lim->chunk_sectors = ns->head->zsze =
nvme_lba_to_sect(ns->head, zi->zone_size);
}
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index c402c44..f2b025b 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -10,7 +10,7 @@
obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o
nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
- discovery.o io-cmd-file.o io-cmd-bdev.o
+ discovery.o io-cmd-file.o io-cmd-bdev.o pr.o
nvmet-$(CONFIG_NVME_TARGET_DEBUGFS) += debugfs.o
nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o
nvmet-$(CONFIG_BLK_DEV_ZONED) += zns.o
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 081f047..934b401 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -71,6 +71,35 @@ static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
nvmet_req_complete(req, 0);
}
+static void nvmet_execute_get_supported_log_pages(struct nvmet_req *req)
+{
+ struct nvme_supported_log *logs;
+ u16 status;
+
+ logs = kzalloc(sizeof(*logs), GFP_KERNEL);
+ if (!logs) {
+ status = NVME_SC_INTERNAL;
+ goto out;
+ }
+
+ logs->lids[NVME_LOG_SUPPORTED] = cpu_to_le32(NVME_LIDS_LSUPP);
+ logs->lids[NVME_LOG_ERROR] = cpu_to_le32(NVME_LIDS_LSUPP);
+ logs->lids[NVME_LOG_SMART] = cpu_to_le32(NVME_LIDS_LSUPP);
+ logs->lids[NVME_LOG_FW_SLOT] = cpu_to_le32(NVME_LIDS_LSUPP);
+ logs->lids[NVME_LOG_CHANGED_NS] = cpu_to_le32(NVME_LIDS_LSUPP);
+ logs->lids[NVME_LOG_CMD_EFFECTS] = cpu_to_le32(NVME_LIDS_LSUPP);
+ logs->lids[NVME_LOG_ENDURANCE_GROUP] = cpu_to_le32(NVME_LIDS_LSUPP);
+ logs->lids[NVME_LOG_ANA] = cpu_to_le32(NVME_LIDS_LSUPP);
+ logs->lids[NVME_LOG_FEATURES] = cpu_to_le32(NVME_LIDS_LSUPP);
+ logs->lids[NVME_LOG_RMI] = cpu_to_le32(NVME_LIDS_LSUPP);
+ logs->lids[NVME_LOG_RESERVATION] = cpu_to_le32(NVME_LIDS_LSUPP);
+
+ status = nvmet_copy_to_sgl(req, 0, logs, sizeof(*logs));
+ kfree(logs);
+out:
+ nvmet_req_complete(req, status);
+}
+
static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
struct nvme_smart_log *slog)
{
@@ -130,6 +159,45 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
return NVME_SC_SUCCESS;
}
+static void nvmet_execute_get_log_page_rmi(struct nvmet_req *req)
+{
+ struct nvme_rotational_media_log *log;
+ struct gendisk *disk;
+ u16 status;
+
+ req->cmd->common.nsid = cpu_to_le32(le16_to_cpu(
+ req->cmd->get_log_page.lsi));
+ status = nvmet_req_find_ns(req);
+ if (status)
+ goto out;
+
+ if (!req->ns->bdev || bdev_nonrot(req->ns->bdev)) {
+ status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+ goto out;
+ }
+
+ if (req->transfer_len != sizeof(*log)) {
+ status = NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR;
+ goto out;
+ }
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL);
+ if (!log)
+ goto out;
+
+ log->endgid = req->cmd->get_log_page.lsi;
+ disk = req->ns->bdev->bd_disk;
+ if (disk && disk->ia_ranges)
+ log->numa = cpu_to_le16(disk->ia_ranges->nr_ia_ranges);
+ else
+ log->numa = cpu_to_le16(1);
+
+ status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
+ kfree(log);
+out:
+ nvmet_req_complete(req, status);
+}
+
static void nvmet_execute_get_log_page_smart(struct nvmet_req *req)
{
struct nvme_smart_log *log;
@@ -176,6 +244,10 @@ static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log)
log->iocs[nvme_cmd_read] =
log->iocs[nvme_cmd_flush] =
log->iocs[nvme_cmd_dsm] =
+ log->iocs[nvme_cmd_resv_acquire] =
+ log->iocs[nvme_cmd_resv_register] =
+ log->iocs[nvme_cmd_resv_release] =
+ log->iocs[nvme_cmd_resv_report] =
cpu_to_le32(NVME_CMD_EFFECTS_CSUPP);
log->iocs[nvme_cmd_write] =
log->iocs[nvme_cmd_write_zeroes] =
@@ -272,6 +344,49 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
return struct_size(desc, nsids, count);
}
+static void nvmet_execute_get_log_page_endgrp(struct nvmet_req *req)
+{
+ u64 host_reads, host_writes, data_units_read, data_units_written;
+ struct nvme_endurance_group_log *log;
+ u16 status;
+
+ /*
+ * The target driver emulates each endurance group as its own
+ * namespace, reusing the nsid as the endurance group identifier.
+ */
+ req->cmd->common.nsid = cpu_to_le32(le16_to_cpu(
+ req->cmd->get_log_page.lsi));
+ status = nvmet_req_find_ns(req);
+ if (status)
+ goto out;
+
+ log = kzalloc(sizeof(*log), GFP_KERNEL);
+ if (!log) {
+ status = NVME_SC_INTERNAL;
+ goto out;
+ }
+
+ if (!req->ns->bdev)
+ goto copy;
+
+ host_reads = part_stat_read(req->ns->bdev, ios[READ]);
+ data_units_read =
+ DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[READ]), 1000);
+ host_writes = part_stat_read(req->ns->bdev, ios[WRITE]);
+ data_units_written =
+ DIV_ROUND_UP(part_stat_read(req->ns->bdev, sectors[WRITE]), 1000);
+
+ put_unaligned_le64(host_reads, &log->hrc[0]);
+ put_unaligned_le64(data_units_read, &log->dur[0]);
+ put_unaligned_le64(host_writes, &log->hwc[0]);
+ put_unaligned_le64(data_units_written, &log->duw[0]);
+copy:
+ status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
+ kfree(log);
+out:
+ nvmet_req_complete(req, status);
+}
+
static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
{
struct nvme_ana_rsp_hdr hdr = { 0, };
@@ -317,12 +432,44 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
nvmet_req_complete(req, status);
}
+static void nvmet_execute_get_log_page_features(struct nvmet_req *req)
+{
+ struct nvme_supported_features_log *features;
+ u16 status;
+
+ features = kzalloc(sizeof(*features), GFP_KERNEL);
+ if (!features) {
+ status = NVME_SC_INTERNAL;
+ goto out;
+ }
+
+ features->fis[NVME_FEAT_NUM_QUEUES] =
+ cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE);
+ features->fis[NVME_FEAT_KATO] =
+ cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE);
+ features->fis[NVME_FEAT_ASYNC_EVENT] =
+ cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE);
+ features->fis[NVME_FEAT_HOST_ID] =
+ cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_CSCPE);
+ features->fis[NVME_FEAT_WRITE_PROTECT] =
+ cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_NSCPE);
+ features->fis[NVME_FEAT_RESV_MASK] =
+ cpu_to_le32(NVME_FIS_FSUPP | NVME_FIS_NSCPE);
+
+ status = nvmet_copy_to_sgl(req, 0, features, sizeof(*features));
+ kfree(features);
+out:
+ nvmet_req_complete(req, status);
+}
+
static void nvmet_execute_get_log_page(struct nvmet_req *req)
{
if (!nvmet_check_transfer_len(req, nvmet_get_log_page_len(req->cmd)))
return;
switch (req->cmd->get_log_page.lid) {
+ case NVME_LOG_SUPPORTED:
+ return nvmet_execute_get_supported_log_pages(req);
case NVME_LOG_ERROR:
return nvmet_execute_get_log_page_error(req);
case NVME_LOG_SMART:
@@ -338,8 +485,16 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
return nvmet_execute_get_log_changed_ns(req);
case NVME_LOG_CMD_EFFECTS:
return nvmet_execute_get_log_cmd_effects_ns(req);
+ case NVME_LOG_ENDURANCE_GROUP:
+ return nvmet_execute_get_log_page_endgrp(req);
case NVME_LOG_ANA:
return nvmet_execute_get_log_page_ana(req);
+ case NVME_LOG_FEATURES:
+ return nvmet_execute_get_log_page_features(req);
+ case NVME_LOG_RMI:
+ return nvmet_execute_get_log_page_rmi(req);
+ case NVME_LOG_RESERVATION:
+ return nvmet_execute_get_log_page_resv(req);
}
pr_debug("unhandled lid %d on qid %d\n",
req->cmd->get_log_page.lid, req->sq->qid);
@@ -433,7 +588,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES);
id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);
id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM |
- NVME_CTRL_ONCS_WRITE_ZEROES);
+ NVME_CTRL_ONCS_WRITE_ZEROES |
+ NVME_CTRL_ONCS_RESERVATIONS);
/* XXX: don't report vwc if the underlying device is write through */
id->vwc = NVME_CTRL_VWC_PRESENT;
@@ -467,6 +623,13 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->msdbd = ctrl->ops->msdbd;
+ /*
+ * Endurance group identifier is 16 bits, so we can't let namespaces
+ * overflow that since we reuse the nsid
+ */
+ BUILD_BUG_ON(NVMET_MAX_NAMESPACES > USHRT_MAX);
+ id->endgidmax = cpu_to_le16(NVMET_MAX_NAMESPACES);
+
id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4);
id->anatt = 10; /* random value */
id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS);
@@ -551,6 +714,21 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
id->nmic = NVME_NS_NMIC_SHARED;
id->anagrpid = cpu_to_le32(req->ns->anagrpid);
+ if (req->ns->pr.enable)
+ id->rescap = NVME_PR_SUPPORT_WRITE_EXCLUSIVE |
+ NVME_PR_SUPPORT_EXCLUSIVE_ACCESS |
+ NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY |
+ NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY |
+ NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS |
+ NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS |
+ NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF;
+
+ /*
+ * Since we don't know any better, every namespace is its own endurance
+ * group.
+ */
+ id->endgid = cpu_to_le16(req->ns->nsid);
+
memcpy(&id->nguid, &req->ns->nguid, sizeof(id->nguid));
id->lbaf[0].ds = req->ns->blksize_shift;
@@ -576,7 +754,40 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
nvmet_req_complete(req, status);
}
-static void nvmet_execute_identify_nslist(struct nvmet_req *req)
+static void nvmet_execute_identify_endgrp_list(struct nvmet_req *req)
+{
+ u16 min_endgid = le16_to_cpu(req->cmd->identify.cnssid);
+ static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_ns *ns;
+ unsigned long idx;
+ __le16 *list;
+ u16 status;
+ int i = 1;
+
+ list = kzalloc(buf_size, GFP_KERNEL);
+ if (!list) {
+ status = NVME_SC_INTERNAL;
+ goto out;
+ }
+
+ xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+ if (ns->nsid <= min_endgid)
+ continue;
+
+ list[i++] = cpu_to_le16(ns->nsid);
+ if (i == buf_size / sizeof(__le16))
+ break;
+ }
+
+ list[0] = cpu_to_le16(i - 1);
+ status = nvmet_copy_to_sgl(req, 0, list, buf_size);
+ kfree(list);
+out:
+ nvmet_req_complete(req, status);
+}
+
+static void nvmet_execute_identify_nslist(struct nvmet_req *req, bool match_css)
{
static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -606,6 +817,8 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
if (ns->nsid <= min_nsid)
continue;
+ if (match_css && req->ns->csi != req->cmd->identify.csi)
+ continue;
list[i++] = cpu_to_le32(ns->nsid);
if (i == buf_size / sizeof(__le32))
break;
@@ -685,6 +898,56 @@ static void nvmet_execute_identify_ctrl_nvm(struct nvmet_req *req)
nvmet_zero_sgl(req, 0, sizeof(struct nvme_id_ctrl_nvm)));
}
+static void nvme_execute_identify_ns_nvm(struct nvmet_req *req)
+{
+ u16 status;
+
+ status = nvmet_req_find_ns(req);
+ if (status)
+ goto out;
+
+ status = nvmet_copy_to_sgl(req, 0, ZERO_PAGE(0),
+ NVME_IDENTIFY_DATA_SIZE);
+out:
+ nvmet_req_complete(req, status);
+}
+
+static void nvmet_execute_id_cs_indep(struct nvmet_req *req)
+{
+ struct nvme_id_ns_cs_indep *id;
+ u16 status;
+
+ status = nvmet_req_find_ns(req);
+ if (status)
+ goto out;
+
+ id = kzalloc(sizeof(*id), GFP_KERNEL);
+ if (!id) {
+ status = NVME_SC_INTERNAL;
+ goto out;
+ }
+
+ id->nstat = NVME_NSTAT_NRDY;
+ id->anagrpid = cpu_to_le32(req->ns->anagrpid);
+ id->nmic = NVME_NS_NMIC_SHARED;
+ if (req->ns->readonly)
+ id->nsattr |= NVME_NS_ATTR_RO;
+ if (req->ns->bdev && !bdev_nonrot(req->ns->bdev))
+ id->nsfeat |= NVME_NS_ROTATIONAL;
+ /*
+ * We need flush command to flush the file's metadata,
+ * so report supporting vwc if backend is file, even
+ * though buffered_io is disable.
+ */
+ if (req->ns->bdev && !bdev_write_cache(req->ns->bdev))
+ id->nsfeat |= NVME_NS_VWC_NOT_PRESENT;
+
+ status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
+ kfree(id);
+out:
+ nvmet_req_complete(req, status);
+}
+
static void nvmet_execute_identify(struct nvmet_req *req)
{
if (!nvmet_check_transfer_len(req, NVME_IDENTIFY_DATA_SIZE))
@@ -698,7 +961,7 @@ static void nvmet_execute_identify(struct nvmet_req *req)
nvmet_execute_identify_ctrl(req);
return;
case NVME_ID_CNS_NS_ACTIVE_LIST:
- nvmet_execute_identify_nslist(req);
+ nvmet_execute_identify_nslist(req, false);
return;
case NVME_ID_CNS_NS_DESC_LIST:
nvmet_execute_identify_desclist(req);
@@ -706,8 +969,8 @@ static void nvmet_execute_identify(struct nvmet_req *req)
case NVME_ID_CNS_CS_NS:
switch (req->cmd->identify.csi) {
case NVME_CSI_NVM:
- /* Not supported */
- break;
+ nvme_execute_identify_ns_nvm(req);
+ return;
case NVME_CSI_ZNS:
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
nvmet_execute_identify_ns_zns(req);
@@ -729,6 +992,15 @@ static void nvmet_execute_identify(struct nvmet_req *req)
break;
}
break;
+ case NVME_ID_CNS_NS_ACTIVE_LIST_CS:
+ nvmet_execute_identify_nslist(req, true);
+ return;
+ case NVME_ID_CNS_NS_CS_INDEP:
+ nvmet_execute_id_cs_indep(req);
+ return;
+ case NVME_ID_CNS_ENDGRP_LIST:
+ nvmet_execute_identify_endgrp_list(req);
+ return;
}
pr_debug("unhandled identify cns %d on qid %d\n",
@@ -861,6 +1133,9 @@ void nvmet_execute_set_features(struct nvmet_req *req)
case NVME_FEAT_WRITE_PROTECT:
status = nvmet_set_feat_write_protect(req);
break;
+ case NVME_FEAT_RESV_MASK:
+ status = nvmet_set_feat_resv_notif_mask(req, cdw11);
+ break;
default:
req->error_loc = offsetof(struct nvme_common_command, cdw10);
status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
@@ -959,6 +1234,9 @@ void nvmet_execute_get_features(struct nvmet_req *req)
case NVME_FEAT_WRITE_PROTECT:
status = nvmet_get_feat_write_protect(req);
break;
+ case NVME_FEAT_RESV_MASK:
+ status = nvmet_get_feat_resv_notif_mask(req);
+ break;
default:
req->error_loc =
offsetof(struct nvme_common_command, cdw10);
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 685e89b..eeee9e9 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -769,6 +769,32 @@ static ssize_t nvmet_ns_revalidate_size_store(struct config_item *item,
CONFIGFS_ATTR_WO(nvmet_ns_, revalidate_size);
+static ssize_t nvmet_ns_resv_enable_show(struct config_item *item, char *page)
+{
+ return sysfs_emit(page, "%d\n", to_nvmet_ns(item)->pr.enable);
+}
+
+static ssize_t nvmet_ns_resv_enable_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_ns *ns = to_nvmet_ns(item);
+ bool val;
+
+ if (kstrtobool(page, &val))
+ return -EINVAL;
+
+ mutex_lock(&ns->subsys->lock);
+ if (ns->enabled) {
+ pr_err("the ns:%d is already enabled.\n", ns->nsid);
+ mutex_unlock(&ns->subsys->lock);
+ return -EINVAL;
+ }
+ ns->pr.enable = val;
+ mutex_unlock(&ns->subsys->lock);
+ return count;
+}
+CONFIGFS_ATTR(nvmet_ns_, resv_enable);
+
static struct configfs_attribute *nvmet_ns_attrs[] = {
&nvmet_ns_attr_device_path,
&nvmet_ns_attr_device_nguid,
@@ -777,6 +803,7 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
&nvmet_ns_attr_enable,
&nvmet_ns_attr_buffered_io,
&nvmet_ns_attr_revalidate_size,
+ &nvmet_ns_attr_resv_enable,
#ifdef CONFIG_PCI_P2PDMA
&nvmet_ns_attr_p2pmem,
#endif
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index ed2424f..1f4e998 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -611,6 +611,12 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
if (ret)
goto out_restore_subsys_maxnsid;
+ if (ns->pr.enable) {
+ ret = nvmet_pr_init_ns(ns);
+ if (ret)
+ goto out_remove_from_subsys;
+ }
+
subsys->nr_namespaces++;
nvmet_ns_changed(subsys, ns->nsid);
@@ -620,6 +626,8 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
mutex_unlock(&subsys->lock);
return ret;
+out_remove_from_subsys:
+ xa_erase(&subsys->namespaces, ns->nsid);
out_restore_subsys_maxnsid:
subsys->max_nsid = nvmet_max_nsid(subsys);
percpu_ref_exit(&ns->ref);
@@ -663,6 +671,9 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
wait_for_completion(&ns->disable_done);
percpu_ref_exit(&ns->ref);
+ if (ns->pr.enable)
+ nvmet_pr_exit_ns(ns);
+
mutex_lock(&subsys->lock);
subsys->nr_namespaces--;
@@ -754,6 +765,7 @@ static void nvmet_set_error(struct nvmet_req *req, u16 status)
static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
{
struct nvmet_ns *ns = req->ns;
+ struct nvmet_pr_per_ctrl_ref *pc_ref = req->pc_ref;
if (!req->sq->sqhd_disabled)
nvmet_update_sq_head(req);
@@ -766,6 +778,9 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
trace_nvmet_req_complete(req);
req->ops->queue_response(req);
+
+ if (pc_ref)
+ nvmet_pr_put_ns_pc_ref(pc_ref);
if (ns)
nvmet_put_namespace(ns);
}
@@ -929,18 +944,39 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
return ret;
}
+ if (req->ns->pr.enable) {
+ ret = nvmet_parse_pr_cmd(req);
+ if (!ret)
+ return ret;
+ }
+
switch (req->ns->csi) {
case NVME_CSI_NVM:
if (req->ns->file)
- return nvmet_file_parse_io_cmd(req);
- return nvmet_bdev_parse_io_cmd(req);
+ ret = nvmet_file_parse_io_cmd(req);
+ else
+ ret = nvmet_bdev_parse_io_cmd(req);
+ break;
case NVME_CSI_ZNS:
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
- return nvmet_bdev_zns_parse_io_cmd(req);
- return NVME_SC_INVALID_IO_CMD_SET;
+ ret = nvmet_bdev_zns_parse_io_cmd(req);
+ else
+ ret = NVME_SC_INVALID_IO_CMD_SET;
+ break;
default:
- return NVME_SC_INVALID_IO_CMD_SET;
+ ret = NVME_SC_INVALID_IO_CMD_SET;
}
+ if (ret)
+ return ret;
+
+ if (req->ns->pr.enable) {
+ ret = nvmet_pr_check_cmd_access(req);
+ if (ret)
+ return ret;
+
+ ret = nvmet_pr_get_ns_pc_ref(req);
+ }
+ return ret;
}
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
@@ -964,6 +1000,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
req->ns = NULL;
req->error_loc = NVMET_NO_ERROR_LOC;
req->error_slba = 0;
+ req->pc_ref = NULL;
/* no support for fused commands yet */
if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
@@ -1015,6 +1052,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init);
void nvmet_req_uninit(struct nvmet_req *req)
{
percpu_ref_put(&req->sq->ref);
+ if (req->pc_ref)
+ nvmet_pr_put_ns_pc_ref(req->pc_ref);
if (req->ns)
nvmet_put_namespace(req->ns);
}
@@ -1383,7 +1422,8 @@ static void nvmet_fatal_error_handler(struct work_struct *work)
}
u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
- struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp)
+ struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp,
+ uuid_t *hostid)
{
struct nvmet_subsys *subsys;
struct nvmet_ctrl *ctrl;
@@ -1462,6 +1502,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
}
ctrl->cntlid = ret;
+ uuid_copy(&ctrl->hostid, hostid);
+
/*
* Discovery controllers may use some arbitrary high value
* in order to cleanup stale discovery sessions
@@ -1478,6 +1520,9 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
nvmet_start_keep_alive_timer(ctrl);
mutex_lock(&subsys->lock);
+ ret = nvmet_ctrl_init_pr(ctrl);
+ if (ret)
+ goto init_pr_fail;
list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
nvmet_setup_p2p_ns_map(ctrl, req);
nvmet_debugfs_ctrl_setup(ctrl);
@@ -1486,6 +1531,10 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
*ctrlp = ctrl;
return 0;
+init_pr_fail:
+ mutex_unlock(&subsys->lock);
+ nvmet_stop_keep_alive_timer(ctrl);
+ ida_free(&cntlid_ida, ctrl->cntlid);
out_free_sqs:
kfree(ctrl->sqs);
out_free_changed_ns_list:
@@ -1504,6 +1553,7 @@ static void nvmet_ctrl_free(struct kref *ref)
struct nvmet_subsys *subsys = ctrl->subsys;
mutex_lock(&subsys->lock);
+ nvmet_ctrl_destroy_pr(ctrl);
nvmet_release_p2p_ns_map(ctrl);
list_del(&ctrl->subsys_entry);
mutex_unlock(&subsys->lock);
@@ -1717,7 +1767,7 @@ static int __init nvmet_init(void)
goto out_free_zbd_work_queue;
nvmet_wq = alloc_workqueue("nvmet-wq",
- WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
+ WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0);
if (!nvmet_wq)
goto out_free_buffered_work_queue;
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index c4b2edd..c49904e 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -64,6 +64,9 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
case NVME_REG_CSTS:
val = ctrl->csts;
break;
+ case NVME_REG_CRTO:
+ val = NVME_CAP_TIMEOUT(ctrl->csts);
+ break;
default:
status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
break;
@@ -245,12 +248,10 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
d->subsysnqn[NVMF_NQN_FIELD_LEN - 1] = '\0';
d->hostnqn[NVMF_NQN_FIELD_LEN - 1] = '\0';
status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
- le32_to_cpu(c->kato), &ctrl);
+ le32_to_cpu(c->kato), &ctrl, &d->hostid);
if (status)
goto out;
- uuid_copy(&ctrl->hostid, &d->hostid);
-
dhchap_status = nvmet_setup_auth(ctrl);
if (dhchap_status) {
pr_err("Failed to setup authentication, dhchap status %u\n",
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 190f55e..58328b3 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -20,8 +20,9 @@
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/t10-pi.h>
+#include <linux/kfifo.h>
-#define NVMET_DEFAULT_VS NVME_VS(1, 3, 0)
+#define NVMET_DEFAULT_VS NVME_VS(2, 1, 0)
#define NVMET_ASYNC_EVENTS 4
#define NVMET_ERROR_LOG_SLOTS 128
@@ -30,6 +31,7 @@
#define NVMET_MN_MAX_SIZE 40
#define NVMET_SN_MAX_SIZE 20
#define NVMET_FR_MAX_SIZE 8
+#define NVMET_PR_LOG_QUEUE_SIZE 64
/*
* Supported optional AENs:
@@ -56,6 +58,38 @@
#define IPO_IATTR_CONNECT_SQE(x) \
(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
+struct nvmet_pr_registrant {
+ u64 rkey;
+ uuid_t hostid;
+ enum nvme_pr_type rtype;
+ struct list_head entry;
+ struct rcu_head rcu;
+};
+
+struct nvmet_pr {
+ bool enable;
+ unsigned long notify_mask;
+ atomic_t generation;
+ struct nvmet_pr_registrant __rcu *holder;
+ /*
+ * During the execution of the reservation command, mutual
+ * exclusion is required throughout the process. However,
+ * while waiting asynchronously for the 'per controller
+ * percpu_ref' to complete before the 'preempt and abort'
+ * command finishes, a semaphore is needed to ensure mutual
+ * exclusion instead of a mutex.
+ */
+ struct semaphore pr_sem;
+ struct list_head registrant_list;
+};
+
+struct nvmet_pr_per_ctrl_ref {
+ struct percpu_ref ref;
+ struct completion free_done;
+ struct completion confirm_done;
+ uuid_t hostid;
+};
+
struct nvmet_ns {
struct percpu_ref ref;
struct file *bdev_file;
@@ -85,6 +119,8 @@ struct nvmet_ns {
int pi_type;
int metadata_size;
u8 csi;
+ struct nvmet_pr pr;
+ struct xarray pr_per_ctrl_refs;
};
static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item)
@@ -191,6 +227,13 @@ static inline bool nvmet_port_secure_channel_required(struct nvmet_port *port)
return nvmet_port_disc_addr_treq_secure_channel(port) == NVMF_TREQ_REQUIRED;
}
+struct nvmet_pr_log_mgr {
+ struct mutex lock;
+ u64 lost_count;
+ u64 counter;
+ DECLARE_KFIFO(log_queue, struct nvme_pr_log, NVMET_PR_LOG_QUEUE_SIZE);
+};
+
struct nvmet_ctrl {
struct nvmet_subsys *subsys;
struct nvmet_sq **sqs;
@@ -246,6 +289,7 @@ struct nvmet_ctrl {
u8 *dh_key;
size_t dh_keysize;
#endif
+ struct nvmet_pr_log_mgr pr_log_mgr;
};
struct nvmet_subsys {
@@ -396,6 +440,9 @@ struct nvmet_req {
struct work_struct zmgmt_work;
} z;
#endif /* CONFIG_BLK_DEV_ZONED */
+ struct {
+ struct work_struct abort_work;
+ } r;
};
int sg_cnt;
int metadata_sg_cnt;
@@ -412,6 +459,7 @@ struct nvmet_req {
struct device *p2p_client;
u16 error_loc;
u64 error_slba;
+ struct nvmet_pr_per_ctrl_ref *pc_ref;
};
#define NVMET_MAX_MPOOL_BVEC 16
@@ -498,7 +546,8 @@ void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl);
void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new);
u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
- struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp);
+ struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp,
+ uuid_t *hostid);
struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn,
const char *hostnqn, u16 cntlid,
struct nvmet_req *req);
@@ -761,4 +810,18 @@ static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl)
static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; }
#endif
+int nvmet_pr_init_ns(struct nvmet_ns *ns);
+u16 nvmet_parse_pr_cmd(struct nvmet_req *req);
+u16 nvmet_pr_check_cmd_access(struct nvmet_req *req);
+int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl);
+void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl);
+void nvmet_pr_exit_ns(struct nvmet_ns *ns);
+void nvmet_execute_get_log_page_resv(struct nvmet_req *req);
+u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask);
+u16 nvmet_get_feat_resv_notif_mask(struct nvmet_req *req);
+u16 nvmet_pr_get_ns_pc_ref(struct nvmet_req *req);
+static inline void nvmet_pr_put_ns_pc_ref(struct nvmet_pr_per_ctrl_ref *pc_ref)
+{
+ percpu_ref_put(&pc_ref->ref);
+}
#endif /* _NVMET_H */
diff --git a/drivers/nvme/target/pr.c b/drivers/nvme/target/pr.c
new file mode 100644
index 0000000..25a02b5
--- /dev/null
+++ b/drivers/nvme/target/pr.c
@@ -0,0 +1,1156 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics Persist Reservation.
+ * Copyright (c) 2024 Guixin Liu, Alibaba Group.
+ * All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/unaligned.h>
+#include "nvmet.h"
+
+#define NVMET_PR_NOTIFI_MASK_ALL \
+ (1 << NVME_PR_NOTIFY_BIT_REG_PREEMPTED | \
+ 1 << NVME_PR_NOTIFY_BIT_RESV_RELEASED | \
+ 1 << NVME_PR_NOTIFY_BIT_RESV_PREEMPTED)
+
+static inline bool nvmet_pr_parse_ignore_key(u32 cdw10)
+{
+ /* Ignore existing key, bit 03. */
+ return (cdw10 >> 3) & 1;
+}
+
+static inline struct nvmet_ns *nvmet_pr_to_ns(struct nvmet_pr *pr)
+{
+ return container_of(pr, struct nvmet_ns, pr);
+}
+
+static struct nvmet_pr_registrant *
+nvmet_pr_find_registrant(struct nvmet_pr *pr, uuid_t *hostid)
+{
+ struct nvmet_pr_registrant *reg;
+
+ list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+ if (uuid_equal(®->hostid, hostid))
+ return reg;
+ }
+ return NULL;
+}
+
+u16 nvmet_set_feat_resv_notif_mask(struct nvmet_req *req, u32 mask)
+{
+ u32 nsid = le32_to_cpu(req->cmd->common.nsid);
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_ns *ns;
+ unsigned long idx;
+ u16 status;
+
+ if (mask & ~(NVMET_PR_NOTIFI_MASK_ALL)) {
+ req->error_loc = offsetof(struct nvme_common_command, cdw11);
+ return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+ }
+
+ if (nsid != U32_MAX) {
+ status = nvmet_req_find_ns(req);
+ if (status)
+ return status;
+ if (!req->ns->pr.enable)
+ return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+
+ WRITE_ONCE(req->ns->pr.notify_mask, mask);
+ goto success;
+ }
+
+ xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+ if (ns->pr.enable)
+ WRITE_ONCE(ns->pr.notify_mask, mask);
+ }
+
+success:
+ nvmet_set_result(req, mask);
+ return NVME_SC_SUCCESS;
+}
+
+u16 nvmet_get_feat_resv_notif_mask(struct nvmet_req *req)
+{
+ u16 status;
+
+ status = nvmet_req_find_ns(req);
+ if (status)
+ return status;
+
+ if (!req->ns->pr.enable)
+ return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+
+ nvmet_set_result(req, READ_ONCE(req->ns->pr.notify_mask));
+ return status;
+}
+
+void nvmet_execute_get_log_page_resv(struct nvmet_req *req)
+{
+ struct nvmet_pr_log_mgr *log_mgr = &req->sq->ctrl->pr_log_mgr;
+ struct nvme_pr_log next_log = {0};
+ struct nvme_pr_log log = {0};
+ u16 status = NVME_SC_SUCCESS;
+ u64 lost_count;
+ u64 cur_count;
+ u64 next_count;
+
+ mutex_lock(&log_mgr->lock);
+ if (!kfifo_get(&log_mgr->log_queue, &log))
+ goto out;
+
+ /*
+ * We can't get the last in kfifo.
+ * Utilize the current count and the count from the next log to
+ * calculate the number of lost logs, while also addressing cases
+ * of overflow. If there is no subsequent log, the number of lost
+ * logs is equal to the lost_count within the nvmet_pr_log_mgr.
+ */
+ cur_count = le64_to_cpu(log.count);
+ if (kfifo_peek(&log_mgr->log_queue, &next_log)) {
+ next_count = le64_to_cpu(next_log.count);
+ if (next_count > cur_count)
+ lost_count = next_count - cur_count - 1;
+ else
+ lost_count = U64_MAX - cur_count + next_count - 1;
+ } else {
+ lost_count = log_mgr->lost_count;
+ }
+
+ log.count = cpu_to_le64((cur_count + lost_count) == 0 ?
+ 1 : (cur_count + lost_count));
+ log_mgr->lost_count -= lost_count;
+
+ log.nr_pages = kfifo_len(&log_mgr->log_queue);
+
+out:
+ status = nvmet_copy_to_sgl(req, 0, &log, sizeof(log));
+ mutex_unlock(&log_mgr->lock);
+ nvmet_req_complete(req, status);
+}
+
+static void nvmet_pr_add_resv_log(struct nvmet_ctrl *ctrl, u8 log_type,
+ u32 nsid)
+{
+ struct nvmet_pr_log_mgr *log_mgr = &ctrl->pr_log_mgr;
+ struct nvme_pr_log log = {0};
+
+ mutex_lock(&log_mgr->lock);
+ log_mgr->counter++;
+ if (log_mgr->counter == 0)
+ log_mgr->counter = 1;
+
+ log.count = cpu_to_le64(log_mgr->counter);
+ log.type = log_type;
+ log.nsid = cpu_to_le32(nsid);
+
+ if (!kfifo_put(&log_mgr->log_queue, log)) {
+ pr_info("a reservation log lost, cntlid:%d, log_type:%d, nsid:%d\n",
+ ctrl->cntlid, log_type, nsid);
+ log_mgr->lost_count++;
+ }
+
+ mutex_unlock(&log_mgr->lock);
+}
+
+static void nvmet_pr_resv_released(struct nvmet_pr *pr, uuid_t *hostid)
+{
+ struct nvmet_ns *ns = nvmet_pr_to_ns(pr);
+ struct nvmet_subsys *subsys = ns->subsys;
+ struct nvmet_ctrl *ctrl;
+
+ if (test_bit(NVME_PR_NOTIFY_BIT_RESV_RELEASED, &pr->notify_mask))
+ return;
+
+ mutex_lock(&subsys->lock);
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ if (!uuid_equal(&ctrl->hostid, hostid) &&
+ nvmet_pr_find_registrant(pr, &ctrl->hostid)) {
+ nvmet_pr_add_resv_log(ctrl,
+ NVME_PR_LOG_RESERVATION_RELEASED, ns->nsid);
+ nvmet_add_async_event(ctrl, NVME_AER_CSS,
+ NVME_AEN_RESV_LOG_PAGE_AVALIABLE,
+ NVME_LOG_RESERVATION);
+ }
+ }
+ mutex_unlock(&subsys->lock);
+}
+
+static void nvmet_pr_send_event_to_host(struct nvmet_pr *pr, uuid_t *hostid,
+ u8 log_type)
+{
+ struct nvmet_ns *ns = nvmet_pr_to_ns(pr);
+ struct nvmet_subsys *subsys = ns->subsys;
+ struct nvmet_ctrl *ctrl;
+
+ mutex_lock(&subsys->lock);
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ if (uuid_equal(hostid, &ctrl->hostid)) {
+ nvmet_pr_add_resv_log(ctrl, log_type, ns->nsid);
+ nvmet_add_async_event(ctrl, NVME_AER_CSS,
+ NVME_AEN_RESV_LOG_PAGE_AVALIABLE,
+ NVME_LOG_RESERVATION);
+ }
+ }
+ mutex_unlock(&subsys->lock);
+}
+
+static void nvmet_pr_resv_preempted(struct nvmet_pr *pr, uuid_t *hostid)
+{
+ if (test_bit(NVME_PR_NOTIFY_BIT_RESV_PREEMPTED, &pr->notify_mask))
+ return;
+
+ nvmet_pr_send_event_to_host(pr, hostid,
+ NVME_PR_LOG_RESERVATOIN_PREEMPTED);
+}
+
+static void nvmet_pr_registration_preempted(struct nvmet_pr *pr,
+ uuid_t *hostid)
+{
+ if (test_bit(NVME_PR_NOTIFY_BIT_REG_PREEMPTED, &pr->notify_mask))
+ return;
+
+ nvmet_pr_send_event_to_host(pr, hostid,
+ NVME_PR_LOG_REGISTRATION_PREEMPTED);
+}
+
+static inline void nvmet_pr_set_new_holder(struct nvmet_pr *pr, u8 new_rtype,
+ struct nvmet_pr_registrant *reg)
+{
+ reg->rtype = new_rtype;
+ rcu_assign_pointer(pr->holder, reg);
+}
+
+static u16 nvmet_pr_register(struct nvmet_req *req,
+ struct nvmet_pr_register_data *d)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_pr_registrant *new, *reg;
+ struct nvmet_pr *pr = &req->ns->pr;
+ u16 status = NVME_SC_SUCCESS;
+ u64 nrkey = le64_to_cpu(d->nrkey);
+
+ new = kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NVME_SC_INTERNAL;
+
+ down(&pr->pr_sem);
+ reg = nvmet_pr_find_registrant(pr, &ctrl->hostid);
+ if (reg) {
+ if (reg->rkey != nrkey)
+ status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ kfree(new);
+ goto out;
+ }
+
+ memset(new, 0, sizeof(*new));
+ INIT_LIST_HEAD(&new->entry);
+ new->rkey = nrkey;
+ uuid_copy(&new->hostid, &ctrl->hostid);
+ list_add_tail_rcu(&new->entry, &pr->registrant_list);
+
+out:
+ up(&pr->pr_sem);
+ return status;
+}
+
+static void nvmet_pr_unregister_one(struct nvmet_pr *pr,
+ struct nvmet_pr_registrant *reg)
+{
+ struct nvmet_pr_registrant *first_reg;
+ struct nvmet_pr_registrant *holder;
+ u8 original_rtype;
+
+ list_del_rcu(®->entry);
+
+ holder = rcu_dereference_protected(pr->holder, 1);
+ if (reg != holder)
+ goto out;
+
+ original_rtype = holder->rtype;
+ if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
+ original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
+ first_reg = list_first_or_null_rcu(&pr->registrant_list,
+ struct nvmet_pr_registrant, entry);
+ if (first_reg)
+ first_reg->rtype = original_rtype;
+ rcu_assign_pointer(pr->holder, first_reg);
+ } else {
+ rcu_assign_pointer(pr->holder, NULL);
+
+ if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_REG_ONLY ||
+ original_rtype == NVME_PR_EXCLUSIVE_ACCESS_REG_ONLY)
+ nvmet_pr_resv_released(pr, ®->hostid);
+ }
+out:
+ kfree_rcu(reg, rcu);
+}
+
+static u16 nvmet_pr_unregister(struct nvmet_req *req,
+ struct nvmet_pr_register_data *d,
+ bool ignore_key)
+{
+ u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_pr *pr = &req->ns->pr;
+ struct nvmet_pr_registrant *reg;
+
+ down(&pr->pr_sem);
+ list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+ if (uuid_equal(®->hostid, &ctrl->hostid)) {
+ if (ignore_key || reg->rkey == le64_to_cpu(d->crkey)) {
+ status = NVME_SC_SUCCESS;
+ nvmet_pr_unregister_one(pr, reg);
+ }
+ break;
+ }
+ }
+ up(&pr->pr_sem);
+
+ return status;
+}
+
+static void nvmet_pr_update_reg_rkey(struct nvmet_pr_registrant *reg,
+ void *attr)
+{
+ reg->rkey = *(u64 *)attr;
+}
+
+static u16 nvmet_pr_update_reg_attr(struct nvmet_pr *pr,
+ struct nvmet_pr_registrant *reg,
+ void (*change_attr)(struct nvmet_pr_registrant *reg,
+ void *attr),
+ void *attr)
+{
+ struct nvmet_pr_registrant *holder;
+ struct nvmet_pr_registrant *new;
+
+ holder = rcu_dereference_protected(pr->holder, 1);
+ if (reg != holder) {
+ change_attr(reg, attr);
+ return NVME_SC_SUCCESS;
+ }
+
+ new = kmalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new)
+ return NVME_SC_INTERNAL;
+
+ new->rkey = holder->rkey;
+ new->rtype = holder->rtype;
+ uuid_copy(&new->hostid, &holder->hostid);
+ INIT_LIST_HEAD(&new->entry);
+
+ change_attr(new, attr);
+ list_replace_rcu(&holder->entry, &new->entry);
+ rcu_assign_pointer(pr->holder, new);
+ kfree_rcu(holder, rcu);
+
+ return NVME_SC_SUCCESS;
+}
+
+static u16 nvmet_pr_replace(struct nvmet_req *req,
+ struct nvmet_pr_register_data *d,
+ bool ignore_key)
+{
+ u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_pr *pr = &req->ns->pr;
+ struct nvmet_pr_registrant *reg;
+ u64 nrkey = le64_to_cpu(d->nrkey);
+
+ down(&pr->pr_sem);
+ list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+ if (uuid_equal(®->hostid, &ctrl->hostid)) {
+ if (ignore_key || reg->rkey == le64_to_cpu(d->crkey))
+ status = nvmet_pr_update_reg_attr(pr, reg,
+ nvmet_pr_update_reg_rkey,
+ &nrkey);
+ break;
+ }
+ }
+ up(&pr->pr_sem);
+ return status;
+}
+
+static void nvmet_execute_pr_register(struct nvmet_req *req)
+{
+ u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+ bool ignore_key = nvmet_pr_parse_ignore_key(cdw10);
+ struct nvmet_pr_register_data *d;
+ u8 reg_act = cdw10 & 0x07; /* Reservation Register Action, bit 02:00 */
+ u16 status;
+
+ d = kmalloc(sizeof(*d), GFP_KERNEL);
+ if (!d) {
+ status = NVME_SC_INTERNAL;
+ goto out;
+ }
+
+ status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+ if (status)
+ goto free_data;
+
+ switch (reg_act) {
+ case NVME_PR_REGISTER_ACT_REG:
+ status = nvmet_pr_register(req, d);
+ break;
+ case NVME_PR_REGISTER_ACT_UNREG:
+ status = nvmet_pr_unregister(req, d, ignore_key);
+ break;
+ case NVME_PR_REGISTER_ACT_REPLACE:
+ status = nvmet_pr_replace(req, d, ignore_key);
+ break;
+ default:
+ req->error_loc = offsetof(struct nvme_common_command, cdw10);
+ status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
+ break;
+ }
+free_data:
+ kfree(d);
+out:
+ if (!status)
+ atomic_inc(&req->ns->pr.generation);
+ nvmet_req_complete(req, status);
+}
+
+static u16 nvmet_pr_acquire(struct nvmet_req *req,
+ struct nvmet_pr_registrant *reg,
+ u8 rtype)
+{
+ struct nvmet_pr *pr = &req->ns->pr;
+ struct nvmet_pr_registrant *holder;
+
+ holder = rcu_dereference_protected(pr->holder, 1);
+ if (holder && reg != holder)
+ return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ if (holder && reg == holder) {
+ if (holder->rtype == rtype)
+ return NVME_SC_SUCCESS;
+ return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ }
+
+ nvmet_pr_set_new_holder(pr, rtype, reg);
+ return NVME_SC_SUCCESS;
+}
+
+static void nvmet_pr_confirm_ns_pc_ref(struct percpu_ref *ref)
+{
+ struct nvmet_pr_per_ctrl_ref *pc_ref =
+ container_of(ref, struct nvmet_pr_per_ctrl_ref, ref);
+
+ complete(&pc_ref->confirm_done);
+}
+
+static void nvmet_pr_set_ctrl_to_abort(struct nvmet_req *req, uuid_t *hostid)
+{
+ struct nvmet_pr_per_ctrl_ref *pc_ref;
+ struct nvmet_ns *ns = req->ns;
+ unsigned long idx;
+
+ xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) {
+ if (uuid_equal(&pc_ref->hostid, hostid)) {
+ percpu_ref_kill_and_confirm(&pc_ref->ref,
+ nvmet_pr_confirm_ns_pc_ref);
+ wait_for_completion(&pc_ref->confirm_done);
+ }
+ }
+}
+
+static u16 nvmet_pr_unreg_all_host_by_prkey(struct nvmet_req *req, u64 prkey,
+ uuid_t *send_hostid,
+ bool abort)
+{
+ u16 status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ struct nvmet_pr_registrant *reg, *tmp;
+ struct nvmet_pr *pr = &req->ns->pr;
+ uuid_t hostid;
+
+ list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+ if (reg->rkey == prkey) {
+ status = NVME_SC_SUCCESS;
+ uuid_copy(&hostid, ®->hostid);
+ if (abort)
+ nvmet_pr_set_ctrl_to_abort(req, &hostid);
+ nvmet_pr_unregister_one(pr, reg);
+ if (!uuid_equal(&hostid, send_hostid))
+ nvmet_pr_registration_preempted(pr, &hostid);
+ }
+ }
+ return status;
+}
+
+static void nvmet_pr_unreg_all_others_by_prkey(struct nvmet_req *req,
+ u64 prkey,
+ uuid_t *send_hostid,
+ bool abort)
+{
+ struct nvmet_pr_registrant *reg, *tmp;
+ struct nvmet_pr *pr = &req->ns->pr;
+ uuid_t hostid;
+
+ list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+ if (reg->rkey == prkey &&
+ !uuid_equal(®->hostid, send_hostid)) {
+ uuid_copy(&hostid, ®->hostid);
+ if (abort)
+ nvmet_pr_set_ctrl_to_abort(req, &hostid);
+ nvmet_pr_unregister_one(pr, reg);
+ nvmet_pr_registration_preempted(pr, &hostid);
+ }
+ }
+}
+
+static void nvmet_pr_unreg_all_others(struct nvmet_req *req,
+ uuid_t *send_hostid,
+ bool abort)
+{
+ struct nvmet_pr_registrant *reg, *tmp;
+ struct nvmet_pr *pr = &req->ns->pr;
+ uuid_t hostid;
+
+ list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+ if (!uuid_equal(®->hostid, send_hostid)) {
+ uuid_copy(&hostid, ®->hostid);
+ if (abort)
+ nvmet_pr_set_ctrl_to_abort(req, &hostid);
+ nvmet_pr_unregister_one(pr, reg);
+ nvmet_pr_registration_preempted(pr, &hostid);
+ }
+ }
+}
+
+static void nvmet_pr_update_holder_rtype(struct nvmet_pr_registrant *reg,
+ void *attr)
+{
+ u8 new_rtype = *(u8 *)attr;
+
+ reg->rtype = new_rtype;
+}
+
+static u16 nvmet_pr_preempt(struct nvmet_req *req,
+ struct nvmet_pr_registrant *reg,
+ u8 rtype,
+ struct nvmet_pr_acquire_data *d,
+ bool abort)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_pr *pr = &req->ns->pr;
+ struct nvmet_pr_registrant *holder;
+ enum nvme_pr_type original_rtype;
+ u64 prkey = le64_to_cpu(d->prkey);
+ u16 status;
+
+ holder = rcu_dereference_protected(pr->holder, 1);
+ if (!holder)
+ return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
+ &ctrl->hostid, abort);
+
+ original_rtype = holder->rtype;
+ if (original_rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
+ original_rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
+ if (!prkey) {
+ /*
+ * To prevent possible access from other hosts, and
+ * avoid terminate the holder, set the new holder
+ * first before unregistering.
+ */
+ nvmet_pr_set_new_holder(pr, rtype, reg);
+ nvmet_pr_unreg_all_others(req, &ctrl->hostid, abort);
+ return NVME_SC_SUCCESS;
+ }
+ return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
+ &ctrl->hostid, abort);
+ }
+
+ if (holder == reg) {
+ status = nvmet_pr_update_reg_attr(pr, holder,
+ nvmet_pr_update_holder_rtype, &rtype);
+ if (!status && original_rtype != rtype)
+ nvmet_pr_resv_released(pr, ®->hostid);
+ return status;
+ }
+
+ if (prkey == holder->rkey) {
+ /*
+ * Same as before, set the new holder first.
+ */
+ nvmet_pr_set_new_holder(pr, rtype, reg);
+ nvmet_pr_unreg_all_others_by_prkey(req, prkey, &ctrl->hostid,
+ abort);
+ if (original_rtype != rtype)
+ nvmet_pr_resv_released(pr, ®->hostid);
+ return NVME_SC_SUCCESS;
+ }
+
+ if (prkey)
+ return nvmet_pr_unreg_all_host_by_prkey(req, prkey,
+ &ctrl->hostid, abort);
+ return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+}
+
+static void nvmet_pr_do_abort(struct work_struct *w)
+{
+ struct nvmet_req *req = container_of(w, struct nvmet_req, r.abort_work);
+ struct nvmet_pr_per_ctrl_ref *pc_ref;
+ struct nvmet_ns *ns = req->ns;
+ unsigned long idx;
+
+ /*
+ * The target does not support abort, just wait per-controller ref to 0.
+ */
+ xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) {
+ if (percpu_ref_is_dying(&pc_ref->ref)) {
+ wait_for_completion(&pc_ref->free_done);
+ reinit_completion(&pc_ref->confirm_done);
+ reinit_completion(&pc_ref->free_done);
+ percpu_ref_resurrect(&pc_ref->ref);
+ }
+ }
+
+ up(&ns->pr.pr_sem);
+ nvmet_req_complete(req, NVME_SC_SUCCESS);
+}
+
+static u16 __nvmet_execute_pr_acquire(struct nvmet_req *req,
+ struct nvmet_pr_registrant *reg,
+ u8 acquire_act,
+ u8 rtype,
+ struct nvmet_pr_acquire_data *d)
+{
+ u16 status;
+
+ switch (acquire_act) {
+ case NVME_PR_ACQUIRE_ACT_ACQUIRE:
+ status = nvmet_pr_acquire(req, reg, rtype);
+ goto out;
+ case NVME_PR_ACQUIRE_ACT_PREEMPT:
+ status = nvmet_pr_preempt(req, reg, rtype, d, false);
+ goto inc_gen;
+ case NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT:
+ status = nvmet_pr_preempt(req, reg, rtype, d, true);
+ goto inc_gen;
+ default:
+ req->error_loc = offsetof(struct nvme_common_command, cdw10);
+ status = NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
+ goto out;
+ }
+inc_gen:
+ if (!status)
+ atomic_inc(&req->ns->pr.generation);
+out:
+ return status;
+}
+
+static void nvmet_execute_pr_acquire(struct nvmet_req *req)
+{
+ u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+ bool ignore_key = nvmet_pr_parse_ignore_key(cdw10);
+ /* Reservation type, bit 15:08 */
+ u8 rtype = (u8)((cdw10 >> 8) & 0xff);
+ /* Reservation acquire action, bit 02:00 */
+ u8 acquire_act = cdw10 & 0x07;
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_pr_acquire_data *d = NULL;
+ struct nvmet_pr *pr = &req->ns->pr;
+ struct nvmet_pr_registrant *reg;
+ u16 status = NVME_SC_SUCCESS;
+
+ if (ignore_key ||
+ rtype < NVME_PR_WRITE_EXCLUSIVE ||
+ rtype > NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS) {
+ status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+ goto out;
+ }
+
+ d = kmalloc(sizeof(*d), GFP_KERNEL);
+ if (!d) {
+ status = NVME_SC_INTERNAL;
+ goto out;
+ }
+
+ status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+ if (status)
+ goto free_data;
+
+ status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ down(&pr->pr_sem);
+ list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+ if (uuid_equal(®->hostid, &ctrl->hostid) &&
+ reg->rkey == le64_to_cpu(d->crkey)) {
+ status = __nvmet_execute_pr_acquire(req, reg,
+ acquire_act, rtype, d);
+ break;
+ }
+ }
+
+ if (!status && acquire_act == NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT) {
+ kfree(d);
+ INIT_WORK(&req->r.abort_work, nvmet_pr_do_abort);
+ queue_work(nvmet_wq, &req->r.abort_work);
+ return;
+ }
+
+ up(&pr->pr_sem);
+
+free_data:
+ kfree(d);
+out:
+ nvmet_req_complete(req, status);
+}
+
+static u16 nvmet_pr_release(struct nvmet_req *req,
+ struct nvmet_pr_registrant *reg,
+ u8 rtype)
+{
+ struct nvmet_pr *pr = &req->ns->pr;
+ struct nvmet_pr_registrant *holder;
+ u8 original_rtype;
+
+ holder = rcu_dereference_protected(pr->holder, 1);
+ if (!holder || reg != holder)
+ return NVME_SC_SUCCESS;
+
+ original_rtype = holder->rtype;
+ if (original_rtype != rtype)
+ return NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+
+ rcu_assign_pointer(pr->holder, NULL);
+
+ if (original_rtype != NVME_PR_WRITE_EXCLUSIVE &&
+ original_rtype != NVME_PR_EXCLUSIVE_ACCESS)
+ nvmet_pr_resv_released(pr, ®->hostid);
+
+ return NVME_SC_SUCCESS;
+}
+
+static void nvmet_pr_clear(struct nvmet_req *req)
+{
+ struct nvmet_pr_registrant *reg, *tmp;
+ struct nvmet_pr *pr = &req->ns->pr;
+
+ rcu_assign_pointer(pr->holder, NULL);
+
+ list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+ list_del_rcu(®->entry);
+ if (!uuid_equal(&req->sq->ctrl->hostid, ®->hostid))
+ nvmet_pr_resv_preempted(pr, ®->hostid);
+ kfree_rcu(reg, rcu);
+ }
+
+ atomic_inc(&pr->generation);
+}
+
+static u16 __nvmet_execute_pr_release(struct nvmet_req *req,
+ struct nvmet_pr_registrant *reg,
+ u8 release_act, u8 rtype)
+{
+ switch (release_act) {
+ case NVME_PR_RELEASE_ACT_RELEASE:
+ return nvmet_pr_release(req, reg, rtype);
+ case NVME_PR_RELEASE_ACT_CLEAR:
+ nvmet_pr_clear(req);
+ return NVME_SC_SUCCESS;
+ default:
+ req->error_loc = offsetof(struct nvme_common_command, cdw10);
+ return NVME_SC_INVALID_OPCODE | NVME_STATUS_DNR;
+ }
+}
+
+static void nvmet_execute_pr_release(struct nvmet_req *req)
+{
+ u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+ bool ignore_key = nvmet_pr_parse_ignore_key(cdw10);
+ u8 rtype = (u8)((cdw10 >> 8) & 0xff); /* Reservation type, bit 15:08 */
+ u8 release_act = cdw10 & 0x07; /* Reservation release action, bit 02:00 */
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_pr *pr = &req->ns->pr;
+ struct nvmet_pr_release_data *d;
+ struct nvmet_pr_registrant *reg;
+ u16 status;
+
+ if (ignore_key) {
+ status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+ goto out;
+ }
+
+ d = kmalloc(sizeof(*d), GFP_KERNEL);
+ if (!d) {
+ status = NVME_SC_INTERNAL;
+ goto out;
+ }
+
+ status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+ if (status)
+ goto free_data;
+
+ status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ down(&pr->pr_sem);
+ list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+ if (uuid_equal(®->hostid, &ctrl->hostid) &&
+ reg->rkey == le64_to_cpu(d->crkey)) {
+ status = __nvmet_execute_pr_release(req, reg,
+ release_act, rtype);
+ break;
+ }
+ }
+ up(&pr->pr_sem);
+free_data:
+ kfree(d);
+out:
+ nvmet_req_complete(req, status);
+}
+
+static void nvmet_execute_pr_report(struct nvmet_req *req)
+{
+ u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11);
+ u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
+ u32 num_bytes = 4 * (cdw10 + 1); /* cdw10 is number of dwords */
+ u8 eds = cdw11 & 1; /* Extended data structure, bit 00 */
+ struct nvme_registered_ctrl_ext *ctrl_eds;
+ struct nvme_reservation_status_ext *data;
+ struct nvmet_pr *pr = &req->ns->pr;
+ struct nvmet_pr_registrant *holder;
+ struct nvmet_pr_registrant *reg;
+ u16 num_ctrls = 0;
+ u16 status;
+ u8 rtype;
+
+ /* nvmet hostid(uuid_t) is 128 bit. */
+ if (!eds) {
+ req->error_loc = offsetof(struct nvme_common_command, cdw11);
+ status = NVME_SC_HOST_ID_INCONSIST | NVME_STATUS_DNR;
+ goto out;
+ }
+
+ if (num_bytes < sizeof(struct nvme_reservation_status_ext)) {
+ req->error_loc = offsetof(struct nvme_common_command, cdw10);
+ status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+ goto out;
+ }
+
+ data = kmalloc(num_bytes, GFP_KERNEL);
+ if (!data) {
+ status = NVME_SC_INTERNAL;
+ goto out;
+ }
+ memset(data, 0, num_bytes);
+ data->gen = cpu_to_le32(atomic_read(&pr->generation));
+ data->ptpls = 0;
+ ctrl_eds = data->regctl_eds;
+
+ rcu_read_lock();
+ holder = rcu_dereference(pr->holder);
+ rtype = holder ? holder->rtype : 0;
+ data->rtype = rtype;
+
+ list_for_each_entry_rcu(reg, &pr->registrant_list, entry) {
+ num_ctrls++;
+ /*
+ * continue to get the number of all registrans.
+ */
+ if (((void *)ctrl_eds + sizeof(*ctrl_eds)) >
+ ((void *)data + num_bytes))
+ continue;
+ /*
+ * Dynamic controller, set cntlid to 0xffff.
+ */
+ ctrl_eds->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC);
+ if (rtype == NVME_PR_WRITE_EXCLUSIVE_ALL_REGS ||
+ rtype == NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS)
+ ctrl_eds->rcsts = 1;
+ if (reg == holder)
+ ctrl_eds->rcsts = 1;
+ uuid_copy((uuid_t *)&ctrl_eds->hostid, ®->hostid);
+ ctrl_eds->rkey = cpu_to_le64(reg->rkey);
+ ctrl_eds++;
+ }
+ rcu_read_unlock();
+
+ put_unaligned_le16(num_ctrls, data->regctl);
+ status = nvmet_copy_to_sgl(req, 0, data, num_bytes);
+ kfree(data);
+out:
+ nvmet_req_complete(req, status);
+}
+
+u16 nvmet_parse_pr_cmd(struct nvmet_req *req)
+{
+ struct nvme_command *cmd = req->cmd;
+
+ switch (cmd->common.opcode) {
+ case nvme_cmd_resv_register:
+ req->execute = nvmet_execute_pr_register;
+ break;
+ case nvme_cmd_resv_acquire:
+ req->execute = nvmet_execute_pr_acquire;
+ break;
+ case nvme_cmd_resv_release:
+ req->execute = nvmet_execute_pr_release;
+ break;
+ case nvme_cmd_resv_report:
+ req->execute = nvmet_execute_pr_report;
+ break;
+ default:
+ return 1;
+ }
+ return NVME_SC_SUCCESS;
+}
+
+static bool nvmet_is_req_write_cmd_group(struct nvmet_req *req)
+{
+ u8 opcode = req->cmd->common.opcode;
+
+ if (req->sq->qid) {
+ switch (opcode) {
+ case nvme_cmd_flush:
+ case nvme_cmd_write:
+ case nvme_cmd_write_zeroes:
+ case nvme_cmd_dsm:
+ case nvme_cmd_zone_append:
+ case nvme_cmd_zone_mgmt_send:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+static bool nvmet_is_req_read_cmd_group(struct nvmet_req *req)
+{
+ u8 opcode = req->cmd->common.opcode;
+
+ if (req->sq->qid) {
+ switch (opcode) {
+ case nvme_cmd_read:
+ case nvme_cmd_zone_mgmt_recv:
+ return true;
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+u16 nvmet_pr_check_cmd_access(struct nvmet_req *req)
+{
+ struct nvmet_ctrl *ctrl = req->sq->ctrl;
+ struct nvmet_pr_registrant *holder;
+ struct nvmet_ns *ns = req->ns;
+ struct nvmet_pr *pr = &ns->pr;
+ u16 status = NVME_SC_SUCCESS;
+
+ rcu_read_lock();
+ holder = rcu_dereference(pr->holder);
+ if (!holder)
+ goto unlock;
+ if (uuid_equal(&ctrl->hostid, &holder->hostid))
+ goto unlock;
+
+ /*
+ * The Reservation command group is checked in executing,
+ * allow it here.
+ */
+ switch (holder->rtype) {
+ case NVME_PR_WRITE_EXCLUSIVE:
+ if (nvmet_is_req_write_cmd_group(req))
+ status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ break;
+ case NVME_PR_EXCLUSIVE_ACCESS:
+ if (nvmet_is_req_read_cmd_group(req) ||
+ nvmet_is_req_write_cmd_group(req))
+ status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ break;
+ case NVME_PR_WRITE_EXCLUSIVE_REG_ONLY:
+ case NVME_PR_WRITE_EXCLUSIVE_ALL_REGS:
+ if ((nvmet_is_req_write_cmd_group(req)) &&
+ !nvmet_pr_find_registrant(pr, &ctrl->hostid))
+ status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ break;
+ case NVME_PR_EXCLUSIVE_ACCESS_REG_ONLY:
+ case NVME_PR_EXCLUSIVE_ACCESS_ALL_REGS:
+ if ((nvmet_is_req_read_cmd_group(req) ||
+ nvmet_is_req_write_cmd_group(req)) &&
+ !nvmet_pr_find_registrant(pr, &ctrl->hostid))
+ status = NVME_SC_RESERVATION_CONFLICT | NVME_STATUS_DNR;
+ break;
+ default:
+ pr_warn("the reservation type is set wrong, type:%d\n",
+ holder->rtype);
+ break;
+ }
+
+unlock:
+ rcu_read_unlock();
+ if (status)
+ req->error_loc = offsetof(struct nvme_common_command, opcode);
+ return status;
+}
+
+u16 nvmet_pr_get_ns_pc_ref(struct nvmet_req *req)
+{
+ struct nvmet_pr_per_ctrl_ref *pc_ref;
+
+ pc_ref = xa_load(&req->ns->pr_per_ctrl_refs,
+ req->sq->ctrl->cntlid);
+ if (unlikely(!percpu_ref_tryget_live(&pc_ref->ref)))
+ return NVME_SC_INTERNAL;
+ req->pc_ref = pc_ref;
+ return NVME_SC_SUCCESS;
+}
+
+static void nvmet_pr_ctrl_ns_all_cmds_done(struct percpu_ref *ref)
+{
+ struct nvmet_pr_per_ctrl_ref *pc_ref =
+ container_of(ref, struct nvmet_pr_per_ctrl_ref, ref);
+
+ complete(&pc_ref->free_done);
+}
+
+static int nvmet_pr_alloc_and_insert_pc_ref(struct nvmet_ns *ns,
+ unsigned long idx,
+ uuid_t *hostid)
+{
+ struct nvmet_pr_per_ctrl_ref *pc_ref;
+ int ret;
+
+ pc_ref = kmalloc(sizeof(*pc_ref), GFP_ATOMIC);
+ if (!pc_ref)
+ return -ENOMEM;
+
+ ret = percpu_ref_init(&pc_ref->ref, nvmet_pr_ctrl_ns_all_cmds_done,
+ PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
+ if (ret)
+ goto free;
+
+ init_completion(&pc_ref->free_done);
+ init_completion(&pc_ref->confirm_done);
+ uuid_copy(&pc_ref->hostid, hostid);
+
+ ret = xa_insert(&ns->pr_per_ctrl_refs, idx, pc_ref, GFP_KERNEL);
+ if (ret)
+ goto exit;
+ return ret;
+exit:
+ percpu_ref_exit(&pc_ref->ref);
+free:
+ kfree(pc_ref);
+ return ret;
+}
+
+int nvmet_ctrl_init_pr(struct nvmet_ctrl *ctrl)
+{
+ struct nvmet_subsys *subsys = ctrl->subsys;
+ struct nvmet_pr_per_ctrl_ref *pc_ref;
+ struct nvmet_ns *ns = NULL;
+ unsigned long idx;
+ int ret;
+
+ ctrl->pr_log_mgr.counter = 0;
+ ctrl->pr_log_mgr.lost_count = 0;
+ mutex_init(&ctrl->pr_log_mgr.lock);
+ INIT_KFIFO(ctrl->pr_log_mgr.log_queue);
+
+ /*
+ * Here we are under subsys lock, if an ns not in subsys->namespaces,
+ * we can make sure that ns is not enabled, and not call
+ * nvmet_pr_init_ns(), see more details in nvmet_ns_enable().
+ * So just check ns->pr.enable.
+ */
+ xa_for_each(&subsys->namespaces, idx, ns) {
+ if (ns->pr.enable) {
+ ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid,
+ &ctrl->hostid);
+ if (ret)
+ goto free_per_ctrl_refs;
+ }
+ }
+ return 0;
+
+free_per_ctrl_refs:
+ xa_for_each(&subsys->namespaces, idx, ns) {
+ if (ns->pr.enable) {
+ pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid);
+ if (pc_ref)
+ percpu_ref_exit(&pc_ref->ref);
+ kfree(pc_ref);
+ }
+ }
+ return ret;
+}
+
+void nvmet_ctrl_destroy_pr(struct nvmet_ctrl *ctrl)
+{
+ struct nvmet_pr_per_ctrl_ref *pc_ref;
+ struct nvmet_ns *ns;
+ unsigned long idx;
+
+ kfifo_free(&ctrl->pr_log_mgr.log_queue);
+ mutex_destroy(&ctrl->pr_log_mgr.lock);
+
+ xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
+ if (ns->pr.enable) {
+ pc_ref = xa_erase(&ns->pr_per_ctrl_refs, ctrl->cntlid);
+ if (pc_ref)
+ percpu_ref_exit(&pc_ref->ref);
+ kfree(pc_ref);
+ }
+ }
+}
+
+int nvmet_pr_init_ns(struct nvmet_ns *ns)
+{
+ struct nvmet_subsys *subsys = ns->subsys;
+ struct nvmet_pr_per_ctrl_ref *pc_ref;
+ struct nvmet_ctrl *ctrl = NULL;
+ unsigned long idx;
+ int ret;
+
+ ns->pr.holder = NULL;
+ atomic_set(&ns->pr.generation, 0);
+ sema_init(&ns->pr.pr_sem, 1);
+ INIT_LIST_HEAD(&ns->pr.registrant_list);
+ ns->pr.notify_mask = 0;
+
+ xa_init(&ns->pr_per_ctrl_refs);
+
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ ret = nvmet_pr_alloc_and_insert_pc_ref(ns, ctrl->cntlid,
+ &ctrl->hostid);
+ if (ret)
+ goto free_per_ctrl_refs;
+ }
+ return 0;
+
+free_per_ctrl_refs:
+ xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) {
+ xa_erase(&ns->pr_per_ctrl_refs, idx);
+ percpu_ref_exit(&pc_ref->ref);
+ kfree(pc_ref);
+ }
+ return ret;
+}
+
+void nvmet_pr_exit_ns(struct nvmet_ns *ns)
+{
+ struct nvmet_pr_registrant *reg, *tmp;
+ struct nvmet_pr_per_ctrl_ref *pc_ref;
+ struct nvmet_pr *pr = &ns->pr;
+ unsigned long idx;
+
+ list_for_each_entry_safe(reg, tmp, &pr->registrant_list, entry) {
+ list_del(®->entry);
+ kfree(reg);
+ }
+
+ xa_for_each(&ns->pr_per_ctrl_refs, idx, pc_ref) {
+ /*
+ * No command on ns here, we can safely free pc_ref.
+ */
+ pc_ref = xa_erase(&ns->pr_per_ctrl_refs, idx);
+ percpu_ref_exit(&pc_ref->ref);
+ kfree(pc_ref);
+ }
+
+ xa_destroy(&ns->pr_per_ctrl_refs);
+}
diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c
index 9a35481..6dbc703 100644
--- a/drivers/nvme/target/trace.c
+++ b/drivers/nvme/target/trace.c
@@ -180,6 +180,106 @@ static const char *nvmet_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10)
return ret;
}
+static const char *nvmet_trace_resv_reg(struct trace_seq *p, u8 *cdw10)
+{
+ static const char * const rrega_strs[] = {
+ [0x00] = "register",
+ [0x01] = "unregister",
+ [0x02] = "replace",
+ };
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 rrega = cdw10[0] & 0x7;
+ u8 iekey = (cdw10[0] >> 3) & 0x1;
+ u8 ptpl = (cdw10[3] >> 6) & 0x3;
+ const char *rrega_str;
+
+ if (rrega < ARRAY_SIZE(rrega_strs) && rrega_strs[rrega])
+ rrega_str = rrega_strs[rrega];
+ else
+ rrega_str = "reserved";
+
+ trace_seq_printf(p, "rrega=%u:%s, iekey=%u, ptpl=%u",
+ rrega, rrega_str, iekey, ptpl);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char * const rtype_strs[] = {
+ [0x00] = "reserved",
+ [0x01] = "write exclusive",
+ [0x02] = "exclusive access",
+ [0x03] = "write exclusive registrants only",
+ [0x04] = "exclusive access registrants only",
+ [0x05] = "write exclusive all registrants",
+ [0x06] = "exclusive access all registrants",
+};
+
+static const char *nvmet_trace_resv_acq(struct trace_seq *p, u8 *cdw10)
+{
+ static const char * const racqa_strs[] = {
+ [0x00] = "acquire",
+ [0x01] = "preempt",
+ [0x02] = "preempt and abort",
+ };
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 racqa = cdw10[0] & 0x7;
+ u8 iekey = (cdw10[0] >> 3) & 0x1;
+ u8 rtype = cdw10[1];
+ const char *racqa_str = "reserved";
+ const char *rtype_str = "reserved";
+
+ if (racqa < ARRAY_SIZE(racqa_strs) && racqa_strs[racqa])
+ racqa_str = racqa_strs[racqa];
+
+ if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype])
+ rtype_str = rtype_strs[rtype];
+
+ trace_seq_printf(p, "racqa=%u:%s, iekey=%u, rtype=%u:%s",
+ racqa, racqa_str, iekey, rtype, rtype_str);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvmet_trace_resv_rel(struct trace_seq *p, u8 *cdw10)
+{
+ static const char * const rrela_strs[] = {
+ [0x00] = "release",
+ [0x01] = "clear",
+ };
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 rrela = cdw10[0] & 0x7;
+ u8 iekey = (cdw10[0] >> 3) & 0x1;
+ u8 rtype = cdw10[1];
+ const char *rrela_str = "reserved";
+ const char *rtype_str = "reserved";
+
+ if (rrela < ARRAY_SIZE(rrela_strs) && rrela_strs[rrela])
+ rrela_str = rrela_strs[rrela];
+
+ if (rtype < ARRAY_SIZE(rtype_strs) && rtype_strs[rtype])
+ rtype_str = rtype_strs[rtype];
+
+ trace_seq_printf(p, "rrela=%u:%s, iekey=%u, rtype=%u:%s",
+ rrela, rrela_str, iekey, rtype, rtype_str);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvmet_trace_resv_report(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u32 numd = get_unaligned_le32(cdw10);
+ u8 eds = cdw10[4] & 0x1;
+
+ trace_seq_printf(p, "numd=%u, eds=%u", numd, eds);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p,
u8 opcode, u8 *cdw10)
{
@@ -195,6 +295,14 @@ const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p,
return nvmet_trace_zone_mgmt_send(p, cdw10);
case nvme_cmd_zone_mgmt_recv:
return nvmet_trace_zone_mgmt_recv(p, cdw10);
+ case nvme_cmd_resv_register:
+ return nvmet_trace_resv_reg(p, cdw10);
+ case nvme_cmd_resv_acquire:
+ return nvmet_trace_resv_acq(p, cdw10);
+ case nvme_cmd_resv_release:
+ return nvmet_trace_resv_rel(p, cdw10);
+ case nvme_cmd_resv_report:
+ return nvmet_trace_resv_report(p, cdw10);
default:
return nvmet_trace_common(p, cdw10);
}
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index af9e13b..3aef35b 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -537,6 +537,7 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
u16 status = NVME_SC_SUCCESS;
unsigned int total_len = 0;
struct scatterlist *sg;
+ u32 data_len = nvmet_rw_data_len(req);
struct bio *bio;
int sg_cnt;
@@ -544,6 +545,13 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
return;
+ if (data_len >
+ bdev_max_zone_append_sectors(req->ns->bdev) << SECTOR_SHIFT) {
+ req->error_loc = offsetof(struct nvme_rw_command, length);
+ status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
+ goto out;
+ }
+
if (!req->sg_cnt) {
nvmet_req_complete(req, 0);
return;
@@ -576,20 +584,17 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
bio->bi_opf |= REQ_FUA;
for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
- struct page *p = sg_page(sg);
- unsigned int l = sg->length;
- unsigned int o = sg->offset;
- unsigned int ret;
+ unsigned int len = sg->length;
- ret = bio_add_zone_append_page(bio, p, l, o);
- if (ret != sg->length) {
+ if (bio_add_pc_page(bdev_get_queue(bio->bi_bdev), bio,
+ sg_page(sg), len, sg->offset) != len) {
status = NVME_SC_INTERNAL;
goto out_put_bio;
}
- total_len += sg->length;
+ total_len += len;
}
- if (total_len != nvmet_rw_data_len(req)) {
+ if (total_len != data_len) {
status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
goto out_put_bio;
}
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index bab8ba6..4e268de 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -284,4 +284,11 @@
If unsure say 'm'.
+config MARVELL_PEM_PMU
+ tristate "MARVELL PEM PMU Support"
+ depends on ARCH_THUNDER || (COMPILE_TEST && 64BIT)
+ help
+ Enable support for PCIe Interface performance monitoring
+ on Marvell platform.
+
endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 8268f38..de71d25 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -26,6 +26,7 @@
obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
obj-$(CONFIG_MARVELL_CN10K_DDR_PMU) += marvell_cn10k_ddr_pmu.o
+obj-$(CONFIG_MARVELL_PEM_PMU) += marvell_pem_pmu.o
obj-$(CONFIG_APPLE_M1_CPU_PMU) += apple_m1_cpu_pmu.o
obj-$(CONFIG_ALIBABA_UNCORE_DRW_PMU) += alibaba_uncore_drw_pmu.o
obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
diff --git a/drivers/perf/alibaba_uncore_drw_pmu.c b/drivers/perf/alibaba_uncore_drw_pmu.c
index c6ff1bc..99a0ef98 100644
--- a/drivers/perf/alibaba_uncore_drw_pmu.c
+++ b/drivers/perf/alibaba_uncore_drw_pmu.c
@@ -782,7 +782,7 @@ static struct platform_driver ali_drw_pmu_driver = {
.acpi_match_table = ali_drw_acpi_match,
},
.probe = ali_drw_pmu_probe,
- .remove_new = ali_drw_pmu_remove,
+ .remove = ali_drw_pmu_remove,
};
static int __init ali_drw_pmu_init(void)
diff --git a/drivers/perf/amlogic/meson_g12_ddr_pmu.c b/drivers/perf/amlogic/meson_g12_ddr_pmu.c
index 99cc791..f33e9a4 100644
--- a/drivers/perf/amlogic/meson_g12_ddr_pmu.c
+++ b/drivers/perf/amlogic/meson_g12_ddr_pmu.c
@@ -379,7 +379,7 @@ MODULE_DEVICE_TABLE(of, meson_ddr_pmu_dt_match);
static struct platform_driver g12_ddr_pmu_driver = {
.probe = g12_ddr_pmu_probe,
- .remove_new = g12_ddr_pmu_remove,
+ .remove = g12_ddr_pmu_remove,
.driver = {
.name = "meson-g12-ddr-pmu",
diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
index c76bac6..1cc3214 100644
--- a/drivers/perf/arm-cci.c
+++ b/drivers/perf/arm-cci.c
@@ -1705,7 +1705,7 @@ static struct platform_driver cci_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = cci_pmu_probe,
- .remove_new = cci_pmu_remove,
+ .remove = cci_pmu_remove,
};
module_platform_driver(cci_pmu_driver);
diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c
index 5c66b92..d5fcea3 100644
--- a/drivers/perf/arm-ccn.c
+++ b/drivers/perf/arm-ccn.c
@@ -1529,7 +1529,7 @@ static struct platform_driver arm_ccn_driver = {
.suppress_bind_attrs = true,
},
.probe = arm_ccn_probe,
- .remove_new = arm_ccn_remove,
+ .remove = arm_ccn_remove,
};
static int __init arm_ccn_init(void)
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 397a464..49bd811 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -2662,7 +2662,7 @@ static struct platform_driver arm_cmn_driver = {
.acpi_match_table = ACPI_PTR(arm_cmn_acpi_match),
},
.probe = arm_cmn_probe,
- .remove_new = arm_cmn_remove,
+ .remove = arm_cmn_remove,
};
static int __init arm_cmn_init(void)
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c
index 2158a59..81e8b97e9 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.c
+++ b/drivers/perf/arm_cspmu/arm_cspmu.c
@@ -1282,7 +1282,7 @@ static struct platform_driver arm_cspmu_driver = {
.suppress_bind_attrs = true,
},
.probe = arm_cspmu_device_probe,
- .remove_new = arm_cspmu_device_remove,
+ .remove = arm_cspmu_device_remove,
.id_table = arm_cspmu_id,
};
diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c
index 7e5f1d4..619cf93 100644
--- a/drivers/perf/arm_dmc620_pmu.c
+++ b/drivers/perf/arm_dmc620_pmu.c
@@ -750,7 +750,7 @@ static struct platform_driver dmc620_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = dmc620_pmu_device_probe,
- .remove_new = dmc620_pmu_device_remove,
+ .remove = dmc620_pmu_device_remove,
};
static int __init dmc620_pmu_init(void)
diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c
index f2bd25a..cb4fb59 100644
--- a/drivers/perf/arm_dsu_pmu.c
+++ b/drivers/perf/arm_dsu_pmu.c
@@ -787,7 +787,7 @@ static struct platform_driver dsu_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = dsu_pmu_device_probe,
- .remove_new = dsu_pmu_device_remove,
+ .remove = dsu_pmu_device_remove,
};
static int dsu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c
index 0afe02f..b5cc11a 100644
--- a/drivers/perf/arm_pmuv3.c
+++ b/drivers/perf/arm_pmuv3.c
@@ -770,18 +770,27 @@ static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
int i;
struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
- /* Clear any unused counters to avoid leaking their contents */
- for_each_andnot_bit(i, cpu_pmu->cntr_mask, cpuc->used_mask,
- ARMPMU_MAX_HWEVENTS) {
- if (i == ARMV8_PMU_CYCLE_IDX)
- write_pmccntr(0);
- else if (i == ARMV8_PMU_INSTR_IDX)
- write_pmicntr(0);
- else
- armv8pmu_write_evcntr(i, 0);
+ if (is_pmuv3p9(cpu_pmu->pmuver)) {
+ u64 mask = 0;
+ for_each_set_bit(i, cpuc->used_mask, ARMPMU_MAX_HWEVENTS) {
+ if (armv8pmu_event_has_user_read(cpuc->events[i]))
+ mask |= BIT(i);
+ }
+ write_pmuacr(mask);
+ } else {
+ /* Clear any unused counters to avoid leaking their contents */
+ for_each_andnot_bit(i, cpu_pmu->cntr_mask, cpuc->used_mask,
+ ARMPMU_MAX_HWEVENTS) {
+ if (i == ARMV8_PMU_CYCLE_IDX)
+ write_pmccntr(0);
+ else if (i == ARMV8_PMU_INSTR_IDX)
+ write_pmicntr(0);
+ else
+ armv8pmu_write_evcntr(i, 0);
+ }
}
- update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR);
+ update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_UEN);
}
static void armv8pmu_enable_event(struct perf_event *event)
@@ -1364,6 +1373,8 @@ PMUV3_INIT_SIMPLE(armv8_neoverse_v3ae)
PMUV3_INIT_SIMPLE(armv8_nvidia_carmel)
PMUV3_INIT_SIMPLE(armv8_nvidia_denver)
+PMUV3_INIT_SIMPLE(armv8_samsung_mongoose)
+
PMUV3_INIT_MAP_EVENT(armv8_cortex_a35, armv8_a53_map_event)
PMUV3_INIT_MAP_EVENT(armv8_cortex_a53, armv8_a53_map_event)
PMUV3_INIT_MAP_EVENT(armv8_cortex_a57, armv8_a57_map_event)
@@ -1409,6 +1420,7 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = {
{.compatible = "brcm,vulcan-pmu", .data = armv8_brcm_vulcan_pmu_init},
{.compatible = "nvidia,carmel-pmu", .data = armv8_nvidia_carmel_pmu_init},
{.compatible = "nvidia,denver-pmu", .data = armv8_nvidia_denver_pmu_init},
+ {.compatible = "samsung,mongoose-pmu", .data = armv8_samsung_mongoose_pmu_init},
{},
};
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index d5fa92b..b1510f6 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -996,7 +996,7 @@ static struct platform_driver smmu_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = smmu_pmu_probe,
- .remove_new = smmu_pmu_remove,
+ .remove = smmu_pmu_remove,
.shutdown = smmu_pmu_shutdown,
};
diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index 3569050..fd5b787 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -1280,7 +1280,7 @@ static struct platform_driver arm_spe_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = arm_spe_pmu_device_probe,
- .remove_new = arm_spe_pmu_device_remove,
+ .remove = arm_spe_pmu_device_remove,
};
static int __init arm_spe_pmu_init(void)
diff --git a/drivers/perf/cxl_pmu.c b/drivers/perf/cxl_pmu.c
index 43d68b6..bee4b5b 100644
--- a/drivers/perf/cxl_pmu.c
+++ b/drivers/perf/cxl_pmu.c
@@ -354,7 +354,7 @@ static struct attribute *cxl_pmu_event_attrs[] = {
CXL_PMU_EVENT_CXL_ATTR(d2h_req_wowrinvf, CXL_PMU_GID_D2H_REQ, BIT(13)),
CXL_PMU_EVENT_CXL_ATTR(d2h_req_wrinv, CXL_PMU_GID_D2H_REQ, BIT(14)),
CXL_PMU_EVENT_CXL_ATTR(d2h_req_cacheflushed, CXL_PMU_GID_D2H_REQ, BIT(16)),
- /* CXL rev 3.0 Table 3-20 - D2H Repsonse Encodings */
+ /* CXL rev 3.0 Table 3-20 - D2H Response Encodings */
CXL_PMU_EVENT_CXL_ATTR(d2h_rsp_rspihiti, CXL_PMU_GID_D2H_RSP, BIT(4)),
CXL_PMU_EVENT_CXL_ATTR(d2h_rsp_rspvhitv, CXL_PMU_GID_D2H_RSP, BIT(6)),
CXL_PMU_EVENT_CXL_ATTR(d2h_rsp_rspihitse, CXL_PMU_GID_D2H_RSP, BIT(5)),
@@ -377,12 +377,14 @@ static struct attribute *cxl_pmu_event_attrs[] = {
/* CXL rev 3.0 Table 13-5 directly lists these */
CXL_PMU_EVENT_CXL_ATTR(cachedata_d2h_data, CXL_PMU_GID_CACHE_DATA, BIT(0)),
CXL_PMU_EVENT_CXL_ATTR(cachedata_h2d_data, CXL_PMU_GID_CACHE_DATA, BIT(1)),
- /* CXL rev 3.0 Table 3-29 M2S Req Memory Opcodes */
+ /* CXL rev 3.1 Table 3-35 M2S Req Memory Opcodes */
CXL_PMU_EVENT_CXL_ATTR(m2s_req_meminv, CXL_PMU_GID_M2S_REQ, BIT(0)),
CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrd, CXL_PMU_GID_M2S_REQ, BIT(1)),
CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrddata, CXL_PMU_GID_M2S_REQ, BIT(2)),
CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrdfwd, CXL_PMU_GID_M2S_REQ, BIT(3)),
CXL_PMU_EVENT_CXL_ATTR(m2s_req_memwrfwd, CXL_PMU_GID_M2S_REQ, BIT(4)),
+ CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrdtee, CXL_PMU_GID_M2S_REQ, BIT(5)),
+ CXL_PMU_EVENT_CXL_ATTR(m2s_req_memrddatatee, CXL_PMU_GID_M2S_REQ, BIT(6)),
CXL_PMU_EVENT_CXL_ATTR(m2s_req_memspecrd, CXL_PMU_GID_M2S_REQ, BIT(8)),
CXL_PMU_EVENT_CXL_ATTR(m2s_req_meminvnt, CXL_PMU_GID_M2S_REQ, BIT(9)),
CXL_PMU_EVENT_CXL_ATTR(m2s_req_memcleanevict, CXL_PMU_GID_M2S_REQ, BIT(10)),
@@ -404,10 +406,11 @@ static struct attribute *cxl_pmu_event_attrs[] = {
CXL_PMU_EVENT_CXL_ATTR(s2m_bisnp_curblk, CXL_PMU_GID_S2M_BISNP, BIT(4)),
CXL_PMU_EVENT_CXL_ATTR(s2m_bisnp_datblk, CXL_PMU_GID_S2M_BISNP, BIT(5)),
CXL_PMU_EVENT_CXL_ATTR(s2m_bisnp_invblk, CXL_PMU_GID_S2M_BISNP, BIT(6)),
- /* CXL rev 3.0 Table 3-43 S2M NDR Opcopdes */
+ /* CXL rev 3.1 Table 3-50 S2M NDR Opcopdes */
CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmp, CXL_PMU_GID_S2M_NDR, BIT(0)),
CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmps, CXL_PMU_GID_S2M_NDR, BIT(1)),
CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmpe, CXL_PMU_GID_S2M_NDR, BIT(2)),
+ CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_cmpm, CXL_PMU_GID_S2M_NDR, BIT(3)),
CXL_PMU_EVENT_CXL_ATTR(s2m_ndr_biconflictack, CXL_PMU_GID_S2M_NDR, BIT(4)),
/* CXL rev 3.0 Table 3-46 S2M DRS opcodes */
CXL_PMU_EVENT_CXL_ATTR(s2m_drs_memdata, CXL_PMU_GID_S2M_DRS, BIT(0)),
diff --git a/drivers/perf/dwc_pcie_pmu.c b/drivers/perf/dwc_pcie_pmu.c
index 4ca50f9..9cbea96 100644
--- a/drivers/perf/dwc_pcie_pmu.c
+++ b/drivers/perf/dwc_pcie_pmu.c
@@ -82,7 +82,6 @@ struct dwc_pcie_pmu {
u16 ras_des_offset;
u32 nr_lanes;
- struct list_head pmu_node;
struct hlist_node cpuhp_node;
struct perf_event *event[DWC_PCIE_EVENT_TYPE_MAX];
int on_cpu;
@@ -107,6 +106,7 @@ struct dwc_pcie_vendor_id {
static const struct dwc_pcie_vendor_id dwc_pcie_vendor_ids[] = {
{.vendor_id = PCI_VENDOR_ID_ALIBABA },
+ {.vendor_id = PCI_VENDOR_ID_AMPERE },
{.vendor_id = PCI_VENDOR_ID_QCOM },
{} /* terminator */
};
@@ -203,10 +203,10 @@ static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(L1_AUX, 0x09),
/* Group #1 */
- DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_PCIe_TLP_Data_Payload, 0x20),
- DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_PCIe_TLP_Data_Payload, 0x21),
- DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Tx_CCIX_TLP_Data_Payload, 0x22),
- DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(Rx_CCIX_TLP_Data_Payload, 0x23),
+ DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(tx_pcie_tlp_data_payload, 0x20),
+ DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(rx_pcie_tlp_data_payload, 0x21),
+ DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(tx_ccix_tlp_data_payload, 0x22),
+ DWC_PCIE_PMU_TIME_BASE_EVENT_ATTR(rx_ccix_tlp_data_payload, 0x23),
/*
* Leave it to the user to specify the lane ID to avoid generating
@@ -216,9 +216,9 @@ static struct attribute *dwc_pcie_pmu_time_event_attrs[] = {
DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_update_fc_dllp, 0x601),
DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_ack_dllp, 0x602),
DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_update_fc_dllp, 0x603),
- DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nulified_tlp, 0x604),
- DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nulified_tlp, 0x605),
- DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tl, 0x606),
+ DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_nullified_tlp, 0x604),
+ DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_nullified_tlp, 0x605),
+ DWC_PCIE_PMU_LANE_EVENT_ATTR(rx_duplicate_tlp, 0x606),
DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_write, 0x700),
DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_memory_read, 0x701),
DWC_PCIE_PMU_LANE_EVENT_ATTR(tx_configuration_write, 0x702),
diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c
index 746b923..b989ffa 100644
--- a/drivers/perf/fsl_imx8_ddr_perf.c
+++ b/drivers/perf/fsl_imx8_ddr_perf.c
@@ -846,7 +846,7 @@ static struct platform_driver imx_ddr_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = ddr_perf_probe,
- .remove_new = ddr_perf_remove,
+ .remove = ddr_perf_remove,
};
module_platform_driver(imx_ddr_pmu_driver);
diff --git a/drivers/perf/fsl_imx9_ddr_perf.c b/drivers/perf/fsl_imx9_ddr_perf.c
index 69f920b..3c856d9 100644
--- a/drivers/perf/fsl_imx9_ddr_perf.c
+++ b/drivers/perf/fsl_imx9_ddr_perf.c
@@ -81,6 +81,10 @@ struct ddr_pmu {
int id;
};
+static const struct imx_ddr_devtype_data imx91_devtype_data = {
+ .identifier = "imx91",
+};
+
static const struct imx_ddr_devtype_data imx93_devtype_data = {
.identifier = "imx93",
};
@@ -100,6 +104,7 @@ static inline bool is_imx95(struct ddr_pmu *pmu)
}
static const struct of_device_id imx_ddr_pmu_dt_ids[] = {
+ { .compatible = "fsl,imx91-ddr-pmu", .data = &imx91_devtype_data },
{ .compatible = "fsl,imx93-ddr-pmu", .data = &imx93_devtype_data },
{ .compatible = "fsl,imx95-ddr-pmu", .data = &imx95_devtype_data },
{ /* sentinel */ }
@@ -848,7 +853,7 @@ static struct platform_driver imx_ddr_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = ddr_perf_probe,
- .remove_new = ddr_perf_remove,
+ .remove = ddr_perf_remove,
};
module_platform_driver(imx_ddr_pmu_driver);
diff --git a/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c
index 0e923f9..3f3fb1d 100644
--- a/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c
@@ -358,7 +358,7 @@ static struct platform_driver hisi_cpa_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = hisi_cpa_pmu_probe,
- .remove_new = hisi_cpa_pmu_remove,
+ .remove = hisi_cpa_pmu_remove,
};
static int __init hisi_cpa_pmu_module_init(void)
diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
index b804e37..a6ebf2e 100644
--- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
@@ -547,7 +547,7 @@ static struct platform_driver hisi_ddrc_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = hisi_ddrc_pmu_probe,
- .remove_new = hisi_ddrc_pmu_remove,
+ .remove = hisi_ddrc_pmu_remove,
};
static int __init hisi_ddrc_pmu_module_init(void)
diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
index 21e69b1..3262487 100644
--- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
@@ -550,7 +550,7 @@ static struct platform_driver hisi_hha_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = hisi_hha_pmu_probe,
- .remove_new = hisi_hha_pmu_remove,
+ .remove = hisi_hha_pmu_remove,
};
static int __init hisi_hha_pmu_module_init(void)
diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
index 51ba768..c235b46 100644
--- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
@@ -584,7 +584,7 @@ static struct platform_driver hisi_l3c_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = hisi_l3c_pmu_probe,
- .remove_new = hisi_l3c_pmu_remove,
+ .remove = hisi_l3c_pmu_remove,
};
static int __init hisi_l3c_pmu_module_init(void)
diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
index 3cdb35c..c0f5d7c 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
@@ -538,7 +538,7 @@ static struct platform_driver hisi_pa_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = hisi_pa_pmu_probe,
- .remove_new = hisi_pa_pmu_remove,
+ .remove = hisi_pa_pmu_remove,
};
static int __init hisi_pa_pmu_module_init(void)
diff --git a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
index 765bbd6..c5f4764e 100644
--- a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
@@ -476,7 +476,7 @@ static struct platform_driver hisi_sllc_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = hisi_sllc_pmu_probe,
- .remove_new = hisi_sllc_pmu_remove,
+ .remove = hisi_sllc_pmu_remove,
};
static int __init hisi_sllc_pmu_module_init(void)
diff --git a/drivers/perf/marvell_cn10k_ddr_pmu.c b/drivers/perf/marvell_cn10k_ddr_pmu.c
index 94f1ebc..8860d9f 100644
--- a/drivers/perf/marvell_cn10k_ddr_pmu.c
+++ b/drivers/perf/marvell_cn10k_ddr_pmu.c
@@ -732,7 +732,7 @@ static struct platform_driver cn10k_ddr_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = cn10k_ddr_perf_probe,
- .remove_new = cn10k_ddr_perf_remove,
+ .remove = cn10k_ddr_perf_remove,
};
static int __init cn10k_ddr_pmu_init(void)
diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c
index 9e635f3..cda55ee 100644
--- a/drivers/perf/marvell_cn10k_tad_pmu.c
+++ b/drivers/perf/marvell_cn10k_tad_pmu.c
@@ -383,7 +383,7 @@ static struct platform_driver tad_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = tad_pmu_probe,
- .remove_new = tad_pmu_remove,
+ .remove = tad_pmu_remove,
};
static int tad_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
diff --git a/drivers/perf/marvell_pem_pmu.c b/drivers/perf/marvell_pem_pmu.c
new file mode 100644
index 0000000..29fbcd1
--- /dev/null
+++ b/drivers/perf/marvell_pem_pmu.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Marvell PEM(PCIe RC) Performance Monitor Driver
+ *
+ * Copyright (C) 2024 Marvell.
+ */
+
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+
+/*
+ * Each of these events maps to a free running 64 bit counter
+ * with no event control, but can be reset.
+ */
+enum pem_events {
+ IB_TLP_NPR,
+ IB_TLP_PR,
+ IB_TLP_CPL,
+ IB_TLP_DWORDS_NPR,
+ IB_TLP_DWORDS_PR,
+ IB_TLP_DWORDS_CPL,
+ IB_INFLIGHT,
+ IB_READS,
+ IB_REQ_NO_RO_NCB,
+ IB_REQ_NO_RO_EBUS,
+ OB_TLP_NPR,
+ OB_TLP_PR,
+ OB_TLP_CPL,
+ OB_TLP_DWORDS_NPR,
+ OB_TLP_DWORDS_PR,
+ OB_TLP_DWORDS_CPL,
+ OB_INFLIGHT,
+ OB_READS,
+ OB_MERGES_NPR,
+ OB_MERGES_PR,
+ OB_MERGES_CPL,
+ ATS_TRANS,
+ ATS_TRANS_LATENCY,
+ ATS_PRI,
+ ATS_PRI_LATENCY,
+ ATS_INV,
+ ATS_INV_LATENCY,
+ PEM_EVENTIDS_MAX
+};
+
+static u64 eventid_to_offset_table[] = {
+ [IB_TLP_NPR] = 0x0,
+ [IB_TLP_PR] = 0x8,
+ [IB_TLP_CPL] = 0x10,
+ [IB_TLP_DWORDS_NPR] = 0x100,
+ [IB_TLP_DWORDS_PR] = 0x108,
+ [IB_TLP_DWORDS_CPL] = 0x110,
+ [IB_INFLIGHT] = 0x200,
+ [IB_READS] = 0x300,
+ [IB_REQ_NO_RO_NCB] = 0x400,
+ [IB_REQ_NO_RO_EBUS] = 0x408,
+ [OB_TLP_NPR] = 0x500,
+ [OB_TLP_PR] = 0x508,
+ [OB_TLP_CPL] = 0x510,
+ [OB_TLP_DWORDS_NPR] = 0x600,
+ [OB_TLP_DWORDS_PR] = 0x608,
+ [OB_TLP_DWORDS_CPL] = 0x610,
+ [OB_INFLIGHT] = 0x700,
+ [OB_READS] = 0x800,
+ [OB_MERGES_NPR] = 0x900,
+ [OB_MERGES_PR] = 0x908,
+ [OB_MERGES_CPL] = 0x910,
+ [ATS_TRANS] = 0x2D18,
+ [ATS_TRANS_LATENCY] = 0x2D20,
+ [ATS_PRI] = 0x2D28,
+ [ATS_PRI_LATENCY] = 0x2D30,
+ [ATS_INV] = 0x2D38,
+ [ATS_INV_LATENCY] = 0x2D40,
+};
+
+struct pem_pmu {
+ struct pmu pmu;
+ void __iomem *base;
+ unsigned int cpu;
+ struct device *dev;
+ struct hlist_node node;
+};
+
+#define to_pem_pmu(p) container_of(p, struct pem_pmu, pmu)
+
+static int eventid_to_offset(int eventid)
+{
+ return eventid_to_offset_table[eventid];
+}
+
+/* Events */
+static ssize_t pem_pmu_event_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr;
+
+ pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+ return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+#define PEM_EVENT_ATTR(_name, _id) \
+ (&((struct perf_pmu_events_attr[]) { \
+ { .attr = __ATTR(_name, 0444, pem_pmu_event_show, NULL), \
+ .id = _id, } \
+ })[0].attr.attr)
+
+static struct attribute *pem_perf_events_attrs[] = {
+ PEM_EVENT_ATTR(ib_tlp_npr, IB_TLP_NPR),
+ PEM_EVENT_ATTR(ib_tlp_pr, IB_TLP_PR),
+ PEM_EVENT_ATTR(ib_tlp_cpl_partid, IB_TLP_CPL),
+ PEM_EVENT_ATTR(ib_tlp_dwords_npr, IB_TLP_DWORDS_NPR),
+ PEM_EVENT_ATTR(ib_tlp_dwords_pr, IB_TLP_DWORDS_PR),
+ PEM_EVENT_ATTR(ib_tlp_dwords_cpl_partid, IB_TLP_DWORDS_CPL),
+ PEM_EVENT_ATTR(ib_inflight, IB_INFLIGHT),
+ PEM_EVENT_ATTR(ib_reads, IB_READS),
+ PEM_EVENT_ATTR(ib_req_no_ro_ncb, IB_REQ_NO_RO_NCB),
+ PEM_EVENT_ATTR(ib_req_no_ro_ebus, IB_REQ_NO_RO_EBUS),
+ PEM_EVENT_ATTR(ob_tlp_npr_partid, OB_TLP_NPR),
+ PEM_EVENT_ATTR(ob_tlp_pr_partid, OB_TLP_PR),
+ PEM_EVENT_ATTR(ob_tlp_cpl_partid, OB_TLP_CPL),
+ PEM_EVENT_ATTR(ob_tlp_dwords_npr_partid, OB_TLP_DWORDS_NPR),
+ PEM_EVENT_ATTR(ob_tlp_dwords_pr_partid, OB_TLP_DWORDS_PR),
+ PEM_EVENT_ATTR(ob_tlp_dwords_cpl_partid, OB_TLP_DWORDS_CPL),
+ PEM_EVENT_ATTR(ob_inflight_partid, OB_INFLIGHT),
+ PEM_EVENT_ATTR(ob_reads_partid, OB_READS),
+ PEM_EVENT_ATTR(ob_merges_npr_partid, OB_MERGES_NPR),
+ PEM_EVENT_ATTR(ob_merges_pr_partid, OB_MERGES_PR),
+ PEM_EVENT_ATTR(ob_merges_cpl_partid, OB_MERGES_CPL),
+ PEM_EVENT_ATTR(ats_trans, ATS_TRANS),
+ PEM_EVENT_ATTR(ats_trans_latency, ATS_TRANS_LATENCY),
+ PEM_EVENT_ATTR(ats_pri, ATS_PRI),
+ PEM_EVENT_ATTR(ats_pri_latency, ATS_PRI_LATENCY),
+ PEM_EVENT_ATTR(ats_inv, ATS_INV),
+ PEM_EVENT_ATTR(ats_inv_latency, ATS_INV_LATENCY),
+ NULL
+};
+
+static struct attribute_group pem_perf_events_attr_group = {
+ .name = "events",
+ .attrs = pem_perf_events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-5");
+
+static struct attribute *pem_perf_format_attrs[] = {
+ &format_attr_event.attr,
+ NULL
+};
+
+static struct attribute_group pem_perf_format_attr_group = {
+ .name = "format",
+ .attrs = pem_perf_format_attrs,
+};
+
+/* cpumask */
+static ssize_t pem_perf_cpumask_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct pem_pmu *pmu = dev_get_drvdata(dev);
+
+ return cpumap_print_to_pagebuf(true, buf, cpumask_of(pmu->cpu));
+}
+
+static struct device_attribute pem_perf_cpumask_attr =
+ __ATTR(cpumask, 0444, pem_perf_cpumask_show, NULL);
+
+static struct attribute *pem_perf_cpumask_attrs[] = {
+ &pem_perf_cpumask_attr.attr,
+ NULL
+};
+
+static struct attribute_group pem_perf_cpumask_attr_group = {
+ .attrs = pem_perf_cpumask_attrs,
+};
+
+static const struct attribute_group *pem_perf_attr_groups[] = {
+ &pem_perf_events_attr_group,
+ &pem_perf_cpumask_attr_group,
+ &pem_perf_format_attr_group,
+ NULL
+};
+
+static int pem_perf_event_init(struct perf_event *event)
+{
+ struct pem_pmu *pmu = to_pem_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_event *sibling;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ if (event->attr.config >= PEM_EVENTIDS_MAX)
+ return -EINVAL;
+
+ if (is_sampling_event(event) ||
+ event->attach_state & PERF_ATTACH_TASK) {
+ return -EOPNOTSUPP;
+ }
+
+ if (event->cpu < 0)
+ return -EOPNOTSUPP;
+
+ /* We must NOT create groups containing mixed PMUs */
+ if (event->group_leader->pmu != event->pmu &&
+ !is_software_event(event->group_leader))
+ return -EINVAL;
+
+ for_each_sibling_event(sibling, event->group_leader) {
+ if (sibling->pmu != event->pmu &&
+ !is_software_event(sibling))
+ return -EINVAL;
+ }
+ /*
+ * Set ownership of event to one CPU, same event can not be observed
+ * on multiple cpus at same time.
+ */
+ event->cpu = pmu->cpu;
+ hwc->idx = -1;
+ return 0;
+}
+
+static u64 pem_perf_read_counter(struct pem_pmu *pmu,
+ struct perf_event *event, int eventid)
+{
+ return readq_relaxed(pmu->base + eventid_to_offset(eventid));
+}
+
+static void pem_perf_event_update(struct perf_event *event)
+{
+ struct pem_pmu *pmu = to_pem_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_count, new_count;
+
+ do {
+ prev_count = local64_read(&hwc->prev_count);
+ new_count = pem_perf_read_counter(pmu, event, hwc->idx);
+ } while (local64_xchg(&hwc->prev_count, new_count) != prev_count);
+
+ local64_add((new_count - prev_count), &event->count);
+}
+
+static void pem_perf_event_start(struct perf_event *event, int flags)
+{
+ struct pem_pmu *pmu = to_pem_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ int eventid = hwc->idx;
+
+ /*
+ * All counters are free-running and associated with
+ * a fixed event to track in Hardware
+ */
+ local64_set(&hwc->prev_count,
+ pem_perf_read_counter(pmu, event, eventid));
+
+ hwc->state = 0;
+}
+
+static int pem_perf_event_add(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->idx = event->attr.config;
+ if (WARN_ON_ONCE(hwc->idx >= PEM_EVENTIDS_MAX))
+ return -EINVAL;
+ hwc->state |= PERF_HES_STOPPED;
+
+ if (flags & PERF_EF_START)
+ pem_perf_event_start(event, flags);
+
+ return 0;
+}
+
+static void pem_perf_event_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (flags & PERF_EF_UPDATE)
+ pem_perf_event_update(event);
+
+ hwc->state |= PERF_HES_STOPPED;
+}
+
+static void pem_perf_event_del(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ pem_perf_event_stop(event, PERF_EF_UPDATE);
+ hwc->idx = -1;
+}
+
+static int pem_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+ struct pem_pmu *pmu = hlist_entry_safe(node, struct pem_pmu, node);
+ unsigned int target;
+
+ if (cpu != pmu->cpu)
+ return 0;
+
+ target = cpumask_any_but(cpu_online_mask, cpu);
+ if (target >= nr_cpu_ids)
+ return 0;
+
+ perf_pmu_migrate_context(&pmu->pmu, cpu, target);
+ pmu->cpu = target;
+ return 0;
+}
+
+static int pem_perf_probe(struct platform_device *pdev)
+{
+ struct pem_pmu *pem_pmu;
+ struct resource *res;
+ void __iomem *base;
+ char *name;
+ int ret;
+
+ pem_pmu = devm_kzalloc(&pdev->dev, sizeof(*pem_pmu), GFP_KERNEL);
+ if (!pem_pmu)
+ return -ENOMEM;
+
+ pem_pmu->dev = &pdev->dev;
+ platform_set_drvdata(pdev, pem_pmu);
+
+ base = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
+ if (IS_ERR(base))
+ return PTR_ERR(base);
+
+ pem_pmu->base = base;
+
+ pem_pmu->pmu = (struct pmu) {
+ .module = THIS_MODULE,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = pem_perf_attr_groups,
+ .event_init = pem_perf_event_init,
+ .add = pem_perf_event_add,
+ .del = pem_perf_event_del,
+ .start = pem_perf_event_start,
+ .stop = pem_perf_event_stop,
+ .read = pem_perf_event_update,
+ };
+
+ /* Choose this cpu to collect perf data */
+ pem_pmu->cpu = raw_smp_processor_id();
+
+ name = devm_kasprintf(pem_pmu->dev, GFP_KERNEL, "mrvl_pcie_rc_pmu_%llx",
+ res->start);
+ if (!name)
+ return -ENOMEM;
+
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE,
+ &pem_pmu->node);
+
+ ret = perf_pmu_register(&pem_pmu->pmu, name, -1);
+ if (ret)
+ goto error;
+
+ return 0;
+error:
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE,
+ &pem_pmu->node);
+ return ret;
+}
+
+static void pem_perf_remove(struct platform_device *pdev)
+{
+ struct pem_pmu *pem_pmu = platform_get_drvdata(pdev);
+
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE,
+ &pem_pmu->node);
+
+ perf_pmu_unregister(&pem_pmu->pmu);
+}
+
+#ifdef CONFIG_ACPI
+static const struct acpi_device_id pem_pmu_acpi_match[] = {
+ {"MRVL000E", 0},
+ {}
+};
+MODULE_DEVICE_TABLE(acpi, pem_pmu_acpi_match);
+#endif
+
+static struct platform_driver pem_pmu_driver = {
+ .driver = {
+ .name = "pem-pmu",
+ .acpi_match_table = ACPI_PTR(pem_pmu_acpi_match),
+ .suppress_bind_attrs = true,
+ },
+ .probe = pem_perf_probe,
+ .remove = pem_perf_remove,
+};
+
+static int __init pem_pmu_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE,
+ "perf/marvell/pem:online", NULL,
+ pem_pmu_offline_cpu);
+ if (ret)
+ return ret;
+
+ ret = platform_driver_register(&pem_pmu_driver);
+ if (ret)
+ cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE);
+ return ret;
+}
+
+static void __exit pem_pmu_exit(void)
+{
+ platform_driver_unregister(&pem_pmu_driver);
+ cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE);
+}
+
+module_init(pem_pmu_init);
+module_exit(pem_pmu_exit);
+
+MODULE_DESCRIPTION("Marvell PEM Perf driver");
+MODULE_AUTHOR("Gowthami Thiagarajan <gthiagarajan@marvell.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c
index 980e305..ea8c857 100644
--- a/drivers/perf/qcom_l2_pmu.c
+++ b/drivers/perf/qcom_l2_pmu.c
@@ -981,7 +981,7 @@ static struct platform_driver l2_cache_pmu_driver = {
.suppress_bind_attrs = true,
},
.probe = l2_cache_pmu_probe,
- .remove_new = l2_cache_pmu_remove,
+ .remove = l2_cache_pmu_remove,
};
static int __init register_l2_cache_pmu_driver(void)
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index 391ca14..1aa303f 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -1393,8 +1393,9 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
goto out_unregister;
cpu = get_cpu();
-
ret = pmu_sbi_snapshot_setup(pmu, cpu);
+ put_cpu();
+
if (ret) {
/* Snapshot is an optional feature. Continue if not available */
pmu_sbi_snapshot_free(pmu);
@@ -1408,7 +1409,6 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
*/
static_branch_enable(&sbi_pmu_snapshot_available);
}
- put_cpu();
}
register_sysctl("kernel", sbi_pmu_sysctl_table);
diff --git a/drivers/perf/thunderx2_pmu.c b/drivers/perf/thunderx2_pmu.c
index faf763d..cadd602 100644
--- a/drivers/perf/thunderx2_pmu.c
+++ b/drivers/perf/thunderx2_pmu.c
@@ -1010,7 +1010,7 @@ static struct platform_driver tx2_uncore_driver = {
.suppress_bind_attrs = true,
},
.probe = tx2_uncore_probe,
- .remove_new = tx2_uncore_remove,
+ .remove = tx2_uncore_remove,
};
static int __init tx2_uncore_driver_init(void)
diff --git a/drivers/perf/xgene_pmu.c b/drivers/perf/xgene_pmu.c
index c01466a..33b5497 100644
--- a/drivers/perf/xgene_pmu.c
+++ b/drivers/perf/xgene_pmu.c
@@ -1943,7 +1943,7 @@ static void xgene_pmu_remove(struct platform_device *pdev)
static struct platform_driver xgene_pmu_driver = {
.probe = xgene_pmu_probe,
- .remove_new = xgene_pmu_remove,
+ .remove = xgene_pmu_remove,
.driver = {
.name = "xgene-pmu",
.of_match_table = xgene_pmu_of_match,
diff --git a/drivers/platform/chrome/cros_ec_chardev.c b/drivers/platform/chrome/cros_ec_chardev.c
index 7f034ea..21a4843 100644
--- a/drivers/platform/chrome/cros_ec_chardev.c
+++ b/drivers/platform/chrome/cros_ec_chardev.c
@@ -415,7 +415,7 @@ static struct platform_driver cros_ec_chardev_driver = {
.name = DRV_NAME,
},
.probe = cros_ec_chardev_probe,
- .remove_new = cros_ec_chardev_remove,
+ .remove = cros_ec_chardev_remove,
.id_table = cros_ec_chardev_id,
};
diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c
index 839154c..92ac9a2 100644
--- a/drivers/platform/chrome/cros_ec_debugfs.c
+++ b/drivers/platform/chrome/cros_ec_debugfs.c
@@ -582,7 +582,7 @@ static struct platform_driver cros_ec_debugfs_driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
},
.probe = cros_ec_debugfs_probe,
- .remove_new = cros_ec_debugfs_remove,
+ .remove = cros_ec_debugfs_remove,
.id_table = cros_ec_debugfs_id,
};
diff --git a/drivers/platform/chrome/cros_ec_i2c.c b/drivers/platform/chrome/cros_ec_i2c.c
index e29c51c..62662ba 100644
--- a/drivers/platform/chrome/cros_ec_i2c.c
+++ b/drivers/platform/chrome/cros_ec_i2c.c
@@ -352,7 +352,7 @@ MODULE_DEVICE_TABLE(of, cros_ec_i2c_of_match);
#endif
static const struct i2c_device_id cros_ec_i2c_id[] = {
- { "cros-ec-i2c", 0 },
+ { "cros-ec-i2c" },
{ }
};
MODULE_DEVICE_TABLE(i2c, cros_ec_i2c_id);
diff --git a/drivers/platform/chrome/cros_ec_lightbar.c b/drivers/platform/chrome/cros_ec_lightbar.c
index 1e69f61..87634f6 100644
--- a/drivers/platform/chrome/cros_ec_lightbar.c
+++ b/drivers/platform/chrome/cros_ec_lightbar.c
@@ -608,7 +608,7 @@ static struct platform_driver cros_ec_lightbar_driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
},
.probe = cros_ec_lightbar_probe,
- .remove_new = cros_ec_lightbar_remove,
+ .remove = cros_ec_lightbar_remove,
.id_table = cros_ec_lightbar_id,
};
diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c
index c784119..924bf4d 100644
--- a/drivers/platform/chrome/cros_ec_lpc.c
+++ b/drivers/platform/chrome/cros_ec_lpc.c
@@ -783,7 +783,7 @@ static struct platform_driver cros_ec_lpc_driver = {
.probe_type = PROBE_FORCE_SYNCHRONOUS,
},
.probe = cros_ec_lpc_probe,
- .remove_new = cros_ec_lpc_remove,
+ .remove = cros_ec_lpc_remove,
};
static struct platform_device cros_ec_lpc_device = {
diff --git a/drivers/platform/chrome/cros_ec_sysfs.c b/drivers/platform/chrome/cros_ec_sysfs.c
index 9c94414..bc1a5ba 100644
--- a/drivers/platform/chrome/cros_ec_sysfs.c
+++ b/drivers/platform/chrome/cros_ec_sysfs.c
@@ -359,7 +359,7 @@ static struct platform_driver cros_ec_sysfs_driver = {
.name = DRV_NAME,
},
.probe = cros_ec_sysfs_probe,
- .remove_new = cros_ec_sysfs_remove,
+ .remove = cros_ec_sysfs_remove,
.id_table = cros_ec_sysfs_id,
};
diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c
index c7781ae..ae2f862 100644
--- a/drivers/platform/chrome/cros_ec_typec.c
+++ b/drivers/platform/chrome/cros_ec_typec.c
@@ -409,6 +409,7 @@ static int cros_typec_init_ports(struct cros_typec_data *typec)
return 0;
unregister_ports:
+ fwnode_handle_put(fwnode);
cros_unregister_ports(typec);
return ret;
}
@@ -1325,7 +1326,7 @@ static struct platform_driver cros_typec_driver = {
.pm = &cros_typec_pm_ops,
},
.probe = cros_typec_probe,
- .remove_new = cros_typec_remove,
+ .remove = cros_typec_remove,
};
module_platform_driver(cros_typec_driver);
diff --git a/drivers/platform/chrome/cros_ec_vbc.c b/drivers/platform/chrome/cros_ec_vbc.c
index 787a19d..7bdb489 100644
--- a/drivers/platform/chrome/cros_ec_vbc.c
+++ b/drivers/platform/chrome/cros_ec_vbc.c
@@ -145,7 +145,7 @@ static struct platform_driver cros_ec_vbc_driver = {
.name = DRV_NAME,
},
.probe = cros_ec_vbc_probe,
- .remove_new = cros_ec_vbc_remove,
+ .remove = cros_ec_vbc_remove,
.id_table = cros_ec_vbc_id,
};
diff --git a/drivers/platform/chrome/cros_hps_i2c.c b/drivers/platform/chrome/cros_hps_i2c.c
index dd14957..6b479cf 100644
--- a/drivers/platform/chrome/cros_hps_i2c.c
+++ b/drivers/platform/chrome/cros_hps_i2c.c
@@ -129,7 +129,7 @@ static int hps_resume(struct device *dev)
static DEFINE_RUNTIME_DEV_PM_OPS(hps_pm_ops, hps_suspend, hps_resume, NULL);
static const struct i2c_device_id hps_i2c_id[] = {
- { "cros-hps", 0 },
+ { "cros-hps" },
{ }
};
MODULE_DEVICE_TABLE(i2c, hps_i2c_id);
diff --git a/drivers/platform/chrome/cros_typec_switch.c b/drivers/platform/chrome/cros_typec_switch.c
index 07a1938..8d7c34a 100644
--- a/drivers/platform/chrome/cros_typec_switch.c
+++ b/drivers/platform/chrome/cros_typec_switch.c
@@ -318,7 +318,7 @@ static struct platform_driver cros_typec_switch_driver = {
.acpi_match_table = ACPI_PTR(cros_typec_switch_acpi_id),
},
.probe = cros_typec_switch_probe,
- .remove_new = cros_typec_switch_remove,
+ .remove = cros_typec_switch_remove,
};
module_platform_driver(cros_typec_switch_driver);
diff --git a/drivers/platform/chrome/cros_usbpd_logger.c b/drivers/platform/chrome/cros_usbpd_logger.c
index 930c2f4..cd71f1c 100644
--- a/drivers/platform/chrome/cros_usbpd_logger.c
+++ b/drivers/platform/chrome/cros_usbpd_logger.c
@@ -262,7 +262,7 @@ static struct platform_driver cros_usbpd_logger_driver = {
.pm = &cros_usbpd_logger_pm_ops,
},
.probe = cros_usbpd_logger_probe,
- .remove_new = cros_usbpd_logger_remove,
+ .remove = cros_usbpd_logger_remove,
.id_table = cros_usbpd_logger_id,
};
diff --git a/drivers/platform/chrome/cros_usbpd_notify.c b/drivers/platform/chrome/cros_usbpd_notify.c
index c83f81d..313d2bc 100644
--- a/drivers/platform/chrome/cros_usbpd_notify.c
+++ b/drivers/platform/chrome/cros_usbpd_notify.c
@@ -156,7 +156,7 @@ static struct platform_driver cros_usbpd_notify_acpi_driver = {
.acpi_match_table = cros_usbpd_notify_acpi_device_ids,
},
.probe = cros_usbpd_notify_probe_acpi,
- .remove_new = cros_usbpd_notify_remove_acpi,
+ .remove = cros_usbpd_notify_remove_acpi,
};
#endif /* CONFIG_ACPI */
@@ -230,7 +230,7 @@ static struct platform_driver cros_usbpd_notify_plat_driver = {
.name = DRV_NAME,
},
.probe = cros_usbpd_notify_probe_plat,
- .remove_new = cros_usbpd_notify_remove_plat,
+ .remove = cros_usbpd_notify_remove_plat,
.id_table = cros_usbpd_notify_id,
};
diff --git a/drivers/platform/chrome/wilco_ec/core.c b/drivers/platform/chrome/wilco_ec/core.c
index 3e6b6cd..9f978e5 100644
--- a/drivers/platform/chrome/wilco_ec/core.c
+++ b/drivers/platform/chrome/wilco_ec/core.c
@@ -163,7 +163,7 @@ static struct platform_driver wilco_ec_driver = {
.acpi_match_table = wilco_ec_acpi_device_ids,
},
.probe = wilco_ec_probe,
- .remove_new = wilco_ec_remove,
+ .remove = wilco_ec_remove,
.id_table = wilco_ec_id,
};
diff --git a/drivers/platform/chrome/wilco_ec/debugfs.c b/drivers/platform/chrome/wilco_ec/debugfs.c
index 99486086..0617292b 100644
--- a/drivers/platform/chrome/wilco_ec/debugfs.c
+++ b/drivers/platform/chrome/wilco_ec/debugfs.c
@@ -276,7 +276,7 @@ static struct platform_driver wilco_ec_debugfs_driver = {
.name = DRV_NAME,
},
.probe = wilco_ec_debugfs_probe,
- .remove_new = wilco_ec_debugfs_remove,
+ .remove = wilco_ec_debugfs_remove,
.id_table = wilco_ec_debugfs_id,
};
diff --git a/drivers/platform/chrome/wilco_ec/telemetry.c b/drivers/platform/chrome/wilco_ec/telemetry.c
index a87877e..7d8ae2c 100644
--- a/drivers/platform/chrome/wilco_ec/telemetry.c
+++ b/drivers/platform/chrome/wilco_ec/telemetry.c
@@ -417,7 +417,7 @@ MODULE_DEVICE_TABLE(platform, telem_id);
static struct platform_driver telem_driver = {
.probe = telem_device_probe,
- .remove_new = telem_device_remove,
+ .remove = telem_device_remove,
.driver = {
.name = DRV_NAME,
},
diff --git a/drivers/pmdomain/arm/scmi_perf_domain.c b/drivers/pmdomain/arm/scmi_perf_domain.c
index d7ef46c..3693423 100644
--- a/drivers/pmdomain/arm/scmi_perf_domain.c
+++ b/drivers/pmdomain/arm/scmi_perf_domain.c
@@ -125,7 +125,8 @@ static int scmi_perf_domain_probe(struct scmi_device *sdev)
scmi_pd->ph = ph;
scmi_pd->genpd.name = scmi_pd->info->name;
scmi_pd->genpd.flags = GENPD_FLAG_ALWAYS_ON |
- GENPD_FLAG_OPP_TABLE_FW;
+ GENPD_FLAG_OPP_TABLE_FW |
+ GENPD_FLAG_DEV_NAME_FW;
scmi_pd->genpd.set_performance_state = scmi_pd_set_perf_state;
scmi_pd->genpd.attach_dev = scmi_pd_attach_dev;
scmi_pd->genpd.detach_dev = scmi_pd_detach_dev;
diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index 5ede0f7..29ad510 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -7,6 +7,7 @@
#define pr_fmt(fmt) "PM: " fmt
#include <linux/delay.h>
+#include <linux/idr.h>
#include <linux/kernel.h>
#include <linux/io.h>
#include <linux/platform_device.h>
@@ -23,6 +24,9 @@
#include <linux/cpu.h>
#include <linux/debugfs.h>
+/* Provides a unique ID for each genpd device */
+static DEFINE_IDA(genpd_ida);
+
#define GENPD_RETRY_MAX_MS 250 /* Approximate */
#define GENPD_DEV_CALLBACK(genpd, type, callback, dev) \
@@ -171,6 +175,7 @@ static const struct genpd_lock_ops genpd_raw_spin_ops = {
#define genpd_is_cpu_domain(genpd) (genpd->flags & GENPD_FLAG_CPU_DOMAIN)
#define genpd_is_rpm_always_on(genpd) (genpd->flags & GENPD_FLAG_RPM_ALWAYS_ON)
#define genpd_is_opp_table_fw(genpd) (genpd->flags & GENPD_FLAG_OPP_TABLE_FW)
+#define genpd_is_dev_name_fw(genpd) (genpd->flags & GENPD_FLAG_DEV_NAME_FW)
static inline bool irq_safe_dev_in_sleep_domain(struct device *dev,
const struct generic_pm_domain *genpd)
@@ -189,7 +194,7 @@ static inline bool irq_safe_dev_in_sleep_domain(struct device *dev,
if (ret)
dev_warn_once(dev, "PM domain %s will not be powered off\n",
- genpd->name);
+ dev_name(&genpd->dev));
return ret;
}
@@ -274,7 +279,7 @@ static void genpd_debug_remove(struct generic_pm_domain *genpd)
if (!genpd_debugfs_dir)
return;
- debugfs_lookup_and_remove(genpd->name, genpd_debugfs_dir);
+ debugfs_lookup_and_remove(dev_name(&genpd->dev), genpd_debugfs_dir);
}
static void genpd_update_accounting(struct generic_pm_domain *genpd)
@@ -731,7 +736,7 @@ static int _genpd_power_on(struct generic_pm_domain *genpd, bool timed)
genpd->states[state_idx].power_on_latency_ns = elapsed_ns;
genpd->gd->max_off_time_changed = true;
pr_debug("%s: Power-%s latency exceeded, new value %lld ns\n",
- genpd->name, "on", elapsed_ns);
+ dev_name(&genpd->dev), "on", elapsed_ns);
out:
raw_notifier_call_chain(&genpd->power_notifiers, GENPD_NOTIFY_ON, NULL);
@@ -782,7 +787,7 @@ static int _genpd_power_off(struct generic_pm_domain *genpd, bool timed)
genpd->states[state_idx].power_off_latency_ns = elapsed_ns;
genpd->gd->max_off_time_changed = true;
pr_debug("%s: Power-%s latency exceeded, new value %lld ns\n",
- genpd->name, "off", elapsed_ns);
+ dev_name(&genpd->dev), "off", elapsed_ns);
out:
raw_notifier_call_chain(&genpd->power_notifiers, GENPD_NOTIFY_OFF,
@@ -1940,7 +1945,7 @@ int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb)
if (ret) {
dev_warn(dev, "failed to add notifier for PM domain %s\n",
- genpd->name);
+ dev_name(&genpd->dev));
return ret;
}
@@ -1987,7 +1992,7 @@ int dev_pm_genpd_remove_notifier(struct device *dev)
if (ret) {
dev_warn(dev, "failed to remove notifier for PM domain %s\n",
- genpd->name);
+ dev_name(&genpd->dev));
return ret;
}
@@ -2013,7 +2018,7 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
*/
if (!genpd_is_irq_safe(genpd) && genpd_is_irq_safe(subdomain)) {
WARN(1, "Parent %s of subdomain %s must be IRQ safe\n",
- genpd->name, subdomain->name);
+ dev_name(&genpd->dev), subdomain->name);
return -EINVAL;
}
@@ -2088,7 +2093,7 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
if (!list_empty(&subdomain->parent_links) || subdomain->device_count) {
pr_warn("%s: unable to remove subdomain %s\n",
- genpd->name, subdomain->name);
+ dev_name(&genpd->dev), subdomain->name);
ret = -EBUSY;
goto out;
}
@@ -2225,6 +2230,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON;
genpd->device_count = 0;
genpd->provider = NULL;
+ genpd->device_id = -ENXIO;
genpd->has_provider = false;
genpd->accounting_time = ktime_get_mono_fast_ns();
genpd->domain.ops.runtime_suspend = genpd_runtime_suspend;
@@ -2265,7 +2271,18 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
return ret;
device_initialize(&genpd->dev);
- dev_set_name(&genpd->dev, "%s", genpd->name);
+
+ if (!genpd_is_dev_name_fw(genpd)) {
+ dev_set_name(&genpd->dev, "%s", genpd->name);
+ } else {
+ ret = ida_alloc(&genpd_ida, GFP_KERNEL);
+ if (ret < 0) {
+ put_device(&genpd->dev);
+ return ret;
+ }
+ genpd->device_id = ret;
+ dev_set_name(&genpd->dev, "%s_%u", genpd->name, genpd->device_id);
+ }
mutex_lock(&gpd_list_lock);
list_add(&genpd->gpd_list_node, &gpd_list);
@@ -2287,13 +2304,13 @@ static int genpd_remove(struct generic_pm_domain *genpd)
if (genpd->has_provider) {
genpd_unlock(genpd);
- pr_err("Provider present, unable to remove %s\n", genpd->name);
+ pr_err("Provider present, unable to remove %s\n", dev_name(&genpd->dev));
return -EBUSY;
}
if (!list_empty(&genpd->parent_links) || genpd->device_count) {
genpd_unlock(genpd);
- pr_err("%s: unable to remove %s\n", __func__, genpd->name);
+ pr_err("%s: unable to remove %s\n", __func__, dev_name(&genpd->dev));
return -EBUSY;
}
@@ -2307,9 +2324,11 @@ static int genpd_remove(struct generic_pm_domain *genpd)
genpd_unlock(genpd);
genpd_debug_remove(genpd);
cancel_work_sync(&genpd->power_off_work);
+ if (genpd->device_id != -ENXIO)
+ ida_free(&genpd_ida, genpd->device_id);
genpd_free_data(genpd);
- pr_debug("%s: removed %s\n", __func__, genpd->name);
+ pr_debug("%s: removed %s\n", __func__, dev_name(&genpd->dev));
return 0;
}
@@ -3272,12 +3291,12 @@ static int genpd_summary_one(struct seq_file *s,
else
snprintf(state, sizeof(state), "%s",
status_lookup[genpd->status]);
- seq_printf(s, "%-30s %-30s %u", genpd->name, state, genpd->performance_state);
+ seq_printf(s, "%-30s %-30s %u", dev_name(&genpd->dev), state, genpd->performance_state);
/*
* Modifications on the list require holding locks on both
* parent and child, so we are safe.
- * Also genpd->name is immutable.
+ * Also the device name is immutable.
*/
list_for_each_entry(link, &genpd->parent_links, parent_node) {
if (list_is_first(&link->parent_node, &genpd->parent_links))
@@ -3502,7 +3521,7 @@ static void genpd_debug_add(struct generic_pm_domain *genpd)
if (!genpd_debugfs_dir)
return;
- d = debugfs_create_dir(genpd->name, genpd_debugfs_dir);
+ d = debugfs_create_dir(dev_name(&genpd->dev), genpd_debugfs_dir);
debugfs_create_file("current_state", 0444,
d, genpd, &status_fops);
diff --git a/drivers/pmdomain/imx/imx93-blk-ctrl.c b/drivers/pmdomain/imx/imx93-blk-ctrl.c
index 904ffa5..b10348a 100644
--- a/drivers/pmdomain/imx/imx93-blk-ctrl.c
+++ b/drivers/pmdomain/imx/imx93-blk-ctrl.c
@@ -313,7 +313,9 @@ static void imx93_blk_ctrl_remove(struct platform_device *pdev)
of_genpd_del_provider(pdev->dev.of_node);
- for (i = 0; bc->onecell_data.num_domains; i++) {
+ pm_runtime_disable(&pdev->dev);
+
+ for (i = 0; i < bc->onecell_data.num_domains; i++) {
struct imx93_blk_ctrl_domain *domain = &bc->domains[i];
pm_genpd_remove(&domain->genpd);
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 42a4a996..3ed642f 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -2117,7 +2117,7 @@ int dasd_flush_device_queue(struct dasd_device *device)
case DASD_CQR_IN_IO:
rc = device->discipline->term_IO(cqr);
if (rc) {
- /* unable to terminate requeust */
+ /* unable to terminate request */
dev_err(&device->cdev->dev,
"Flushing the DASD request queue failed\n");
/* stop flush processing */
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index 6adaeb9..71d8fb8 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -855,7 +855,7 @@ dasd_delete_device(struct dasd_device *device)
dev_set_drvdata(&device->cdev->dev, NULL);
spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
- /* Removve copy relation */
+ /* Remove copy relation */
dasd_devmap_delete_copy_relation_device(device);
/*
* Drop ref_count by 3, one for the devmap reference, one for
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index 8245b74..26812ab 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -25,6 +25,7 @@
#include <linux/io.h>
#include <asm/irq.h>
#include <asm/vtoc.h>
+#include <asm/asm.h>
#include "dasd_int.h"
#include "dasd_diag.h"
@@ -67,22 +68,24 @@ static const u8 DASD_DIAG_CMS1[] = { 0xc3, 0xd4, 0xe2, 0xf1 };/* EBCDIC CMS1 */
static inline int __dia250(void *iob, int cmd)
{
union register_pair rx = { .even = (unsigned long)iob, };
+ int cc, exception;
typedef union {
struct dasd_diag_init_io init_io;
struct dasd_diag_rw_io rw_io;
} addr_type;
- int cc;
- cc = 3;
+ exception = 1;
asm volatile(
" diag %[rx],%[cmd],0x250\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b,1b)
- : [cc] "+&d" (cc), [rx] "+&d" (rx.pair), "+m" (*(addr_type *)iob)
+ : CC_OUT(cc, cc), [rx] "+d" (rx.pair),
+ "+m" (*(addr_type *)iob), [exc] "+d" (exception)
: [cmd] "d" (cmd)
- : "cc");
+ : CC_CLOBBER);
+ cc = exception ? 3 : CC_TRANSFORM(cc);
return cc | rx.odd;
}
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 90b1064..1ebe589 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -2405,7 +2405,7 @@ static int dasd_eckd_end_analysis(struct dasd_block *block)
}
if (count_area != NULL && count_area->kl == 0) {
- /* we found notthing violating our disk layout */
+ /* we found nothing violating our disk layout */
if (dasd_check_blocksize(count_area->dl) == 0)
block->bp_block = count_area->dl;
}
diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c
index 0faaa43..48e12e8 100644
--- a/drivers/s390/block/dasd_proc.c
+++ b/drivers/s390/block/dasd_proc.c
@@ -350,6 +350,7 @@ dasd_proc_init(void)
remove_proc_entry("devices", dasd_proc_root_entry);
out_nodevices:
remove_proc_entry("dasd", NULL);
+ dasd_proc_root_entry = NULL;
out_nodasd:
return -ENOENT;
}
@@ -357,7 +358,11 @@ dasd_proc_init(void)
void
dasd_proc_exit(void)
{
+ if (!dasd_proc_root_entry)
+ return;
+
remove_proc_entry("devices", dasd_proc_root_entry);
remove_proc_entry("statistics", dasd_proc_root_entry);
remove_proc_entry("dasd", NULL);
+ dasd_proc_root_entry = NULL;
}
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 02a4a51..0f14d27 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -339,7 +339,7 @@ dcssblk_shared_show(struct device *dev, struct device_attribute *attr, char *buf
struct dcssblk_dev_info *dev_info;
dev_info = container_of(dev, struct dcssblk_dev_info, dev);
- return sprintf(buf, dev_info->is_shared ? "1\n" : "0\n");
+ return sysfs_emit(buf, dev_info->is_shared ? "1\n" : "0\n");
}
static ssize_t
@@ -444,7 +444,7 @@ dcssblk_save_show(struct device *dev, struct device_attribute *attr, char *buf)
struct dcssblk_dev_info *dev_info;
dev_info = container_of(dev, struct dcssblk_dev_info, dev);
- return sprintf(buf, dev_info->save_pending ? "1\n" : "0\n");
+ return sysfs_emit(buf, dev_info->save_pending ? "1\n" : "0\n");
}
static ssize_t
@@ -506,21 +506,15 @@ static ssize_t
dcssblk_seglist_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- int i;
-
struct dcssblk_dev_info *dev_info;
struct segment_info *entry;
+ int i;
+ i = 0;
down_read(&dcssblk_devices_sem);
dev_info = container_of(dev, struct dcssblk_dev_info, dev);
- i = 0;
- buf[0] = '\0';
- list_for_each_entry(entry, &dev_info->seg_list, lh) {
- strcpy(&buf[i], entry->segment_name);
- i += strlen(entry->segment_name);
- buf[i] = '\n';
- i++;
- }
+ list_for_each_entry(entry, &dev_info->seg_list, lh)
+ i += sysfs_emit_at(buf, i, "%s\n", entry->segment_name);
up_read(&dcssblk_devices_sem);
return i;
}
diff --git a/drivers/s390/char/con3270.c b/drivers/s390/char/con3270.c
index 053102d..ae1b9aa 100644
--- a/drivers/s390/char/con3270.c
+++ b/drivers/s390/char/con3270.c
@@ -528,7 +528,7 @@ static void tty3270_update(struct timer_list *t)
u8 cmd = TC_WRITE;
int rc, len;
- wrq = xchg(&tp->write, 0);
+ wrq = xchg(&tp->write, NULL);
if (!wrq) {
tty3270_set_timer(tp, 1);
return;
@@ -746,7 +746,7 @@ static void tty3270_issue_read(struct tty3270 *tp, int lock)
struct raw3270_request *rrq;
int rc;
- rrq = xchg(&tp->read, 0);
+ rrq = xchg(&tp->read, NULL);
if (!rrq)
/* Read already scheduled. */
return;
diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h
index 6a23ec2..6c91e422 100644
--- a/drivers/s390/char/sclp.h
+++ b/drivers/s390/char/sclp.h
@@ -14,6 +14,7 @@
#include <asm/asm-extable.h>
#include <asm/sclp.h>
#include <asm/ebcdic.h>
+#include <asm/asm.h>
/* maximum number of pages concerning our own memory management */
#define MAX_KMEM_PAGES (sizeof(unsigned long) << 3)
@@ -325,19 +326,22 @@ struct read_info_sccb * __init sclp_early_get_info(void);
/* Perform service call. Return 0 on success, non-zero otherwise. */
static inline int sclp_service_call(sclp_cmdw_t command, void *sccb)
{
- int cc = 4; /* Initialize for program check handling */
+ int cc, exception;
+ exception = 1;
asm volatile(
- "0: .insn rre,0xb2200000,%1,%2\n" /* servc %1,%2 */
- "1: ipm %0\n"
- " srl %0,28\n"
+ "0: .insn rre,0xb2200000,%[cmd],%[sccb]\n" /* servc */
+ "1: lhi %[exc],0\n"
"2:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 2b)
EX_TABLE(1b, 2b)
- : "+&d" (cc) : "d" (command), "a" (__pa(sccb))
- : "cc", "memory");
- if (cc == 4)
+ : CC_OUT(cc, cc), [exc] "+d" (exception)
+ : [cmd] "d" (command), [sccb] "a" (__pa(sccb))
+ : CC_CLOBBER_LIST("memory"));
+ if (exception)
return -EINVAL;
+ cc = CC_TRANSFORM(cc);
if (cc == 3)
return -EIO;
if (cc == 2)
diff --git a/drivers/s390/char/sclp_cpi_sys.c b/drivers/s390/char/sclp_cpi_sys.c
index f60d7ea..d8f91aa 100644
--- a/drivers/s390/char/sclp_cpi_sys.c
+++ b/drivers/s390/char/sclp_cpi_sys.c
@@ -223,7 +223,7 @@ static ssize_t system_name_show(struct kobject *kobj,
int rc;
mutex_lock(&sclp_cpi_mutex);
- rc = snprintf(page, PAGE_SIZE, "%s\n", system_name);
+ rc = sysfs_emit(page, "%s\n", system_name);
mutex_unlock(&sclp_cpi_mutex);
return rc;
}
@@ -255,7 +255,7 @@ static ssize_t sysplex_name_show(struct kobject *kobj,
int rc;
mutex_lock(&sclp_cpi_mutex);
- rc = snprintf(page, PAGE_SIZE, "%s\n", sysplex_name);
+ rc = sysfs_emit(page, "%s\n", sysplex_name);
mutex_unlock(&sclp_cpi_mutex);
return rc;
}
@@ -287,7 +287,7 @@ static ssize_t system_type_show(struct kobject *kobj,
int rc;
mutex_lock(&sclp_cpi_mutex);
- rc = snprintf(page, PAGE_SIZE, "%s\n", system_type);
+ rc = sysfs_emit(page, "%s\n", system_type);
mutex_unlock(&sclp_cpi_mutex);
return rc;
}
@@ -321,7 +321,7 @@ static ssize_t system_level_show(struct kobject *kobj,
mutex_lock(&sclp_cpi_mutex);
level = system_level;
mutex_unlock(&sclp_cpi_mutex);
- return snprintf(page, PAGE_SIZE, "%#018llx\n", level);
+ return sysfs_emit(page, "%#018llx\n", level);
}
static ssize_t system_level_store(struct kobject *kobj,
diff --git a/drivers/s390/char/sclp_ocf.c b/drivers/s390/char/sclp_ocf.c
index d35f10e..ca6c526 100644
--- a/drivers/s390/char/sclp_ocf.c
+++ b/drivers/s390/char/sclp_ocf.c
@@ -101,7 +101,7 @@ static ssize_t cpc_name_show(struct kobject *kobj,
sclp_ocf_cpc_name_copy(name);
name[OCF_LENGTH_CPC_NAME] = 0;
EBCASC(name, OCF_LENGTH_CPC_NAME);
- return snprintf(page, PAGE_SIZE, "%s\n", name);
+ return sysfs_emit(page, "%s\n", name);
}
static struct kobj_attribute cpc_name_attr =
@@ -113,7 +113,7 @@ static ssize_t hmc_network_show(struct kobject *kobj,
int rc;
spin_lock_irq(&sclp_ocf_lock);
- rc = snprintf(page, PAGE_SIZE, "%s\n", hmc_network);
+ rc = sysfs_emit(page, "%s\n", hmc_network);
spin_unlock_irq(&sclp_ocf_lock);
return rc;
}
diff --git a/drivers/s390/char/sclp_pci.c b/drivers/s390/char/sclp_pci.c
index a3e5a5f..c3466a8 100644
--- a/drivers/s390/char/sclp_pci.c
+++ b/drivers/s390/char/sclp_pci.c
@@ -27,6 +27,7 @@
#define SCLP_ERRNOTIFY_AQ_RESET 0
#define SCLP_ERRNOTIFY_AQ_REPAIR 1
#define SCLP_ERRNOTIFY_AQ_INFO_LOG 2
+#define SCLP_ERRNOTIFY_AQ_OPTICS_DATA 3
static DEFINE_MUTEX(sclp_pci_mutex);
static struct sclp_register sclp_pci_event = {
@@ -116,6 +117,7 @@ static int sclp_pci_check_report(struct zpci_report_error_header *report)
case SCLP_ERRNOTIFY_AQ_RESET:
case SCLP_ERRNOTIFY_AQ_REPAIR:
case SCLP_ERRNOTIFY_AQ_INFO_LOG:
+ case SCLP_ERRNOTIFY_AQ_OPTICS_DATA:
break;
default:
return -EINVAL;
diff --git a/drivers/s390/char/tape_core.c b/drivers/s390/char/tape_core.c
index a6d2a47..ce8a440 100644
--- a/drivers/s390/char/tape_core.c
+++ b/drivers/s390/char/tape_core.c
@@ -96,7 +96,7 @@ tape_medium_state_show(struct device *dev, struct device_attribute *attr, char *
struct tape_device *tdev;
tdev = dev_get_drvdata(dev);
- return scnprintf(buf, PAGE_SIZE, "%i\n", tdev->medium_state);
+ return sysfs_emit(buf, "%i\n", tdev->medium_state);
}
static
@@ -108,7 +108,7 @@ tape_first_minor_show(struct device *dev, struct device_attribute *attr, char *b
struct tape_device *tdev;
tdev = dev_get_drvdata(dev);
- return scnprintf(buf, PAGE_SIZE, "%i\n", tdev->first_minor);
+ return sysfs_emit(buf, "%i\n", tdev->first_minor);
}
static
@@ -120,8 +120,8 @@ tape_state_show(struct device *dev, struct device_attribute *attr, char *buf)
struct tape_device *tdev;
tdev = dev_get_drvdata(dev);
- return scnprintf(buf, PAGE_SIZE, "%s\n", (tdev->first_minor < 0) ?
- "OFFLINE" : tape_state_verbose[tdev->tape_state]);
+ return sysfs_emit(buf, "%s\n", (tdev->first_minor < 0) ?
+ "OFFLINE" : tape_state_verbose[tdev->tape_state]);
}
static
@@ -135,17 +135,17 @@ tape_operation_show(struct device *dev, struct device_attribute *attr, char *buf
tdev = dev_get_drvdata(dev);
if (tdev->first_minor < 0)
- return scnprintf(buf, PAGE_SIZE, "N/A\n");
+ return sysfs_emit(buf, "N/A\n");
spin_lock_irq(get_ccwdev_lock(tdev->cdev));
if (list_empty(&tdev->req_queue))
- rc = scnprintf(buf, PAGE_SIZE, "---\n");
+ rc = sysfs_emit(buf, "---\n");
else {
struct tape_request *req;
req = list_entry(tdev->req_queue.next, struct tape_request,
list);
- rc = scnprintf(buf,PAGE_SIZE, "%s\n", tape_op_verbose[req->op]);
+ rc = sysfs_emit(buf, "%s\n", tape_op_verbose[req->op]);
}
spin_unlock_irq(get_ccwdev_lock(tdev->cdev));
return rc;
@@ -161,7 +161,7 @@ tape_blocksize_show(struct device *dev, struct device_attribute *attr, char *buf
tdev = dev_get_drvdata(dev);
- return scnprintf(buf, PAGE_SIZE, "%i\n", tdev->char_data.block_size);
+ return sysfs_emit(buf, "%i\n", tdev->char_data.block_size);
}
static
diff --git a/drivers/s390/char/uvdevice.c b/drivers/s390/char/uvdevice.c
index f598edc..2b83fb6 100644
--- a/drivers/s390/char/uvdevice.c
+++ b/drivers/s390/char/uvdevice.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright IBM Corp. 2022
+ * Copyright IBM Corp. 2022, 2024
* Author(s): Steffen Eiden <seiden@linux.ibm.com>
*
* This file provides a Linux misc device to give userspace access to some
@@ -40,6 +40,7 @@ static const u32 ioctl_nr_to_uvc_bit[] __initconst = {
[UVIO_IOCTL_ADD_SECRET_NR] = BIT_UVC_CMD_ADD_SECRET,
[UVIO_IOCTL_LIST_SECRETS_NR] = BIT_UVC_CMD_LIST_SECRETS,
[UVIO_IOCTL_LOCK_SECRETS_NR] = BIT_UVC_CMD_LOCK_SECRETS,
+ [UVIO_IOCTL_RETR_SECRET_NR] = BIT_UVC_CMD_RETR_ATTEST,
};
static_assert(ARRAY_SIZE(ioctl_nr_to_uvc_bit) == UVIO_IOCTL_NUM_IOCTLS);
@@ -62,11 +63,13 @@ static void __init set_supp_uv_cmds(unsigned long *supp_uv_cmds)
}
/**
- * uvio_uvdev_info() - get information about the uvdevice
+ * uvio_uvdev_info() - Get information about the uvdevice
*
* @uv_ioctl: ioctl control block
*
* Lists all IOCTLs that are supported by this uvdevice
+ *
+ * Return: 0 on success or a negative error code on error
*/
static int uvio_uvdev_info(struct uvio_ioctl_cb *uv_ioctl)
{
@@ -177,7 +180,7 @@ static int get_uvio_attest(struct uvio_ioctl_cb *uv_ioctl, struct uvio_attest *u
*
* Context: might sleep
*
- * Return: 0 on success or a negative error code on error.
+ * Return: 0 on success or a negative error code on error
*/
static int uvio_attestation(struct uvio_ioctl_cb *uv_ioctl)
{
@@ -237,7 +240,8 @@ static int uvio_attestation(struct uvio_ioctl_cb *uv_ioctl)
return ret;
}
-/** uvio_add_secret() - perform an Add Secret UVC
+/**
+ * uvio_add_secret() - Perform an Add Secret UVC
*
* @uv_ioctl: ioctl control block
*
@@ -260,7 +264,7 @@ static int uvio_attestation(struct uvio_ioctl_cb *uv_ioctl)
*
* Context: might sleep
*
- * Return: 0 on success or a negative error code on error.
+ * Return: 0 on success or a negative error code on error
*/
static int uvio_add_secret(struct uvio_ioctl_cb *uv_ioctl)
{
@@ -296,7 +300,44 @@ static int uvio_add_secret(struct uvio_ioctl_cb *uv_ioctl)
return ret;
}
-/** uvio_list_secrets() - perform a List Secret UVC
+/*
+ * Do the actual secret list creation. Calls the list secrets UVC until there
+ * is no more space in the user buffer, or the list ends.
+ */
+static int uvio_get_list(void *zpage, struct uvio_ioctl_cb *uv_ioctl)
+{
+ const size_t data_off = offsetof(struct uv_secret_list, secrets);
+ u8 __user *user_buf = (u8 __user *)uv_ioctl->argument_addr;
+ struct uv_secret_list *list = zpage;
+ u16 num_secrets_stored = 0;
+ size_t user_off = data_off;
+ size_t copy_len;
+
+ do {
+ uv_list_secrets(list, list->next_secret_idx, &uv_ioctl->uv_rc,
+ &uv_ioctl->uv_rrc);
+ if (uv_ioctl->uv_rc != UVC_RC_EXECUTED &&
+ uv_ioctl->uv_rc != UVC_RC_MORE_DATA)
+ break;
+
+ copy_len = sizeof(list->secrets[0]) * list->num_secr_stored;
+ if (copy_to_user(user_buf + user_off, list->secrets, copy_len))
+ return -EFAULT;
+
+ user_off += copy_len;
+ num_secrets_stored += list->num_secr_stored;
+ } while (uv_ioctl->uv_rc == UVC_RC_MORE_DATA &&
+ user_off + sizeof(*list) <= uv_ioctl->argument_len);
+
+ list->num_secr_stored = num_secrets_stored;
+ if (copy_to_user(user_buf, list, data_off))
+ return -EFAULT;
+ return 0;
+}
+
+/**
+ * uvio_list_secrets() - Perform a List Secret UVC
+ *
* @uv_ioctl: ioctl control block
*
* uvio_list_secrets() performs the List Secret Ultravisor Call. It verifies
@@ -307,45 +348,43 @@ static int uvio_add_secret(struct uvio_ioctl_cb *uv_ioctl)
*
* The argument specifies the location for the result of the UV-Call.
*
+ * Argument length must be a multiple of a page.
+ * The list secrets IOCTL will call the list UVC multiple times and fill
+ * the provided user-buffer with list elements until either the list ends or
+ * the buffer is full. The list header is merged over all list header from the
+ * individual UVCs.
+ *
* If the List Secrets UV facility is not present, UV will return invalid
* command rc. This won't be fenced in the driver and does not result in a
* negative return value.
*
* Context: might sleep
*
- * Return: 0 on success or a negative error code on error.
+ * Return: 0 on success or a negative error code on error
*/
static int uvio_list_secrets(struct uvio_ioctl_cb *uv_ioctl)
{
- void __user *user_buf_arg = (void __user *)uv_ioctl->argument_addr;
- struct uv_cb_guest_addr uvcb = {
- .header.len = sizeof(uvcb),
- .header.cmd = UVC_CMD_LIST_SECRETS,
- };
- void *secrets = NULL;
- int ret = 0;
+ void *zpage;
+ int rc;
- if (uv_ioctl->argument_len != UVIO_LIST_SECRETS_LEN)
+ if (uv_ioctl->argument_len == 0 ||
+ uv_ioctl->argument_len % UVIO_LIST_SECRETS_LEN != 0)
return -EINVAL;
- secrets = kvzalloc(UVIO_LIST_SECRETS_LEN, GFP_KERNEL);
- if (!secrets)
+ zpage = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!zpage)
return -ENOMEM;
- uvcb.addr = (u64)secrets;
- uv_call_sched(0, (u64)&uvcb);
- uv_ioctl->uv_rc = uvcb.header.rc;
- uv_ioctl->uv_rrc = uvcb.header.rrc;
+ rc = uvio_get_list(zpage, uv_ioctl);
- if (copy_to_user(user_buf_arg, secrets, UVIO_LIST_SECRETS_LEN))
- ret = -EFAULT;
-
- kvfree(secrets);
- return ret;
+ free_page((unsigned long)zpage);
+ return rc;
}
-/** uvio_lock_secrets() - perform a Lock Secret Store UVC
- * @uv_ioctl: ioctl control block
+/**
+ * uvio_lock_secrets() - Perform a Lock Secret Store UVC
+ *
+ * @ioctl: ioctl control block
*
* uvio_lock_secrets() performs the Lock Secret Store Ultravisor Call. It
* performs the UV-call and copies the return codes to the ioctl control block.
@@ -360,7 +399,7 @@ static int uvio_list_secrets(struct uvio_ioctl_cb *uv_ioctl)
*
* Context: might sleep
*
- * Return: 0 on success or a negative error code on error.
+ * Return: 0 on success or a negative error code on error
*/
static int uvio_lock_secrets(struct uvio_ioctl_cb *ioctl)
{
@@ -379,6 +418,59 @@ static int uvio_lock_secrets(struct uvio_ioctl_cb *ioctl)
return 0;
}
+/**
+ * uvio_retr_secret() - Perform a retrieve secret UVC
+ *
+ * @uv_ioctl: ioctl control block.
+ *
+ * uvio_retr_secret() performs the Retrieve Secret Ultravisor Call.
+ * The first two bytes of the argument specify the index of the secret to be
+ * retrieved. The retrieved secret is copied into the argument buffer if there
+ * is enough space.
+ * The argument length must be at least two bytes and at max 8192 bytes.
+ *
+ * Context: might sleep
+ *
+ * Return: 0 on success or a negative error code on error
+ */
+static int uvio_retr_secret(struct uvio_ioctl_cb *uv_ioctl)
+{
+ u16 __user *user_index = (u16 __user *)uv_ioctl->argument_addr;
+ struct uv_cb_retr_secr uvcb = {
+ .header.len = sizeof(uvcb),
+ .header.cmd = UVC_CMD_RETR_SECRET,
+ };
+ u32 buf_len = uv_ioctl->argument_len;
+ void *buf = NULL;
+ int ret;
+
+ if (buf_len > UVIO_RETR_SECRET_MAX_LEN || buf_len < sizeof(*user_index))
+ return -EINVAL;
+
+ buf = kvzalloc(buf_len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ ret = -EFAULT;
+ if (get_user(uvcb.secret_idx, user_index))
+ goto err;
+
+ uvcb.buf_addr = (u64)buf;
+ uvcb.buf_size = buf_len;
+ uv_call_sched(0, (u64)&uvcb);
+
+ if (copy_to_user((__user void *)uv_ioctl->argument_addr, buf, buf_len))
+ goto err;
+
+ ret = 0;
+ uv_ioctl->uv_rc = uvcb.header.rc;
+ uv_ioctl->uv_rrc = uvcb.header.rrc;
+
+err:
+ kvfree_sensitive(buf, buf_len);
+ return ret;
+}
+
static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *argp,
unsigned long cmd)
{
@@ -432,6 +524,9 @@ static long uvio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case UVIO_IOCTL_LOCK_SECRETS_NR:
ret = uvio_lock_secrets(&uv_ioctl);
break;
+ case UVIO_IOCTL_RETR_SECRET_NR:
+ ret = uvio_retr_secret(&uv_ioctl);
+ break;
default:
ret = -ENOIOCTLCMD;
break;
diff --git a/drivers/s390/char/vmlogrdr.c b/drivers/s390/char/vmlogrdr.c
index bd5cecc..3dd50ac9 100644
--- a/drivers/s390/char/vmlogrdr.c
+++ b/drivers/s390/char/vmlogrdr.c
@@ -531,7 +531,7 @@ static ssize_t vmlogrdr_autopurge_show(struct device *dev,
char *buf)
{
struct vmlogrdr_priv_t *priv = dev_get_drvdata(dev);
- return sprintf(buf, "%u\n", priv->autopurge);
+ return sysfs_emit(buf, "%u\n", priv->autopurge);
}
@@ -605,7 +605,7 @@ static ssize_t vmlogrdr_autorecording_show(struct device *dev,
char *buf)
{
struct vmlogrdr_priv_t *priv = dev_get_drvdata(dev);
- return sprintf(buf, "%u\n", priv->autorecording);
+ return sysfs_emit(buf, "%u\n", priv->autorecording);
}
diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c
index fe94dec..90ba7a2 100644
--- a/drivers/s390/char/vmur.c
+++ b/drivers/s390/char/vmur.c
@@ -345,7 +345,7 @@ static ssize_t ur_attr_reclen_show(struct device *dev,
urd = urdev_get_from_cdev(to_ccwdev(dev));
if (!urd)
return -ENODEV;
- rc = sprintf(buf, "%zu\n", urd->reclen);
+ rc = sysfs_emit(buf, "%zu\n", urd->reclen);
urdev_put(urd);
return rc;
}
diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c
index 66b1bdc..7bcf8b9 100644
--- a/drivers/s390/cio/ccwgroup.c
+++ b/drivers/s390/cio/ccwgroup.c
@@ -147,7 +147,7 @@ static ssize_t ccwgroup_online_show(struct device *dev,
online = (gdev->state == CCWGROUP_ONLINE) ? 1 : 0;
- return scnprintf(buf, PAGE_SIZE, "%d\n", online);
+ return sysfs_emit(buf, "%d\n", online);
}
/*
diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c
index a07bbec..cba2d04 100644
--- a/drivers/s390/cio/chp.c
+++ b/drivers/s390/cio/chp.c
@@ -144,6 +144,18 @@ static ssize_t measurement_chars_read(struct file *filp, struct kobject *kobj,
}
static BIN_ATTR_ADMIN_RO(measurement_chars, sizeof(struct cmg_chars));
+static ssize_t measurement_chars_full_read(struct file *filp,
+ struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ struct channel_path *chp = to_channelpath(kobj_to_dev(kobj));
+
+ return memory_read_from_buffer(buf, count, &off, &chp->cmcb,
+ sizeof(chp->cmcb));
+}
+static BIN_ATTR_ADMIN_RO(measurement_chars_full, sizeof(struct cmg_cmcb));
+
static ssize_t chp_measurement_copy_block(void *buf, loff_t off, size_t count,
struct kobject *kobj, bool extended)
{
@@ -201,6 +213,7 @@ static BIN_ATTR_ADMIN_RO(ext_measurement, sizeof(struct cmg_ext_entry));
static struct bin_attribute *measurement_attrs[] = {
&bin_attr_measurement_chars,
+ &bin_attr_measurement_chars_full,
&bin_attr_measurement,
&bin_attr_ext_measurement,
NULL,
@@ -230,7 +243,7 @@ static ssize_t chp_status_show(struct device *dev,
status = chp->state;
mutex_unlock(&chp->lock);
- return status ? sprintf(buf, "online\n") : sprintf(buf, "offline\n");
+ return status ? sysfs_emit(buf, "online\n") : sysfs_emit(buf, "offline\n");
}
static ssize_t chp_status_write(struct device *dev,
@@ -311,7 +324,7 @@ static ssize_t chp_type_show(struct device *dev, struct device_attribute *attr,
mutex_lock(&chp->lock);
type = chp->desc.desc;
mutex_unlock(&chp->lock);
- return sprintf(buf, "%x\n", type);
+ return sysfs_emit(buf, "%x\n", type);
}
static DEVICE_ATTR(type, 0444, chp_type_show, NULL);
@@ -324,8 +337,8 @@ static ssize_t chp_cmg_show(struct device *dev, struct device_attribute *attr,
if (!chp)
return 0;
if (chp->cmg == -1) /* channel measurements not available */
- return sprintf(buf, "unknown\n");
- return sprintf(buf, "%d\n", chp->cmg);
+ return sysfs_emit(buf, "unknown\n");
+ return sysfs_emit(buf, "%d\n", chp->cmg);
}
static DEVICE_ATTR(cmg, 0444, chp_cmg_show, NULL);
@@ -338,8 +351,8 @@ static ssize_t chp_shared_show(struct device *dev,
if (!chp)
return 0;
if (chp->shared == -1) /* channel measurements not available */
- return sprintf(buf, "unknown\n");
- return sprintf(buf, "%x\n", chp->shared);
+ return sysfs_emit(buf, "unknown\n");
+ return sysfs_emit(buf, "%x\n", chp->shared);
}
static DEVICE_ATTR(shared, 0444, chp_shared_show, NULL);
@@ -352,7 +365,7 @@ static ssize_t chp_chid_show(struct device *dev, struct device_attribute *attr,
mutex_lock(&chp->lock);
if (chp->desc_fmt1.flags & 0x10)
- rc = sprintf(buf, "%04x\n", chp->desc_fmt1.chid);
+ rc = sysfs_emit(buf, "%04x\n", chp->desc_fmt1.chid);
else
rc = 0;
mutex_unlock(&chp->lock);
@@ -369,7 +382,7 @@ static ssize_t chp_chid_external_show(struct device *dev,
mutex_lock(&chp->lock);
if (chp->desc_fmt1.flags & 0x10)
- rc = sprintf(buf, "%x\n", chp->desc_fmt1.flags & 0x8 ? 1 : 0);
+ rc = sysfs_emit(buf, "%x\n", chp->desc_fmt1.flags & 0x8 ? 1 : 0);
else
rc = 0;
mutex_unlock(&chp->lock);
@@ -385,7 +398,7 @@ static ssize_t chp_esc_show(struct device *dev,
ssize_t rc;
mutex_lock(&chp->lock);
- rc = sprintf(buf, "%x\n", chp->desc_fmt1.esc);
+ rc = sysfs_emit(buf, "%x\n", chp->desc_fmt1.esc);
mutex_unlock(&chp->lock);
return rc;
diff --git a/drivers/s390/cio/chp.h b/drivers/s390/cio/chp.h
index a15324a..391b52a 100644
--- a/drivers/s390/cio/chp.h
+++ b/drivers/s390/cio/chp.h
@@ -54,6 +54,7 @@ struct channel_path {
int extended;
unsigned long speed;
struct cmg_chars cmg_chars;
+ struct cmg_cmcb cmcb;
};
/* Return channel_path struct for given chpid. */
diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c
index dcc1e1c..e646231 100644
--- a/drivers/s390/cio/chsc.c
+++ b/drivers/s390/cio/chsc.c
@@ -376,7 +376,7 @@ struct lir {
#define PARAMS_LEN 10 /* PARAMS=xx,xxxxxx */
#define NODEID_LEN 35 /* NODEID=tttttt/mdl,mmm.ppssssssssssss,xxxx */
-/* Copy EBCIDC text, convert to ASCII and optionally add delimiter. */
+/* Copy EBCDIC text, convert to ASCII and optionally add delimiter. */
static char *store_ebcdic(char *dest, const char *src, unsigned long len,
char delim)
{
@@ -1092,19 +1092,7 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp)
u32 zeroes1;
struct chsc_header response;
u32 zeroes2;
- u32 not_valid : 1;
- u32 shared : 1;
- u32 extended : 1;
- u32 : 21;
- u32 chpid : 8;
- u32 cmcv : 5;
- u32 : 7;
- u32 cmgp : 4;
- u32 cmgq : 8;
- u32 cmg : 8;
- u32 : 16;
- u32 cmgs : 16;
- u32 data[NR_MEASUREMENT_CHARS];
+ struct cmg_cmcb cmcb;
} *scmc_area;
chp->shared = -1;
@@ -1135,15 +1123,16 @@ int chsc_get_channel_measurement_chars(struct channel_path *chp)
scmc_area->response.code);
goto out;
}
- if (scmc_area->not_valid)
+ chp->cmcb = scmc_area->cmcb;
+ if (scmc_area->cmcb.not_valid)
goto out;
- chp->cmg = scmc_area->cmg;
- chp->shared = scmc_area->shared;
- chp->extended = scmc_area->extended;
- chp->speed = scmc_get_speed(scmc_area->cmgs, scmc_area->cmgp);
- chsc_initialize_cmg_chars(chp, scmc_area->cmcv,
- (struct cmg_chars *) &scmc_area->data);
+ chp->cmg = scmc_area->cmcb.cmg;
+ chp->shared = scmc_area->cmcb.shared;
+ chp->extended = scmc_area->cmcb.extended;
+ chp->speed = scmc_get_speed(scmc_area->cmcb.cmgs, scmc_area->cmcb.cmgp);
+ chsc_initialize_cmg_chars(chp, scmc_area->cmcb.cmcv,
+ (struct cmg_chars *)&scmc_area->cmcb.data);
out:
spin_unlock_irqrestore(&chsc_page_lock, flags);
return ret;
diff --git a/drivers/s390/cio/chsc.h b/drivers/s390/cio/chsc.h
index 24cd65d..6fe983e 100644
--- a/drivers/s390/cio/chsc.h
+++ b/drivers/s390/cio/chsc.h
@@ -17,6 +17,22 @@ struct cmg_chars {
u32 values[NR_MEASUREMENT_CHARS];
};
+struct cmg_cmcb {
+ u32 not_valid : 1;
+ u32 shared : 1;
+ u32 extended : 1;
+ u32 : 21;
+ u32 chpid : 8;
+ u32 cmcv : 5;
+ u32 : 7;
+ u32 cmgp : 4;
+ u32 cmgq : 8;
+ u32 cmg : 8;
+ u32 : 16;
+ u32 cmgs : 16;
+ u32 data[NR_MEASUREMENT_CHARS];
+};
+
#define NR_MEASUREMENT_ENTRIES 8
struct cmg_entry {
u32 values[NR_MEASUREMENT_ENTRIES];
diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c
index c32e818..ad17ab0 100644
--- a/drivers/s390/cio/cio.c
+++ b/drivers/s390/cio/cio.c
@@ -459,10 +459,14 @@ int cio_update_schib(struct subchannel *sch)
{
struct schib schib;
- if (stsch(sch->schid, &schib) || !css_sch_is_valid(&schib))
+ if (stsch(sch->schid, &schib))
return -ENODEV;
memcpy(&sch->schib, &schib, sizeof(schib));
+
+ if (!css_sch_is_valid(&schib))
+ return -EACCES;
+
return 0;
}
EXPORT_SYMBOL_GPL(cio_update_schib);
diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h
index a9057a5..08a5e93 100644
--- a/drivers/s390/cio/cio.h
+++ b/drivers/s390/cio/cio.h
@@ -19,7 +19,7 @@ struct pmcw {
u32 intparm; /* interruption parameter */
u32 qf : 1; /* qdio facility */
u32 w : 1;
- u32 isc : 3; /* interruption sublass */
+ u32 isc : 3; /* interruption subclass */
u32 res5 : 3; /* reserved zeros */
u32 ena : 1; /* enabled */
u32 lm : 2; /* limit mode */
diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c
index f80dc18..fdab760 100644
--- a/drivers/s390/cio/cmf.c
+++ b/drivers/s390/cio/cmf.c
@@ -46,7 +46,7 @@
/* indices for READCMB */
enum cmb_index {
avg_utilization = -1,
- /* basic and exended format: */
+ /* basic and extended format: */
cmb_ssch_rsch_count = 0,
cmb_sample_count,
cmb_device_connect_time,
@@ -135,7 +135,7 @@ static inline u64 time_to_nsec(u32 value)
* Users are usually interested in average times,
* not accumulated time.
* This also helps us with atomicity problems
- * when reading sinlge values.
+ * when reading single values.
*/
static inline u64 time_to_avg_nsec(u32 value, u32 count)
{
@@ -977,8 +977,7 @@ static struct cmb_operations cmbops_extended = {
static ssize_t cmb_show_attr(struct device *dev, char *buf, enum cmb_index idx)
{
- return sprintf(buf, "%lld\n",
- (unsigned long long) cmf_read(to_ccwdev(dev), idx));
+ return sysfs_emit(buf, "%lld\n", cmf_read(to_ccwdev(dev), idx));
}
static ssize_t cmb_show_avg_sample_interval(struct device *dev,
@@ -998,7 +997,7 @@ static ssize_t cmb_show_avg_sample_interval(struct device *dev,
} else
interval = -1;
spin_unlock_irq(cdev->ccwlock);
- return sprintf(buf, "%ld\n", interval);
+ return sysfs_emit(buf, "%ld\n", interval);
}
static ssize_t cmb_show_avg_utilization(struct device *dev,
@@ -1007,7 +1006,7 @@ static ssize_t cmb_show_avg_utilization(struct device *dev,
{
unsigned long u = cmf_read(to_ccwdev(dev), avg_utilization);
- return sprintf(buf, "%02lu.%01lu%%\n", u / 10, u % 10);
+ return sysfs_emit(buf, "%02lu.%01lu%%\n", u / 10, u % 10);
}
#define cmf_attr(name) \
@@ -1080,7 +1079,7 @@ static ssize_t cmb_enable_show(struct device *dev,
{
struct ccw_device *cdev = to_ccwdev(dev);
- return sprintf(buf, "%d\n", cmf_enabled(cdev));
+ return sysfs_emit(buf, "%d\n", cmf_enabled(cdev));
}
static ssize_t cmb_enable_store(struct device *dev,
@@ -1227,7 +1226,7 @@ int cmf_readall(struct ccw_device *cdev, struct cmbdata *data)
return cmbops->readall(cdev, data);
}
-/* Reenable cmf when a disconnected device becomes available again. */
+/* Re-enable cmf when a disconnected device becomes available again. */
int cmf_reenable(struct ccw_device *cdev)
{
cmbops->reset(cdev);
diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c
index 7b59d20..be78a57 100644
--- a/drivers/s390/cio/css.c
+++ b/drivers/s390/cio/css.c
@@ -380,11 +380,11 @@ static ssize_t chpids_show(struct device *dev,
for (chp = 0; chp < 8; chp++) {
mask = 0x80 >> chp;
if (ssd->path_mask & mask)
- ret += sprintf(buf + ret, "%02x ", ssd->chpid[chp].id);
+ ret += sysfs_emit_at(buf, ret, "%02x ", ssd->chpid[chp].id);
else
- ret += sprintf(buf + ret, "00 ");
+ ret += sysfs_emit_at(buf, ret, "00 ");
}
- ret += sprintf(buf + ret, "\n");
+ ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}
static DEVICE_ATTR_RO(chpids);
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index b0f2324..fb2c07c 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -201,10 +201,9 @@ devtype_show (struct device *dev, struct device_attribute *attr, char *buf)
struct ccw_device_id *id = &(cdev->id);
if (id->dev_type != 0)
- return sprintf(buf, "%04x/%02x\n",
- id->dev_type, id->dev_model);
+ return sysfs_emit(buf, "%04x/%02x\n", id->dev_type, id->dev_model);
else
- return sprintf(buf, "n/a\n");
+ return sysfs_emit(buf, "n/a\n");
}
static ssize_t
@@ -213,8 +212,7 @@ cutype_show (struct device *dev, struct device_attribute *attr, char *buf)
struct ccw_device *cdev = to_ccwdev(dev);
struct ccw_device_id *id = &(cdev->id);
- return sprintf(buf, "%04x/%02x\n",
- id->cu_type, id->cu_model);
+ return sysfs_emit(buf, "%04x/%02x\n", id->cu_type, id->cu_model);
}
static ssize_t
@@ -234,7 +232,7 @@ online_show (struct device *dev, struct device_attribute *attr, char *buf)
{
struct ccw_device *cdev = to_ccwdev(dev);
- return sprintf(buf, cdev->online ? "1\n" : "0\n");
+ return sysfs_emit(buf, cdev->online ? "1\n" : "0\n");
}
int ccw_device_is_orphan(struct ccw_device *cdev)
@@ -546,21 +544,21 @@ available_show (struct device *dev, struct device_attribute *attr, char *buf)
struct subchannel *sch;
if (ccw_device_is_orphan(cdev))
- return sprintf(buf, "no device\n");
+ return sysfs_emit(buf, "no device\n");
switch (cdev->private->state) {
case DEV_STATE_BOXED:
- return sprintf(buf, "boxed\n");
+ return sysfs_emit(buf, "boxed\n");
case DEV_STATE_DISCONNECTED:
case DEV_STATE_DISCONNECTED_SENSE_ID:
case DEV_STATE_NOT_OPER:
sch = to_subchannel(dev->parent);
if (!sch->lpm)
- return sprintf(buf, "no path\n");
+ return sysfs_emit(buf, "no path\n");
else
- return sprintf(buf, "no device\n");
+ return sysfs_emit(buf, "no device\n");
default:
/* All other states considered fine. */
- return sprintf(buf, "good\n");
+ return sysfs_emit(buf, "good\n");
}
}
@@ -587,7 +585,7 @@ static ssize_t vpm_show(struct device *dev, struct device_attribute *attr,
{
struct subchannel *sch = to_subchannel(dev);
- return sprintf(buf, "%02x\n", sch->vpm);
+ return sysfs_emit(buf, "%02x\n", sch->vpm);
}
static DEVICE_ATTR_RO(devtype);
@@ -1387,14 +1385,18 @@ enum io_sch_action {
IO_SCH_VERIFY,
IO_SCH_DISC,
IO_SCH_NOP,
+ IO_SCH_ORPH_CDEV,
};
static enum io_sch_action sch_get_action(struct subchannel *sch)
{
struct ccw_device *cdev;
+ int rc;
cdev = sch_get_cdev(sch);
- if (cio_update_schib(sch)) {
+ rc = cio_update_schib(sch);
+
+ if (rc == -ENODEV) {
/* Not operational. */
if (!cdev)
return IO_SCH_UNREG;
@@ -1402,6 +1404,16 @@ static enum io_sch_action sch_get_action(struct subchannel *sch)
return IO_SCH_UNREG;
return IO_SCH_ORPH_UNREG;
}
+
+ /* Avoid unregistering subchannels without working device. */
+ if (rc == -EACCES) {
+ if (!cdev)
+ return IO_SCH_NOP;
+ if (ccw_device_notify(cdev, CIO_GONE) != NOTIFY_OK)
+ return IO_SCH_UNREG_CDEV;
+ return IO_SCH_ORPH_CDEV;
+ }
+
/* Operational. */
if (!cdev)
return IO_SCH_ATTACH;
@@ -1471,6 +1483,7 @@ static int io_subchannel_sch_event(struct subchannel *sch, int process)
rc = 0;
goto out_unlock;
case IO_SCH_ORPH_UNREG:
+ case IO_SCH_ORPH_CDEV:
case IO_SCH_ORPH_ATTACH:
ccw_device_set_disconnected(cdev);
break;
@@ -1502,6 +1515,7 @@ static int io_subchannel_sch_event(struct subchannel *sch, int process)
/* Handle attached ccw device. */
switch (action) {
case IO_SCH_ORPH_UNREG:
+ case IO_SCH_ORPH_CDEV:
case IO_SCH_ORPH_ATTACH:
/* Move ccw device to orphanage. */
rc = ccw_device_move_to_orph(cdev);
diff --git a/drivers/s390/cio/ioasm.c b/drivers/s390/cio/ioasm.c
index acf1edd..5ff1e51 100644
--- a/drivers/s390/cio/ioasm.c
+++ b/drivers/s390/cio/ioasm.c
@@ -8,6 +8,7 @@
#include <asm/asm-extable.h>
#include <asm/chpid.h>
#include <asm/schid.h>
+#include <asm/asm.h>
#include <asm/crw.h>
#include "ioasm.h"
@@ -18,19 +19,20 @@
static inline int __stsch(struct subchannel_id schid, struct schib *addr)
{
unsigned long r1 = *(unsigned int *)&schid;
- int ccode = -EIO;
+ int ccode, exception;
+ exception = 1;
asm volatile(
" lgr 1,%[r1]\n"
" stsch %[addr]\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+&d" (ccode), [addr] "=Q" (*addr)
+ : CC_OUT(cc, ccode), [addr] "=Q" (*addr), [exc] "+d" (exception)
: [r1] "d" (r1)
- : "cc", "1");
- return ccode;
+ : CC_CLOBBER_LIST("1"));
+ return exception ? -EIO : CC_TRANSFORM(ccode);
}
int stsch(struct subchannel_id schid, struct schib *addr)
@@ -47,19 +49,20 @@ EXPORT_SYMBOL(stsch);
static inline int __msch(struct subchannel_id schid, struct schib *addr)
{
unsigned long r1 = *(unsigned int *)&schid;
- int ccode = -EIO;
+ int ccode, exception;
+ exception = 1;
asm volatile(
" lgr 1,%[r1]\n"
" msch %[addr]\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+&d" (ccode)
+ : CC_OUT(cc, ccode), [exc] "+d" (exception)
: [r1] "d" (r1), [addr] "Q" (*addr)
- : "cc", "1");
- return ccode;
+ : CC_CLOBBER_LIST("1"));
+ return exception ? -EIO : CC_TRANSFORM(ccode);
}
int msch(struct subchannel_id schid, struct schib *addr)
@@ -80,12 +83,11 @@ static inline int __tsch(struct subchannel_id schid, struct irb *addr)
asm volatile(
" lgr 1,%[r1]\n"
" tsch %[addr]\n"
- " ipm %[cc]\n"
- " srl %[cc],28"
- : [cc] "=&d" (ccode), [addr] "=Q" (*addr)
+ CC_IPM(cc)
+ : CC_OUT(cc, ccode), [addr] "=Q" (*addr)
: [r1] "d" (r1)
- : "cc", "1");
- return ccode;
+ : CC_CLOBBER_LIST("1"));
+ return CC_TRANSFORM(ccode);
}
int tsch(struct subchannel_id schid, struct irb *addr)
@@ -101,19 +103,20 @@ int tsch(struct subchannel_id schid, struct irb *addr)
static inline int __ssch(struct subchannel_id schid, union orb *addr)
{
unsigned long r1 = *(unsigned int *)&schid;
- int ccode = -EIO;
+ int ccode, exception;
+ exception = 1;
asm volatile(
" lgr 1,%[r1]\n"
" ssch %[addr]\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+&d" (ccode)
+ : CC_OUT(cc, ccode), [exc] "+d" (exception)
: [r1] "d" (r1), [addr] "Q" (*addr)
- : "cc", "memory", "1");
- return ccode;
+ : CC_CLOBBER_LIST("memory", "1"));
+ return CC_TRANSFORM(ccode);
}
int ssch(struct subchannel_id schid, union orb *addr)
@@ -135,12 +138,11 @@ static inline int __csch(struct subchannel_id schid)
asm volatile(
" lgr 1,%[r1]\n"
" csch\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=&d" (ccode)
+ CC_IPM(cc)
+ : CC_OUT(cc, ccode)
: [r1] "d" (r1)
- : "cc", "1");
- return ccode;
+ : CC_CLOBBER_LIST("1"));
+ return CC_TRANSFORM(ccode);
}
int csch(struct subchannel_id schid)
@@ -160,11 +162,11 @@ int tpi(struct tpi_info *addr)
asm volatile(
" tpi %[addr]\n"
- " ipm %[cc]\n"
- " srl %[cc],28"
- : [cc] "=&d" (ccode), [addr] "=Q" (*addr)
+ CC_IPM(cc)
+ : CC_OUT(cc, ccode), [addr] "=Q" (*addr)
:
- : "cc");
+ : CC_CLOBBER);
+ ccode = CC_TRANSFORM(ccode);
trace_s390_cio_tpi(addr, ccode);
return ccode;
@@ -173,17 +175,19 @@ int tpi(struct tpi_info *addr)
int chsc(void *chsc_area)
{
typedef struct { char _[4096]; } addr_type;
- int cc = -EIO;
+ int cc, exception;
+ exception = 1;
asm volatile(
" .insn rre,0xb25f0000,%[chsc_area],0\n"
- "0: ipm %[cc]\n"
- " srl %[cc],28\n"
+ "0: lhi %[exc],0\n"
"1:\n"
+ CC_IPM(cc)
EX_TABLE(0b, 1b)
- : [cc] "+&d" (cc), "+m" (*(addr_type *)chsc_area)
+ : CC_OUT(cc, cc), "+m" (*(addr_type *)chsc_area), [exc] "+d" (exception)
: [chsc_area] "d" (chsc_area)
- : "cc");
+ : CC_CLOBBER);
+ cc = exception ? -EIO : CC_TRANSFORM(cc);
trace_s390_cio_chsc(chsc_area, cc);
return cc;
@@ -198,12 +202,11 @@ static inline int __rsch(struct subchannel_id schid)
asm volatile(
" lgr 1,%[r1]\n"
" rsch\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=&d" (ccode)
+ CC_IPM(cc)
+ : CC_OUT(cc, ccode)
: [r1] "d" (r1)
- : "cc", "memory", "1");
- return ccode;
+ : CC_CLOBBER_LIST("memory", "1"));
+ return CC_TRANSFORM(ccode);
}
int rsch(struct subchannel_id schid)
@@ -224,12 +227,11 @@ static inline int __hsch(struct subchannel_id schid)
asm volatile(
" lgr 1,%[r1]\n"
" hsch\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=&d" (ccode)
+ CC_IPM(cc)
+ : CC_OUT(cc, ccode)
: [r1] "d" (r1)
- : "cc", "1");
- return ccode;
+ : CC_CLOBBER_LIST("1"));
+ return CC_TRANSFORM(ccode);
}
int hsch(struct subchannel_id schid)
@@ -256,7 +258,7 @@ static inline int __xsch(struct subchannel_id schid)
: [cc] "=&d" (ccode)
: [r1] "d" (r1)
: "cc", "1");
- return ccode;
+ return CC_TRANSFORM(ccode);
}
int xsch(struct subchannel_id schid)
@@ -275,12 +277,11 @@ static inline int __stcrw(struct crw *crw)
asm volatile(
" stcrw %[crw]\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=&d" (ccode), [crw] "=Q" (*crw)
+ CC_IPM(cc)
+ : CC_OUT(cc, ccode), [crw] "=Q" (*crw)
:
- : "cc");
- return ccode;
+ : CC_CLOBBER);
+ return CC_TRANSFORM(ccode);
}
static inline int _stcrw(struct crw *crw)
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index b711bb1..07e8281 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -17,6 +17,7 @@
#include <linux/atomic.h>
#include <asm/debug.h>
#include <asm/qdio.h>
+#include <asm/asm.h>
#include <asm/ipl.h>
#include "cio.h"
@@ -42,13 +43,12 @@ static inline int do_siga_sync(unsigned long schid,
" lgr 2,%[out]\n"
" lgr 3,%[in]\n"
" siga 0\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=&d" (cc)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
: [fc] "d" (fc), [schid] "d" (schid),
[out] "d" (out_mask), [in] "d" (in_mask)
- : "cc", "0", "1", "2", "3");
- return cc;
+ : CC_CLOBBER_LIST("0", "1", "2", "3"));
+ return CC_TRANSFORM(cc);
}
static inline int do_siga_input(unsigned long schid, unsigned long mask,
@@ -61,12 +61,11 @@ static inline int do_siga_input(unsigned long schid, unsigned long mask,
" lgr 1,%[schid]\n"
" lgr 2,%[mask]\n"
" siga 0\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=&d" (cc)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc)
: [fc] "d" (fc), [schid] "d" (schid), [mask] "d" (mask)
- : "cc", "0", "1", "2");
- return cc;
+ : CC_CLOBBER_LIST("0", "1", "2"));
+ return CC_TRANSFORM(cc);
}
/**
@@ -93,13 +92,12 @@ static inline int do_siga_output(unsigned long schid, unsigned long mask,
" lgr 3,%[aob]\n"
" siga 0\n"
" lgr %[fc],0\n"
- " ipm %[cc]\n"
- " srl %[cc],28\n"
- : [cc] "=&d" (cc), [fc] "+&d" (fc)
+ CC_IPM(cc)
+ : CC_OUT(cc, cc), [fc] "+&d" (fc)
: [schid] "d" (schid), [mask] "d" (mask), [aob] "d" (aob)
- : "cc", "0", "1", "2", "3");
+ : CC_CLOBBER_LIST("0", "1", "2", "3"));
*bb = fc >> 31;
- return cc;
+ return CC_TRANSFORM(cc);
}
/**
diff --git a/drivers/s390/cio/scm.c b/drivers/s390/cio/scm.c
index c7894d6..a0825e3 100644
--- a/drivers/s390/cio/scm.c
+++ b/drivers/s390/cio/scm.c
@@ -91,7 +91,7 @@ static ssize_t show_##name(struct device *dev, \
int ret; \
\
device_lock(dev); \
- ret = sprintf(buf, "%u\n", scmdev->attrs.name); \
+ ret = sysfs_emit(buf, "%u\n", scmdev->attrs.name); \
device_unlock(dev); \
\
return ret; \
diff --git a/drivers/s390/crypto/Makefile b/drivers/s390/crypto/Makefile
index c88b6e0..e83c6603 100644
--- a/drivers/s390/crypto/Makefile
+++ b/drivers/s390/crypto/Makefile
@@ -29,6 +29,10 @@
pkey-pckmo-objs := pkey_pckmo.o
obj-$(CONFIG_PKEY_PCKMO) += pkey-pckmo.o
+# pkey uv handler module
+pkey-uv-objs := pkey_uv.o
+obj-$(CONFIG_PKEY_UV) += pkey-uv.o
+
# adjunct processor matrix
vfio_ap-objs := vfio_ap_drv.o vfio_ap_ops.o
obj-$(CONFIG_VFIO_AP) += vfio_ap.o
diff --git a/drivers/s390/crypto/pkey_base.c b/drivers/s390/crypto/pkey_base.c
index fea2433..64a3765 100644
--- a/drivers/s390/crypto/pkey_base.c
+++ b/drivers/s390/crypto/pkey_base.c
@@ -304,7 +304,19 @@ void pkey_handler_request_modules(void)
{
#ifdef CONFIG_MODULES
static const char * const pkey_handler_modules[] = {
- "pkey_cca", "pkey_ep11", "pkey_pckmo" };
+#if IS_MODULE(CONFIG_PKEY_CCA)
+ "pkey_cca",
+#endif
+#if IS_MODULE(CONFIG_PKEY_EP11)
+ "pkey_ep11",
+#endif
+#if IS_MODULE(CONFIG_PKEY_PCKMO)
+ "pkey_pckmo",
+#endif
+#if IS_MODULE(CONFIG_PKEY_UV)
+ "pkey_uv",
+#endif
+ };
int i;
for (i = 0; i < ARRAY_SIZE(pkey_handler_modules); i++) {
diff --git a/drivers/s390/crypto/pkey_base.h b/drivers/s390/crypto/pkey_base.h
index 7a1a5ce..7347647 100644
--- a/drivers/s390/crypto/pkey_base.h
+++ b/drivers/s390/crypto/pkey_base.h
@@ -97,6 +97,42 @@ static inline u32 pkey_aes_bitsize_to_keytype(u32 keybitsize)
}
/*
+ * helper function which translates the PKEY_KEYTYPE_*
+ * to the protected key size minus the WK VP length
+ */
+static inline u32 pkey_keytype_to_size(u32 keytype)
+{
+ switch (keytype) {
+ case PKEY_KEYTYPE_AES_128:
+ return 16;
+ case PKEY_KEYTYPE_AES_192:
+ return 24;
+ case PKEY_KEYTYPE_AES_256:
+ return 32;
+ case PKEY_KEYTYPE_ECC_P256:
+ return 32;
+ case PKEY_KEYTYPE_ECC_P384:
+ return 48;
+ case PKEY_KEYTYPE_ECC_P521:
+ return 80;
+ case PKEY_KEYTYPE_ECC_ED25519:
+ return 32;
+ case PKEY_KEYTYPE_ECC_ED448:
+ return 54;
+ case PKEY_KEYTYPE_AES_XTS_128:
+ return 32;
+ case PKEY_KEYTYPE_AES_XTS_256:
+ return 64;
+ case PKEY_KEYTYPE_HMAC_512:
+ return 64;
+ case PKEY_KEYTYPE_HMAC_1024:
+ return 128;
+ default:
+ return 0;
+ }
+}
+
+/*
* pkey_api.c:
*/
int __init pkey_api_init(void);
diff --git a/drivers/s390/crypto/pkey_cca.c b/drivers/s390/crypto/pkey_cca.c
index 9370513..cda22db 100644
--- a/drivers/s390/crypto/pkey_cca.c
+++ b/drivers/s390/crypto/pkey_cca.c
@@ -12,7 +12,6 @@
#include <linux/module.h>
#include <linux/cpufeature.h>
-#include "zcrypt_api.h"
#include "zcrypt_ccamisc.h"
#include "pkey_base.h"
@@ -225,14 +224,14 @@ static int cca_key2protkey(const struct pkey_apqn *apqns, size_t nr_apqns,
if (hdr->type == TOKTYPE_CCA_INTERNAL &&
hdr->version == TOKVER_CCA_AES) {
/* CCA AES data key */
- if (keylen != sizeof(struct secaeskeytoken))
+ if (keylen < sizeof(struct secaeskeytoken))
return -EINVAL;
if (cca_check_secaeskeytoken(pkey_dbf_info, 3, key, 0))
return -EINVAL;
} else if (hdr->type == TOKTYPE_CCA_INTERNAL &&
hdr->version == TOKVER_CCA_VLSC) {
/* CCA AES cipher key */
- if (keylen < hdr->len || keylen > MAXCCAVLSCTOKENSIZE)
+ if (keylen < hdr->len)
return -EINVAL;
if (cca_check_secaescipherkey(pkey_dbf_info,
3, key, 0, 1))
diff --git a/drivers/s390/crypto/pkey_ep11.c b/drivers/s390/crypto/pkey_ep11.c
index f42d397..5b033ca 100644
--- a/drivers/s390/crypto/pkey_ep11.c
+++ b/drivers/s390/crypto/pkey_ep11.c
@@ -12,7 +12,6 @@
#include <linux/module.h>
#include <linux/cpufeature.h>
-#include "zcrypt_api.h"
#include "zcrypt_ccamisc.h"
#include "zcrypt_ep11misc.h"
#include "pkey_base.h"
diff --git a/drivers/s390/crypto/pkey_pckmo.c b/drivers/s390/crypto/pkey_pckmo.c
index beeca88..835d59f 100644
--- a/drivers/s390/crypto/pkey_pckmo.c
+++ b/drivers/s390/crypto/pkey_pckmo.c
@@ -15,7 +15,6 @@
#include <crypto/aes.h>
#include <linux/random.h>
-#include "zcrypt_api.h"
#include "zcrypt_ccamisc.h"
#include "pkey_base.h"
@@ -38,23 +37,9 @@ static bool is_pckmo_key(const u8 *key, u32 keylen)
case TOKTYPE_NON_CCA:
switch (hdr->version) {
case TOKVER_CLEAR_KEY:
- switch (t->keytype) {
- case PKEY_KEYTYPE_AES_128:
- case PKEY_KEYTYPE_AES_192:
- case PKEY_KEYTYPE_AES_256:
- case PKEY_KEYTYPE_ECC_P256:
- case PKEY_KEYTYPE_ECC_P384:
- case PKEY_KEYTYPE_ECC_P521:
- case PKEY_KEYTYPE_ECC_ED25519:
- case PKEY_KEYTYPE_ECC_ED448:
- case PKEY_KEYTYPE_AES_XTS_128:
- case PKEY_KEYTYPE_AES_XTS_256:
- case PKEY_KEYTYPE_HMAC_512:
- case PKEY_KEYTYPE_HMAC_1024:
+ if (pkey_keytype_to_size(t->keytype))
return true;
- default:
- return false;
- }
+ return false;
case TOKVER_PROTECTED_KEY:
return true;
default:
@@ -86,80 +71,49 @@ static int pckmo_clr2protkey(u32 keytype, const u8 *clrkey, u32 clrkeylen,
int keysize, rc = -EINVAL;
u8 paramblock[160];
- u32 pkeytype;
- long fc;
+ u32 pkeytype = 0;
+ unsigned int fc;
switch (keytype) {
case PKEY_KEYTYPE_AES_128:
- /* 16 byte key, 32 byte aes wkvp, total 48 bytes */
- keysize = 16;
- pkeytype = keytype;
fc = CPACF_PCKMO_ENC_AES_128_KEY;
break;
case PKEY_KEYTYPE_AES_192:
- /* 24 byte key, 32 byte aes wkvp, total 56 bytes */
- keysize = 24;
- pkeytype = keytype;
fc = CPACF_PCKMO_ENC_AES_192_KEY;
break;
case PKEY_KEYTYPE_AES_256:
- /* 32 byte key, 32 byte aes wkvp, total 64 bytes */
- keysize = 32;
- pkeytype = keytype;
fc = CPACF_PCKMO_ENC_AES_256_KEY;
break;
case PKEY_KEYTYPE_ECC_P256:
- /* 32 byte key, 32 byte aes wkvp, total 64 bytes */
- keysize = 32;
pkeytype = PKEY_KEYTYPE_ECC;
fc = CPACF_PCKMO_ENC_ECC_P256_KEY;
break;
case PKEY_KEYTYPE_ECC_P384:
- /* 48 byte key, 32 byte aes wkvp, total 80 bytes */
- keysize = 48;
pkeytype = PKEY_KEYTYPE_ECC;
fc = CPACF_PCKMO_ENC_ECC_P384_KEY;
break;
case PKEY_KEYTYPE_ECC_P521:
- /* 80 byte key, 32 byte aes wkvp, total 112 bytes */
- keysize = 80;
pkeytype = PKEY_KEYTYPE_ECC;
fc = CPACF_PCKMO_ENC_ECC_P521_KEY;
break;
case PKEY_KEYTYPE_ECC_ED25519:
- /* 32 byte key, 32 byte aes wkvp, total 64 bytes */
- keysize = 32;
pkeytype = PKEY_KEYTYPE_ECC;
fc = CPACF_PCKMO_ENC_ECC_ED25519_KEY;
break;
case PKEY_KEYTYPE_ECC_ED448:
- /* 64 byte key, 32 byte aes wkvp, total 96 bytes */
- keysize = 64;
pkeytype = PKEY_KEYTYPE_ECC;
fc = CPACF_PCKMO_ENC_ECC_ED448_KEY;
break;
case PKEY_KEYTYPE_AES_XTS_128:
- /* 2x16 byte keys, 32 byte aes wkvp, total 64 bytes */
- keysize = 32;
- pkeytype = PKEY_KEYTYPE_AES_XTS_128;
fc = CPACF_PCKMO_ENC_AES_XTS_128_DOUBLE_KEY;
break;
case PKEY_KEYTYPE_AES_XTS_256:
- /* 2x32 byte keys, 32 byte aes wkvp, total 96 bytes */
- keysize = 64;
- pkeytype = PKEY_KEYTYPE_AES_XTS_256;
fc = CPACF_PCKMO_ENC_AES_XTS_256_DOUBLE_KEY;
break;
case PKEY_KEYTYPE_HMAC_512:
- /* 64 byte key, 32 byte aes wkvp, total 96 bytes */
- keysize = 64;
- pkeytype = PKEY_KEYTYPE_HMAC_512;
fc = CPACF_PCKMO_ENC_HMAC_512_KEY;
break;
case PKEY_KEYTYPE_HMAC_1024:
- /* 128 byte key, 32 byte aes wkvp, total 160 bytes */
- keysize = 128;
- pkeytype = PKEY_KEYTYPE_HMAC_1024;
fc = CPACF_PCKMO_ENC_HMAC_1024_KEY;
break;
default:
@@ -168,6 +122,9 @@ static int pckmo_clr2protkey(u32 keytype, const u8 *clrkey, u32 clrkeylen,
goto out;
}
+ keysize = pkey_keytype_to_size(keytype);
+ pkeytype = pkeytype ?: keytype;
+
if (clrkeylen && clrkeylen < keysize) {
PKEY_DBF_ERR("%s clear key size too small: %u < %d\n",
__func__, clrkeylen, keysize);
@@ -190,7 +147,8 @@ static int pckmo_clr2protkey(u32 keytype, const u8 *clrkey, u32 clrkeylen,
}
/* check for the pckmo subfunction we need now */
if (!cpacf_test_func(&pckmo_functions, fc)) {
- PKEY_DBF_ERR("%s pckmo functions not available\n", __func__);
+ PKEY_DBF_ERR("%s pckmo fc 0x%02x not available\n",
+ __func__, fc);
rc = -ENODEV;
goto out;
}
@@ -216,60 +174,42 @@ static int pckmo_clr2protkey(u32 keytype, const u8 *clrkey, u32 clrkeylen,
/*
* Verify a raw protected key blob.
- * Currently only AES protected keys are supported.
*/
static int pckmo_verify_protkey(const u8 *protkey, u32 protkeylen,
u32 protkeytype)
{
- struct {
- u8 iv[AES_BLOCK_SIZE];
- u8 key[MAXPROTKEYSIZE];
- } param;
- u8 null_msg[AES_BLOCK_SIZE];
- u8 dest_buf[AES_BLOCK_SIZE];
- unsigned int k, pkeylen;
- unsigned long fc;
- int rc = -EINVAL;
+ u8 clrkey[16] = { 0 }, tmpkeybuf[16 + AES_WK_VP_SIZE];
+ u32 tmpkeybuflen, tmpkeytype;
+ int keysize, rc = -EINVAL;
+ u8 *wkvp;
- switch (protkeytype) {
- case PKEY_KEYTYPE_AES_128:
- pkeylen = 16 + AES_WK_VP_SIZE;
- fc = CPACF_KMC_PAES_128;
- break;
- case PKEY_KEYTYPE_AES_192:
- pkeylen = 24 + AES_WK_VP_SIZE;
- fc = CPACF_KMC_PAES_192;
- break;
- case PKEY_KEYTYPE_AES_256:
- pkeylen = 32 + AES_WK_VP_SIZE;
- fc = CPACF_KMC_PAES_256;
- break;
- default:
+ /* check protkey type and size */
+ keysize = pkey_keytype_to_size(protkeytype);
+ if (!keysize) {
PKEY_DBF_ERR("%s unknown/unsupported keytype %u\n", __func__,
protkeytype);
goto out;
}
- if (protkeylen != pkeylen) {
- PKEY_DBF_ERR("%s invalid protected key size %u for keytype %u\n",
- __func__, protkeylen, protkeytype);
+ if (protkeylen < keysize + AES_WK_VP_SIZE)
goto out;
- }
- memset(null_msg, 0, sizeof(null_msg));
+ /* generate a dummy AES 128 protected key */
+ tmpkeybuflen = sizeof(tmpkeybuf);
+ rc = pckmo_clr2protkey(PKEY_KEYTYPE_AES_128,
+ clrkey, sizeof(clrkey),
+ tmpkeybuf, &tmpkeybuflen, &tmpkeytype);
+ if (rc)
+ goto out;
+ memzero_explicit(tmpkeybuf, 16);
+ wkvp = tmpkeybuf + 16;
- memset(param.iv, 0, sizeof(param.iv));
- memcpy(param.key, protkey, protkeylen);
-
- k = cpacf_kmc(fc | CPACF_ENCRYPT, ¶m, null_msg, dest_buf,
- sizeof(null_msg));
- if (k != sizeof(null_msg)) {
- PKEY_DBF_ERR("%s protected key is not valid\n", __func__);
+ /* compare WK VP from the temp key with that of the given prot key */
+ if (memcmp(wkvp, protkey + keysize, AES_WK_VP_SIZE)) {
+ PKEY_DBF_ERR("%s protected key WK VP mismatch\n", __func__);
rc = -EKEYREJECTED;
goto out;
}
- rc = 0;
-
out:
pr_debug("rc=%d\n", rc);
return rc;
@@ -289,37 +229,33 @@ static int pckmo_key2protkey(const u8 *key, u32 keylen,
switch (hdr->version) {
case TOKVER_PROTECTED_KEY: {
struct protkeytoken *t = (struct protkeytoken *)key;
+ u32 keysize;
if (keylen < sizeof(*t))
goto out;
+ keysize = pkey_keytype_to_size(t->keytype);
+ if (!keysize) {
+ PKEY_DBF_ERR("%s protected key token: unknown keytype %u\n",
+ __func__, t->keytype);
+ goto out;
+ }
switch (t->keytype) {
case PKEY_KEYTYPE_AES_128:
case PKEY_KEYTYPE_AES_192:
case PKEY_KEYTYPE_AES_256:
- if (keylen != sizeof(struct protaeskeytoken))
+ if (t->len != keysize + AES_WK_VP_SIZE ||
+ keylen < sizeof(struct protaeskeytoken))
goto out;
rc = pckmo_verify_protkey(t->protkey, t->len,
t->keytype);
if (rc)
goto out;
break;
- case PKEY_KEYTYPE_AES_XTS_128:
- if (t->len != 64 || keylen != sizeof(*t) + t->len)
- goto out;
- break;
- case PKEY_KEYTYPE_AES_XTS_256:
- case PKEY_KEYTYPE_HMAC_512:
- if (t->len != 96 || keylen != sizeof(*t) + t->len)
- goto out;
- break;
- case PKEY_KEYTYPE_HMAC_1024:
- if (t->len != 160 || keylen != sizeof(*t) + t->len)
- goto out;
- break;
default:
- PKEY_DBF_ERR("%s protected key token: unknown keytype %u\n",
- __func__, t->keytype);
- goto out;
+ if (t->len != keysize + AES_WK_VP_SIZE ||
+ keylen < sizeof(*t) + keysize + AES_WK_VP_SIZE)
+ goto out;
+ break;
}
memcpy(protkey, t->protkey, t->len);
*protkeylen = t->len;
@@ -329,47 +265,12 @@ static int pckmo_key2protkey(const u8 *key, u32 keylen,
}
case TOKVER_CLEAR_KEY: {
struct clearkeytoken *t = (struct clearkeytoken *)key;
- u32 keysize = 0;
+ u32 keysize;
- if (keylen < sizeof(struct clearkeytoken) ||
- keylen != sizeof(*t) + t->len)
+ if (keylen < sizeof(*t) ||
+ keylen < sizeof(*t) + t->len)
goto out;
- switch (t->keytype) {
- case PKEY_KEYTYPE_AES_128:
- case PKEY_KEYTYPE_AES_192:
- case PKEY_KEYTYPE_AES_256:
- keysize = pkey_keytype_aes_to_size(t->keytype);
- break;
- case PKEY_KEYTYPE_ECC_P256:
- keysize = 32;
- break;
- case PKEY_KEYTYPE_ECC_P384:
- keysize = 48;
- break;
- case PKEY_KEYTYPE_ECC_P521:
- keysize = 80;
- break;
- case PKEY_KEYTYPE_ECC_ED25519:
- keysize = 32;
- break;
- case PKEY_KEYTYPE_ECC_ED448:
- keysize = 64;
- break;
- case PKEY_KEYTYPE_AES_XTS_128:
- keysize = 32;
- break;
- case PKEY_KEYTYPE_AES_XTS_256:
- keysize = 64;
- break;
- case PKEY_KEYTYPE_HMAC_512:
- keysize = 64;
- break;
- case PKEY_KEYTYPE_HMAC_1024:
- keysize = 128;
- break;
- default:
- break;
- }
+ keysize = pkey_keytype_to_size(t->keytype);
if (!keysize) {
PKEY_DBF_ERR("%s clear key token: unknown keytype %u\n",
__func__, t->keytype);
@@ -397,8 +298,6 @@ static int pckmo_key2protkey(const u8 *key, u32 keylen,
/*
* Generate a random protected key.
- * Currently only the generation of AES protected keys
- * is supported.
*/
static int pckmo_gen_protkey(u32 keytype, u32 subtype,
u8 *protkey, u32 *protkeylen, u32 *protkeytype)
@@ -407,23 +306,8 @@ static int pckmo_gen_protkey(u32 keytype, u32 subtype,
int keysize;
int rc;
- switch (keytype) {
- case PKEY_KEYTYPE_AES_128:
- case PKEY_KEYTYPE_AES_192:
- case PKEY_KEYTYPE_AES_256:
- keysize = pkey_keytype_aes_to_size(keytype);
- break;
- case PKEY_KEYTYPE_AES_XTS_128:
- keysize = 32;
- break;
- case PKEY_KEYTYPE_AES_XTS_256:
- case PKEY_KEYTYPE_HMAC_512:
- keysize = 64;
- break;
- case PKEY_KEYTYPE_HMAC_1024:
- keysize = 128;
- break;
- default:
+ keysize = pkey_keytype_to_size(keytype);
+ if (!keysize) {
PKEY_DBF_ERR("%s unknown/unsupported keytype %d\n",
__func__, keytype);
return -EINVAL;
@@ -434,6 +318,21 @@ static int pckmo_gen_protkey(u32 keytype, u32 subtype,
return -EINVAL;
}
+ switch (keytype) {
+ case PKEY_KEYTYPE_AES_128:
+ case PKEY_KEYTYPE_AES_192:
+ case PKEY_KEYTYPE_AES_256:
+ case PKEY_KEYTYPE_AES_XTS_128:
+ case PKEY_KEYTYPE_AES_XTS_256:
+ case PKEY_KEYTYPE_HMAC_512:
+ case PKEY_KEYTYPE_HMAC_1024:
+ break;
+ default:
+ PKEY_DBF_ERR("%s unsupported keytype %d\n",
+ __func__, keytype);
+ return -EINVAL;
+ }
+
/* generate a dummy random clear key */
get_random_bytes(clrkey, keysize);
@@ -453,7 +352,6 @@ static int pckmo_gen_protkey(u32 keytype, u32 subtype,
/*
* Verify a protected key token blob.
- * Currently only AES protected keys are supported.
*/
static int pckmo_verify_key(const u8 *key, u32 keylen)
{
@@ -467,11 +365,26 @@ static int pckmo_verify_key(const u8 *key, u32 keylen)
switch (hdr->version) {
case TOKVER_PROTECTED_KEY: {
- struct protaeskeytoken *t;
+ struct protkeytoken *t = (struct protkeytoken *)key;
+ u32 keysize;
- if (keylen != sizeof(struct protaeskeytoken))
+ if (keylen < sizeof(*t))
goto out;
- t = (struct protaeskeytoken *)key;
+ keysize = pkey_keytype_to_size(t->keytype);
+ if (!keysize || t->len != keysize + AES_WK_VP_SIZE)
+ goto out;
+ switch (t->keytype) {
+ case PKEY_KEYTYPE_AES_128:
+ case PKEY_KEYTYPE_AES_192:
+ case PKEY_KEYTYPE_AES_256:
+ if (keylen < sizeof(struct protaeskeytoken))
+ goto out;
+ break;
+ default:
+ if (keylen < sizeof(*t) + keysize + AES_WK_VP_SIZE)
+ goto out;
+ break;
+ }
rc = pckmo_verify_protkey(t->protkey, t->len, t->keytype);
break;
}
diff --git a/drivers/s390/crypto/pkey_sysfs.c b/drivers/s390/crypto/pkey_sysfs.c
index cc0fc1e..a4eb458 100644
--- a/drivers/s390/crypto/pkey_sysfs.c
+++ b/drivers/s390/crypto/pkey_sysfs.c
@@ -10,7 +10,6 @@
#include <linux/sysfs.h>
-#include "zcrypt_api.h"
#include "zcrypt_ccamisc.h"
#include "zcrypt_ep11misc.h"
diff --git a/drivers/s390/crypto/pkey_uv.c b/drivers/s390/crypto/pkey_uv.c
new file mode 100644
index 0000000..805817b
--- /dev/null
+++ b/drivers/s390/crypto/pkey_uv.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * pkey uv specific code
+ *
+ * Copyright IBM Corp. 2024
+ */
+
+#define KMSG_COMPONENT "pkey"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/cpufeature.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/uv.h>
+
+#include "zcrypt_ccamisc.h"
+#include "pkey_base.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("IBM Corporation");
+MODULE_DESCRIPTION("s390 protected key UV handler");
+
+/*
+ * UV secret token struct and defines.
+ */
+
+#define TOKVER_UV_SECRET 0x09
+
+struct uvsecrettoken {
+ u8 type; /* 0x00 = TOKTYPE_NON_CCA */
+ u8 res0[3];
+ u8 version; /* 0x09 = TOKVER_UV_SECRET */
+ u8 res1[3];
+ u16 secret_type; /* one of enum uv_secret_types from uv.h */
+ u16 secret_len; /* length in bytes of the secret */
+ u8 secret_id[UV_SECRET_ID_LEN]; /* the secret id for this secret */
+} __packed;
+
+/*
+ * Check key blob for known and supported UV key.
+ */
+static bool is_uv_key(const u8 *key, u32 keylen)
+{
+ struct uvsecrettoken *t = (struct uvsecrettoken *)key;
+
+ if (keylen < sizeof(*t))
+ return false;
+
+ switch (t->type) {
+ case TOKTYPE_NON_CCA:
+ switch (t->version) {
+ case TOKVER_UV_SECRET:
+ switch (t->secret_type) {
+ case UV_SECRET_AES_128:
+ case UV_SECRET_AES_192:
+ case UV_SECRET_AES_256:
+ case UV_SECRET_AES_XTS_128:
+ case UV_SECRET_AES_XTS_256:
+ case UV_SECRET_HMAC_SHA_256:
+ case UV_SECRET_HMAC_SHA_512:
+ case UV_SECRET_ECDSA_P256:
+ case UV_SECRET_ECDSA_P384:
+ case UV_SECRET_ECDSA_P521:
+ case UV_SECRET_ECDSA_ED25519:
+ case UV_SECRET_ECDSA_ED448:
+ return true;
+ default:
+ return false;
+ }
+ default:
+ return false;
+ }
+ default:
+ return false;
+ }
+}
+
+static bool is_uv_keytype(enum pkey_key_type keytype)
+{
+ switch (keytype) {
+ case PKEY_TYPE_UVSECRET:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int retrieve_secret(const u8 secret_id[UV_SECRET_ID_LEN],
+ u16 *secret_type, u8 *buf, u32 *buflen)
+{
+ struct uv_secret_list_item_hdr secret_meta_data;
+ int rc;
+
+ rc = uv_get_secret_metadata(secret_id, &secret_meta_data);
+ if (rc)
+ return rc;
+
+ if (*buflen < secret_meta_data.length)
+ return -EINVAL;
+
+ rc = uv_retrieve_secret(secret_meta_data.index,
+ buf, secret_meta_data.length);
+ if (rc)
+ return rc;
+
+ *secret_type = secret_meta_data.type;
+ *buflen = secret_meta_data.length;
+
+ return 0;
+}
+
+static int uv_get_size_and_type(u16 secret_type, u32 *pkeysize, u32 *pkeytype)
+{
+ int rc = 0;
+
+ switch (secret_type) {
+ case UV_SECRET_AES_128:
+ *pkeysize = 16 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_AES_128;
+ break;
+ case UV_SECRET_AES_192:
+ *pkeysize = 24 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_AES_192;
+ break;
+ case UV_SECRET_AES_256:
+ *pkeysize = 32 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_AES_256;
+ break;
+ case UV_SECRET_AES_XTS_128:
+ *pkeysize = 16 + 16 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_AES_XTS_128;
+ break;
+ case UV_SECRET_AES_XTS_256:
+ *pkeysize = 32 + 32 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_AES_XTS_256;
+ break;
+ case UV_SECRET_HMAC_SHA_256:
+ *pkeysize = 64 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_HMAC_512;
+ break;
+ case UV_SECRET_HMAC_SHA_512:
+ *pkeysize = 128 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_HMAC_1024;
+ break;
+ case UV_SECRET_ECDSA_P256:
+ *pkeysize = 32 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_ECC_P256;
+ break;
+ case UV_SECRET_ECDSA_P384:
+ *pkeysize = 48 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_ECC_P384;
+ break;
+ case UV_SECRET_ECDSA_P521:
+ *pkeysize = 80 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_ECC_P521;
+ break;
+ case UV_SECRET_ECDSA_ED25519:
+ *pkeysize = 32 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_ECC_ED25519;
+ break;
+ case UV_SECRET_ECDSA_ED448:
+ *pkeysize = 64 + AES_WK_VP_SIZE;
+ *pkeytype = PKEY_KEYTYPE_ECC_ED448;
+ break;
+ default:
+ rc = -EINVAL;
+ }
+
+ return rc;
+}
+
+static int uv_key2protkey(const struct pkey_apqn *_apqns __always_unused,
+ size_t _nr_apqns __always_unused,
+ const u8 *key, u32 keylen,
+ u8 *protkey, u32 *protkeylen, u32 *keyinfo)
+{
+ struct uvsecrettoken *t = (struct uvsecrettoken *)key;
+ u32 pkeysize, pkeytype;
+ u16 secret_type;
+ int rc;
+
+ rc = uv_get_size_and_type(t->secret_type, &pkeysize, &pkeytype);
+ if (rc)
+ goto out;
+
+ if (*protkeylen < pkeysize) {
+ PKEY_DBF_ERR("%s prot key buffer size too small: %u < %u\n",
+ __func__, *protkeylen, pkeysize);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ rc = retrieve_secret(t->secret_id, &secret_type, protkey, protkeylen);
+ if (rc) {
+ PKEY_DBF_ERR("%s retrieve_secret() failed with %d\n",
+ __func__, rc);
+ goto out;
+ }
+ if (secret_type != t->secret_type) {
+ PKEY_DBF_ERR("%s retrieved secret type %u != expected type %u\n",
+ __func__, secret_type, t->secret_type);
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (keyinfo)
+ *keyinfo = pkeytype;
+
+out:
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+static int uv_verifykey(const u8 *key, u32 keylen,
+ u16 *_card __always_unused,
+ u16 *_dom __always_unused,
+ u32 *keytype, u32 *keybitsize, u32 *flags)
+{
+ struct uvsecrettoken *t = (struct uvsecrettoken *)key;
+ struct uv_secret_list_item_hdr secret_meta_data;
+ u32 pkeysize, pkeytype, bitsize;
+ int rc;
+
+ rc = uv_get_size_and_type(t->secret_type, &pkeysize, &pkeytype);
+ if (rc)
+ goto out;
+
+ rc = uv_get_secret_metadata(t->secret_id, &secret_meta_data);
+ if (rc)
+ goto out;
+
+ if (secret_meta_data.type != t->secret_type) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* set keytype; keybitsize and flags are not supported */
+ if (keytype)
+ *keytype = PKEY_TYPE_UVSECRET;
+ if (keybitsize) {
+ bitsize = 8 * pkey_keytype_to_size(pkeytype);
+ *keybitsize = bitsize ?: PKEY_SIZE_UNKNOWN;
+ }
+ if (flags)
+ *flags = pkeytype;
+
+out:
+ pr_debug("rc=%d\n", rc);
+ return rc;
+}
+
+static struct pkey_handler uv_handler = {
+ .module = THIS_MODULE,
+ .name = "PKEY UV handler",
+ .is_supported_key = is_uv_key,
+ .is_supported_keytype = is_uv_keytype,
+ .key_to_protkey = uv_key2protkey,
+ .verify_key = uv_verifykey,
+};
+
+/*
+ * Module init
+ */
+static int __init pkey_uv_init(void)
+{
+ if (!is_prot_virt_guest())
+ return -ENODEV;
+
+ if (!test_bit_inv(BIT_UVC_CMD_RETR_SECRET, uv_info.inst_calls_list))
+ return -ENODEV;
+
+ return pkey_handler_register(&uv_handler);
+}
+
+/*
+ * Module exit
+ */
+static void __exit pkey_uv_exit(void)
+{
+ pkey_handler_unregister(&uv_handler);
+}
+
+module_cpu_feature_match(S390_CPU_FEATURE_UV, pkey_uv_init);
+module_exit(pkey_uv_exit);
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 9f76f2d..8c0b40d 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -1521,18 +1521,13 @@ static ssize_t control_domains_show(struct device *dev,
char *buf)
{
unsigned long id;
- int nchars = 0;
- int n;
- char *bufpos = buf;
struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(dev);
unsigned long max_domid = matrix_mdev->matrix.adm_max;
+ int nchars = 0;
mutex_lock(&matrix_dev->mdevs_lock);
- for_each_set_bit_inv(id, matrix_mdev->matrix.adm, max_domid + 1) {
- n = sprintf(bufpos, "%04lx\n", id);
- bufpos += n;
- nchars += n;
- }
+ for_each_set_bit_inv(id, matrix_mdev->matrix.adm, max_domid + 1)
+ nchars += sysfs_emit_at(buf, nchars, "%04lx\n", id);
mutex_unlock(&matrix_dev->mdevs_lock);
return nchars;
@@ -1541,7 +1536,6 @@ static DEVICE_ATTR_RO(control_domains);
static ssize_t vfio_ap_mdev_matrix_show(struct ap_matrix *matrix, char *buf)
{
- char *bufpos = buf;
unsigned long apid;
unsigned long apqi;
unsigned long apid1;
@@ -1549,33 +1543,21 @@ static ssize_t vfio_ap_mdev_matrix_show(struct ap_matrix *matrix, char *buf)
unsigned long napm_bits = matrix->apm_max + 1;
unsigned long naqm_bits = matrix->aqm_max + 1;
int nchars = 0;
- int n;
apid1 = find_first_bit_inv(matrix->apm, napm_bits);
apqi1 = find_first_bit_inv(matrix->aqm, naqm_bits);
if ((apid1 < napm_bits) && (apqi1 < naqm_bits)) {
for_each_set_bit_inv(apid, matrix->apm, napm_bits) {
- for_each_set_bit_inv(apqi, matrix->aqm,
- naqm_bits) {
- n = sprintf(bufpos, "%02lx.%04lx\n", apid,
- apqi);
- bufpos += n;
- nchars += n;
- }
+ for_each_set_bit_inv(apqi, matrix->aqm, naqm_bits)
+ nchars += sysfs_emit_at(buf, nchars, "%02lx.%04lx\n", apid, apqi);
}
} else if (apid1 < napm_bits) {
- for_each_set_bit_inv(apid, matrix->apm, napm_bits) {
- n = sprintf(bufpos, "%02lx.\n", apid);
- bufpos += n;
- nchars += n;
- }
+ for_each_set_bit_inv(apid, matrix->apm, napm_bits)
+ nchars += sysfs_emit_at(buf, nchars, "%02lx.\n", apid);
} else if (apqi1 < naqm_bits) {
- for_each_set_bit_inv(apqi, matrix->aqm, naqm_bits) {
- n = sprintf(bufpos, ".%04lx\n", apqi);
- bufpos += n;
- nchars += n;
- }
+ for_each_set_bit_inv(apqi, matrix->aqm, naqm_bits)
+ nchars += sysfs_emit_at(buf, nchars, ".%04lx\n", apqi);
}
return nchars;
@@ -2263,14 +2245,11 @@ static ssize_t status_show(struct device *dev,
if (matrix_mdev->kvm &&
test_bit_inv(apid, matrix_mdev->shadow_apcb.apm) &&
test_bit_inv(apqi, matrix_mdev->shadow_apcb.aqm))
- nchars = scnprintf(buf, PAGE_SIZE, "%s\n",
- AP_QUEUE_IN_USE);
+ nchars = sysfs_emit(buf, "%s\n", AP_QUEUE_IN_USE);
else
- nchars = scnprintf(buf, PAGE_SIZE, "%s\n",
- AP_QUEUE_ASSIGNED);
+ nchars = sysfs_emit(buf, "%s\n", AP_QUEUE_ASSIGNED);
} else {
- nchars = scnprintf(buf, PAGE_SIZE, "%s\n",
- AP_QUEUE_UNASSIGNED);
+ nchars = sysfs_emit(buf, "%s\n", AP_QUEUE_UNASSIGNED);
}
mutex_unlock(&matrix_dev->mdevs_lock);
diff --git a/drivers/s390/crypto/zcrypt_ccamisc.h b/drivers/s390/crypto/zcrypt_ccamisc.h
index aed7e83..26bdca7 100644
--- a/drivers/s390/crypto/zcrypt_ccamisc.h
+++ b/drivers/s390/crypto/zcrypt_ccamisc.h
@@ -12,6 +12,7 @@
#include <asm/zcrypt.h>
#include <asm/pkey.h>
+#include "zcrypt_api.h"
/* Key token types */
#define TOKTYPE_NON_CCA 0x00 /* Non-CCA key token */
diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c
index 039e18d..31c9f95 100644
--- a/drivers/s390/net/netiucv.c
+++ b/drivers/s390/net/netiucv.c
@@ -1319,7 +1319,7 @@ static ssize_t user_show(struct device *dev, struct device_attribute *attr,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%s\n", netiucv_printuser(priv->conn));
+ return sysfs_emit(buf, "%s\n", netiucv_printuser(priv->conn));
}
static int netiucv_check_user(const char *buf, size_t count, char *username,
@@ -1415,7 +1415,7 @@ static ssize_t buffer_show (struct device *dev, struct device_attribute *attr,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%d\n", priv->conn->max_buffsize);
+ return sysfs_emit(buf, "%d\n", priv->conn->max_buffsize);
}
static ssize_t buffer_write (struct device *dev, struct device_attribute *attr,
@@ -1473,7 +1473,7 @@ static ssize_t dev_fsm_show (struct device *dev, struct device_attribute *attr,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%s\n", fsm_getstate_str(priv->fsm));
+ return sysfs_emit(buf, "%s\n", fsm_getstate_str(priv->fsm));
}
static DEVICE_ATTR(device_fsm_state, 0444, dev_fsm_show, NULL);
@@ -1484,7 +1484,7 @@ static ssize_t conn_fsm_show (struct device *dev,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%s\n", fsm_getstate_str(priv->conn->fsm));
+ return sysfs_emit(buf, "%s\n", fsm_getstate_str(priv->conn->fsm));
}
static DEVICE_ATTR(connection_fsm_state, 0444, conn_fsm_show, NULL);
@@ -1495,7 +1495,7 @@ static ssize_t maxmulti_show (struct device *dev,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%ld\n", priv->conn->prof.maxmulti);
+ return sysfs_emit(buf, "%ld\n", priv->conn->prof.maxmulti);
}
static ssize_t maxmulti_write (struct device *dev,
@@ -1517,7 +1517,7 @@ static ssize_t maxcq_show (struct device *dev, struct device_attribute *attr,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%ld\n", priv->conn->prof.maxcqueue);
+ return sysfs_emit(buf, "%ld\n", priv->conn->prof.maxcqueue);
}
static ssize_t maxcq_write (struct device *dev, struct device_attribute *attr,
@@ -1538,7 +1538,7 @@ static ssize_t sdoio_show (struct device *dev, struct device_attribute *attr,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%ld\n", priv->conn->prof.doios_single);
+ return sysfs_emit(buf, "%ld\n", priv->conn->prof.doios_single);
}
static ssize_t sdoio_write (struct device *dev, struct device_attribute *attr,
@@ -1559,7 +1559,7 @@ static ssize_t mdoio_show (struct device *dev, struct device_attribute *attr,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%ld\n", priv->conn->prof.doios_multi);
+ return sysfs_emit(buf, "%ld\n", priv->conn->prof.doios_multi);
}
static ssize_t mdoio_write (struct device *dev, struct device_attribute *attr,
@@ -1580,7 +1580,7 @@ static ssize_t txlen_show (struct device *dev, struct device_attribute *attr,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%ld\n", priv->conn->prof.txlen);
+ return sysfs_emit(buf, "%ld\n", priv->conn->prof.txlen);
}
static ssize_t txlen_write (struct device *dev, struct device_attribute *attr,
@@ -1601,7 +1601,7 @@ static ssize_t txtime_show (struct device *dev, struct device_attribute *attr,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%ld\n", priv->conn->prof.tx_time);
+ return sysfs_emit(buf, "%ld\n", priv->conn->prof.tx_time);
}
static ssize_t txtime_write (struct device *dev, struct device_attribute *attr,
@@ -1622,7 +1622,7 @@ static ssize_t txpend_show (struct device *dev, struct device_attribute *attr,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%ld\n", priv->conn->prof.tx_pending);
+ return sysfs_emit(buf, "%ld\n", priv->conn->prof.tx_pending);
}
static ssize_t txpend_write (struct device *dev, struct device_attribute *attr,
@@ -1643,7 +1643,7 @@ static ssize_t txmpnd_show (struct device *dev, struct device_attribute *attr,
struct netiucv_priv *priv = dev_get_drvdata(dev);
IUCV_DBF_TEXT(trace, 5, __func__);
- return sprintf(buf, "%ld\n", priv->conn->prof.tx_max_pending);
+ return sysfs_emit(buf, "%ld\n", priv->conn->prof.tx_max_pending);
}
static ssize_t txmpnd_write (struct device *dev, struct device_attribute *attr,
diff --git a/drivers/s390/scsi/zfcp_sysfs.c b/drivers/s390/scsi/zfcp_sysfs.c
index cb67fa8..304b81b 100644
--- a/drivers/s390/scsi/zfcp_sysfs.c
+++ b/drivers/s390/scsi/zfcp_sysfs.c
@@ -24,7 +24,7 @@ static ssize_t zfcp_sysfs_##_feat##_##_name##_show(struct device *dev, \
{ \
struct _feat_def *_feat = container_of(dev, struct _feat_def, dev); \
\
- return sprintf(buf, _format, _value); \
+ return sysfs_emit(buf, _format, _value); \
} \
static ZFCP_DEV_ATTR(_feat, _name, S_IRUGO, \
zfcp_sysfs_##_feat##_##_name##_show, NULL);
@@ -34,7 +34,7 @@ static ssize_t zfcp_sysfs_##_feat##_##_name##_show(struct device *dev, \
struct device_attribute *at,\
char *buf) \
{ \
- return sprintf(buf, _format, _value); \
+ return sysfs_emit(buf, _format, _value); \
} \
static ZFCP_DEV_ATTR(_feat, _name, S_IRUGO, \
zfcp_sysfs_##_feat##_##_name##_show, NULL);
@@ -51,7 +51,7 @@ static ssize_t zfcp_sysfs_adapter_##_name##_show(struct device *dev, \
if (!adapter) \
return -ENODEV; \
\
- i = sprintf(buf, _format, _value); \
+ i = sysfs_emit(buf, _format, _value); \
zfcp_ccw_adapter_put(adapter); \
return i; \
} \
@@ -95,9 +95,9 @@ static ssize_t zfcp_sysfs_port_failed_show(struct device *dev,
struct zfcp_port *port = container_of(dev, struct zfcp_port, dev);
if (atomic_read(&port->status) & ZFCP_STATUS_COMMON_ERP_FAILED)
- return sprintf(buf, "1\n");
+ return sysfs_emit(buf, "1\n");
- return sprintf(buf, "0\n");
+ return sysfs_emit(buf, "0\n");
}
static ssize_t zfcp_sysfs_port_failed_store(struct device *dev,
@@ -135,7 +135,7 @@ static ssize_t zfcp_sysfs_unit_failed_show(struct device *dev,
scsi_device_put(sdev);
}
- return sprintf(buf, "%d\n", failed);
+ return sysfs_emit(buf, "%d\n", failed);
}
static ssize_t zfcp_sysfs_unit_failed_store(struct device *dev,
@@ -176,9 +176,9 @@ static ssize_t zfcp_sysfs_adapter_failed_show(struct device *dev,
return -ENODEV;
if (atomic_read(&adapter->status) & ZFCP_STATUS_COMMON_ERP_FAILED)
- i = sprintf(buf, "1\n");
+ i = sysfs_emit(buf, "1\n");
else
- i = sprintf(buf, "0\n");
+ i = sysfs_emit(buf, "0\n");
zfcp_ccw_adapter_put(adapter);
return i;
@@ -348,8 +348,7 @@ zfcp_sysfs_adapter_diag_max_age_show(struct device *dev,
if (!adapter)
return -ENODEV;
- /* ceil(log(2^64 - 1) / log(10)) = 20 */
- rc = scnprintf(buf, 20 + 2, "%lu\n", adapter->diagnostics->max_age);
+ rc = sysfs_emit(buf, "%lu\n", adapter->diagnostics->max_age);
zfcp_ccw_adapter_put(adapter);
return rc;
@@ -401,14 +400,14 @@ static ssize_t zfcp_sysfs_adapter_fc_security_show(
*/
status = atomic_read(&adapter->status);
if (0 == (status & ZFCP_STATUS_COMMON_OPEN))
- i = sprintf(buf, "unknown\n");
+ i = sysfs_emit(buf, "unknown\n");
else if (!(adapter->adapter_features & FSF_FEATURE_FC_SECURITY))
- i = sprintf(buf, "unsupported\n");
+ i = sysfs_emit(buf, "unsupported\n");
else {
i = zfcp_fsf_scnprint_fc_security(
buf, PAGE_SIZE - 1, adapter->fc_security_algorithms,
ZFCP_FSF_PRINT_FMT_LIST);
- i += scnprintf(buf + i, PAGE_SIZE - i, "\n");
+ i += sysfs_emit_at(buf, i, "\n");
}
zfcp_ccw_adapter_put(adapter);
@@ -490,14 +489,14 @@ static ssize_t zfcp_sysfs_port_fc_security_show(struct device *dev,
0 != (status & ZFCP_STATUS_PORT_LINK_TEST) ||
0 != (status & ZFCP_STATUS_COMMON_ERP_FAILED) ||
0 != (status & ZFCP_STATUS_COMMON_ACCESS_BOXED))
- i = sprintf(buf, "unknown\n");
+ i = sysfs_emit(buf, "unknown\n");
else if (!(adapter->adapter_features & FSF_FEATURE_FC_SECURITY))
- i = sprintf(buf, "unsupported\n");
+ i = sysfs_emit(buf, "unsupported\n");
else {
i = zfcp_fsf_scnprint_fc_security(
buf, PAGE_SIZE - 1, port->connection_info,
ZFCP_FSF_PRINT_FMT_SINGLEITEM);
- i += scnprintf(buf + i, PAGE_SIZE - i, "\n");
+ i += sysfs_emit_at(buf, i, "\n");
}
return i;
@@ -569,8 +568,8 @@ zfcp_sysfs_unit_##_name##_latency_show(struct device *dev, \
do_div(cmin, 1000); \
do_div(cmax, 1000); \
\
- return sprintf(buf, "%llu %llu %llu %llu %llu %llu %llu\n", \
- fmin, fmax, fsum, cmin, cmax, csum, cc); \
+ return sysfs_emit(buf, "%llu %llu %llu %llu %llu %llu %llu\n", \
+ fmin, fmax, fsum, cmin, cmax, csum, cc); \
} \
static ssize_t \
zfcp_sysfs_unit_##_name##_latency_store(struct device *dev, \
@@ -610,8 +609,8 @@ static ssize_t zfcp_sysfs_scsi_##_name##_show(struct device *dev, \
struct scsi_device *sdev = to_scsi_device(dev); \
struct zfcp_scsi_dev *zfcp_sdev = sdev_to_zfcp(sdev); \
\
- return sprintf(buf, _format, _value); \
-} \
+ return sysfs_emit(buf, _format, _value); \
+} \
static DEVICE_ATTR(_name, S_IRUGO, zfcp_sysfs_scsi_##_name##_show, NULL);
ZFCP_DEFINE_SCSI_ATTR(hba_id, "%s\n",
@@ -625,7 +624,7 @@ static ssize_t zfcp_sysfs_scsi_fcp_lun_show(struct device *dev,
{
struct scsi_device *sdev = to_scsi_device(dev);
- return sprintf(buf, "0x%016llx\n", zfcp_scsi_dev_lun(sdev));
+ return sysfs_emit(buf, "0x%016llx\n", zfcp_scsi_dev_lun(sdev));
}
static DEVICE_ATTR(fcp_lun, S_IRUGO, zfcp_sysfs_scsi_fcp_lun_show, NULL);
@@ -641,7 +640,7 @@ static ssize_t zfcp_sysfs_scsi_zfcp_failed_show(struct device *dev,
unsigned int status = atomic_read(&sdev_to_zfcp(sdev)->status);
unsigned int failed = status & ZFCP_STATUS_COMMON_ERP_FAILED ? 1 : 0;
- return sprintf(buf, "%d\n", failed);
+ return sysfs_emit(buf, "%d\n", failed);
}
static ssize_t zfcp_sysfs_scsi_zfcp_failed_store(struct device *dev,
@@ -714,8 +713,8 @@ static ssize_t zfcp_sysfs_adapter_util_show(struct device *dev,
retval = zfcp_fsf_exchange_port_data_sync(adapter->qdio, qtcb_port);
if (retval == 0 || retval == -EAGAIN)
- retval = sprintf(buf, "%u %u %u\n", qtcb_port->cp_util,
- qtcb_port->cb_util, qtcb_port->a_util);
+ retval = sysfs_emit(buf, "%u %u %u\n", qtcb_port->cp_util,
+ qtcb_port->cb_util, qtcb_port->a_util);
kfree(qtcb_port);
return retval;
}
@@ -758,7 +757,7 @@ static ssize_t zfcp_sysfs_adapter_##_name##_show(struct device *dev, \
if (retval) \
return retval; \
\
- return sprintf(buf, _format, ## _arg); \
+ return sysfs_emit(buf, _format, ## _arg); \
} \
static DEVICE_ATTR(_name, S_IRUGO, zfcp_sysfs_adapter_##_name##_show, NULL);
@@ -787,8 +786,8 @@ static ssize_t zfcp_sysfs_adapter_q_full_show(struct device *dev,
util = qdio->req_q_util;
spin_unlock_bh(&qdio->stat_lock);
- return sprintf(buf, "%d %llu\n", atomic_read(&qdio->req_q_full),
- (unsigned long long)util);
+ return sysfs_emit(buf, "%d %llu\n", atomic_read(&qdio->req_q_full),
+ (unsigned long long)util);
}
static DEVICE_ATTR(queue_full, S_IRUGO, zfcp_sysfs_adapter_q_full_show, NULL);
@@ -843,8 +842,7 @@ static ssize_t zfcp_sysfs_adapter_diag_b2b_credit_show(
.data.nport_serv_param -
sizeof(u32));
- rc = scnprintf(buf, 5 + 2, "%hu\n",
- be16_to_cpu(nsp->fl_csp.sp_bb_cred));
+ rc = sysfs_emit(buf, "%hu\n", be16_to_cpu(nsp->fl_csp.sp_bb_cred));
spin_unlock_irqrestore(&diag_hdr->access_lock, flags);
out:
@@ -854,7 +852,7 @@ static ssize_t zfcp_sysfs_adapter_diag_b2b_credit_show(
static ZFCP_DEV_ATTR(adapter_diag, b2b_credit, 0400,
zfcp_sysfs_adapter_diag_b2b_credit_show, NULL);
-#define ZFCP_DEFINE_DIAG_SFP_ATTR(_name, _qtcb_member, _prtsize, _prtfmt) \
+#define ZFCP_DEFINE_DIAG_SFP_ATTR(_name, _qtcb_member, _prtfmt) \
static ssize_t zfcp_sysfs_adapter_diag_sfp_##_name##_show( \
struct device *dev, struct device_attribute *attr, char *buf) \
{ \
@@ -887,8 +885,8 @@ static ZFCP_DEV_ATTR(adapter_diag, b2b_credit, 0400,
goto out; \
\
spin_lock_irqsave(&diag_hdr->access_lock, flags); \
- rc = scnprintf( \
- buf, (_prtsize) + 2, _prtfmt "\n", \
+ rc = sysfs_emit( \
+ buf, _prtfmt "\n", \
adapter->diagnostics->port_data.data._qtcb_member); \
spin_unlock_irqrestore(&diag_hdr->access_lock, flags); \
\
@@ -899,16 +897,16 @@ static ZFCP_DEV_ATTR(adapter_diag, b2b_credit, 0400,
static ZFCP_DEV_ATTR(adapter_diag_sfp, _name, 0400, \
zfcp_sysfs_adapter_diag_sfp_##_name##_show, NULL)
-ZFCP_DEFINE_DIAG_SFP_ATTR(temperature, temperature, 6, "%hd");
-ZFCP_DEFINE_DIAG_SFP_ATTR(vcc, vcc, 5, "%hu");
-ZFCP_DEFINE_DIAG_SFP_ATTR(tx_bias, tx_bias, 5, "%hu");
-ZFCP_DEFINE_DIAG_SFP_ATTR(tx_power, tx_power, 5, "%hu");
-ZFCP_DEFINE_DIAG_SFP_ATTR(rx_power, rx_power, 5, "%hu");
-ZFCP_DEFINE_DIAG_SFP_ATTR(port_tx_type, sfp_flags.port_tx_type, 2, "%hu");
-ZFCP_DEFINE_DIAG_SFP_ATTR(optical_port, sfp_flags.optical_port, 1, "%hu");
-ZFCP_DEFINE_DIAG_SFP_ATTR(sfp_invalid, sfp_flags.sfp_invalid, 1, "%hu");
-ZFCP_DEFINE_DIAG_SFP_ATTR(connector_type, sfp_flags.connector_type, 1, "%hu");
-ZFCP_DEFINE_DIAG_SFP_ATTR(fec_active, sfp_flags.fec_active, 1, "%hu");
+ZFCP_DEFINE_DIAG_SFP_ATTR(temperature, temperature, "%hd");
+ZFCP_DEFINE_DIAG_SFP_ATTR(vcc, vcc, "%hu");
+ZFCP_DEFINE_DIAG_SFP_ATTR(tx_bias, tx_bias, "%hu");
+ZFCP_DEFINE_DIAG_SFP_ATTR(tx_power, tx_power, "%hu");
+ZFCP_DEFINE_DIAG_SFP_ATTR(rx_power, rx_power, "%hu");
+ZFCP_DEFINE_DIAG_SFP_ATTR(port_tx_type, sfp_flags.port_tx_type, "%hu");
+ZFCP_DEFINE_DIAG_SFP_ATTR(optical_port, sfp_flags.optical_port, "%hu");
+ZFCP_DEFINE_DIAG_SFP_ATTR(sfp_invalid, sfp_flags.sfp_invalid, "%hu");
+ZFCP_DEFINE_DIAG_SFP_ATTR(connector_type, sfp_flags.connector_type, "%hu");
+ZFCP_DEFINE_DIAG_SFP_ATTR(fec_active, sfp_flags.fec_active, "%hu");
static struct attribute *zfcp_sysfs_diag_attrs[] = {
&dev_attr_adapter_diag_sfp_temperature.attr,
diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index 62eca94..21fa7ac 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -58,6 +58,8 @@ struct virtio_ccw_device {
struct virtio_device vdev;
__u8 config[VIRTIO_CCW_CONFIG_SIZE];
struct ccw_device *cdev;
+ /* we make cdev->dev.dma_parms point to this */
+ struct device_dma_parameters dma_parms;
__u32 curr_io;
int err;
unsigned int revision; /* Transport revision */
@@ -1303,6 +1305,7 @@ static int virtio_ccw_offline(struct ccw_device *cdev)
unregister_virtio_device(&vcdev->vdev);
spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
dev_set_drvdata(&cdev->dev, NULL);
+ cdev->dev.dma_parms = NULL;
spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
return 0;
}
@@ -1366,6 +1369,7 @@ static int virtio_ccw_online(struct ccw_device *cdev)
}
vcdev->vdev.dev.parent = &cdev->dev;
vcdev->cdev = cdev;
+ cdev->dev.dma_parms = &vcdev->dma_parms;
vcdev->dma_area = ccw_device_dma_zalloc(vcdev->cdev,
sizeof(*vcdev->dma_area),
&vcdev->dma_area_addr);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index ca4bc0a..8947dab 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1190,8 +1190,8 @@ static u8 sd_group_number(struct scsi_cmnd *cmd)
if (!sdkp->rscs)
return 0;
- return min3((u32)rq->write_hint, (u32)sdkp->permanent_stream_count,
- 0x3fu);
+ return min3((u32)rq->bio->bi_write_hint,
+ (u32)sdkp->permanent_stream_count, 0x3fu);
}
static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write,
@@ -1389,7 +1389,7 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd)
ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks,
protect | fua, dld);
} else if ((nr_blocks > 0xff) || (lba > 0x1fffff) ||
- sdp->use_10_for_rw || protect || rq->write_hint) {
+ sdp->use_10_for_rw || protect || rq->bio->bi_write_hint) {
ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks,
protect | fua);
} else {
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 6ab27f4..7a447ff 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -633,8 +633,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim,
lim->max_open_zones = sdkp->zones_max_open;
lim->max_active_zones = 0;
lim->chunk_sectors = logical_to_sectors(sdkp->device, zone_blocks);
- /* Enable block layer zone append emulation */
- lim->max_zone_append_sectors = 0;
return 0;
diff --git a/drivers/tc/tc.c b/drivers/tc/tc.c
index c5b17dd..0629f27 100644
--- a/drivers/tc/tc.c
+++ b/drivers/tc/tc.c
@@ -162,7 +162,7 @@ static int __init tc_init(void)
if (tc_bus.info.slot_size) {
unsigned int tc_clock = tc_get_speed(&tc_bus) / 100000;
- pr_info("tc: TURBOchannel rev. %d at %d.%d MHz "
+ pr_info("tc: TURBOchannel rev. %d at %u.%u MHz "
"(with%s parity)\n", tc_bus.info.revision,
tc_clock / 10, tc_clock % 10,
tc_bus.info.parity ? "" : "out");
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 41c4d56..1e1559b 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -6,6 +6,7 @@
obj-$(CONFIG_THERMAL) += thermal_sys.o
thermal_sys-y += thermal_core.o thermal_sysfs.o
thermal_sys-y += thermal_trip.o thermal_helpers.o
+thermal_sys-y += thermal_thresholds.o
# netlink interface to manage the thermal framework
thermal_sys-$(CONFIG_THERMAL_NETLINK) += thermal_netlink.o
diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
index 863e7a4..97f3d81 100644
--- a/drivers/thermal/gov_bang_bang.c
+++ b/drivers/thermal/gov_bang_bang.c
@@ -30,9 +30,7 @@ static void bang_bang_set_instance_target(struct thermal_instance *instance,
dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target);
- mutex_lock(&instance->cdev->lock);
- __thermal_cdev_update(instance->cdev);
- mutex_unlock(&instance->cdev->lock);
+ thermal_cdev_update_nocheck(instance->cdev);
}
/**
@@ -67,6 +65,7 @@ static void bang_bang_control(struct thermal_zone_device *tz,
const struct thermal_trip *trip,
bool crossed_up)
{
+ const struct thermal_trip_desc *td = trip_to_trip_desc(trip);
struct thermal_instance *instance;
lockdep_assert_held(&tz->lock);
@@ -75,10 +74,8 @@ static void bang_bang_control(struct thermal_zone_device *tz,
thermal_zone_trip_id(tz, trip), trip->temperature,
tz->temperature, trip->hysteresis);
- list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
- if (instance->trip == trip)
- bang_bang_set_instance_target(instance, crossed_up);
- }
+ list_for_each_entry(instance, &td->thermal_instances, trip_node)
+ bang_bang_set_instance_target(instance, crossed_up);
}
static void bang_bang_manage(struct thermal_zone_device *tz)
@@ -104,8 +101,8 @@ static void bang_bang_manage(struct thermal_zone_device *tz)
* to the thermal zone temperature and the trip point threshold.
*/
turn_on = tz->temperature >= td->threshold;
- list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
- if (!instance->initialized && instance->trip == trip)
+ list_for_each_entry(instance, &td->thermal_instances, trip_node) {
+ if (!instance->initialized)
bang_bang_set_instance_target(instance, turn_on);
}
}
diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c
index ce0ea57..4643be4 100644
--- a/drivers/thermal/gov_fair_share.c
+++ b/drivers/thermal/gov_fair_share.c
@@ -44,7 +44,7 @@ static int get_trip_level(struct thermal_zone_device *tz)
/**
* fair_share_throttle - throttles devices associated with the given zone
* @tz: thermal_zone_device
- * @trip: trip point
+ * @td: trip point descriptor
* @trip_level: number of trips crossed by the zone temperature
*
* Throttling Logic: This uses three parameters to calculate the new
@@ -61,29 +61,23 @@ static int get_trip_level(struct thermal_zone_device *tz)
* new_state of cooling device = P3 * P2 * P1
*/
static void fair_share_throttle(struct thermal_zone_device *tz,
- const struct thermal_trip *trip,
+ const struct thermal_trip_desc *td,
int trip_level)
{
struct thermal_instance *instance;
int total_weight = 0;
int nr_instances = 0;
- list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
- if (instance->trip != trip)
- continue;
-
+ list_for_each_entry(instance, &td->thermal_instances, trip_node) {
total_weight += instance->weight;
nr_instances++;
}
- list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+ list_for_each_entry(instance, &td->thermal_instances, trip_node) {
struct thermal_cooling_device *cdev = instance->cdev;
u64 dividend;
u32 divisor;
- if (instance->trip != trip)
- continue;
-
dividend = trip_level;
dividend *= cdev->max_state;
divisor = tz->num_trips;
@@ -95,9 +89,7 @@ static void fair_share_throttle(struct thermal_zone_device *tz,
}
instance->target = div_u64(dividend, divisor);
- mutex_lock(&cdev->lock);
- __thermal_cdev_update(cdev);
- mutex_unlock(&cdev->lock);
+ thermal_cdev_update_nocheck(cdev);
}
}
@@ -116,7 +108,7 @@ static void fair_share_manage(struct thermal_zone_device *tz)
trip->type == THERMAL_TRIP_HOT)
continue;
- fair_share_throttle(tz, trip, trip_level);
+ fair_share_throttle(tz, td, trip_level);
}
}
diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c
index 1b2345a6..ac6fa6b 100644
--- a/drivers/thermal/gov_power_allocator.c
+++ b/drivers/thermal/gov_power_allocator.c
@@ -97,11 +97,9 @@ struct power_allocator_params {
struct power_actor *power;
};
-static bool power_actor_is_valid(struct power_allocator_params *params,
- struct thermal_instance *instance)
+static bool power_actor_is_valid(struct thermal_instance *instance)
{
- return (instance->trip == params->trip_max &&
- cdev_is_power_actor(instance->cdev));
+ return cdev_is_power_actor(instance->cdev);
}
/**
@@ -118,13 +116,14 @@ static bool power_actor_is_valid(struct power_allocator_params *params,
static u32 estimate_sustainable_power(struct thermal_zone_device *tz)
{
struct power_allocator_params *params = tz->governor_data;
+ const struct thermal_trip_desc *td = trip_to_trip_desc(params->trip_max);
struct thermal_cooling_device *cdev;
struct thermal_instance *instance;
u32 sustainable_power = 0;
u32 min_power;
- list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
- if (!power_actor_is_valid(params, instance))
+ list_for_each_entry(instance, &td->thermal_instances, trip_node) {
+ if (!power_actor_is_valid(instance))
continue;
cdev = instance->cdev;
@@ -323,9 +322,8 @@ power_actor_set_power(struct thermal_cooling_device *cdev,
return ret;
instance->target = clamp_val(state, instance->lower, instance->upper);
- mutex_lock(&cdev->lock);
- __thermal_cdev_update(cdev);
- mutex_unlock(&cdev->lock);
+
+ thermal_cdev_update_nocheck(cdev);
return 0;
}
@@ -356,11 +354,19 @@ static void divvy_up_power(struct power_actor *power, int num_actors,
u32 extra_power = 0;
int i;
- /*
- * Prevent division by 0 if none of the actors request power.
- */
- if (!total_req_power)
- total_req_power = 1;
+ if (!total_req_power) {
+ /*
+ * Nobody requested anything, just give everybody
+ * the maximum power
+ */
+ for (i = 0; i < num_actors; i++) {
+ struct power_actor *pa = &power[i];
+
+ pa->granted_power = pa->max_power;
+ }
+
+ return;
+ }
for (i = 0; i < num_actors; i++) {
struct power_actor *pa = &power[i];
@@ -400,6 +406,7 @@ static void divvy_up_power(struct power_actor *power, int num_actors,
static void allocate_power(struct thermal_zone_device *tz, int control_temp)
{
struct power_allocator_params *params = tz->governor_data;
+ const struct thermal_trip_desc *td = trip_to_trip_desc(params->trip_max);
unsigned int num_actors = params->num_actors;
struct power_actor *power = params->power;
struct thermal_cooling_device *cdev;
@@ -417,10 +424,10 @@ static void allocate_power(struct thermal_zone_device *tz, int control_temp)
/* Clean all buffers for new power estimations */
memset(power, 0, params->buffer_size);
- list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+ list_for_each_entry(instance, &td->thermal_instances, trip_node) {
struct power_actor *pa = &power[i];
- if (!power_actor_is_valid(params, instance))
+ if (!power_actor_is_valid(instance))
continue;
cdev = instance->cdev;
@@ -454,10 +461,10 @@ static void allocate_power(struct thermal_zone_device *tz, int control_temp)
power_range);
i = 0;
- list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+ list_for_each_entry(instance, &td->thermal_instances, trip_node) {
struct power_actor *pa = &power[i];
- if (!power_actor_is_valid(params, instance))
+ if (!power_actor_is_valid(instance))
continue;
power_actor_set_power(instance->cdev, instance,
@@ -538,29 +545,29 @@ static void reset_pid_controller(struct power_allocator_params *params)
static void allow_maximum_power(struct thermal_zone_device *tz)
{
struct power_allocator_params *params = tz->governor_data;
+ const struct thermal_trip_desc *td = trip_to_trip_desc(params->trip_max);
struct thermal_cooling_device *cdev;
struct thermal_instance *instance;
u32 req_power;
- list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
- if (!power_actor_is_valid(params, instance))
+ list_for_each_entry(instance, &td->thermal_instances, trip_node) {
+ if (!power_actor_is_valid(instance))
continue;
cdev = instance->cdev;
instance->target = 0;
- mutex_lock(&cdev->lock);
- /*
- * Call for updating the cooling devices local stats and avoid
- * periods of dozen of seconds when those have not been
- * maintained.
- */
- cdev->ops->get_requested_power(cdev, &req_power);
+ scoped_guard(cooling_dev, cdev) {
+ /*
+ * Call for updating the cooling devices local stats and
+ * avoid periods of dozen of seconds when those have not
+ * been maintained.
+ */
+ cdev->ops->get_requested_power(cdev, &req_power);
- if (params->update_cdevs)
- __thermal_cdev_update(cdev);
-
- mutex_unlock(&cdev->lock);
+ if (params->update_cdevs)
+ __thermal_cdev_update(cdev);
+ }
}
}
@@ -581,13 +588,11 @@ static void allow_maximum_power(struct thermal_zone_device *tz)
static int check_power_actors(struct thermal_zone_device *tz,
struct power_allocator_params *params)
{
+ const struct thermal_trip_desc *td = trip_to_trip_desc(params->trip_max);
struct thermal_instance *instance;
int ret = 0;
- list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
- if (instance->trip != params->trip_max)
- continue;
-
+ list_for_each_entry(instance, &td->thermal_instances, trip_node) {
if (!cdev_is_power_actor(instance->cdev)) {
dev_warn(&tz->device, "power_allocator: %s is not a power actor\n",
instance->cdev->type);
@@ -635,14 +640,15 @@ static void power_allocator_update_tz(struct thermal_zone_device *tz,
enum thermal_notify_event reason)
{
struct power_allocator_params *params = tz->governor_data;
+ const struct thermal_trip_desc *td = trip_to_trip_desc(params->trip_max);
struct thermal_instance *instance;
int num_actors = 0;
switch (reason) {
case THERMAL_TZ_BIND_CDEV:
case THERMAL_TZ_UNBIND_CDEV:
- list_for_each_entry(instance, &tz->thermal_instances, tz_node)
- if (power_actor_is_valid(params, instance))
+ list_for_each_entry(instance, &td->thermal_instances, trip_node)
+ if (power_actor_is_valid(instance))
num_actors++;
if (num_actors == params->num_actors)
@@ -652,8 +658,8 @@ static void power_allocator_update_tz(struct thermal_zone_device *tz,
break;
case THERMAL_INSTANCE_WEIGHT_CHANGED:
params->total_weight = 0;
- list_for_each_entry(instance, &tz->thermal_instances, tz_node)
- if (power_actor_is_valid(params, instance))
+ list_for_each_entry(instance, &td->thermal_instances, trip_node)
+ if (power_actor_is_valid(instance))
params->total_weight += instance->weight;
break;
default:
diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c
index fd55271..d1bb59f 100644
--- a/drivers/thermal/gov_step_wise.c
+++ b/drivers/thermal/gov_step_wise.c
@@ -66,9 +66,10 @@ static unsigned long get_target_state(struct thermal_instance *instance,
}
static void thermal_zone_trip_update(struct thermal_zone_device *tz,
- const struct thermal_trip *trip,
+ const struct thermal_trip_desc *td,
int trip_threshold)
{
+ const struct thermal_trip *trip = &td->trip;
enum thermal_trend trend = get_tz_trend(tz, trip);
int trip_id = thermal_zone_trip_id(tz, trip);
struct thermal_instance *instance;
@@ -82,12 +83,9 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz,
dev_dbg(&tz->device, "Trip%d[type=%d,temp=%d]:trend=%d,throttle=%d\n",
trip_id, trip->type, trip_threshold, trend, throttle);
- list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+ list_for_each_entry(instance, &td->thermal_instances, trip_node) {
int old_target;
- if (instance->trip != trip)
- continue;
-
old_target = instance->target;
instance->target = get_target_state(instance, trend, throttle);
@@ -99,9 +97,9 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz,
instance->initialized = true;
- mutex_lock(&instance->cdev->lock);
- instance->cdev->updated = false; /* cdev needs update */
- mutex_unlock(&instance->cdev->lock);
+ scoped_guard(cooling_dev, instance->cdev) {
+ instance->cdev->updated = false; /* cdev needs update */
+ }
}
}
@@ -127,11 +125,13 @@ static void step_wise_manage(struct thermal_zone_device *tz)
trip->type == THERMAL_TRIP_HOT)
continue;
- thermal_zone_trip_update(tz, trip, td->threshold);
+ thermal_zone_trip_update(tz, td, td->threshold);
}
- list_for_each_entry(instance, &tz->thermal_instances, tz_node)
- thermal_cdev_update(instance->cdev);
+ for_each_trip_desc(tz, td) {
+ list_for_each_entry(instance, &td->thermal_instances, trip_node)
+ thermal_cdev_update(instance->cdev);
+ }
}
static struct thermal_governor thermal_gov_step_wise = {
diff --git a/drivers/thermal/testing/zone.c b/drivers/thermal/testing/zone.c
index c6d8c66..1f4e450 100644
--- a/drivers/thermal/testing/zone.c
+++ b/drivers/thermal/testing/zone.c
@@ -185,7 +185,7 @@ static void tt_add_tz_work_fn(struct work_struct *work)
int tt_add_tz(void)
{
struct tt_thermal_zone *tt_zone __free(kfree);
- struct tt_work *tt_work __free(kfree);
+ struct tt_work *tt_work __free(kfree) = NULL;
int ret;
tt_zone = kzalloc(sizeof(*tt_zone), GFP_KERNEL);
@@ -237,7 +237,7 @@ static void tt_zone_unregister_tz(struct tt_thermal_zone *tt_zone)
int tt_del_tz(const char *arg)
{
- struct tt_work *tt_work __free(kfree);
+ struct tt_work *tt_work __free(kfree) = NULL;
struct tt_thermal_zone *tt_zone, *aux;
int ret;
int id;
@@ -288,19 +288,14 @@ static struct tt_thermal_zone *tt_get_tt_zone(const char *arg)
guard(mutex)(&tt_thermal_zones_lock);
- ret = -EINVAL;
list_for_each_entry(tt_zone, &tt_thermal_zones, list_node) {
if (tt_zone->id == id) {
tt_zone->refcount++;
- ret = 0;
- break;
+ return tt_zone;
}
}
- if (ret)
- return ERR_PTR(ret);
-
- return tt_zone;
+ return ERR_PTR(-EINVAL);
}
static void tt_put_tt_zone(struct tt_thermal_zone *tt_zone)
@@ -310,6 +305,9 @@ static void tt_put_tt_zone(struct tt_thermal_zone *tt_zone)
tt_zone->refcount--;
}
+DEFINE_FREE(put_tt_zone, struct tt_thermal_zone *,
+ if (!IS_ERR_OR_NULL(_T)) tt_put_tt_zone(_T))
+
static void tt_zone_add_trip_work_fn(struct work_struct *work)
{
struct tt_work *tt_work = tt_work_of_work(work);
@@ -332,9 +330,9 @@ static void tt_zone_add_trip_work_fn(struct work_struct *work)
int tt_zone_add_trip(const char *arg)
{
+ struct tt_thermal_zone *tt_zone __free(put_tt_zone) = NULL;
+ struct tt_trip *tt_trip __free(kfree) = NULL;
struct tt_work *tt_work __free(kfree);
- struct tt_trip *tt_trip __free(kfree);
- struct tt_thermal_zone *tt_zone;
int id;
tt_work = kzalloc(sizeof(*tt_work), GFP_KERNEL);
@@ -350,10 +348,8 @@ int tt_zone_add_trip(const char *arg)
return PTR_ERR(tt_zone);
id = ida_alloc(&tt_zone->ida, GFP_KERNEL);
- if (id < 0) {
- tt_put_tt_zone(tt_zone);
+ if (id < 0)
return id;
- }
tt_trip->trip.type = THERMAL_TRIP_ACTIVE;
tt_trip->trip.temperature = THERMAL_TEMP_INVALID;
@@ -366,7 +362,7 @@ int tt_zone_add_trip(const char *arg)
tt_zone->num_trips++;
INIT_WORK(&tt_work->work, tt_zone_add_trip_work_fn);
- tt_work->tt_zone = tt_zone;
+ tt_work->tt_zone = no_free_ptr(tt_zone);
tt_work->tt_trip = no_free_ptr(tt_trip);
schedule_work(&(no_free_ptr(tt_work)->work));
@@ -391,7 +387,7 @@ static struct thermal_zone_device_ops tt_zone_ops = {
static int tt_zone_register_tz(struct tt_thermal_zone *tt_zone)
{
- struct thermal_trip *trips __free(kfree);
+ struct thermal_trip *trips __free(kfree) = NULL;
struct thermal_zone_device *tz;
struct tt_trip *tt_trip;
int i;
@@ -425,23 +421,18 @@ static int tt_zone_register_tz(struct tt_thermal_zone *tt_zone)
int tt_zone_reg(const char *arg)
{
- struct tt_thermal_zone *tt_zone;
- int ret;
+ struct tt_thermal_zone *tt_zone __free(put_tt_zone);
tt_zone = tt_get_tt_zone(arg);
if (IS_ERR(tt_zone))
return PTR_ERR(tt_zone);
- ret = tt_zone_register_tz(tt_zone);
-
- tt_put_tt_zone(tt_zone);
-
- return ret;
+ return tt_zone_register_tz(tt_zone);
}
int tt_zone_unreg(const char *arg)
{
- struct tt_thermal_zone *tt_zone;
+ struct tt_thermal_zone *tt_zone __free(put_tt_zone);
tt_zone = tt_get_tt_zone(arg);
if (IS_ERR(tt_zone))
@@ -449,8 +440,6 @@ int tt_zone_unreg(const char *arg)
tt_zone_unregister_tz(tt_zone);
- tt_put_tt_zone(tt_zone);
-
return 0;
}
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 8f03985..19a3894 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -15,7 +15,6 @@
#include <linux/slab.h>
#include <linux/kdev_t.h>
#include <linux/idr.h>
-#include <linux/list_sort.h>
#include <linux/thermal.h>
#include <linux/reboot.h>
#include <linux/string.h>
@@ -40,6 +39,8 @@ static DEFINE_MUTEX(thermal_governor_lock);
static struct thermal_governor *def_governor;
+static bool thermal_pm_suspended;
+
/*
* Governor section: set of functions to handle thermal governors
*
@@ -122,7 +123,7 @@ int thermal_register_governor(struct thermal_governor *governor)
if (!governor)
return -EINVAL;
- mutex_lock(&thermal_governor_lock);
+ guard(mutex)(&thermal_governor_lock);
err = -EBUSY;
if (!__find_governor(governor->name)) {
@@ -138,7 +139,7 @@ int thermal_register_governor(struct thermal_governor *governor)
def_governor = governor;
}
- mutex_lock(&thermal_list_lock);
+ guard(mutex)(&thermal_list_lock);
list_for_each_entry(pos, &thermal_tz_list, node) {
/*
@@ -161,9 +162,6 @@ int thermal_register_governor(struct thermal_governor *governor)
}
}
- mutex_unlock(&thermal_list_lock);
- mutex_unlock(&thermal_governor_lock);
-
return err;
}
@@ -174,23 +172,20 @@ void thermal_unregister_governor(struct thermal_governor *governor)
if (!governor)
return;
- mutex_lock(&thermal_governor_lock);
+ guard(mutex)(&thermal_governor_lock);
if (!__find_governor(governor->name))
- goto exit;
+ return;
- mutex_lock(&thermal_list_lock);
+ list_del(&governor->governor_list);
+
+ guard(mutex)(&thermal_list_lock);
list_for_each_entry(pos, &thermal_tz_list, node) {
if (!strncasecmp(pos->governor->name, governor->name,
THERMAL_NAME_LENGTH))
thermal_set_governor(pos, NULL);
}
-
- mutex_unlock(&thermal_list_lock);
- list_del(&governor->governor_list);
-exit:
- mutex_unlock(&thermal_governor_lock);
}
int thermal_zone_device_set_policy(struct thermal_zone_device *tz,
@@ -199,18 +194,12 @@ int thermal_zone_device_set_policy(struct thermal_zone_device *tz,
struct thermal_governor *gov;
int ret = -EINVAL;
- mutex_lock(&thermal_governor_lock);
- mutex_lock(&tz->lock);
+ guard(mutex)(&thermal_governor_lock);
+ guard(thermal_zone)(tz);
gov = __find_governor(strim(policy));
- if (!gov)
- goto exit;
-
- ret = thermal_set_governor(tz, gov);
-
-exit:
- mutex_unlock(&tz->lock);
- mutex_unlock(&thermal_governor_lock);
+ if (gov)
+ ret = thermal_set_governor(tz, gov);
thermal_notify_tz_gov_change(tz, policy);
@@ -222,15 +211,13 @@ int thermal_build_list_of_policies(char *buf)
struct thermal_governor *pos;
ssize_t count = 0;
- mutex_lock(&thermal_governor_lock);
+ guard(mutex)(&thermal_governor_lock);
list_for_each_entry(pos, &thermal_governor_list, governor_list) {
count += sysfs_emit_at(buf, count, "%s ", pos->name);
}
count += sysfs_emit_at(buf, count, "\n");
- mutex_unlock(&thermal_governor_lock);
-
return count;
}
@@ -421,83 +408,46 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
tz->ops.hot(tz);
}
-static void handle_thermal_trip(struct thermal_zone_device *tz,
- struct thermal_trip_desc *td,
- struct list_head *way_up_list,
- struct list_head *way_down_list)
+static void move_trip_to_sorted_list(struct thermal_trip_desc *td,
+ struct list_head *list)
{
- const struct thermal_trip *trip = &td->trip;
- int old_threshold;
-
- if (trip->temperature == THERMAL_TEMP_INVALID)
- return;
+ struct thermal_trip_desc *entry;
/*
- * If the trip temperature or hysteresis has been updated recently,
- * the threshold needs to be computed again using the new values.
- * However, its initial value still reflects the old ones and that
- * is what needs to be compared with the previous zone temperature
- * to decide which action to take.
+ * Delete upfront and then add to make relocation within the same list
+ * work.
*/
- old_threshold = td->threshold;
- td->threshold = trip->temperature;
+ list_del(&td->list_node);
- if (tz->last_temperature >= old_threshold &&
- tz->last_temperature != THERMAL_TEMP_INIT) {
- /*
- * Mitigation is under way, so it needs to stop if the zone
- * temperature falls below the low temperature of the trip.
- * In that case, the trip temperature becomes the new threshold.
- */
- if (tz->temperature < trip->temperature - trip->hysteresis) {
- list_add(&td->notify_list_node, way_down_list);
- td->notify_temp = trip->temperature - trip->hysteresis;
-
- if (trip->type == THERMAL_TRIP_PASSIVE) {
- tz->passive--;
- WARN_ON(tz->passive < 0);
- }
- } else {
- td->threshold -= trip->hysteresis;
+ /* Assume that the new entry is likely to be the last one. */
+ list_for_each_entry_reverse(entry, list, list_node) {
+ if (entry->threshold <= td->threshold) {
+ list_add(&td->list_node, &entry->list_node);
+ return;
}
- } else if (tz->temperature >= trip->temperature) {
- /*
- * There is no mitigation under way, so it needs to be started
- * if the zone temperature exceeds the trip one. The new
- * threshold is then set to the low temperature of the trip.
- */
- list_add_tail(&td->notify_list_node, way_up_list);
- td->notify_temp = trip->temperature;
- td->threshold -= trip->hysteresis;
-
- if (trip->type == THERMAL_TRIP_PASSIVE)
- tz->passive++;
- else if (trip->type == THERMAL_TRIP_CRITICAL ||
- trip->type == THERMAL_TRIP_HOT)
- handle_critical_trips(tz, trip);
}
+ list_add(&td->list_node, list);
}
-static void thermal_zone_device_check(struct work_struct *work)
+static void move_to_trips_high(struct thermal_zone_device *tz,
+ struct thermal_trip_desc *td)
{
- struct thermal_zone_device *tz = container_of(work, struct
- thermal_zone_device,
- poll_queue.work);
- thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
+ td->threshold = td->trip.temperature;
+ move_trip_to_sorted_list(td, &tz->trips_high);
}
-static void thermal_zone_device_init(struct thermal_zone_device *tz)
+static void move_to_trips_reached(struct thermal_zone_device *tz,
+ struct thermal_trip_desc *td)
{
- struct thermal_instance *pos;
+ td->threshold = td->trip.temperature - td->trip.hysteresis;
+ move_trip_to_sorted_list(td, &tz->trips_reached);
+}
- INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_check);
-
- tz->temperature = THERMAL_TEMP_INIT;
- tz->passive = 0;
- tz->prev_low_trip = -INT_MAX;
- tz->prev_high_trip = INT_MAX;
- list_for_each_entry(pos, &tz->thermal_instances, tz_node)
- pos->initialized = false;
+static void move_to_trips_invalid(struct thermal_zone_device *tz,
+ struct thermal_trip_desc *td)
+{
+ td->threshold = INT_MAX;
+ list_move(&td->list_node, &tz->trips_invalid);
}
static void thermal_governor_trip_crossed(struct thermal_governor *governor,
@@ -513,41 +463,154 @@ static void thermal_governor_trip_crossed(struct thermal_governor *governor,
}
static void thermal_trip_crossed(struct thermal_zone_device *tz,
- const struct thermal_trip *trip,
+ struct thermal_trip_desc *td,
struct thermal_governor *governor,
bool crossed_up)
{
+ const struct thermal_trip *trip = &td->trip;
+
if (crossed_up) {
+ if (trip->type == THERMAL_TRIP_PASSIVE)
+ tz->passive++;
+ else if (trip->type == THERMAL_TRIP_CRITICAL ||
+ trip->type == THERMAL_TRIP_HOT)
+ handle_critical_trips(tz, trip);
+
thermal_notify_tz_trip_up(tz, trip);
thermal_debug_tz_trip_up(tz, trip);
} else {
+ if (trip->type == THERMAL_TRIP_PASSIVE) {
+ tz->passive--;
+ WARN_ON(tz->passive < 0);
+ }
thermal_notify_tz_trip_down(tz, trip);
thermal_debug_tz_trip_down(tz, trip);
}
thermal_governor_trip_crossed(governor, tz, trip, crossed_up);
}
-static int thermal_trip_notify_cmp(void *not_used, const struct list_head *a,
- const struct list_head *b)
+void thermal_zone_set_trip_hyst(struct thermal_zone_device *tz,
+ struct thermal_trip *trip, int hyst)
{
- struct thermal_trip_desc *tda = container_of(a, struct thermal_trip_desc,
- notify_list_node);
- struct thermal_trip_desc *tdb = container_of(b, struct thermal_trip_desc,
- notify_list_node);
- return tda->notify_temp - tdb->notify_temp;
+ struct thermal_trip_desc *td = trip_to_trip_desc(trip);
+
+ WRITE_ONCE(trip->hysteresis, hyst);
+ thermal_notify_tz_trip_change(tz, trip);
+ /*
+ * If the zone temperature is above or at the trip tmperature, the trip
+ * is in the trips_reached list and its threshold is equal to its low
+ * temperature. It needs to stay in that list, but its threshold needs
+ * to be updated and the list ordering may need to be restored.
+ */
+ if (tz->temperature >= td->threshold)
+ move_to_trips_reached(tz, td);
+}
+
+void thermal_zone_set_trip_temp(struct thermal_zone_device *tz,
+ struct thermal_trip *trip, int temp)
+{
+ struct thermal_trip_desc *td = trip_to_trip_desc(trip);
+ int old_temp = trip->temperature;
+
+ if (old_temp == temp)
+ return;
+
+ WRITE_ONCE(trip->temperature, temp);
+ thermal_notify_tz_trip_change(tz, trip);
+
+ if (old_temp == THERMAL_TEMP_INVALID) {
+ /*
+ * The trip was invalid before the change, so move it to the
+ * trips_high list regardless of the new temperature value
+ * because there is no mitigation under way for it. If a
+ * mitigation needs to be started, the trip will be moved to the
+ * trips_reached list later.
+ */
+ move_to_trips_high(tz, td);
+ return;
+ }
+
+ if (temp == THERMAL_TEMP_INVALID) {
+ /*
+ * If the trip is in the trips_reached list, mitigation is under
+ * way for it and it needs to be stopped because the trip is
+ * effectively going away.
+ */
+ if (tz->temperature >= td->threshold)
+ thermal_trip_crossed(tz, td, thermal_get_tz_governor(tz), false);
+
+ move_to_trips_invalid(tz, td);
+ return;
+ }
+
+ /*
+ * The trip stays on its current list, but its threshold needs to be
+ * updated due to the temperature change and the list ordering may need
+ * to be restored.
+ */
+ if (tz->temperature >= td->threshold)
+ move_to_trips_reached(tz, td);
+ else
+ move_to_trips_high(tz, td);
+}
+EXPORT_SYMBOL_GPL(thermal_zone_set_trip_temp);
+
+static void thermal_zone_handle_trips(struct thermal_zone_device *tz,
+ struct thermal_governor *governor,
+ int *low, int *high)
+{
+ struct thermal_trip_desc *td, *next;
+ LIST_HEAD(way_down_list);
+
+ /* Check the trips that were below or at the zone temperature. */
+ list_for_each_entry_safe_reverse(td, next, &tz->trips_reached, list_node) {
+ if (td->threshold <= tz->temperature)
+ break;
+
+ thermal_trip_crossed(tz, td, governor, false);
+ /*
+ * The current trips_high list needs to be processed before
+ * adding new entries to it, so put them on a temporary list.
+ */
+ list_move(&td->list_node, &way_down_list);
+ }
+ /* Check the trips that were previously above the zone temperature. */
+ list_for_each_entry_safe(td, next, &tz->trips_high, list_node) {
+ if (td->threshold > tz->temperature)
+ break;
+
+ thermal_trip_crossed(tz, td, governor, true);
+ move_to_trips_reached(tz, td);
+ }
+ /* Move all of the trips from the temporary list to trips_high. */
+ list_for_each_entry_safe(td, next, &way_down_list, list_node)
+ move_to_trips_high(tz, td);
+
+ if (!list_empty(&tz->trips_reached)) {
+ td = list_last_entry(&tz->trips_reached,
+ struct thermal_trip_desc, list_node);
+ /*
+ * Set the "low" value below the current trip threshold in case
+ * the zone temperature is at that threshold and stays there,
+ * which would trigger a new interrupt immediately in vain.
+ */
+ *low = td->threshold - 1;
+ }
+ if (!list_empty(&tz->trips_high)) {
+ td = list_first_entry(&tz->trips_high,
+ struct thermal_trip_desc, list_node);
+ *high = td->threshold;
+ }
}
void __thermal_zone_device_update(struct thermal_zone_device *tz,
enum thermal_notify_event event)
{
struct thermal_governor *governor = thermal_get_tz_governor(tz);
- struct thermal_trip_desc *td;
- LIST_HEAD(way_down_list);
- LIST_HEAD(way_up_list);
int low = -INT_MAX, high = INT_MAX;
int temp, ret;
- if (tz->suspended || tz->mode != THERMAL_DEVICE_ENABLED)
+ if (tz->state != TZ_STATE_READY || tz->mode != THERMAL_DEVICE_ENABLED)
return;
ret = __thermal_zone_get_temp(tz, &temp);
@@ -575,26 +638,12 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
tz->notify_event = event;
- for_each_trip_desc(tz, td) {
- handle_thermal_trip(tz, td, &way_up_list, &way_down_list);
+ thermal_zone_handle_trips(tz, governor, &low, &high);
- if (td->threshold <= tz->temperature && td->threshold > low)
- low = td->threshold;
-
- if (td->threshold >= tz->temperature && td->threshold < high)
- high = td->threshold;
- }
+ thermal_thresholds_handle(tz, &low, &high);
thermal_zone_set_trips(tz, low, high);
- list_sort(NULL, &way_up_list, thermal_trip_notify_cmp);
- list_for_each_entry(td, &way_up_list, notify_list_node)
- thermal_trip_crossed(tz, &td->trip, governor, true);
-
- list_sort(NULL, &way_down_list, thermal_trip_notify_cmp);
- list_for_each_entry_reverse(td, &way_down_list, notify_list_node)
- thermal_trip_crossed(tz, &td->trip, governor, false);
-
if (governor->manage)
governor->manage(tz);
@@ -609,26 +658,18 @@ static int thermal_zone_device_set_mode(struct thermal_zone_device *tz,
{
int ret;
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
/* do nothing if mode isn't changing */
- if (mode == tz->mode) {
- mutex_unlock(&tz->lock);
-
+ if (mode == tz->mode)
return 0;
- }
ret = __thermal_zone_device_set_mode(tz, mode);
- if (ret) {
- mutex_unlock(&tz->lock);
-
+ if (ret)
return ret;
- }
__thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
- mutex_unlock(&tz->lock);
-
if (mode == THERMAL_DEVICE_ENABLED)
thermal_notify_tz_enable(tz);
else
@@ -657,85 +698,81 @@ static bool thermal_zone_is_present(struct thermal_zone_device *tz)
void thermal_zone_device_update(struct thermal_zone_device *tz,
enum thermal_notify_event event)
{
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
+
if (thermal_zone_is_present(tz))
__thermal_zone_device_update(tz, event);
- mutex_unlock(&tz->lock);
}
EXPORT_SYMBOL_GPL(thermal_zone_device_update);
-void thermal_zone_trip_down(struct thermal_zone_device *tz,
- const struct thermal_trip *trip)
-{
- thermal_trip_crossed(tz, trip, thermal_get_tz_governor(tz), false);
-}
-
int for_each_thermal_governor(int (*cb)(struct thermal_governor *, void *),
void *data)
{
struct thermal_governor *gov;
- int ret = 0;
- mutex_lock(&thermal_governor_lock);
+ guard(mutex)(&thermal_governor_lock);
+
list_for_each_entry(gov, &thermal_governor_list, governor_list) {
+ int ret;
+
ret = cb(gov, data);
if (ret)
- break;
+ return ret;
}
- mutex_unlock(&thermal_governor_lock);
- return ret;
+ return 0;
}
int for_each_thermal_cooling_device(int (*cb)(struct thermal_cooling_device *,
void *), void *data)
{
struct thermal_cooling_device *cdev;
- int ret = 0;
- mutex_lock(&thermal_list_lock);
+ guard(mutex)(&thermal_list_lock);
+
list_for_each_entry(cdev, &thermal_cdev_list, node) {
+ int ret;
+
ret = cb(cdev, data);
if (ret)
- break;
+ return ret;
}
- mutex_unlock(&thermal_list_lock);
- return ret;
+ return 0;
}
int for_each_thermal_zone(int (*cb)(struct thermal_zone_device *, void *),
void *data)
{
struct thermal_zone_device *tz;
- int ret = 0;
- mutex_lock(&thermal_list_lock);
+ guard(mutex)(&thermal_list_lock);
+
list_for_each_entry(tz, &thermal_tz_list, node) {
+ int ret;
+
ret = cb(tz, data);
if (ret)
- break;
+ return ret;
}
- mutex_unlock(&thermal_list_lock);
- return ret;
+ return 0;
}
struct thermal_zone_device *thermal_zone_get_by_id(int id)
{
- struct thermal_zone_device *tz, *match = NULL;
+ struct thermal_zone_device *tz;
- mutex_lock(&thermal_list_lock);
+ guard(mutex)(&thermal_list_lock);
+
list_for_each_entry(tz, &thermal_tz_list, node) {
if (tz->id == id) {
get_device(&tz->device);
- match = tz;
- break;
+ return tz;
}
}
- mutex_unlock(&thermal_list_lock);
- return match;
+ return NULL;
}
/*
@@ -748,12 +785,32 @@ struct thermal_zone_device *thermal_zone_get_by_id(int id)
* binding, and unbinding.
*/
+static int thermal_instance_add(struct thermal_instance *new_instance,
+ struct thermal_cooling_device *cdev,
+ struct thermal_trip_desc *td)
+{
+ struct thermal_instance *instance;
+
+ list_for_each_entry(instance, &td->thermal_instances, trip_node) {
+ if (instance->cdev == cdev)
+ return -EEXIST;
+ }
+
+ list_add_tail(&new_instance->trip_node, &td->thermal_instances);
+
+ guard(cooling_dev)(cdev);
+
+ list_add_tail(&new_instance->cdev_node, &cdev->thermal_instances);
+
+ return 0;
+}
+
/**
* thermal_bind_cdev_to_trip - bind a cooling device to a thermal zone
* @tz: pointer to struct thermal_zone_device
- * @trip: trip point the cooling devices is associated with in this zone.
+ * @td: descriptor of the trip point to bind @cdev to
* @cdev: pointer to struct thermal_cooling_device
- * @cool_spec: cooling specification for @trip and @cdev
+ * @cool_spec: cooling specification for the trip point and @cdev
*
* This interface function bind a thermal cooling device to the certain trip
* point of a thermal zone device.
@@ -762,12 +819,11 @@ struct thermal_zone_device *thermal_zone_get_by_id(int id)
* Return: 0 on success, the proper error value otherwise.
*/
static int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz,
- const struct thermal_trip *trip,
+ struct thermal_trip_desc *td,
struct thermal_cooling_device *cdev,
struct cooling_spec *cool_spec)
{
struct thermal_instance *dev;
- struct thermal_instance *pos;
bool upper_no_limit;
int result;
@@ -790,7 +846,7 @@ static int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz,
return -ENOMEM;
dev->cdev = cdev;
- dev->trip = trip;
+ dev->trip = &td->trip;
dev->upper = cool_spec->upper;
dev->upper_no_limit = upper_no_limit;
dev->lower = cool_spec->lower;
@@ -829,24 +885,15 @@ static int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz,
if (result)
goto remove_trip_file;
- mutex_lock(&cdev->lock);
- list_for_each_entry(pos, &tz->thermal_instances, tz_node)
- if (pos->trip == trip && pos->cdev == cdev) {
- result = -EEXIST;
- break;
- }
- if (!result) {
- list_add_tail(&dev->tz_node, &tz->thermal_instances);
- list_add_tail(&dev->cdev_node, &cdev->thermal_instances);
- atomic_set(&tz->need_update, 1);
+ result = thermal_instance_add(dev, cdev, td);
+ if (result)
+ goto remove_weight_file;
- thermal_governor_update_tz(tz, THERMAL_TZ_BIND_CDEV);
- }
- mutex_unlock(&cdev->lock);
+ thermal_governor_update_tz(tz, THERMAL_TZ_BIND_CDEV);
- if (!result)
- return 0;
+ return 0;
+remove_weight_file:
device_remove_file(&tz->device, &dev->weight_attr);
remove_trip_file:
device_remove_file(&tz->device, &dev->attr);
@@ -859,10 +906,19 @@ static int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz,
return result;
}
+static void thermal_instance_delete(struct thermal_instance *instance)
+{
+ list_del(&instance->trip_node);
+
+ guard(cooling_dev)(instance->cdev);
+
+ list_del(&instance->cdev_node);
+}
+
/**
* thermal_unbind_cdev_from_trip - unbind a cooling device from a thermal zone.
* @tz: pointer to a struct thermal_zone_device.
- * @trip: trip point the cooling devices is associated with in this zone.
+ * @td: descriptor of the trip point to unbind @cdev from
* @cdev: pointer to a struct thermal_cooling_device.
*
* This interface function unbind a thermal cooling device from the certain
@@ -870,28 +926,23 @@ static int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz,
* This function is usually called in the thermal zone device .unbind callback.
*/
static void thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz,
- const struct thermal_trip *trip,
+ struct thermal_trip_desc *td,
struct thermal_cooling_device *cdev)
{
struct thermal_instance *pos, *next;
- mutex_lock(&cdev->lock);
- list_for_each_entry_safe(pos, next, &tz->thermal_instances, tz_node) {
- if (pos->trip == trip && pos->cdev == cdev) {
- list_del(&pos->tz_node);
- list_del(&pos->cdev_node);
-
- thermal_governor_update_tz(tz, THERMAL_TZ_UNBIND_CDEV);
-
- mutex_unlock(&cdev->lock);
+ list_for_each_entry_safe(pos, next, &td->thermal_instances, trip_node) {
+ if (pos->cdev == cdev) {
+ thermal_instance_delete(pos);
goto unbind;
}
}
- mutex_unlock(&cdev->lock);
return;
unbind:
+ thermal_governor_update_tz(tz, THERMAL_TZ_UNBIND_CDEV);
+
device_remove_file(&tz->device, &pos->weight_attr);
device_remove_file(&tz->device, &pos->attr);
sysfs_remove_link(&tz->device.kobj, pos->name);
@@ -924,25 +975,23 @@ static struct class *thermal_class;
static inline
void print_bind_err_msg(struct thermal_zone_device *tz,
- const struct thermal_trip *trip,
+ const struct thermal_trip_desc *td,
struct thermal_cooling_device *cdev, int ret)
{
dev_err(&tz->device, "binding cdev %s to trip %d failed: %d\n",
- cdev->type, thermal_zone_trip_id(tz, trip), ret);
+ cdev->type, thermal_zone_trip_id(tz, &td->trip), ret);
}
-static void thermal_zone_cdev_bind(struct thermal_zone_device *tz,
- struct thermal_cooling_device *cdev)
+static bool __thermal_zone_cdev_bind(struct thermal_zone_device *tz,
+ struct thermal_cooling_device *cdev)
{
struct thermal_trip_desc *td;
+ bool update_tz = false;
if (!tz->ops.should_bind)
- return;
-
- mutex_lock(&tz->lock);
+ return false;
for_each_trip_desc(tz, td) {
- struct thermal_trip *trip = &td->trip;
struct cooling_spec c = {
.upper = THERMAL_NO_LIMIT,
.lower = THERMAL_NO_LIMIT,
@@ -950,15 +999,40 @@ static void thermal_zone_cdev_bind(struct thermal_zone_device *tz,
};
int ret;
- if (!tz->ops.should_bind(tz, trip, cdev, &c))
+ if (!tz->ops.should_bind(tz, &td->trip, cdev, &c))
continue;
- ret = thermal_bind_cdev_to_trip(tz, trip, cdev, &c);
- if (ret)
- print_bind_err_msg(tz, trip, cdev, ret);
+ ret = thermal_bind_cdev_to_trip(tz, td, cdev, &c);
+ if (ret) {
+ print_bind_err_msg(tz, td, cdev, ret);
+ continue;
+ }
+
+ update_tz = true;
}
- mutex_unlock(&tz->lock);
+ return update_tz;
+}
+
+static void thermal_zone_cdev_bind(struct thermal_zone_device *tz,
+ struct thermal_cooling_device *cdev)
+{
+ guard(thermal_zone)(tz);
+
+ if (__thermal_zone_cdev_bind(tz, cdev))
+ __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
+}
+
+static void thermal_cooling_device_init_complete(struct thermal_cooling_device *cdev)
+{
+ struct thermal_zone_device *tz;
+
+ guard(mutex)(&thermal_list_lock);
+
+ list_add(&cdev->node, &thermal_cdev_list);
+
+ list_for_each_entry(tz, &thermal_tz_list, node)
+ thermal_zone_cdev_bind(tz, cdev);
}
/**
@@ -983,7 +1057,6 @@ __thermal_cooling_device_register(struct device_node *np,
const struct thermal_cooling_device_ops *ops)
{
struct thermal_cooling_device *cdev;
- struct thermal_zone_device *pos = NULL;
unsigned long current_state;
int id, ret;
@@ -1050,21 +1123,7 @@ __thermal_cooling_device_register(struct device_node *np,
if (current_state <= cdev->max_state)
thermal_debug_cdev_add(cdev, current_state);
- /* Add 'this' new cdev to the global cdev list */
- mutex_lock(&thermal_list_lock);
-
- list_add(&cdev->node, &thermal_cdev_list);
-
- /* Update binding information for 'this' new cdev */
- list_for_each_entry(pos, &thermal_tz_list, node)
- thermal_zone_cdev_bind(pos, cdev);
-
- list_for_each_entry(pos, &thermal_tz_list, node)
- if (atomic_cmpxchg(&pos->need_update, 1, 0))
- thermal_zone_device_update(pos,
- THERMAL_EVENT_UNSPECIFIED);
-
- mutex_unlock(&thermal_list_lock);
+ thermal_cooling_device_init_complete(cdev);
return cdev;
@@ -1207,19 +1266,19 @@ void thermal_cooling_device_update(struct thermal_cooling_device *cdev)
* Hold thermal_list_lock throughout the update to prevent the device
* from going away while being updated.
*/
- mutex_lock(&thermal_list_lock);
+ guard(mutex)(&thermal_list_lock);
if (!thermal_cooling_device_present(cdev))
- goto unlock_list;
+ return;
/*
* Update under the cdev lock to prevent the state from being set beyond
* the new limit concurrently.
*/
- mutex_lock(&cdev->lock);
+ guard(cooling_dev)(cdev);
if (cdev->ops->get_max_state(cdev, &cdev->max_state))
- goto unlock;
+ return;
thermal_cooling_device_stats_reinit(cdev);
@@ -1246,63 +1305,59 @@ void thermal_cooling_device_update(struct thermal_cooling_device *cdev)
}
if (cdev->ops->get_cur_state(cdev, &state) || state > cdev->max_state)
- goto unlock;
+ return;
thermal_cooling_device_stats_update(cdev, state);
-
-unlock:
- mutex_unlock(&cdev->lock);
-
-unlock_list:
- mutex_unlock(&thermal_list_lock);
}
EXPORT_SYMBOL_GPL(thermal_cooling_device_update);
+static void __thermal_zone_cdev_unbind(struct thermal_zone_device *tz,
+ struct thermal_cooling_device *cdev)
+{
+ struct thermal_trip_desc *td;
+
+ for_each_trip_desc(tz, td)
+ thermal_unbind_cdev_from_trip(tz, td, cdev);
+}
+
static void thermal_zone_cdev_unbind(struct thermal_zone_device *tz,
struct thermal_cooling_device *cdev)
{
- struct thermal_trip_desc *td;
+ guard(thermal_zone)(tz);
- mutex_lock(&tz->lock);
-
- for_each_trip_desc(tz, td)
- thermal_unbind_cdev_from_trip(tz, &td->trip, cdev);
-
- mutex_unlock(&tz->lock);
+ __thermal_zone_cdev_unbind(tz, cdev);
}
-/**
- * thermal_cooling_device_unregister - removes a thermal cooling device
- * @cdev: the thermal cooling device to remove.
- *
- * thermal_cooling_device_unregister() must be called when a registered
- * thermal cooling device is no longer needed.
- */
-void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
+static bool thermal_cooling_device_exit(struct thermal_cooling_device *cdev)
{
struct thermal_zone_device *tz;
+ guard(mutex)(&thermal_list_lock);
+
+ if (!thermal_cooling_device_present(cdev))
+ return false;
+
+ list_del(&cdev->node);
+
+ list_for_each_entry(tz, &thermal_tz_list, node)
+ thermal_zone_cdev_unbind(tz, cdev);
+
+ return true;
+}
+
+/**
+ * thermal_cooling_device_unregister() - removes a thermal cooling device
+ * @cdev: Thermal cooling device to remove.
+ */
+void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
+{
if (!cdev)
return;
thermal_debug_cdev_remove(cdev);
- mutex_lock(&thermal_list_lock);
-
- if (!thermal_cooling_device_present(cdev)) {
- mutex_unlock(&thermal_list_lock);
- return;
- }
-
- list_del(&cdev->node);
-
- /* Unbind all thermal zones associated with 'this' cdev */
- list_for_each_entry(tz, &thermal_tz_list, node)
- thermal_zone_cdev_unbind(tz, cdev);
-
- mutex_unlock(&thermal_list_lock);
-
- device_unregister(&cdev->device);
+ if (thermal_cooling_device_exit(cdev))
+ device_unregister(&cdev->device);
}
EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister);
@@ -1314,7 +1369,7 @@ int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp)
if (tz->ops.get_crit_temp)
return tz->ops.get_crit_temp(tz, temp);
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
for_each_trip_desc(tz, td) {
const struct thermal_trip *trip = &td->trip;
@@ -1326,12 +1381,91 @@ int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp)
}
}
- mutex_unlock(&tz->lock);
-
return ret;
}
EXPORT_SYMBOL_GPL(thermal_zone_get_crit_temp);
+static void thermal_zone_device_check(struct work_struct *work)
+{
+ struct thermal_zone_device *tz = container_of(work, struct
+ thermal_zone_device,
+ poll_queue.work);
+ thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
+}
+
+static void thermal_zone_device_init(struct thermal_zone_device *tz)
+{
+ struct thermal_trip_desc *td, *next;
+
+ INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_check);
+
+ tz->temperature = THERMAL_TEMP_INIT;
+ tz->passive = 0;
+ tz->prev_low_trip = -INT_MAX;
+ tz->prev_high_trip = INT_MAX;
+ for_each_trip_desc(tz, td) {
+ struct thermal_instance *instance;
+
+ list_for_each_entry(instance, &td->thermal_instances, trip_node)
+ instance->initialized = false;
+ }
+ /*
+ * At this point, all valid trips need to be moved to trips_high so that
+ * mitigation can be started if the zone temperature is above them.
+ */
+ list_for_each_entry_safe(td, next, &tz->trips_invalid, list_node) {
+ if (td->trip.temperature != THERMAL_TEMP_INVALID)
+ move_to_trips_high(tz, td);
+ }
+ /* The trips_reached list may not be empty during system resume. */
+ list_for_each_entry_safe(td, next, &tz->trips_reached, list_node) {
+ if (td->trip.temperature == THERMAL_TEMP_INVALID)
+ move_to_trips_invalid(tz, td);
+ else
+ move_to_trips_high(tz, td);
+ }
+}
+
+static int thermal_zone_init_governor(struct thermal_zone_device *tz)
+{
+ struct thermal_governor *governor;
+
+ guard(mutex)(&thermal_governor_lock);
+
+ if (tz->tzp)
+ governor = __find_governor(tz->tzp->governor_name);
+ else
+ governor = def_governor;
+
+ return thermal_set_governor(tz, governor);
+}
+
+static void thermal_zone_init_complete(struct thermal_zone_device *tz)
+{
+ struct thermal_cooling_device *cdev;
+
+ guard(mutex)(&thermal_list_lock);
+
+ list_add_tail(&tz->node, &thermal_tz_list);
+
+ guard(thermal_zone)(tz);
+
+ /* Bind cooling devices for this zone. */
+ list_for_each_entry(cdev, &thermal_cdev_list, node)
+ __thermal_zone_cdev_bind(tz, cdev);
+
+ tz->state &= ~TZ_STATE_FLAG_INIT;
+ /*
+ * If system suspend or resume is in progress at this point, the
+ * new thermal zone needs to be marked as suspended because
+ * thermal_pm_notify() has run already.
+ */
+ if (thermal_pm_suspended)
+ tz->state |= TZ_STATE_FLAG_SUSPENDED;
+
+ __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
+}
+
/**
* thermal_zone_device_register_with_trips() - register a new thermal zone device
* @type: the thermal zone device type
@@ -1366,12 +1500,10 @@ thermal_zone_device_register_with_trips(const char *type,
unsigned int polling_delay)
{
const struct thermal_trip *trip = trips;
- struct thermal_cooling_device *cdev;
struct thermal_zone_device *tz;
struct thermal_trip_desc *td;
int id;
int result;
- struct thermal_governor *governor;
if (!type || strlen(type) == 0) {
pr_err("No thermal zone type defined\n");
@@ -1415,8 +1547,10 @@ thermal_zone_device_register_with_trips(const char *type,
}
}
- INIT_LIST_HEAD(&tz->thermal_instances);
INIT_LIST_HEAD(&tz->node);
+ INIT_LIST_HEAD(&tz->trips_high);
+ INIT_LIST_HEAD(&tz->trips_reached);
+ INIT_LIST_HEAD(&tz->trips_invalid);
ida_init(&tz->ida);
mutex_init(&tz->lock);
init_completion(&tz->removal);
@@ -1439,51 +1573,41 @@ thermal_zone_device_register_with_trips(const char *type,
tz->num_trips = num_trips;
for_each_trip_desc(tz, td) {
td->trip = *trip++;
+ INIT_LIST_HEAD(&td->thermal_instances);
+ INIT_LIST_HEAD(&td->list_node);
/*
* Mark all thresholds as invalid to start with even though
* this only matters for the trips that start as invalid and
* become valid later.
*/
- td->threshold = INT_MAX;
+ move_to_trips_invalid(tz, td);
}
tz->polling_delay_jiffies = msecs_to_jiffies(polling_delay);
tz->passive_delay_jiffies = msecs_to_jiffies(passive_delay);
tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY;
+ tz->state = TZ_STATE_FLAG_INIT;
+
/* sys I/F */
/* Add nodes that are always present via .groups */
result = thermal_zone_create_device_groups(tz);
if (result)
goto remove_id;
- /* A new thermal zone needs to be updated anyway. */
- atomic_set(&tz->need_update, 1);
-
result = dev_set_name(&tz->device, "thermal_zone%d", tz->id);
if (result) {
thermal_zone_destroy_device_groups(tz);
goto remove_id;
}
+ thermal_zone_device_init(tz);
result = device_register(&tz->device);
if (result)
goto release_device;
- /* Update 'this' zone's governor information */
- mutex_lock(&thermal_governor_lock);
-
- if (tz->tzp)
- governor = __find_governor(tz->tzp->governor_name);
- else
- governor = def_governor;
-
- result = thermal_set_governor(tz, governor);
- if (result) {
- mutex_unlock(&thermal_governor_lock);
+ result = thermal_zone_init_governor(tz);
+ if (result)
goto unregister;
- }
-
- mutex_unlock(&thermal_governor_lock);
if (!tz->tzp || !tz->tzp->no_hwmon) {
result = thermal_add_hwmon_sysfs(tz);
@@ -1491,22 +1615,11 @@ thermal_zone_device_register_with_trips(const char *type,
goto unregister;
}
- mutex_lock(&thermal_list_lock);
+ result = thermal_thresholds_init(tz);
+ if (result)
+ goto remove_hwmon;
- mutex_lock(&tz->lock);
- list_add_tail(&tz->node, &thermal_tz_list);
- mutex_unlock(&tz->lock);
-
- /* Bind cooling devices for this zone */
- list_for_each_entry(cdev, &thermal_cdev_list, node)
- thermal_zone_cdev_bind(tz, cdev);
-
- mutex_unlock(&thermal_list_lock);
-
- thermal_zone_device_init(tz);
- /* Update the new thermal zone and mark it as already updated. */
- if (atomic_cmpxchg(&tz->need_update, 1, 0))
- thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
+ thermal_zone_init_complete(tz);
thermal_notify_tz_create(tz);
@@ -1514,6 +1627,8 @@ thermal_zone_device_register_with_trips(const char *type,
return tz;
+remove_hwmon:
+ thermal_remove_hwmon_sysfs(tz);
unregister:
device_del(&tz->device);
release_device:
@@ -1563,44 +1678,46 @@ struct device *thermal_zone_device(struct thermal_zone_device *tzd)
}
EXPORT_SYMBOL_GPL(thermal_zone_device);
+static bool thermal_zone_exit(struct thermal_zone_device *tz)
+{
+ struct thermal_cooling_device *cdev;
+
+ guard(mutex)(&thermal_list_lock);
+
+ if (list_empty(&tz->node))
+ return false;
+
+ guard(thermal_zone)(tz);
+
+ tz->state |= TZ_STATE_FLAG_EXIT;
+ list_del_init(&tz->node);
+
+ /* Unbind all cdevs associated with this thermal zone. */
+ list_for_each_entry(cdev, &thermal_cdev_list, node)
+ __thermal_zone_cdev_unbind(tz, cdev);
+
+ return true;
+}
+
/**
* thermal_zone_device_unregister - removes the registered thermal zone device
* @tz: the thermal zone device to remove
*/
void thermal_zone_device_unregister(struct thermal_zone_device *tz)
{
- struct thermal_cooling_device *cdev;
- struct thermal_zone_device *pos = NULL;
-
if (!tz)
return;
thermal_debug_tz_remove(tz);
- mutex_lock(&thermal_list_lock);
- list_for_each_entry(pos, &thermal_tz_list, node)
- if (pos == tz)
- break;
- if (pos != tz) {
- /* thermal zone device not found */
- mutex_unlock(&thermal_list_lock);
+ if (!thermal_zone_exit(tz))
return;
- }
-
- mutex_lock(&tz->lock);
- list_del(&tz->node);
- mutex_unlock(&tz->lock);
-
- /* Unbind all cdevs associated with 'this' thermal zone */
- list_for_each_entry(cdev, &thermal_cdev_list, node)
- thermal_zone_cdev_unbind(tz, cdev);
-
- mutex_unlock(&thermal_list_lock);
cancel_delayed_work_sync(&tz->poll_queue);
thermal_set_governor(tz, NULL);
+ thermal_thresholds_exit(tz);
thermal_remove_hwmon_sysfs(tz);
ida_free(&thermal_tz_ida, tz->id);
ida_destroy(&tz->ida);
@@ -1632,24 +1749,23 @@ struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name)
unsigned int found = 0;
if (!name)
- goto exit;
+ return ERR_PTR(-EINVAL);
- mutex_lock(&thermal_list_lock);
+ guard(mutex)(&thermal_list_lock);
+
list_for_each_entry(pos, &thermal_tz_list, node)
if (!strncasecmp(name, pos->type, THERMAL_NAME_LENGTH)) {
found++;
ref = pos;
}
- mutex_unlock(&thermal_list_lock);
- /* nothing has been found, thus an error code for it */
- if (found == 0)
- ref = ERR_PTR(-ENODEV);
- else if (found > 1)
- /* Success only when an unique zone is found */
- ref = ERR_PTR(-EEXIST);
+ if (!found)
+ return ERR_PTR(-ENODEV);
-exit:
+ /* Success only when one zone is found. */
+ if (found > 1)
+ return ERR_PTR(-EEXIST);
+
return ref;
}
EXPORT_SYMBOL_GPL(thermal_zone_get_zone_by_name);
@@ -1660,9 +1776,9 @@ static void thermal_zone_device_resume(struct work_struct *work)
tz = container_of(work, struct thermal_zone_device, poll_queue.work);
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
- tz->suspended = false;
+ tz->state &= ~(TZ_STATE_FLAG_SUSPENDED | TZ_STATE_FLAG_RESUMING);
thermal_debug_tz_resume(tz);
thermal_zone_device_init(tz);
@@ -1670,74 +1786,81 @@ static void thermal_zone_device_resume(struct work_struct *work)
__thermal_zone_device_update(tz, THERMAL_TZ_RESUME);
complete(&tz->resume);
- tz->resuming = false;
+}
- mutex_unlock(&tz->lock);
+static void thermal_zone_pm_prepare(struct thermal_zone_device *tz)
+{
+ guard(thermal_zone)(tz);
+
+ if (tz->state & TZ_STATE_FLAG_RESUMING) {
+ /*
+ * thermal_zone_device_resume() queued up for this zone has not
+ * acquired the lock yet, so release it to let the function run
+ * and wait util it has done the work.
+ */
+ scoped_guard(thermal_zone_reverse, tz) {
+ wait_for_completion(&tz->resume);
+ }
+ }
+
+ tz->state |= TZ_STATE_FLAG_SUSPENDED;
+}
+
+static void thermal_pm_notify_prepare(void)
+{
+ struct thermal_zone_device *tz;
+
+ guard(mutex)(&thermal_list_lock);
+
+ thermal_pm_suspended = true;
+
+ list_for_each_entry(tz, &thermal_tz_list, node)
+ thermal_zone_pm_prepare(tz);
+}
+
+static void thermal_zone_pm_complete(struct thermal_zone_device *tz)
+{
+ guard(thermal_zone)(tz);
+
+ cancel_delayed_work(&tz->poll_queue);
+
+ reinit_completion(&tz->resume);
+ tz->state |= TZ_STATE_FLAG_RESUMING;
+
+ /*
+ * Replace the work function with the resume one, which will restore the
+ * original work function and schedule the polling work if needed.
+ */
+ INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_resume);
+ /* Queue up the work without a delay. */
+ mod_delayed_work(system_freezable_power_efficient_wq, &tz->poll_queue, 0);
+}
+
+static void thermal_pm_notify_complete(void)
+{
+ struct thermal_zone_device *tz;
+
+ guard(mutex)(&thermal_list_lock);
+
+ thermal_pm_suspended = false;
+
+ list_for_each_entry(tz, &thermal_tz_list, node)
+ thermal_zone_pm_complete(tz);
}
static int thermal_pm_notify(struct notifier_block *nb,
unsigned long mode, void *_unused)
{
- struct thermal_zone_device *tz;
-
switch (mode) {
case PM_HIBERNATION_PREPARE:
case PM_RESTORE_PREPARE:
case PM_SUSPEND_PREPARE:
- mutex_lock(&thermal_list_lock);
-
- list_for_each_entry(tz, &thermal_tz_list, node) {
- mutex_lock(&tz->lock);
-
- if (tz->resuming) {
- /*
- * thermal_zone_device_resume() queued up for
- * this zone has not acquired the lock yet, so
- * release it to let the function run and wait
- * util it has done the work.
- */
- mutex_unlock(&tz->lock);
-
- wait_for_completion(&tz->resume);
-
- mutex_lock(&tz->lock);
- }
-
- tz->suspended = true;
-
- mutex_unlock(&tz->lock);
- }
-
- mutex_unlock(&thermal_list_lock);
+ thermal_pm_notify_prepare();
break;
case PM_POST_HIBERNATION:
case PM_POST_RESTORE:
case PM_POST_SUSPEND:
- mutex_lock(&thermal_list_lock);
-
- list_for_each_entry(tz, &thermal_tz_list, node) {
- mutex_lock(&tz->lock);
-
- cancel_delayed_work(&tz->poll_queue);
-
- reinit_completion(&tz->resume);
- tz->resuming = true;
-
- /*
- * Replace the work function with the resume one, which
- * will restore the original work function and schedule
- * the polling work if needed.
- */
- INIT_DELAYED_WORK(&tz->poll_queue,
- thermal_zone_device_resume);
- /* Queue up the work without a delay. */
- mod_delayed_work(system_freezable_power_efficient_wq,
- &tz->poll_queue, 0);
-
- mutex_unlock(&tz->lock);
- }
-
- mutex_unlock(&thermal_list_lock);
+ thermal_pm_notify_complete();
break;
default:
break;
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index a64d39b1..be271e7 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -9,10 +9,12 @@
#ifndef __THERMAL_CORE_H__
#define __THERMAL_CORE_H__
+#include <linux/cleanup.h>
#include <linux/device.h>
#include <linux/thermal.h>
#include "thermal_netlink.h"
+#include "thermal_thresholds.h"
#include "thermal_debugfs.h"
struct thermal_attr {
@@ -29,8 +31,8 @@ struct thermal_trip_attrs {
struct thermal_trip_desc {
struct thermal_trip trip;
struct thermal_trip_attrs trip_attrs;
- struct list_head notify_list_node;
- int notify_temp;
+ struct list_head list_node;
+ struct list_head thermal_instances;
int threshold;
};
@@ -61,6 +63,13 @@ struct thermal_governor {
struct list_head governor_list;
};
+#define TZ_STATE_FLAG_SUSPENDED BIT(0)
+#define TZ_STATE_FLAG_RESUMING BIT(1)
+#define TZ_STATE_FLAG_INIT BIT(2)
+#define TZ_STATE_FLAG_EXIT BIT(3)
+
+#define TZ_STATE_READY 0
+
/**
* struct thermal_zone_device - structure for a thermal zone
* @id: unique id number for each thermal zone
@@ -68,6 +77,9 @@ struct thermal_governor {
* @device: &struct device for this thermal zone
* @removal: removal completion
* @resume: resume completion
+ * @trips_high: trips above the current zone temperature
+ * @trips_reached: trips below or at the current zone temperature
+ * @trips_invalid: trips with invalid temperature
* @mode: current mode of this thermal zone
* @devdata: private pointer for device private data
* @num_trips: number of trip points the thermal zone supports
@@ -88,20 +100,17 @@ struct thermal_governor {
trip point.
* @prev_high_trip: the above current temperature if you've crossed a
passive trip point.
- * @need_update: if equals 1, thermal_zone_device_update needs to be invoked.
* @ops: operations this &thermal_zone_device supports
* @tzp: thermal zone parameters
* @governor: pointer to the governor for this thermal zone
* @governor_data: private pointer for governor data
- * @thermal_instances: list of &struct thermal_instance of this thermal zone
* @ida: &struct ida to generate unique id for this zone's cooling
* devices
* @lock: lock to protect thermal_instances list
* @node: node in thermal_tz_list (in thermal_core.c)
* @poll_queue: delayed work for polling
* @notify_event: Last notification event
- * @suspended: thermal zone suspend indicator
- * @resuming: indicates whether or not thermal zone resume is in progress
+ * @state: current state of the thermal zone
* @trips: array of struct thermal_trip objects
*/
struct thermal_zone_device {
@@ -111,6 +120,9 @@ struct thermal_zone_device {
struct completion removal;
struct completion resume;
struct attribute_group trips_attribute_group;
+ struct list_head trips_high;
+ struct list_head trips_reached;
+ struct list_head trips_invalid;
enum thermal_device_mode mode;
void *devdata;
int num_trips;
@@ -123,25 +135,29 @@ struct thermal_zone_device {
int passive;
int prev_low_trip;
int prev_high_trip;
- atomic_t need_update;
struct thermal_zone_device_ops ops;
struct thermal_zone_params *tzp;
struct thermal_governor *governor;
void *governor_data;
- struct list_head thermal_instances;
struct ida ida;
struct mutex lock;
struct list_head node;
struct delayed_work poll_queue;
enum thermal_notify_event notify_event;
- bool suspended;
- bool resuming;
+ u8 state;
#ifdef CONFIG_THERMAL_DEBUGFS
struct thermal_debugfs *debugfs;
#endif
+ struct list_head user_thresholds;
struct thermal_trip_desc trips[] __counted_by(num_trips);
};
+DEFINE_GUARD(thermal_zone, struct thermal_zone_device *, mutex_lock(&_T->lock),
+ mutex_unlock(&_T->lock))
+
+DEFINE_GUARD(thermal_zone_reverse, struct thermal_zone_device *,
+ mutex_unlock(&_T->lock), mutex_lock(&_T->lock))
+
/* Initial thermal zone temperature. */
#define THERMAL_TEMP_INIT INT_MIN
@@ -204,6 +220,7 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev)
}
void thermal_cdev_update(struct thermal_cooling_device *);
+void thermal_cdev_update_nocheck(struct thermal_cooling_device *cdev);
void __thermal_cdev_update(struct thermal_cooling_device *cdev);
int get_tz_trend(struct thermal_zone_device *tz, const struct thermal_trip *trip);
@@ -226,7 +243,7 @@ struct thermal_instance {
struct device_attribute attr;
char weight_attr_name[THERMAL_NAME_LENGTH];
struct device_attribute weight_attr;
- struct list_head tz_node; /* node in tz->thermal_instances */
+ struct list_head trip_node; /* node in trip->thermal_instances */
struct list_head cdev_node; /* node in cdev->thermal_instances */
unsigned int weight; /* The weight of the cooling device */
bool upper_no_limit;
@@ -261,8 +278,6 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz, int low, int high);
int thermal_zone_trip_id(const struct thermal_zone_device *tz,
const struct thermal_trip *trip);
int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
-void thermal_zone_trip_down(struct thermal_zone_device *tz,
- const struct thermal_trip *trip);
void thermal_zone_set_trip_hyst(struct thermal_zone_device *tz,
struct thermal_trip *trip, int hyst);
diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c
index 939d3e5..c800504 100644
--- a/drivers/thermal/thermal_debugfs.c
+++ b/drivers/thermal/thermal_debugfs.c
@@ -516,6 +516,19 @@ void thermal_debug_cdev_add(struct thermal_cooling_device *cdev, int state)
cdev->debugfs = thermal_dbg;
}
+static struct thermal_debugfs *thermal_debug_cdev_clear(struct thermal_cooling_device *cdev)
+{
+ struct thermal_debugfs *thermal_dbg;
+
+ guard(cooling_dev)(cdev);
+
+ thermal_dbg = cdev->debugfs;
+ if (thermal_dbg)
+ cdev->debugfs = NULL;
+
+ return thermal_dbg;
+}
+
/**
* thermal_debug_cdev_remove - Remove a cooling device debugfs entry
*
@@ -527,17 +540,9 @@ void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev)
{
struct thermal_debugfs *thermal_dbg;
- mutex_lock(&cdev->lock);
-
- thermal_dbg = cdev->debugfs;
- if (!thermal_dbg) {
- mutex_unlock(&cdev->lock);
+ thermal_dbg = thermal_debug_cdev_clear(cdev);
+ if (!thermal_dbg)
return;
- }
-
- cdev->debugfs = NULL;
-
- mutex_unlock(&cdev->lock);
mutex_lock(&thermal_dbg->lock);
@@ -885,6 +890,19 @@ void thermal_debug_tz_add(struct thermal_zone_device *tz)
tz->debugfs = thermal_dbg;
}
+static struct thermal_debugfs *thermal_debug_tz_clear(struct thermal_zone_device *tz)
+{
+ struct thermal_debugfs *thermal_dbg;
+
+ guard(thermal_zone)(tz);
+
+ thermal_dbg = tz->debugfs;
+ if (thermal_dbg)
+ tz->debugfs = NULL;
+
+ return thermal_dbg;
+}
+
void thermal_debug_tz_remove(struct thermal_zone_device *tz)
{
struct thermal_debugfs *thermal_dbg;
@@ -892,17 +910,9 @@ void thermal_debug_tz_remove(struct thermal_zone_device *tz)
struct tz_debugfs *tz_dbg;
int *trips_crossed;
- mutex_lock(&tz->lock);
-
- thermal_dbg = tz->debugfs;
- if (!thermal_dbg) {
- mutex_unlock(&tz->lock);
+ thermal_dbg = thermal_debug_tz_clear(tz);
+ if (!thermal_dbg)
return;
- }
-
- tz->debugfs = NULL;
-
- mutex_unlock(&tz->lock);
tz_dbg = &thermal_dbg->tz_dbg;
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index dc374a7..b1152ad 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -43,10 +43,11 @@ static bool thermal_instance_present(struct thermal_zone_device *tz,
struct thermal_cooling_device *cdev,
const struct thermal_trip *trip)
{
+ const struct thermal_trip_desc *td = trip_to_trip_desc(trip);
struct thermal_instance *ti;
- list_for_each_entry(ti, &tz->thermal_instances, tz_node) {
- if (ti->trip == trip && ti->cdev == cdev)
+ list_for_each_entry(ti, &td->thermal_instances, trip_node) {
+ if (ti->cdev == cdev)
return true;
}
@@ -57,17 +58,10 @@ bool thermal_trip_is_bound_to_cdev(struct thermal_zone_device *tz,
const struct thermal_trip *trip,
struct thermal_cooling_device *cdev)
{
- bool ret;
+ guard(thermal_zone)(tz);
+ guard(cooling_dev)(cdev);
- mutex_lock(&tz->lock);
- mutex_lock(&cdev->lock);
-
- ret = thermal_instance_present(tz, cdev, trip);
-
- mutex_unlock(&cdev->lock);
- mutex_unlock(&tz->lock);
-
- return ret;
+ return thermal_instance_present(tz, cdev, trip);
}
EXPORT_SYMBOL_GPL(thermal_trip_is_bound_to_cdev);
@@ -137,19 +131,14 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
if (IS_ERR_OR_NULL(tz))
return -EINVAL;
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
- if (!tz->ops.get_temp) {
- ret = -EINVAL;
- goto unlock;
- }
+ if (!tz->ops.get_temp)
+ return -EINVAL;
ret = __thermal_zone_get_temp(tz, temp);
if (!ret && *temp <= THERMAL_TEMP_INVALID)
- ret = -ENODATA;
-
-unlock:
- mutex_unlock(&tz->lock);
+ return -ENODATA;
return ret;
}
@@ -201,12 +190,23 @@ void __thermal_cdev_update(struct thermal_cooling_device *cdev)
*/
void thermal_cdev_update(struct thermal_cooling_device *cdev)
{
- mutex_lock(&cdev->lock);
+ guard(cooling_dev)(cdev);
+
if (!cdev->updated) {
__thermal_cdev_update(cdev);
cdev->updated = true;
}
- mutex_unlock(&cdev->lock);
+}
+
+/**
+ * thermal_cdev_update_nocheck() - Unconditionally update cooling device state
+ * @cdev: Target cooling device.
+ */
+void thermal_cdev_update_nocheck(struct thermal_cooling_device *cdev)
+{
+ guard(cooling_dev)(cdev);
+
+ __thermal_cdev_update(cdev);
}
/**
diff --git a/drivers/thermal/thermal_hwmon.c b/drivers/thermal/thermal_hwmon.c
index f0e504f..37da7a8 100644
--- a/drivers/thermal/thermal_hwmon.c
+++ b/drivers/thermal/thermal_hwmon.c
@@ -78,12 +78,9 @@ temp_crit_show(struct device *dev, struct device_attribute *attr, char *buf)
int temperature;
int ret;
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
ret = tz->ops.get_crit_temp(tz, &temperature);
-
- mutex_unlock(&tz->lock);
-
if (ret)
return ret;
diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c
index f3c58c7..315a76b 100644
--- a/drivers/thermal/thermal_netlink.c
+++ b/drivers/thermal/thermal_netlink.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
+#include <net/sock.h>
#include <net/genetlink.h>
#include <uapi/linux/thermal.h>
@@ -49,6 +50,11 @@ static const struct nla_policy thermal_genl_policy[THERMAL_GENL_ATTR_MAX + 1] =
[THERMAL_GENL_ATTR_CPU_CAPABILITY_ID] = { .type = NLA_U32 },
[THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE] = { .type = NLA_U32 },
[THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY] = { .type = NLA_U32 },
+
+ /* Thresholds */
+ [THERMAL_GENL_ATTR_THRESHOLD] = { .type = NLA_NESTED },
+ [THERMAL_GENL_ATTR_THRESHOLD_TEMP] = { .type = NLA_U32 },
+ [THERMAL_GENL_ATTR_THRESHOLD_DIRECTION] = { .type = NLA_U32 },
};
struct param {
@@ -62,6 +68,8 @@ struct param {
int trip_type;
int trip_hyst;
int temp;
+ int prev_temp;
+ int direction;
int cdev_state;
int cdev_max_state;
struct thermal_genl_cpu_caps *cpu_capabilities;
@@ -234,6 +242,34 @@ static int thermal_genl_event_cpu_capability_change(struct param *p)
return -EMSGSIZE;
}
+static int thermal_genl_event_threshold_add(struct param *p)
+{
+ if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) ||
+ nla_put_u32(p->msg, THERMAL_GENL_ATTR_THRESHOLD_TEMP, p->temp) ||
+ nla_put_u32(p->msg, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, p->direction))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int thermal_genl_event_threshold_flush(struct param *p)
+{
+ if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int thermal_genl_event_threshold_up(struct param *p)
+{
+ if (nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id) ||
+ nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_PREV_TEMP, p->prev_temp) ||
+ nla_put_u32(p->msg, THERMAL_GENL_ATTR_TZ_TEMP, p->temp))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
int thermal_genl_event_tz_delete(struct param *p)
__attribute__((alias("thermal_genl_event_tz")));
@@ -246,6 +282,12 @@ int thermal_genl_event_tz_disable(struct param *p)
int thermal_genl_event_tz_trip_down(struct param *p)
__attribute__((alias("thermal_genl_event_tz_trip_up")));
+int thermal_genl_event_threshold_delete(struct param *p)
+ __attribute__((alias("thermal_genl_event_threshold_add")));
+
+int thermal_genl_event_threshold_down(struct param *p)
+ __attribute__((alias("thermal_genl_event_threshold_up")));
+
static cb_t event_cb[] = {
[THERMAL_GENL_EVENT_TZ_CREATE] = thermal_genl_event_tz_create,
[THERMAL_GENL_EVENT_TZ_DELETE] = thermal_genl_event_tz_delete,
@@ -259,6 +301,11 @@ static cb_t event_cb[] = {
[THERMAL_GENL_EVENT_CDEV_STATE_UPDATE] = thermal_genl_event_cdev_state_update,
[THERMAL_GENL_EVENT_TZ_GOV_CHANGE] = thermal_genl_event_gov_change,
[THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE] = thermal_genl_event_cpu_capability_change,
+ [THERMAL_GENL_EVENT_THRESHOLD_ADD] = thermal_genl_event_threshold_add,
+ [THERMAL_GENL_EVENT_THRESHOLD_DELETE] = thermal_genl_event_threshold_delete,
+ [THERMAL_GENL_EVENT_THRESHOLD_FLUSH] = thermal_genl_event_threshold_flush,
+ [THERMAL_GENL_EVENT_THRESHOLD_DOWN] = thermal_genl_event_threshold_down,
+ [THERMAL_GENL_EVENT_THRESHOLD_UP] = thermal_genl_event_threshold_up,
};
/*
@@ -401,6 +448,43 @@ int thermal_genl_cpu_capability_event(int count,
}
EXPORT_SYMBOL_GPL(thermal_genl_cpu_capability_event);
+int thermal_notify_threshold_add(const struct thermal_zone_device *tz,
+ int temperature, int direction)
+{
+ struct param p = { .tz_id = tz->id, .temp = temperature, .direction = direction };
+
+ return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_ADD, &p);
+}
+
+int thermal_notify_threshold_delete(const struct thermal_zone_device *tz,
+ int temperature, int direction)
+{
+ struct param p = { .tz_id = tz->id, .temp = temperature, .direction = direction };
+
+ return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_DELETE, &p);
+}
+
+int thermal_notify_threshold_flush(const struct thermal_zone_device *tz)
+{
+ struct param p = { .tz_id = tz->id };
+
+ return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_FLUSH, &p);
+}
+
+int thermal_notify_threshold_down(const struct thermal_zone_device *tz)
+{
+ struct param p = { .tz_id = tz->id, .temp = tz->temperature, .prev_temp = tz->last_temperature };
+
+ return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_DOWN, &p);
+}
+
+int thermal_notify_threshold_up(const struct thermal_zone_device *tz)
+{
+ struct param p = { .tz_id = tz->id, .temp = tz->temperature, .prev_temp = tz->last_temperature };
+
+ return thermal_genl_send_event(THERMAL_GENL_EVENT_THRESHOLD_UP, &p);
+}
+
/*************************** Command encoding ********************************/
static int __thermal_genl_cmd_tz_get_id(struct thermal_zone_device *tz,
@@ -459,7 +543,7 @@ static int thermal_genl_cmd_tz_get_trip(struct param *p)
if (!start_trip)
return -EMSGSIZE;
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
for_each_trip_desc(tz, td) {
const struct thermal_trip *trip = &td->trip;
@@ -469,19 +553,12 @@ static int thermal_genl_cmd_tz_get_trip(struct param *p)
nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, trip->type) ||
nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TEMP, trip->temperature) ||
nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_HYST, trip->hysteresis))
- goto out_cancel_nest;
+ return -EMSGSIZE;
}
- mutex_unlock(&tz->lock);
-
nla_nest_end(msg, start_trip);
return 0;
-
-out_cancel_nest:
- mutex_unlock(&tz->lock);
-
- return -EMSGSIZE;
}
static int thermal_genl_cmd_tz_get_temp(struct param *p)
@@ -512,7 +589,7 @@ static int thermal_genl_cmd_tz_get_temp(struct param *p)
static int thermal_genl_cmd_tz_get_gov(struct param *p)
{
struct sk_buff *msg = p->msg;
- int id, ret = 0;
+ int id;
if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID])
return -EINVAL;
@@ -523,16 +600,14 @@ static int thermal_genl_cmd_tz_get_gov(struct param *p)
if (!tz)
return -EINVAL;
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, id) ||
nla_put_string(msg, THERMAL_GENL_ATTR_TZ_GOV_NAME,
tz->governor->name))
- ret = -EMSGSIZE;
+ return -EMSGSIZE;
- mutex_unlock(&tz->lock);
-
- return ret;
+ return 0;
}
static int __thermal_genl_cmd_cdev_get(struct thermal_cooling_device *cdev,
@@ -572,12 +647,128 @@ static int thermal_genl_cmd_cdev_get(struct param *p)
return ret;
}
+static int __thermal_genl_cmd_threshold_get(struct user_threshold *threshold, void *arg)
+{
+ struct sk_buff *msg = arg;
+
+ if (nla_put_u32(msg, THERMAL_GENL_ATTR_THRESHOLD_TEMP, threshold->temperature) ||
+ nla_put_u32(msg, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, threshold->direction))
+ return -1;
+
+ return 0;
+}
+
+static int thermal_genl_cmd_threshold_get(struct param *p)
+{
+ struct sk_buff *msg = p->msg;
+ struct nlattr *start_trip;
+ int id, ret;
+
+ if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID])
+ return -EINVAL;
+
+ id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]);
+
+ CLASS(thermal_zone_get_by_id, tz)(id);
+ if (!tz)
+ return -EINVAL;
+
+ start_trip = nla_nest_start(msg, THERMAL_GENL_ATTR_THRESHOLD);
+ if (!start_trip)
+ return -EMSGSIZE;
+
+ ret = thermal_thresholds_for_each(tz, __thermal_genl_cmd_threshold_get, msg);
+ if (ret)
+ return -EMSGSIZE;
+
+ nla_nest_end(msg, start_trip);
+
+ return 0;
+}
+
+static int thermal_genl_cmd_threshold_add(struct param *p)
+{
+ int id, temp, direction;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID] ||
+ !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP] ||
+ !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION])
+ return -EINVAL;
+
+ id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]);
+ temp = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP]);
+ direction = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]);
+
+ CLASS(thermal_zone_get_by_id, tz)(id);
+ if (!tz)
+ return -EINVAL;
+
+ guard(thermal_zone)(tz);
+
+ return thermal_thresholds_add(tz, temp, direction);
+}
+
+static int thermal_genl_cmd_threshold_delete(struct param *p)
+{
+ int id, temp, direction;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID] ||
+ !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP] ||
+ !p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION])
+ return -EINVAL;
+
+ id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]);
+ temp = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP]);
+ direction = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]);
+
+ CLASS(thermal_zone_get_by_id, tz)(id);
+ if (!tz)
+ return -EINVAL;
+
+ guard(thermal_zone)(tz);
+
+ return thermal_thresholds_delete(tz, temp, direction);
+}
+
+static int thermal_genl_cmd_threshold_flush(struct param *p)
+{
+ int id;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!p->attrs[THERMAL_GENL_ATTR_TZ_ID])
+ return -EINVAL;
+
+ id = nla_get_u32(p->attrs[THERMAL_GENL_ATTR_TZ_ID]);
+
+ CLASS(thermal_zone_get_by_id, tz)(id);
+ if (!tz)
+ return -EINVAL;
+
+ guard(thermal_zone)(tz);
+
+ thermal_thresholds_flush(tz);
+
+ return 0;
+}
+
static cb_t cmd_cb[] = {
- [THERMAL_GENL_CMD_TZ_GET_ID] = thermal_genl_cmd_tz_get_id,
- [THERMAL_GENL_CMD_TZ_GET_TRIP] = thermal_genl_cmd_tz_get_trip,
- [THERMAL_GENL_CMD_TZ_GET_TEMP] = thermal_genl_cmd_tz_get_temp,
- [THERMAL_GENL_CMD_TZ_GET_GOV] = thermal_genl_cmd_tz_get_gov,
- [THERMAL_GENL_CMD_CDEV_GET] = thermal_genl_cmd_cdev_get,
+ [THERMAL_GENL_CMD_TZ_GET_ID] = thermal_genl_cmd_tz_get_id,
+ [THERMAL_GENL_CMD_TZ_GET_TRIP] = thermal_genl_cmd_tz_get_trip,
+ [THERMAL_GENL_CMD_TZ_GET_TEMP] = thermal_genl_cmd_tz_get_temp,
+ [THERMAL_GENL_CMD_TZ_GET_GOV] = thermal_genl_cmd_tz_get_gov,
+ [THERMAL_GENL_CMD_CDEV_GET] = thermal_genl_cmd_cdev_get,
+ [THERMAL_GENL_CMD_THRESHOLD_GET] = thermal_genl_cmd_threshold_get,
+ [THERMAL_GENL_CMD_THRESHOLD_ADD] = thermal_genl_cmd_threshold_add,
+ [THERMAL_GENL_CMD_THRESHOLD_DELETE] = thermal_genl_cmd_threshold_delete,
+ [THERMAL_GENL_CMD_THRESHOLD_FLUSH] = thermal_genl_cmd_threshold_flush,
};
static int thermal_genl_cmd_dumpit(struct sk_buff *skb,
@@ -688,6 +879,26 @@ static const struct genl_small_ops thermal_genl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.dumpit = thermal_genl_cmd_dumpit,
},
+ {
+ .cmd = THERMAL_GENL_CMD_THRESHOLD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = thermal_genl_cmd_doit,
+ },
+ {
+ .cmd = THERMAL_GENL_CMD_THRESHOLD_ADD,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = thermal_genl_cmd_doit,
+ },
+ {
+ .cmd = THERMAL_GENL_CMD_THRESHOLD_DELETE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = thermal_genl_cmd_doit,
+ },
+ {
+ .cmd = THERMAL_GENL_CMD_THRESHOLD_FLUSH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = thermal_genl_cmd_doit,
+ },
};
static struct genl_family thermal_genl_family __ro_after_init = {
@@ -700,7 +911,7 @@ static struct genl_family thermal_genl_family __ro_after_init = {
.unbind = thermal_genl_unbind,
.small_ops = thermal_genl_ops,
.n_small_ops = ARRAY_SIZE(thermal_genl_ops),
- .resv_start_op = THERMAL_GENL_CMD_CDEV_GET + 1,
+ .resv_start_op = __THERMAL_GENL_CMD_MAX,
.mcgrps = thermal_genl_mcgrps,
.n_mcgrps = ARRAY_SIZE(thermal_genl_mcgrps),
};
diff --git a/drivers/thermal/thermal_netlink.h b/drivers/thermal/thermal_netlink.h
index e01221e..075e9ae 100644
--- a/drivers/thermal/thermal_netlink.h
+++ b/drivers/thermal/thermal_netlink.h
@@ -53,6 +53,13 @@ int thermal_notify_tz_gov_change(const struct thermal_zone_device *tz,
int thermal_genl_sampling_temp(int id, int temp);
int thermal_genl_cpu_capability_event(int count,
struct thermal_genl_cpu_caps *caps);
+int thermal_notify_threshold_add(const struct thermal_zone_device *tz,
+ int temperature, int direction);
+int thermal_notify_threshold_delete(const struct thermal_zone_device *tz,
+ int temperature, int direction);
+int thermal_notify_threshold_flush(const struct thermal_zone_device *tz);
+int thermal_notify_threshold_down(const struct thermal_zone_device *tz);
+int thermal_notify_threshold_up(const struct thermal_zone_device *tz);
#else
static inline int thermal_netlink_init(void)
{
@@ -139,6 +146,33 @@ static inline int thermal_genl_cpu_capability_event(int count, struct thermal_ge
return 0;
}
+static inline int thermal_notify_threshold_add(const struct thermal_zone_device *tz,
+ int temperature, int direction)
+{
+ return 0;
+}
+
+static inline int thermal_notify_threshold_delete(const struct thermal_zone_device *tz,
+ int temperature, int direction)
+{
+ return 0;
+}
+
+static inline int thermal_notify_threshold_flush(const struct thermal_zone_device *tz)
+{
+ return 0;
+}
+
+static inline int thermal_notify_threshold_down(const struct thermal_zone_device *tz)
+{
+ return 0;
+}
+
+static inline int thermal_notify_threshold_up(const struct thermal_zone_device *tz)
+{
+ return 0;
+}
+
static inline void __init thermal_netlink_exit(void) {}
#endif /* CONFIG_THERMAL_NETLINK */
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index 1838aa7..24b9055 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -50,13 +50,13 @@ static ssize_t
mode_show(struct device *dev, struct device_attribute *attr, char *buf)
{
struct thermal_zone_device *tz = to_thermal_zone(dev);
- int enabled;
- mutex_lock(&tz->lock);
- enabled = tz->mode == THERMAL_DEVICE_ENABLED;
- mutex_unlock(&tz->lock);
+ guard(thermal_zone)(tz);
- return sprintf(buf, "%s\n", enabled ? "enabled" : "disabled");
+ if (tz->mode == THERMAL_DEVICE_ENABLED)
+ return sprintf(buf, "enabled\n");
+
+ return sprintf(buf, "disabled\n");
}
static ssize_t
@@ -103,38 +103,34 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,
{
struct thermal_trip *trip = thermal_trip_of_attr(attr, temp);
struct thermal_zone_device *tz = to_thermal_zone(dev);
- int ret, temp;
+ int temp;
- ret = kstrtoint(buf, 10, &temp);
- if (ret)
+ if (kstrtoint(buf, 10, &temp))
return -EINVAL;
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
if (temp == trip->temperature)
- goto unlock;
+ return count;
/* Arrange the condition to avoid integer overflows. */
if (temp != THERMAL_TEMP_INVALID &&
- temp <= trip->hysteresis + THERMAL_TEMP_INVALID) {
- ret = -EINVAL;
- goto unlock;
- }
+ temp <= trip->hysteresis + THERMAL_TEMP_INVALID)
+ return -EINVAL;
if (tz->ops.set_trip_temp) {
+ int ret;
+
ret = tz->ops.set_trip_temp(tz, trip, temp);
if (ret)
- goto unlock;
+ return ret;
}
thermal_zone_set_trip_temp(tz, trip, temp);
__thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED);
-unlock:
- mutex_unlock(&tz->lock);
-
- return ret ? ret : count;
+ return count;
}
static ssize_t
@@ -152,16 +148,15 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
{
struct thermal_trip *trip = thermal_trip_of_attr(attr, hyst);
struct thermal_zone_device *tz = to_thermal_zone(dev);
- int ret, hyst;
+ int hyst;
- ret = kstrtoint(buf, 10, &hyst);
- if (ret || hyst < 0)
+ if (kstrtoint(buf, 10, &hyst) || hyst < 0)
return -EINVAL;
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
if (hyst == trip->hysteresis)
- goto unlock;
+ return count;
/*
* Allow the hysteresis to be updated when the temperature is invalid
@@ -171,22 +166,17 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
*/
if (trip->temperature == THERMAL_TEMP_INVALID) {
WRITE_ONCE(trip->hysteresis, hyst);
- goto unlock;
+ return count;
}
- if (trip->temperature - hyst <= THERMAL_TEMP_INVALID) {
- ret = -EINVAL;
- goto unlock;
- }
+ if (trip->temperature - hyst <= THERMAL_TEMP_INVALID)
+ return -EINVAL;
thermal_zone_set_trip_hyst(tz, trip, hyst);
__thermal_zone_device_update(tz, THERMAL_TRIP_CHANGED);
-unlock:
- mutex_unlock(&tz->lock);
-
- return ret ? ret : count;
+ return count;
}
static ssize_t
@@ -236,25 +226,26 @@ emul_temp_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct thermal_zone_device *tz = to_thermal_zone(dev);
- int ret = 0;
int temperature;
if (kstrtoint(buf, 10, &temperature))
return -EINVAL;
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
- if (!tz->ops.set_emul_temp)
- tz->emul_temperature = temperature;
- else
+ if (tz->ops.set_emul_temp) {
+ int ret;
+
ret = tz->ops.set_emul_temp(tz, temperature);
+ if (ret)
+ return ret;
+ } else {
+ tz->emul_temperature = temperature;
+ }
- if (!ret)
- __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
+ __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
- mutex_unlock(&tz->lock);
-
- return ret ? ret : count;
+ return count;
}
static DEVICE_ATTR_WO(emul_temp);
#endif
@@ -553,14 +544,15 @@ cur_state_store(struct device *dev, struct device_attribute *attr,
if (state > cdev->max_state)
return -EINVAL;
- mutex_lock(&cdev->lock);
+ guard(cooling_dev)(cdev);
result = cdev->ops->set_cur_state(cdev, state);
- if (!result)
- thermal_cooling_device_stats_update(cdev, state);
+ if (result)
+ return result;
- mutex_unlock(&cdev->lock);
- return result ? result : count;
+ thermal_cooling_device_stats_update(cdev, state);
+
+ return count;
}
static struct device_attribute
@@ -634,21 +626,18 @@ static ssize_t total_trans_show(struct device *dev,
{
struct thermal_cooling_device *cdev = to_cooling_device(dev);
struct cooling_dev_stats *stats;
- int ret = 0;
+ int ret;
- mutex_lock(&cdev->lock);
+ guard(cooling_dev)(cdev);
stats = cdev->stats;
if (!stats)
- goto unlock;
+ return 0;
spin_lock(&stats->lock);
ret = sprintf(buf, "%u\n", stats->total_trans);
spin_unlock(&stats->lock);
-unlock:
- mutex_unlock(&cdev->lock);
-
return ret;
}
@@ -661,11 +650,11 @@ time_in_state_ms_show(struct device *dev, struct device_attribute *attr,
ssize_t len = 0;
int i;
- mutex_lock(&cdev->lock);
+ guard(cooling_dev)(cdev);
stats = cdev->stats;
if (!stats)
- goto unlock;
+ return 0;
spin_lock(&stats->lock);
@@ -677,9 +666,6 @@ time_in_state_ms_show(struct device *dev, struct device_attribute *attr,
}
spin_unlock(&stats->lock);
-unlock:
- mutex_unlock(&cdev->lock);
-
return len;
}
@@ -691,11 +677,11 @@ reset_store(struct device *dev, struct device_attribute *attr, const char *buf,
struct cooling_dev_stats *stats;
int i, states;
- mutex_lock(&cdev->lock);
+ guard(cooling_dev)(cdev);
stats = cdev->stats;
if (!stats)
- goto unlock;
+ return count;
states = cdev->max_state + 1;
@@ -711,9 +697,6 @@ reset_store(struct device *dev, struct device_attribute *attr, const char *buf,
spin_unlock(&stats->lock);
-unlock:
- mutex_unlock(&cdev->lock);
-
return count;
}
@@ -725,13 +708,11 @@ static ssize_t trans_table_show(struct device *dev,
ssize_t len = 0;
int i, j;
- mutex_lock(&cdev->lock);
+ guard(cooling_dev)(cdev);
stats = cdev->stats;
- if (!stats) {
- len = -ENODATA;
- goto unlock;
- }
+ if (!stats)
+ return -ENODATA;
len += snprintf(buf + len, PAGE_SIZE - len, " From : To\n");
len += snprintf(buf + len, PAGE_SIZE - len, " : ");
@@ -740,10 +721,8 @@ static ssize_t trans_table_show(struct device *dev,
break;
len += snprintf(buf + len, PAGE_SIZE - len, "state%2u ", i);
}
- if (len >= PAGE_SIZE) {
- len = PAGE_SIZE;
- goto unlock;
- }
+ if (len >= PAGE_SIZE)
+ return PAGE_SIZE;
len += snprintf(buf + len, PAGE_SIZE - len, "\n");
@@ -769,9 +748,6 @@ static ssize_t trans_table_show(struct device *dev,
len = -EFBIG;
}
-unlock:
- mutex_unlock(&cdev->lock);
-
return len;
}
@@ -894,13 +870,11 @@ ssize_t weight_store(struct device *dev, struct device_attribute *attr,
instance = container_of(attr, struct thermal_instance, weight_attr);
/* Don't race with governors using the 'weight' value */
- mutex_lock(&tz->lock);
+ guard(thermal_zone)(tz);
instance->weight = weight;
thermal_governor_update_tz(tz, THERMAL_INSTANCE_WEIGHT_CHANGED);
- mutex_unlock(&tz->lock);
-
return count;
}
diff --git a/drivers/thermal/thermal_thresholds.c b/drivers/thermal/thermal_thresholds.c
new file mode 100644
index 0000000..d9b2a0b
--- /dev/null
+++ b/drivers/thermal/thermal_thresholds.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2024 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * Thermal thresholds
+ */
+#include <linux/list.h>
+#include <linux/list_sort.h>
+#include <linux/slab.h>
+
+#include "thermal_core.h"
+#include "thermal_thresholds.h"
+
+int thermal_thresholds_init(struct thermal_zone_device *tz)
+{
+ INIT_LIST_HEAD(&tz->user_thresholds);
+
+ return 0;
+}
+
+static void __thermal_thresholds_flush(struct thermal_zone_device *tz)
+{
+ struct list_head *thresholds = &tz->user_thresholds;
+ struct user_threshold *entry, *tmp;
+
+ list_for_each_entry_safe(entry, tmp, thresholds, list_node) {
+ list_del(&entry->list_node);
+ kfree(entry);
+ }
+}
+
+void thermal_thresholds_flush(struct thermal_zone_device *tz)
+{
+ lockdep_assert_held(&tz->lock);
+
+ __thermal_thresholds_flush(tz);
+
+ thermal_notify_threshold_flush(tz);
+
+ __thermal_zone_device_update(tz, THERMAL_TZ_FLUSH_THRESHOLDS);
+}
+
+void thermal_thresholds_exit(struct thermal_zone_device *tz)
+{
+ __thermal_thresholds_flush(tz);
+}
+
+static int __thermal_thresholds_cmp(void *data,
+ const struct list_head *l1,
+ const struct list_head *l2)
+{
+ struct user_threshold *t1 = container_of(l1, struct user_threshold, list_node);
+ struct user_threshold *t2 = container_of(l2, struct user_threshold, list_node);
+
+ return t1->temperature - t2->temperature;
+}
+
+static struct user_threshold *__thermal_thresholds_find(const struct list_head *thresholds,
+ int temperature)
+{
+ struct user_threshold *t;
+
+ list_for_each_entry(t, thresholds, list_node)
+ if (t->temperature == temperature)
+ return t;
+
+ return NULL;
+}
+
+static bool __thermal_threshold_is_crossed(struct user_threshold *threshold, int temperature,
+ int last_temperature, int direction,
+ int *low, int *high)
+{
+
+ if (temperature >= threshold->temperature) {
+ if (threshold->temperature > *low &&
+ THERMAL_THRESHOLD_WAY_DOWN & threshold->direction)
+ *low = threshold->temperature;
+
+ if (last_temperature < threshold->temperature &&
+ threshold->direction & direction)
+ return true;
+ } else {
+ if (threshold->temperature < *high && THERMAL_THRESHOLD_WAY_UP
+ & threshold->direction)
+ *high = threshold->temperature;
+
+ if (last_temperature >= threshold->temperature &&
+ threshold->direction & direction)
+ return true;
+ }
+
+ return false;
+}
+
+static bool thermal_thresholds_handle_raising(struct list_head *thresholds, int temperature,
+ int last_temperature, int *low, int *high)
+{
+ struct user_threshold *t;
+
+ list_for_each_entry(t, thresholds, list_node) {
+ if (__thermal_threshold_is_crossed(t, temperature, last_temperature,
+ THERMAL_THRESHOLD_WAY_UP, low, high))
+ return true;
+ }
+
+ return false;
+}
+
+static bool thermal_thresholds_handle_dropping(struct list_head *thresholds, int temperature,
+ int last_temperature, int *low, int *high)
+{
+ struct user_threshold *t;
+
+ list_for_each_entry_reverse(t, thresholds, list_node) {
+ if (__thermal_threshold_is_crossed(t, temperature, last_temperature,
+ THERMAL_THRESHOLD_WAY_DOWN, low, high))
+ return true;
+ }
+
+ return false;
+}
+
+void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *high)
+{
+ struct list_head *thresholds = &tz->user_thresholds;
+
+ int temperature = tz->temperature;
+ int last_temperature = tz->last_temperature;
+
+ lockdep_assert_held(&tz->lock);
+
+ /*
+ * We need a second update in order to detect a threshold being crossed
+ */
+ if (last_temperature == THERMAL_TEMP_INVALID)
+ return;
+
+ /*
+ * The temperature is stable, so obviously we can not have
+ * crossed a threshold.
+ */
+ if (last_temperature == temperature)
+ return;
+
+ /*
+ * Since last update the temperature:
+ * - increased : thresholds are crossed the way up
+ * - decreased : thresholds are crossed the way down
+ */
+ if (temperature > last_temperature) {
+ if (thermal_thresholds_handle_raising(thresholds, temperature,
+ last_temperature, low, high))
+ thermal_notify_threshold_up(tz);
+ } else {
+ if (thermal_thresholds_handle_dropping(thresholds, temperature,
+ last_temperature, low, high))
+ thermal_notify_threshold_down(tz);
+ }
+}
+
+int thermal_thresholds_add(struct thermal_zone_device *tz,
+ int temperature, int direction)
+{
+ struct list_head *thresholds = &tz->user_thresholds;
+ struct user_threshold *t;
+
+ lockdep_assert_held(&tz->lock);
+
+ t = __thermal_thresholds_find(thresholds, temperature);
+ if (t) {
+ if (t->direction == direction)
+ return -EEXIST;
+
+ t->direction |= direction;
+ } else {
+
+ t = kmalloc(sizeof(*t), GFP_KERNEL);
+ if (!t)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&t->list_node);
+ t->temperature = temperature;
+ t->direction = direction;
+ list_add(&t->list_node, thresholds);
+ list_sort(NULL, thresholds, __thermal_thresholds_cmp);
+ }
+
+ thermal_notify_threshold_add(tz, temperature, direction);
+
+ __thermal_zone_device_update(tz, THERMAL_TZ_ADD_THRESHOLD);
+
+ return 0;
+}
+
+int thermal_thresholds_delete(struct thermal_zone_device *tz,
+ int temperature, int direction)
+{
+ struct list_head *thresholds = &tz->user_thresholds;
+ struct user_threshold *t;
+
+ lockdep_assert_held(&tz->lock);
+
+ t = __thermal_thresholds_find(thresholds, temperature);
+ if (!t)
+ return -ENOENT;
+
+ if (t->direction == direction) {
+ list_del(&t->list_node);
+ kfree(t);
+ } else {
+ t->direction &= ~direction;
+ }
+
+ thermal_notify_threshold_delete(tz, temperature, direction);
+
+ __thermal_zone_device_update(tz, THERMAL_TZ_DEL_THRESHOLD);
+
+ return 0;
+}
+
+int thermal_thresholds_for_each(struct thermal_zone_device *tz,
+ int (*cb)(struct user_threshold *, void *arg), void *arg)
+{
+ struct list_head *thresholds = &tz->user_thresholds;
+ struct user_threshold *entry;
+ int ret;
+
+ guard(thermal_zone)(tz);
+
+ list_for_each_entry(entry, thresholds, list_node) {
+ ret = cb(entry, arg);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/drivers/thermal/thermal_thresholds.h b/drivers/thermal/thermal_thresholds.h
new file mode 100644
index 0000000..cb37265
--- /dev/null
+++ b/drivers/thermal/thermal_thresholds.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __THERMAL_THRESHOLDS_H__
+#define __THERMAL_THRESHOLDS_H__
+
+struct user_threshold {
+ struct list_head list_node;
+ int temperature;
+ int direction;
+};
+
+int thermal_thresholds_init(struct thermal_zone_device *tz);
+void thermal_thresholds_exit(struct thermal_zone_device *tz);
+void thermal_thresholds_handle(struct thermal_zone_device *tz, int *low, int *high);
+void thermal_thresholds_flush(struct thermal_zone_device *tz);
+int thermal_thresholds_add(struct thermal_zone_device *tz, int temperature, int direction);
+int thermal_thresholds_delete(struct thermal_zone_device *tz, int temperature, int direction);
+int thermal_thresholds_for_each(struct thermal_zone_device *tz,
+ int (*cb)(struct user_threshold *, void *arg), void *arg);
+#endif
diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c
index b53fac3..4b82384 100644
--- a/drivers/thermal/thermal_trip.c
+++ b/drivers/thermal/thermal_trip.c
@@ -45,13 +45,9 @@ int thermal_zone_for_each_trip(struct thermal_zone_device *tz,
int (*cb)(struct thermal_trip *, void *),
void *data)
{
- int ret;
+ guard(thermal_zone)(tz);
- mutex_lock(&tz->lock);
- ret = for_each_thermal_trip(tz, cb, data);
- mutex_unlock(&tz->lock);
-
- return ret;
+ return for_each_thermal_trip(tz, cb, data);
}
EXPORT_SYMBOL_GPL(thermal_zone_for_each_trip);
@@ -92,43 +88,3 @@ int thermal_zone_trip_id(const struct thermal_zone_device *tz,
*/
return trip_to_trip_desc(trip) - tz->trips;
}
-
-void thermal_zone_set_trip_hyst(struct thermal_zone_device *tz,
- struct thermal_trip *trip, int hyst)
-{
- WRITE_ONCE(trip->hysteresis, hyst);
- thermal_notify_tz_trip_change(tz, trip);
-}
-
-void thermal_zone_set_trip_temp(struct thermal_zone_device *tz,
- struct thermal_trip *trip, int temp)
-{
- if (trip->temperature == temp)
- return;
-
- WRITE_ONCE(trip->temperature, temp);
- thermal_notify_tz_trip_change(tz, trip);
-
- if (temp == THERMAL_TEMP_INVALID) {
- struct thermal_trip_desc *td = trip_to_trip_desc(trip);
-
- if (tz->temperature >= td->threshold) {
- /*
- * The trip has been crossed on the way up, so some
- * adjustments are needed to compensate for the lack
- * of it going forward.
- */
- if (trip->type == THERMAL_TRIP_PASSIVE) {
- tz->passive--;
- WARN_ON_ONCE(tz->passive < 0);
- }
- thermal_zone_trip_down(tz, trip);
- }
- /*
- * Invalidate the threshold to avoid triggering a spurious
- * trip crossing notification when the trip becomes valid.
- */
- td->threshold = INT_MAX;
- }
-}
-EXPORT_SYMBOL_GPL(thermal_zone_set_trip_temp);
diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
index 472daa5..d5507b6 100644
--- a/drivers/vdpa/ifcvf/ifcvf_base.c
+++ b/drivers/vdpa/ifcvf/ifcvf_base.c
@@ -108,7 +108,7 @@ int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *pdev)
u32 i;
ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos);
- if (ret < 0) {
+ if (ret) {
IFCVF_ERR(pdev, "Failed to read PCI capability list\n");
return -EIO;
}
diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c
index 2dd21e0..7d0c83b 100644
--- a/drivers/vdpa/mlx5/core/mr.c
+++ b/drivers/vdpa/mlx5/core/mr.c
@@ -373,7 +373,7 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr
struct page *pg;
unsigned int nsg;
int sglen;
- u64 pa;
+ u64 pa, offset;
u64 paend;
struct scatterlist *sg;
struct device *dma = mvdev->vdev.dma_dev;
@@ -396,8 +396,10 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr
sg = mr->sg_head.sgl;
for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1);
map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) {
- paend = map->addr + maplen(map, mr);
- for (pa = map->addr; pa < paend; pa += sglen) {
+ offset = mr->start > map->start ? mr->start - map->start : 0;
+ pa = map->addr + offset;
+ paend = map->addr + offset + maplen(map, mr);
+ for (; pa < paend; pa += sglen) {
pg = pfn_to_page(__phys_to_pfn(pa));
if (!sg) {
mlx5_vdpa_warn(mvdev, "sg null. start 0x%llx, end 0x%llx\n",
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index dee0199..5f581e7 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -3963,28 +3963,28 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
mvdev->vdev.dma_dev = &mdev->pdev->dev;
err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
if (err)
- goto err_mpfs;
+ goto err_alloc;
err = mlx5_vdpa_init_mr_resources(mvdev);
if (err)
- goto err_res;
+ goto err_alloc;
if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
err = mlx5_vdpa_create_dma_mr(mvdev);
if (err)
- goto err_mr_res;
+ goto err_alloc;
}
err = alloc_fixed_resources(ndev);
if (err)
- goto err_mr;
+ goto err_alloc;
ndev->cvq_ent.mvdev = mvdev;
INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
if (!mvdev->wq) {
err = -ENOMEM;
- goto err_res2;
+ goto err_alloc;
}
mvdev->vdev.mdev = &mgtdev->mgtdev;
@@ -4010,17 +4010,6 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
_vdpa_unregister_device(&mvdev->vdev);
err_reg:
destroy_workqueue(mvdev->wq);
-err_res2:
- free_fixed_resources(ndev);
-err_mr:
- mlx5_vdpa_clean_mrs(mvdev);
-err_mr_res:
- mlx5_vdpa_destroy_mr_resources(mvdev);
-err_res:
- mlx5_vdpa_free_resources(&ndev->mvdev);
-err_mpfs:
- if (!is_zero_ether_addr(config->mac))
- mlx5_mpfs_del_mac(pfmdev, config->mac);
err_alloc:
put_device(&mvdev->vdev.dev);
return err;
diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c
index 99428a0..c8b7498 100644
--- a/drivers/vdpa/solidrun/snet_main.c
+++ b/drivers/vdpa/solidrun/snet_main.c
@@ -555,7 +555,7 @@ static const struct vdpa_config_ops snet_config_ops = {
static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
{
- char name[50];
+ char *name;
int ret, i, mask = 0;
/* We don't know which BAR will be used to communicate..
* We will map every bar with len > 0.
@@ -573,7 +573,10 @@ static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
return -ENODEV;
}
- snprintf(name, sizeof(name), "psnet[%s]-bars", pci_name(pdev));
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "psnet[%s]-bars", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
ret = pcim_iomap_regions(pdev, mask, name);
if (ret) {
SNET_ERR(pdev, "Failed to request and map PCI BARs\n");
@@ -590,10 +593,13 @@ static int psnet_open_pf_bar(struct pci_dev *pdev, struct psnet *psnet)
static int snet_open_vf_bar(struct pci_dev *pdev, struct snet *snet)
{
- char name[50];
+ char *name;
int ret;
- snprintf(name, sizeof(name), "snet[%s]-bar", pci_name(pdev));
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "snet[%s]-bars", pci_name(pdev));
+ if (!name)
+ return -ENOMEM;
+
/* Request and map BAR */
ret = pcim_iomap_regions(pdev, BIT(snet->psnet->cfg.vf_bar), name);
if (ret) {
diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c
index ac4ab22..1638076 100644
--- a/drivers/vdpa/virtio_pci/vp_vdpa.c
+++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
@@ -612,7 +612,11 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto mdev_err;
}
- mdev_id = kzalloc(sizeof(struct virtio_device_id), GFP_KERNEL);
+ /*
+ * id_table should be a null terminated array, so allocate one additional
+ * entry here, see vdpa_mgmtdev_get_classes().
+ */
+ mdev_id = kcalloc(2, sizeof(struct virtio_device_id), GFP_KERNEL);
if (!mdev_id) {
err = -ENOMEM;
goto mdev_id_err;
@@ -632,8 +636,8 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto probe_err;
}
- mdev_id->device = mdev->id.device;
- mdev_id->vendor = mdev->id.vendor;
+ mdev_id[0].device = mdev->id.device;
+ mdev_id[0].vendor = mdev->id.vendor;
mgtdev->id_table = mdev_id;
mgtdev->max_supported_vqs = vp_modern_get_num_queues(mdev);
mgtdev->supported_features = vp_modern_get_features(mdev);
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index 95b336d..4955960 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -104,15 +104,14 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
{
struct vfio_container *container;
struct iommufd_ctx *iommufd;
- struct fd f;
int ret;
int fd;
if (get_user(fd, arg))
return -EFAULT;
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
mutex_lock(&group->group_lock);
@@ -153,7 +152,6 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
out_unlock:
mutex_unlock(&group->group_lock);
- fdput(f);
return ret;
}
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index d228812..aa2891f 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -113,7 +113,6 @@ int vfio_virqfd_enable(void *opaque,
void (*thread)(void *, void *),
void *data, struct virqfd **pvirqfd, int fd)
{
- struct fd irqfd;
struct eventfd_ctx *ctx;
struct virqfd *virqfd;
int ret = 0;
@@ -133,8 +132,8 @@ int vfio_virqfd_enable(void *opaque,
INIT_WORK(&virqfd->inject, virqfd_inject);
INIT_WORK(&virqfd->flush_inject, virqfd_flush_inject);
- irqfd = fdget(fd);
- if (!fd_file(irqfd)) {
+ CLASS(fd, irqfd)(fd);
+ if (fd_empty(irqfd)) {
ret = -EBADF;
goto err_fd;
}
@@ -142,7 +141,7 @@ int vfio_virqfd_enable(void *opaque,
ctx = eventfd_ctx_fileget(fd_file(irqfd));
if (IS_ERR(ctx)) {
ret = PTR_ERR(ctx);
- goto err_ctx;
+ goto err_fd;
}
virqfd->eventfd = ctx;
@@ -181,18 +180,9 @@ int vfio_virqfd_enable(void *opaque,
if ((!handler || handler(opaque, data)) && thread)
schedule_work(&virqfd->inject);
}
-
- /*
- * Do not drop the file until the irqfd is fully initialized,
- * otherwise we might race against the EPOLLHUP.
- */
- fdput(irqfd);
-
return 0;
err_busy:
eventfd_ctx_put(ctx);
-err_ctx:
- fdput(irqfd);
err_fd:
kfree(virqfd);
diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c
index 9994d81..b7da24c 100644
--- a/drivers/virt/acrn/irqfd.c
+++ b/drivers/virt/acrn/irqfd.c
@@ -112,7 +112,6 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
struct eventfd_ctx *eventfd = NULL;
struct hsm_irqfd *irqfd, *tmp;
__poll_t events;
- struct fd f;
int ret = 0;
irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
@@ -124,8 +123,8 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
INIT_LIST_HEAD(&irqfd->list);
INIT_WORK(&irqfd->shutdown, hsm_irqfd_shutdown_work);
- f = fdget(args->fd);
- if (!fd_file(f)) {
+ CLASS(fd, f)(args->fd);
+ if (fd_empty(f)) {
ret = -EBADF;
goto out;
}
@@ -133,7 +132,7 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
eventfd = eventfd_ctx_fileget(fd_file(f));
if (IS_ERR(eventfd)) {
ret = PTR_ERR(eventfd);
- goto fail;
+ goto out;
}
irqfd->eventfd = eventfd;
@@ -162,13 +161,9 @@ static int acrn_irqfd_assign(struct acrn_vm *vm, struct acrn_irqfd *args)
if (events & EPOLLIN)
acrn_irqfd_inject(irqfd);
- fdput(f);
return 0;
fail:
- if (eventfd && !IS_ERR(eventfd))
- eventfd_ctx_put(eventfd);
-
- fdput(f);
+ eventfd_ctx_put(eventfd);
out:
kfree(irqfd);
return ret;
diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig
index d9ff676..ff869d8 100644
--- a/drivers/virt/coco/Kconfig
+++ b/drivers/virt/coco/Kconfig
@@ -14,3 +14,5 @@
source "drivers/virt/coco/sev-guest/Kconfig"
source "drivers/virt/coco/tdx-guest/Kconfig"
+
+source "drivers/virt/coco/arm-cca-guest/Kconfig"
diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile
index b69c30c..c3d07cf 100644
--- a/drivers/virt/coco/Makefile
+++ b/drivers/virt/coco/Makefile
@@ -7,3 +7,4 @@
obj-$(CONFIG_ARM_PKVM_GUEST) += pkvm-guest/
obj-$(CONFIG_SEV_GUEST) += sev-guest/
obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/
+obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest/
diff --git a/drivers/virt/coco/arm-cca-guest/Kconfig b/drivers/virt/coco/arm-cca-guest/Kconfig
new file mode 100644
index 0000000..9dd27c3
--- /dev/null
+++ b/drivers/virt/coco/arm-cca-guest/Kconfig
@@ -0,0 +1,11 @@
+config ARM_CCA_GUEST
+ tristate "Arm CCA Guest driver"
+ depends on ARM64
+ default m
+ select TSM_REPORTS
+ help
+ The driver provides userspace interface to request and
+ attestation report from the Realm Management Monitor(RMM).
+
+ If you choose 'M' here, this module will be called
+ arm-cca-guest.
diff --git a/drivers/virt/coco/arm-cca-guest/Makefile b/drivers/virt/coco/arm-cca-guest/Makefile
new file mode 100644
index 0000000..69eeba0
--- /dev/null
+++ b/drivers/virt/coco/arm-cca-guest/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest.o
diff --git a/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c b/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c
new file mode 100644
index 0000000..4881538
--- /dev/null
+++ b/drivers/virt/coco/arm-cca-guest/arm-cca-guest.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Ltd.
+ */
+
+#include <linux/arm-smccc.h>
+#include <linux/cc_platform.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/tsm.h>
+#include <linux/types.h>
+
+#include <asm/rsi.h>
+
+/**
+ * struct arm_cca_token_info - a descriptor for the token buffer.
+ * @challenge: Pointer to the challenge data
+ * @challenge_size: Size of the challenge data
+ * @granule: PA of the granule to which the token will be written
+ * @offset: Offset within granule to start of buffer in bytes
+ * @result: result of rsi_attestation_token_continue operation
+ */
+struct arm_cca_token_info {
+ void *challenge;
+ unsigned long challenge_size;
+ phys_addr_t granule;
+ unsigned long offset;
+ unsigned long result;
+};
+
+static void arm_cca_attestation_init(void *param)
+{
+ struct arm_cca_token_info *info;
+
+ info = (struct arm_cca_token_info *)param;
+
+ info->result = rsi_attestation_token_init(info->challenge,
+ info->challenge_size);
+}
+
+/**
+ * arm_cca_attestation_continue - Retrieve the attestation token data.
+ *
+ * @param: pointer to the arm_cca_token_info
+ *
+ * Attestation token generation is a long running operation and therefore
+ * the token data may not be retrieved in a single call. Moreover, the
+ * token retrieval operation must be requested on the same CPU on which the
+ * attestation token generation was initialised.
+ * This helper function is therefore scheduled on the same CPU multiple
+ * times until the entire token data is retrieved.
+ */
+static void arm_cca_attestation_continue(void *param)
+{
+ unsigned long len;
+ unsigned long size;
+ struct arm_cca_token_info *info;
+
+ info = (struct arm_cca_token_info *)param;
+
+ size = RSI_GRANULE_SIZE - info->offset;
+ info->result = rsi_attestation_token_continue(info->granule,
+ info->offset, size, &len);
+ info->offset += len;
+}
+
+/**
+ * arm_cca_report_new - Generate a new attestation token.
+ *
+ * @report: pointer to the TSM report context information.
+ * @data: pointer to the context specific data for this module.
+ *
+ * Initialise the attestation token generation using the challenge data
+ * passed in the TSM descriptor. Allocate memory for the attestation token
+ * and schedule calls to retrieve the attestation token on the same CPU
+ * on which the attestation token generation was initialised.
+ *
+ * The challenge data must be at least 32 bytes and no more than 64 bytes. If
+ * less than 64 bytes are provided it will be zero padded to 64 bytes.
+ *
+ * Return:
+ * * %0 - Attestation token generated successfully.
+ * * %-EINVAL - A parameter was not valid.
+ * * %-ENOMEM - Out of memory.
+ * * %-EFAULT - Failed to get IPA for memory page(s).
+ * * A negative status code as returned by smp_call_function_single().
+ */
+static int arm_cca_report_new(struct tsm_report *report, void *data)
+{
+ int ret;
+ int cpu;
+ long max_size;
+ unsigned long token_size = 0;
+ struct arm_cca_token_info info;
+ void *buf;
+ u8 *token __free(kvfree) = NULL;
+ struct tsm_desc *desc = &report->desc;
+
+ if (desc->inblob_len < 32 || desc->inblob_len > 64)
+ return -EINVAL;
+
+ /*
+ * The attestation token 'init' and 'continue' calls must be
+ * performed on the same CPU. smp_call_function_single() is used
+ * instead of simply calling get_cpu() because of the need to
+ * allocate outblob based on the returned value from the 'init'
+ * call and that cannot be done in an atomic context.
+ */
+ cpu = smp_processor_id();
+
+ info.challenge = desc->inblob;
+ info.challenge_size = desc->inblob_len;
+
+ ret = smp_call_function_single(cpu, arm_cca_attestation_init,
+ &info, true);
+ if (ret)
+ return ret;
+ max_size = info.result;
+
+ if (max_size <= 0)
+ return -EINVAL;
+
+ /* Allocate outblob */
+ token = kvzalloc(max_size, GFP_KERNEL);
+ if (!token)
+ return -ENOMEM;
+
+ /*
+ * Since the outblob may not be physically contiguous, use a page
+ * to bounce the buffer from RMM.
+ */
+ buf = alloc_pages_exact(RSI_GRANULE_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Get the PA of the memory page(s) that were allocated */
+ info.granule = (unsigned long)virt_to_phys(buf);
+
+ /* Loop until the token is ready or there is an error */
+ do {
+ /* Retrieve one RSI_GRANULE_SIZE data per loop iteration */
+ info.offset = 0;
+ do {
+ /*
+ * Schedule a call to retrieve a sub-granule chunk
+ * of data per loop iteration.
+ */
+ ret = smp_call_function_single(cpu,
+ arm_cca_attestation_continue,
+ (void *)&info, true);
+ if (ret != 0) {
+ token_size = 0;
+ goto exit_free_granule_page;
+ }
+ } while (info.result == RSI_INCOMPLETE &&
+ info.offset < RSI_GRANULE_SIZE);
+
+ if (info.result != RSI_SUCCESS) {
+ ret = -ENXIO;
+ token_size = 0;
+ goto exit_free_granule_page;
+ }
+
+ /*
+ * Copy the retrieved token data from the granule
+ * to the token buffer, ensuring that the RMM doesn't
+ * overflow the buffer.
+ */
+ if (WARN_ON(token_size + info.offset > max_size))
+ break;
+ memcpy(&token[token_size], buf, info.offset);
+ token_size += info.offset;
+ } while (info.result == RSI_INCOMPLETE);
+
+ report->outblob = no_free_ptr(token);
+exit_free_granule_page:
+ report->outblob_len = token_size;
+ free_pages_exact(buf, RSI_GRANULE_SIZE);
+ return ret;
+}
+
+static const struct tsm_ops arm_cca_tsm_ops = {
+ .name = KBUILD_MODNAME,
+ .report_new = arm_cca_report_new,
+};
+
+/**
+ * arm_cca_guest_init - Register with the Trusted Security Module (TSM)
+ * interface.
+ *
+ * Return:
+ * * %0 - Registered successfully with the TSM interface.
+ * * %-ENODEV - The execution context is not an Arm Realm.
+ * * %-EBUSY - Already registered.
+ */
+static int __init arm_cca_guest_init(void)
+{
+ int ret;
+
+ if (!is_realm_world())
+ return -ENODEV;
+
+ ret = tsm_register(&arm_cca_tsm_ops, NULL);
+ if (ret < 0)
+ pr_err("Error %d registering with TSM\n", ret);
+
+ return ret;
+}
+module_init(arm_cca_guest_init);
+
+/**
+ * arm_cca_guest_exit - unregister with the Trusted Security Module (TSM)
+ * interface.
+ */
+static void __exit arm_cca_guest_exit(void)
+{
+ tsm_unregister(&arm_cca_tsm_ops);
+}
+module_exit(arm_cca_guest_exit);
+
+MODULE_AUTHOR("Sami Mujawar <sami.mujawar@arm.com>");
+MODULE_DESCRIPTION("Arm CCA Guest TSM Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 42a48ac..2eb7473 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -122,7 +122,7 @@
config VIRTIO_MEM
tristate "Virtio mem driver"
- depends on X86_64 || ARM64 || RISCV
+ depends on X86_64 || ARM64 || RISCV || S390
depends on VIRTIO
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
@@ -132,11 +132,11 @@
This driver provides access to virtio-mem paravirtualized memory
devices, allowing to hotplug and hotunplug memory.
- This driver currently only supports x86-64 and arm64. Although it
- should compile on other architectures that implement memory
- hot(un)plug, architecture-specific and/or common
- code changes may be required for virtio-mem, kdump and kexec to work as
- expected.
+ This driver currently supports x86-64, arm64, riscv and s390.
+ Although it should compile on other architectures that implement
+ memory hot(un)plug, architecture-specific and/or common
+ code changes may be required for virtio-mem, kdump and kexec to
+ work as expected.
If unsure, say M.
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c
index c44d8ba..8807445 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -24,6 +24,16 @@ MODULE_PARM_DESC(force_legacy,
"Force legacy mode for transitional virtio 1 devices");
#endif
+bool vp_is_avq(struct virtio_device *vdev, unsigned int index)
+{
+ struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+ if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
+ return false;
+
+ return index == vp_dev->admin_vq.vq_index;
+}
+
/* wait for pending irq handlers */
void vp_synchronize_vectors(struct virtio_device *vdev)
{
@@ -234,10 +244,9 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int in
return vq;
}
-static void vp_del_vq(struct virtqueue *vq)
+static void vp_del_vq(struct virtqueue *vq, struct virtio_pci_vq_info *info)
{
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
- struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index];
unsigned long flags;
/*
@@ -258,13 +267,16 @@ static void vp_del_vq(struct virtqueue *vq)
void vp_del_vqs(struct virtio_device *vdev)
{
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+ struct virtio_pci_vq_info *info;
struct virtqueue *vq, *n;
int i;
list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
- if (vp_dev->per_vq_vectors) {
- int v = vp_dev->vqs[vq->index]->msix_vector;
+ info = vp_is_avq(vdev, vq->index) ? vp_dev->admin_vq.info :
+ vp_dev->vqs[vq->index];
+ if (vp_dev->per_vq_vectors) {
+ int v = info->msix_vector;
if (v != VIRTIO_MSI_NO_VECTOR &&
!vp_is_slow_path_vector(v)) {
int irq = pci_irq_vector(vp_dev->pci_dev, v);
@@ -273,7 +285,7 @@ void vp_del_vqs(struct virtio_device *vdev)
free_irq(irq, vq);
}
}
- vp_del_vq(vq);
+ vp_del_vq(vq, info);
}
vp_dev->per_vq_vectors = false;
@@ -354,7 +366,7 @@ vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx,
vring_interrupt, 0,
vp_dev->msix_names[msix_vec], vq);
if (err) {
- vp_del_vq(vq);
+ vp_del_vq(vq, *p_info);
return ERR_PTR(err);
}
diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h
index 1d9c499..8beecf2 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -178,6 +178,7 @@ struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev);
#define VIRTIO_ADMIN_CMD_BITMAP 0
#endif
+bool vp_is_avq(struct virtio_device *vdev, unsigned int index);
void vp_modern_avq_done(struct virtqueue *vq);
int vp_modern_admin_cmd_exec(struct virtio_device *vdev,
struct virtio_admin_cmd *cmd);
diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c
index 9193c30..4fbcbc7 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -43,16 +43,6 @@ static int vp_avq_index(struct virtio_device *vdev, u16 *index, u16 *num)
return 0;
}
-static bool vp_is_avq(struct virtio_device *vdev, unsigned int index)
-{
- struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-
- if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
- return false;
-
- return index == vp_dev->admin_vq.vq_index;
-}
-
void vp_modern_avq_done(struct virtqueue *vq)
{
struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
@@ -245,7 +235,7 @@ static void vp_modern_avq_cleanup(struct virtio_device *vdev)
if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ))
return;
- vq = vp_dev->vqs[vp_dev->admin_vq.vq_index]->vq;
+ vq = vp_dev->admin_vq.info->vq;
if (!vq)
return;
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 4f75bc8..13a10f3 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -965,10 +965,11 @@ static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
struct privcmd_kernel_irqfd *kirqfd, *tmp;
unsigned long flags;
__poll_t events;
- struct fd f;
void *dm_op;
int ret, idx;
+ CLASS(fd, f)(irqfd->fd);
+
kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
if (!kirqfd)
return -ENOMEM;
@@ -984,8 +985,7 @@ static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
kirqfd->dom = irqfd->dom;
INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
- f = fdget(irqfd->fd);
- if (!fd_file(f)) {
+ if (fd_empty(f)) {
ret = -EBADF;
goto error_kfree;
}
@@ -993,7 +993,7 @@ static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
kirqfd->eventfd = eventfd_ctx_fileget(fd_file(f));
if (IS_ERR(kirqfd->eventfd)) {
ret = PTR_ERR(kirqfd->eventfd);
- goto error_fd_put;
+ goto error_kfree;
}
/*
@@ -1026,20 +1026,11 @@ static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
irqfd_inject(kirqfd);
srcu_read_unlock(&irqfds_srcu, idx);
-
- /*
- * Do not drop the file until the kirqfd is fully initialized, otherwise
- * we might race against the EPOLLHUP.
- */
- fdput(f);
return 0;
error_eventfd:
eventfd_ctx_put(kirqfd->eventfd);
-error_fd_put:
- fdput(f);
-
error_kfree:
kfree(kirqfd);
return ret;
@@ -1350,7 +1341,6 @@ static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd)
struct privcmd_kernel_ioeventfd *kioeventfd;
struct privcmd_kernel_ioreq *kioreq;
unsigned long flags;
- struct fd f;
int ret;
/* Check for range overflow */
@@ -1370,15 +1360,7 @@ static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd)
if (!kioeventfd)
return -ENOMEM;
- f = fdget(ioeventfd->event_fd);
- if (!fd_file(f)) {
- ret = -EBADF;
- goto error_kfree;
- }
-
- kioeventfd->eventfd = eventfd_ctx_fileget(fd_file(f));
- fdput(f);
-
+ kioeventfd->eventfd = eventfd_ctx_fdget(ioeventfd->event_fd);
if (IS_ERR(kioeventfd->eventfd)) {
ret = PTR_ERR(kioeventfd->eventfd);
goto error_kfree;
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 9f097f1..6d32ffb 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -313,7 +313,7 @@ int xenbus_dev_probe(struct device *_dev)
if (err) {
dev_warn(&dev->dev, "watch_otherend on %s failed.\n",
dev->nodename);
- return err;
+ goto fail_remove;
}
dev->spurious_threshold = 1;
@@ -322,6 +322,12 @@ int xenbus_dev_probe(struct device *_dev)
dev->nodename);
return 0;
+fail_remove:
+ if (drv->remove) {
+ down(&dev->reclaim_sem);
+ drv->remove(dev);
+ up(&dev->reclaim_sem);
+ }
fail_put:
module_put(drv->driver.owner);
fail:
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index f0b999a..017c48a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -6,7 +6,8 @@
*/
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -115,87 +116,61 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err};
+enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix};
-static const match_table_t tokens = {
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_ownmask, "ownmask=%o"},
- {Opt_othmask, "othmask=%o"},
- {Opt_ftsuffix, "ftsuffix=%u"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec adfs_param_spec[] = {
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("ownmask", Opt_ownmask),
+ fsparam_u32oct ("othmask", Opt_othmask),
+ fsparam_u32 ("ftsuffix", Opt_ftsuffix),
+ {}
};
-static int parse_options(struct super_block *sb, struct adfs_sb_info *asb,
- char *options)
+static int adfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- int option;
+ struct adfs_sb_info *asb = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
- if (!options)
- return 0;
+ opt = fs_parse(fc, adfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
- while ((p = strsep(&options, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(args, &option))
- return -EINVAL;
- asb->s_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(asb->s_uid))
- return -EINVAL;
- break;
- case Opt_gid:
- if (match_int(args, &option))
- return -EINVAL;
- asb->s_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(asb->s_gid))
- return -EINVAL;
- break;
- case Opt_ownmask:
- if (match_octal(args, &option))
- return -EINVAL;
- asb->s_owner_mask = option;
- break;
- case Opt_othmask:
- if (match_octal(args, &option))
- return -EINVAL;
- asb->s_other_mask = option;
- break;
- case Opt_ftsuffix:
- if (match_int(args, &option))
- return -EINVAL;
- asb->s_ftsuffix = option;
- break;
- default:
- adfs_msg(sb, KERN_ERR,
- "unrecognised mount option \"%s\" or missing value",
- p);
- return -EINVAL;
- }
+ switch (opt) {
+ case Opt_uid:
+ asb->s_uid = result.uid;
+ break;
+ case Opt_gid:
+ asb->s_gid = result.gid;
+ break;
+ case Opt_ownmask:
+ asb->s_owner_mask = result.uint_32;
+ break;
+ case Opt_othmask:
+ asb->s_other_mask = result.uint_32;
+ break;
+ case Opt_ftsuffix:
+ asb->s_ftsuffix = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
}
return 0;
}
-static int adfs_remount(struct super_block *sb, int *flags, char *data)
+static int adfs_reconfigure(struct fs_context *fc)
{
- struct adfs_sb_info temp_asb;
- int ret;
+ struct adfs_sb_info *new_asb = fc->s_fs_info;
+ struct adfs_sb_info *asb = ADFS_SB(fc->root->d_sb);
- sync_filesystem(sb);
- *flags |= ADFS_SB_FLAGS;
+ sync_filesystem(fc->root->d_sb);
+ fc->sb_flags |= ADFS_SB_FLAGS;
- temp_asb = *ADFS_SB(sb);
- ret = parse_options(sb, &temp_asb, data);
- if (ret == 0)
- *ADFS_SB(sb) = temp_asb;
+ /* Structure copy newly parsed options */
+ *asb = *new_asb;
- return ret;
+ return 0;
}
static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -273,7 +248,6 @@ static const struct super_operations adfs_sops = {
.write_inode = adfs_write_inode,
.put_super = adfs_put_super,
.statfs = adfs_statfs,
- .remount_fs = adfs_remount,
.show_options = adfs_show_options,
};
@@ -361,34 +335,21 @@ static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh,
return 0;
}
-static int adfs_fill_super(struct super_block *sb, void *data, int silent)
+static int adfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct adfs_discrecord *dr;
struct object_info root_obj;
- struct adfs_sb_info *asb;
+ struct adfs_sb_info *asb = sb->s_fs_info;
struct inode *root;
int ret = -EINVAL;
+ int silent = fc->sb_flags & SB_SILENT;
sb->s_flags |= ADFS_SB_FLAGS;
- asb = kzalloc(sizeof(*asb), GFP_KERNEL);
- if (!asb)
- return -ENOMEM;
-
sb->s_fs_info = asb;
sb->s_magic = ADFS_SUPER_MAGIC;
sb->s_time_gran = 10000000;
- /* set default options */
- asb->s_uid = GLOBAL_ROOT_UID;
- asb->s_gid = GLOBAL_ROOT_GID;
- asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
- asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
- asb->s_ftsuffix = 0;
-
- if (parse_options(sb, asb, data))
- goto error;
-
/* Try to probe the filesystem boot block */
ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk);
if (ret == -EILSEQ)
@@ -453,18 +414,61 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
return ret;
}
-static struct dentry *adfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int adfs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
+ return get_tree_bdev(fc, adfs_fill_super);
+}
+
+static void adfs_free_fc(struct fs_context *fc)
+{
+ struct adfs_context *asb = fc->s_fs_info;
+
+ kfree(asb);
+}
+
+static const struct fs_context_operations adfs_context_ops = {
+ .parse_param = adfs_parse_param,
+ .get_tree = adfs_get_tree,
+ .reconfigure = adfs_reconfigure,
+ .free = adfs_free_fc,
+};
+
+static int adfs_init_fs_context(struct fs_context *fc)
+{
+ struct adfs_sb_info *asb;
+
+ asb = kzalloc(sizeof(struct adfs_sb_info), GFP_KERNEL);
+ if (!asb)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct adfs_sb_info *old_asb = ADFS_SB(sb);
+
+ /* structure copy existing options before parsing */
+ *asb = *old_asb;
+ } else {
+ /* set default options */
+ asb->s_uid = GLOBAL_ROOT_UID;
+ asb->s_gid = GLOBAL_ROOT_GID;
+ asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
+ asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
+ asb->s_ftsuffix = 0;
+ }
+
+ fc->ops = &adfs_context_ops;
+ fc->s_fs_info = asb;
+
+ return 0;
}
static struct file_system_type adfs_fs_type = {
.owner = THIS_MODULE,
.name = "adfs",
- .mount = adfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = adfs_init_fs_context,
+ .parameters = adfs_param_spec,
};
MODULE_ALIAS_FS("adfs");
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3c58213..2fa4033 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -14,7 +14,8 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/statfs.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <linux/cred.h>
@@ -27,7 +28,6 @@
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int affs_show_options(struct seq_file *m, struct dentry *root);
-static int affs_remount (struct super_block *sb, int *flags, char *data);
static void
affs_commit_super(struct super_block *sb, int wait)
@@ -155,140 +155,114 @@ static const struct super_operations affs_sops = {
.put_super = affs_put_super,
.sync_fs = affs_sync_fs,
.statfs = affs_statfs,
- .remount_fs = affs_remount,
.show_options = affs_show_options,
};
enum {
Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
- Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
+ Opt_verbose, Opt_volume, Opt_ignore,
};
-static const match_table_t tokens = {
- {Opt_bs, "bs=%u"},
- {Opt_mode, "mode=%o"},
- {Opt_mufs, "mufs"},
- {Opt_notruncate, "nofilenametruncate"},
- {Opt_prefix, "prefix=%s"},
- {Opt_protect, "protect"},
- {Opt_reserved, "reserved=%u"},
- {Opt_root, "root=%u"},
- {Opt_setgid, "setgid=%u"},
- {Opt_setuid, "setuid=%u"},
- {Opt_verbose, "verbose"},
- {Opt_volume, "volume=%s"},
- {Opt_ignore, "grpquota"},
- {Opt_ignore, "noquota"},
- {Opt_ignore, "quota"},
- {Opt_ignore, "usrquota"},
- {Opt_err, NULL},
+struct affs_context {
+ kuid_t uid; /* uid to override */
+ kgid_t gid; /* gid to override */
+ unsigned int mode; /* mode to override */
+ unsigned int reserved; /* Number of reserved blocks */
+ int root_block; /* FFS root block number */
+ int blocksize; /* Initial device blksize */
+ char *prefix; /* Prefix for volumes and assigns */
+ char volume[32]; /* Vol. prefix for absolute symlinks */
+ unsigned long mount_flags; /* Options */
};
-static int
-parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root,
- int *blocksize, char **prefix, char *volume, unsigned long *mount_opts)
+static const struct fs_parameter_spec affs_param_spec[] = {
+ fsparam_u32 ("bs", Opt_bs),
+ fsparam_u32oct ("mode", Opt_mode),
+ fsparam_flag ("mufs", Opt_mufs),
+ fsparam_flag ("nofilenametruncate", Opt_notruncate),
+ fsparam_string ("prefix", Opt_prefix),
+ fsparam_flag ("protect", Opt_protect),
+ fsparam_u32 ("reserved", Opt_reserved),
+ fsparam_u32 ("root", Opt_root),
+ fsparam_gid ("setgid", Opt_setgid),
+ fsparam_uid ("setuid", Opt_setuid),
+ fsparam_flag ("verbose", Opt_verbose),
+ fsparam_string ("volume", Opt_volume),
+ fsparam_flag ("grpquota", Opt_ignore),
+ fsparam_flag ("noquota", Opt_ignore),
+ fsparam_flag ("quota", Opt_ignore),
+ fsparam_flag ("usrquota", Opt_ignore),
+ {},
+};
+
+static int affs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
+ struct affs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int n;
+ int opt;
- /* Fill in defaults */
+ opt = fs_parse(fc, affs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
- *uid = current_uid();
- *gid = current_gid();
- *reserved = 2;
- *root = -1;
- *blocksize = -1;
- volume[0] = ':';
- volume[1] = 0;
- *mount_opts = 0;
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token, n, option;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_bs:
- if (match_int(&args[0], &n))
- return 0;
- if (n != 512 && n != 1024 && n != 2048
- && n != 4096) {
- pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
- return 0;
- }
- *blocksize = n;
- break;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- return 0;
- *mode = option & 0777;
- affs_set_opt(*mount_opts, SF_SETMODE);
- break;
- case Opt_mufs:
- affs_set_opt(*mount_opts, SF_MUFS);
- break;
- case Opt_notruncate:
- affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
- break;
- case Opt_prefix:
- kfree(*prefix);
- *prefix = match_strdup(&args[0]);
- if (!*prefix)
- return 0;
- affs_set_opt(*mount_opts, SF_PREFIX);
- break;
- case Opt_protect:
- affs_set_opt(*mount_opts, SF_IMMUTABLE);
- break;
- case Opt_reserved:
- if (match_int(&args[0], reserved))
- return 0;
- break;
- case Opt_root:
- if (match_int(&args[0], root))
- return 0;
- break;
- case Opt_setgid:
- if (match_int(&args[0], &option))
- return 0;
- *gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(*gid))
- return 0;
- affs_set_opt(*mount_opts, SF_SETGID);
- break;
- case Opt_setuid:
- if (match_int(&args[0], &option))
- return 0;
- *uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(*uid))
- return 0;
- affs_set_opt(*mount_opts, SF_SETUID);
- break;
- case Opt_verbose:
- affs_set_opt(*mount_opts, SF_VERBOSE);
- break;
- case Opt_volume: {
- char *vol = match_strdup(&args[0]);
- if (!vol)
- return 0;
- strscpy(volume, vol, 32);
- kfree(vol);
- break;
+ switch (opt) {
+ case Opt_bs:
+ n = result.uint_32;
+ if (n != 512 && n != 1024 && n != 2048
+ && n != 4096) {
+ pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
+ return -EINVAL;
}
- case Opt_ignore:
- /* Silently ignore the quota options */
- break;
- default:
- pr_warn("Unrecognized mount option \"%s\" or missing value\n",
- p);
- return 0;
- }
+ ctx->blocksize = n;
+ break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & 0777;
+ affs_set_opt(ctx->mount_flags, SF_SETMODE);
+ break;
+ case Opt_mufs:
+ affs_set_opt(ctx->mount_flags, SF_MUFS);
+ break;
+ case Opt_notruncate:
+ affs_set_opt(ctx->mount_flags, SF_NO_TRUNCATE);
+ break;
+ case Opt_prefix:
+ kfree(ctx->prefix);
+ ctx->prefix = param->string;
+ param->string = NULL;
+ affs_set_opt(ctx->mount_flags, SF_PREFIX);
+ break;
+ case Opt_protect:
+ affs_set_opt(ctx->mount_flags, SF_IMMUTABLE);
+ break;
+ case Opt_reserved:
+ ctx->reserved = result.uint_32;
+ break;
+ case Opt_root:
+ ctx->root_block = result.uint_32;
+ break;
+ case Opt_setgid:
+ ctx->gid = result.gid;
+ affs_set_opt(ctx->mount_flags, SF_SETGID);
+ break;
+ case Opt_setuid:
+ ctx->uid = result.uid;
+ affs_set_opt(ctx->mount_flags, SF_SETUID);
+ break;
+ case Opt_verbose:
+ affs_set_opt(ctx->mount_flags, SF_VERBOSE);
+ break;
+ case Opt_volume:
+ strscpy(ctx->volume, param->string, 32);
+ break;
+ case Opt_ignore:
+ /* Silently ignore the quota options */
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
static int affs_show_options(struct seq_file *m, struct dentry *root)
@@ -329,27 +303,22 @@ static int affs_show_options(struct seq_file *m, struct dentry *root)
* hopefully have the guts to do so. Until then: sorry for the mess.
*/
-static int affs_fill_super(struct super_block *sb, void *data, int silent)
+static int affs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct affs_sb_info *sbi;
+ struct affs_context *ctx = fc->fs_private;
struct buffer_head *root_bh = NULL;
struct buffer_head *boot_bh;
struct inode *root_inode = NULL;
- s32 root_block;
+ int silent = fc->sb_flags & SB_SILENT;
int size, blocksize;
u32 chksum;
int num_bm;
int i, j;
- kuid_t uid;
- kgid_t gid;
- int reserved;
- unsigned long mount_flags;
int tmp_flags; /* fix remount prototype... */
u8 sig[4];
int ret;
- pr_debug("read_super(%s)\n", data ? (const char *)data : "no options");
-
sb->s_magic = AFFS_SUPER_MAGIC;
sb->s_op = &affs_sops;
sb->s_flags |= SB_NODIRATIME;
@@ -369,19 +338,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock);
- if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
- &blocksize,&sbi->s_prefix,
- sbi->s_volume, &mount_flags)) {
- pr_err("Error parsing options\n");
- return -EINVAL;
- }
- /* N.B. after this point s_prefix must be released */
+ sbi->s_flags = ctx->mount_flags;
+ sbi->s_mode = ctx->mode;
+ sbi->s_uid = ctx->uid;
+ sbi->s_gid = ctx->gid;
+ sbi->s_reserved = ctx->reserved;
+ sbi->s_prefix = ctx->prefix;
+ ctx->prefix = NULL;
+ memcpy(sbi->s_volume, ctx->volume, 32);
- sbi->s_flags = mount_flags;
- sbi->s_mode = i;
- sbi->s_uid = uid;
- sbi->s_gid = gid;
- sbi->s_reserved= reserved;
+ /* N.B. after this point s_prefix must be released */
/* Get the size of the device in 512-byte blocks.
* If we later see that the partition uses bigger
@@ -396,15 +362,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
i = bdev_logical_block_size(sb->s_bdev);
j = PAGE_SIZE;
+ blocksize = ctx->blocksize;
if (blocksize > 0) {
i = j = blocksize;
size = size / (blocksize / 512);
}
for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
- sbi->s_root_block = root_block;
- if (root_block < 0)
- sbi->s_root_block = (reserved + size - 1) / 2;
+ sbi->s_root_block = ctx->root_block;
+ if (ctx->root_block < 0)
+ sbi->s_root_block = (ctx->reserved + size - 1) / 2;
pr_debug("setting blocksize to %d\n", blocksize);
affs_set_blocksize(sb, blocksize);
sbi->s_partition_size = size;
@@ -424,7 +391,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
"size=%d, reserved=%d\n",
sb->s_id,
sbi->s_root_block + num_bm,
- blocksize, size, reserved);
+ ctx->blocksize, size, ctx->reserved);
root_bh = affs_bread(sb, sbi->s_root_block + num_bm);
if (!root_bh)
continue;
@@ -447,7 +414,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
got_root:
/* Keep super block in cache */
sbi->s_root_bh = root_bh;
- root_block = sbi->s_root_block;
+ ctx->root_block = sbi->s_root_block;
/* Find out which kind of FS we have */
boot_bh = sb_bread(sb, 0);
@@ -506,7 +473,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
return -EINVAL;
}
- if (affs_test_opt(mount_flags, SF_VERBOSE)) {
+ if (affs_test_opt(ctx->mount_flags, SF_VERBOSE)) {
u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
len > 31 ? 31 : len,
@@ -528,7 +495,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
/* set up enough so that it can read an inode */
- root_inode = affs_iget(sb, root_block);
+ root_inode = affs_iget(sb, ctx->root_block);
if (IS_ERR(root_inode))
return PTR_ERR(root_inode);
@@ -548,56 +515,43 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
return 0;
}
-static int
-affs_remount(struct super_block *sb, int *flags, char *data)
+static int affs_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+ struct affs_context *ctx = fc->fs_private;
struct affs_sb_info *sbi = AFFS_SB(sb);
- int blocksize;
- kuid_t uid;
- kgid_t gid;
- int mode;
- int reserved;
- int root_block;
- unsigned long mount_flags;
int res = 0;
- char volume[32];
- char *prefix = NULL;
-
- pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
sync_filesystem(sb);
- *flags |= SB_NODIRATIME;
-
- memcpy(volume, sbi->s_volume, 32);
- if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
- &blocksize, &prefix, volume,
- &mount_flags)) {
- kfree(prefix);
- return -EINVAL;
- }
+ fc->sb_flags |= SB_NODIRATIME;
flush_delayed_work(&sbi->sb_work);
- sbi->s_flags = mount_flags;
- sbi->s_mode = mode;
- sbi->s_uid = uid;
- sbi->s_gid = gid;
+ /*
+ * NB: Historically, only mount_flags, mode, uid, gic, prefix,
+ * and volume are accepted during remount.
+ */
+ sbi->s_flags = ctx->mount_flags;
+ sbi->s_mode = ctx->mode;
+ sbi->s_uid = ctx->uid;
+ sbi->s_gid = ctx->gid;
/* protect against readers */
spin_lock(&sbi->symlink_lock);
- if (prefix) {
+ if (ctx->prefix) {
kfree(sbi->s_prefix);
- sbi->s_prefix = prefix;
+ sbi->s_prefix = ctx->prefix;
+ ctx->prefix = NULL;
}
- memcpy(sbi->s_volume, volume, 32);
+ memcpy(sbi->s_volume, ctx->volume, 32);
spin_unlock(&sbi->symlink_lock);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (*flags & SB_RDONLY)
+ if (fc->sb_flags & SB_RDONLY)
affs_free_bitmap(sb);
else
- res = affs_init_bitmap(sb, flags);
+ res = affs_init_bitmap(sb, &fc->sb_flags);
return res;
}
@@ -624,10 +578,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct dentry *affs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int affs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
+ return get_tree_bdev(fc, affs_fill_super);
}
static void affs_kill_sb(struct super_block *sb)
@@ -643,12 +596,61 @@ static void affs_kill_sb(struct super_block *sb)
}
}
+static void affs_free_fc(struct fs_context *fc)
+{
+ struct affs_context *ctx = fc->fs_private;
+
+ kfree(ctx->prefix);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations affs_context_ops = {
+ .parse_param = affs_parse_param,
+ .get_tree = affs_get_tree,
+ .reconfigure = affs_reconfigure,
+ .free = affs_free_fc,
+};
+
+static int affs_init_fs_context(struct fs_context *fc)
+{
+ struct affs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct affs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct affs_sb_info *sbi = AFFS_SB(sb);
+
+ /*
+ * NB: historically, no options other than volume were
+ * preserved across a remount unless they were explicitly
+ * passed in.
+ */
+ memcpy(ctx->volume, sbi->s_volume, 32);
+ } else {
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+ ctx->reserved = 2;
+ ctx->root_block = -1;
+ ctx->blocksize = -1;
+ ctx->volume[0] = ':';
+ }
+
+ fc->ops = &affs_context_ops;
+ fc->fs_private = ctx;
+
+ return 0;
+}
+
static struct file_system_type affs_fs_type = {
.owner = THIS_MODULE,
.name = "affs",
- .mount = affs_mount,
.kill_sb = affs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = affs_init_fs_context,
+ .parameters = affs_param_spec,
};
MODULE_ALIAS_FS("affs");
diff --git a/fs/aio.c b/fs/aio.c
index e892017..72e3970 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2191,7 +2191,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- /* TODO: use a hash or array, this sucks. */
list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
if (kiocb->ki_res.obj == obj) {
ret = kiocb->ki_cancel(&kiocb->rw);
diff --git a/fs/attr.c b/fs/attr.c
index c04d19b..9caf63d 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -272,6 +272,47 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset)
EXPORT_SYMBOL(inode_newsize_ok);
/**
+ * setattr_copy_mgtime - update timestamps for mgtime inodes
+ * @inode: inode timestamps to be updated
+ * @attr: attrs for the update
+ *
+ * With multigrain timestamps, take more care to prevent races when
+ * updating the ctime. Always update the ctime to the very latest using
+ * the standard mechanism, and use that to populate the atime and mtime
+ * appropriately (unless those are being set to specific values).
+ */
+static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
+{
+ unsigned int ia_valid = attr->ia_valid;
+ struct timespec64 now;
+
+ if (ia_valid & ATTR_CTIME) {
+ /*
+ * In the case of an update for a write delegation, we must respect
+ * the value in ia_ctime and not use the current time.
+ */
+ if (ia_valid & ATTR_DELEG)
+ now = inode_set_ctime_deleg(inode, attr->ia_ctime);
+ else
+ now = inode_set_ctime_current(inode);
+ } else {
+ /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */
+ WARN_ON_ONCE(ia_valid & ATTR_MTIME);
+ now = current_time(inode);
+ }
+
+ if (ia_valid & ATTR_ATIME_SET)
+ inode_set_atime_to_ts(inode, attr->ia_atime);
+ else if (ia_valid & ATTR_ATIME)
+ inode_set_atime_to_ts(inode, now);
+
+ if (ia_valid & ATTR_MTIME_SET)
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
+ else if (ia_valid & ATTR_MTIME)
+ inode_set_mtime_to_ts(inode, now);
+}
+
+/**
* setattr_copy - copy simple metadata updates into the generic inode
* @idmap: idmap of the mount the inode was found from
* @inode: the inode to be updated
@@ -303,12 +344,6 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
- if (ia_valid & ATTR_ATIME)
- inode_set_atime_to_ts(inode, attr->ia_atime);
- if (ia_valid & ATTR_MTIME)
- inode_set_mtime_to_ts(inode, attr->ia_mtime);
- if (ia_valid & ATTR_CTIME)
- inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
if (!in_group_or_capable(idmap, inode,
@@ -316,6 +351,20 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
mode &= ~S_ISGID;
inode->i_mode = mode;
}
+
+ if (is_mgtime(inode))
+ return setattr_copy_mgtime(inode, attr);
+
+ if (ia_valid & ATTR_ATIME)
+ inode_set_atime_to_ts(inode, attr->ia_atime);
+ if (ia_valid & ATTR_MTIME)
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
+ if (ia_valid & ATTR_CTIME) {
+ if (ia_valid & ATTR_DELEG)
+ inode_set_ctime_deleg(inode, attr->ia_ctime);
+ else
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
+ }
}
EXPORT_SYMBOL(setattr_copy);
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 47455a8..654a581 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -52,6 +52,12 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH,
+ c, backpointer_level_bad,
+ "backpointer level bad: %u >= %u",
+ bp.v->level, BTREE_MAX_DEPTH);
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
@@ -64,7 +70,6 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p);
struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset);
rcu_read_unlock();
- int ret = 0;
bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
!bpos_eq(bp.k->p, bp_pos),
@@ -947,9 +952,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
static int check_one_backpointer(struct btree_trans *trans,
struct bbpos start,
struct bbpos end,
- struct bkey_s_c_backpointer bp,
+ struct bkey_s_c bp_k,
struct bkey_buf *last_flushed)
{
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ return 0;
+
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bbpos pos = bp_to_bbpos(*bp.v);
@@ -1004,9 +1013,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
- check_one_backpointer(trans, start, end,
- bkey_s_c_to_backpointer(k),
- &last_flushed);
+ check_one_backpointer(trans, start, end, k, &last_flushed);
}));
bch2_bkey_buf_exit(&last_flushed, c);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 0ca3fee..81dcf9e 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -182,7 +182,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
bch2_btree_node_drop_keys_outside_node(b);
mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, &new->k_i);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 6296a11..839d688 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -733,11 +733,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
c, ca, b, i, NULL,
bset_past_end_of_btree_node,
"bset past end of btree node (offset %u len %u but written %zu)",
- offset, sectors, ptr_written ?: btree_sectors(c))) {
+ offset, sectors, ptr_written ?: btree_sectors(c)))
i->u64s = 0;
- ret = 0;
- goto out;
- }
btree_err_on(offset && !i->u64s,
-BCH_ERR_btree_node_read_err_fixable,
@@ -829,7 +826,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BSET_BIG_ENDIAN(i), write,
&bn->format);
}
-out:
fsck_err:
printbuf_exit(&buf2);
printbuf_exit(&buf1);
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 22740b6..d596ef9 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -2398,7 +2398,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, new_key);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 3f56b58..1639c60 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -277,6 +277,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
int ret = 0;
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ return ret;
+
bch2_trans_unlock(trans);
bch2_trans_begin(trans);
@@ -491,7 +495,8 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
return ret;
}
-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
+static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq,
+ bool *did_work)
{
struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
@@ -502,6 +507,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
+ *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr;
+
/*
* On memory allocation failure, bch2_btree_write_buffer_flush_locked()
* is not guaranteed to empty wb->inc:
@@ -521,17 +528,34 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
struct journal_entry_pin *_pin, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool did_work = false;
- return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq));
+ return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work));
}
int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ bool did_work = false;
trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
- return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal));
+ return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work);
+}
+
+/*
+ * The write buffer requires flushing when going RO: keys in the journal for the
+ * write buffer don't have a journal pin yet
+ */
+bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c)
+{
+ if (bch2_journal_error(&c->journal))
+ return false;
+
+ bool did_work = false;
+ bch2_trans_run(c, btree_write_buffer_flush_seq(trans,
+ journal_cur_seq(&c->journal), &did_work));
+ return did_work;
}
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index 725e796..d535cea 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -21,6 +21,7 @@ static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
struct btree_trans;
int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
+bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *);
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
int bch2_btree_write_buffer_tryflush(struct btree_trans *);
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index c4e91d1..37e3d69 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1364,7 +1364,7 @@ void bch2_ptr_swab(struct bkey_s k)
for (entry = ptrs.start;
entry < ptrs.end;
entry = extent_entry_next(entry)) {
- switch (extent_entry_type(entry)) {
+ switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
break;
case BCH_EXTENT_ENTRY_crc32:
@@ -1384,6 +1384,9 @@ void bch2_ptr_swab(struct bkey_s k)
break;
case BCH_EXTENT_ENTRY_rebalance:
break;
+ default:
+ /* Bad entry type: will be caught by validate() */
+ return;
}
}
}
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index ccaafa9..fb35dd3 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -708,6 +708,9 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+ if (vstruct_bytes(entry) < sizeof(*u))
+ return;
+
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 4bbeac9..dff589d 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -27,6 +27,12 @@ const char * const bch2_recovery_passes[] = {
NULL
};
+/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
+static int bch2_recovery_pass_empty(struct bch_fs *c)
+{
+ return 0;
+}
+
static int bch2_set_may_go_rw(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 9d96c06..94dc20c 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -13,6 +13,7 @@
* must never change:
*/
#define BCH_RECOVERY_PASSES() \
+ x(recovery_pass_empty, 41, PASS_SILENT) \
x(scan_for_btree_nodes, 37, 0) \
x(check_topology, 4, 0) \
x(accounting_read, 39, PASS_ALWAYS) \
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 937275d..9feb673 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -136,7 +136,9 @@ enum bch_fsck_flags {
x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \
x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
x(need_discard_freespace_key_bad, 124, 0) \
+ x(discarding_bucket_not_in_need_discard_btree, 291, 0) \
x(backpointer_bucket_offset_wrong, 125, 0) \
+ x(backpointer_level_bad, 294, 0) \
x(backpointer_to_missing_device, 126, 0) \
x(backpointer_to_missing_alloc, 127, 0) \
x(backpointer_to_missing_ptr, 128, 0) \
@@ -177,7 +179,9 @@ enum bch_fsck_flags {
x(ptr_stripe_redundant, 163, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
x(reflink_v_refcount_wrong, 165, 0) \
+ x(reflink_v_pos_bad, 292, 0) \
x(reflink_p_to_missing_reflink_v, 166, 0) \
+ x(reflink_refcount_underflow, 293, 0) \
x(stripe_pos_bad, 167, 0) \
x(stripe_val_size_bad, 168, 0) \
x(stripe_csum_granularity_bad, 290, 0) \
@@ -302,7 +306,7 @@ enum bch_fsck_flags {
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
- x(MAX, 291, 0)
+ x(MAX, 295, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index fb08dd6..116131f 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -163,7 +163,7 @@ static int validate_member(struct printbuf *err,
return -BCH_ERR_invalid_sb_members;
}
- if (m.btree_bitmap_shift >= 64) {
+ if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) {
prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
return -BCH_ERR_invalid_sb_members;
}
@@ -450,7 +450,7 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns
m->btree_bitmap_shift += resize;
}
- BUG_ON(m->btree_bitmap_shift > 57);
+ BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX);
BUG_ON(end > 64ULL << m->btree_bitmap_shift);
for (unsigned bit = start >> m->btree_bitmap_shift;
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
index d727d2d..2adf122 100644
--- a/fs/bcachefs/sb-members_format.h
+++ b/fs/bcachefs/sb-members_format.h
@@ -66,6 +66,12 @@ struct bch_member {
};
/*
+ * btree_allocated_bitmap can represent sector addresses of a u64: it itself has
+ * 64 elements, so 64 - ilog2(64)
+ */
+#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58
+
+/*
* This limit comes from the bucket_gens array - it's a single allocation, and
* kernel allocation are limited to INT_MAX
*/
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 657fd37..a6ed9a0 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -272,6 +272,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
clean_passes++;
if (bch2_btree_interior_updates_flush(c) ||
+ bch2_btree_write_buffer_flush_going_ro(c) ||
bch2_journal_flush_all_pins(&c->journal) ||
bch2_btree_flush_all_writes(c) ||
seq != atomic64_read(&c->journal.seq)) {
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index f92f108..8f430ff 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -11,12 +11,13 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/errno.h>
#include <linux/stat.h>
#include <linux/nls.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
-#include <linux/parser.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/cred.h>
@@ -54,22 +55,20 @@ static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static void befs_put_super(struct super_block *);
-static int befs_remount(struct super_block *, int *, char *);
static int befs_statfs(struct dentry *, struct kstatfs *);
static int befs_show_options(struct seq_file *, struct dentry *);
-static int parse_options(char *, struct befs_mount_options *);
static struct dentry *befs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
static struct dentry *befs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
static struct dentry *befs_get_parent(struct dentry *child);
+static void befs_free_fc(struct fs_context *fc);
static const struct super_operations befs_sops = {
.alloc_inode = befs_alloc_inode, /* allocate a new inode */
.free_inode = befs_free_inode, /* deallocate an inode */
.put_super = befs_put_super, /* uninit super */
.statfs = befs_statfs, /* statfs */
- .remount_fs = befs_remount,
.show_options = befs_show_options,
};
@@ -672,92 +671,53 @@ static struct dentry *befs_get_parent(struct dentry *child)
}
enum {
- Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
+ Opt_uid, Opt_gid, Opt_charset, Opt_debug,
};
-static const match_table_t befs_tokens = {
- {Opt_uid, "uid=%d"},
- {Opt_gid, "gid=%d"},
- {Opt_charset, "iocharset=%s"},
- {Opt_debug, "debug"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec befs_param_spec[] = {
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_string ("iocharset", Opt_charset),
+ fsparam_flag ("debug", Opt_debug),
+ {}
};
static int
-parse_options(char *options, struct befs_mount_options *opts)
+befs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- kuid_t uid;
- kgid_t gid;
+ struct befs_mount_options *opts = fc->fs_private;
+ int token;
+ struct fs_parse_result result;
- /* Initialize options */
- opts->uid = GLOBAL_ROOT_UID;
- opts->gid = GLOBAL_ROOT_GID;
- opts->use_uid = 0;
- opts->use_gid = 0;
- opts->iocharset = NULL;
- opts->debug = 0;
+ /* befs ignores all options on remount */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
- if (!options)
- return 1;
+ token = fs_parse(fc, befs_param_spec, param, &result);
+ if (token < 0)
+ return token;
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, befs_tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return 0;
- uid = INVALID_UID;
- if (option >= 0)
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid)) {
- pr_err("Invalid uid %d, "
- "using default\n", option);
- break;
- }
- opts->uid = uid;
- opts->use_uid = 1;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- gid = INVALID_GID;
- if (option >= 0)
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid)) {
- pr_err("Invalid gid %d, "
- "using default\n", option);
- break;
- }
- opts->gid = gid;
- opts->use_gid = 1;
- break;
- case Opt_charset:
- kfree(opts->iocharset);
- opts->iocharset = match_strdup(&args[0]);
- if (!opts->iocharset) {
- pr_err("allocation failure for "
- "iocharset string\n");
- return 0;
- }
- break;
- case Opt_debug:
- opts->debug = 1;
- break;
- default:
- pr_err("Unrecognized mount option \"%s\" "
- "or missing value\n", p);
- return 0;
- }
+ switch (token) {
+ case Opt_uid:
+ opts->uid = result.uid;
+ opts->use_uid = 1;
+ break;
+ case Opt_gid:
+ opts->gid = result.gid;
+ opts->use_gid = 1;
+ break;
+ case Opt_charset:
+ kfree(opts->iocharset);
+ opts->iocharset = param->string;
+ param->string = NULL;
+ break;
+ case Opt_debug:
+ opts->debug = 1;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
static int befs_show_options(struct seq_file *m, struct dentry *root)
@@ -793,6 +753,21 @@ befs_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
+/*
+ * Copy the parsed options into the sbi mount_options member
+ */
+static void
+befs_set_options(struct befs_sb_info *sbi, struct befs_mount_options *opts)
+{
+ sbi->mount_opts.uid = opts->uid;
+ sbi->mount_opts.gid = opts->gid;
+ sbi->mount_opts.use_uid = opts->use_uid;
+ sbi->mount_opts.use_gid = opts->use_gid;
+ sbi->mount_opts.debug = opts->debug;
+ sbi->mount_opts.iocharset = opts->iocharset;
+ opts->iocharset = NULL;
+}
+
/* Allocate private field of the superblock, fill it.
*
* Finish filling the public superblock fields
@@ -800,7 +775,7 @@ befs_put_super(struct super_block *sb)
* Load a set of NLS translations if needed.
*/
static int
-befs_fill_super(struct super_block *sb, void *data, int silent)
+befs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct buffer_head *bh;
struct befs_sb_info *befs_sb;
@@ -810,6 +785,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
const unsigned long sb_block = 0;
const off_t x86_sb_off = 512;
int blocksize;
+ struct befs_mount_options *parsed_opts = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
if (sb->s_fs_info == NULL)
@@ -817,11 +794,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
befs_sb = BEFS_SB(sb);
- if (!parse_options((char *) data, &befs_sb->mount_opts)) {
- if (!silent)
- befs_error(sb, "cannot parse mount options");
- goto unacquire_priv_sbp;
- }
+ befs_set_options(befs_sb, parsed_opts);
befs_debug(sb, "---> %s", __func__);
@@ -934,10 +907,10 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
}
static int
-befs_remount(struct super_block *sb, int *flags, char *data)
+befs_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- if (!(*flags & SB_RDONLY))
+ sync_filesystem(fc->root->d_sb);
+ if (!(fc->sb_flags & SB_RDONLY))
return -EINVAL;
return 0;
}
@@ -965,19 +938,51 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct dentry *
-befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
- void *data)
+static int befs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
+ return get_tree_bdev(fc, befs_fill_super);
+}
+
+static const struct fs_context_operations befs_context_ops = {
+ .parse_param = befs_parse_param,
+ .get_tree = befs_get_tree,
+ .reconfigure = befs_reconfigure,
+ .free = befs_free_fc,
+};
+
+static int befs_init_fs_context(struct fs_context *fc)
+{
+ struct befs_mount_options *opts;
+
+ opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ /* Initialize options */
+ opts->uid = GLOBAL_ROOT_UID;
+ opts->gid = GLOBAL_ROOT_GID;
+
+ fc->fs_private = opts;
+ fc->ops = &befs_context_ops;
+
+ return 0;
+}
+
+static void befs_free_fc(struct fs_context *fc)
+{
+ struct befs_mount_options *opts = fc->fs_private;
+
+ kfree(opts->iocharset);
+ kfree(fc->fs_private);
}
static struct file_system_type befs_fs_type = {
.owner = THIS_MODULE,
.name = "befs",
- .mount = befs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = befs_init_fs_context,
+ .parameters = befs_param_spec,
};
MODULE_ALIAS_FS("befs");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 06dc4a5..3039a6b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -258,6 +258,12 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
#ifdef ELF_HWCAP2
NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
#endif
+#ifdef ELF_HWCAP3
+ NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
+#endif
+#ifdef ELF_HWCAP4
+ NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
+#endif
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
if (k_platform) {
NEW_AUX_ENT(AT_PLATFORM,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 4fe5bb9..31d253b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -624,6 +624,12 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
#ifdef ELF_HWCAP2
NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
#endif
+#ifdef ELF_HWCAP3
+ NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
+#endif
+#ifdef ELF_HWCAP4
+ NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
+#endif
NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 4fb925e..fa85155 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -78,6 +78,32 @@
If unsure, say N.
+config BTRFS_EXPERIMENTAL
+ bool "Btrfs experimental features"
+ depends on BTRFS_FS
+ default n
+ help
+ Enable experimental features. These features may not be stable enough
+ for end users. This is meant for btrfs developers or users who wish
+ to test the functionality and report problems.
+
+ Current list:
+
+ - extent map shrinker - performance problems with too frequent shrinks
+
+ - send stream protocol v3 - fs-verity support
+
+ - checksum offload mode - sysfs knob to affect when checksums are
+ calculated (at IO time, or in a thread)
+
+ - raid-stripe-tree - additional mapping of extents to devices to
+ support RAID1* profiles on zoned devices,
+ RAID56 not yet supported
+
+ - extent tree v2 - complex rework of extent tracking
+
+ If unsure, say N.
+
config BTRFS_FS_REF_VERIFY
bool "Btrfs with the ref verify tool compiled in"
depends on BTRFS_FS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 87617f2..3cfc440 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -43,4 +43,5 @@
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
- tests/free-space-tree-tests.o tests/extent-map-tests.o
+ tests/free-space-tree-tests.o tests/extent-map-tests.o \
+ tests/raid-stripe-tree-tests.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f8e1d5b..04f53ca 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1442,7 +1442,8 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
*/
delayed_refs = &ctx->trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr);
+ head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs,
+ ctx->bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 7e0f960..1f216d0 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -587,7 +587,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
{
bool auto_csum_mode = true;
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 4423d8b..4427c1b8 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2797,7 +2797,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
* uncompressed data size, because the compression is only done
* when writeback triggered and we don't know how much space we
* are actually going to need, so we reserve the uncompressed
- * size because the data may be uncompressible in the worst case.
+ * size because the data may be incompressible in the worst case.
*/
if (ret == 0) {
bool used;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e152fde..aa1f55c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
struct extent_state *other);
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *orig, u64 split);
-void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
void btrfs_evict_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
@@ -613,11 +612,17 @@ int btrfs_writepage_cow_fixup(struct folio *folio);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size,
- struct page **pages);
+ u64 disk_bytenr, u64 disk_io_size,
+ struct page **pages, void *uring_ctx);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded);
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size);
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed, bool *unlocked);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 90aef26..0c4d486 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -453,7 +453,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- folio = __filemap_get_folio(mapping, pg_index, 0, 0);
+ folio = filemap_get_folio(mapping, pg_index);
if (!IS_ERR(folio)) {
u64 folio_sz = folio_size(folio);
u64 offset = offset_in_folio(folio, cur);
@@ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* subpage::readers and to unlock the page.
*/
if (fs_info->sectorsize < PAGE_SIZE)
- btrfs_subpage_start_reader(fs_info, folio, cur,
- add_size);
+ btrfs_folio_set_lock(fs_info, folio, cur, add_size);
folio_put(folio);
cur += add_size;
}
@@ -702,7 +701,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(unsigned int level)
+static struct list_head *alloc_heuristic_ws(void)
{
struct heuristic_ws *ws;
@@ -744,9 +743,9 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
static struct list_head *alloc_workspace(int type, unsigned int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
- case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace();
case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
default:
/*
@@ -1030,6 +1029,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping
{
int type = btrfs_compress_type(type_level);
int level = btrfs_compress_level(type_level);
+ const unsigned long orig_len = *total_out;
struct list_head *workspace;
int ret;
@@ -1037,6 +1037,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping
workspace = get_workspace(type, level);
ret = compression_compress_pages(type, workspace, mapping, start, folios,
out_folios, total_in, total_out);
+ /* The total read-in bytes should be no larger than the input. */
+ ASSERT(*total_in <= orig_len);
put_workspace(type, workspace);
return ret;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index b6563b6..9540340 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -175,7 +175,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *lzo_alloc_workspace(unsigned int level);
+struct list_head *lzo_alloc_workspace(void);
void lzo_free_workspace(struct list_head *ws);
int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0cc919d..148648e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1508,26 +1508,26 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
- struct extent_buffer **eb_ret, int level, int slot,
+ struct extent_buffer **eb_ret, int slot,
const struct btrfs_key *key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_tree_parent_check check = { 0 };
u64 blocknr;
- u64 gen;
- struct extent_buffer *tmp;
- int ret;
+ struct extent_buffer *tmp = NULL;
+ int ret = 0;
int parent_level;
- bool unlock_up;
+ int err;
+ bool read_tmp = false;
+ bool tmp_locked = false;
+ bool path_released = false;
- unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
blocknr = btrfs_node_blockptr(*eb_ret, slot);
- gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
check.has_first_key = true;
check.level = parent_level - 1;
- check.transid = gen;
+ check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
check.owner_root = btrfs_root_id(root);
/*
@@ -1540,80 +1540,116 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
if (p->reada == READA_FORWARD_ALWAYS)
- reada_for_search(fs_info, p, level, slot, key->objectid);
+ reada_for_search(fs_info, p, parent_level, slot, key->objectid);
/* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+ if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
/*
* Do extra check for first_key, eb can be stale due to
* being cached, read from scrub, or have multiple
* parents (shared tree blocks).
*/
- if (btrfs_verify_level_key(tmp,
- parent_level - 1, &check.first_key, gen)) {
- free_extent_buffer(tmp);
- return -EUCLEAN;
+ if (btrfs_verify_level_key(tmp, &check)) {
+ ret = -EUCLEAN;
+ goto out;
}
*eb_ret = tmp;
- return 0;
+ tmp = NULL;
+ ret = 0;
+ goto out;
}
if (p->nowait) {
- free_extent_buffer(tmp);
- return -EAGAIN;
- }
-
- if (unlock_up)
- btrfs_unlock_up_safe(p, level + 1);
-
- /* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_extent_buffer(tmp, &check);
- if (ret) {
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return ret;
- }
-
- if (unlock_up)
ret = -EAGAIN;
+ goto out;
+ }
+ if (!p->skip_locking) {
+ btrfs_unlock_up_safe(p, parent_level + 1);
+ tmp_locked = true;
+ btrfs_tree_read_lock(tmp);
+ btrfs_release_path(p);
+ ret = -EAGAIN;
+ path_released = true;
+ }
+
+ /* Now we're allowed to do a blocking uptodate check. */
+ err = btrfs_read_extent_buffer(tmp, &check);
+ if (err) {
+ ret = err;
+ goto out;
+ }
+
+ if (ret == 0) {
+ ASSERT(!tmp_locked);
+ *eb_ret = tmp;
+ tmp = NULL;
+ }
goto out;
} else if (p->nowait) {
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto out;
}
- if (unlock_up) {
- btrfs_unlock_up_safe(p, level + 1);
+ if (!p->skip_locking) {
+ btrfs_unlock_up_safe(p, parent_level + 1);
ret = -EAGAIN;
- } else {
- ret = 0;
}
if (p->reada != READA_NONE)
- reada_for_search(fs_info, p, level, slot, key->objectid);
+ reada_for_search(fs_info, p, parent_level, slot, key->objectid);
- tmp = read_tree_block(fs_info, blocknr, &check);
+ tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
if (IS_ERR(tmp)) {
- btrfs_release_path(p);
- return PTR_ERR(tmp);
+ ret = PTR_ERR(tmp);
+ tmp = NULL;
+ goto out;
}
+ read_tmp = true;
+
+ if (!p->skip_locking) {
+ ASSERT(ret == -EAGAIN);
+ tmp_locked = true;
+ btrfs_tree_read_lock(tmp);
+ btrfs_release_path(p);
+ path_released = true;
+ }
+
+ /* Now we're allowed to do a blocking uptodate check. */
+ err = btrfs_read_extent_buffer(tmp, &check);
+ if (err) {
+ ret = err;
+ goto out;
+ }
+
/*
* If the read above didn't mark this buffer up to date,
* it will never end up being up to date. Set ret to EIO now
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!extent_buffer_uptodate(tmp))
+ if (!extent_buffer_uptodate(tmp)) {
ret = -EIO;
-
-out:
- if (ret == 0) {
- *eb_ret = tmp;
- } else {
- free_extent_buffer(tmp);
- btrfs_release_path(p);
+ goto out;
}
+ if (ret == 0) {
+ ASSERT(!tmp_locked);
+ *eb_ret = tmp;
+ tmp = NULL;
+ }
+out:
+ if (tmp) {
+ if (tmp_locked)
+ btrfs_tree_read_unlock(tmp);
+ if (read_tmp && ret && ret != -EAGAIN)
+ free_extent_buffer_stale(tmp);
+ else
+ free_extent_buffer(tmp);
+ }
+ if (ret && !path_released)
+ btrfs_release_path(p);
+
return ret;
}
@@ -2197,8 +2233,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
goto done;
}
- err = read_block_for_search(root, p, &b, level, slot, key);
- if (err == -EAGAIN)
+ err = read_block_for_search(root, p, &b, slot, key);
+ if (err == -EAGAIN && !p->nowait)
goto again;
if (err) {
ret = err;
@@ -2324,8 +2360,8 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
goto done;
}
- err = read_block_for_search(root, p, &b, level, slot, key);
- if (err == -EAGAIN)
+ err = read_block_for_search(root, p, &b, slot, key);
+ if (err == -EAGAIN && !p->nowait)
goto again;
if (err) {
ret = err;
@@ -2334,7 +2370,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
level = btrfs_header_level(b);
btrfs_tree_read_lock(b);
- b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
+ b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq);
if (!b) {
ret = -ENOMEM;
goto done;
@@ -4930,8 +4966,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
}
next = c;
- ret = read_block_for_search(root, path, &next, level,
- slot, &key);
+ ret = read_block_for_search(root, path, &next, slot, &key);
if (ret == -EAGAIN && !path->nowait)
goto again;
@@ -4974,8 +5009,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
if (!level)
break;
- ret = read_block_for_search(root, path, &next, level,
- 0, &key);
+ ret = read_block_for_search(root, path, &next, 0, &key);
if (ret == -EAGAIN && !path->nowait)
goto again;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 317a371..307dedf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -744,16 +744,11 @@ const char *btrfs_super_csum_driver(u16 csum_type);
size_t __attribute_const__ btrfs_get_num_csums(void);
/*
- * We use page status Private2 to indicate there is an ordered extent with
+ * We use folio flag owner_2 to indicate there is an ordered extent with
* unfinished IO.
- *
- * Rename the Private2 accessors to Ordered, to improve readability.
*/
-#define PageOrdered(page) PagePrivate2(page)
-#define SetPageOrdered(page) SetPagePrivate2(page)
-#define ClearPageOrdered(page) ClearPagePrivate2(page)
-#define folio_test_ordered(folio) folio_test_private_2(folio)
-#define folio_set_ordered(folio) folio_set_private_2(folio)
-#define folio_clear_ordered(folio) folio_clear_private_2(folio)
+#define folio_test_ordered(folio) folio_test_owner_2(folio)
+#define folio_set_ordered(folio) folio_set_owner_2(folio)
+#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
#endif
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 7cfefdf..f4d9fea 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -64,9 +64,9 @@ struct btrfs_delayed_node {
struct mutex mutex;
struct btrfs_inode_item inode_item;
refcount_t refs;
+ int count;
u64 index_cnt;
unsigned long flags;
- int count;
/*
* The size of the next batch of dir index items to insert (if this
* node is from a directory inode). Protected by @mutex.
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 65d841d..0d878db 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -9,6 +9,7 @@
#include "messages.h"
#include "ctree.h"
#include "delayed-ref.h"
+#include "extent-tree.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
@@ -298,7 +299,7 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
if (ref1->ref_root < ref2->ref_root)
return -1;
if (ref1->ref_root > ref2->ref_root)
- return -1;
+ return 1;
if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY)
ret = comp_data_refs(ref1, ref2);
}
@@ -313,39 +314,6 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
return 0;
}
-/* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_root.rb_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_delayed_ref_head *entry;
- struct btrfs_delayed_ref_head *ins;
- u64 bytenr;
- bool leftmost = true;
-
- ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
- bytenr = ins->bytenr;
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
- href_node);
-
- if (bytenr < entry->bytenr) {
- p = &(*p)->rb_left;
- } else if (bytenr > entry->bytenr) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- return entry;
- }
- }
-
- rb_link_node(node, parent_node, p);
- rb_insert_color_cached(node, root, leftmost);
- return NULL;
-}
-
static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
@@ -380,75 +348,32 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
static struct btrfs_delayed_ref_head *find_first_ref_head(
struct btrfs_delayed_ref_root *dr)
{
- struct rb_node *n;
- struct btrfs_delayed_ref_head *entry;
+ unsigned long from = 0;
- n = rb_first_cached(&dr->href_root);
- if (!n)
- return NULL;
+ lockdep_assert_held(&dr->lock);
- entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
-
- return entry;
+ return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT);
}
-/*
- * Find a head entry based on bytenr. This returns the delayed ref head if it
- * was able to find one, or NULL if nothing was in that spot. If return_bigger
- * is given, the next bigger entry is returned if no exact match is found.
- */
-static struct btrfs_delayed_ref_head *find_ref_head(
- struct btrfs_delayed_ref_root *dr, u64 bytenr,
- bool return_bigger)
-{
- struct rb_root *root = &dr->href_root.rb_root;
- struct rb_node *n;
- struct btrfs_delayed_ref_head *entry;
-
- n = root->rb_node;
- entry = NULL;
- while (n) {
- entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
-
- if (bytenr < entry->bytenr)
- n = n->rb_left;
- else if (bytenr > entry->bytenr)
- n = n->rb_right;
- else
- return entry;
- }
- if (entry && return_bigger) {
- if (bytenr > entry->bytenr) {
- n = rb_next(&entry->href_node);
- if (!n)
- return NULL;
- entry = rb_entry(n, struct btrfs_delayed_ref_head,
- href_node);
- }
- return entry;
- }
- return NULL;
-}
-
-int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head)
+static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
{
lockdep_assert_held(&delayed_refs->lock);
if (mutex_trylock(&head->mutex))
- return 0;
+ return true;
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
spin_lock(&delayed_refs->lock);
- if (RB_EMPTY_NODE(&head->href_node)) {
+ if (!head->tracked) {
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
- return -EAGAIN;
+ return false;
}
btrfs_put_delayed_ref_head(head);
- return 0;
+ return true;
}
static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
@@ -462,7 +387,6 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
btrfs_put_delayed_ref(ref);
- atomic_dec(&delayed_refs->num_entries);
btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
@@ -558,33 +482,31 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
}
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs)
{
struct btrfs_delayed_ref_head *head;
+ unsigned long start_index;
+ unsigned long found_index;
+ bool found_head = false;
+ bool locked;
- lockdep_assert_held(&delayed_refs->lock);
+ spin_lock(&delayed_refs->lock);
again:
- head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
- true);
- if (!head && delayed_refs->run_delayed_start != 0) {
- delayed_refs->run_delayed_start = 0;
- head = find_first_ref_head(delayed_refs);
- }
- if (!head)
- return NULL;
-
- while (head->processing) {
- struct rb_node *node;
-
- node = rb_next(&head->href_node);
- if (!node) {
- if (delayed_refs->run_delayed_start == 0)
- return NULL;
- delayed_refs->run_delayed_start = 0;
- goto again;
+ start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits);
+ xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) {
+ if (!head->processing) {
+ found_head = true;
+ break;
}
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
+ }
+ if (!found_head) {
+ if (delayed_refs->run_delayed_start == 0) {
+ spin_unlock(&delayed_refs->lock);
+ return NULL;
+ }
+ delayed_refs->run_delayed_start = 0;
+ goto again;
}
head->processing = true;
@@ -592,18 +514,42 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
delayed_refs->num_heads_ready--;
delayed_refs->run_delayed_start = head->bytenr +
head->num_bytes;
+
+ locked = btrfs_delayed_ref_lock(delayed_refs, head);
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * We may have dropped the spin lock to get the head mutex lock, and
+ * that might have given someone else time to free the head. If that's
+ * true, it has been removed from our list and we can move on.
+ */
+ if (!locked)
+ return ERR_PTR(-EAGAIN);
+
return head;
}
-void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ spin_lock(&delayed_refs->lock);
+ head->processing = false;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(head);
+}
+
+void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
+ const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits);
+
lockdep_assert_held(&delayed_refs->lock);
lockdep_assert_held(&head->lock);
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
- atomic_dec(&delayed_refs->num_entries);
+ xa_erase(&delayed_refs->head_refs, index);
+ head->tracked = false;
delayed_refs->num_heads--;
if (!head->processing)
delayed_refs->num_heads_ready--;
@@ -629,7 +575,6 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
if (!exist) {
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&ref->add_list, &href->ref_add_list);
- atomic_inc(&root->num_entries);
spin_unlock(&href->lock);
trans->delayed_ref_updates++;
return false;
@@ -813,7 +758,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);
head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
- RB_CLEAR_NODE(&head_ref->href_node);
+ head_ref->tracked = false;
head_ref->processing = false;
head_ref->total_ref_mod = count_mod;
spin_lock_init(&head_ref->lock);
@@ -830,7 +775,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
qrecord->data_rsv = reserved;
qrecord->data_rsv_refroot = generic_ref->ref_root;
}
- qrecord->bytenr = generic_ref->bytenr;
qrecord->num_bytes = generic_ref->num_bytes;
qrecord->old_roots = NULL;
}
@@ -852,19 +796,33 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
+ const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits);
bool qrecord_inserted = false;
delayed_refs = &trans->transaction->delayed_refs;
+ lockdep_assert_held(&delayed_refs->lock);
+
+#if BITS_PER_LONG == 32
+ if (head_ref->bytenr >= MAX_LFS_FILESIZE) {
+ if (qrecord)
+ xa_release(&delayed_refs->dirty_extents, index);
+ btrfs_err_rl(fs_info,
+"delayed ref head %llu is beyond 32bit page cache and xarray index limit",
+ head_ref->bytenr);
+ btrfs_err_32bit_limit(fs_info);
+ return ERR_PTR(-EOVERFLOW);
+ }
+#endif
/* Record qgroup extent info if provided */
if (qrecord) {
int ret;
- ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord);
+ ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord,
+ head_ref->bytenr);
if (ret) {
/* Clean up if insertion fails or item exists. */
- xa_release(&delayed_refs->dirty_extents,
- qrecord->bytenr >> fs_info->sectorsize_bits);
+ xa_release(&delayed_refs->dirty_extents, index);
/* Caller responsible for freeing qrecord on error. */
if (ret < 0)
return ERR_PTR(ret);
@@ -876,8 +834,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
trace_add_delayed_ref_head(fs_info, head_ref, action);
- existing = htree_insert(&delayed_refs->href_root,
- &head_ref->href_node);
+ existing = xa_load(&delayed_refs->head_refs, index);
if (existing) {
update_existing_head_ref(trans, existing, head_ref);
/*
@@ -887,6 +844,19 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC);
+ if (xa_is_err(existing)) {
+ /* Memory was preallocated by the caller. */
+ ASSERT(xa_err(existing) != -ENOMEM);
+ return ERR_PTR(xa_err(existing));
+ } else if (WARN_ON(existing)) {
+ /*
+ * Shouldn't happen we just did a lookup before under
+ * delayed_refs->lock.
+ */
+ return ERR_PTR(-EEXIST);
+ }
+ head_ref->tracked = true;
/*
* We reserve the amount of bytes needed to delete csums when
* adding the ref head and not when adding individual drop refs
@@ -900,7 +870,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
- atomic_inc(&delayed_refs->num_entries);
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
@@ -1008,6 +977,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *new_head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
+ const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits);
+ bool qrecord_reserved = false;
bool qrecord_inserted;
int action = generic_ref->action;
bool merged;
@@ -1023,25 +994,32 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
goto free_node;
}
+ delayed_refs = &trans->transaction->delayed_refs;
+
if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
ret = -ENOMEM;
goto free_head_ref;
}
- if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents,
- generic_ref->bytenr >> fs_info->sectorsize_bits,
- GFP_NOFS)) {
+ if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
ret = -ENOMEM;
goto free_record;
}
+ qrecord_reserved = true;
+ }
+
+ ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS);
+ if (ret) {
+ if (qrecord_reserved)
+ xa_release(&delayed_refs->dirty_extents, index);
+ goto free_record;
}
init_delayed_ref_common(fs_info, node, generic_ref);
init_delayed_ref_head(head_ref, generic_ref, record, reserved);
head_ref->extent_op = extent_op;
- delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
/*
@@ -1051,6 +1029,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
new_head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
if (IS_ERR(new_head_ref)) {
+ xa_release(&delayed_refs->head_refs, index);
spin_unlock(&delayed_refs->lock);
ret = PTR_ERR(new_head_ref);
goto free_record;
@@ -1074,7 +1053,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
if (qrecord_inserted)
- return btrfs_qgroup_trace_extent_post(trans, record);
+ return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr);
return 0;
free_record:
@@ -1113,6 +1092,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u8 level,
struct btrfs_delayed_extent_op *extent_op)
{
+ const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits);
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_head *head_ref_ret;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -1123,6 +1103,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
.num_bytes = num_bytes,
.tree_ref.level = level,
};
+ int ret;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
@@ -1132,16 +1113,23 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
+ ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS);
+ if (ret) {
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+ return ret;
+ }
+
+ spin_lock(&delayed_refs->lock);
head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL,
BTRFS_UPDATE_DELAYED_HEAD, NULL);
- spin_unlock(&delayed_refs->lock);
-
if (IS_ERR(head_ref_ret)) {
+ xa_release(&delayed_refs->head_refs, index);
+ spin_unlock(&delayed_refs->lock);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
return PTR_ERR(head_ref_ret);
}
+ spin_unlock(&delayed_refs->lock);
/*
* Need to update the delayed_refs_rsv with any changes we may have
@@ -1164,11 +1152,15 @@ void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
* head node if found, or NULL if not.
*/
struct btrfs_delayed_ref_head *
-btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
+btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ u64 bytenr)
{
+ const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
+
lockdep_assert_held(&delayed_refs->lock);
- return find_ref_head(delayed_refs, bytenr, false);
+ return xa_load(&delayed_refs->head_refs, index);
}
static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
@@ -1238,6 +1230,81 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
return found;
}
+void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ spin_lock(&delayed_refs->lock);
+ while (true) {
+ struct btrfs_delayed_ref_head *head;
+ struct rb_node *n;
+ bool pin_bytes = false;
+
+ head = find_first_ref_head(delayed_refs);
+ if (!head)
+ break;
+
+ if (!btrfs_delayed_ref_lock(delayed_refs, head))
+ continue;
+
+ spin_lock(&head->lock);
+ while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
+ struct btrfs_delayed_ref_node *ref;
+
+ ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node);
+ drop_delayed_ref(fs_info, delayed_refs, head, ref);
+ }
+ if (head->must_insert_reserved)
+ pin_bytes = true;
+ btrfs_free_delayed_extent_op(head->extent_op);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+ mutex_unlock(&head->mutex);
+
+ if (pin_bytes) {
+ struct btrfs_block_group *bg;
+
+ bg = btrfs_lookup_block_group(fs_info, head->bytenr);
+ if (WARN_ON_ONCE(bg == NULL)) {
+ /*
+ * Unexpected and there's nothing we can do here
+ * because we are in a transaction abort path,
+ * so any errors can only be ignored or reported
+ * while attempting to cleanup all resources.
+ */
+ btrfs_err(fs_info,
+"block group for delayed ref at %llu was not found while destroying ref head",
+ head->bytenr);
+ } else {
+ spin_lock(&bg->space_info->lock);
+ spin_lock(&bg->lock);
+ bg->pinned += head->num_bytes;
+ btrfs_space_info_update_bytes_pinned(fs_info,
+ bg->space_info,
+ head->num_bytes);
+ bg->reserved -= head->num_bytes;
+ bg->space_info->bytes_reserved -= head->num_bytes;
+ spin_unlock(&bg->lock);
+ spin_unlock(&bg->space_info->lock);
+
+ btrfs_put_block_group(bg);
+ }
+
+ btrfs_error_unpin_extent_range(fs_info, head->bytenr,
+ head->bytenr + head->num_bytes - 1);
+ }
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+ btrfs_put_delayed_ref_head(head);
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+ btrfs_qgroup_destroy_extent_records(trans);
+
+ spin_unlock(&delayed_refs->lock);
+}
+
void __cold btrfs_delayed_ref_exit(void)
{
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 352921e..611fb33 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -61,7 +61,8 @@ struct btrfs_delayed_ref_node {
/*
* If action is BTRFS_ADD_DELAYED_REF, also link this node to
* ref_head->ref_add_list, then we do not need to iterate the
- * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
+ * refs rbtree in the corresponding delayed ref head
+ * (struct btrfs_delayed_ref_head::ref_tree).
*/
struct list_head add_list;
@@ -123,12 +124,6 @@ struct btrfs_delayed_ref_head {
u64 bytenr;
u64 num_bytes;
/*
- * For insertion into struct btrfs_delayed_ref_root::href_root.
- * Keep it in the same cache line as 'bytenr' for more efficient
- * searches in the rbtree.
- */
- struct rb_node href_node;
- /*
* the mutex is held while running the refs, and it is also
* held when checking the sum of reference modifications.
*/
@@ -191,6 +186,11 @@ struct btrfs_delayed_ref_head {
bool is_data;
bool is_system;
bool processing;
+ /*
+ * Indicate if it's currently in the data structure that tracks head
+ * refs (struct btrfs_delayed_ref_root::head_refs).
+ */
+ bool tracked;
};
enum btrfs_delayed_ref_flags {
@@ -199,38 +199,52 @@ enum btrfs_delayed_ref_flags {
};
struct btrfs_delayed_ref_root {
- /* head ref rbtree */
- struct rb_root_cached href_root;
-
/*
- * Track dirty extent records.
+ * Track head references.
* The keys correspond to the logical address of the extent ("bytenr")
* right shifted by fs_info->sectorsize_bits. This is both to get a more
* dense index space (optimizes xarray structure) and because indexes in
* xarrays are of "unsigned long" type, meaning they are 32 bits wide on
* 32 bits platforms, limiting the extent range to 4G which is too low
* and makes it unusable (truncated index values) on 32 bits platforms.
+ * Protected by the spinlock 'lock' defined below.
+ */
+ struct xarray head_refs;
+
+ /*
+ * Track dirty extent records.
+ * The keys correspond to the logical address of the extent ("bytenr")
+ * right shifted by fs_info->sectorsize_bits, for same reasons as above.
*/
struct xarray dirty_extents;
- /* this spin lock protects the rbtree and the entries inside */
+ /*
+ * Protects the xarray head_refs, its entries and the following fields:
+ * num_heads, num_heads_ready, pending_csums and run_delayed_start.
+ */
spinlock_t lock;
- /* how many delayed ref updates we've queued, used by the
- * throttling code
- */
- atomic_t num_entries;
-
- /* total number of head nodes in tree */
+ /* Total number of head refs, protected by the spinlock 'lock'. */
unsigned long num_heads;
- /* total number of head nodes ready for processing */
+ /*
+ * Total number of head refs ready for processing, protected by the
+ * spinlock 'lock'.
+ */
unsigned long num_heads_ready;
+ /*
+ * Track space reserved for deleting csums of data extents.
+ * Protected by the spinlock 'lock'.
+ */
u64 pending_csums;
unsigned long flags;
+ /*
+ * Track from which bytenr to start searching ref heads.
+ * Protected by the spinlock 'lock'.
+ */
u64 run_delayed_start;
/*
@@ -372,19 +386,22 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *
-btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
u64 bytenr);
-int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head);
static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
mutex_unlock(&head->mutex);
}
-void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs);
+void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
@@ -399,6 +416,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
u64 root, u64 parent);
+void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans);
static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
{
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 83d5cdd7..ac8e97e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -45,7 +45,7 @@
*
* - Copy existing extents
*
- * This happens by re-using scrub facility, as scrub also iterates through
+ * This happens by reusing scrub facility, as scrub also iterates through
* existing extents from commit root.
*
* Location: scrub_write_block_to_dev_replace() from
@@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return ret;
down_write(&dev_replace->rwsem);
+ dev_replace->replace_task = current;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -994,6 +995,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
fs_devices->rw_devices++;
+ dev_replace->replace_task = NULL;
up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1e8cd7c..1ea5d8f 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
const char *name,
int name_len)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *ptr;
struct extent_buffer *leaf;
@@ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
if (ret == -EEXIST) {
struct btrfs_dir_item *di;
- di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ di = btrfs_match_dir_item_name(path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
btrfs_extend_item(trans, path, data_size);
@@ -190,7 +189,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir(
if (ret > 0)
return ERR_PTR(-ENOENT);
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return btrfs_match_dir_item_name(path, name, name_len);
}
/*
@@ -341,8 +340,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path,
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
- di = btrfs_match_dir_item_name(root->fs_info, path,
- name->name, name->len);
+ di = btrfs_match_dir_item_name(path, name->name, name->len);
if (di)
return di;
}
@@ -378,8 +376,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- const struct btrfs_path *path,
+struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path,
const char *name, int name_len)
{
struct btrfs_dir_item *dir_item;
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 5f6dfaf..28d6997 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -44,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
const char *name, u16 name_len,
int mod);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- const struct btrfs_path *path,
+struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path,
const char *name,
int name_len);
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index bd38df5..a7c3e22 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -834,7 +834,7 @@ ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
return ret;
}
- ret = btrfs_write_check(iocb, from, ret);
+ ret = btrfs_write_check(iocb, ret);
if (ret < 0) {
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto out;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b11bfe6..8143209 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -917,8 +917,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
}
-static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
@@ -966,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
{
struct btrfs_root *log_root;
- log_root = alloc_log_tree(trans, fs_info);
+ log_root = alloc_log_tree(fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
@@ -992,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item;
int ret;
- log_root = alloc_log_tree(trans, fs_info);
+ log_root = alloc_log_tree(fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
@@ -2786,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_init_scrub(fs_info);
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
+ btrfs_init_extent_map_shrinker_work(fs_info);
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
@@ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
- spin_lock_init(&fs_info->extent_map_shrinker_lock);
-
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -3202,8 +3200,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
return 0;
}
-int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
- const char *options)
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
{
u32 sectorsize;
u32 nodesize;
@@ -4186,7 +4183,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info,
"transaction %llu (with %llu dirty metadata bytes) is not committed",
trans->transid, dirty_bytes);
- btrfs_cleanup_one_transaction(trans, fs_info);
+ btrfs_cleanup_one_transaction(trans);
if (trans == fs_info->running_transaction)
fs_info->running_transaction = NULL;
@@ -4294,6 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
+ cancel_work_sync(&fs_info->em_shrinker_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4531,75 +4529,6 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
}
-static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct rb_node *node;
- struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
- struct btrfs_delayed_ref_node *ref;
-
- spin_lock(&delayed_refs->lock);
- while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
- struct btrfs_delayed_ref_head *head;
- struct rb_node *n;
- bool pin_bytes = false;
-
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
- if (btrfs_delayed_ref_lock(delayed_refs, head))
- continue;
-
- spin_lock(&head->lock);
- while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
- ref = rb_entry(n, struct btrfs_delayed_ref_node,
- ref_node);
- rb_erase_cached(&ref->ref_node, &head->ref_tree);
- RB_CLEAR_NODE(&ref->ref_node);
- if (!list_empty(&ref->add_list))
- list_del(&ref->add_list);
- atomic_dec(&delayed_refs->num_entries);
- btrfs_put_delayed_ref(ref);
- btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
- }
- if (head->must_insert_reserved)
- pin_bytes = true;
- btrfs_free_delayed_extent_op(head->extent_op);
- btrfs_delete_ref_head(delayed_refs, head);
- spin_unlock(&head->lock);
- spin_unlock(&delayed_refs->lock);
- mutex_unlock(&head->mutex);
-
- if (pin_bytes) {
- struct btrfs_block_group *cache;
-
- cache = btrfs_lookup_block_group(fs_info, head->bytenr);
- BUG_ON(!cache);
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->pinned += head->num_bytes;
- btrfs_space_info_update_bytes_pinned(fs_info,
- cache->space_info, head->num_bytes);
- cache->reserved -= head->num_bytes;
- cache->space_info->bytes_reserved -= head->num_bytes;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- btrfs_put_block_group(cache);
-
- btrfs_error_unpin_extent_range(fs_info, head->bytenr,
- head->bytenr + head->num_bytes - 1);
- }
- btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
- btrfs_put_delayed_ref_head(head);
- cond_resched();
- spin_lock(&delayed_refs->lock);
- }
- btrfs_qgroup_destroy_extent_records(trans);
-
- spin_unlock(&delayed_refs->lock);
-}
-
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
@@ -4805,9 +4734,9 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->fs_roots_radix_lock);
}
-void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans)
{
+ struct btrfs_fs_info *fs_info = cur_trans->fs_info;
struct btrfs_device *dev, *tmp;
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
@@ -4819,7 +4748,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
list_del_init(&dev->post_commit_list);
}
- btrfs_destroy_delayed_refs(cur_trans, fs_info);
+ btrfs_destroy_delayed_refs(cur_trans);
cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&fs_info->transaction_blocked_wait);
@@ -4865,7 +4794,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
} else {
spin_unlock(&fs_info->trans_lock);
}
- btrfs_cleanup_one_transaction(t, fs_info);
+ btrfs_cleanup_one_transaction(t);
spin_lock(&fs_info->trans_lock);
if (t == fs_info->running_transaction)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 99af64d..a7051e2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,8 +52,7 @@ struct extent_buffer *btrfs_find_create_tree_block(
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
-int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
- const char *options);
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb, int mirror_num);
@@ -127,8 +126,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
-void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
- struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d9f511b..412e318 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -182,7 +182,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs);
@@ -795,7 +795,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
if (insert) {
extra_size = btrfs_extent_inline_ref_size(want);
path->search_for_extension = 1;
- path->keep_locks = 1;
} else
extra_size = -1;
@@ -946,6 +945,25 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
ret = -EAGAIN;
goto out;
}
+
+ if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) {
+ struct btrfs_key tmp_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1);
+ if (tmp_key.objectid == bytenr &&
+ tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ goto out_no_entry;
+ }
+
+ if (!path->keep_locks) {
+ btrfs_release_path(path);
+ path->keep_locks = 1;
+ goto again;
+ }
+
/*
* To add new inline back ref, we have to make sure
* there is no corresponding back ref item.
@@ -959,13 +977,15 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
goto out;
}
}
+out_no_entry:
*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
- if (insert) {
+ if (path->keep_locks) {
path->keep_locks = 0;
- path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
+ if (insert)
+ path->search_for_extension = 0;
return ret;
}
@@ -1807,16 +1827,6 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
return ref;
}
-static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head)
-{
- spin_lock(&delayed_refs->lock);
- head->processing = false;
- delayed_refs->num_heads_ready++;
- spin_unlock(&delayed_refs->lock);
- btrfs_delayed_ref_unlock(head);
-}
-
static struct btrfs_delayed_extent_op *cleanup_extent_op(
struct btrfs_delayed_ref_head *head)
{
@@ -1891,7 +1901,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
ret = run_and_cleanup_extent_op(trans, head);
if (ret < 0) {
- unselect_delayed_ref_head(delayed_refs, head);
+ btrfs_unselect_ref_head(delayed_refs, head);
btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
return ret;
} else if (ret) {
@@ -1910,7 +1920,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
return 1;
}
- btrfs_delete_ref_head(delayed_refs, head);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
@@ -1933,39 +1943,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
return ret;
}
-static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
- struct btrfs_trans_handle *trans)
-{
- struct btrfs_delayed_ref_root *delayed_refs =
- &trans->transaction->delayed_refs;
- struct btrfs_delayed_ref_head *head = NULL;
- int ret;
-
- spin_lock(&delayed_refs->lock);
- head = btrfs_select_ref_head(delayed_refs);
- if (!head) {
- spin_unlock(&delayed_refs->lock);
- return head;
- }
-
- /*
- * Grab the lock that says we are going to process all the refs for
- * this head
- */
- ret = btrfs_delayed_ref_lock(delayed_refs, head);
- spin_unlock(&delayed_refs->lock);
-
- /*
- * We may have dropped the spin lock to get the head mutex lock, and
- * that might have given someone else time to free the head. If that's
- * true, it has been removed from our list and we can move on.
- */
- if (ret == -EAGAIN)
- head = ERR_PTR(-EAGAIN);
-
- return head;
-}
-
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *locked_ref,
u64 *bytes_released)
@@ -1986,7 +1963,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
- unselect_delayed_ref_head(delayed_refs, locked_ref);
+ btrfs_unselect_ref_head(delayed_refs, locked_ref);
return -EAGAIN;
}
@@ -2009,7 +1986,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
default:
WARN_ON(1);
}
- atomic_dec(&delayed_refs->num_entries);
/*
* Record the must_insert_reserved flag before we drop the
@@ -2035,7 +2011,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- unselect_delayed_ref_head(delayed_refs, locked_ref);
+ btrfs_unselect_ref_head(delayed_refs, locked_ref);
btrfs_put_delayed_ref(ref);
return ret;
}
@@ -2073,7 +2049,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
do {
if (!locked_ref) {
- locked_ref = btrfs_obtain_ref_head(trans);
+ locked_ref = btrfs_select_ref_head(fs_info, delayed_refs);
if (IS_ERR_OR_NULL(locked_ref)) {
if (PTR_ERR(locked_ref) == -EAGAIN) {
continue;
@@ -2220,7 +2196,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes)
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
+ if (xa_empty(&delayed_refs->head_refs)) {
spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -2275,7 +2251,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
delayed_refs = &cur_trans->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr);
if (!head) {
spin_unlock(&delayed_refs->lock);
btrfs_put_transaction(cur_trans);
@@ -3144,7 +3120,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
break;
}
- /* Quick path didn't find the EXTEMT/METADATA_ITEM */
+ /* Quick path didn't find the EXTENT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
@@ -3377,13 +3353,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
u64 bytenr)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr);
if (!head)
goto out_delayed_unlock;
@@ -3401,7 +3378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (!mutex_trylock(&head->mutex))
goto out;
- btrfs_delete_ref_head(delayed_refs, head);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
head->processing = false;
spin_unlock(&head->lock);
@@ -3411,7 +3388,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->must_insert_reserved)
ret = 1;
- btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
return ret;
@@ -5270,7 +5247,7 @@ struct walk_control {
* corrupted file systems must have been caught before calling this function.
*/
static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
- struct extent_buffer *eb, u64 refs, u64 flags, int slot)
+ struct extent_buffer *eb, u64 flags, int slot)
{
struct btrfs_key key;
u64 generation;
@@ -5384,7 +5361,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
/* If we don't need to visit this node don't reada. */
- if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
+ if (!visit_node_for_delete(root, wc, eb, flags, slot))
continue;
reada:
btrfs_readahead_node_child(eb, slot);
@@ -5518,7 +5495,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
*/
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr);
if (!head)
goto out;
if (!mutex_trylock(&head->mutex)) {
@@ -5737,8 +5714,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
/* If we don't have to walk into this node skip it. */
if (!visit_node_for_delete(root, wc, path->nodes[level],
- wc->refs[level - 1], wc->flags[level - 1],
- path->slots[level]))
+ wc->flags[level - 1], path->slots[level]))
goto skip;
/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 872cca5..b923d0c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info,
btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
if (folio != locked_folio && (page_ops & PAGE_UNLOCK))
- btrfs_folio_end_writer_lock(fs_info, folio, start, len);
+ btrfs_folio_end_lock(fs_info, folio, start, len);
}
static void __process_folios_contig(struct address_space *mapping,
@@ -276,7 +276,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
range_start = max_t(u64, folio_pos(folio), start);
range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
end + 1) - range_start;
- btrfs_folio_set_writer_lock(fs_info, folio, range_start, range_len);
+ btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
processed_end = range_start + range_len - 1;
}
@@ -438,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
if (!btrfs_is_subpage(fs_info, folio->mapping))
folio_unlock(folio);
else
- btrfs_subpage_end_reader(fs_info, folio, start, len);
+ btrfs_folio_end_lock(fs_info, folio, start, len);
}
/*
@@ -495,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
return;
ASSERT(folio_test_private(folio));
- btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE);
+ btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
}
/*
@@ -786,7 +786,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
}
if (bio_ctrl->wbc)
- wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page,
+ wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
len);
size -= len;
@@ -1102,6 +1102,45 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
return ret;
}
+static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap,
+ u64 start, u32 len)
+{
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+ const u64 folio_start = folio_pos(folio);
+ unsigned int start_bit;
+ unsigned int nbits;
+
+ ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+ start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
+ nbits = len >> fs_info->sectorsize_bits;
+ ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
+ bitmap_set(delalloc_bitmap, start_bit, nbits);
+}
+
+static bool find_next_delalloc_bitmap(struct folio *folio,
+ unsigned long *delalloc_bitmap, u64 start,
+ u64 *found_start, u32 *found_len)
+{
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+ const u64 folio_start = folio_pos(folio);
+ const unsigned int bitmap_size = fs_info->sectors_per_page;
+ unsigned int start_bit;
+ unsigned int first_zero;
+ unsigned int first_set;
+
+ ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+
+ start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
+ first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
+ if (first_set >= bitmap_size)
+ return false;
+
+ *found_start = folio_start + (first_set << fs_info->sectorsize_bits);
+ first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set);
+ *found_len = (first_zero - first_set) << fs_info->sectorsize_bits;
+ return true;
+}
+
/*
* helper for extent_writepage(), doing all of the delayed allocation setup.
*
@@ -1121,6 +1160,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
const u64 page_start = folio_pos(folio);
const u64 page_end = page_start + folio_size(folio) - 1;
+ unsigned long delalloc_bitmap = 0;
/*
* Save the last found delalloc end. As the delalloc end can go beyond
* page boundary, thus we cannot rely on subpage bitmap to locate the
@@ -1131,6 +1171,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
int ret = 0;
+ int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
@@ -1140,6 +1181,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
bio_ctrl->submit_bitmap = 1;
}
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ u64 start = page_start + (bit << fs_info->sectorsize_bits);
+
+ btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
+ }
+
/* Lock all (subpage) delalloc ranges inside the folio first. */
while (delalloc_start < page_end) {
delalloc_end = page_end;
@@ -1148,9 +1195,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = delalloc_end + 1;
continue;
}
- btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start,
- min(delalloc_end, page_end) + 1 -
- delalloc_start);
+ set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start,
+ min(delalloc_end, page_end) + 1 - delalloc_start);
last_delalloc_end = delalloc_end;
delalloc_start = delalloc_end + 1;
}
@@ -1175,7 +1221,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
found_len = last_delalloc_end + 1 - found_start;
found = true;
} else {
- found = btrfs_subpage_find_writer_locked(fs_info, folio,
+ found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
delalloc_start, &found_start, &found_len);
}
if (!found)
@@ -1314,7 +1360,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
* a folio for a range already written to disk.
*/
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
- btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
+ btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
/*
* Above call should set the whole folio with writeback flag, even
* just for a single subpage sector.
@@ -1391,8 +1437,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
goto out;
submitted_io = true;
}
-
- btrfs_folio_assert_not_dirty(fs_info, folio, start, len);
out:
/*
* If we didn't submitted any sector (>= i_size), folio dirty get
@@ -1476,7 +1520,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
* Only unlock ranges that are submitted. As there can be some async
* submitted ranges inside the folio.
*/
- btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
+ btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
ASSERT(ret <= 0);
return ret;
}
@@ -1708,7 +1752,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
ret = bio_add_folio(&bbio->bio, folio, eb->len,
eb->start - folio_pos(folio));
ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
+ wbc_account_cgroup_owner(wbc, folio, eb->len);
folio_unlock(folio);
} else {
int num_folios = num_extent_folios(eb);
@@ -1722,8 +1766,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
folio_start_writeback(folio);
ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
- eb->folio_size);
+ wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
wbc->nr_to_write -= folio_nr_pages(folio);
folio_unlock(folio);
}
@@ -2116,7 +2159,27 @@ static int extent_write_cache_pages(struct address_space *mapping,
continue;
}
- if (wbc->sync_mode != WB_SYNC_NONE) {
+ /*
+ * For subpage case, compression can lead to mixed
+ * writeback and dirty flags, e.g:
+ * 0 32K 64K 96K 128K
+ * | |//////||/////| |//|
+ *
+ * In above case, [32K, 96K) is asynchronously submitted
+ * for compression, and [124K, 128K) needs to be written back.
+ *
+ * If we didn't wait wrtiteback for page 64K, [128K, 128K)
+ * won't be submitted as the page still has writeback flag
+ * and will be skipped in the next check.
+ *
+ * This mixed writeback and dirty case is only possible for
+ * subpage case.
+ *
+ * TODO: Remove this check after migrating compression to
+ * regular submission.
+ */
+ if (wbc->sync_mode != WB_SYNC_NONE ||
+ btrfs_is_subpage(inode_to_fs_info(inode), mapping)) {
if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
folio_wait_writeback(folio);
@@ -2201,7 +2264,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
u32 cur_len = cur_end + 1 - cur;
struct folio *folio;
- folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0);
+ folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
/*
* This shouldn't happen, the pages are pinned and locked, this
@@ -2234,7 +2297,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
cur, cur_len, !ret);
mapping_set_error(mapping, ret);
}
- btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len);
+ btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
if (ret < 0)
found_error = true;
next_page:
@@ -2318,7 +2381,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* to drop the page.
*/
static bool try_release_extent_state(struct extent_io_tree *tree,
- struct folio *folio, gfp_t mask)
+ struct folio *folio)
{
u64 start = folio_pos(folio);
u64 end = start + PAGE_SIZE - 1;
@@ -2429,7 +2492,7 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
cond_resched();
}
}
- return try_release_extent_state(io_tree, folio, mask);
+ return try_release_extent_state(io_tree, folio);
}
static void __free_extent_buffer(struct extent_buffer *eb)
@@ -2443,7 +2506,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
+static bool folio_range_has_eb(struct folio *folio)
{
struct btrfs_subpage *subpage;
@@ -2453,12 +2516,6 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli
subpage = folio_get_private(folio);
if (atomic_read(&subpage->eb_refs))
return true;
- /*
- * Even there is no eb refs here, we may still have
- * end_folio_read() call relying on page::private.
- */
- if (atomic_read(&subpage->readers))
- return true;
}
return false;
}
@@ -2517,7 +2574,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* We can only detach the folio private if there are no other ebs in the
* page range and no unfinished IO.
*/
- if (!folio_range_has_eb(fs_info, folio))
+ if (!folio_range_has_eb(folio))
btrfs_detach_subpage(fs_info, folio);
spin_unlock(&folio->mapping->i_private_lock);
@@ -3122,7 +3179,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
/*
* Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
- * so it can be cleaned up without utlizing page->mapping.
+ * so it can be cleaned up without utilizing page->mapping.
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -4222,7 +4279,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level)
{
struct btrfs_tree_parent_check check = {
- .has_first_key = 0,
.level = level,
.transid = gen
};
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1d93e120..67ce85f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
-static void dec_evictable_extent_maps(struct btrfs_inode *inode)
+static void remove_em(struct btrfs_inode *inode, struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ rb_erase(&em->rb_node, &inode->extent_tree.root);
+ RB_CLEAR_NODE(&em->rb_node);
+
if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
percpu_counter_dec(&fs_info->evictable_extent_maps);
}
@@ -339,7 +342,6 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map
static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_map_tree *tree = &inode->extent_tree;
struct extent_map *merge = NULL;
struct rb_node *rb;
@@ -371,10 +373,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
em->flags |= EXTENT_FLAG_MERGED;
validate_extent_map(fs_info, em);
- rb_erase(&merge->rb_node, &tree->root);
- RB_CLEAR_NODE(&merge->rb_node);
+ remove_em(inode, merge);
free_extent_map(merge);
- dec_evictable_extent_maps(inode);
}
}
@@ -386,12 +386,10 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
merge_ondisk_extents(em, merge, em);
validate_extent_map(fs_info, em);
- rb_erase(&merge->rb_node, &tree->root);
- RB_CLEAR_NODE(&merge->rb_node);
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
+ remove_em(inode, merge);
free_extent_map(merge);
- dec_evictable_extent_maps(inode);
}
}
@@ -588,12 +586,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
lockdep_assert_held_write(&tree->lock);
WARN_ON(em->flags & EXTENT_FLAG_PINNED);
- rb_erase(&em->rb_node, &tree->root);
if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
- RB_CLEAR_NODE(&em->rb_node);
- dec_evictable_extent_maps(inode);
+ remove_em(inode, em);
}
static void replace_extent_mapping(struct btrfs_inode *inode,
@@ -1122,13 +1118,12 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
struct btrfs_em_shrink_ctx {
long nr_to_scan;
long scanned;
- u64 last_ino;
- u64 last_root;
};
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
- const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
long nr_dropped = 0;
struct rb_node *node;
@@ -1201,7 +1196,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
* lock. This is to avoid slowing other tasks trying to take the
* lock.
*/
- if (need_resched() || rwlock_needbreak(&tree->lock))
+ if (need_resched() || rwlock_needbreak(&tree->lock) ||
+ btrfs_fs_closing(fs_info))
break;
node = next;
}
@@ -1213,19 +1209,21 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
- u64 min_ino = ctx->last_ino + 1;
+ u64 min_ino = fs_info->em_shrinker_last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, ctx);
min_ino = btrfs_ino(inode) + 1;
- ctx->last_ino = btrfs_ino(inode);
+ fs_info->em_shrinker_last_ino = btrfs_ino(inode);
btrfs_add_delayed_iput(inode);
- if (ctx->scanned >= ctx->nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan ||
+ btrfs_fs_closing(inode->root->fs_info))
break;
cond_resched();
@@ -1241,52 +1239,43 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
* inode if there is one or we will find out this was the last
* one and move to the next root.
*/
- ctx->last_root = btrfs_root_id(root);
+ fs_info->em_shrinker_last_root = btrfs_root_id(root);
} else {
/*
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
* that when processing the next root we start from its first inode.
*/
- ctx->last_ino = 0;
- ctx->last_root = btrfs_root_id(root) + 1;
+ fs_info->em_shrinker_last_ino = 0;
+ fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1;
}
return nr_dropped;
}
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
{
+ struct btrfs_fs_info *fs_info;
struct btrfs_em_shrink_ctx ctx;
u64 start_root_id;
u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
+ fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work);
+
ctx.scanned = 0;
- ctx.nr_to_scan = nr_to_scan;
+ ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan);
- /*
- * In case we have multiple tasks running this shrinker, make the next
- * one start from the next inode in case it starts before we finish.
- */
- spin_lock(&fs_info->extent_map_shrinker_lock);
- ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
- fs_info->extent_map_shrinker_last_ino++;
- ctx.last_root = fs_info->extent_map_shrinker_last_root;
- spin_unlock(&fs_info->extent_map_shrinker_lock);
-
- start_root_id = ctx.last_root;
- next_root_id = ctx.last_root;
+ start_root_id = fs_info->em_shrinker_last_root;
+ next_root_id = fs_info->em_shrinker_last_root;
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
- nr, ctx.last_root,
- ctx.last_ino);
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr);
}
- while (ctx.scanned < ctx.nr_to_scan) {
+ while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
struct btrfs_root *root;
unsigned long count;
@@ -1300,8 +1289,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
spin_unlock(&fs_info->fs_roots_radix_lock);
if (start_root_id > 0 && !cycled) {
next_root_id = 0;
- ctx.last_root = 0;
- ctx.last_ino = 0;
+ fs_info->em_shrinker_last_root = 0;
+ fs_info->em_shrinker_last_ino = 0;
cycled = true;
continue;
}
@@ -1320,29 +1309,40 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
btrfs_put_root(root);
}
- /*
- * In case of multiple tasks running this extent map shrinking code this
- * isn't perfect but it's simple and silences things like KCSAN. It's
- * not possible to know which task made more progress because we can
- * cycle back to the first root and first inode if it's not the first
- * time the shrinker ran, see the above logic. Also a task that started
- * later may finish ealier than another task and made less progress. So
- * make this simple and update to the progress of the last task that
- * finished, with the occasional possiblity of having two consecutive
- * runs of the shrinker process the same inodes.
- */
- spin_lock(&fs_info->extent_map_shrinker_lock);
- fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
- fs_info->extent_map_shrinker_last_root = ctx.last_root;
- spin_unlock(&fs_info->extent_map_shrinker_lock);
-
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
- nr, ctx.last_root,
- ctx.last_ino);
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
}
- return nr_dropped;
+ atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
+}
+
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+ /*
+ * Do nothing if the shrinker is already running. In case of high memory
+ * pressure we can have a lot of tasks calling us and all passing the
+ * same nr_to_scan value, but in reality we may need only to free
+ * nr_to_scan extent maps (or less). In case we need to free more than
+ * that, we will be called again by the fs shrinker, so no worries about
+ * not doing enough work to reclaim memory from extent maps.
+ * We can also be repeatedly called with the same nr_to_scan value
+ * simply because the shrinker runs asynchronously and multiple calls
+ * to this function are made before the shrinker does enough progress.
+ *
+ * That's why we set the atomic counter to nr_to_scan only if its
+ * current value is zero, instead of incrementing the counter by
+ * nr_to_scan.
+ */
+ if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
+ return;
+
+ queue_work(system_unbound_wq, &fs_info->em_shrinker_work);
+}
+
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
+{
+ atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
+ INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker);
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 5154a8f..cd123b2 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map *new_em,
bool modified);
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index df7f09f..b80c07a 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* we have in the cache is the last delalloc range we
* found while the file extent item we found can be
* either for a whole delalloc range we previously
- * emmitted or only a part of that range.
+ * emitted or only a part of that range.
*
* We have two cases here:
*
@@ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* cached extent's end. In this case just ignore the
* current file extent item because we don't want to
* overlap with previous ranges that may have been
- * emmitted already;
+ * emitted already;
*
* 2) The file extent item starts behind the currently
* cached extent but its end offset goes beyond the
* end offset of the cached extent. We don't want to
* overlap with a previous range that may have been
- * emmitted already, so we emit the currently cached
+ * emitted already, so we emit the currently cached
* extent and then partially store the current file
* extent item's range in the cache, for the subrange
* going the cached extent's end to the end of the
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4fb521d..588c353 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -37,33 +37,30 @@
#include "file.h"
#include "super.h"
-/* simple helper to fault in pages and copy. This should go away
- * and be replaced with calls into generic code.
+/*
+ * Helper to fault in page and copy. This should go away and be replaced with
+ * calls into generic code.
*/
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
- struct page **prepared_pages,
- struct iov_iter *i)
+ struct folio *folio, struct iov_iter *i)
{
size_t copied = 0;
size_t total_copied = 0;
- int pg = 0;
int offset = offset_in_page(pos);
while (write_bytes > 0) {
- size_t count = min_t(size_t,
- PAGE_SIZE - offset, write_bytes);
- struct page *page = prepared_pages[pg];
+ size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);
/*
* Copy data from userspace to the current page
*/
- copied = copy_page_from_iter_atomic(page, offset, count, i);
+ copied = copy_folio_from_iter_atomic(folio, offset, count, i);
/* Flush processor's dcache for this page */
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
/*
* if we get a partial write, we can end up with
- * partially up to date pages. These add
+ * partially up to date page. These add
* a lot of complexity, so make sure they don't
* happen by forcing this copy to be retried.
*
@@ -71,7 +68,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
* back to page at a time copies after we return 0.
*/
if (unlikely(copied < count)) {
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
iov_iter_revert(i, copied);
copied = 0;
}
@@ -82,54 +79,44 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
write_bytes -= copied;
total_copied += copied;
offset += copied;
- if (offset == PAGE_SIZE) {
- pg++;
- offset = 0;
- }
}
return total_copied;
}
/*
- * unlocks pages after btrfs_file_write is done with them
+ * Unlock folio after btrfs_file_write() is done with it.
*/
-static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
- struct page **pages, size_t num_pages,
+static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
u64 pos, u64 copied)
{
- size_t i;
u64 block_start = round_down(pos, fs_info->sectorsize);
u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
ASSERT(block_len <= U32_MAX);
- for (i = 0; i < num_pages; i++) {
- /* page checked is some magic around finding pages that
- * have been modified without going through btrfs_set_page_dirty
- * clear it here. There should be no need to mark the pages
- * accessed as prepare_pages should have marked them accessed
- * in prepare_pages via find_or_create_page()
- */
- btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
- block_start, block_len);
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ /*
+ * Folio checked is some magic around finding folios that have been
+ * modified without going through btrfs_dirty_folio(). Clear it here.
+ * There should be no need to mark the pages accessed as
+ * prepare_one_folio() should have marked them accessed in
+ * prepare_one_folio() via find_or_create_page()
+ */
+ btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
+ folio_unlock(folio);
+ folio_put(folio);
}
/*
* After btrfs_copy_from_user(), update the following things for delalloc:
- * - Mark newly dirtied pages as DELALLOC in the io tree.
+ * - Mark newly dirtied folio as DELALLOC in the io tree.
* Used to advise which range is to be written back.
- * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
+ * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
* - Update inode size for past EOF write
*/
-int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached, bool noreserve)
+int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
+ size_t write_bytes, struct extent_state **cached, bool noreserve)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
- int i;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
@@ -147,6 +134,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX);
+ ASSERT(folio_pos(folio) <= pos &&
+ folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
end_of_last_block = start_pos + num_bytes - 1;
@@ -163,16 +152,9 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
if (ret)
return ret;
- for (i = 0; i < num_pages; i++) {
- struct page *p = pages[i];
-
- btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
- start_pos, num_bytes);
- btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
- start_pos, num_bytes);
- btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
- start_pos, num_bytes);
- }
+ btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
+ btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
+ btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
/*
* we've only changed i_size in ram, and we haven't updated
@@ -851,55 +833,49 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
}
/*
- * on error we return an unlocked page and the error value
- * on success we return a locked page and 0
+ * On error return an unlocked folio and the error value
+ * On success return a locked folio and 0
*/
-static int prepare_uptodate_page(struct inode *inode,
- struct page *page, u64 pos,
- bool force_uptodate)
+static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
+ u64 len, bool force_uptodate)
{
- struct folio *folio = page_folio(page);
+ u64 clamp_start = max_t(u64, pos, folio_pos(folio));
+ u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
int ret = 0;
- if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
- !PageUptodate(page)) {
- ret = btrfs_read_folio(NULL, folio);
- if (ret)
- return ret;
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- return -EIO;
- }
+ if (folio_test_uptodate(folio))
+ return 0;
- /*
- * Since btrfs_read_folio() will unlock the folio before it
- * returns, there is a window where btrfs_release_folio() can be
- * called to release the page. Here we check both inode
- * mapping and PagePrivate() to make sure the page was not
- * released.
- *
- * The private flag check is essential for subpage as we need
- * to store extra bitmap using folio private.
- */
- if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
- unlock_page(page);
- return -EAGAIN;
- }
+ if (!force_uptodate &&
+ IS_ALIGNED(clamp_start, PAGE_SIZE) &&
+ IS_ALIGNED(clamp_end, PAGE_SIZE))
+ return 0;
+
+ ret = btrfs_read_folio(NULL, folio);
+ if (ret)
+ return ret;
+ folio_lock(folio);
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ return -EIO;
+ }
+
+ /*
+ * Since btrfs_read_folio() will unlock the folio before it returns,
+ * there is a window where btrfs_release_folio() can be called to
+ * release the page. Here we check both inode mapping and page
+ * private to make sure the page was not released.
+ *
+ * The private flag check is essential for subpage as we need to store
+ * extra bitmap using folio private.
+ */
+ if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
+ folio_unlock(folio);
+ return -EAGAIN;
}
return 0;
}
-static fgf_t get_prepare_fgp_flags(bool nowait)
-{
- fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
-
- if (nowait)
- fgp_flags |= FGP_NOWAIT;
-
- return fgp_flags;
-}
-
static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
{
gfp_t gfp;
@@ -914,89 +890,67 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
}
/*
- * this just gets pages into the page cache and locks them down.
+ * Get folio into the page cache and lock it.
*/
-static noinline int prepare_pages(struct inode *inode, struct page **pages,
- size_t num_pages, loff_t pos,
- size_t write_bytes, bool force_uptodate,
- bool nowait)
+static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
+ loff_t pos, size_t write_bytes,
+ bool force_uptodate, bool nowait)
{
- int i;
unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
- fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
+ fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
+ struct folio *folio;
int ret = 0;
- int faili;
- for (i = 0; i < num_pages; i++) {
again:
- pages[i] = pagecache_get_page(inode->i_mapping, index + i,
- fgp_flags, mask | __GFP_WRITE);
- if (!pages[i]) {
- faili = i - 1;
- if (nowait)
- ret = -EAGAIN;
- else
- ret = -ENOMEM;
- goto fail;
- }
-
- ret = set_page_extent_mapped(pages[i]);
- if (ret < 0) {
- faili = i;
- goto fail;
- }
-
- if (i == 0)
- ret = prepare_uptodate_page(inode, pages[i], pos,
- force_uptodate);
- if (!ret && i == num_pages - 1)
- ret = prepare_uptodate_page(inode, pages[i],
- pos + write_bytes, false);
- if (ret) {
- put_page(pages[i]);
- if (!nowait && ret == -EAGAIN) {
- ret = 0;
- goto again;
- }
- faili = i - 1;
- goto fail;
- }
- wait_on_page_writeback(pages[i]);
+ folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
+ if (IS_ERR(folio)) {
+ if (nowait)
+ ret = -EAGAIN;
+ else
+ ret = PTR_ERR(folio);
+ return ret;
}
-
+ /* Only support page sized folio yet. */
+ ASSERT(folio_order(folio) == 0);
+ ret = set_folio_extent_mapped(folio);
+ if (ret < 0) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+ }
+ ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
+ if (ret) {
+ /* The folio is already unlocked. */
+ folio_put(folio);
+ if (!nowait && ret == -EAGAIN) {
+ ret = 0;
+ goto again;
+ }
+ return ret;
+ }
+ *folio_ret = folio;
return 0;
-fail:
- while (faili >= 0) {
- unlock_page(pages[faili]);
- put_page(pages[faili]);
- faili--;
- }
- return ret;
-
}
/*
- * This function locks the extent and properly waits for data=ordered extents
- * to finish before allowing the pages to be modified if need.
+ * Locks the extent and properly waits for data=ordered extents to finish
+ * before allowing the folios to be modified if need.
*
- * The return value:
+ * Return:
* 1 - the extent is locked
* 0 - the extent is not locked, and everything is OK
- * -EAGAIN - need re-prepare the pages
- * the other < 0 number - Something wrong happens
+ * -EAGAIN - need to prepare the folios again
*/
static noinline int
-lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos,
- size_t write_bytes,
+lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
+ loff_t pos, size_t write_bytes,
u64 *lockstart, u64 *lockend, bool nowait,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start_pos;
u64 last_pos;
- int i;
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
@@ -1008,12 +962,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
if (nowait) {
if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
cached_state)) {
- for (i = 0; i < num_pages; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- pages[i] = NULL;
- }
-
+ folio_unlock(folio);
+ folio_put(folio);
return -EAGAIN;
}
} else {
@@ -1027,10 +977,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
ordered->file_offset <= last_pos) {
unlock_extent(&inode->io_tree, start_pos, last_pos,
cached_state);
- for (i = 0; i < num_pages; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ folio_unlock(folio);
+ folio_put(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
@@ -1044,11 +992,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
/*
- * We should be called after prepare_pages() which should have locked
+ * We should be called after prepare_one_folio() which should have locked
* all pages in the range.
*/
- for (i = 0; i < num_pages; i++)
- WARN_ON(!PageLocked(pages[i]));
+ WARN_ON(!folio_test_locked(folio));
return ret;
}
@@ -1120,27 +1067,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
-static void update_time_for_write(struct inode *inode)
-{
- struct timespec64 now, ts;
-
- if (IS_NOCMTIME(inode))
- return;
-
- now = current_time(inode);
- ts = inode_get_mtime(inode);
- if (!timespec64_equal(&ts, &now))
- inode_set_mtime_to_ts(inode, now);
-
- ts = inode_get_ctime(inode);
- if (!timespec64_equal(&ts, &now))
- inode_set_ctime_to_ts(inode, now);
-
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
-}
-
-int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
+int btrfs_write_check(struct kiocb *iocb, size_t count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -1170,7 +1097,10 @@ int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
* need to start yet another transaction to update the inode as we will
* update the inode when we finish writing whatever data we write.
*/
- update_time_for_write(inode);
+ if (!IS_NOCMTIME(inode)) {
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ inode_inc_iversion(inode);
+ }
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
@@ -1192,20 +1122,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
loff_t pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
size_t num_written = 0;
- int nrptrs;
ssize_t ret;
- bool only_release_metadata = false;
- bool force_page_uptodate = false;
loff_t old_isize = i_size_read(inode);
unsigned int ilock_flags = 0;
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
+ bool only_release_metadata = false;
if (nowait)
ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1218,38 +1145,26 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
if (ret <= 0)
goto out;
- ret = btrfs_write_check(iocb, i, ret);
+ ret = btrfs_write_check(iocb, ret);
if (ret < 0)
goto out;
pos = iocb->ki_pos;
- nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
- PAGE_SIZE / (sizeof(struct page *)));
- nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
- nrptrs = max(nrptrs, 8);
- pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out;
- }
-
while (iov_iter_count(i) > 0) {
struct extent_state *cached_state = NULL;
size_t offset = offset_in_page(pos);
size_t sector_offset;
- size_t write_bytes = min(iov_iter_count(i),
- nrptrs * (size_t)PAGE_SIZE -
- offset);
- size_t num_pages;
+ size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
size_t reserve_bytes;
- size_t dirty_pages;
size_t copied;
size_t dirty_sectors;
size_t num_sectors;
+ struct folio *folio = NULL;
int extents_locked;
+ bool force_page_uptodate = false;
/*
- * Fault pages before locking them in prepare_pages
+ * Fault pages before locking them in prepare_one_folio()
* to avoid recursive lock
*/
if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
@@ -1288,8 +1203,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
only_release_metadata = true;
}
- num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
- WARN_ON(num_pages > nrptrs);
reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
WARN_ON(reserve_bytes == 0);
@@ -1317,23 +1230,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
break;
}
- /*
- * This is going to setup the pages array with the number of
- * pages we want, so we don't really need to worry about the
- * contents of pages from loop to loop
- */
- ret = prepare_pages(inode, pages, num_pages,
- pos, write_bytes, force_page_uptodate, false);
+ ret = prepare_one_folio(inode, &folio, pos, write_bytes,
+ force_page_uptodate, false);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
break;
}
- extents_locked = lock_and_cleanup_extent_if_need(
- BTRFS_I(inode), pages,
- num_pages, pos, write_bytes, &lockstart,
- &lockend, nowait, &cached_state);
+ extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
+ folio, pos, write_bytes, &lockstart,
+ &lockend, nowait, &cached_state);
if (extents_locked < 0) {
if (!nowait && extents_locked == -EAGAIN)
goto again;
@@ -1344,28 +1251,18 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
break;
}
- copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
+ copied = btrfs_copy_from_user(pos, write_bytes, folio, i);
num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
dirty_sectors = round_up(copied + sector_offset,
fs_info->sectorsize);
dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
- /*
- * if we have trouble faulting in the pages, fall
- * back to one page at a time
- */
- if (copied < write_bytes)
- nrptrs = 1;
-
if (copied == 0) {
force_page_uptodate = true;
dirty_sectors = 0;
- dirty_pages = 0;
} else {
force_page_uptodate = false;
- dirty_pages = DIV_ROUND_UP(copied + offset,
- PAGE_SIZE);
}
if (num_sectors > dirty_sectors) {
@@ -1375,13 +1272,10 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
- u64 __pos;
-
- __pos = round_down(pos,
- fs_info->sectorsize) +
- (dirty_pages << PAGE_SHIFT);
+ u64 release_start = round_up(pos + copied,
+ fs_info->sectorsize);
btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, __pos,
+ data_reserved, release_start,
release_bytes, true);
}
}
@@ -1389,15 +1283,14 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
release_bytes = round_up(copied + sector_offset,
fs_info->sectorsize);
- ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
- dirty_pages, pos, copied,
+ ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
&cached_state, only_release_metadata);
/*
* If we have not locked the extent range, because the range's
* start offset is >= i_size, we might still have a non-NULL
* cached extent state, acquired while marking the extent range
- * as delalloc through btrfs_dirty_pages(). Therefore free any
+ * as delalloc through btrfs_dirty_page(). Therefore free any
* possible cached extent state to avoid a memory leak.
*/
if (extents_locked)
@@ -1408,7 +1301,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
- btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
+ btrfs_drop_folio(fs_info, folio, pos, copied);
break;
}
@@ -1416,7 +1309,7 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
+ btrfs_drop_folio(fs_info, folio, pos, copied);
cond_resched();
@@ -1424,8 +1317,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
num_written += copied;
}
- kfree(pages);
-
if (release_bytes) {
if (only_release_metadata) {
btrfs_check_nocow_unlock(BTRFS_I(inode));
@@ -1470,7 +1361,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (ret || encoded->len == 0)
goto out;
- ret = btrfs_write_check(iocb, from, encoded->len);
+ ret = btrfs_write_check(iocb, encoded->len);
if (ret < 0)
goto out;
@@ -3802,6 +3693,7 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_compat_ioctl,
#endif
.remap_file_range = btrfs_remap_file_range,
+ .uring_cmd = btrfs_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index 912254e..de89e64 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -34,9 +34,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file);
-int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached, bool noreserve);
+int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
+ size_t write_bytes, struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait);
@@ -44,7 +43,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret);
-int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count);
+int btrfs_write_check(struct kiocb *iocb, size_t count);
ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i);
#endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f4bcb25..cfa52ef 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -11,6 +11,7 @@
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
+#include <linux/string_choices.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
@@ -1387,6 +1388,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
int bitmaps = 0;
int ret;
int must_iput = 0;
+ int i_size;
if (!i_size_read(inode))
return -EIO;
@@ -1457,11 +1459,16 @@ static int __btrfs_write_out_cache(struct inode *inode,
io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
- io_ctl->num_pages, 0, i_size_read(inode),
- &cached_state, false);
- if (ret)
- goto out_nospc;
+ i_size = i_size_read(inode);
+ for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) {
+ u64 dirty_start = i * PAGE_SIZE;
+ u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start;
+
+ ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]),
+ dirty_start, dirty_len, &cached_state, false);
+ if (ret < 0)
+ goto out_nospc;
+ }
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
@@ -2936,12 +2943,11 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
if (info->bytes >= bytes && !block_group->ro)
count++;
btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
- info->offset, info->bytes,
- (info->bitmap) ? "yes" : "no");
+ info->offset, info->bytes, str_yes_no(info->bitmap));
}
spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s",
- list_empty(&block_group->cluster_list) ? "no" : "yes");
+ str_no_yes(list_empty(&block_group->cluster_list)));
btrfs_info(fs_info,
"%d free space entries at or bigger than %llu bytes",
count, bytes);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 79f64e3..79a1a3d 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -263,10 +263,10 @@ enum {
BTRFS_FEATURE_INCOMPAT_ZONED | \
BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/*
* Features under developmen like Extent tree v2 support is enabled
- * only under CONFIG_BTRFS_DEBUG.
+ * only under CONFIG_BTRFS_EXPERIMENTAL
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
@@ -317,6 +317,8 @@ struct btrfs_dev_replace {
struct percpu_counter bio_counter;
wait_queue_head_t replace_wait;
+
+ struct task_struct *replace_task;
};
/*
@@ -633,9 +635,10 @@ struct btrfs_fs_info {
s32 delalloc_batch;
struct percpu_counter evictable_extent_maps;
- spinlock_t extent_map_shrinker_lock;
- u64 extent_map_shrinker_last_root;
- u64 extent_map_shrinker_last_ino;
+ u64 em_shrinker_last_root;
+ u64 em_shrinker_last_ino;
+ atomic64_t em_shrinker_nr_to_scan;
+ struct work_struct em_shrinker_work;
/* Protected by 'trans_lock'. */
struct list_head dirty_cowonly_roots;
@@ -876,12 +879,9 @@ struct btrfs_fs_info {
#endif
};
-#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \
- struct page *: (_page))->mapping->host))
#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \
struct folio *: (_folio))->mapping->host))
-#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info)
#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info)
#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e4ca1e..03fe0de 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -421,7 +421,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
index++;
continue;
}
- folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
index++;
if (IS_ERR(folio))
continue;
@@ -556,8 +556,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
} else {
struct folio *folio;
- folio = __filemap_get_folio(inode->vfs_inode.i_mapping,
- 0, 0, 0);
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0);
ASSERT(!IS_ERR(folio));
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_local_folio(folio, 0);
@@ -646,7 +645,7 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
* If being used directly, you must have already checked we're allowed to cow
* the range by getting true from can_cow_file_range_inline().
*/
-static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
+static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
u64 size, size_t compressed_size,
int compress_type,
struct folio *compressed_folio,
@@ -736,7 +735,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode,
return 1;
lock_extent(&inode->io_tree, offset, end, &cached);
- ret = __cow_file_range_inline(inode, offset, size, compressed_size,
+ ret = __cow_file_range_inline(inode, size, compressed_size,
compress_type, compressed_folio,
update_i_size);
if (ret > 0) {
@@ -832,32 +831,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
return 0;
}
/*
- * Special check for subpage.
+ * Only enable sector perfect compression for experimental builds.
*
- * We lock the full page then run each delalloc range in the page, thus
- * for the following case, we will hit some subpage specific corner case:
+ * This is a big feature change for subpage cases, and can hit
+ * different corner cases, so only limit this feature for
+ * experimental build for now.
*
- * 0 32K 64K
- * | |///////| |///////|
- * \- A \- B
- *
- * In above case, both range A and range B will try to unlock the full
- * page [0, 64K), causing the one finished later will have page
- * unlocked already, triggering various page lock requirement BUG_ON()s.
- *
- * So here we add an artificial limit that subpage compression can only
- * if the range is fully page aligned.
- *
- * In theory we only need to ensure the first page is fully covered, but
- * the tailing partial page will be locked until the full compression
- * finishes, delaying the write of other range.
- *
- * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
- * first to prevent any submitted async extent to unlock the full page.
- * By this, we can ensure for subpage case that only the last async_cow
- * will unlock the full page.
+ * ETA for moving this out of experimental builds is 6.15.
*/
- if (fs_info->sectorsize < PAGE_SIZE) {
+ if (fs_info->sectorsize < PAGE_SIZE &&
+ !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
if (!PAGE_ALIGNED(start) ||
!PAGE_ALIGNED(end + 1))
return 0;
@@ -896,13 +879,14 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e
for (unsigned long index = start >> PAGE_SHIFT;
index <= end_index; index++) {
- folio = __filemap_get_folio(inode->i_mapping, index, 0, 0);
+ folio = filemap_get_folio(inode->i_mapping, index);
if (IS_ERR(folio)) {
if (!ret)
ret = PTR_ERR(folio);
continue;
}
- folio_clear_dirty_for_io(folio);
+ btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start,
+ end + 1 - start);
folio_put(folio);
}
return ret;
@@ -1001,17 +985,6 @@ static void compress_file_range(struct btrfs_work *work)
(start > 0 || end + 1 < inode->disk_i_size))
goto cleanup_and_bail_uncompressed;
- /*
- * For subpage case, we require full page alignment for the sector
- * aligned range.
- * Thus we must also check against @actual_end, not just @end.
- */
- if (blocksize < PAGE_SIZE) {
- if (!PAGE_ALIGNED(start) ||
- !PAGE_ALIGNED(round_up(actual_end, blocksize)))
- goto cleanup_and_bail_uncompressed;
- }
-
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
@@ -1359,7 +1332,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
- unsigned long ram_size;
u64 cur_alloc_size = 0;
u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
@@ -1367,7 +1339,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
struct extent_map *em;
unsigned clear_bits;
unsigned long page_ops;
- bool extent_reserved = false;
int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
@@ -1421,8 +1392,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
struct btrfs_ordered_extent *ordered;
struct btrfs_file_extent file_extent;
- cur_alloc_size = num_bytes;
- ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
+ ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret == -EAGAIN) {
@@ -1453,9 +1423,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret < 0)
goto out_unlock;
cur_alloc_size = ins.offset;
- extent_reserved = true;
- ram_size = ins.offset;
file_extent.disk_bytenr = ins.objectid;
file_extent.disk_num_bytes = ins.offset;
file_extent.num_bytes = ins.offset;
@@ -1463,14 +1431,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
file_extent.offset = 0;
file_extent.compression = BTRFS_COMPRESS_NONE;
- lock_extent(&inode->io_tree, start, start + ram_size - 1,
+ lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
&cached);
em = btrfs_create_io_em(inode, start, &file_extent,
BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) {
unlock_extent(&inode->io_tree, start,
- start + ram_size - 1, &cached);
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
@@ -1480,7 +1448,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
1 << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
unlock_extent(&inode->io_tree, start,
- start + ram_size - 1, &cached);
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1501,7 +1469,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
*/
if (ret)
btrfs_drop_extent_map_range(inode, start,
- start + ram_size - 1,
+ start + cur_alloc_size - 1,
false);
}
btrfs_put_ordered_extent(ordered);
@@ -1513,13 +1481,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* (which the caller expects to stay locked), don't clear any
* dirty bits and don't set any writeback bits
*
- * Do set the Ordered (Private2) bit so we know this page was
+ * Do set the Ordered flag so we know this page was
* properly setup for writepage.
*/
page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
- extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
@@ -1529,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
- extent_reserved = false;
+ cur_alloc_size = 0;
/*
* btrfs_reloc_clone_csums() error, since start is increased
@@ -1545,7 +1513,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
+ btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
@@ -1599,13 +1567,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* to decrement again the data space_info's bytes_may_use counter,
* therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/
- if (extent_reserved) {
+ if (cur_alloc_size) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
locked_folio, &cached, clear_bits,
page_ops);
btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
- start += cur_alloc_size;
}
/*
@@ -1614,11 +1581,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* space_info's bytes_may_use counter, reserved in
* btrfs_check_data_free_space().
*/
- if (start < end) {
+ if (start + cur_alloc_size < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
- extent_clear_unlock_delalloc(inode, start, end, locked_folio,
+ extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
+ end, locked_folio,
&cached, clear_bits, page_ops);
- btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL);
+ btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
+ end - start - cur_alloc_size + 1, NULL);
}
return ret;
}
@@ -1729,7 +1698,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* need full accuracy. Just account the whole thing
* against the first page.
*/
- wbc_account_cgroup_owner(wbc, &locked_folio->page,
+ wbc_account_cgroup_owner(wbc, locked_folio,
cur_end - start);
async_chunk[i].locked_folio = locked_folio;
locked_folio = NULL;
@@ -3094,34 +3063,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
- BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
-
- btrfs_inode_safe_disk_i_size_write(inode, 0);
- if (freespace_inode)
- trans = btrfs_join_transaction_spacecache(root);
- else
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
- trans->block_rsv = &inode->block_rsv;
- ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, ret);
-
- ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret)
- btrfs_abort_transaction(trans, ret);
-
- goto out;
- }
-
- clear_bits |= EXTENT_LOCKED;
- lock_extent(io_tree, start, end, &cached_state);
-
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
@@ -3135,8 +3076,31 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ /* Logic error */
+ ASSERT(list_empty(&ordered_extent->list));
+ if (!list_empty(&ordered_extent->list)) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
+ ret = btrfs_update_inode_fallback(trans, inode);
+ if (ret) {
+ /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, ret);
+ }
+ goto out;
+ }
+
+ clear_bits |= EXTENT_LOCKED;
+ lock_extent(io_tree, start, end, &cached_state);
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
@@ -3791,14 +3755,45 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
return 0;
}
+static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_inode *existing;
+ const u64 ino = btrfs_ino(inode);
+ int ret;
+
+ if (inode_unhashed(&inode->vfs_inode))
+ return 0;
+
+ if (prealloc) {
+ ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+
+ existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
+
+ if (xa_is_err(existing)) {
+ ret = xa_err(existing);
+ ASSERT(ret != -EINVAL);
+ ASSERT(ret != -ENOMEM);
+ return ret;
+ } else if (existing) {
+ WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
+ }
+
+ return 0;
+}
+
/*
- * read an inode from the btree into the in-memory inode
+ * Read a locked inode from the btree into the in-memory inode and add it to
+ * its root list/tree.
+ *
+ * On failure clean up the inode.
*/
-static int btrfs_read_locked_inode(struct inode *inode,
- struct btrfs_path *in_path)
+static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path = in_path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3812,25 +3807,25 @@ static int btrfs_read_locked_inode(struct inode *inode,
ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
if (ret)
- return ret;
+ goto out;
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
- if (!path) {
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- }
+ ASSERT(path);
btrfs_get_inode_key(BTRFS_I(inode), &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
- if (path != in_path)
- btrfs_free_path(path);
- return ret;
+ /*
+ * ret > 0 can come from btrfs_search_slot called by
+ * btrfs_lookup_inode(), this means the inode was not found.
+ */
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
}
leaf = path->nodes[0];
@@ -3965,8 +3960,6 @@ static int btrfs_read_locked_inode(struct inode *inode,
btrfs_ino(BTRFS_I(inode)),
btrfs_root_id(root), ret);
}
- if (path != in_path)
- btrfs_free_path(path);
if (!maybe_acls)
cache_no_acl(inode);
@@ -3993,7 +3986,15 @@ static int btrfs_read_locked_inode(struct inode *inode,
}
btrfs_sync_inode_flags_to_i_flags(inode);
+
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
+ if (ret)
+ goto out;
+
return 0;
+out:
+ iget_failed(inode);
+ return ret;
}
/*
@@ -5502,35 +5503,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
return err;
}
-static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_inode *existing;
- const u64 ino = btrfs_ino(inode);
- int ret;
- if (inode_unhashed(&inode->vfs_inode))
- return 0;
-
- if (prealloc) {
- ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
- if (ret)
- return ret;
- }
-
- existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
-
- if (xa_is_err(existing)) {
- ret = xa_err(existing);
- ASSERT(ret != -EINVAL);
- ASSERT(ret != -ENOMEM);
- return ret;
- } else if (existing) {
- WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
- }
-
- return 0;
-}
static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
{
@@ -5592,10 +5565,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
}
/*
- * Get an inode object given its inode number and corresponding root.
- * Path can be preallocated to prevent recursing back to iget through
- * allocator. NULL is also valid but may require an additional allocation
- * later.
+ * Get an inode object given its inode number and corresponding root. Path is
+ * preallocated to prevent recursing back to iget through allocator.
*/
struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
struct btrfs_path *path)
@@ -5611,30 +5582,40 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
return inode;
ret = btrfs_read_locked_inode(inode, path);
- /*
- * ret > 0 can come from btrfs_search_slot called by
- * btrfs_read_locked_inode(), this means the inode item was not found.
- */
- if (ret > 0)
- ret = -ENOENT;
- if (ret < 0)
- goto error;
-
- ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
- if (ret < 0)
- goto error;
+ if (ret)
+ return ERR_PTR(ret);
unlock_new_inode(inode);
-
return inode;
-error:
- iget_failed(inode);
- return ERR_PTR(ret);
}
+/*
+ * Get an inode object given its inode number and corresponding root.
+ */
struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- return btrfs_iget_path(ino, root, NULL);
+ struct inode *inode;
+ struct btrfs_path *path;
+ int ret;
+
+ inode = btrfs_iget_locked(ino, root);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = btrfs_read_locked_inode(inode, path);
+ btrfs_free_path(path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ unlock_new_inode(inode);
+ return inode;
}
static struct inode *new_simple_dir(struct inode *dir,
@@ -6023,7 +6004,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
* offset. This means that new entries created during readdir
* are *guaranteed* to be seen in the future by that readdir.
* This has broken buggy programs which operate on names as
- * they're returned by readdir. Until we re-use freed offsets
+ * they're returned by readdir. Until we reuse freed offsets
* we have this hack to stop new entries from being returned
* under the assumption that they'll never reach this huge
* offset.
@@ -6765,8 +6746,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
return ret;
}
-static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
- struct folio *folio)
+static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
{
struct btrfs_file_extent_item *fi;
void *kaddr;
@@ -6964,7 +6944,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
- ret = read_inline_extent(inode, path, folio);
+ ret = read_inline_extent(path, folio);
if (ret < 0)
goto out;
goto insert;
@@ -7294,7 +7274,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*
* But already submitted bio can still be finished on this folio.
* Furthermore, endio function won't skip folio which has Ordered
- * (Private2) already cleared, so it's possible for endio and
+ * already cleared, so it's possible for endio and
* invalidate_folio to do the same ordered extent accounting twice
* on one folio.
*
@@ -7360,7 +7340,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
range_len = range_end + 1 - cur;
if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
- * If Ordered (Private2) is cleared, it means endio has
+ * If Ordered is cleared, it means endio has
* already been executed for the range.
* We can't delete the extent states as
* btrfs_finish_ordered_io() may still use some of them.
@@ -7433,7 +7413,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
}
/*
* We have iterated through all ordered extents of the page, the page
- * should not have Ordered (Private2) anymore, or the above iteration
+ * should not have Ordered anymore, or the above iteration
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
@@ -8972,28 +8952,6 @@ static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
return finish_open_simple(file, ret);
}
-void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct folio *folio;
- u32 len;
-
- ASSERT(end + 1 - start <= U32_MAX);
- len = end + 1 - start;
- while (index <= end_index) {
- folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
- ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */
-
- /* This is for data, which doesn't yet support larger folio. */
- ASSERT(folio_order(folio) == 0);
- btrfs_folio_set_writeback(fs_info, folio, start, len);
- folio_put(folio);
- index++;
- }
-}
-
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type)
{
@@ -9038,12 +8996,16 @@ static ssize_t btrfs_encoded_read_inline(
unsigned long ptr;
void *tmp;
ssize_t ret;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
+
+ path->nowait = nowait;
+
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
extent_start, 0);
if (ret) {
@@ -9107,6 +9069,7 @@ static ssize_t btrfs_encoded_read_inline(
struct btrfs_encoded_read_private {
wait_queue_head_t wait;
+ void *uring_ctx;
atomic_t pending;
blk_status_t status;
};
@@ -9126,26 +9089,40 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
- if (!atomic_dec_return(&priv->pending))
- wake_up(&priv->wait);
+ if (atomic_dec_return(&priv->pending) == 0) {
+ int err = blk_status_to_errno(READ_ONCE(priv->status));
+
+ if (priv->uring_ctx) {
+ btrfs_uring_read_extent_endio(priv->uring_ctx, err);
+ kfree(priv);
+ } else {
+ wake_up(&priv->wait);
+ }
+ }
bio_put(&bbio->bio);
}
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size, struct page **pages)
+ u64 disk_bytenr, u64 disk_io_size,
+ struct page **pages, void *uring_ctx)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_encoded_read_private priv = {
- .pending = ATOMIC_INIT(1),
- };
+ struct btrfs_encoded_read_private *priv;
unsigned long i = 0;
struct btrfs_bio *bbio;
+ int ret;
- init_waitqueue_head(&priv.wait);
+ priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
+
+ init_waitqueue_head(&priv->wait);
+ atomic_set(&priv->pending, 1);
+ priv->status = 0;
+ priv->uring_ctx = uring_ctx;
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
@@ -9153,11 +9130,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
- atomic_inc(&priv.pending);
+ atomic_inc(&priv->pending);
btrfs_submit_bbio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
continue;
@@ -9168,22 +9145,33 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
disk_io_size -= bytes;
} while (disk_io_size);
- atomic_inc(&priv.pending);
+ atomic_inc(&priv->pending);
btrfs_submit_bbio(bbio, 0);
- if (atomic_dec_return(&priv.pending))
- io_wait_event(priv.wait, !atomic_read(&priv.pending));
- /* See btrfs_encoded_read_endio() for ordering. */
- return blk_status_to_errno(READ_ONCE(priv.status));
+ if (uring_ctx) {
+ if (atomic_dec_return(&priv->pending) == 0) {
+ ret = blk_status_to_errno(READ_ONCE(priv->status));
+ btrfs_uring_read_extent_endio(uring_ctx, ret);
+ kfree(priv);
+ return ret;
+ }
+
+ return -EIOCBQUEUED;
+ } else {
+ if (atomic_dec_return(&priv->pending) != 0)
+ io_wait_event(priv->wait, !atomic_read(&priv->pending));
+ /* See btrfs_encoded_read_endio() for ordering. */
+ ret = blk_status_to_errno(READ_ONCE(priv->status));
+ kfree(priv);
+ return ret;
+ }
}
-static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
- struct iov_iter *iter,
- u64 start, u64 lockend,
- struct extent_state **cached_state,
- u64 disk_bytenr, u64 disk_io_size,
- size_t count, bool compressed,
- bool *unlocked)
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed, bool *unlocked)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -9203,8 +9191,8 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
goto out;
}
- ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
- disk_io_size, pages);
+ ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
+ disk_io_size, pages, NULL);
if (ret)
goto out;
@@ -9244,21 +9232,26 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
}
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded)
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
ssize_t ret;
size_t count = iov_iter_count(iter);
- u64 start, lockend, disk_bytenr, disk_io_size;
- struct extent_state *cached_state = NULL;
+ u64 start, lockend;
struct extent_map *em;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
bool unlocked = false;
file_accessed(iocb->ki_filp);
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ ret = btrfs_inode_lock(inode,
+ BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
+ if (ret)
+ return ret;
if (iocb->ki_pos >= inode->vfs_inode.i_size) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@@ -9271,21 +9264,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
*/
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
- for (;;) {
+ if (nowait) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(inode, start,
- lockend - start + 1);
- if (ret)
+ if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
+ start, lockend)) {
+ ret = -EAGAIN;
goto out_unlock_inode;
- lock_extent(io_tree, start, lockend, &cached_state);
+ }
+
+ if (!try_lock_extent(io_tree, start, lockend, cached_state)) {
+ ret = -EAGAIN;
+ goto out_unlock_inode;
+ }
+
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
- if (!ordered)
- break;
- btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, &cached_state);
- cond_resched();
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ ret = -EAGAIN;
+ goto out_unlock_inode;
+ }
+ } else {
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+
+ lock_extent(io_tree, start, lockend, cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ cond_resched();
+ }
}
em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
@@ -9304,9 +9322,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
free_extent_map(em);
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
- &cached_state, extent_start,
+ cached_state, extent_start,
count, encoded, &unlocked);
- goto out;
+ goto out_unlock_extent;
}
/*
@@ -9317,12 +9335,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
- disk_bytenr = EXTENT_MAP_HOLE;
+ *disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
} else if (extent_map_is_compressed(em)) {
- disk_bytenr = em->disk_bytenr;
+ *disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
* compressed extent.
@@ -9331,7 +9349,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ret = -ENOBUFS;
goto out_em;
}
- disk_io_size = em->disk_num_bytes;
+ *disk_io_size = em->disk_num_bytes;
count = em->disk_num_bytes;
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
@@ -9341,47 +9359,42 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_em;
encoded->compression = ret;
} else {
- disk_bytenr = extent_map_block_start(em) + (start - em->start);
+ *disk_bytenr = extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
* Don't read beyond what we locked. This also limits the page
* allocations that we'll do.
*/
- disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
- count = start + disk_io_size - iocb->ki_pos;
+ *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + *disk_io_size - iocb->ki_pos;
encoded->len = count;
encoded->unencoded_len = count;
- disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
}
free_extent_map(em);
em = NULL;
- if (disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ if (*disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
if (ret != count)
ret = -EFAULT;
} else {
- ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
- &cached_state, disk_bytenr,
- disk_io_size, count,
- encoded->compression,
- &unlocked);
+ ret = -EIOCBQUEUED;
+ goto out_unlock_extent;
}
-out:
- if (ret >= 0)
- iocb->ki_pos += encoded->len;
out_em:
free_extent_map(em);
out_unlock_extent:
- if (!unlocked)
- unlock_extent(io_tree, start, lockend, &cached_state);
+ /* Leave inode and extent locked if we need to do a read. */
+ if (!unlocked && ret != -EIOCBQUEUED)
+ unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode:
- if (!unlocked)
+ if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
@@ -9492,7 +9505,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
- folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
if (!folios)
return -ENOMEM;
for (i = 0; i < nr_folios; i++) {
@@ -9556,7 +9569,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->unencoded_len == encoded->len &&
encoded->unencoded_offset == 0 &&
can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
- ret = __cow_file_range_inline(inode, start, encoded->len,
+ ret = __cow_file_range_inline(inode, encoded->len,
orig_count, compression, folios[0],
true);
if (ret <= 0) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 226c91f..c9302d1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -29,6 +29,7 @@
#include <linux/fileattr.h>
#include <linux/fsverity.h>
#include <linux/sched/xacct.h>
+#include <linux/io_uring/cmd.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -1048,7 +1049,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
struct btrfs_qgroup_inherit *inherit)
{
int ret;
- bool snapshot_force_cow = false;
/*
* Force new buffered writes to reserve space even when NOCOW is
@@ -1067,15 +1067,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
* creation.
*/
atomic_inc(&root->snapshot_force_cow);
- snapshot_force_cow = true;
btrfs_wait_ordered_extents(root, U64_MAX, NULL);
ret = btrfs_mksubvol(parent, idmap, name, namelen,
root, readonly, inherit);
+ atomic_dec(&root->snapshot_force_cow);
out:
- if (snapshot_force_cow)
- atomic_dec(&root->snapshot_force_cow);
btrfs_drew_read_unlock(&root->snapshot_lock);
return ret;
}
@@ -1308,9 +1306,9 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
ret = btrfs_mksubvol(&file->f_path, idmap, name,
namelen, NULL, readonly, inherit);
} else {
- struct fd src = fdget(fd);
+ CLASS(fd, src)(fd);
struct inode *src_inode;
- if (!fd_file(src)) {
+ if (fd_empty(src)) {
ret = -EINVAL;
goto out_drop_write;
}
@@ -1341,7 +1339,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
BTRFS_I(src_inode)->root,
readonly, inherit);
}
- fdput(src);
}
out_drop_write:
mnt_drop_write_file(file);
@@ -4058,8 +4055,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
return 0;
}
-static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
- void __user *arg)
+static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4514,12 +4510,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
flags);
size_t copy_end;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(file));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
loff_t pos;
struct kiocb kiocb;
ssize_t ret;
+ u64 disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4572,7 +4573,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos;
- ret = btrfs_encoded_read(&kiocb, &iter, &args);
+ ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ &disk_bytenr, &disk_io_size);
+
+ if (ret == -EIOCBQUEUED) {
+ bool unlocked = false;
+ u64 start, lockend, count;
+
+ start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize);
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ if (args.compression)
+ count = disk_io_size;
+ else
+ count = args.len;
+
+ ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ args.compression, &unlocked);
+
+ if (!unlocked) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ }
+ }
+
if (ret >= 0) {
fsnotify_access(file);
if (copy_to_user(argp + copy_end,
@@ -4690,6 +4716,439 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
return ret;
}
+/*
+ * Context that's attached to an encoded read io_uring command, in cmd->pdu. It
+ * contains the fields in btrfs_uring_read_extent that are necessary to finish
+ * off and cleanup the I/O in btrfs_uring_read_finished.
+ */
+struct btrfs_uring_priv {
+ struct io_uring_cmd *cmd;
+ struct page **pages;
+ unsigned long nr_pages;
+ struct kiocb iocb;
+ struct iovec *iov;
+ struct iov_iter iter;
+ struct extent_state *cached_state;
+ u64 count;
+ u64 start;
+ u64 lockend;
+ int err;
+ bool compressed;
+};
+
+struct io_btrfs_cmd {
+ struct btrfs_uring_priv *priv;
+};
+
+static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_priv *priv = bc->priv;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ unsigned long index;
+ u64 cur;
+ size_t page_offset;
+ ssize_t ret;
+
+ if (priv->err) {
+ ret = priv->err;
+ goto out;
+ }
+
+ if (priv->compressed) {
+ index = 0;
+ page_offset = 0;
+ } else {
+ index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
+ page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
+ }
+ cur = 0;
+ while (cur < priv->count) {
+ size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset);
+
+ if (copy_page_to_iter(priv->pages[index], page_offset, bytes,
+ &priv->iter) != bytes) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ index++;
+ cur += bytes;
+ page_offset = 0;
+ }
+ ret = priv->count;
+
+out:
+ unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+
+ io_uring_cmd_done(cmd, ret, 0, issue_flags);
+ add_rchar(current, ret);
+
+ for (index = 0; index < priv->nr_pages; index++)
+ __free_page(priv->pages[index]);
+
+ kfree(priv->pages);
+ kfree(priv->iov);
+ kfree(priv);
+}
+
+void btrfs_uring_read_extent_endio(void *ctx, int err)
+{
+ struct btrfs_uring_priv *priv = ctx;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd);
+
+ priv->err = err;
+ bc->priv = priv;
+
+ io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
+}
+
+static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state *cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ struct iovec *iov, struct io_uring_cmd *cmd)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct page **pages;
+ struct btrfs_uring_priv *priv = NULL;
+ unsigned long nr_pages;
+ int ret;
+
+ nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out_fail;
+ }
+
+ priv = kmalloc(sizeof(*priv), GFP_NOFS);
+ if (!priv) {
+ ret = -ENOMEM;
+ goto out_fail;
+ }
+
+ priv->iocb = *iocb;
+ priv->iov = iov;
+ priv->iter = *iter;
+ priv->count = count;
+ priv->cmd = cmd;
+ priv->cached_state = cached_state;
+ priv->compressed = compressed;
+ priv->nr_pages = nr_pages;
+ priv->pages = pages;
+ priv->start = start;
+ priv->lockend = lockend;
+ priv->err = 0;
+
+ ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
+ disk_io_size, pages, priv);
+ if (ret && ret != -EIOCBQUEUED)
+ goto out_fail;
+
+ /*
+ * If we return -EIOCBQUEUED, we're deferring the cleanup to
+ * btrfs_uring_read_finished(), which will handle unlocking the extent
+ * and inode and freeing the allocations.
+ */
+
+ return -EIOCBQUEUED;
+
+out_fail:
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ kfree(priv);
+ return ret;
+}
+
+static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
+ size_t copy_end;
+ struct btrfs_ioctl_encoded_io_args args = { 0 };
+ int ret;
+ u64 disk_bytenr, disk_io_size;
+ struct file *file;
+ struct btrfs_inode *inode;
+ struct btrfs_fs_info *fs_info;
+ struct extent_io_tree *io_tree;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ struct extent_state *cached_state = NULL;
+ u64 start, lockend;
+ void __user *sqe_addr;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+ file = cmd->file;
+ inode = BTRFS_I(file->f_inode);
+ fs_info = inode->root->fs_info;
+ io_tree = &inode->io_tree;
+ sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
+
+ if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
+ if (copy_from_user(&args32, sqe_addr, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ copy_end = copy_end_kernel;
+ if (copy_from_user(&args, sqe_addr, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ if (args.flags != 0)
+ return -EINVAL;
+
+ ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_free;
+ }
+
+ pos = args.offset;
+ ret = rw_verify_area(READ, file, &pos, args.len);
+ if (ret < 0)
+ goto out_free;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ kiocb.ki_flags |= IOCB_NOWAIT;
+
+ start = ALIGN_DOWN(pos, fs_info->sectorsize);
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ &disk_bytenr, &disk_io_size);
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ goto out_free;
+
+ file_accessed(file);
+
+ if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel,
+ sizeof(args) - copy_end_kernel)) {
+ if (ret == -EIOCBQUEUED) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ }
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ if (ret == -EIOCBQUEUED) {
+ u64 count;
+
+ /*
+ * If we've optimized things by storing the iovecs on the stack,
+ * undo this.
+ */
+ if (!iov) {
+ iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS);
+ if (!iov) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ ret = -ENOMEM;
+ goto out_acct;
+ }
+
+ memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt);
+ }
+
+ count = min_t(u64, iov_iter_count(&iter), disk_io_size);
+
+ /* Match ioctl by not returning past EOF if uncompressed. */
+ if (!args.compression)
+ count = min_t(u64, count, args.len);
+
+ ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
+ cached_state, disk_bytenr,
+ disk_io_size, count,
+ args.compression, iov, cmd);
+
+ goto out_acct;
+ }
+
+out_free:
+ kfree(iov);
+
+out_acct:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+
+ return ret;
+}
+
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ switch (cmd->cmd_op) {
+ case BTRFS_IOC_ENCODED_READ:
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_READ_32:
+#endif
+ return btrfs_uring_encoded_read(cmd, issue_flags);
+ }
+
+ return -EINVAL;
+}
+
+static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp)
+{
+ struct btrfs_root *root;
+ struct btrfs_ioctl_subvol_wait args = { 0 };
+ signed long sched_ret;
+ int refs;
+ u64 root_flags;
+ bool wait_for_deletion = false;
+ bool found = false;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ switch (args.mode) {
+ case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED:
+ /*
+ * Wait for the first one deleted that waits until all previous
+ * are cleaned.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (!list_empty(&fs_info->dead_roots)) {
+ root = list_last_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ args.subvolid = btrfs_root_id(root);
+ found = true;
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (!found)
+ return -ENOENT;
+
+ fallthrough;
+ case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE:
+ if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) ||
+ BTRFS_LAST_FREE_OBJECTID < args.subvolid)
+ return -EINVAL;
+ break;
+ case BTRFS_SUBVOL_SYNC_COUNT:
+ spin_lock(&fs_info->trans_lock);
+ args.count = list_count_nodes(&fs_info->dead_roots);
+ spin_unlock(&fs_info->trans_lock);
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ case BTRFS_SUBVOL_SYNC_PEEK_FIRST:
+ spin_lock(&fs_info->trans_lock);
+ /* Last in the list was deleted first. */
+ if (!list_empty(&fs_info->dead_roots)) {
+ root = list_last_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ args.subvolid = btrfs_root_id(root);
+ } else {
+ args.subvolid = 0;
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ case BTRFS_SUBVOL_SYNC_PEEK_LAST:
+ spin_lock(&fs_info->trans_lock);
+ /* First in the list was deleted last. */
+ if (!list_empty(&fs_info->dead_roots)) {
+ root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ args.subvolid = btrfs_root_id(root);
+ } else {
+ args.subvolid = 0;
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+
+ /* 32bit limitation: fs_roots_radix key is not wide enough. */
+ if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX)
+ return -EOVERFLOW;
+
+ while (1) {
+ /* Wait for the specific one. */
+ if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR)
+ return -EINTR;
+ refs = -1;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)args.subvolid);
+ if (root) {
+ spin_lock(&root->root_item_lock);
+ refs = btrfs_root_refs(&root->root_item);
+ root_flags = btrfs_root_flags(&root->root_item);
+ spin_unlock(&root->root_item_lock);
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ up_read(&fs_info->subvol_sem);
+
+ /* Subvolume does not exist. */
+ if (!root)
+ return -ENOENT;
+
+ /* Subvolume not deleted at all. */
+ if (refs > 0)
+ return -EEXIST;
+ /* We've waited and now the subvolume is gone. */
+ if (wait_for_deletion && refs == -1) {
+ /* Return the one we waited for as the last one. */
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ }
+
+ /* Subvolume not found on the first try (deleted or never existed). */
+ if (refs == -1)
+ return -ENOENT;
+
+ wait_for_deletion = true;
+ ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD);
+ sched_ret = schedule_timeout_interruptible(HZ);
+ /* Early wake up or error. */
+ if (sched_ret != 0)
+ return -EINTR;
+ }
+
+ return 0;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -4812,7 +5271,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_QUOTA_RESCAN_STATUS:
return btrfs_ioctl_quota_rescan_status(fs_info, argp);
case BTRFS_IOC_QUOTA_RESCAN_WAIT:
- return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
+ return btrfs_ioctl_quota_rescan_wait(fs_info);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(fs_info, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
@@ -4841,6 +5300,8 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_ENCODED_WRITE_32:
return btrfs_ioctl_encoded_write(file, argp, true);
#endif
+ case BTRFS_IOC_SUBVOL_SYNC_WAIT:
+ return btrfs_ioctl_subvol_sync(fs_info, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 19cd26b..2b760c8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,5 +22,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(const u8 *uuid);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void btrfs_uring_read_extent_endio(void *ctx, int err);
#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6a0b7ab..9a7a7b7 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -162,21 +162,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
}
/*
- * Try-lock for write.
- *
- * Return 1 if the rwlock has been taken, 0 otherwise
- */
-int btrfs_try_tree_write_lock(struct extent_buffer *eb)
-{
- if (down_write_trylock(&eb->lock)) {
- btrfs_set_eb_lock_owner(eb, current->pid);
- trace_btrfs_try_tree_write_lock(eb);
- return 1;
- }
- return 0;
-}
-
-/*
* Release read lock.
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 3c15c75..46c8be2 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
void btrfs_tree_read_unlock(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
-int btrfs_try_tree_write_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 72856f6..a45bc11 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *lzo_alloc_workspace(unsigned int level)
+struct list_head *lzo_alloc_workspace(void)
{
struct workspace *workspace;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2104d60..95c8499 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -346,10 +346,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
/*
- * Ordered (Private2) bit indicates whether we still have
+ * Ordered flag indicates whether we still have
* pending io unfinished for the ordered extent.
*
- * If there's no such bit, we need to skip to next range.
+ * If it's not set, we need to skip to next range.
*/
if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len))
return false;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a0e8dec..a6f9283 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -226,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
return qgroup;
}
-static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup *qgroup)
+static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
{
struct btrfs_qgroup_list *list;
@@ -258,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return -ENOENT;
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
- __del_qgroup_rb(fs_info, qgroup);
+ __del_qgroup_rb(qgroup);
return 0;
}
@@ -469,7 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
/*
* If a qgroup exists for a subvolume ID, it is possible
* that subvolume has been deleted, in which case
- * re-using that ID would lead to incorrect accounting.
+ * reusing that ID would lead to incorrect accounting.
*
* Ensure that we skip any such subvol ids.
*
@@ -643,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
- __del_qgroup_rb(fs_info, qgroup);
+ __del_qgroup_rb(qgroup);
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup);
}
@@ -2001,27 +2000,27 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
* Return <0 for insertion failure, caller can free @record safely.
*/
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record)
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record,
+ u64 bytenr)
{
struct btrfs_qgroup_extent_record *existing, *ret;
- const unsigned long index = (record->bytenr >> fs_info->sectorsize_bits);
+ const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
if (!btrfs_qgroup_full_accounting(fs_info))
return 1;
#if BITS_PER_LONG == 32
- if (record->bytenr >= MAX_LFS_FILESIZE) {
+ if (bytenr >= MAX_LFS_FILESIZE) {
btrfs_err_rl(fs_info,
"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit",
- record->bytenr);
+ bytenr);
btrfs_err_32bit_limit(fs_info);
return -EOVERFLOW;
}
#endif
- lockdep_assert_held(&delayed_refs->lock);
- trace_btrfs_qgroup_trace_extent(fs_info, record);
+ trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr);
xa_lock(&delayed_refs->dirty_extents);
existing = xa_load(&delayed_refs->dirty_extents, index);
@@ -2066,12 +2065,17 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
* transaction committing, but not now as qgroup accounting will be wrong again.
*/
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
- struct btrfs_qgroup_extent_record *qrecord)
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr)
{
- struct btrfs_backref_walk_ctx ctx = { 0 };
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_backref_walk_ctx ctx = {
+ .bytenr = bytenr,
+ .fs_info = fs_info,
+ };
int ret;
- if (!btrfs_qgroup_full_accounting(trans->fs_info))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/*
* We are always called in a context where we are already holding a
@@ -2094,16 +2098,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
*/
ASSERT(trans != NULL);
- if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
return 0;
- ctx.bytenr = qrecord->bytenr;
- ctx.fs_info = trans->fs_info;
-
ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) {
- qgroup_mark_inconsistent(trans->fs_info);
- btrfs_warn(trans->fs_info,
+ qgroup_mark_inconsistent(fs_info);
+ btrfs_warn(fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
return 0;
@@ -2138,7 +2139,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
- struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs;
const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
int ret;
@@ -2148,26 +2149,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!record)
return -ENOMEM;
- if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, index, GFP_NOFS)) {
+ if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
kfree(record);
return -ENOMEM;
}
- delayed_refs = &trans->transaction->delayed_refs;
- record->bytenr = bytenr;
record->num_bytes = num_bytes;
- record->old_roots = NULL;
- spin_lock(&delayed_refs->lock);
- ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
- spin_unlock(&delayed_refs->lock);
+ ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr);
if (ret) {
/* Clean up if insertion fails or item exists. */
xa_release(&delayed_refs->dirty_extents, index);
kfree(record);
return 0;
}
- return btrfs_qgroup_trace_extent_post(trans, record);
+ return btrfs_qgroup_trace_extent_post(trans, record, bytenr);
}
/*
@@ -2652,7 +2648,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
if (!extent_buffer_uptodate(root_eb)) {
struct btrfs_tree_parent_check check = {
- .has_first_key = false,
.transid = root_gen,
.level = root_level
};
@@ -3043,14 +3038,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip;
xa_for_each(&delayed_refs->dirty_extents, index, record) {
+ const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits);
+
num_dirty_extents++;
- trace_btrfs_qgroup_account_extents(fs_info, record);
+ trace_btrfs_qgroup_account_extents(fs_info, record, bytenr);
if (!ret && !(fs_info->qgroup_flags &
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
struct btrfs_backref_walk_ctx ctx = { 0 };
- ctx.bytenr = record->bytenr;
+ ctx.bytenr = bytenr;
ctx.fs_info = fs_info;
/*
@@ -3092,7 +3089,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
ulist_del(record->old_roots, qgroup_to_skip,
0);
}
- ret = btrfs_qgroup_account_extent(trans, record->bytenr,
+ ret = btrfs_qgroup_account_extent(trans, bytenr,
record->num_bytes,
record->old_roots,
new_roots);
@@ -4196,13 +4193,20 @@ static int try_flush_qgroup(struct btrfs_root *root)
return 0;
}
- btrfs_run_delayed_iputs(root->fs_info);
- btrfs_wait_on_delayed_iputs(root->fs_info);
ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0)
goto out;
btrfs_wait_ordered_extents(root, U64_MAX, NULL);
+ /*
+ * After waiting for ordered extents run delayed iputs in order to free
+ * space from unlinked files before committing the current transaction,
+ * as ordered extents may have been holding the last reference of an
+ * inode and they add a delayed iput when they complete.
+ */
+ btrfs_run_delayed_iputs(root->fs_info);
+ btrfs_wait_on_delayed_iputs(root->fs_info);
+
ret = btrfs_commit_current_transaction(root);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
@@ -4687,8 +4691,7 @@ void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
* BOTH POINTERS ARE BEFORE TREE SWAP
* @last_snapshot: last snapshot generation of the subvolume tree
*/
-int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
- struct btrfs_root *subvol_root,
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
@@ -4894,17 +4897,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
xa_destroy(&trans->delayed_refs.dirty_extents);
}
-void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
-{
- if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
- return;
-
- if (!is_fstree(root))
- return;
-
- btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA);
-}
-
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
const struct btrfs_squota_delta *delta)
{
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index c229256..e233cc7 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -127,7 +127,12 @@ struct btrfs_inode;
* Record a dirty extent, and info qgroup to update quota on it
*/
struct btrfs_qgroup_extent_record {
- u64 bytenr;
+ /*
+ * The bytenr of the extent is given by its index in the dirty_extents
+ * xarray of struct btrfs_delayed_ref_root left shifted by
+ * fs_info->sectorsize_bits.
+ */
+
u64 num_bytes;
/*
@@ -345,9 +350,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record);
+ struct btrfs_qgroup_extent_record *record,
+ u64 bytenr);
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
- struct btrfs_qgroup_extent_record *qrecord);
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr);
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes);
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
@@ -432,8 +439,7 @@ void btrfs_qgroup_init_swapped_blocks(
struct btrfs_qgroup_swapped_blocks *swapped_blocks);
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
-int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
- struct btrfs_root *subvol_root,
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
@@ -442,7 +448,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info);
-void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes);
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
const struct btrfs_squota_delta *delta);
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 4c859b5..9ffc79f 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -13,6 +13,39 @@
#include "volumes.h"
#include "print-tree.h"
+static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ const struct btrfs_key *oldkey,
+ u64 newlen, u64 frontpad)
+{
+ struct btrfs_stripe_extent *extent;
+ struct extent_buffer *leaf;
+ int slot;
+ size_t item_size;
+ struct btrfs_key newkey = {
+ .objectid = oldkey->objectid + frontpad,
+ .type = BTRFS_RAID_STRIPE_KEY,
+ .offset = newlen,
+ };
+
+ ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size(leaf, slot);
+ extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+ for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
+ struct btrfs_raid_stride *stride = &extent->strides[i];
+ u64 phys;
+
+ phys = btrfs_raid_stride_physical(leaf, stride);
+ btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad);
+ }
+
+ btrfs_set_item_key_safe(trans, path, &newkey);
+}
+
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -36,23 +69,24 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
while (1) {
key.objectid = start;
key.type = BTRFS_RAID_STRIPE_KEY;
- key.offset = length;
+ key.offset = 0;
ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
if (ret < 0)
break;
- if (ret > 0) {
- ret = 0;
- if (path->slots[0] == 0)
- break;
+
+ if (path->slots[0] == btrfs_header_nritems(path->nodes[0]))
path->slots[0]--;
- }
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
found_start = key.objectid;
found_end = found_start + key.offset;
+ ret = 0;
+
+ if (key.type != BTRFS_RAID_STRIPE_KEY)
+ break;
/* That stripe ends before we start, we're done. */
if (found_end <= start)
@@ -61,7 +95,40 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
trace_btrfs_raid_extent_delete(fs_info, start, end,
found_start, found_end);
- ASSERT(found_start >= start && found_end <= end);
+ /*
+ * The stripe extent starts before the range we want to delete:
+ *
+ * |--- RAID Stripe Extent ---|
+ * |--- keep ---|--- drop ---|
+ *
+ * This means we have to duplicate the tree item, truncate the
+ * length to the new size and then re-insert the item.
+ */
+ if (found_start < start) {
+ u64 diff = start - found_start;
+
+ btrfs_partially_delete_raid_extent(trans, path, &key,
+ diff, 0);
+ break;
+ }
+
+ /*
+ * The stripe extent ends after the range we want to delete:
+ *
+ * |--- RAID Stripe Extent ---|
+ * |--- drop ---|--- keep ---|
+ *
+ * This means we have to duplicate the tree item, truncate the
+ * length to the new size and then re-insert the item.
+ */
+ if (found_end > end) {
+ u64 diff = found_end - end;
+
+ btrfs_partially_delete_raid_extent(trans, path, &key,
+ diff, diff);
+ break;
+ }
+
ret = btrfs_del_item(trans, stripe_root, path);
if (ret)
break;
@@ -108,8 +175,9 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
return ret;
}
-static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
- struct btrfs_io_context *bioc)
+EXPORT_FOR_TESTS
+int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_key stripe_key;
@@ -233,7 +301,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
found_end = found_logical + found_length;
if (found_logical > end) {
- ret = -ENOENT;
+ ret = -ENODATA;
goto out;
}
@@ -279,10 +347,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
}
/* If we're here, we haven't found the requested devid in the stripe. */
- ret = -ENOENT;
+ ret = -ENODATA;
out:
if (ret > 0)
- ret = -ENOENT;
+ ret = -ENODATA;
if (ret && ret != -EIO && !stripe->rst_search_commit_root) {
btrfs_debug(fs_info,
"cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 1ac1c21..5418364 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *ordered_extent);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_io_context *bioc);
+#endif
+
static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
u64 map_type)
{
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 39bec67..cdd373c 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list)
static void assert_rbio(struct btrfs_raid_bio *rbio)
{
- if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
- !IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f3834f8..bf267bd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1244,7 +1244,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
* The real subtree rescan is delayed until we have new
* CoW on the subtree root node before transaction commit.
*/
- ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+ ret = btrfs_qgroup_add_swapped_blocks(dest,
rc->block_group, parent, slot,
path->nodes[level], path->slots[level],
last_snapshot);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3a34274..204c928 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1656,8 +1656,7 @@ static u32 stripe_length(const struct scrub_stripe *stripe)
stripe->bg->start + stripe->bg->length - stripe->logical);
}
-static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
- struct scrub_stripe *stripe)
+static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
@@ -1704,8 +1703,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
&stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
if (err < 0) {
- set_bit(i, &stripe->io_error_bitmap);
- set_bit(i, &stripe->error_bitmap);
+ if (err != -ENODATA) {
+ /*
+ * Earlier btrfs_get_raid_extent_offset()
+ * returned -ENODATA, which means there's
+ * no entry for the corresponding range
+ * in the stripe tree. But if it's in
+ * the extent tree, then it's a preallocated
+ * extent and not an error.
+ */
+ set_bit(i, &stripe->io_error_bitmap);
+ set_bit(i, &stripe->error_bitmap);
+ }
continue;
}
@@ -1743,7 +1752,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
- scrub_submit_extent_sector_read(sctx, stripe);
+ scrub_submit_extent_sector_read(stripe);
return;
}
@@ -1954,7 +1963,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
ASSERT(sctx->raid56_data_stripes);
/*
- * For data stripe search, we cannot re-use the same extent/csum paths,
+ * For data stripe search, we cannot reuse the same extent/csum paths,
* as the data stripe bytenr may be smaller than previous extent. Thus
* we have to use our own extent/csum paths.
*/
@@ -2103,7 +2112,6 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct btrfs_chunk_map *map,
u64 logical_start, u64 logical_length,
struct btrfs_device *device,
u64 physical, int mirror_num)
@@ -2222,7 +2230,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
* just RAID1, so we can reuse scrub_simple_mirror() to scrub
* this stripe.
*/
- ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
+ ret = scrub_simple_mirror(sctx, bg, cur_logical,
BTRFS_STRIPE_LEN, device, cur_physical,
mirror_num);
if (ret)
@@ -2256,7 +2264,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/* Offset inside the chunk */
u64 offset;
u64 stripe_logical;
- int stop_loop = 0;
/* Extent_path should be released by now. */
ASSERT(sctx->extent_path.nodes[0] == NULL);
@@ -2307,7 +2314,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* Only @physical and @mirror_num needs to calculated using
* @stripe_index.
*/
- ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
+ ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length,
scrub_dev, map->stripes[stripe_index].physical,
stripe_index + 1);
offset = 0;
@@ -2362,7 +2369,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* We can reuse scrub_simple_mirror() here, as the repair part
* is still based on @mirror_num.
*/
- ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
+ ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN,
scrub_dev, physical, 1);
if (ret < 0)
goto out;
@@ -2370,14 +2377,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
logical += increment;
physical += BTRFS_STRIPE_LEN;
spin_lock(&sctx->stat_lock);
- if (stop_loop)
- sctx->stat.last_physical =
- map->stripes[stripe_index].physical + dev_stripe_len;
- else
- sctx->stat.last_physical = physical;
+ sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
- if (stop_loop)
- break;
}
out:
ret2 = flush_scrub_stripes(sctx);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b068469..7254279 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -980,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
return ret;
}
-typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
- struct fs_path *p,
- void *ctx);
+typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx);
/*
* Helper function to iterate the entries in ONE btrfs_inode_ref or
@@ -1007,8 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
u32 name_len;
char *start;
int ret = 0;
- int num = 0;
- int index;
u64 dir;
unsigned long name_off;
unsigned long elem_size;
@@ -1043,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
iref = (struct btrfs_inode_ref *)(ptr + cur);
name_len = btrfs_inode_ref_name_len(eb, iref);
name_off = (unsigned long)(iref + 1);
- index = btrfs_inode_ref_index(eb, iref);
dir = found_key->offset;
} else {
extref = (struct btrfs_inode_extref *)(ptr + cur);
name_len = btrfs_inode_extref_name_len(eb, extref);
name_off = (unsigned long)&extref->name;
- index = btrfs_inode_extref_index(eb, extref);
dir = btrfs_inode_extref_parent(eb, extref);
}
@@ -1094,10 +1088,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
}
cur += elem_size + name_len;
- ret = iterate(num, dir, index, p, ctx);
+ ret = iterate(dir, p, ctx);
if (ret)
goto out;
- num++;
}
out:
@@ -1227,8 +1220,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
return ret;
}
-static int __copy_first_ref(int num, u64 dir, int index,
- struct fs_path *p, void *ctx)
+static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx)
{
int ret;
struct fs_path *pt = ctx;
@@ -3768,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref,
const bool is_orphan)
{
- struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key di_key;
@@ -3797,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out;
}
- di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
+ di = btrfs_match_dir_item_name(path, parent_ref->name,
parent_ref->name_len);
if (!di) {
ret = 0;
@@ -4708,8 +4699,7 @@ static int record_ref_in_tree(struct rb_root *root, struct list_head *refs,
return ret;
}
-static int record_new_ref_if_needed(int num, u64 dir, int index,
- struct fs_path *name, void *ctx)
+static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -4738,8 +4728,7 @@ static int record_new_ref_if_needed(int num, u64 dir, int index,
return ret;
}
-static int record_deleted_ref_if_needed(int num, u64 dir, int index,
- struct fs_path *name, void *ctx)
+static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -5677,10 +5666,11 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* Note that send_buf is a mapping of send_buf_pages, so this is really
* reading into send_buf.
*/
- ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
+ ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
disk_bytenr, disk_num_bytes,
sctx->send_buf_pages +
- (data_offset >> PAGE_SHIFT));
+ (data_offset >> PAGE_SHIFT),
+ NULL);
if (ret)
goto out;
@@ -8135,7 +8125,20 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
* making it RW. This also protects against deletion.
*/
spin_lock(&send_root->root_item_lock);
- if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
+ /*
+ * Unlikely but possible, if the subvolume is marked for deletion but
+ * is slow to remove the directory entry, send can still be started.
+ */
+ if (btrfs_root_dead(send_root)) {
+ spin_unlock(&send_root->root_item_lock);
+ return -EPERM;
+ }
+ /* Userspace tools do the checks and warn the user if it's not RO. */
+ if (!btrfs_root_readonly(send_root)) {
+ spin_unlock(&send_root->root_item_lock);
+ return -EPERM;
+ }
+ if (send_root->dedupe_in_progress) {
dedupe_in_progress_warn(send_root);
spin_unlock(&send_root->root_item_lock);
return -EAGAIN;
@@ -8144,15 +8147,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
spin_unlock(&send_root->root_item_lock);
/*
- * Userspace tools do the checks and warn the user if it's
- * not RO.
- */
- if (!btrfs_root_readonly(send_root)) {
- ret = -EPERM;
- goto out;
- }
-
- /*
* Check that we don't overflow at later allocations, we request
* clone_sources_count + 1 items, and compare to unsigned long inside
* access_ok. Also set an upper limit for allocation size so this can't
@@ -8217,15 +8211,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
}
sctx->send_root = send_root;
- /*
- * Unlikely but possible, if the subvolume is marked for deletion but
- * is slow to remove the directory entry, send can still be started
- */
- if (btrfs_root_dead(sctx->send_root)) {
- ret = -EPERM;
- goto out;
- }
-
sctx->clone_roots_cnt = arg->clone_sources_count;
if (sctx->proto >= 2) {
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index b07f4aa..9309886 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -16,7 +16,7 @@ struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
/* Conditional support for the upcoming protocol version. */
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
#define BTRFS_SEND_STREAM_VERSION 3
#else
#define BTRFS_SEND_STREAM_VERSION 2
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d5a9cd8..255e85f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1279,7 +1279,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* If we are freeing inodes, we want to make sure all delayed iputs have
* completed, because they could have been on an inode with i_nlink == 0, and
* thus have been truncated and freed up space. But again this space is not
- * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * immediately reusable, it comes in the form of a delayed ref, which must be
* run and then the transaction must be committed.
*
* COMMIT_TRANS
@@ -1488,8 +1488,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
}
-static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static void wait_reserve_ticket(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
@@ -1547,7 +1546,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
- wait_reserve_ticket(fs_info, space_info, ticket);
+ wait_reserve_ticket(space_info, ticket);
break;
case BTRFS_RESERVE_FLUSH_LIMIT:
priority_reclaim_metadata_space(fs_info, space_info, ticket,
@@ -1984,8 +1983,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
return unalloc < data_chunk_size;
}
-static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, int raid)
+static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid)
{
struct btrfs_block_group *bg;
int thresh_pct;
@@ -2081,6 +2079,6 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
if (!btrfs_should_periodic_reclaim(space_info))
continue;
for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++)
- do_reclaim_sweep(fs_info, space_info, raid);
+ do_reclaim_sweep(space_info, raid);
}
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index fe4d719..8c68059 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
return ERR_PTR(-ENOMEM);
spin_lock_init(&ret->lock);
- if (type == BTRFS_SUBPAGE_METADATA) {
+ if (type == BTRFS_SUBPAGE_METADATA)
atomic_set(&ret->eb_refs, 0);
- } else {
- atomic_set(&ret->readers, 0);
- atomic_set(&ret->writers, 0);
- }
+ else
+ atomic_set(&ret->nr_locked, 0);
return ret;
}
@@ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
__start_bit; \
})
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
- const int nbits = len >> fs_info->sectorsize_bits;
- unsigned long flags;
-
-
- btrfs_subpage_assert(fs_info, folio, start, len);
-
- spin_lock_irqsave(&subpage->lock, flags);
- /*
- * Even though it's just for reading the page, no one should have
- * locked the subpage range.
- */
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- bitmap_set(subpage->bitmaps, start_bit, nbits);
- atomic_add(nbits, &subpage->readers);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
- const int nbits = len >> fs_info->sectorsize_bits;
- unsigned long flags;
- bool is_data;
- bool last;
-
- btrfs_subpage_assert(fs_info, folio, start, len);
- is_data = is_data_inode(BTRFS_I(folio->mapping->host));
-
- spin_lock_irqsave(&subpage->lock, flags);
-
- /* The range should have already been locked. */
- ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
- ASSERT(atomic_read(&subpage->readers) >= nbits);
-
- bitmap_clear(subpage->bitmaps, start_bit, nbits);
- last = atomic_sub_and_test(nbits, &subpage->readers);
-
- /*
- * For data we need to unlock the page if the last read has finished.
- *
- * And please don't replace @last with atomic_sub_and_test() call
- * inside if () condition.
- * As we want the atomic_sub_and_test() to be always executed.
- */
- if (is_data && last)
- folio_unlock(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
{
u64 orig_start = *start;
@@ -295,28 +237,8 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
orig_start + orig_len) - *start;
}
-static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
- const int nbits = (len >> fs_info->sectorsize_bits);
- unsigned long flags;
- int ret;
-
- btrfs_subpage_assert(fs_info, folio, start, len);
-
- spin_lock_irqsave(&subpage->lock, flags);
- ASSERT(atomic_read(&subpage->readers) == 0);
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- bitmap_set(subpage->bitmaps, start_bit, nbits);
- ret = atomic_add_return(nbits, &subpage->writers);
- ASSERT(ret == nbits);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
-static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
@@ -334,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf
* extent_clear_unlock_delalloc() for compression path.
*
* This @locked_page is locked by plain lock_page(), thus its
- * subpage::writers is 0. Handle them in a special way.
+ * subpage::locked is 0. Handle them in a special way.
*/
- if (atomic_read(&subpage->writers) == 0) {
+ if (atomic_read(&subpage->nr_locked) == 0) {
spin_unlock_irqrestore(&subpage->lock, flags);
return true;
}
@@ -345,40 +267,13 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf
clear_bit(bit, subpage->bitmaps);
cleared++;
}
- ASSERT(atomic_read(&subpage->writers) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->writers);
+ ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &subpage->nr_locked);
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
}
/*
- * Lock a folio for delalloc page writeback.
- *
- * Return -EAGAIN if the page is not properly initialized.
- * Return 0 with the page locked, and writer counter updated.
- *
- * Even with 0 returned, the page still need extra check to make sure
- * it's really the correct page, as the caller is using
- * filemap_get_folios_contig(), which can race with page invalidating.
- */
-int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
- folio_lock(folio);
- return 0;
- }
- folio_lock(folio);
- if (!folio_test_private(folio) || !folio_get_private(folio)) {
- folio_unlock(folio);
- return -EAGAIN;
- }
- btrfs_subpage_clamp_range(folio, &start, &len);
- btrfs_subpage_start_writer(fs_info, folio, start, len);
- return 0;
-}
-
-/*
* Handle different locked folios:
*
* - Non-subpage folio
@@ -394,8 +289,8 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
* bitmap, reduce the writer lock number, and unlock the page if that's
* the last locked range.
*/
-void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
@@ -408,24 +303,24 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
/*
* For subpage case, there are two types of locked page. With or
- * without writers number.
+ * without locked number.
*
- * Since we own the page lock, no one else could touch subpage::writers
+ * Since we own the page lock, no one else could touch subpage::locked
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->writers) == 0) {
- /* No writers, locked by plain lock_page(). */
+ if (atomic_read(&subpage->nr_locked) == 0) {
+ /* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
}
btrfs_subpage_clamp_range(folio, &start, &len);
- if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
+ if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len))
folio_unlock(folio);
}
-void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
- struct folio *folio, unsigned long bitmap)
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, unsigned long bitmap)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
@@ -434,13 +329,13 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
int cleared = 0;
int bit;
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
folio_unlock(folio);
return;
}
- if (atomic_read(&subpage->writers) == 0) {
- /* No writers, locked by plain lock_page(). */
+ if (atomic_read(&subpage->nr_locked) == 0) {
+ /* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
}
@@ -450,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
cleared++;
}
- ASSERT(atomic_read(&subpage->writers) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->writers);
+ ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &subpage->nr_locked);
spin_unlock_irqrestore(&subpage->lock, flags);
if (last)
folio_unlock(folio);
@@ -776,8 +671,8 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* This populates the involved subpage ranges so that subpage helpers can
* properly unlock them.
*/
-void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage;
unsigned long flags;
@@ -796,58 +691,11 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
/* Target range should not yet be locked. */
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
bitmap_set(subpage->bitmaps, start_bit, nbits);
- ret = atomic_add_return(nbits, &subpage->writers);
+ ret = atomic_add_return(nbits, &subpage->nr_locked);
ASSERT(ret <= fs_info->sectors_per_page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
-/*
- * Find any subpage writer locked range inside @folio, starting at file offset
- * @search_start. The caller should ensure the folio is locked.
- *
- * Return true and update @found_start_ret and @found_len_ret to the first
- * writer locked range.
- * Return false if there is no writer locked range.
- */
-bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 search_start,
- u64 *found_start_ret, u32 *found_len_ret)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const u32 sectors_per_page = fs_info->sectors_per_page;
- const unsigned int len = PAGE_SIZE - offset_in_page(search_start);
- const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
- locked, search_start, len);
- const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked;
- const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page;
- unsigned long flags;
- int first_zero;
- int first_set;
- bool found = false;
-
- ASSERT(folio_test_locked(folio));
- spin_lock_irqsave(&subpage->lock, flags);
- first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit);
- if (first_set >= locked_bitmap_end)
- goto out;
-
- found = true;
-
- *found_start_ret = folio_pos(folio) +
- ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits);
- /*
- * Since @first_set is ensured to be smaller than locked_bitmap_end
- * here, @found_start_ret should be inside the folio.
- */
- ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE);
-
- first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set);
- *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits;
-out:
- spin_unlock_irqrestore(&subpage->lock, flags);
- return found;
-}
-
#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
{ \
const int sectors_per_page = fs_info->sectors_per_page; \
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 4b85d91d..428fa93 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -45,14 +45,6 @@ enum {
struct btrfs_subpage {
/* Common members for both data and metadata pages */
spinlock_t lock;
- /*
- * Both data and metadata needs to track how many readers are for the
- * page.
- * Data relies on @readers to unlock the page when last reader finished.
- * While metadata doesn't need page unlock, it needs to prevent
- * page::private get cleared before the last end_page_read().
- */
- atomic_t readers;
union {
/*
* Structures only used by metadata
@@ -62,8 +54,12 @@ struct btrfs_subpage {
*/
atomic_t eb_refs;
- /* Structures only used by data */
- atomic_t writers;
+ /*
+ * Structures only used by data,
+ *
+ * How many sectors inside the page is locked.
+ */
+ atomic_t nr_locked;
};
unsigned long bitmaps[];
};
@@ -95,23 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-
-int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
- struct folio *folio, unsigned long bitmap);
-bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 search_start,
- u64 *found_start_ret, u32 *found_len_ret);
-
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, unsigned long bitmap);
/*
* Template for subpage related operations.
*
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c64d071..97a85d1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -28,7 +28,6 @@
#include <linux/btrfs.h>
#include <linux/security.h>
#include <linux/fs_parser.h>
-#include <linux/swap.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -946,8 +945,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
}
static int btrfs_fill_super(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- void *data)
+ struct btrfs_fs_devices *fs_devices)
{
struct inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -971,7 +969,7 @@ static int btrfs_fill_super(struct super_block *sb,
return err;
}
- err = open_ctree(sb, fs_devices, (char *)data);
+ err = open_ctree(sb, fs_devices);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
return err;
@@ -1893,7 +1891,7 @@ static int btrfs_get_tree_super(struct fs_context *fc)
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
- ret = btrfs_fill_super(sb, fs_devices, NULL);
+ ret = btrfs_fill_super(sb, fs_devices);
}
if (ret) {
@@ -2191,7 +2189,8 @@ static struct file_system_type btrfs_fs_type = {
.init_fs_context = btrfs_init_fs_context,
.parameters = btrfs_fs_parameters,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA |
+ FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("btrfs");
@@ -2256,7 +2255,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
- ret = PTR_ERR(device);
+ if (IS_ERR(device))
+ ret = PTR_ERR(device);
+ else
+ ret = 0;
break;
}
ret = !(device->fs_devices->num_devices ==
@@ -2395,13 +2397,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro
trace_btrfs_extent_map_shrinker_count(fs_info, nr);
- /*
- * Only report the real number for DEBUG builds, as there are reports of
- * serious performance degradation caused by too frequent shrinks.
- */
- if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
- return nr;
- return 0;
+ return nr;
}
static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
@@ -2409,16 +2405,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- /*
- * We may be called from any task trying to allocate memory and we don't
- * want to slow it down with scanning and dropping extent maps. It would
- * also cause heavy lock contention if many tasks concurrently enter
- * here. Therefore only allow kswapd tasks to scan and drop extent maps.
- */
- if (!current_is_kswapd())
- return 0;
+ btrfs_free_extent_maps(fs_info, nr_to_scan);
- return btrfs_free_extent_maps(fs_info, nr_to_scan);
+ /* The extent map shrinker runs asynchronously, so always return 0. */
+ return 0;
}
static const struct super_operations btrfs_super_ops = {
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 03926ad..b843308 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1390,7 +1390,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store);
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
static ssize_t btrfs_offload_csum_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1450,7 +1450,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
BTRFS_ATTR_PTR(, temp_fsid),
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
BTRFS_ATTR_PTR(, offload_csum),
#endif
NULL,
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ce50847..e607b5d 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -29,6 +29,7 @@ const char *test_error[] = {
[TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
[TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
+ [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context",
};
static const struct super_operations btrfs_test_super_ops = {
@@ -291,6 +292,9 @@ int btrfs_run_sanity_tests(void)
ret = btrfs_test_free_space_tree(sectorsize, nodesize);
if (ret)
goto out;
+ ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize);
+ if (ret)
+ goto out;
}
}
ret = btrfs_test_extent_map();
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index dc2f2ab..b524ecf 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -24,6 +24,7 @@ enum {
TEST_ALLOC_BLOCK_GROUP,
TEST_ALLOC_EXTENT_MAP,
TEST_ALLOC_CHUNK_MAP,
+ TEST_ALLOC_IO_CONTEXT,
};
extern const char *test_error[];
@@ -37,6 +38,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
+int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_extent_map(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c
new file mode 100644
index 0000000..30f17eb7
--- /dev/null
+++ b/fs/btrfs/tests/raid-stripe-tree-tests.c
@@ -0,0 +1,538 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/sizes.h>
+#include "../fs.h"
+#include "../disk-io.h"
+#include "../transaction.h"
+#include "../volumes.h"
+#include "../raid-stripe-tree.h"
+#include "btrfs-tests.h"
+
+#define RST_TEST_NUM_DEVICES (2)
+#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1)
+
+typedef int (*test_func_t)(struct btrfs_trans_handle *trans);
+
+static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices,
+ u64 devid)
+{
+ struct btrfs_device *dev;
+
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ if (dev->devid == devid)
+ return dev;
+ }
+
+ return NULL;
+}
+
+/*
+ * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
+ * delete the 1st 32K, making the new start address 1M+32K.
+ */
+static int test_front_delete(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical = SZ_1M;
+ u64 len = SZ_64K;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ bioc->map_type = map_type;
+ bioc->size = len;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_64K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_64K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical, SZ_32K);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed", logical,
+ logical + SZ_32K);
+ goto out;
+ }
+
+ len = SZ_32K;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len,
+ map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed",
+ logical + SZ_32K, logical + SZ_32K + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical + SZ_32K) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical + SZ_32K, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_32K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_32K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (!ret) {
+ ret = -EINVAL;
+ test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
+ logical, logical + SZ_32K);
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
+ * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
+ * truncate the stripe extent down to 32K.
+ */
+static int test_tail_delete(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical = SZ_1M;
+ u64 len = SZ_64K;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ bioc->map_type = map_type;
+ bioc->size = len;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ if (!io_stripe.dev) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_64K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_64K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
+ if (ret) {
+ test_err("deleting RAID extent [%llu, %llu] failed",
+ logical + SZ_32K, logical + SZ_64K);
+ goto out;
+ }
+
+ len = SZ_32K;
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_32K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_32K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical, len);
+ if (ret)
+ test_err("deleting RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
+ * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
+ * overwrite the whole range giving it new physical address at an offset of 1G.
+ * The intent of this test is to exercise the 'update_raid_extent_item()'
+ * function called be btrfs_insert_one_raid_extent().
+ */
+static int test_create_update_delete(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical = SZ_1M;
+ u64 len = SZ_64K;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ bioc->map_type = map_type;
+ bioc->size = len;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ if (!io_stripe.dev) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_64K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_64K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = SZ_1G + logical + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("updating RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical + SZ_1G) {
+ test_err("invalid physical address, expected %llu, got %llu",
+ logical + SZ_1G, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_64K) {
+ test_err("invalid stripe length, expected %llu, got %llu",
+ (u64)SZ_64K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical, len);
+ if (ret)
+ test_err("deleting RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+/*
+ * Test a simple 64K RST write on a 2 disk RAID1 at a logical address of 1M.
+ * The "physical" copy on device 0 is at 1M, on device 1 it is at 1G+1M.
+ */
+static int test_simple_create_delete(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_io_context *bioc;
+ struct btrfs_io_stripe io_stripe = { 0 };
+ u64 map_type = RST_TEST_RAID1_TYPE;
+ u64 logical = SZ_1M;
+ u64 len = SZ_64K;
+ int ret;
+
+ bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+ if (!bioc) {
+ test_std_err(TEST_ALLOC_IO_CONTEXT);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ bioc->map_type = map_type;
+ bioc->size = SZ_64K;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+ stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+ if (!stripe->dev) {
+ test_err("cannot find device with devid %d", i);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ stripe->physical = logical + i * SZ_1G;
+ }
+
+ ret = btrfs_insert_one_raid_extent(trans, bioc);
+ if (ret) {
+ test_err("inserting RAID extent failed: %d", ret);
+ goto out;
+ }
+
+ io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+ if (!io_stripe.dev) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+ if (ret) {
+ test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+ goto out;
+ }
+
+ if (io_stripe.physical != logical) {
+ test_err("invalid physical address, expected %llu got %llu",
+ logical, io_stripe.physical);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (len != SZ_64K) {
+ test_err("invalid stripe length, expected %llu got %llu",
+ (u64)SZ_64K, len);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = btrfs_delete_raid_extent(trans, logical, len);
+ if (ret)
+ test_err("deleting RAID extent [%llu, %llu] failed", logical,
+ logical + len);
+
+out:
+ btrfs_put_bioc(bioc);
+ return ret;
+}
+
+static const test_func_t tests[] = {
+ test_simple_create_delete,
+ test_create_update_delete,
+ test_tail_delete,
+ test_front_delete,
+};
+
+static int run_test(test_func_t test, u32 sectorsize, u32 nodesize)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_root *root = NULL;
+ int ret;
+
+ fs_info = btrfs_alloc_dummy_fs_info(sectorsize, nodesize);
+ if (!fs_info) {
+ test_std_err(TEST_ALLOC_FS_INFO);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info);
+ if (IS_ERR(root)) {
+ test_std_err(TEST_ALLOC_ROOT);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
+ BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE);
+ root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+ fs_info->stripe_root = root;
+ root->fs_info->tree_root = root;
+
+ root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
+ if (IS_ERR(root->node)) {
+ test_std_err(TEST_ALLOC_EXTENT_BUFFER);
+ ret = PTR_ERR(root->node);
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
+ root->alloc_bytenr += 2 * nodesize;
+
+ for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+ struct btrfs_device *dev;
+
+ dev = btrfs_alloc_dummy_device(fs_info);
+ if (IS_ERR(dev)) {
+ test_err("cannot allocate device");
+ ret = PTR_ERR(dev);
+ goto out;
+ }
+ dev->devid = i;
+ }
+
+ btrfs_init_dummy_trans(&trans, root->fs_info);
+ ret = test(&trans);
+ if (ret)
+ goto out;
+
+out:
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
+
+ return ret;
+}
+
+int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize)
+{
+ int ret = 0;
+
+ test_msg("running raid-stripe-tree tests");
+ for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+ ret = run_test(tests[i], sectorsize, nodesize);
+ if (ret) {
+ test_err("test-case %ps failed with %d\n", tests[i], ret);
+ goto out;
+ }
+ }
+
+out:
+ return ret;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0fc873a..dc0b837 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -141,8 +141,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
- WARN_ON(!RB_EMPTY_ROOT(
- &transaction->delayed_refs.href_root.rb_root));
+ WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs));
WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
@@ -349,9 +348,8 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
- cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
+ xa_init(&cur_trans->delayed_refs.head_refs);
xa_init(&cur_trans->delayed_refs.dirty_extents);
- atomic_set(&cur_trans->delayed_refs.num_entries, 0);
/*
* although the tree mod log is per file system and not per transaction,
@@ -2052,7 +2050,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
spin_unlock(&fs_info->trans_lock);
- btrfs_cleanup_one_transaction(trans->transaction, fs_info);
+ btrfs_cleanup_one_transaction(trans->transaction);
spin_lock(&fs_info->trans_lock);
if (cur_trans == fs_info->running_transaction)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index dd9ce9b..184fa5c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -33,7 +33,7 @@ struct btrfs_path;
*/
#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
-/* Radix-tree tag for roots that are part of the trasaction. */
+/* Radix-tree tag for roots that are part of the transaction. */
#define BTRFS_ROOT_TRANS_TAG 0
enum btrfs_trans_state {
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7b50263..148d8ce 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -2183,8 +2183,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
return 0;
}
-int btrfs_verify_level_key(struct extent_buffer *eb, int level,
- struct btrfs_key *first_key, u64 parent_transid)
+int btrfs_verify_level_key(struct extent_buffer *eb,
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int found_level;
@@ -2192,16 +2192,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
int ret;
found_level = btrfs_header_level(eb);
- if (found_level != level) {
+ if (found_level != check->level) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: tree level check failed\n");
btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
- eb->start, level, found_level);
+ eb->start, check->level, found_level);
return -EIO;
}
- if (!first_key)
+ if (!check->has_first_key)
return 0;
/*
@@ -2226,15 +2226,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
btrfs_node_key_to_cpu(eb, &found_key, 0);
else
btrfs_item_key_to_cpu(eb, &found_key, 0);
- ret = btrfs_comp_cpu_keys(first_key, &found_key);
+ ret = btrfs_comp_cpu_keys(&check->first_key, &found_key);
if (ret) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: tree first key check failed\n");
btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
- eb->start, parent_transid, first_key->objectid,
- first_key->type, first_key->offset,
+ eb->start, check->transid, check->first_key.objectid,
+ check->first_key.type, check->first_key.offset,
found_key.objectid, found_key.type,
found_key.offset);
}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 01669cf..db67f96 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -69,7 +69,7 @@ int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
-int btrfs_verify_level_key(struct extent_buffer *eb, int level,
- struct btrfs_key *first_key, u64 parent_transid);
+int btrfs_verify_level_key(struct extent_buffer *eb,
+ const struct btrfs_tree_parent_check *check);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9637c7c..c8d6587 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -6204,7 +6204,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
const struct list_head *delayed_del_list,
const struct btrfs_delayed_item *first,
const struct btrfs_delayed_item **last_ret)
@@ -6265,7 +6264,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
if (ret < 0) {
return ret;
} else if (ret == 0) {
- ret = batch_delete_dir_index_items(trans, inode, path, ctx,
+ ret = batch_delete_dir_index_items(trans, inode, path,
delayed_del_list, curr,
&last);
if (ret)
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index b382a4c..1ac2678 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -909,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
* is freed (its refcount is decremented).
*/
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
struct extent_buffer *eb,
u64 time_seq)
{
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
index 6308c57..1c12566 100644
--- a/fs/btrfs/tree-mod-log.h
+++ b/fs/btrfs/tree-mod-log.h
@@ -41,7 +41,6 @@ int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op);
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
struct extent_buffer *eb,
u64 time_seq);
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index eb51b60..1cccaf9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -733,6 +733,114 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
}
/*
+ * We can have very weird soft links passed in.
+ * One example is "/proc/self/fd/<fd>", which can be a soft link to
+ * a block device.
+ *
+ * But it's never a good idea to use those weird names.
+ * Here we check if the path (not following symlinks) is a good one inside
+ * "/dev/".
+ */
+static bool is_good_dev_path(const char *dev_path)
+{
+ struct path path = { .mnt = NULL, .dentry = NULL };
+ char *path_buf = NULL;
+ char *resolved_path;
+ bool is_good = false;
+ int ret;
+
+ if (!dev_path)
+ goto out;
+
+ path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!path_buf)
+ goto out;
+
+ /*
+ * Do not follow soft link, just check if the original path is inside
+ * "/dev/".
+ */
+ ret = kern_path(dev_path, 0, &path);
+ if (ret)
+ goto out;
+ resolved_path = d_path(&path, path_buf, PATH_MAX);
+ if (IS_ERR(resolved_path))
+ goto out;
+ if (strncmp(resolved_path, "/dev/", strlen("/dev/")))
+ goto out;
+ is_good = true;
+out:
+ kfree(path_buf);
+ path_put(&path);
+ return is_good;
+}
+
+static int get_canonical_dev_path(const char *dev_path, char *canonical)
+{
+ struct path path = { .mnt = NULL, .dentry = NULL };
+ char *path_buf = NULL;
+ char *resolved_path;
+ int ret;
+
+ if (!dev_path) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!path_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = kern_path(dev_path, LOOKUP_FOLLOW, &path);
+ if (ret)
+ goto out;
+ resolved_path = d_path(&path, path_buf, PATH_MAX);
+ ret = strscpy(canonical, resolved_path, PATH_MAX);
+out:
+ kfree(path_buf);
+ path_put(&path);
+ return ret;
+}
+
+static bool is_same_device(struct btrfs_device *device, const char *new_path)
+{
+ struct path old = { .mnt = NULL, .dentry = NULL };
+ struct path new = { .mnt = NULL, .dentry = NULL };
+ char *old_path = NULL;
+ bool is_same = false;
+ int ret;
+
+ if (!device->name)
+ goto out;
+
+ old_path = kzalloc(PATH_MAX, GFP_NOFS);
+ if (!old_path)
+ goto out;
+
+ rcu_read_lock();
+ ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
+ rcu_read_unlock();
+ if (ret < 0)
+ goto out;
+
+ ret = kern_path(old_path, LOOKUP_FOLLOW, &old);
+ if (ret)
+ goto out;
+ ret = kern_path(new_path, LOOKUP_FOLLOW, &new);
+ if (ret)
+ goto out;
+ if (path_equal(&old, &new))
+ is_same = true;
+out:
+ kfree(old_path);
+ path_put(&old);
+ path_put(&new);
+ return is_same;
+}
+
+/*
* Add new device to list of registered devices
*
* Returns:
@@ -852,7 +960,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
- } else if (!device->name || strcmp(device->name->str, path)) {
+ } else if (!device->name || !is_same_device(device, path)) {
/*
* When FS is already mounted.
* 1. If you are here and if the device->name is NULL that
@@ -1383,12 +1491,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct file *bdev_file;
+ char *canonical_path = NULL;
u64 bytenr;
dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
+ if (!is_good_dev_path(path)) {
+ canonical_path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (canonical_path) {
+ ret = get_canonical_dev_path(path, canonical_path);
+ if (ret < 0) {
+ kfree(canonical_path);
+ canonical_path = NULL;
+ }
+ }
+ }
/*
* Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command,
@@ -1433,7 +1552,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
goto free_disk_super;
}
- device = device_list_add(path, disk_super, &new_device_added);
+ device = device_list_add(canonical_path ? : path, disk_super,
+ &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
@@ -1442,6 +1562,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
error_bdev_put:
fput(bdev_file);
+ kfree(canonical_path);
return device;
}
@@ -2721,8 +2842,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
- btrfs_clear_sb_rdonly(sb);
-
/* GFP_KERNEL allocation must not be under device_list_mutex */
seed_devices = btrfs_init_sprout(fs_info);
if (IS_ERR(seed_devices)) {
@@ -2865,8 +2984,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
- if (seeding_dev)
- btrfs_set_sb_rdonly(sb);
if (trans)
btrfs_end_transaction(trans);
error_free_zone:
@@ -5310,7 +5427,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
- /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
+ /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
ctl->stripe_size) + ctl->nparity,
@@ -5842,24 +5959,6 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
-int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
-{
- struct btrfs_chunk_map *map;
- int ret = 0;
-
- if (!btrfs_fs_incompat(fs_info, RAID56))
- return 0;
-
- map = btrfs_get_chunk_map(fs_info, logical, len);
-
- if (!WARN_ON(IS_ERR(map))) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- ret = 1;
- btrfs_free_chunk_map(map);
- }
- return ret;
-}
-
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
@@ -5920,9 +6019,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
return preferred_mirror;
}
-static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
- u64 logical,
- u16 total_stripes)
+EXPORT_FOR_TESTS
+struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical, u16 total_stripes)
{
struct btrfs_io_context *bioc;
@@ -6481,13 +6580,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
max_len = btrfs_max_io_len(map, map_offset, &io_geom);
*length = min_t(u64, map->chunk_len - map_offset, max_len);
- down_read(&dev_replace->rwsem);
+ if (dev_replace->replace_task != current)
+ down_read(&dev_replace->rwsem);
+
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
/*
* Hold the semaphore for read during the whole operation, write is
* requested at commit time but must wait.
*/
- if (!dev_replace_is_ongoing)
+ if (!dev_replace_is_ongoing && dev_replace->replace_task != current)
up_read(&dev_replace->rwsem);
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
@@ -6627,7 +6728,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
bioc->mirror_num = io_geom.mirror_num;
out:
- if (dev_replace_is_ongoing) {
+ if (dev_replace_is_ongoing && dev_replace->replace_task != current) {
lockdep_assert_held(&dev_replace->rwsem);
/* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4481575..3a416b1 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -306,7 +306,7 @@ enum btrfs_read_policy {
BTRFS_NR_READ_POLICY,
};
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/*
* Checksum mode - offload it to workqueues or do it synchronously in
* btrfs_submit_chunk().
@@ -430,7 +430,7 @@ struct btrfs_fs_devices {
/* Policy used to read the mirrored stripes. */
enum btrfs_read_policy read_policy;
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Checksum mode - offload it or do it synchronously. */
enum btrfs_offload_csum_mode offload_csum_mode;
#endif
@@ -741,8 +741,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
-int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
- u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
@@ -840,4 +838,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical, u16 total_stripes);
+#endif
+
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ce464cd8..bc18710 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
{
struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
size_t name_len = strlen(name);
int ret = 0;
@@ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
*/
ret = 0;
btrfs_assert_tree_write_locked(path->nodes[0]);
- di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ di = btrfs_match_dir_item_name(path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
goto out;
}
} else if (ret == -EEXIST) {
ret = 0;
- di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ di = btrfs_match_dir_item_name(path, name, name_len);
ASSERT(di); /* logic error */
} else if (ret) {
goto out;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 100abc0..ddf0d5a 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
pg_off = offset_in_page(start);
cur_len = btrfs_calc_input_length(orig_end, start);
data_in = kmap_local_folio(in_folio, pg_off);
- start += PAGE_SIZE;
+ start += cur_len;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = cur_len;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 69d03fe..11ed523 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -707,11 +707,14 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
* zoned mode. In this case, we don't have a valid max zone
* append size.
*/
- if (bdev_is_zoned(device->bdev)) {
- blk_stack_limits(lim,
- &bdev_get_queue(device->bdev)->limits,
- 0);
- }
+ if (bdev_is_zoned(device->bdev))
+ blk_stack_limits(lim, bdev_limits(device->bdev), 0);
+ }
+
+ ret = blk_validate_limits(lim);
+ if (ret) {
+ btrfs_err(fs_info, "zoned: failed to validate queue limits");
+ return ret;
}
/*
@@ -1739,7 +1742,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
return false;
/*
- * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
+ * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
* extent layout the relocation code has.
* Furthermore we have set aside own block-group from which only the
* relocation "process" can allocate and make sure only one process at a
@@ -1973,7 +1976,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
if (block_group->meta_write_pointer > eb->start)
return -EBUSY;
- /* If for_sync, this hole will be filled with trasnsaction commit. */
+ /* If for_sync, this hole will be filled with transaction commit. */
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
return -EAGAIN;
return -EBUSY;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 866607f..5232b56 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
+ ASSERT(timer == &wsm.timer);
+
spin_lock(&wsm.lock);
if (list_empty(&wsm.lru_list)) {
@@ -495,7 +497,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
- tot_in += PAGE_SIZE;
+ tot_in += workspace->in_buf.size;
kunmap_local(workspace->in_buf.src);
workspace->in_buf.src = NULL;
folio_put(in_folio);
diff --git a/fs/buffer.c b/fs/buffer.c
index 1fc9a50..bb4a31b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1649,6 +1649,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
if (length == folio_size(folio))
filemap_release_folio(folio, 0);
out:
+ folio_clear_mappedtodisk(folio);
return;
}
EXPORT_SYMBOL(block_invalidate_folio);
@@ -2803,7 +2804,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_write_hint = write_hint;
- __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+ bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
@@ -2813,7 +2814,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
if (wbc) {
wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
+ wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
}
submit_bio(bio);
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 35ba211..3e63cfe 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -327,6 +327,8 @@ static void cachefiles_commit_object(struct cachefiles_object *object,
static void cachefiles_clean_up_object(struct cachefiles_object *object,
struct cachefiles_cache *cache)
{
+ struct file *file;
+
if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) {
if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
cachefiles_see_object(object, cachefiles_obj_see_clean_delete);
@@ -342,10 +344,14 @@ static void cachefiles_clean_up_object(struct cachefiles_object *object,
}
cachefiles_unmark_inode_in_use(object, object->file);
- if (object->file) {
- fput(object->file);
- object->file = NULL;
- }
+
+ spin_lock(&object->lock);
+ file = object->file;
+ object->file = NULL;
+ spin_unlock(&object->lock);
+
+ if (file)
+ fput(file);
}
/*
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 2b3f993..7cf5971 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -691,11 +691,6 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
}
if (!d_is_negative(dentry)) {
- if (d_backing_inode(dentry) == file_inode(object->file)) {
- success = true;
- goto out_dput;
- }
-
ret = cachefiles_unlink(volume->cache, object, fan, dentry,
FSCACHE_OBJECT_IS_STALE);
if (ret < 0)
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 470c966..fe3de9a 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -60,26 +60,36 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
{
struct cachefiles_object *object = kiocb->ki_filp->private_data;
struct cachefiles_cache *cache = object->volume->cache;
- struct file *file = object->file;
- size_t len = iter->count;
+ struct file *file;
+ size_t len = iter->count, aligned_len = len;
loff_t pos = kiocb->ki_pos;
const struct cred *saved_cred;
int ret;
- if (!file)
+ spin_lock(&object->lock);
+ file = object->file;
+ if (!file) {
+ spin_unlock(&object->lock);
return -ENOBUFS;
+ }
+ get_file(file);
+ spin_unlock(&object->lock);
cachefiles_begin_secure(cache, &saved_cred);
- ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true);
+ ret = __cachefiles_prepare_write(object, file, &pos, &aligned_len, len, true);
cachefiles_end_secure(cache, saved_cred);
if (ret < 0)
- return ret;
+ goto out;
trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len);
ret = __cachefiles_write(object, file, pos, iter, NULL, NULL);
- if (!ret)
+ if (!ret) {
ret = len;
+ kiocb->ki_pos += ret;
+ }
+out:
+ fput(file);
return ret;
}
@@ -87,12 +97,22 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos,
int whence)
{
struct cachefiles_object *object = filp->private_data;
- struct file *file = object->file;
+ struct file *file;
+ loff_t ret;
- if (!file)
+ spin_lock(&object->lock);
+ file = object->file;
+ if (!file) {
+ spin_unlock(&object->lock);
return -ENOBUFS;
+ }
+ get_file(file);
+ spin_unlock(&object->lock);
- return vfs_llseek(file, pos, whence);
+ ret = vfs_llseek(file, pos, whence);
+ fput(file);
+
+ return ret;
}
static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c2a9e2c..4c82348 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1054,7 +1054,9 @@ static int ceph_writepages_start(struct address_space *mapping,
if (!nr_folios && !locked_pages)
break;
for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
- page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
+
+ page = &folio->page;
doutc(cl, "? %p idx %lu\n", page, page->index);
if (locked_pages == 0)
lock_page(page); /* first page */
@@ -1081,8 +1083,6 @@ static int ceph_writepages_start(struct address_space *mapping,
continue;
}
if (page_offset(page) >= ceph_wbc.i_size) {
- struct folio *folio = page_folio(page);
-
doutc(cl, "folio at %lu beyond eof %llu\n",
folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
@@ -1098,16 +1098,16 @@ static int ceph_writepages_start(struct address_space *mapping,
unlock_page(page);
break;
}
- if (PageWriteback(page) ||
- PagePrivate2(page) /* [DEPRECATED] */) {
+ if (folio_test_writeback(folio) ||
+ folio_test_private_2(folio) /* [DEPRECATED] */) {
if (wbc->sync_mode == WB_SYNC_NONE) {
- doutc(cl, "%p under writeback\n", page);
- unlock_page(page);
+ doutc(cl, "%p under writeback\n", folio);
+ folio_unlock(folio);
continue;
}
- doutc(cl, "waiting on writeback %p\n", page);
- wait_on_page_writeback(page);
- folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
+ doutc(cl, "waiting on writeback %p\n", folio);
+ folio_wait_writeback(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
}
if (!clear_page_dirty_for_io(page)) {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 57cc096..c2ddb99 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -562,8 +562,8 @@ int cdev_device_add(struct cdev *cdev, struct device *dev)
/**
* cdev_device_del() - inverse of cdev_device_add
- * @dev: the device structure
* @cdev: the cdev structure
+ * @dev: the device structure
*
* cdev_device_del() is a helper function to call cdev_del and device_del.
* It should be used whenever cdev_device_add is used.
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 8f0af4f..d5ef546 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -80,6 +80,16 @@
#define ELF_HWCAP2 COMPAT_ELF_HWCAP2
#endif
+#ifdef COMPAT_ELF_HWCAP3
+#undef ELF_HWCAP3
+#define ELF_HWCAP3 COMPAT_ELF_HWCAP3
+#endif
+
+#ifdef COMPAT_ELF_HWCAP4
+#undef ELF_HWCAP4
+#define ELF_HWCAP4 COMPAT_ELF_HWCAP4
+#endif
+
#ifdef COMPAT_ARCH_DLINFO
#undef ARCH_DLINFO
#define ARCH_DLINFO COMPAT_ARCH_DLINFO
diff --git a/fs/coredump.c b/fs/coredump.c
index 45737b4..d48edb3 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -951,6 +951,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
} else {
dump_skip(cprm, PAGE_SIZE);
}
+ cond_resched();
}
dump_page_free(dump_page);
return 1;
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 206835e..787e9c8 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -22,6 +22,7 @@
#include <crypto/skcipher.h>
#include <linux/key-type.h>
#include <linux/random.h>
+#include <linux/once.h>
#include <linux/seq_file.h>
#include "fscrypt_private.h"
diff --git a/fs/dcache.c b/fs/dcache.c
index 0f6b16b..0099077 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -135,6 +135,7 @@ struct dentry_stat_t {
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);
+static int dentry_negative_policy;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
/* Statistics gathering. */
@@ -199,6 +200,15 @@ static struct ctl_table fs_dcache_sysctls[] = {
.mode = 0444,
.proc_handler = proc_nr_dentry,
},
+ {
+ .procname = "dentry-negative",
+ .data = &dentry_negative_policy,
+ .maxlen = sizeof(dentry_negative_policy),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
};
static int __init init_fs_dcache_sysctls(void)
@@ -2039,8 +2049,8 @@ EXPORT_SYMBOL(d_obtain_root);
/**
* d_add_ci - lookup or allocate new dentry with case-exact name
- * @inode: the inode case-insensitive lookup has found
* @dentry: the negative dentry that was passed to the parent's lookup func
+ * @inode: the inode case-insensitive lookup has found
* @name: the case-exact name to be associated with the returned dentry
*
* This is to avoid filling the dcache with case-insensitive names to the
@@ -2093,8 +2103,8 @@ EXPORT_SYMBOL(d_add_ci);
/**
* d_same_name - compare dentry name with case-exact name
- * @parent: parent dentry
* @dentry: the negative dentry that was passed to the parent's lookup func
+ * @parent: parent dentry
* @name: the case-exact name to be associated with the returned dentry
*
* Return: true if names are same, or false
@@ -2401,6 +2411,8 @@ void d_delete(struct dentry * dentry)
* Are we the only user?
*/
if (dentry->d_lockref.count == 1) {
+ if (dentry_negative_policy)
+ __d_drop(dentry);
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
dentry_unlink_inode(dentry);
} else {
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 8272785..69536ca 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -328,10 +328,10 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
* Convert an eCryptfs page index into a lower byte offset
*/
static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *page)
+ struct folio *folio)
{
return ecryptfs_lower_header_size(crypt_stat) +
- ((loff_t)page->index << PAGE_SHIFT);
+ (loff_t)folio->index * PAGE_SIZE;
}
/**
@@ -340,6 +340,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
* encryption operation
* @dst_page: The page to write the result into
* @src_page: The page to read from
+ * @page_index: The offset in the file (in units of PAGE_SIZE)
* @extent_offset: Page extent offset for use in generating IV
* @op: ENCRYPT or DECRYPT to indicate the desired operation
*
@@ -350,9 +351,9 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
struct page *dst_page,
struct page *src_page,
+ pgoff_t page_index,
unsigned long extent_offset, int op)
{
- pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
loff_t extent_base;
char extent_iv[ECRYPTFS_MAX_IV_BYTES];
struct scatterlist src_sg, dst_sg;
@@ -392,7 +393,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
/**
* ecryptfs_encrypt_page
- * @page: Page mapped from the eCryptfs inode for the file; contains
+ * @folio: Folio mapped from the eCryptfs inode for the file; contains
* decrypted content that needs to be encrypted (to a temporary
* page; not in place) and written out to the lower file
*
@@ -406,7 +407,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
*
* Returns zero on success; negative on error
*/
-int ecryptfs_encrypt_page(struct page *page)
+int ecryptfs_encrypt_page(struct folio *folio)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
@@ -416,7 +417,7 @@ int ecryptfs_encrypt_page(struct page *page)
loff_t lower_offset;
int rc = 0;
- ecryptfs_inode = page->mapping->host;
+ ecryptfs_inode = folio->mapping->host;
crypt_stat =
&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
@@ -431,8 +432,9 @@ int ecryptfs_encrypt_page(struct page *page)
for (extent_offset = 0;
extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- rc = crypt_extent(crypt_stat, enc_extent_page, page,
- extent_offset, ENCRYPT);
+ rc = crypt_extent(crypt_stat, enc_extent_page,
+ folio_page(folio, 0), folio->index,
+ extent_offset, ENCRYPT);
if (rc) {
printk(KERN_ERR "%s: Error encrypting extent; "
"rc = [%d]\n", __func__, rc);
@@ -440,7 +442,7 @@ int ecryptfs_encrypt_page(struct page *page)
}
}
- lower_offset = lower_offset_for_page(crypt_stat, page);
+ lower_offset = lower_offset_for_page(crypt_stat, folio);
enc_extent_virt = kmap_local_page(enc_extent_page);
rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
PAGE_SIZE);
@@ -461,7 +463,7 @@ int ecryptfs_encrypt_page(struct page *page)
/**
* ecryptfs_decrypt_page
- * @page: Page mapped from the eCryptfs inode for the file; data read
+ * @folio: Folio mapped from the eCryptfs inode for the file; data read
* and decrypted from the lower file will be written into this
* page
*
@@ -475,7 +477,7 @@ int ecryptfs_encrypt_page(struct page *page)
*
* Returns zero on success; negative on error
*/
-int ecryptfs_decrypt_page(struct page *page)
+int ecryptfs_decrypt_page(struct folio *folio)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
@@ -484,13 +486,13 @@ int ecryptfs_decrypt_page(struct page *page)
loff_t lower_offset;
int rc = 0;
- ecryptfs_inode = page->mapping->host;
+ ecryptfs_inode = folio->mapping->host;
crypt_stat =
&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
- lower_offset = lower_offset_for_page(crypt_stat, page);
- page_virt = kmap_local_page(page);
+ lower_offset = lower_offset_for_page(crypt_stat, folio);
+ page_virt = kmap_local_folio(folio, 0);
rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE,
ecryptfs_inode);
kunmap_local(page_virt);
@@ -504,8 +506,9 @@ int ecryptfs_decrypt_page(struct page *page)
for (extent_offset = 0;
extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- rc = crypt_extent(crypt_stat, page, page,
- extent_offset, DECRYPT);
+ struct page *page = folio_page(folio, 0);
+ rc = crypt_extent(crypt_stat, page, page, folio->index,
+ extent_offset, DECRYPT);
if (rc) {
printk(KERN_ERR "%s: Error decrypting extent; "
"rc = [%d]\n", __func__, rc);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c586c5d..1f562e7 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -569,8 +569,8 @@ void ecryptfs_destroy_mount_crypt_stat(
struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode);
-int ecryptfs_encrypt_page(struct page *page);
-int ecryptfs_decrypt_page(struct page *page);
+int ecryptfs_encrypt_page(struct folio *folio);
+int ecryptfs_decrypt_page(struct folio *folio);
int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
struct inode *ecryptfs_inode);
int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
@@ -653,16 +653,15 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
loff_t offset, size_t size);
int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
- struct page *page_for_lower,
+ struct folio *folio_for_lower,
size_t offset_in_page, size_t size);
int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
struct inode *ecryptfs_inode);
-int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
+int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs,
pgoff_t page_index,
size_t offset_in_page, size_t size,
struct inode *ecryptfs_inode);
-struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
size_t *length_size);
int ecryptfs_write_packet_length(char *dest, size_t size,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index cbdf82f..a9819dd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1008,14 +1008,6 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap,
return rc;
}
-static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
-{
- if (flags & AT_GETATTR_NOSEC)
- return vfs_getattr_nosec(path, stat, request_mask, flags);
- return vfs_getattr(path, stat, request_mask, flags);
-}
-
static int ecryptfs_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
@@ -1024,8 +1016,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
struct kstat lower_stat;
int rc;
- rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry),
- &lower_stat, request_mask, flags);
+ rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry),
+ &lower_stat, request_mask, flags);
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index ceda555..60f0ac8 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -23,47 +23,29 @@
#include "ecryptfs_kernel.h"
/*
- * ecryptfs_get_locked_page
- *
- * Get one page from cache or lower f/s, return error otherwise.
- *
- * Returns locked and up-to-date page (if ok), with increased
- * refcnt.
- */
-struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
-{
- struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
- if (!IS_ERR(page))
- lock_page(page);
- return page;
-}
-
-/**
- * ecryptfs_writepage
- * @page: Page that is locked before this call is made
- * @wbc: Write-back control structure
- *
- * Returns zero on success; non-zero otherwise
- *
* This is where we encrypt the data and pass the encrypted data to
* the lower filesystem. In OpenPGP-compatible mode, we operate on
* entire underlying packets.
*/
-static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
+static int ecryptfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- int rc;
+ struct folio *folio = NULL;
+ int error;
- rc = ecryptfs_encrypt_page(page);
- if (rc) {
- ecryptfs_printk(KERN_WARNING, "Error encrypting "
- "page (upper index [0x%.16lx])\n", page->index);
- ClearPageUptodate(page);
- goto out;
+ while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
+ error = ecryptfs_encrypt_page(folio);
+ if (error) {
+ ecryptfs_printk(KERN_WARNING,
+ "Error encrypting folio (index [0x%.16lx])\n",
+ folio->index);
+ folio_clear_uptodate(folio);
+ mapping_set_error(mapping, error);
+ }
+ folio_unlock(folio);
}
- SetPageUptodate(page);
-out:
- unlock_page(page);
- return rc;
+
+ return error;
}
static void strip_xattr_flag(char *page_virt,
@@ -97,7 +79,7 @@ static void strip_xattr_flag(char *page_virt,
/**
* ecryptfs_copy_up_encrypted_with_header
- * @page: Sort of a ``virtual'' representation of the encrypted lower
+ * @folio: Sort of a ``virtual'' representation of the encrypted lower
* file. The actual lower file does not have the metadata in
* the header. This is locked.
* @crypt_stat: The eCryptfs inode's cryptographic context
@@ -106,7 +88,7 @@ static void strip_xattr_flag(char *page_virt,
* seeing, with the header information inserted.
*/
static int
-ecryptfs_copy_up_encrypted_with_header(struct page *page,
+ecryptfs_copy_up_encrypted_with_header(struct folio *folio,
struct ecryptfs_crypt_stat *crypt_stat)
{
loff_t extent_num_in_page = 0;
@@ -115,9 +97,9 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
int rc = 0;
while (extent_num_in_page < num_extents_per_page) {
- loff_t view_extent_num = ((((loff_t)page->index)
+ loff_t view_extent_num = ((loff_t)folio->index
* num_extents_per_page)
- + extent_num_in_page);
+ + extent_num_in_page;
size_t num_header_extents_at_front =
(crypt_stat->metadata_size / crypt_stat->extent_size);
@@ -125,21 +107,21 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
/* This is a header extent */
char *page_virt;
- page_virt = kmap_local_page(page);
+ page_virt = kmap_local_folio(folio, 0);
memset(page_virt, 0, PAGE_SIZE);
/* TODO: Support more than one header extent */
if (view_extent_num == 0) {
size_t written;
rc = ecryptfs_read_xattr_region(
- page_virt, page->mapping->host);
+ page_virt, folio->mapping->host);
strip_xattr_flag(page_virt + 16, crypt_stat);
ecryptfs_write_header_metadata(page_virt + 20,
crypt_stat,
&written);
}
kunmap_local(page_virt);
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
if (rc) {
printk(KERN_ERR "%s: Error reading xattr "
"region; rc = [%d]\n", __func__, rc);
@@ -152,9 +134,9 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
- crypt_stat->metadata_size);
rc = ecryptfs_read_lower_page_segment(
- page, (lower_offset >> PAGE_SHIFT),
+ folio, (lower_offset >> PAGE_SHIFT),
(lower_offset & ~PAGE_MASK),
- crypt_stat->extent_size, page->mapping->host);
+ crypt_stat->extent_size, folio->mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
"extent at offset [%lld] in the lower "
@@ -180,55 +162,50 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
*/
static int ecryptfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
+ struct inode *inode = folio->mapping->host;
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
- int rc = 0;
+ &ecryptfs_inode_to_private(inode)->crypt_stat;
+ int err = 0;
if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
- rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
- PAGE_SIZE,
- page->mapping->host);
+ err = ecryptfs_read_lower_page_segment(folio, folio->index, 0,
+ folio_size(folio), inode);
} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
- rc = ecryptfs_copy_up_encrypted_with_header(page,
- crypt_stat);
- if (rc) {
+ err = ecryptfs_copy_up_encrypted_with_header(folio,
+ crypt_stat);
+ if (err) {
printk(KERN_ERR "%s: Error attempting to copy "
"the encrypted content from the lower "
"file whilst inserting the metadata "
- "from the xattr into the header; rc = "
- "[%d]\n", __func__, rc);
+ "from the xattr into the header; err = "
+ "[%d]\n", __func__, err);
goto out;
}
} else {
- rc = ecryptfs_read_lower_page_segment(
- page, page->index, 0, PAGE_SIZE,
- page->mapping->host);
- if (rc) {
- printk(KERN_ERR "Error reading page; rc = "
- "[%d]\n", rc);
+ err = ecryptfs_read_lower_page_segment(folio,
+ folio->index, 0, folio_size(folio),
+ inode);
+ if (err) {
+ printk(KERN_ERR "Error reading page; err = "
+ "[%d]\n", err);
goto out;
}
}
} else {
- rc = ecryptfs_decrypt_page(page);
- if (rc) {
+ err = ecryptfs_decrypt_page(folio);
+ if (err) {
ecryptfs_printk(KERN_ERR, "Error decrypting page; "
- "rc = [%d]\n", rc);
+ "err = [%d]\n", err);
goto out;
}
}
out:
- if (rc)
- ClearPageUptodate(page);
- else
- SetPageUptodate(page);
- ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
- page->index);
- unlock_page(page);
- return rc;
+ ecryptfs_printk(KERN_DEBUG, "Unlocking folio with index = [0x%.16lx]\n",
+ folio->index);
+ folio_end_read(folio, err == 0);
+ return err;
}
/*
@@ -285,7 +262,7 @@ static int ecryptfs_write_begin(struct file *file,
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_read_lower_page_segment(
- &folio->page, index, 0, PAGE_SIZE, mapping->host);
+ folio, index, 0, PAGE_SIZE, mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
"lower page segment; rc = [%d]\n",
@@ -297,7 +274,7 @@ static int ecryptfs_write_begin(struct file *file,
} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
rc = ecryptfs_copy_up_encrypted_with_header(
- &folio->page, crypt_stat);
+ folio, crypt_stat);
if (rc) {
printk(KERN_ERR "%s: Error attempting "
"to copy the encrypted content "
@@ -311,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
folio_mark_uptodate(folio);
} else {
rc = ecryptfs_read_lower_page_segment(
- &folio->page, index, 0, PAGE_SIZE,
+ folio, index, 0, PAGE_SIZE,
mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error reading "
@@ -328,7 +305,7 @@ static int ecryptfs_write_begin(struct file *file,
folio_zero_range(folio, 0, PAGE_SIZE);
folio_mark_uptodate(folio);
} else if (len < PAGE_SIZE) {
- rc = ecryptfs_decrypt_page(&folio->page);
+ rc = ecryptfs_decrypt_page(folio);
if (rc) {
printk(KERN_ERR "%s: Error decrypting "
"page at index [%ld]; "
@@ -477,7 +454,7 @@ static int ecryptfs_write_end(struct file *file,
"(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
- &folio->page, 0, to);
+ folio, 0, to);
if (!rc) {
rc = copied;
fsstack_copy_inode_size(ecryptfs_inode,
@@ -499,7 +476,7 @@ static int ecryptfs_write_end(struct file *file,
"zeros in page with index = [0x%.16lx]\n", index);
goto out;
}
- rc = ecryptfs_encrypt_page(&folio->page);
+ rc = ecryptfs_encrypt_page(folio);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
"index [0x%.16lx])\n", index);
@@ -548,9 +525,10 @@ const struct address_space_operations ecryptfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
#endif
- .writepage = ecryptfs_writepage,
+ .writepages = ecryptfs_writepages,
.read_folio = ecryptfs_read_folio,
.write_begin = ecryptfs_write_begin,
.write_end = ecryptfs_write_end,
+ .migrate_folio = filemap_migrate_folio,
.bmap = ecryptfs_bmap,
};
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 3458f15..b3b451c 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -41,30 +41,29 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
/**
* ecryptfs_write_lower_page_segment
* @ecryptfs_inode: The eCryptfs inode
- * @page_for_lower: The page containing the data to be written to the
+ * @folio_for_lower: The folio containing the data to be written to the
* lower file
- * @offset_in_page: The offset in the @page_for_lower from which to
+ * @offset_in_page: The offset in the @folio_for_lower from which to
* start writing the data
- * @size: The amount of data from @page_for_lower to write to the
+ * @size: The amount of data from @folio_for_lower to write to the
* lower file
*
* Determines the byte offset in the file for the given page and
* offset within the page, maps the page, and makes the call to write
- * the contents of @page_for_lower to the lower inode.
+ * the contents of @folio_for_lower to the lower inode.
*
* Returns zero on success; non-zero otherwise
*/
int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
- struct page *page_for_lower,
+ struct folio *folio_for_lower,
size_t offset_in_page, size_t size)
{
char *virt;
loff_t offset;
int rc;
- offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT)
- + offset_in_page);
- virt = kmap_local_page(page_for_lower);
+ offset = (loff_t)folio_for_lower->index * PAGE_SIZE + offset_in_page;
+ virt = kmap_local_folio(folio_for_lower, 0);
rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
if (rc > 0)
rc = 0;
@@ -93,7 +92,6 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
size_t size)
{
- struct page *ecryptfs_page;
struct ecryptfs_crypt_stat *crypt_stat;
char *ecryptfs_page_virt;
loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
@@ -111,6 +109,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
else
pos = offset;
while (pos < (offset + size)) {
+ struct folio *ecryptfs_folio;
pgoff_t ecryptfs_page_idx = (pos >> PAGE_SHIFT);
size_t start_offset_in_page = (pos & ~PAGE_MASK);
size_t num_bytes = (PAGE_SIZE - start_offset_in_page);
@@ -130,17 +129,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
if (num_bytes > total_remaining_zeros)
num_bytes = total_remaining_zeros;
}
- ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
- ecryptfs_page_idx);
- if (IS_ERR(ecryptfs_page)) {
- rc = PTR_ERR(ecryptfs_page);
+ ecryptfs_folio = read_mapping_folio(ecryptfs_inode->i_mapping,
+ ecryptfs_page_idx, NULL);
+ if (IS_ERR(ecryptfs_folio)) {
+ rc = PTR_ERR(ecryptfs_folio);
printk(KERN_ERR "%s: Error getting page at "
"index [%ld] from eCryptfs inode "
"mapping; rc = [%d]\n", __func__,
ecryptfs_page_idx, rc);
goto out;
}
- ecryptfs_page_virt = kmap_local_page(ecryptfs_page);
+ folio_lock(ecryptfs_folio);
+ ecryptfs_page_virt = kmap_local_folio(ecryptfs_folio, 0);
/*
* pos: where we're now writing, offset: where the request was
@@ -164,17 +164,17 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
data_offset += num_bytes;
}
kunmap_local(ecryptfs_page_virt);
- flush_dcache_page(ecryptfs_page);
- SetPageUptodate(ecryptfs_page);
- unlock_page(ecryptfs_page);
+ flush_dcache_folio(ecryptfs_folio);
+ folio_mark_uptodate(ecryptfs_folio);
+ folio_unlock(ecryptfs_folio);
if (crypt_stat->flags & ECRYPTFS_ENCRYPTED)
- rc = ecryptfs_encrypt_page(ecryptfs_page);
+ rc = ecryptfs_encrypt_page(ecryptfs_folio);
else
rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
- ecryptfs_page,
+ ecryptfs_folio,
start_offset_in_page,
data_offset);
- put_page(ecryptfs_page);
+ folio_put(ecryptfs_folio);
if (rc) {
printk(KERN_ERR "%s: Error encrypting "
"page; rc = [%d]\n", __func__, rc);
@@ -228,7 +228,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
/**
* ecryptfs_read_lower_page_segment
- * @page_for_ecryptfs: The page into which data for eCryptfs will be
+ * @folio_for_ecryptfs: The folio into which data for eCryptfs will be
* written
* @page_index: Page index in @page_for_ecryptfs from which to start
* writing
@@ -243,7 +243,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
*
* Returns zero on success; non-zero otherwise
*/
-int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
+int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs,
pgoff_t page_index,
size_t offset_in_page, size_t size,
struct inode *ecryptfs_inode)
@@ -252,12 +252,12 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
loff_t offset;
int rc;
- offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page);
- virt = kmap_local_page(page_for_ecryptfs);
+ offset = (loff_t)page_index * PAGE_SIZE + offset_in_page;
+ virt = kmap_local_folio(folio_for_ecryptfs, 0);
rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
if (rc > 0)
rc = 0;
kunmap_local(virt);
- flush_dcache_page(page_for_ecryptfs);
+ flush_dcache_folio(folio_for_ecryptfs);
return rc;
}
diff --git a/fs/efs/super.c b/fs/efs/super.c
index e4421c1..c59086b 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -15,7 +15,6 @@
#include <linux/vfs.h>
#include <linux/blkdev.h>
#include <linux/fs_context.h>
-#include <linux/fs_parser.h>
#include "efs.h"
#include <linux/efs_vh.h>
#include <linux/efs_fs_sb.h>
@@ -49,15 +48,6 @@ static struct pt_types sgi_pt_types[] = {
{0, NULL}
};
-enum {
- Opt_explicit_open,
-};
-
-static const struct fs_parameter_spec efs_param_spec[] = {
- fsparam_flag ("explicit-open", Opt_explicit_open),
- {}
-};
-
/*
* File system definition and registration.
*/
@@ -67,7 +57,6 @@ static struct file_system_type efs_fs_type = {
.kill_sb = efs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
.init_fs_context = efs_init_fs_context,
- .parameters = efs_param_spec,
};
MODULE_ALIAS_FS("efs");
@@ -265,7 +254,8 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc)
if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
pr_err("device does not support %d byte blocks\n",
EFS_BLOCKSIZE);
- return -EINVAL;
+ return invalf(fc, "device does not support %d byte blocks\n",
+ EFS_BLOCKSIZE);
}
/* read the vh (volume header) block */
@@ -327,43 +317,22 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc)
return 0;
}
-static void efs_free_fc(struct fs_context *fc)
-{
- kfree(fc->fs_private);
-}
-
static int efs_get_tree(struct fs_context *fc)
{
return get_tree_bdev(fc, efs_fill_super);
}
-static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param)
-{
- int token;
- struct fs_parse_result result;
-
- token = fs_parse(fc, efs_param_spec, param, &result);
- if (token < 0)
- return token;
- return 0;
-}
-
static int efs_reconfigure(struct fs_context *fc)
{
sync_filesystem(fc->root->d_sb);
+ fc->sb_flags |= SB_RDONLY;
return 0;
}
-struct efs_context {
- unsigned long s_mount_opts;
-};
-
static const struct fs_context_operations efs_context_opts = {
- .parse_param = efs_parse_param,
.get_tree = efs_get_tree,
.reconfigure = efs_reconfigure,
- .free = efs_free_fc,
};
/*
@@ -371,12 +340,6 @@ static const struct fs_context_operations efs_context_opts = {
*/
static int efs_init_fs_context(struct fs_context *fc)
{
- struct efs_context *ctx;
-
- ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
- fc->fs_private = ctx;
fc->ops = &efs_context_opts;
return 0;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 22c934f..76129bf 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -347,13 +347,10 @@ EXPORT_SYMBOL_GPL(eventfd_fget);
*/
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
- struct eventfd_ctx *ctx;
- struct fd f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- ctx = eventfd_ctx_fileget(fd_file(f));
- fdput(f);
- return ctx;
+ return eventfd_ctx_fileget(fd_file(f));
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1ae4542..62433cb 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -823,7 +823,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
to_free = NULL;
head = file->f_ep;
if (head->first == &epi->fllink && !epi->fllink.next) {
- file->f_ep = NULL;
+ /* See eventpoll_release() for details. */
+ WRITE_ONCE(file->f_ep, NULL);
if (!is_file_epoll(file)) {
struct epitems_head *v;
v = container_of(head, struct epitems_head, epitems);
@@ -1002,7 +1003,7 @@ static struct file *epi_fget(const struct epitem *epi)
struct file *file;
file = epi->ffd.file;
- if (!atomic_long_inc_not_zero(&file->f_count))
+ if (!file_ref_get(&file->f_ref))
file = NULL;
return file;
}
@@ -1372,7 +1373,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up(&ep->wq);
+ if (sync)
+ wake_up_sync(&ep->wq);
+ else
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
@@ -1603,7 +1607,8 @@ static int attach_epitem(struct file *file, struct epitem *epi)
spin_unlock(&file->f_lock);
goto allocate;
}
- file->f_ep = head;
+ /* See eventpoll_release() for details. */
+ WRITE_ONCE(file->f_ep, head);
to_free = NULL;
}
hlist_add_head_rcu(&epi->fllink, file->f_ep);
@@ -2254,25 +2259,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
{
int error;
int full_check = 0;
- struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct eventpoll *tep = NULL;
- error = -EBADF;
- f = fdget(epfd);
- if (!fd_file(f))
- goto error_return;
+ CLASS(fd, f)(epfd);
+ if (fd_empty(f))
+ return -EBADF;
/* Get the "struct file *" for the target file */
- tf = fdget(fd);
- if (!fd_file(tf))
- goto error_fput;
+ CLASS(fd, tf)(fd);
+ if (fd_empty(tf))
+ return -EBADF;
/* The target file descriptor must support poll */
- error = -EPERM;
if (!file_can_poll(fd_file(tf)))
- goto error_tgt_fput;
+ return -EPERM;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
@@ -2391,12 +2393,6 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
loop_check_gen++;
mutex_unlock(&epnested_mutex);
}
-
- fdput(tf);
-error_fput:
- fdput(f);
-error_return:
-
return error;
}
@@ -2424,8 +2420,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
- int error;
- struct fd f;
struct eventpoll *ep;
/* The maximum number of event must be greater than zero */
@@ -2437,17 +2431,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
return -EFAULT;
/* Get the "struct file *" for the eventpoll file */
- f = fdget(epfd);
- if (!fd_file(f))
+ CLASS(fd, f)(epfd);
+ if (fd_empty(f))
return -EBADF;
/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
- error = -EINVAL;
if (!is_file_epoll(fd_file(f)))
- goto error_fput;
+ return -EINVAL;
/*
* At this point it is safe to assume that the "private_data" contains
@@ -2456,11 +2449,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
ep = fd_file(f)->private_data;
/* Time to fish for events ... */
- error = ep_poll(ep, events, maxevents, to);
-
-error_fput:
- fdput(f);
- return error;
+ return ep_poll(ep, events, maxevents, to);
}
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 591fb3f..8042ad8 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -550,7 +550,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
(ignore_locked ? REQ_RAHEAD : 0),
- ext4_end_bitmap_read);
+ ext4_end_bitmap_read,
+ ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO));
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -577,7 +578,6 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
if (!desc)
return -EFSCORRUPTED;
wait_on_buffer(bh);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
if (!buffer_uptodate(bh)) {
ext4_error_err(sb, EIO, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ef6a3c8..02d47a6 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -418,7 +418,7 @@ struct fname {
__u32 inode;
__u8 name_len;
__u8 file_type;
- char name[];
+ char name[] __counted_by(name_len);
};
/*
@@ -471,14 +471,13 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
struct rb_node **p, *parent = NULL;
struct fname *fname, *new_fn;
struct dir_private_info *info;
- int len;
info = dir_file->private_data;
p = &info->root.rb_node;
/* Create and allocate the fname structure */
- len = sizeof(struct fname) + ent_name->len + 1;
- new_fn = kzalloc(len, GFP_KERNEL);
+ new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1),
+ GFP_KERNEL);
if (!new_fn)
return -ENOMEM;
new_fn->hash = hash;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 44b0d41..74f2071 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1729,6 +1729,10 @@ struct ext4_sb_info {
*/
struct work_struct s_sb_upd_work;
+ /* Atomic write unit values in bytes */
+ unsigned int s_awu_min;
+ unsigned int s_awu_max;
+
/* Ext4 fast commit sub transaction ID */
atomic_t s_fc_subtid;
@@ -1865,14 +1869,6 @@ static inline bool ext4_simulate_fail(struct super_block *sb,
return false;
}
-static inline void ext4_simulate_fail_bh(struct super_block *sb,
- struct buffer_head *bh,
- unsigned long code)
-{
- if (!IS_ERR(bh) && ext4_simulate_fail(sb, code))
- clear_buffer_uptodate(bh);
-}
-
/*
* Error number codes for s_{first,last}_error_errno
*
@@ -3100,9 +3096,9 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block);
extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io);
+ bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io);
+ bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
@@ -3855,6 +3851,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
return buffer_uptodate(bh);
}
+static inline bool ext4_inode_can_atomic_write(struct inode *inode)
+{
+
+ return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
+}
+
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 34e25ee..a07a98a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -568,7 +568,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
- err = ext4_read_bh(bh, 0, NULL);
+ err = ext4_read_bh(bh, 0, NULL, false);
if (err < 0)
goto errout;
}
@@ -3138,7 +3138,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
return;
ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
- EXTENT_STATUS_WRITTEN, 0);
+ EXTENT_STATUS_WRITTEN, false);
}
/* FIXME!! we need to try to merge to left or right after zero-out */
@@ -4158,7 +4158,7 @@ static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode,
/* Put just found gap into cache to speed up subsequent requests */
ext_debug(inode, " -> %u:%u\n", hole_start, len);
ext4_es_insert_extent(inode, hole_start, len, ~0,
- EXTENT_STATUS_HOLE, 0);
+ EXTENT_STATUS_HOLE, false);
/* Update hole_len to reflect hole size after lblk */
if (hole_start != lblk)
@@ -4482,7 +4482,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
int depth = 0;
struct ext4_map_blocks map;
unsigned int credits;
- loff_t epos;
+ loff_t epos, old_size = i_size_read(inode);
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4541,6 +4541,11 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
if (ext4_update_inode_size(inode, epos) & 0x1)
inode_set_mtime_to_ts(inode,
inode_get_ctime(inode));
+ if (epos > old_size) {
+ pagecache_isize_extended(inode, old_size, epos);
+ ext4_zero_partial_blocks(handle, inode,
+ old_size, epos - old_size);
+ }
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index c786691..ae29832 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -848,7 +848,7 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
*/
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status, int flags)
+ unsigned int status, bool delalloc_reserve_used)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
@@ -863,8 +863,8 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n",
- lblk, len, pblk, status, flags, inode->i_ino);
+ es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino);
if (!len)
return;
@@ -945,7 +945,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
resv_used += pending;
if (resv_used)
ext4_da_update_reserve_space(inode, resv_used,
- flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+ delalloc_reserve_used);
if (err1 || err2 || err3 < 0)
goto retry;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 4424232..8f9c008 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -135,7 +135,8 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status, int flags);
+ unsigned int status,
+ bool delalloc_reserve_used);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b33664f..26c4fc3 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -291,9 +291,9 @@ void ext4_fc_del(struct inode *inode)
return;
restart:
- spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ spin_lock(&sbi->s_fc_lock);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ spin_unlock(&sbi->s_fc_lock);
return;
}
@@ -357,9 +357,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
}
spin_lock(&sbi->s_fc_lock);
is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- if (has_transaction &&
- (!is_ineligible ||
- (is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid))))
+ if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
sbi->s_fc_ineligible_tid = tid;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
spin_unlock(&sbi->s_fc_lock);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f14aed1..3bd96c3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -392,8 +392,9 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
*/
if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
pos + size <= i_size_read(inode))
- return size;
- return ext4_handle_inode_extension(inode, pos, size, size);
+ return 0;
+ error = ext4_handle_inode_extension(inode, pos, size, size);
+ return error < 0 ? error : 0;
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -564,12 +565,9 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
-
ext4_journal_stop(handle);
+ if (ret)
+ goto out;
}
if (ilock_shared && !unwritten)
@@ -599,6 +597,13 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t err;
loff_t endbyte;
+ /*
+ * There is no support for atomic writes on buffered-io yet,
+ * we should never fallback to buffered-io for DIO atomic
+ * writes.
+ */
+ WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);
+
offset = iocb->ki_pos;
err = ext4_buffered_write_iter(iocb, from);
if (err < 0)
@@ -692,6 +697,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
+
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ size_t len = iov_iter_count(from);
+ int ret;
+
+ if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
+ len > EXT4_SB(inode->i_sb)->s_awu_max)
+ return -EINVAL;
+
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_write_iter(iocb, from);
else
@@ -884,6 +903,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}
+ if (ext4_inode_can_atomic_write(inode))
+ filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
return dquot_file_open(inode, filp);
}
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index df853c4..383c6ed 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -185,6 +185,56 @@ static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr)
return fmr->fmr_physical + fmr->fmr_length;
}
+static int ext4_getfsmap_meta_helper(struct super_block *sb,
+ ext4_group_t agno, ext4_grpblk_t start,
+ ext4_grpblk_t len, void *priv)
+{
+ struct ext4_getfsmap_info *info = priv;
+ struct ext4_fsmap *p;
+ struct ext4_fsmap *tmp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t fsb, fs_start, fs_end;
+ int error;
+
+ fs_start = fsb = (EXT4_C2B(sbi, start) +
+ ext4_group_first_block_no(sb, agno));
+ fs_end = fs_start + EXT4_C2B(sbi, len);
+
+ /* Return relevant extents from the meta_list */
+ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) {
+ if (p->fmr_physical < info->gfi_next_fsblk) {
+ list_del(&p->fmr_list);
+ kfree(p);
+ continue;
+ }
+ if (p->fmr_physical <= fs_start ||
+ p->fmr_physical + p->fmr_length <= fs_end) {
+ /* Emit the retained free extent record if present */
+ if (info->gfi_lastfree.fmr_owner) {
+ error = ext4_getfsmap_helper(sb, info,
+ &info->gfi_lastfree);
+ if (error)
+ return error;
+ info->gfi_lastfree.fmr_owner = 0;
+ }
+ error = ext4_getfsmap_helper(sb, info, p);
+ if (error)
+ return error;
+ fsb = p->fmr_physical + p->fmr_length;
+ if (info->gfi_next_fsblk < fsb)
+ info->gfi_next_fsblk = fsb;
+ list_del(&p->fmr_list);
+ kfree(p);
+ continue;
+ }
+ }
+ if (info->gfi_next_fsblk < fsb)
+ info->gfi_next_fsblk = fsb;
+
+ return 0;
+}
+
+
/* Transform a blockgroup's free record into a fsmap */
static int ext4_getfsmap_datadev_helper(struct super_block *sb,
ext4_group_t agno, ext4_grpblk_t start,
@@ -539,6 +589,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
error = ext4_mballoc_query_range(sb, info->gfi_agno,
EXT4_B2C(sbi, info->gfi_low.fmr_physical),
EXT4_B2C(sbi, info->gfi_high.fmr_physical),
+ ext4_getfsmap_meta_helper,
ext4_getfsmap_datadev_helper, info);
if (error)
goto err;
@@ -560,7 +611,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
/* Report any gaps at the end of the bg */
info->gfi_last = true;
- error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info);
+ error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1,
+ 0, info);
if (error)
goto err;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 7f1a5f9..21d2280 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -193,8 +193,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
trace_ext4_load_inode_bitmap(sb, block_group);
- ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
+ ext4_read_bh(bh, REQ_META | REQ_PRIO,
+ ext4_end_bitmap_read,
+ ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO));
if (!buffer_uptodate(bh)) {
put_bh(bh);
ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 7404f09..7de327f 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -170,7 +170,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
}
if (!bh_uptodate_or_lock(bh)) {
- if (ext4_read_bh(bh, 0, NULL) < 0) {
+ if (ext4_read_bh(bh, 0, NULL, false) < 0) {
put_bh(bh);
goto failure;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 54bdd48..89aade6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -483,7 +483,7 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, 0);
+ map->m_pblk, status, false);
return retval;
}
@@ -563,8 +563,8 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, flags);
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk,
+ status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
return retval;
}
@@ -856,7 +856,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
if (nowait)
return sb_find_get_block(inode->i_sb, map.m_pblk);
- bh = sb_getblk(inode->i_sb, map.m_pblk);
+ /*
+ * Since bh could introduce extra ref count such as referred by
+ * journal_head etc. Try to avoid using __GFP_MOVABLE here
+ * as it may fail the migration when journal_head remains.
+ */
+ bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk,
+ inode->i_sb->s_blocksize);
+
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
if (map.m_flags & EXT4_MAP_NEW) {
@@ -1307,8 +1314,10 @@ static int ext4_write_end(struct file *file,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity)
+ if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
+ ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ }
/*
* Don't mark the inode dirty under folio lock. First, it unnecessarily
* makes the holding time of folio lock longer. Second, it forces lock
@@ -1423,8 +1432,10 @@ static int ext4_journalled_write_end(struct file *file,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity)
+ if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
+ ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ }
if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2985,7 +2996,8 @@ static int ext4_da_do_write_end(struct address_space *mapping,
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool disksize_changed = false;
- loff_t new_i_size;
+ loff_t new_i_size, zero_len = 0;
+ handle_t *handle;
if (unlikely(!folio_buffers(folio))) {
folio_unlock(folio);
@@ -3029,19 +3041,22 @@ static int ext4_da_do_write_end(struct address_space *mapping,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos)
+ if (pos > old_size) {
pagecache_isize_extended(inode, old_size, pos);
-
- if (disksize_changed) {
- handle_t *handle;
-
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
+ zero_len = pos - old_size;
}
+ if (!disksize_changed && !zero_len)
+ return copied;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ if (zero_len)
+ ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
return copied;
}
@@ -3444,17 +3459,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
return ret;
}
+static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
+{
+ /* must be a directio to fall back to buffered */
+ if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) !=
+ (IOMAP_WRITE | IOMAP_DIRECT))
+ return false;
+
+ /* atomic writes are all-or-nothing */
+ if (flags & IOMAP_ATOMIC)
+ return false;
+
+ /* can only try again if we wrote nothing */
+ return written == 0;
+}
+
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
/*
* Check to see whether an error occurred while writing out the data to
- * the allocated blocks. If so, return the magic error code so that we
- * fallback to buffered I/O and attempt to complete the remainder of
- * the I/O. Any blocks that may have been allocated in preparation for
- * the direct I/O will be reused during buffered I/O.
+ * the allocated blocks. If so, return the magic error code for
+ * non-atomic write so that we fallback to buffered I/O and attempt to
+ * complete the remainder of the I/O.
+ * For non-atomic writes, any blocks that may have been
+ * allocated in preparation for the direct I/O will be reused during
+ * buffered I/O. For atomic write, we never fallback to buffered-io.
*/
- if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+ if (ext4_want_directio_fallback(flags, written))
return -ENOTBLK;
return 0;
@@ -4497,10 +4529,10 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
* Read the block from disk.
*/
trace_ext4_load_inode(sb, ino);
- ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL,
+ ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO));
blk_finish_plug(&plug);
wait_on_buffer(bh);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO);
if (!buffer_uptodate(bh)) {
if (ret_block)
*ret_block = block;
@@ -5426,6 +5458,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (attr->ia_size != inode->i_size) {
+ /* attach jbd2 jinode for EOF folio tail zeroing */
+ if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
+ oldsize & (inode->i_sb->s_blocksize - 1)) {
+ error = ext4_inode_attach_jinode(inode);
+ if (error)
+ goto err_out;
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
@@ -5436,12 +5476,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
orphan = 1;
}
/*
- * Update c/mtime on truncate up, ext4_truncate() will
- * update c/mtime in shrink case below
+ * Update c/mtime and tail zero the EOF folio on
+ * truncate up. ext4_truncate() handles the shrink case
+ * below.
*/
- if (!shrink)
+ if (!shrink) {
inode_set_mtime_to_ts(inode,
inode_set_ctime_current(inode));
+ if (oldsize & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle,
+ inode->i_mapping, oldsize);
+ }
if (shrink)
ext4_fc_track_range(handle, inode,
@@ -5578,6 +5623,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
}
}
+ if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int awu_min = 0, awu_max = 0;
+
+ if (ext4_inode_can_atomic_write(inode)) {
+ awu_min = sbi->s_awu_min;
+ awu_max = sbi->s_awu_max;
+ }
+
+ generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
+ }
+
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1c77400..7b9ce71 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1330,7 +1330,6 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case EXT4_IOC_MOVE_EXT: {
struct move_extent me;
- struct fd donor;
int err;
if (!(filp->f_mode & FMODE_READ) ||
@@ -1342,30 +1341,26 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -EFAULT;
me.moved_len = 0;
- donor = fdget(me.donor_fd);
- if (!fd_file(donor))
+ CLASS(fd, donor)(me.donor_fd);
+ if (fd_empty(donor))
return -EBADF;
- if (!(fd_file(donor)->f_mode & FMODE_WRITE)) {
- err = -EBADF;
- goto mext_out;
- }
+ if (!(fd_file(donor)->f_mode & FMODE_WRITE))
+ return -EBADF;
if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with bigalloc");
- err = -EOPNOTSUPP;
- goto mext_out;
+ return -EOPNOTSUPP;
} else if (IS_DAX(inode)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with DAX");
- err = -EOPNOTSUPP;
- goto mext_out;
+ return -EOPNOTSUPP;
}
err = mnt_want_write_file(filp);
if (err)
- goto mext_out;
+ return err;
err = ext4_move_extents(filp, fd_file(donor), me.orig_start,
me.donor_start, me.len, &me.moved_len);
@@ -1374,8 +1369,6 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
err = -EFAULT;
-mext_out:
- fdput(donor);
return err;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d73e383..b25a27c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5711,7 +5711,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
mb_debug(sb, "%u found", ac->ac_found);
- mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no");
+ mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa));
if (ac->ac_pa)
mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ?
"group pa" : "inode pa");
@@ -6056,7 +6056,7 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
}
out_dbg:
- mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
+ mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret));
return ret;
}
@@ -6999,13 +6999,14 @@ int
ext4_mballoc_query_range(
struct super_block *sb,
ext4_group_t group,
- ext4_grpblk_t start,
+ ext4_grpblk_t first,
ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn meta_formatter,
ext4_mballoc_query_range_fn formatter,
void *priv)
{
void *bitmap;
- ext4_grpblk_t next;
+ ext4_grpblk_t start, next;
struct ext4_buddy e4b;
int error;
@@ -7016,10 +7017,19 @@ ext4_mballoc_query_range(
ext4_lock_group(sb, group);
- start = max(e4b.bd_info->bb_first_free, start);
+ start = max(e4b.bd_info->bb_first_free, first);
if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
-
+ if (meta_formatter && start != first) {
+ if (start > end)
+ start = end;
+ ext4_unlock_group(sb, group);
+ error = meta_formatter(sb, group, first, start - first,
+ priv);
+ if (error)
+ goto out_unload;
+ ext4_lock_group(sb, group);
+ }
while (start <= end) {
start = mb_find_next_zero_bit(bitmap, end + 1, start);
if (start > end)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d8553f1..f8280de 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -259,6 +259,7 @@ ext4_mballoc_query_range(
ext4_group_t agno,
ext4_grpblk_t start,
ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn meta_formatter,
ext4_mballoc_query_range_fn formatter,
void *priv);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index bd946d0..d64c04e 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -94,7 +94,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
}
lock_buffer(*bh);
- ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL);
+ ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false);
if (ret)
goto warn_exit;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b64661e..898443e 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -213,7 +213,7 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
unlock_buffer(bh);
continue;
}
- ext4_read_bh_nowait(bh, 0, NULL);
+ ext4_read_bh_nowait(bh, 0, NULL, false);
nr++;
} while (block++, (bh = bh->b_this_page) != head);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 790db7e..bcf2737 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1747,7 +1747,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
#endif
frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
- return (struct buffer_head *) frame;
+ return ERR_CAST(frame);
do {
block = dx_get_block(frame->at);
bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
@@ -1952,7 +1952,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
if (IS_ERR(bh2)) {
brelse(*bh);
*bh = NULL;
- return (struct ext4_dir_entry_2 *) bh2;
+ return ERR_CAST(bh2);
}
BUFFER_TRACE(*bh, "get_write_access");
@@ -2000,8 +2000,17 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
else
split = count/2;
+ if (WARN_ON_ONCE(split == 0)) {
+ /* Should never happen, but avoid out-of-bounds access below */
+ ext4_error_inode_block(dir, (*bh)->b_blocknr, 0,
+ "bad indexed directory? hash=%08x:%08x count=%d move=%u",
+ hinfo->hash, hinfo->minor_hash, count, move);
+ err = -EFSCORRUPTED;
+ goto out;
+ }
+
hash2 = map[split].hash;
- continued = split > 0 ? hash2 == map[split - 1].hash : 0;
+ continued = hash2 == map[split - 1].hash;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
(unsigned long)dx_get_block(frame->at),
hash2, split, count-split));
@@ -2043,10 +2052,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
return de;
journal_error:
+ ext4_std_error(dir->i_sb, err);
+out:
brelse(*bh);
brelse(bh2);
*bh = NULL;
- ext4_std_error(dir->i_sb, err);
return ERR_PTR(err);
}
@@ -2395,11 +2405,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
-#if IS_ENABLED(CONFIG_UNICODE)
- if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
- utf8_validate(sb->s_encoding, &dentry->d_name))
+ if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
return -EINVAL;
-#endif
retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
if (retval)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index ad55438..69b8a72 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -417,11 +417,13 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
submit_and_retry:
ext4_io_submit(io);
}
- if (io->io_bio == NULL)
+ if (io->io_bio == NULL) {
io_submit_init_bio(io, bh);
+ io->io_bio->bi_write_hint = inode->i_write_hint;
+ }
if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
goto submit_and_retry;
- wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
+ wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
io->io_next_block++;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a2704f0..72f77f7 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1300,7 +1300,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
if (unlikely(!bh))
return NULL;
if (!bh_uptodate_or_lock(bh)) {
- if (ext4_read_bh(bh, 0, NULL) < 0) {
+ if (ext4_read_bh(bh, 0, NULL, false) < 0) {
brelse(bh);
return NULL;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 16a4ce7..785809f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -161,8 +161,14 @@ MODULE_ALIAS("ext3");
static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io)
+ bh_end_io_t *end_io, bool simu_fail)
{
+ if (simu_fail) {
+ clear_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ return;
+ }
+
/*
* buffer's verified bit is no longer valid after reading from
* disk again due to write out error, clear it to make sure we
@@ -176,7 +182,7 @@ static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
}
void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io)
+ bh_end_io_t *end_io, bool simu_fail)
{
BUG_ON(!buffer_locked(bh));
@@ -184,10 +190,11 @@ void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
unlock_buffer(bh);
return;
}
- __ext4_read_bh(bh, op_flags, end_io);
+ __ext4_read_bh(bh, op_flags, end_io, simu_fail);
}
-int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io)
+int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io, bool simu_fail)
{
BUG_ON(!buffer_locked(bh));
@@ -196,7 +203,7 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io
return 0;
}
- __ext4_read_bh(bh, op_flags, end_io);
+ __ext4_read_bh(bh, op_flags, end_io, simu_fail);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
@@ -208,10 +215,10 @@ int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
lock_buffer(bh);
if (!wait) {
- ext4_read_bh_nowait(bh, op_flags, NULL);
+ ext4_read_bh_nowait(bh, op_flags, NULL, false);
return 0;
}
- return ext4_read_bh(bh, op_flags, NULL);
+ return ext4_read_bh(bh, op_flags, NULL, false);
}
/*
@@ -266,7 +273,7 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
if (likely(bh)) {
if (trylock_buffer(bh))
- ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
+ ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false);
brelse(bh);
}
}
@@ -346,9 +353,9 @@ __u32 ext4_free_group_clusters(struct super_block *sb,
__u32 ext4_free_inodes_count(struct super_block *sb,
struct ext4_group_desc *bg)
{
- return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+ return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+ (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0);
}
__u32 ext4_used_dirs_count(struct super_block *sb,
@@ -402,9 +409,9 @@ void ext4_free_group_clusters_set(struct super_block *sb,
void ext4_free_inodes_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count)
{
- bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+ WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count));
if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
- bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+ WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16));
}
void ext4_used_dirs_set(struct super_block *sb,
@@ -2096,16 +2103,16 @@ static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
}
#define EXT4_SET_CTX(name) \
-static inline void ctx_set_##name(struct ext4_fs_context *ctx, \
- unsigned long flag) \
+static inline __maybe_unused \
+void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \
{ \
ctx->mask_s_##name |= flag; \
ctx->vals_s_##name |= flag; \
}
#define EXT4_CLEAR_CTX(name) \
-static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \
- unsigned long flag) \
+static inline __maybe_unused \
+void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \
{ \
ctx->mask_s_##name |= flag; \
ctx->vals_s_##name &= ~flag; \
@@ -3030,6 +3037,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PUTS("mb_optimize_scan=1");
}
+ if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS))
+ SEQ_OPTS_PUTS("prefetch_block_bitmaps");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -3709,12 +3719,12 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- start_time = ktime_get_real_ns();
+ start_time = ktime_get_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) *
EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -3774,8 +3784,9 @@ static int ext4_lazyinit_thread(void *arg)
cont_thread:
while (true) {
- next_wakeup = MAX_JIFFY_OFFSET;
+ bool next_wakeup_initialized = false;
+ next_wakeup = 0;
mutex_lock(&eli->li_list_mtx);
if (list_empty(&eli->li_request_list)) {
mutex_unlock(&eli->li_list_mtx);
@@ -3788,8 +3799,11 @@ static int ext4_lazyinit_thread(void *arg)
lr_request);
if (time_before(jiffies, elr->lr_next_sched)) {
- if (time_before(elr->lr_next_sched, next_wakeup))
+ if (!next_wakeup_initialized ||
+ time_before(elr->lr_next_sched, next_wakeup)) {
next_wakeup = elr->lr_next_sched;
+ next_wakeup_initialized = true;
+ }
continue;
}
if (down_read_trylock(&elr->lr_super->s_umount)) {
@@ -3817,16 +3831,18 @@ static int ext4_lazyinit_thread(void *arg)
elr->lr_next_sched = jiffies +
get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
}
- if (time_before(elr->lr_next_sched, next_wakeup))
+ if (!next_wakeup_initialized ||
+ time_before(elr->lr_next_sched, next_wakeup)) {
next_wakeup = elr->lr_next_sched;
+ next_wakeup_initialized = true;
+ }
}
mutex_unlock(&eli->li_list_mtx);
try_to_freeze();
cur = jiffies;
- if ((time_after_eq(cur, next_wakeup)) ||
- (MAX_JIFFY_OFFSET == next_wakeup)) {
+ if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) {
cond_resched();
continue;
}
@@ -4425,6 +4441,36 @@ static int ext4_handle_clustersize(struct super_block *sb)
return 0;
}
+/*
+ * ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
+ * @sb: super block
+ * TODO: Later add support for bigalloc
+ */
+static void ext4_atomic_write_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct block_device *bdev = sb->s_bdev;
+
+ if (!bdev_can_atomic_write(bdev))
+ return;
+
+ if (!ext4_has_feature_extents(sb))
+ return;
+
+ sbi->s_awu_min = max(sb->s_blocksize,
+ bdev_atomic_write_unit_min_bytes(bdev));
+ sbi->s_awu_max = min(sb->s_blocksize,
+ bdev_atomic_write_unit_max_bytes(bdev));
+ if (sbi->s_awu_min && sbi->s_awu_max &&
+ sbi->s_awu_min <= sbi->s_awu_max) {
+ ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
+ sbi->s_awu_min, sbi->s_awu_max);
+ } else {
+ sbi->s_awu_min = 0;
+ sbi->s_awu_max = 0;
+ }
+}
+
static void ext4_fast_commit_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -5336,6 +5382,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
spin_lock_init(&sbi->s_bdev_wb_lock);
+ ext4_atomic_write_init(sb);
ext4_fast_commit_init(sb);
sb->s_root = NULL;
@@ -6301,7 +6348,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (unlikely(ext4_forced_shutdown(sb)))
- return 0;
+ return -EIO;
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
@@ -6518,8 +6565,12 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
goto restore_opts;
}
- if (test_opt2(sb, ABORT))
- ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
+ if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) &&
+ !test_opt(sb, DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount");
+ err = -EINVAL;
+ goto restore_opts;
+ }
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -6689,6 +6740,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
ext4_stop_mmpd(sbi);
+ /*
+ * Handle aborting the filesystem as the last thing during remount to
+ * avoid obsure errors during remount when some option changes fail to
+ * apply due to shutdown filesystem.
+ */
+ if (test_opt2(sb, ABORT))
+ ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
+
return 0;
restore_opts:
@@ -7329,7 +7388,7 @@ static struct file_system_type ext4_fs_type = {
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
.kill_sb = ext4_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("ext4");
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 94f7b08..e3ce763c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
}
if (fio->io_wbc && !is_read_io(fio->op))
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
__read_io_type(page) : WB_DATA_TYPE(fio->page, false));
@@ -911,7 +912,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
@@ -1011,7 +1013,8 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
io->last_block_in_bio = fio->new_blkaddr;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 321d8ff..84447d5 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3038,32 +3038,27 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
static int __f2fs_ioc_move_range(struct file *filp,
struct f2fs_move_range *range)
{
- struct fd dst;
int err;
if (!(filp->f_mode & FMODE_READ) ||
!(filp->f_mode & FMODE_WRITE))
return -EBADF;
- dst = fdget(range->dst_fd);
- if (!fd_file(dst))
+ CLASS(fd, dst)(range->dst_fd);
+ if (fd_empty(dst))
return -EBADF;
- if (!(fd_file(dst)->f_mode & FMODE_WRITE)) {
- err = -EBADF;
- goto err_out;
- }
+ if (!(fd_file(dst)->f_mode & FMODE_WRITE))
+ return -EBADF;
err = mnt_want_write_file(filp);
if (err)
- goto err_out;
+ return err;
err = f2fs_move_file_range(filp, range->pos_in, fd_file(dst),
range->pos_out, range->len);
mnt_drop_write_file(filp);
-err_out:
- fdput(dst);
return err;
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 22dd9dc..ac77dd9 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
#include <linux/slab.h>
@@ -397,6 +396,9 @@ static long f_dupfd_query(int fd, struct file *filp)
{
CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+
/*
* We can do the 'fdput()' immediately, as the only thing that
* matters is the pointer value which isn't changed by the fdput.
@@ -570,24 +572,21 @@ static int check_fcntl_cmd(unsigned cmd)
SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
- struct fd f = fdget_raw(fd);
- long err = -EBADF;
+ CLASS(fd_raw, f)(fd);
+ long err;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
- goto out1;
+ return -EBADF;
}
err = security_file_fcntl(fd_file(f), cmd, arg);
if (!err)
err = do_fcntl(fd, cmd, arg, fd_file(f));
-out1:
- fdput(f);
-out:
return err;
}
@@ -596,21 +595,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
unsigned long, arg)
{
void __user *argp = (void __user *)arg;
- struct fd f = fdget_raw(fd);
+ CLASS(fd_raw, f)(fd);
struct flock64 flock;
- long err = -EBADF;
+ long err;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
- goto out1;
+ return -EBADF;
}
err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
- goto out1;
+ return err;
switch (cmd) {
case F_GETLK64:
@@ -635,9 +634,6 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
-out1:
- fdput(f);
-out:
return err;
}
#endif
@@ -733,21 +729,21 @@ static int fixup_compat_flock(struct flock *flock)
static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
compat_ulong_t arg)
{
- struct fd f = fdget_raw(fd);
+ CLASS(fd_raw, f)(fd);
struct flock flock;
- long err = -EBADF;
+ long err;
- if (!fd_file(f))
- return err;
+ if (fd_empty(f))
+ return -EBADF;
if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
- goto out_put;
+ return -EBADF;
}
err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
- goto out_put;
+ return err;
switch (cmd) {
case F_GETLK:
@@ -790,8 +786,6 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
-out_put:
- fdput(f);
return err;
}
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 82df28d..5f80113 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -139,12 +139,11 @@ static int get_path_from_fd(int fd, struct path *root)
path_get(root);
spin_unlock(&fs->lock);
} else {
- struct fd f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
*root = fd_file(f)->f_path;
path_get(root);
- fdput(f);
}
return 0;
diff --git a/fs/file.c b/fs/file.c
index eb093e7..fb1011c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -20,10 +20,73 @@
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
+#include <linux/file_ref.h>
#include <net/sock.h>
#include "internal.h"
+/**
+ * __file_ref_put - Slowpath of file_ref_put()
+ * @ref: Pointer to the reference count
+ * @cnt: Current reference count
+ *
+ * Invoked when the reference count is outside of the valid zone.
+ *
+ * Return:
+ * True if this was the last reference with no future references
+ * possible. This signals the caller that it can safely schedule the
+ * object, which is protected by the reference counter, for
+ * deconstruction.
+ *
+ * False if there are still active references or the put() raced
+ * with a concurrent get()/put() pair. Caller is not allowed to
+ * deconstruct the protected object.
+ */
+bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
+{
+ /* Did this drop the last reference? */
+ if (likely(cnt == FILE_REF_NOREF)) {
+ /*
+ * Carefully try to set the reference count to FILE_REF_DEAD.
+ *
+ * This can fail if a concurrent get() operation has
+ * elevated it again or the corresponding put() even marked
+ * it dead already. Both are valid situations and do not
+ * require a retry. If this fails the caller is not
+ * allowed to deconstruct the object.
+ */
+ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
+ return false;
+
+ /*
+ * The caller can safely schedule the object for
+ * deconstruction. Provide acquire ordering.
+ */
+ smp_acquire__after_ctrl_dep();
+ return true;
+ }
+
+ /*
+ * If the reference count was already in the dead zone, then this
+ * put() operation is imbalanced. Warn, put the reference count back to
+ * DEAD and tell the caller to not deconstruct the object.
+ */
+ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+ atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+ return false;
+ }
+
+ /*
+ * This is a put() operation on a saturated refcount. Restore the
+ * mean saturation value and tell the caller to not deconstruct the
+ * object.
+ */
+ if (cnt > FILE_REF_MAXREF)
+ atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+ return false;
+}
+EXPORT_SYMBOL_GPL(__file_ref_put);
+
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
@@ -89,18 +152,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
* 'unsigned long' in some places, but simply because that is how the Linux
* kernel bitmaps are defined to work: they are not "bits in an array of bytes",
* they are very much "bits in an array of unsigned long".
- *
- * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
- * by that "1024/sizeof(ptr)" before, we already know there are sufficient
- * clear low bits. Clang seems to realize that, gcc ends up being confused.
- *
- * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
- * let's consider it documentation (and maybe a test-case for gcc to improve
- * its code generation ;)
*/
-static struct fdtable * alloc_fdtable(unsigned int nr)
+static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
{
struct fdtable *fdt;
+ unsigned int nr;
void *data;
/*
@@ -108,22 +164,32 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
* Allocation steps are keyed to the size of the fdarray, since it
* grows far faster than any of the other dynamic data. We try to fit
* the fdarray into comfortable page-tuned chunks: starting at 1024B
- * and growing in powers of two from there on.
+ * and growing in powers of two from there on. Since we called only
+ * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab
+ * already gives BITS_PER_LONG slots), the above boils down to
+ * 1. use the smallest power of two large enough to give us that many
+ * slots.
+ * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is
+ * 256 slots (i.e. 1Kb fd array).
+ * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there
+ * and we are never going to be asked for 64 or less.
*/
- nr /= (1024 / sizeof(struct file *));
- nr = roundup_pow_of_two(nr + 1);
- nr *= (1024 / sizeof(struct file *));
- nr = ALIGN(nr, BITS_PER_LONG);
+ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
+ nr = 256;
+ else
+ nr = roundup_pow_of_two(slots_wanted);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
- * had been set lower between the check in expand_files() and here. Deal
- * with that in caller, it's cheaper that way.
+ * had been set lower between the check in expand_files() and here.
*
* We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
* bitmaps handling below becomes unpleasant, to put it mildly...
*/
- if (unlikely(nr > sysctl_nr_open))
- nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
+ if (unlikely(nr > sysctl_nr_open)) {
+ nr = round_down(sysctl_nr_open, BITS_PER_LONG);
+ if (nr < slots_wanted)
+ return ERR_PTR(-EMFILE);
+ }
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
@@ -152,14 +218,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
out_fdt:
kfree(fdt);
out:
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
/*
* Expand the file descriptor table.
* This function will allocate a new fdtable and both fd array and fdset, of
* the given size.
- * Return <0 error code on error; 1 on successful completion.
+ * Return <0 error code on error; 0 on successful completion.
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_fdtable(struct files_struct *files, unsigned int nr)
@@ -169,7 +235,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
struct fdtable *new_fdt, *cur_fdt;
spin_unlock(&files->file_lock);
- new_fdt = alloc_fdtable(nr);
+ new_fdt = alloc_fdtable(nr + 1);
/* make sure all fd_install() have seen resize_in_progress
* or have finished their rcu_read_lock_sched() section.
@@ -178,16 +244,8 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
synchronize_rcu();
spin_lock(&files->file_lock);
- if (!new_fdt)
- return -ENOMEM;
- /*
- * extremely unlikely race - sysctl_nr_open decreased between the check in
- * caller and alloc_fdtable(). Cheaper to catch it here...
- */
- if (unlikely(new_fdt->max_fds <= nr)) {
- __free_fdtable(new_fdt);
- return -EMFILE;
- }
+ if (IS_ERR(new_fdt))
+ return PTR_ERR(new_fdt);
cur_fdt = files_fdtable(files);
BUG_ON(nr < cur_fdt->max_fds);
copy_fdtable(new_fdt, cur_fdt);
@@ -196,15 +254,14 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
/* coupled with smp_rmb() in fd_install() */
smp_wmb();
- return 1;
+ return 0;
}
/*
* Expand files.
* This function will expand the file structures, if the requested size exceeds
* the current capacity and there is room for expansion.
- * Return <0 error code on error; 0 when nothing done; 1 when files were
- * expanded and execution may have blocked.
+ * Return <0 error code on error; 0 on success.
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_files(struct files_struct *files, unsigned int nr)
@@ -212,14 +269,14 @@ static int expand_files(struct files_struct *files, unsigned int nr)
__acquires(files->file_lock)
{
struct fdtable *fdt;
- int expanded = 0;
+ int error;
repeat:
fdt = files_fdtable(files);
/* Do we need to expand? */
if (nr < fdt->max_fds)
- return expanded;
+ return 0;
/* Can we expand? */
if (nr >= sysctl_nr_open)
@@ -227,7 +284,6 @@ static int expand_files(struct files_struct *files, unsigned int nr)
if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock);
- expanded = 1;
wait_event(files->resize_wait, !files->resize_in_progress);
spin_lock(&files->file_lock);
goto repeat;
@@ -235,27 +291,28 @@ static int expand_files(struct files_struct *files, unsigned int nr)
/* All good, so we try */
files->resize_in_progress = true;
- expanded = expand_fdtable(files, nr);
+ error = expand_fdtable(files, nr);
files->resize_in_progress = false;
wake_up_all(&files->resize_wait);
- return expanded;
+ return error;
}
-static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
+static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,
+ bool set)
{
- __set_bit(fd, fdt->close_on_exec);
+ if (set) {
+ __set_bit(fd, fdt->close_on_exec);
+ } else {
+ if (test_bit(fd, fdt->close_on_exec))
+ __clear_bit(fd, fdt->close_on_exec);
+ }
}
-static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
-{
- if (test_bit(fd, fdt->close_on_exec))
- __clear_bit(fd, fdt->close_on_exec);
-}
-
-static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
+static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
{
__set_bit(fd, fdt->open_fds);
+ __set_close_on_exec(fd, fdt, set);
fd /= BITS_PER_LONG;
if (!~fdt->open_fds[fd])
__set_bit(fd, fdt->full_fds_bits);
@@ -264,7 +321,9 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
__clear_bit(fd, fdt->open_fds);
- __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
+ fd /= BITS_PER_LONG;
+ if (test_bit(fd, fdt->full_fds_bits))
+ __clear_bit(fd, fdt->full_fds_bits);
}
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
@@ -306,7 +365,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
struct file **old_fds, **new_fds;
unsigned int open_files, i;
struct fdtable *old_fdt, *new_fdt;
- int error;
newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
if (!newf)
@@ -338,17 +396,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
if (new_fdt != &newf->fdtab)
__free_fdtable(new_fdt);
- new_fdt = alloc_fdtable(open_files - 1);
- if (!new_fdt) {
- error = -ENOMEM;
- goto out_release;
- }
-
- /* beyond sysctl_nr_open; nothing to do */
- if (unlikely(new_fdt->max_fds < open_files)) {
- __free_fdtable(new_fdt);
- error = -EMFILE;
- goto out_release;
+ new_fdt = alloc_fdtable(open_files);
+ if (IS_ERR(new_fdt)) {
+ kmem_cache_free(files_cachep, newf);
+ return ERR_CAST(new_fdt);
}
/*
@@ -389,10 +440,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
rcu_assign_pointer(newf->fdt, new_fdt);
return newf;
-
-out_release:
- kmem_cache_free(files_cachep, newf);
- return ERR_PTR(error);
}
static struct fdtable *close_files(struct files_struct * files)
@@ -413,7 +460,7 @@ static struct fdtable *close_files(struct files_struct * files)
set = fdt->open_fds[j++];
while (set) {
if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
+ struct file *file = fdt->fd[i];
if (file) {
filp_close(file, files);
cond_resched();
@@ -470,6 +517,15 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
+ unsigned int bit;
+
+ /*
+ * Try to avoid looking at the second level bitmap
+ */
+ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
+ start & (BITS_PER_LONG - 1));
+ if (bit < BITS_PER_LONG)
+ return bit + bitbit * BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
if (bitbit >= maxfd)
@@ -496,7 +552,7 @@ static int alloc_fd(unsigned start, unsigned end, unsigned flags)
if (fd < files->next_fd)
fd = files->next_fd;
- if (fd < fdt->max_fds)
+ if (likely(fd < fdt->max_fds))
fd = find_next_fd(fdt, fd);
/*
@@ -504,36 +560,22 @@ static int alloc_fd(unsigned start, unsigned end, unsigned flags)
* will limit the total number of files that can be opened.
*/
error = -EMFILE;
- if (fd >= end)
+ if (unlikely(fd >= end))
goto out;
- error = expand_files(files, fd);
- if (error < 0)
- goto out;
+ if (unlikely(fd >= fdt->max_fds)) {
+ error = expand_files(files, fd);
+ if (error < 0)
+ goto out;
- /*
- * If we needed to expand the fs array we
- * might have blocked - try again.
- */
- if (error)
goto repeat;
+ }
if (start <= files->next_fd)
files->next_fd = fd + 1;
- __set_open_fd(fd, fdt);
- if (flags & O_CLOEXEC)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_open_fd(fd, fdt, flags & O_CLOEXEC);
error = fd;
-#if 1
- /* Sanity check */
- if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
- printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
- rcu_assign_pointer(fdt->fd[fd], NULL);
- }
-#endif
out:
spin_unlock(&files->file_lock);
@@ -599,7 +641,7 @@ void fd_install(unsigned int fd, struct file *file)
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
- BUG_ON(fdt->fd[fd] != NULL);
+ WARN_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
@@ -713,7 +755,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
}
/**
- * __close_range() - Close all file descriptors in a given range.
+ * sys_close_range() - Close all file descriptors in a given range.
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
@@ -721,8 +763,10 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
*
* This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed.
+ * Currently, errors to close a given file descriptor are ignored.
*/
-int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
+SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
+ unsigned int, flags)
{
struct task_struct *me = current;
struct files_struct *cur_fds = me->files, *fds = NULL;
@@ -839,7 +883,7 @@ static struct file *__get_file_rcu(struct file __rcu **f)
if (!file)
return NULL;
- if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ if (unlikely(!file_ref_get(&file->f_ref)))
return ERR_PTR(-EAGAIN);
file_reloaded = rcu_dereference_raw(*f);
@@ -853,8 +897,8 @@ static struct file *__get_file_rcu(struct file __rcu **f)
OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
/*
- * atomic_long_inc_not_zero() above provided a full memory
- * barrier when we acquired a reference.
+ * file_ref_get() above provided a full memory barrier when we
+ * acquired a reference.
*
* This is paired with the write barrier from assigning to the
* __rcu protected file pointer so that if that pointer still
@@ -952,11 +996,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
* We need to confirm it by incrementing the refcount
* and then check the lookup again.
*
- * atomic_long_inc_not_zero() gives us a full memory
- * barrier. We only really need an 'acquire' one to
- * protect the loads below, but we don't have that.
+ * file_ref_get() gives us a full memory barrier. We
+ * only really need an 'acquire' one to protect the
+ * loads below, but we don't have that.
*/
- if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ if (unlikely(!file_ref_get(&file->f_ref)))
continue;
/*
@@ -1037,29 +1081,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
-struct file *lookup_fdget_rcu(unsigned int fd)
-{
- return __fget_files_rcu(current->files, fd, 0);
-
-}
-EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
-
-struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
-{
- /* Must be called with rcu_read_lock held */
- struct files_struct *files;
- struct file *file = NULL;
-
- task_lock(task);
- files = task->files;
- if (files)
- file = __fget_files_rcu(files, fd, 0);
- task_unlock(task);
-
- return file;
-}
-
-struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -1069,17 +1091,19 @@ struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *
task_lock(task);
files = task->files;
if (files) {
+ rcu_read_lock();
for (; fd < files_fdtable(files)->max_fds; fd++) {
file = __fget_files_rcu(files, fd, 0);
if (file)
break;
}
+ rcu_read_unlock();
}
task_unlock(task);
*ret_fd = fd;
return file;
}
-EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
+EXPORT_SYMBOL(fget_task_next);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1096,6 +1120,13 @@ EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
*
* The fput_needed flag returned by fget_light should be passed to the
* corresponding fput_light.
+ *
+ * (As an exception to rule 2, you can call filp_close between fget_light and
+ * fput_light provided that you capture a real refcount with get_file before
+ * the call to filp_close, and ensure that this real refcount is fput *after*
+ * the fput_light call.)
+ *
+ * See also the documentation in rust/kernel/file.rs.
*/
static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
@@ -1176,13 +1207,8 @@ void __f_unlock_pos(struct file *f)
void set_close_on_exec(unsigned int fd, int flag)
{
struct files_struct *files = current->files;
- struct fdtable *fdt;
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (flag)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_close_on_exec(fd, files_fdtable(files), flag);
spin_unlock(&files->file_lock);
}
@@ -1223,11 +1249,7 @@ __releases(&files->file_lock)
goto Ebusy;
get_file(file);
rcu_assign_pointer(fdt->fd[fd], file);
- __set_open_fd(fd, fdt);
- if (flags & O_CLOEXEC)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_open_fd(fd, fdt, flags & O_CLOEXEC);
spin_unlock(&files->file_lock);
if (tofree)
diff --git a/fs/file_table.c b/fs/file_table.c
index eed5ffa..976736b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -9,7 +9,6 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
@@ -40,13 +39,17 @@ static struct files_stat_struct files_stat = {
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;
+static struct kmem_cache *bfilp_cachep __ro_after_init;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path user_path;
+ union {
+ struct path user_path;
+ freeptr_t bf_freeptr;
+ };
};
static inline struct backing_file *backing_file(struct file *f)
@@ -68,7 +71,7 @@ static inline void file_free(struct file *f)
put_cred(f->f_cred);
if (unlikely(f->f_mode & FMODE_BACKING)) {
path_put(backing_file_user_path(f));
- kfree(backing_file(f));
+ kmem_cache_free(bfilp_cachep, backing_file(f));
} else {
kmem_cache_free(filp_cachep, f);
}
@@ -165,16 +168,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
* the respective member when opening the file.
*/
mutex_init(&f->f_pos_lock);
- f->f_flags = flags;
- f->f_mode = OPEN_FMODE(flags);
- /* f->f_version: 0 */
+ memset(&f->f_path, 0, sizeof(f->f_path));
+ memset(&f->f_ra, 0, sizeof(f->f_ra));
+
+ f->f_flags = flags;
+ f->f_mode = OPEN_FMODE(flags);
+
+ f->f_op = NULL;
+ f->f_mapping = NULL;
+ f->private_data = NULL;
+ f->f_inode = NULL;
+ f->f_owner = NULL;
+#ifdef CONFIG_EPOLL
+ f->f_ep = NULL;
+#endif
+
+ f->f_iocb_flags = 0;
+ f->f_pos = 0;
+ f->f_wb_err = 0;
+ f->f_sb_err = 0;
/*
* We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
* fget-rcu pattern users need to be able to handle spurious
* refcount bumps we should reinitialize the reused file first.
*/
- atomic_long_set(&f->f_count, 1);
+ file_ref_init(&f->f_ref, 1);
return 0;
}
@@ -206,7 +225,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
goto over;
}
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
@@ -240,7 +259,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
struct file *f;
int error;
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
@@ -267,13 +286,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
struct backing_file *ff;
int error;
- ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
+ ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
if (unlikely(!ff))
return ERR_PTR(-ENOMEM);
error = init_file(&ff->file, flags, cred);
if (unlikely(error)) {
- kfree(ff);
+ kmem_cache_free(bfilp_cachep, ff);
return ERR_PTR(error);
}
@@ -479,7 +498,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count)) {
+ if (file_ref_put(&file->f_ref)) {
struct task_struct *task = current;
if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
@@ -512,7 +531,7 @@ void fput(struct file *file)
*/
void __fput_sync(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count))
+ if (file_ref_put(&file->f_ref))
__fput(file);
}
@@ -529,6 +548,11 @@ void __init files_init(void)
filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
+
+ args.freeptr_offset = offsetof(struct backing_file, bf_freeptr);
+ bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
+ &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index fbcd603..8c67627 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -25,7 +25,7 @@
struct vxfs_dirblk {
__fs16 d_free; /* free space in dirblock */
__fs16 d_nhash; /* no of hash chains */
- __fs16 d_hash[1]; /* hash chain */
+ __fs16 d_hash[]; /* hash chain */
};
/*
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d8bec3c1..3cd99e2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio)
if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
wb_put(wb);
}
-EXPORT_SYMBOL_GPL(__inode_attach_wb);
/**
* inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
@@ -731,8 +730,9 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
* writeback completion, wbc_detach_inode() should be called. This is used
* to track the cgroup writeback context.
*/
-void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
- struct inode *inode)
+static void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock)
{
if (!inode_cgwb_enabled(inode)) {
spin_unlock(&inode->i_lock);
@@ -762,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
inode_switch_wbs(inode, wbc->wb_id);
}
-EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
+
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ inode_attach_wb(inode, NULL);
+ wbc_attach_and_unlock_inode(wbc, inode);
+}
+EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode);
/**
* wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -890,17 +907,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
/**
* wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
* @wbc: writeback_control of the writeback in progress
- * @page: page being written out
+ * @folio: folio being written out
* @bytes: number of bytes being written out
*
- * @bytes from @page are about to written out during the writeback
+ * @bytes from @folio are about to written out during the writeback
* controlled by @wbc. Keep the book for foreign inode detection. See
* wbc_detach_inode().
*/
-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
size_t bytes)
{
- struct folio *folio;
struct cgroup_subsys_state *css;
int id;
@@ -913,7 +929,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
if (!wbc->wb || wbc->no_cgroup_owner)
return;
- folio = page_folio(page);
css = mem_cgroup_css_from_folio(folio);
/* dead cgroups shouldn't contribute to inode ownership arbitration */
if (!(css->flags & CSS_ONLINE))
@@ -1227,6 +1242,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
}
}
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock)
+{
+ spin_unlock(&inode->i_lock);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
/*
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 24727ec..16fa61ef 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -156,6 +156,7 @@ int fs_lookup_param(struct fs_context *fc,
f = getname_kernel(param->string);
if (IS_ERR(f))
return PTR_ERR(f);
+ param->dirfd = AT_FDCWD;
put_f = true;
break;
case fs_value_is_filename:
@@ -308,6 +309,26 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p,
}
EXPORT_SYMBOL(fs_param_is_fd);
+int fs_param_is_file_or_string(struct p_log *log,
+ const struct fs_parameter_spec *p,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ switch (param->type) {
+ case fs_value_is_string:
+ return fs_param_is_string(log, p, param, result);
+ case fs_value_is_file:
+ result->uint_32 = param->dirfd;
+ if (result->uint_32 <= INT_MAX)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ return fs_param_bad_value(log, param);
+}
+EXPORT_SYMBOL(fs_param_is_file_or_string);
+
int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 6cef3de..094a7f5 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -349,7 +349,6 @@ SYSCALL_DEFINE5(fsconfig,
int, aux)
{
struct fs_context *fc;
- struct fd f;
int ret;
int lookup_flags = 0;
@@ -392,12 +391,11 @@ SYSCALL_DEFINE5(fsconfig,
return -EOPNOTSUPP;
}
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- ret = -EINVAL;
if (fd_file(f)->f_op != &fscontext_fops)
- goto out_f;
+ return -EINVAL;
fc = fd_file(f)->private_data;
if (fc->ops == &legacy_fs_context_ops) {
@@ -407,17 +405,14 @@ SYSCALL_DEFINE5(fsconfig,
case FSCONFIG_SET_PATH_EMPTY:
case FSCONFIG_SET_FD:
case FSCONFIG_CMD_CREATE_EXCL:
- ret = -EOPNOTSUPP;
- goto out_f;
+ return -EOPNOTSUPP;
}
}
if (_key) {
param.key = strndup_user(_key, 256);
- if (IS_ERR(param.key)) {
- ret = PTR_ERR(param.key);
- goto out_f;
- }
+ if (IS_ERR(param.key))
+ return PTR_ERR(param.key);
}
switch (cmd) {
@@ -496,7 +491,5 @@ SYSCALL_DEFINE5(fsconfig,
}
out_key:
kfree(param.key);
-out_f:
- fdput(f);
return ret;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1f64ae6..0723c63 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2371,13 +2371,12 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
int res;
int oldfd;
struct fuse_dev *fud = NULL;
- struct fd f;
if (get_user(oldfd, argp))
return -EFAULT;
- f = fdget(oldfd);
- if (!fd_file(f))
+ CLASS(fd, f)(oldfd);
+ if (fd_empty(f))
return -EINVAL;
/*
@@ -2394,7 +2393,6 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
mutex_unlock(&fuse_mutex);
}
- fdput(f);
return res;
}
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d418d8b..3334c394 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -190,6 +190,5 @@ const struct export_operations gfs2_export_ops = {
.fh_to_parent = gfs2_fh_to_parent,
.get_name = gfs2_get_name,
.get_parent = gfs2_get_parent,
- .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index f7dd648..1e73cf8 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1586,6 +1586,7 @@ const struct file_operations gfs2_file_fops = {
.splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
+ .fop_flags = FOP_ASYNC_LOCK,
};
const struct file_operations gfs2_dir_fops = {
@@ -1598,6 +1599,7 @@ const struct file_operations gfs2_dir_fops = {
.lock = gfs2_lock,
.flock = gfs2_flock,
.llseek = default_llseek,
+ .fop_flags = FOP_ASYNC_LOCK,
};
#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 269c3bc..4701c4a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -34,7 +34,6 @@
#include <linux/lockref.h>
#include <linux/rhashtable.h>
#include <linux/pid_namespace.h>
-#include <linux/fdtable.h>
#include <linux/file.h>
#include "gfs2.h"
@@ -2768,25 +2767,18 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
i->file = NULL;
}
- rcu_read_lock();
for(;; i->fd++) {
- struct inode *inode;
-
- i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
+ i->file = fget_task_next(i->task, &i->fd);
if (!i->file) {
i->fd = 0;
break;
}
- inode = file_inode(i->file);
- if (inode->i_sb == i->sb)
+ if (file_inode(i->file)->i_sb == i->sb)
break;
- rcu_read_unlock();
fput(i->file);
- rcu_read_lock();
}
- rcu_read_unlock();
return i->file;
}
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index eeac997..3bee9b5d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -15,10 +15,11 @@
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/nls.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/vfs.h>
@@ -111,21 +112,24 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static int hfs_remount(struct super_block *sb, int *flags, char *data)
+static int hfs_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+
sync_filesystem(sb);
- *flags |= SB_NODIRATIME;
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ fc->sb_flags |= SB_NODIRATIME;
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (!(*flags & SB_RDONLY)) {
+
+ if (!(fc->sb_flags & SB_RDONLY)) {
if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
} else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) {
pr_warn("filesystem is marked locked, leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
}
}
return 0;
@@ -180,7 +184,6 @@ static const struct super_operations hfs_super_operations = {
.put_super = hfs_put_super,
.sync_fs = hfs_sync_fs,
.statfs = hfs_statfs,
- .remount_fs = hfs_remount,
.show_options = hfs_show_options,
};
@@ -188,181 +191,112 @@ enum {
opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask,
opt_part, opt_session, opt_type, opt_creator, opt_quiet,
opt_codepage, opt_iocharset,
- opt_err
};
-static const match_table_t tokens = {
- { opt_uid, "uid=%u" },
- { opt_gid, "gid=%u" },
- { opt_umask, "umask=%o" },
- { opt_file_umask, "file_umask=%o" },
- { opt_dir_umask, "dir_umask=%o" },
- { opt_part, "part=%u" },
- { opt_session, "session=%u" },
- { opt_type, "type=%s" },
- { opt_creator, "creator=%s" },
- { opt_quiet, "quiet" },
- { opt_codepage, "codepage=%s" },
- { opt_iocharset, "iocharset=%s" },
- { opt_err, NULL }
+static const struct fs_parameter_spec hfs_param_spec[] = {
+ fsparam_u32 ("uid", opt_uid),
+ fsparam_u32 ("gid", opt_gid),
+ fsparam_u32oct ("umask", opt_umask),
+ fsparam_u32oct ("file_umask", opt_file_umask),
+ fsparam_u32oct ("dir_umask", opt_dir_umask),
+ fsparam_u32 ("part", opt_part),
+ fsparam_u32 ("session", opt_session),
+ fsparam_string ("type", opt_type),
+ fsparam_string ("creator", opt_creator),
+ fsparam_flag ("quiet", opt_quiet),
+ fsparam_string ("codepage", opt_codepage),
+ fsparam_string ("iocharset", opt_iocharset),
+ {}
};
-static inline int match_fourchar(substring_t *arg, u32 *result)
-{
- if (arg->to - arg->from != 4)
- return -EINVAL;
- memcpy(result, arg->from, 4);
- return 0;
-}
-
/*
- * parse_options()
+ * hfs_parse_param()
*
- * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger
- * This function is called by hfs_read_super() to parse the mount options.
+ * This function is called by the vfs to parse the mount options.
*/
-static int parse_options(char *options, struct hfs_sb_info *hsb)
+static int hfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int tmp, token;
+ struct hfs_sb_info *hsb = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
- /* initialize the sb with defaults */
- hsb->s_uid = current_uid();
- hsb->s_gid = current_gid();
- hsb->s_file_umask = 0133;
- hsb->s_dir_umask = 0022;
- hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */
- hsb->s_quiet = 0;
- hsb->part = -1;
- hsb->session = -1;
+ /* hfs does not honor any fs-specific options on remount */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
- if (!options)
- return 1;
+ opt = fs_parse(fc, hfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
- while ((p = strsep(&options, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case opt_uid:
- if (match_int(&args[0], &tmp)) {
- pr_err("uid requires an argument\n");
- return 0;
- }
- hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp);
- if (!uid_valid(hsb->s_uid)) {
- pr_err("invalid uid %d\n", tmp);
- return 0;
- }
- break;
- case opt_gid:
- if (match_int(&args[0], &tmp)) {
- pr_err("gid requires an argument\n");
- return 0;
- }
- hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp);
- if (!gid_valid(hsb->s_gid)) {
- pr_err("invalid gid %d\n", tmp);
- return 0;
- }
- break;
- case opt_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("umask requires a value\n");
- return 0;
- }
- hsb->s_file_umask = (umode_t)tmp;
- hsb->s_dir_umask = (umode_t)tmp;
- break;
- case opt_file_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("file_umask requires a value\n");
- return 0;
- }
- hsb->s_file_umask = (umode_t)tmp;
- break;
- case opt_dir_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("dir_umask requires a value\n");
- return 0;
- }
- hsb->s_dir_umask = (umode_t)tmp;
- break;
- case opt_part:
- if (match_int(&args[0], &hsb->part)) {
- pr_err("part requires an argument\n");
- return 0;
- }
- break;
- case opt_session:
- if (match_int(&args[0], &hsb->session)) {
- pr_err("session requires an argument\n");
- return 0;
- }
- break;
- case opt_type:
- if (match_fourchar(&args[0], &hsb->s_type)) {
- pr_err("type requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_creator:
- if (match_fourchar(&args[0], &hsb->s_creator)) {
- pr_err("creator requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_quiet:
- hsb->s_quiet = 1;
- break;
- case opt_codepage:
- if (hsb->nls_disk) {
- pr_err("unable to change codepage\n");
- return 0;
- }
- p = match_strdup(&args[0]);
- if (p)
- hsb->nls_disk = load_nls(p);
- if (!hsb->nls_disk) {
- pr_err("unable to load codepage \"%s\"\n", p);
- kfree(p);
- return 0;
- }
- kfree(p);
- break;
- case opt_iocharset:
- if (hsb->nls_io) {
- pr_err("unable to change iocharset\n");
- return 0;
- }
- p = match_strdup(&args[0]);
- if (p)
- hsb->nls_io = load_nls(p);
- if (!hsb->nls_io) {
- pr_err("unable to load iocharset \"%s\"\n", p);
- kfree(p);
- return 0;
- }
- kfree(p);
- break;
- default:
- return 0;
+ switch (opt) {
+ case opt_uid:
+ hsb->s_uid = result.uid;
+ break;
+ case opt_gid:
+ hsb->s_gid = result.gid;
+ break;
+ case opt_umask:
+ hsb->s_file_umask = (umode_t)result.uint_32;
+ hsb->s_dir_umask = (umode_t)result.uint_32;
+ break;
+ case opt_file_umask:
+ hsb->s_file_umask = (umode_t)result.uint_32;
+ break;
+ case opt_dir_umask:
+ hsb->s_dir_umask = (umode_t)result.uint_32;
+ break;
+ case opt_part:
+ hsb->part = result.uint_32;
+ break;
+ case opt_session:
+ hsb->session = result.uint_32;
+ break;
+ case opt_type:
+ if (strlen(param->string) != 4) {
+ pr_err("type requires a 4 character value\n");
+ return -EINVAL;
}
- }
-
- if (hsb->nls_disk && !hsb->nls_io) {
- hsb->nls_io = load_nls_default();
+ memcpy(&hsb->s_type, param->string, 4);
+ break;
+ case opt_creator:
+ if (strlen(param->string) != 4) {
+ pr_err("creator requires a 4 character value\n");
+ return -EINVAL;
+ }
+ memcpy(&hsb->s_creator, param->string, 4);
+ break;
+ case opt_quiet:
+ hsb->s_quiet = 1;
+ break;
+ case opt_codepage:
+ if (hsb->nls_disk) {
+ pr_err("unable to change codepage\n");
+ return -EINVAL;
+ }
+ hsb->nls_disk = load_nls(param->string);
+ if (!hsb->nls_disk) {
+ pr_err("unable to load codepage \"%s\"\n",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case opt_iocharset:
+ if (hsb->nls_io) {
+ pr_err("unable to change iocharset\n");
+ return -EINVAL;
+ }
+ hsb->nls_io = load_nls(param->string);
if (!hsb->nls_io) {
- pr_err("unable to load default iocharset\n");
- return 0;
+ pr_err("unable to load iocharset \"%s\"\n",
+ param->string);
+ return -EINVAL;
}
+ break;
+ default:
+ return -EINVAL;
}
- hsb->s_dir_umask &= 0777;
- hsb->s_file_umask &= 0577;
- return 1;
+ return 0;
}
/*
@@ -376,29 +310,25 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
* hfs_btree_init() to get the necessary data about the extents and
* catalog B-trees and, finally, reading the root inode into memory.
*/
-static int hfs_fill_super(struct super_block *sb, void *data, int silent)
+static int hfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
- struct hfs_sb_info *sbi;
+ struct hfs_sb_info *sbi = HFS_SB(sb);
struct hfs_find_data fd;
hfs_cat_rec rec;
struct inode *root_inode;
+ int silent = fc->sb_flags & SB_SILENT;
int res;
- sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
+ /* load_nls_default does not fail */
+ if (sbi->nls_disk && !sbi->nls_io)
+ sbi->nls_io = load_nls_default();
+ sbi->s_dir_umask &= 0777;
+ sbi->s_file_umask &= 0577;
- sbi->sb = sb;
- sb->s_fs_info = sbi;
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb);
- res = -EINVAL;
- if (!parse_options((char *)data, sbi)) {
- pr_err("unable to parse mount options\n");
- goto bail;
- }
-
+ sbi->sb = sb;
sb->s_op = &hfs_super_operations;
sb->s_xattr = hfs_xattr_handlers;
sb->s_flags |= SB_NODIRATIME;
@@ -451,18 +381,56 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
return res;
}
-static struct dentry *hfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hfs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
+ return get_tree_bdev(fc, hfs_fill_super);
+}
+
+static void hfs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations hfs_context_ops = {
+ .parse_param = hfs_parse_param,
+ .get_tree = hfs_get_tree,
+ .reconfigure = hfs_reconfigure,
+ .free = hfs_free_fc,
+};
+
+static int hfs_init_fs_context(struct fs_context *fc)
+{
+ struct hfs_sb_info *hsb;
+
+ hsb = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
+ if (!hsb)
+ return -ENOMEM;
+
+ fc->s_fs_info = hsb;
+ fc->ops = &hfs_context_ops;
+
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
+ /* initialize options with defaults */
+ hsb->s_uid = current_uid();
+ hsb->s_gid = current_gid();
+ hsb->s_file_umask = 0133;
+ hsb->s_dir_umask = 0022;
+ hsb->s_type = cpu_to_be32(0x3f3f3f3f); /* == '????' */
+ hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */
+ hsb->s_quiet = 0;
+ hsb->part = -1;
+ hsb->session = -1;
+ }
+
+ return 0;
}
static struct file_system_type hfs_fs_type = {
.owner = THIS_MODULE,
.name = "hfs",
- .mount = hfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = hfs_init_fs_context,
};
MODULE_ALIAS_FS("hfs");
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 59ce81d..2f089bf 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -21,6 +21,7 @@
#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
+#include <linux/fs_context.h>
#include "hfsplus_raw.h"
#define DBG_BNODE_REFS 0x00000001
@@ -156,6 +157,7 @@ struct hfsplus_sb_info {
/* Runtime variables */
u32 blockoffset;
+ u32 min_io_size;
sector_t part_start;
sector_t sect_count;
int fs_shift;
@@ -307,7 +309,7 @@ struct hfsplus_readdir_data {
*/
static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
{
- return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev),
+ return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size,
HFSPLUS_SECTOR_SIZE);
}
@@ -496,8 +498,7 @@ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
/* options.c */
void hfsplus_fill_defaults(struct hfsplus_sb_info *opts);
-int hfsplus_parse_options_remount(char *input, int *force);
-int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi);
+int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param);
int hfsplus_show_options(struct seq_file *seq, struct dentry *root);
/* part_tbl.c */
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index c94a587..a66a09a 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -12,7 +12,8 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/nls.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
@@ -23,26 +24,23 @@ enum {
opt_creator, opt_type,
opt_umask, opt_uid, opt_gid,
opt_part, opt_session, opt_nls,
- opt_nodecompose, opt_decompose,
- opt_barrier, opt_nobarrier,
- opt_force, opt_err
+ opt_decompose, opt_barrier,
+ opt_force,
};
-static const match_table_t tokens = {
- { opt_creator, "creator=%s" },
- { opt_type, "type=%s" },
- { opt_umask, "umask=%o" },
- { opt_uid, "uid=%u" },
- { opt_gid, "gid=%u" },
- { opt_part, "part=%u" },
- { opt_session, "session=%u" },
- { opt_nls, "nls=%s" },
- { opt_decompose, "decompose" },
- { opt_nodecompose, "nodecompose" },
- { opt_barrier, "barrier" },
- { opt_nobarrier, "nobarrier" },
- { opt_force, "force" },
- { opt_err, NULL }
+static const struct fs_parameter_spec hfs_param_spec[] = {
+ fsparam_string ("creator", opt_creator),
+ fsparam_string ("type", opt_type),
+ fsparam_u32oct ("umask", opt_umask),
+ fsparam_u32 ("uid", opt_uid),
+ fsparam_u32 ("gid", opt_gid),
+ fsparam_u32 ("part", opt_part),
+ fsparam_u32 ("session", opt_session),
+ fsparam_string ("nls", opt_nls),
+ fsparam_flag_no ("decompose", opt_decompose),
+ fsparam_flag_no ("barrier", opt_barrier),
+ fsparam_flag ("force", opt_force),
+ {}
};
/* Initialize an options object to reasonable defaults */
@@ -60,162 +58,89 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
opts->session = -1;
}
-/* convert a "four byte character" to a 32 bit int with error checks */
-static inline int match_fourchar(substring_t *arg, u32 *result)
+/* Parse options from mount. Returns nonzero errno on failure */
+int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- if (arg->to - arg->from != 4)
- return -EINVAL;
- memcpy(result, arg->from, 4);
- return 0;
-}
+ struct hfsplus_sb_info *sbi = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
-int hfsplus_parse_options_remount(char *input, int *force)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int token;
+ /*
+ * Only the force option is examined during remount, all others
+ * are ignored.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
+ strncmp(param->key, "force", 5))
+ return 0;
- if (!input)
- return 1;
+ opt = fs_parse(fc, hfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
- while ((p = strsep(&input, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case opt_force:
- *force = 1;
- break;
- default:
- break;
+ switch (opt) {
+ case opt_creator:
+ if (strlen(param->string) != 4) {
+ pr_err("creator requires a 4 character value\n");
+ return -EINVAL;
}
- }
-
- return 1;
-}
-
-/* Parse options from mount. Returns 0 on failure */
-/* input is the options passed to mount() as a string */
-int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int tmp, token;
-
- if (!input)
- goto done;
-
- while ((p = strsep(&input, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case opt_creator:
- if (match_fourchar(&args[0], &sbi->creator)) {
- pr_err("creator requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_type:
- if (match_fourchar(&args[0], &sbi->type)) {
- pr_err("type requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("umask requires a value\n");
- return 0;
- }
- sbi->umask = (umode_t)tmp;
- break;
- case opt_uid:
- if (match_int(&args[0], &tmp)) {
- pr_err("uid requires an argument\n");
- return 0;
- }
- sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp);
- if (!uid_valid(sbi->uid)) {
- pr_err("invalid uid specified\n");
- return 0;
- } else {
- set_bit(HFSPLUS_SB_UID, &sbi->flags);
- }
- break;
- case opt_gid:
- if (match_int(&args[0], &tmp)) {
- pr_err("gid requires an argument\n");
- return 0;
- }
- sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp);
- if (!gid_valid(sbi->gid)) {
- pr_err("invalid gid specified\n");
- return 0;
- } else {
- set_bit(HFSPLUS_SB_GID, &sbi->flags);
- }
- break;
- case opt_part:
- if (match_int(&args[0], &sbi->part)) {
- pr_err("part requires an argument\n");
- return 0;
- }
- break;
- case opt_session:
- if (match_int(&args[0], &sbi->session)) {
- pr_err("session requires an argument\n");
- return 0;
- }
- break;
- case opt_nls:
- if (sbi->nls) {
- pr_err("unable to change nls mapping\n");
- return 0;
- }
- p = match_strdup(&args[0]);
- if (p)
- sbi->nls = load_nls(p);
- if (!sbi->nls) {
- pr_err("unable to load nls mapping \"%s\"\n",
- p);
- kfree(p);
- return 0;
- }
- kfree(p);
- break;
- case opt_decompose:
- clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
- break;
- case opt_nodecompose:
+ memcpy(&sbi->creator, param->string, 4);
+ break;
+ case opt_type:
+ if (strlen(param->string) != 4) {
+ pr_err("type requires a 4 character value\n");
+ return -EINVAL;
+ }
+ memcpy(&sbi->type, param->string, 4);
+ break;
+ case opt_umask:
+ sbi->umask = (umode_t)result.uint_32;
+ break;
+ case opt_uid:
+ sbi->uid = result.uid;
+ set_bit(HFSPLUS_SB_UID, &sbi->flags);
+ break;
+ case opt_gid:
+ sbi->gid = result.gid;
+ set_bit(HFSPLUS_SB_GID, &sbi->flags);
+ break;
+ case opt_part:
+ sbi->part = result.uint_32;
+ break;
+ case opt_session:
+ sbi->session = result.uint_32;
+ break;
+ case opt_nls:
+ if (sbi->nls) {
+ pr_err("unable to change nls mapping\n");
+ return -EINVAL;
+ }
+ sbi->nls = load_nls(param->string);
+ if (!sbi->nls) {
+ pr_err("unable to load nls mapping \"%s\"\n",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case opt_decompose:
+ if (result.negated)
set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
- break;
- case opt_barrier:
- clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
- break;
- case opt_nobarrier:
+ else
+ clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
+ break;
+ case opt_barrier:
+ if (result.negated)
set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
- break;
- case opt_force:
- set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
- break;
- default:
- return 0;
- }
+ else
+ clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+ break;
+ case opt_force:
+ set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
+ break;
+ default:
+ return -EINVAL;
}
-done:
- if (!sbi->nls) {
- /* try utf8 first, as this is the old default behaviour */
- sbi->nls = load_nls("utf8");
- if (!sbi->nls)
- sbi->nls = load_nls_default();
- if (!sbi->nls)
- return 0;
- }
-
- return 1;
+ return 0;
}
int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9792020..948b8aa 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -14,6 +14,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
#include <linux/slab.h>
#include <linux/vfs.h>
#include <linux/nls.h>
@@ -332,34 +333,33 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
+static int hfsplus_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
- return 0;
- if (!(*flags & SB_RDONLY)) {
- struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
- int force = 0;
+ struct super_block *sb = fc->root->d_sb;
- if (!hfsplus_parse_options_remount(data, &force))
- return -EINVAL;
+ sync_filesystem(sb);
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
+ return 0;
+ if (!(fc->sb_flags & SB_RDONLY)) {
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+ struct hfsplus_vh *vhdr = sbi->s_vhdr;
if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
- } else if (force) {
+ fc->sb_flags |= SB_RDONLY;
+ } else if (test_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
/* nothing */
} else if (vhdr->attributes &
cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
pr_warn("filesystem is marked locked, leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
} else if (vhdr->attributes &
cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
pr_warn("filesystem is marked journaled, leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
}
}
return 0;
@@ -373,38 +373,33 @@ static const struct super_operations hfsplus_sops = {
.put_super = hfsplus_put_super,
.sync_fs = hfsplus_sync_fs,
.statfs = hfsplus_statfs,
- .remount_fs = hfsplus_remount,
.show_options = hfsplus_show_options,
};
-static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
+static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct hfsplus_vh *vhdr;
- struct hfsplus_sb_info *sbi;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
hfsplus_cat_entry entry;
struct hfs_find_data fd;
struct inode *root, *inode;
struct qstr str;
- struct nls_table *nls = NULL;
+ struct nls_table *nls;
u64 last_fs_block, last_fs_page;
+ int silent = fc->sb_flags & SB_SILENT;
int err;
- err = -ENOMEM;
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- goto out;
-
- sb->s_fs_info = sbi;
mutex_init(&sbi->alloc_mutex);
mutex_init(&sbi->vh_mutex);
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
- hfsplus_fill_defaults(sbi);
err = -EINVAL;
- if (!hfsplus_parse_options(data, sbi)) {
- pr_err("unable to parse mount options\n");
- goto out_unload_nls;
+ if (!sbi->nls) {
+ /* try utf8 first, as this is the old default behaviour */
+ sbi->nls = load_nls("utf8");
+ if (!sbi->nls)
+ sbi->nls = load_nls_default();
}
/* temporarily use utf8 to correctly find the hidden dir below */
@@ -616,7 +611,6 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
unload_nls(sbi->nls);
unload_nls(nls);
kfree(sbi);
-out:
return err;
}
@@ -641,18 +635,46 @@ static void hfsplus_free_inode(struct inode *inode)
#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
-static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hfsplus_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
+ return get_tree_bdev(fc, hfsplus_fill_super);
+}
+
+static void hfsplus_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations hfsplus_context_ops = {
+ .parse_param = hfsplus_parse_param,
+ .get_tree = hfsplus_get_tree,
+ .reconfigure = hfsplus_reconfigure,
+ .free = hfsplus_free_fc,
+};
+
+static int hfsplus_init_fs_context(struct fs_context *fc)
+{
+ struct hfsplus_sb_info *sbi;
+
+ sbi = kzalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE)
+ hfsplus_fill_defaults(sbi);
+
+ fc->s_fs_info = sbi;
+ fc->ops = &hfsplus_context_ops;
+
+ return 0;
}
static struct file_system_type hfsplus_fs_type = {
.owner = THIS_MODULE,
.name = "hfsplus",
- .mount = hfsplus_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = hfsplus_init_fs_context,
};
MODULE_ALIAS_FS("hfsplus");
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 9592ffc..7480191 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -172,6 +172,8 @@ int hfsplus_read_wrapper(struct super_block *sb)
if (!blocksize)
goto out;
+ sbi->min_io_size = blocksize;
+
if (hfsplus_get_last_session(sb, &part_start, &part_size))
goto out;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index e73717d..2756792 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -9,7 +9,8 @@
#include "hpfs_fn.h"
#include <linux/module.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/init.h>
#include <linux/statfs.h>
#include <linux/magic.h>
@@ -90,7 +91,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...)
hpfs_sb(s)->sb_was_error = 1;
}
-/*
+/*
* A little trick to detect cycles in many hpfs structures and don't let the
* kernel crash on corrupted filesystem. When first called, set c2 to 0.
*
@@ -272,146 +273,70 @@ static void destroy_inodecache(void)
kmem_cache_destroy(hpfs_inode_cachep);
}
-/*
- * A tiny parser for option strings, stolen from dosfs.
- * Stolen again from read-only hpfs.
- * And updated for table-driven option parsing.
- */
-
enum {
- Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis,
- Opt_check_none, Opt_check_normal, Opt_check_strict,
- Opt_err_cont, Opt_err_ro, Opt_err_panic,
- Opt_eas_no, Opt_eas_ro, Opt_eas_rw,
- Opt_chkdsk_no, Opt_chkdsk_errors, Opt_chkdsk_always,
- Opt_timeshift, Opt_err,
+ Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case,
+ Opt_check, Opt_err, Opt_eas, Opt_chkdsk, Opt_timeshift,
};
-static const match_table_t tokens = {
- {Opt_help, "help"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_umask, "umask=%o"},
- {Opt_case_lower, "case=lower"},
- {Opt_case_asis, "case=asis"},
- {Opt_check_none, "check=none"},
- {Opt_check_normal, "check=normal"},
- {Opt_check_strict, "check=strict"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_err_panic, "errors=panic"},
- {Opt_eas_no, "eas=no"},
- {Opt_eas_ro, "eas=ro"},
- {Opt_eas_rw, "eas=rw"},
- {Opt_chkdsk_no, "chkdsk=no"},
- {Opt_chkdsk_errors, "chkdsk=errors"},
- {Opt_chkdsk_always, "chkdsk=always"},
- {Opt_timeshift, "timeshift=%d"},
- {Opt_err, NULL},
+static const struct constant_table hpfs_param_case[] = {
+ {"asis", 0},
+ {"lower", 1},
+ {}
};
-static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask,
- int *lowercase, int *eas, int *chk, int *errs,
- int *chkdsk, int *timeshift)
-{
- char *p;
- int option;
+static const struct constant_table hpfs_param_check[] = {
+ {"none", 0},
+ {"normal", 1},
+ {"strict", 2},
+ {}
+};
- if (!opts)
- return 1;
+static const struct constant_table hpfs_param_err[] = {
+ {"continue", 0},
+ {"remount-ro", 1},
+ {"panic", 2},
+ {}
+};
- /*pr_info("Parsing opts: '%s'\n",opts);*/
+static const struct constant_table hpfs_param_eas[] = {
+ {"no", 0},
+ {"ro", 1},
+ {"rw", 2},
+ {}
+};
- while ((p = strsep(&opts, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- if (!*p)
- continue;
+static const struct constant_table hpfs_param_chkdsk[] = {
+ {"no", 0},
+ {"errors", 1},
+ {"always", 2},
+ {}
+};
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_help:
- return 2;
- case Opt_uid:
- if (match_int(args, &option))
- return 0;
- *uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(*uid))
- return 0;
- break;
- case Opt_gid:
- if (match_int(args, &option))
- return 0;
- *gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(*gid))
- return 0;
- break;
- case Opt_umask:
- if (match_octal(args, &option))
- return 0;
- *umask = option;
- break;
- case Opt_case_lower:
- *lowercase = 1;
- break;
- case Opt_case_asis:
- *lowercase = 0;
- break;
- case Opt_check_none:
- *chk = 0;
- break;
- case Opt_check_normal:
- *chk = 1;
- break;
- case Opt_check_strict:
- *chk = 2;
- break;
- case Opt_err_cont:
- *errs = 0;
- break;
- case Opt_err_ro:
- *errs = 1;
- break;
- case Opt_err_panic:
- *errs = 2;
- break;
- case Opt_eas_no:
- *eas = 0;
- break;
- case Opt_eas_ro:
- *eas = 1;
- break;
- case Opt_eas_rw:
- *eas = 2;
- break;
- case Opt_chkdsk_no:
- *chkdsk = 0;
- break;
- case Opt_chkdsk_errors:
- *chkdsk = 1;
- break;
- case Opt_chkdsk_always:
- *chkdsk = 2;
- break;
- case Opt_timeshift:
- {
- int m = 1;
- char *rhs = args[0].from;
- if (!rhs || !*rhs)
- return 0;
- if (*rhs == '-') m = -1;
- if (*rhs == '+' || *rhs == '-') rhs++;
- *timeshift = simple_strtoul(rhs, &rhs, 0) * m;
- if (*rhs)
- return 0;
- break;
- }
- default:
- return 0;
- }
- }
- return 1;
-}
+static const struct fs_parameter_spec hpfs_param_spec[] = {
+ fsparam_flag ("help", Opt_help),
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("umask", Opt_umask),
+ fsparam_enum ("case", Opt_case, hpfs_param_case),
+ fsparam_enum ("check", Opt_check, hpfs_param_check),
+ fsparam_enum ("errors", Opt_err, hpfs_param_err),
+ fsparam_enum ("eas", Opt_eas, hpfs_param_eas),
+ fsparam_enum ("chkdsk", Opt_chkdsk, hpfs_param_chkdsk),
+ fsparam_s32 ("timeshift", Opt_timeshift),
+ {}
+};
+
+struct hpfs_fc_context {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t umask;
+ int lowercase;
+ int eas;
+ int chk;
+ int errs;
+ int chkdsk;
+ int timeshift;
+};
static inline void hpfs_help(void)
{
@@ -439,49 +364,92 @@ HPFS filesystem options:\n\
\n");
}
-static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
+static int hpfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- kuid_t uid;
- kgid_t gid;
- umode_t umask;
- int lowercase, eas, chk, errs, chkdsk, timeshift;
- int o;
+ struct hpfs_fc_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, hpfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_help:
+ hpfs_help();
+ return -EINVAL;
+ case Opt_uid:
+ ctx->uid = result.uid;
+ break;
+ case Opt_gid:
+ ctx->gid = result.gid;
+ break;
+ case Opt_umask:
+ ctx->umask = result.uint_32;
+ break;
+ case Opt_case:
+ ctx->lowercase = result.uint_32;
+ break;
+ case Opt_check:
+ ctx->chk = result.uint_32;
+ break;
+ case Opt_err:
+ ctx->errs = result.uint_32;
+ break;
+ case Opt_eas:
+ ctx->eas = result.uint_32;
+ break;
+ case Opt_chkdsk:
+ ctx->chkdsk = result.uint_32;
+ break;
+ case Opt_timeshift:
+ {
+ int m = 1;
+ char *rhs = param->string;
+ int timeshift;
+
+ if (*rhs == '-') m = -1;
+ if (*rhs == '+' || *rhs == '-') rhs++;
+ timeshift = simple_strtoul(rhs, &rhs, 0) * m;
+ if (*rhs)
+ return -EINVAL;
+ ctx->timeshift = timeshift;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int hpfs_reconfigure(struct fs_context *fc)
+{
+ struct hpfs_fc_context *ctx = fc->fs_private;
+ struct super_block *s = fc->root->d_sb;
struct hpfs_sb_info *sbi = hpfs_sb(s);
sync_filesystem(s);
- *flags |= SB_NOATIME;
+ fc->sb_flags |= SB_NOATIME;
hpfs_lock(s);
- uid = sbi->sb_uid; gid = sbi->sb_gid;
- umask = 0777 & ~sbi->sb_mode;
- lowercase = sbi->sb_lowercase;
- eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk;
- errs = sbi->sb_err; timeshift = sbi->sb_timeshift;
- if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase,
- &eas, &chk, &errs, &chkdsk, ×hift))) {
- pr_err("bad mount options.\n");
- goto out_err;
- }
- if (o == 2) {
- hpfs_help();
- goto out_err;
- }
- if (timeshift != sbi->sb_timeshift) {
+ if (ctx->timeshift != sbi->sb_timeshift) {
pr_err("timeshift can't be changed using remount.\n");
goto out_err;
}
unmark_dirty(s);
- sbi->sb_uid = uid; sbi->sb_gid = gid;
- sbi->sb_mode = 0777 & ~umask;
- sbi->sb_lowercase = lowercase;
- sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk;
- sbi->sb_err = errs; sbi->sb_timeshift = timeshift;
+ sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid;
+ sbi->sb_mode = 0777 & ~ctx->umask;
+ sbi->sb_lowercase = ctx->lowercase;
+ sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk;
+ sbi->sb_chkdsk = ctx->chkdsk;
+ sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift;
- if (!(*flags & SB_RDONLY)) mark_dirty(s, 1);
+ if (!(fc->sb_flags & SB_RDONLY)) mark_dirty(s, 1);
hpfs_unlock(s);
return 0;
@@ -530,30 +498,24 @@ static const struct super_operations hpfs_sops =
.evict_inode = hpfs_evict_inode,
.put_super = hpfs_put_super,
.statfs = hpfs_statfs,
- .remount_fs = hpfs_remount_fs,
.show_options = hpfs_show_options,
};
-static int hpfs_fill_super(struct super_block *s, void *options, int silent)
+static int hpfs_fill_super(struct super_block *s, struct fs_context *fc)
{
+ struct hpfs_fc_context *ctx = fc->fs_private;
struct buffer_head *bh0, *bh1, *bh2;
struct hpfs_boot_block *bootblock;
struct hpfs_super_block *superblock;
struct hpfs_spare_block *spareblock;
struct hpfs_sb_info *sbi;
struct inode *root;
-
- kuid_t uid;
- kgid_t gid;
- umode_t umask;
- int lowercase, eas, chk, errs, chkdsk, timeshift;
+ int silent = fc->sb_flags & SB_SILENT;
dnode_secno root_dno;
struct hpfs_dirent *de = NULL;
struct quad_buffer_head qbh;
- int o;
-
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi) {
return -ENOMEM;
@@ -563,26 +525,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
mutex_init(&sbi->hpfs_mutex);
hpfs_lock(s);
- uid = current_uid();
- gid = current_gid();
- umask = current_umask();
- lowercase = 0;
- eas = 2;
- chk = 1;
- errs = 1;
- chkdsk = 1;
- timeshift = 0;
-
- if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase,
- &eas, &chk, &errs, &chkdsk, ×hift))) {
- pr_err("bad mount options.\n");
- goto bail0;
- }
- if (o==2) {
- hpfs_help();
- goto bail0;
- }
-
/*sbi->sb_mounting = 1;*/
sb_set_blocksize(s, 512);
sbi->sb_fs_size = -1;
@@ -622,17 +564,17 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start);
sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band);
sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap);
- sbi->sb_uid = uid;
- sbi->sb_gid = gid;
- sbi->sb_mode = 0777 & ~umask;
+ sbi->sb_uid = ctx->uid;
+ sbi->sb_gid = ctx->gid;
+ sbi->sb_mode = 0777 & ~ctx->umask;
sbi->sb_n_free = -1;
sbi->sb_n_free_dnodes = -1;
- sbi->sb_lowercase = lowercase;
- sbi->sb_eas = eas;
- sbi->sb_chk = chk;
- sbi->sb_chkdsk = chkdsk;
- sbi->sb_err = errs;
- sbi->sb_timeshift = timeshift;
+ sbi->sb_lowercase = ctx->lowercase;
+ sbi->sb_eas = ctx->eas;
+ sbi->sb_chk = ctx->chk;
+ sbi->sb_chkdsk = ctx->chkdsk;
+ sbi->sb_err = ctx->errs;
+ sbi->sb_timeshift = ctx->timeshift;
sbi->sb_was_error = 0;
sbi->sb_cp_table = NULL;
sbi->sb_c_bitmap = -1;
@@ -653,7 +595,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
/* Check for general fs errors*/
if (spareblock->dirty && !spareblock->old_wrote) {
- if (errs == 2) {
+ if (sbi->sb_err == 2) {
pr_err("Improperly stopped, not mounted\n");
goto bail4;
}
@@ -667,16 +609,16 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
}
if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) {
- if (errs >= 2) {
+ if (sbi->sb_err >= 2) {
pr_err("Spare dnodes used, try chkdsk\n");
mark_dirty(s, 0);
goto bail4;
}
hpfs_error(s, "warning: spare dnodes used, try chkdsk");
- if (errs == 0)
+ if (sbi->sb_err == 0)
pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n");
}
- if (chk) {
+ if (sbi->sb_chk) {
unsigned a;
if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) ||
le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) {
@@ -755,18 +697,70 @@ bail2: brelse(bh0);
return -EINVAL;
}
-static struct dentry *hpfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hpfs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
+ return get_tree_bdev(fc, hpfs_fill_super);
}
+static void hpfs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations hpfs_fc_context_ops = {
+ .parse_param = hpfs_parse_param,
+ .get_tree = hpfs_get_tree,
+ .reconfigure = hpfs_reconfigure,
+ .free = hpfs_free_fc,
+};
+
+static int hpfs_init_fs_context(struct fs_context *fc)
+{
+ struct hpfs_fc_context *ctx;
+
+ ctx = kzalloc(sizeof(struct hpfs_fc_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct hpfs_sb_info *sbi = hpfs_sb(sb);
+
+ ctx->uid = sbi->sb_uid;
+ ctx->gid = sbi->sb_gid;
+ ctx->umask = 0777 & ~sbi->sb_mode;
+ ctx->lowercase = sbi->sb_lowercase;
+ ctx->eas = sbi->sb_eas;
+ ctx->chk = sbi->sb_chk;
+ ctx->chkdsk = sbi->sb_chkdsk;
+ ctx->errs = sbi->sb_err;
+ ctx->timeshift = sbi->sb_timeshift;
+
+ } else {
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+ ctx->umask = current_umask();
+ ctx->lowercase = 0;
+ ctx->eas = 2;
+ ctx->chk = 1;
+ ctx->errs = 1;
+ ctx->chkdsk = 1;
+ ctx->timeshift = 0;
+ }
+
+ fc->fs_private = ctx;
+ fc->ops = &hpfs_fc_context_ops;
+
+ return 0;
+};
+
static struct file_system_type hpfs_fs_type = {
.owner = THIS_MODULE,
.name = "hpfs",
- .mount = hpfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = hpfs_init_fs_context,
+ .parameters = hpfs_param_spec,
};
MODULE_ALIAS_FS("hpfs");
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5cf32733..1bbf783 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -39,6 +39,9 @@
#include <linux/uaccess.h>
#include <linux/sched/mm.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/hugetlbfs.h>
+
static const struct address_space_operations hugetlbfs_aops;
static const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
@@ -110,7 +113,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* way when do_mmap unwinds (may be important on powerpc
* and ia64).
*/
- vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
+ vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND | VM_MTE_ALLOWED);
vma->vm_ops = &hugetlb_vm_ops;
ret = seal_check_write(info->seals, vma);
@@ -687,6 +690,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
+ trace_hugetlbfs_evict_inode(inode);
remove_inode_hugepages(inode, 0, LLONG_MAX);
/*
@@ -814,8 +818,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- return hugetlbfs_punch_hole(inode, offset, len);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = hugetlbfs_punch_hole(inode, offset, len);
+ goto out_nolock;
+ }
/*
* Default preallocate case.
@@ -919,6 +925,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
inode_set_ctime_current(inode);
out:
inode_unlock(inode);
+
+out_nolock:
+ trace_hugetlbfs_fallocate(inode, mode, offset, len, error);
return error;
}
@@ -935,6 +944,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
if (error)
return error;
+ trace_hugetlbfs_setattr(inode, dentry, attr);
+
if (ia_valid & ATTR_SIZE) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
@@ -1033,6 +1044,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
}
lockdep_annotate_inode_mutex_key(inode);
+ trace_hugetlbfs_alloc_inode(inode, dir, mode);
} else {
if (resv_map)
kref_put(&resv_map->refs, resv_map_release);
@@ -1272,6 +1284,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
static void hugetlbfs_free_inode(struct inode *inode)
{
+ trace_hugetlbfs_free_inode(inode);
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}
diff --git a/fs/inode.c b/fs/inode.c
index 8dabb22..b13b778 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,7 +21,12 @@
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <linux/rw_hint.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
#include <trace/events/writeback.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/timestamp.h>
+
#include "internal.h"
/*
@@ -98,6 +103,70 @@ long get_nr_dirty_inodes(void)
return nr_dirty > 0 ? nr_dirty : 0;
}
+#ifdef CONFIG_DEBUG_FS
+static DEFINE_PER_CPU(long, mg_ctime_updates);
+static DEFINE_PER_CPU(long, mg_fine_stamps);
+static DEFINE_PER_CPU(long, mg_ctime_swaps);
+
+static unsigned long get_mg_ctime_updates(void)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ sum += data_race(per_cpu(mg_ctime_updates, i));
+ return sum;
+}
+
+static unsigned long get_mg_fine_stamps(void)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ sum += data_race(per_cpu(mg_fine_stamps, i));
+ return sum;
+}
+
+static unsigned long get_mg_ctime_swaps(void)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ sum += data_race(per_cpu(mg_ctime_swaps, i));
+ return sum;
+}
+
+#define mgtime_counter_inc(__var) this_cpu_inc(__var)
+
+static int mgts_show(struct seq_file *s, void *p)
+{
+ unsigned long ctime_updates = get_mg_ctime_updates();
+ unsigned long ctime_swaps = get_mg_ctime_swaps();
+ unsigned long fine_stamps = get_mg_fine_stamps();
+ unsigned long floor_swaps = timekeeping_get_mg_floor_swaps();
+
+ seq_printf(s, "%lu %lu %lu %lu\n",
+ ctime_updates, ctime_swaps, fine_stamps, floor_swaps);
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(mgts);
+
+static int __init mg_debugfs_init(void)
+{
+ debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops);
+ return 0;
+}
+late_initcall(mg_debugfs_init);
+
+#else /* ! CONFIG_DEBUG_FS */
+
+#define mgtime_counter_inc(__var) do { } while (0)
+
+#endif /* CONFIG_DEBUG_FS */
+
/*
* Handle nr_inode sysctl
*/
@@ -174,6 +243,8 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp
inode->i_opflags = 0;
if (sb->s_xattr)
inode->i_opflags |= IOP_XATTR;
+ if (sb->s_type->fs_flags & FS_MGTIME)
+ inode->i_opflags |= IOP_MGTIME;
i_uid_write(inode, 0);
i_gid_write(inode, 0);
atomic_set(&inode->i_writecount, 0);
@@ -748,7 +819,7 @@ static void evict(struct inode *inode)
* ___wait_var_event() either sees the bit cleared or
* waitqueue_active() check in wake_up_var() sees the waiter.
*/
- smp_mb();
+ smp_mb__after_spinlock();
inode_wake_up_bit(inode, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
spin_unlock(&inode->i_lock);
@@ -1241,16 +1312,15 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
* @data: opaque data pointer to pass to @test and @set
*
* Search for the inode specified by @hashval and @data in the inode cache,
- * and if present it is return it with an increased reference count. This is
- * a variant of iget5_locked() for callers that don't want to fail on memory
- * allocation of inode.
+ * and if present return it with an increased reference count. This is a
+ * variant of iget5_locked() that doesn't allocate an inode.
*
- * If the inode is not in cache, insert the pre-allocated inode to cache and
+ * If the inode is not present in the cache, insert the pre-allocated inode and
* return it locked, hashed, and with the I_NEW flag set. The file system gets
* to fill it in before unlocking it via unlock_new_inode().
*
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
+ * Note that both @test and @set are called with the inode_hash_lock held, so
+ * they can't sleep.
*/
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *),
@@ -1314,16 +1384,16 @@ EXPORT_SYMBOL(inode_insert5);
* @data: opaque data pointer to pass to @test and @set
*
* Search for the inode specified by @hashval and @data in the inode cache,
- * and if present it is return it with an increased reference count. This is
- * a generalized version of iget_locked() for file systems where the inode
+ * and if present return it with an increased reference count. This is a
+ * generalized version of iget_locked() for file systems where the inode
* number is not sufficient for unique identification of an inode.
*
- * If the inode is not in cache, allocate a new inode and return it locked,
- * hashed, and with the I_NEW flag set. The file system gets to fill it in
- * before unlocking it via unlock_new_inode().
+ * If the inode is not present in the cache, allocate and insert a new inode
+ * and return it locked, hashed, and with the I_NEW flag set. The file system
+ * gets to fill it in before unlocking it via unlock_new_inode().
*
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
+ * Note that both @test and @set are called with the inode_hash_lock held, so
+ * they can't sleep.
*/
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *),
@@ -2211,19 +2281,58 @@ int file_remove_privs(struct file *file)
}
EXPORT_SYMBOL(file_remove_privs);
+/**
+ * current_time - Return FS time (possibly fine-grained)
+ * @inode: inode.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs, as suitable for a ctime/mtime change. If the ctime is flagged
+ * as having been QUERIED, get a fine-grained timestamp, but don't update
+ * the floor.
+ *
+ * For a multigrain inode, this is effectively an estimate of the timestamp
+ * that a file would receive. An actual update must go through
+ * inode_set_ctime_current().
+ */
+struct timespec64 current_time(struct inode *inode)
+{
+ struct timespec64 now;
+ u32 cns;
+
+ ktime_get_coarse_real_ts64_mg(&now);
+
+ if (!is_mgtime(inode))
+ goto out;
+
+ /* If nothing has queried it, then coarse time is fine */
+ cns = smp_load_acquire(&inode->i_ctime_nsec);
+ if (cns & I_CTIME_QUERIED) {
+ /*
+ * If there is no apparent change, then get a fine-grained
+ * timestamp.
+ */
+ if (now.tv_nsec == (cns & ~I_CTIME_QUERIED))
+ ktime_get_real_ts64(&now);
+ }
+out:
+ return timestamp_truncate(now, inode);
+}
+EXPORT_SYMBOL(current_time);
+
static int inode_needs_update_time(struct inode *inode)
{
+ struct timespec64 now, ts;
int sync_it = 0;
- struct timespec64 now = current_time(inode);
- struct timespec64 ts;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
+ now = current_time(inode);
+
ts = inode_get_mtime(inode);
if (!timespec64_equal(&ts, &now))
- sync_it = S_MTIME;
+ sync_it |= S_MTIME;
ts = inode_get_ctime(inode);
if (!timespec64_equal(&ts, &now))
@@ -2600,6 +2709,16 @@ void inode_nohighmem(struct inode *inode)
}
EXPORT_SYMBOL(inode_nohighmem);
+struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts)
+{
+ trace_inode_set_ctime_to_ts(inode, &ts);
+ set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec);
+ inode->i_ctime_sec = ts.tv_sec;
+ inode->i_ctime_nsec = ts.tv_nsec;
+ return ts;
+}
+EXPORT_SYMBOL(inode_set_ctime_to_ts);
+
/**
* timestamp_truncate - Truncate timespec to a granularity
* @t: Timespec
@@ -2632,47 +2751,167 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
EXPORT_SYMBOL(timestamp_truncate);
/**
- * current_time - Return FS time
- * @inode: inode.
- *
- * Return the current time truncated to the time granularity supported by
- * the fs.
- *
- * Note that inode and inode->sb cannot be NULL.
- * Otherwise, the function warns and returns time without truncation.
- */
-struct timespec64 current_time(struct inode *inode)
-{
- struct timespec64 now;
-
- ktime_get_coarse_real_ts64(&now);
- return timestamp_truncate(now, inode);
-}
-EXPORT_SYMBOL(current_time);
-
-/**
* inode_set_ctime_current - set the ctime to current_time
* @inode: inode
*
- * Set the inode->i_ctime to the current value for the inode. Returns
- * the current value that was assigned to i_ctime.
+ * Set the inode's ctime to the current value for the inode. Returns the
+ * current value that was assigned. If this is not a multigrain inode, then we
+ * set it to the later of the coarse time and floor value.
+ *
+ * If it is multigrain, then we first see if the coarse-grained timestamp is
+ * distinct from what is already there. If so, then use that. Otherwise, get a
+ * fine-grained timestamp.
+ *
+ * After that, try to swap the new value into i_ctime_nsec. Accept the
+ * resulting ctime, regardless of the outcome of the swap. If it has
+ * already been replaced, then that timestamp is later than the earlier
+ * unacceptable one, and is thus acceptable.
*/
struct timespec64 inode_set_ctime_current(struct inode *inode)
{
- struct timespec64 now = current_time(inode);
+ struct timespec64 now;
+ u32 cns, cur;
- inode_set_ctime_to_ts(inode, now);
+ ktime_get_coarse_real_ts64_mg(&now);
+ now = timestamp_truncate(now, inode);
+
+ /* Just return that if this is not a multigrain fs */
+ if (!is_mgtime(inode)) {
+ inode_set_ctime_to_ts(inode, now);
+ goto out;
+ }
+
+ /*
+ * A fine-grained time is only needed if someone has queried
+ * for timestamps, and the current coarse grained time isn't
+ * later than what's already there.
+ */
+ cns = smp_load_acquire(&inode->i_ctime_nsec);
+ if (cns & I_CTIME_QUERIED) {
+ struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec,
+ .tv_nsec = cns & ~I_CTIME_QUERIED };
+
+ if (timespec64_compare(&now, &ctime) <= 0) {
+ ktime_get_real_ts64_mg(&now);
+ now = timestamp_truncate(now, inode);
+ mgtime_counter_inc(mg_fine_stamps);
+ }
+ }
+ mgtime_counter_inc(mg_ctime_updates);
+
+ /* No need to cmpxchg if it's exactly the same */
+ if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) {
+ trace_ctime_xchg_skip(inode, &now);
+ goto out;
+ }
+ cur = cns;
+retry:
+ /* Try to swap the nsec value into place. */
+ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) {
+ /* If swap occurred, then we're (mostly) done */
+ inode->i_ctime_sec = now.tv_sec;
+ trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur);
+ mgtime_counter_inc(mg_ctime_swaps);
+ } else {
+ /*
+ * Was the change due to someone marking the old ctime QUERIED?
+ * If so then retry the swap. This can only happen once since
+ * the only way to clear I_CTIME_QUERIED is to stamp the inode
+ * with a new ctime.
+ */
+ if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) {
+ cns = cur;
+ goto retry;
+ }
+ /* Otherwise, keep the existing ctime */
+ now.tv_sec = inode->i_ctime_sec;
+ now.tv_nsec = cur & ~I_CTIME_QUERIED;
+ }
+out:
return now;
}
EXPORT_SYMBOL(inode_set_ctime_current);
/**
+ * inode_set_ctime_deleg - try to update the ctime on a delegated inode
+ * @inode: inode to update
+ * @update: timespec64 to set the ctime
+ *
+ * Attempt to atomically update the ctime on behalf of a delegation holder.
+ *
+ * The nfs server can call back the holder of a delegation to get updated
+ * inode attributes, including the mtime. When updating the mtime, update
+ * the ctime to a value at least equal to that.
+ *
+ * This can race with concurrent updates to the inode, in which
+ * case the update is skipped.
+ *
+ * Note that this works even when multigrain timestamps are not enabled,
+ * so it is used in either case.
+ */
+struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update)
+{
+ struct timespec64 now, cur_ts;
+ u32 cur, old;
+
+ /* pairs with try_cmpxchg below */
+ cur = smp_load_acquire(&inode->i_ctime_nsec);
+ cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
+ cur_ts.tv_sec = inode->i_ctime_sec;
+
+ /* If the update is older than the existing value, skip it. */
+ if (timespec64_compare(&update, &cur_ts) <= 0)
+ return cur_ts;
+
+ ktime_get_coarse_real_ts64_mg(&now);
+
+ /* Clamp the update to "now" if it's in the future */
+ if (timespec64_compare(&update, &now) > 0)
+ update = now;
+
+ update = timestamp_truncate(update, inode);
+
+ /* No need to update if the values are already the same */
+ if (timespec64_equal(&update, &cur_ts))
+ return cur_ts;
+
+ /*
+ * Try to swap the nsec value into place. If it fails, that means
+ * it raced with an update due to a write or similar activity. That
+ * stamp takes precedence, so just skip the update.
+ */
+retry:
+ old = cur;
+ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) {
+ inode->i_ctime_sec = update.tv_sec;
+ mgtime_counter_inc(mg_ctime_swaps);
+ return update;
+ }
+
+ /*
+ * Was the change due to another task marking the old ctime QUERIED?
+ *
+ * If so, then retry the swap. This can only happen once since
+ * the only way to clear I_CTIME_QUERIED is to stamp the inode
+ * with a new ctime.
+ */
+ if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED)))
+ goto retry;
+
+ /* Otherwise, it was a new timestamp. */
+ cur_ts.tv_sec = inode->i_ctime_sec;
+ cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
+ return cur_ts;
+}
+EXPORT_SYMBOL(inode_set_ctime_deleg);
+
+/**
* in_group_or_capable - check whether caller is CAP_FSETID privileged
* @idmap: idmap of the mount @inode was found from
* @inode: inode to check
* @vfsgid: the new/current vfsgid of @inode
*
- * Check wether @vfsgid is in the caller's group list or if the caller is
+ * Check whether @vfsgid is in the caller's group list or if the caller is
* privileged with CAP_FSETID over @inode. This can be used to determine
* whether the setgid bit can be kept or must be dropped.
*
diff --git a/fs/internal.h b/fs/internal.h
index 8c1b7ac..e7f02ae 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -246,7 +246,6 @@ int open_namespace(struct ns_common *ns);
* fs/stat.c:
*/
-int getname_statx_lookup_flags(int flags);
int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer);
int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
@@ -267,7 +266,7 @@ struct xattr_name {
char name[XATTR_NAME_MAX + 1];
};
-struct xattr_ctx {
+struct kernel_xattr_ctx {
/* Value of attribute */
union {
const void __user *cvalue;
@@ -280,14 +279,15 @@ struct xattr_ctx {
unsigned int flags;
};
+ssize_t file_getxattr(struct file *file, struct kernel_xattr_ctx *ctx);
+ssize_t filename_getxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx);
+int file_setxattr(struct file *file, struct kernel_xattr_ctx *ctx);
+int filename_setxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx);
+int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx);
+int import_xattr_name(struct xattr_name *kname, const char __user *name);
-ssize_t do_getxattr(struct mnt_idmap *idmap,
- struct dentry *d,
- struct xattr_ctx *ctx);
-
-int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
-int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct xattr_ctx *ctx);
int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode);
#ifdef CONFIG_FS_POSIX_ACL
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 6e0c954..638a36b 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,11 +231,11 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff)
{
- struct fd src_file = fdget(srcfd);
+ CLASS(fd, src_file)(srcfd);
loff_t cloned;
int ret;
- if (!fd_file(src_file))
+ if (fd_empty(src_file))
return -EBADF;
cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff,
olen, 0);
@@ -245,7 +245,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
ret = -EINVAL;
else
ret = 0;
- fdput(src_file);
return ret;
}
@@ -892,22 +891,20 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int error;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = security_file_ioctl(fd_file(f), cmd, arg);
if (error)
- goto out;
+ return error;
error = do_vfs_ioctl(fd_file(f), fd, cmd, arg);
if (error == -ENOIOCTLCMD)
error = vfs_ioctl(fd_file(f), cmd, arg);
-out:
- fdput(f);
return error;
}
@@ -950,15 +947,15 @@ EXPORT_SYMBOL(compat_ptr_ioctl);
COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
compat_ulong_t, arg)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int error;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = security_file_ioctl_compat(fd_file(f), cmd, arg);
if (error)
- goto out;
+ return error;
switch (cmd) {
/* FICLONE takes an int argument, so don't use compat_ptr() */
@@ -1009,10 +1006,6 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
error = -ENOTTY;
break;
}
-
- out:
- fdput(f);
-
return error;
}
#endif
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index ef0b68b..ce73d2a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1784,7 +1784,7 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
if (ifs)
atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len;
- wbc_account_cgroup_owner(wbc, &folio->page, len);
+ wbc_account_cgroup_owner(wbc, folio, len);
return 0;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f637aa0..b521eb1 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
- const struct iomap *iomap, bool use_fua)
+ const struct iomap *iomap, bool use_fua, bool atomic)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ if (atomic)
+ opflags |= REQ_ATOMIC;
return opflags;
}
@@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
- loff_t length = iomap_length(iter);
+ const loff_t length = iomap_length(iter);
+ bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
@@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
+ if (atomic && length != fs_block_size)
+ return -EINVAL;
+
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
@@ -377,12 +383,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- /*
- * Set the operation flags early so that bio_iov_iter_get_pages
- * can set up the page vector appropriately for a ZONE_APPEND
- * operation.
- */
- bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
@@ -415,6 +416,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
+ if (WARN_ON_ONCE(atomic && n != length)) {
+ /*
+ * This bio should have covered the complete length,
+ * which it doesn't, so error. We may need to zero out
+ * the tail (complete FS block), similar to when
+ * bio_iov_iter_get_pages() returns an error, above.
+ */
+ ret = -EINVAL;
+ bio_put(bio);
+ goto zero_tail;
+ }
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
} else {
@@ -598,6 +610,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ iomi.flags |= IOMAP_ATOMIC;
+
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -659,7 +674,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret != -EAGAIN) {
trace_iomap_dio_invalidate_fail(inode, iomi.pos,
iomi.len);
- ret = -ENOTBLK;
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ /*
+ * folio invalidation failed, maybe
+ * this is transient, unlock and see if
+ * the caller tries again.
+ */
+ ret = -EAGAIN;
+ } else {
+ /* fall back to buffered write */
+ ret = -ENOTBLK;
+ }
}
goto out_free_dio;
}
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 0a991c4..4118a42 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_REPORT, "REPORT" }, \
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
- { IOMAP_NOWAIT, "NOWAIT" }
+ { IOMAP_NOWAIT, "NOWAIT" }, \
+ { IOMAP_ATOMIC, "ATOMIC" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4305a1a..9153ff3 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -662,10 +662,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
JBUFFER_TRACE(jh, "ph3: write metadata");
escape = jbd2_journal_write_metadata_buffer(commit_transaction,
jh, &wbuf[bufs], blocknr);
- if (escape < 0) {
- jbd2_journal_abort(journal, escape);
- continue;
- }
jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
/* Record the new block's tag in the current descriptor
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 97f487c..7e49d91 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -318,7 +318,6 @@ static inline void jbd2_data_do_escape(char *data)
*
*
* Return value:
- * <0: Error
* =0: Finished OK without escape
* =1: Finished OK with escape
*/
@@ -386,12 +385,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
goto escape_done;
spin_unlock(&jh_in->b_state_lock);
- tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
- if (!tmp) {
- brelse(new_bh);
- free_buffer_head(new_bh);
- return -ENOMEM;
- }
+ tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
@@ -1518,9 +1512,10 @@ static int journal_load_superblock(journal_t *journal)
* destroy journal_t structures, and to initialise and read existing
* journal blocks from disk. */
-/* First: create and setup a journal_t object in memory. We initialise
- * very few fields yet: that has to wait until we have created the
- * journal structures from from scratch, or loaded them from disk. */
+/* The journal_init_common() function creates and fills a journal_t object
+ * in memory. It calls journal_load_superblock() to load the on-disk journal
+ * superblock and initialize the journal_t object.
+ */
static journal_t *journal_init_common(struct block_device *bdev,
struct block_device *fs_dev,
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 667f673..9192be7 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -485,6 +485,104 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
return tag->t_checksum == cpu_to_be16(csum32);
}
+static __always_inline int jbd2_do_replay(journal_t *journal,
+ struct recovery_info *info,
+ struct buffer_head *bh,
+ unsigned long *next_log_block,
+ unsigned int next_commit_ID)
+{
+ char *tagp;
+ int flags;
+ int ret = 0;
+ int tag_bytes = journal_tag_bytes(journal);
+ int descr_csum_size = 0;
+ unsigned long io_block;
+ journal_block_tag_t tag;
+ struct buffer_head *obh;
+ struct buffer_head *nbh;
+
+ if (jbd2_journal_has_csum_v2or3(journal))
+ descr_csum_size = sizeof(struct jbd2_journal_block_tail);
+
+ tagp = &bh->b_data[sizeof(journal_header_t)];
+ while (tagp - bh->b_data + tag_bytes <=
+ journal->j_blocksize - descr_csum_size) {
+ int err;
+
+ memcpy(&tag, tagp, sizeof(tag));
+ flags = be16_to_cpu(tag.t_flags);
+
+ io_block = (*next_log_block)++;
+ wrap(journal, *next_log_block);
+ err = jread(&obh, journal, io_block);
+ if (err) {
+ /* Recover what we can, but report failure at the end. */
+ ret = err;
+ pr_err("JBD2: IO error %d recovering block %lu in log\n",
+ err, io_block);
+ } else {
+ unsigned long long blocknr;
+
+ J_ASSERT(obh != NULL);
+ blocknr = read_tag_block(journal, &tag);
+
+ /* If the block has been revoked, then we're all done here. */
+ if (jbd2_journal_test_revoke(journal, blocknr,
+ next_commit_ID)) {
+ brelse(obh);
+ ++info->nr_revoke_hits;
+ goto skip_write;
+ }
+
+ /* Look for block corruption */
+ if (!jbd2_block_tag_csum_verify(journal, &tag,
+ (journal_block_tag3_t *)tagp,
+ obh->b_data, next_commit_ID)) {
+ brelse(obh);
+ ret = -EFSBADCRC;
+ pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n",
+ blocknr, io_block);
+ goto skip_write;
+ }
+
+ /* Find a buffer for the new data being restored */
+ nbh = __getblk(journal->j_fs_dev, blocknr,
+ journal->j_blocksize);
+ if (nbh == NULL) {
+ pr_err("JBD2: Out of memory during recovery.\n");
+ brelse(obh);
+ return -ENOMEM;
+ }
+
+ lock_buffer(nbh);
+ memcpy(nbh->b_data, obh->b_data, journal->j_blocksize);
+ if (flags & JBD2_FLAG_ESCAPE) {
+ *((__be32 *)nbh->b_data) =
+ cpu_to_be32(JBD2_MAGIC_NUMBER);
+ }
+
+ BUFFER_TRACE(nbh, "marking dirty");
+ set_buffer_uptodate(nbh);
+ mark_buffer_dirty(nbh);
+ BUFFER_TRACE(nbh, "marking uptodate");
+ ++info->nr_replays;
+ unlock_buffer(nbh);
+ brelse(obh);
+ brelse(nbh);
+ }
+
+skip_write:
+ tagp += tag_bytes;
+ if (!(flags & JBD2_FLAG_SAME_UUID))
+ tagp += 16;
+
+ if (flags & JBD2_FLAG_LAST_TAG)
+ break;
+ }
+
+ return ret;
+}
+
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
@@ -493,13 +591,10 @@ static int do_one_pass(journal_t *journal,
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
- struct buffer_head * bh;
+ struct buffer_head *bh = NULL;
unsigned int sequence;
int blocktype;
- int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
- int descr_csum_size = 0;
- int block_error = 0;
bool need_check_commit_time = false;
__u64 last_trans_commit_time = 0, commit_time;
@@ -528,12 +623,6 @@ static int do_one_pass(journal_t *journal,
*/
while (1) {
- int flags;
- char * tagp;
- journal_block_tag_t tag;
- struct buffer_head * obh;
- struct buffer_head * nbh;
-
cond_resched();
/* If we already know where to stop the log traversal,
@@ -552,6 +641,8 @@ static int do_one_pass(journal_t *journal,
* record. */
jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
+ brelse(bh);
+ bh = NULL;
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -567,20 +658,16 @@ static int do_one_pass(journal_t *journal,
tmp = (journal_header_t *)bh->b_data;
- if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
- brelse(bh);
+ if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER))
break;
- }
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
jbd2_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
- if (sequence != next_commit_ID) {
- brelse(bh);
+ if (sequence != next_commit_ID)
break;
- }
/* OK, we have a valid descriptor block which matches
* all of the sequence number checks. What are we going
@@ -589,11 +676,7 @@ static int do_one_pass(journal_t *journal,
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
/* Verify checksum first */
- if (jbd2_journal_has_csum_v2or3(journal))
- descr_csum_size =
- sizeof(struct jbd2_journal_block_tail);
- if (descr_csum_size > 0 &&
- !jbd2_descriptor_block_csum_verify(journal,
+ if (!jbd2_descriptor_block_csum_verify(journal,
bh->b_data)) {
/*
* PASS_SCAN can see stale blocks due to lazy
@@ -603,7 +686,6 @@ static int do_one_pass(journal_t *journal,
pr_err("JBD2: Invalid checksum recovering block %lu in log\n",
next_log_block);
err = -EFSBADCRC;
- brelse(bh);
goto failed;
}
need_check_commit_time = true;
@@ -619,125 +701,39 @@ static int do_one_pass(journal_t *journal,
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
jbd2_has_feature_checksum(journal) &&
- !need_check_commit_time &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
- &crc32_sum)) {
- put_bh(bh);
+ &crc32_sum))
break;
- }
- put_bh(bh);
continue;
}
next_log_block += count_tags(journal, bh);
wrap(journal, next_log_block);
- put_bh(bh);
continue;
}
- /* A descriptor block: we can now write all of
- * the data blocks. Yay, useful work is finally
- * getting done here! */
-
- tagp = &bh->b_data[sizeof(journal_header_t)];
- while ((tagp - bh->b_data + tag_bytes)
- <= journal->j_blocksize - descr_csum_size) {
- unsigned long io_block;
-
- memcpy(&tag, tagp, sizeof(tag));
- flags = be16_to_cpu(tag.t_flags);
-
- io_block = next_log_block++;
- wrap(journal, next_log_block);
- err = jread(&obh, journal, io_block);
- if (err) {
- /* Recover what we can, but
- * report failure at the end. */
- success = err;
- printk(KERN_ERR
- "JBD2: IO error %d recovering "
- "block %lu in log\n",
- err, io_block);
- } else {
- unsigned long long blocknr;
-
- J_ASSERT(obh != NULL);
- blocknr = read_tag_block(journal,
- &tag);
-
- /* If the block has been
- * revoked, then we're all done
- * here. */
- if (jbd2_journal_test_revoke
- (journal, blocknr,
- next_commit_ID)) {
- brelse(obh);
- ++info->nr_revoke_hits;
- goto skip_write;
- }
-
- /* Look for block corruption */
- if (!jbd2_block_tag_csum_verify(
- journal, &tag, (journal_block_tag3_t *)tagp,
- obh->b_data, be32_to_cpu(tmp->h_sequence))) {
- brelse(obh);
- success = -EFSBADCRC;
- printk(KERN_ERR "JBD2: Invalid "
- "checksum recovering "
- "data block %llu in "
- "journal block %lu\n",
- blocknr, io_block);
- block_error = 1;
- goto skip_write;
- }
-
- /* Find a buffer for the new
- * data being restored */
- nbh = __getblk(journal->j_fs_dev,
- blocknr,
- journal->j_blocksize);
- if (nbh == NULL) {
- printk(KERN_ERR
- "JBD2: Out of memory "
- "during recovery.\n");
- err = -ENOMEM;
- brelse(bh);
- brelse(obh);
- goto failed;
- }
-
- lock_buffer(nbh);
- memcpy(nbh->b_data, obh->b_data,
- journal->j_blocksize);
- if (flags & JBD2_FLAG_ESCAPE) {
- *((__be32 *)nbh->b_data) =
- cpu_to_be32(JBD2_MAGIC_NUMBER);
- }
-
- BUFFER_TRACE(nbh, "marking dirty");
- set_buffer_uptodate(nbh);
- mark_buffer_dirty(nbh);
- BUFFER_TRACE(nbh, "marking uptodate");
- ++info->nr_replays;
- unlock_buffer(nbh);
- brelse(obh);
- brelse(nbh);
- }
-
- skip_write:
- tagp += tag_bytes;
- if (!(flags & JBD2_FLAG_SAME_UUID))
- tagp += 16;
-
- if (flags & JBD2_FLAG_LAST_TAG)
- break;
+ /*
+ * A descriptor block: we can now write all of the
+ * data blocks. Yay, useful work is finally getting
+ * done here!
+ */
+ err = jbd2_do_replay(journal, info, bh, &next_log_block,
+ next_commit_ID);
+ if (err) {
+ if (err == -ENOMEM)
+ goto failed;
+ success = err;
}
- brelse(bh);
continue;
case JBD2_COMMIT_BLOCK:
+ if (pass != PASS_SCAN) {
+ next_commit_ID++;
+ continue;
+ }
+
/* How to differentiate between interrupted commit
* and journal corruption ?
*
@@ -782,7 +778,6 @@ static int do_one_pass(journal_t *journal,
pr_err("JBD2: Invalid checksum found in transaction %u\n",
next_commit_ID);
err = -EFSBADCRC;
- brelse(bh);
goto failed;
}
ignore_crc_mismatch:
@@ -792,7 +787,6 @@ static int do_one_pass(journal_t *journal,
*/
jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
next_commit_ID);
- brelse(bh);
goto done;
}
@@ -802,8 +796,7 @@ static int do_one_pass(journal_t *journal,
* much to do other than move on to the next sequence
* number.
*/
- if (pass == PASS_SCAN &&
- jbd2_has_feature_checksum(journal)) {
+ if (jbd2_has_feature_checksum(journal)) {
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
unsigned found_chksum =
@@ -812,7 +805,6 @@ static int do_one_pass(journal_t *journal,
if (info->end_transaction) {
journal->j_failed_commit =
info->end_transaction;
- brelse(bh);
break;
}
@@ -828,36 +820,33 @@ static int do_one_pass(journal_t *journal,
goto chksum_error;
crc32_sum = ~0;
+ goto chksum_ok;
}
- if (pass == PASS_SCAN &&
- !jbd2_commit_block_csum_verify(journal,
- bh->b_data)) {
- if (jbd2_commit_block_csum_verify_partial(
- journal,
- bh->b_data)) {
- pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
- next_commit_ID, next_log_block);
- goto chksum_ok;
- }
- chksum_error:
- if (commit_time < last_trans_commit_time)
- goto ignore_crc_mismatch;
- info->end_transaction = next_commit_ID;
- info->head_block = head_block;
- if (!jbd2_has_feature_async_commit(journal)) {
- journal->j_failed_commit =
- next_commit_ID;
- brelse(bh);
- break;
- }
+ if (jbd2_commit_block_csum_verify(journal, bh->b_data))
+ goto chksum_ok;
+
+ if (jbd2_commit_block_csum_verify_partial(journal,
+ bh->b_data)) {
+ pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
+ next_commit_ID, next_log_block);
+ goto chksum_ok;
}
- if (pass == PASS_SCAN) {
- chksum_ok:
- last_trans_commit_time = commit_time;
- head_block = next_log_block;
+
+chksum_error:
+ if (commit_time < last_trans_commit_time)
+ goto ignore_crc_mismatch;
+ info->end_transaction = next_commit_ID;
+ info->head_block = head_block;
+
+ if (!jbd2_has_feature_async_commit(journal)) {
+ journal->j_failed_commit = next_commit_ID;
+ break;
}
- brelse(bh);
+
+chksum_ok:
+ last_trans_commit_time = commit_time;
+ head_block = next_log_block;
next_commit_ID++;
continue;
@@ -876,14 +865,11 @@ static int do_one_pass(journal_t *journal,
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
- if (pass != PASS_REVOKE) {
- brelse(bh);
+ if (pass != PASS_REVOKE)
continue;
- }
err = scan_revoke_records(journal, bh,
next_commit_ID, info);
- brelse(bh);
if (err)
goto failed;
continue;
@@ -891,12 +877,12 @@ static int do_one_pass(journal_t *journal,
default:
jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
- brelse(bh);
goto done;
}
}
done:
+ brelse(bh);
/*
* We broke out of the log scan loop: either we came to the
* known end of the log or we found an unexpected block in the
@@ -927,11 +913,10 @@ static int do_one_pass(journal_t *journal,
success = err;
}
- if (block_error && success == 0)
- success = -EIO;
return success;
failed:
+ brelse(bh);
return err;
}
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 33ef13a0..8794281 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -24,6 +24,7 @@
#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */
#define JFS_ERR_CONTINUE 0x00000004 /* continue */
#define JFS_ERR_PANIC 0x00000008 /* panic */
+#define JFS_ERR_MASK (JFS_ERR_REMOUNT_RO|JFS_ERR_CONTINUE|JFS_ERR_PANIC)
/* Quota support */
#define JFS_USRQUOTA 0x00000010
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index e1be21c..223d9ac5 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -6,11 +6,11 @@
#include <linux/fs.h>
#include <linux/module.h>
-#include <linux/parser.h>
#include <linux/completion.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
-#include <linux/mount.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/posix_acl.h>
@@ -210,240 +210,195 @@ enum {
Opt_discard, Opt_nodiscard, Opt_discard_minblk
};
-static const match_table_t tokens = {
- {Opt_integrity, "integrity"},
- {Opt_nointegrity, "nointegrity"},
- {Opt_iocharset, "iocharset=%s"},
- {Opt_resize, "resize=%u"},
- {Opt_resize_nosize, "resize"},
- {Opt_errors, "errors=%s"},
- {Opt_ignore, "noquota"},
- {Opt_quota, "quota"},
- {Opt_usrquota, "usrquota"},
- {Opt_grpquota, "grpquota"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_umask, "umask=%u"},
- {Opt_discard, "discard"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_discard_minblk, "discard=%u"},
- {Opt_err, NULL}
+static const struct constant_table jfs_param_errors[] = {
+ {"continue", JFS_ERR_CONTINUE},
+ {"remount-ro", JFS_ERR_REMOUNT_RO},
+ {"panic", JFS_ERR_PANIC},
+ {}
};
-static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
- int *flag)
+static const struct fs_parameter_spec jfs_param_spec[] = {
+ fsparam_flag_no ("integrity", Opt_integrity),
+ fsparam_string ("iocharset", Opt_iocharset),
+ fsparam_u64 ("resize", Opt_resize),
+ fsparam_flag ("resize", Opt_resize_nosize),
+ fsparam_enum ("errors", Opt_errors, jfs_param_errors),
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("noquota", Opt_ignore),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("umask", Opt_umask),
+ fsparam_flag ("discard", Opt_discard),
+ fsparam_u32 ("discard", Opt_discard_minblk),
+ fsparam_flag ("nodiscard", Opt_nodiscard),
+ {}
+};
+
+struct jfs_context {
+ int flag;
+ kuid_t uid;
+ kgid_t gid;
+ uint umask;
+ uint minblks_trim;
+ void *nls_map;
+ bool resize;
+ s64 newLVSize;
+};
+
+static int jfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- void *nls_map = (void *)-1; /* -1: no change; NULL: none */
- char *p;
- struct jfs_sb_info *sbi = JFS_SBI(sb);
+ struct jfs_context *ctx = fc->fs_private;
+ int reconfigure = (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE);
+ struct fs_parse_result result;
+ struct nls_table *nls_map;
+ int opt;
- *newLVSize = 0;
+ opt = fs_parse(fc, jfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_integrity:
- *flag &= ~JFS_NOINTEGRITY;
- break;
- case Opt_nointegrity:
- *flag |= JFS_NOINTEGRITY;
- break;
- case Opt_ignore:
- /* Silently ignore the quota options */
- /* Don't do anything ;-) */
- break;
- case Opt_iocharset:
- if (nls_map && nls_map != (void *) -1)
- unload_nls(nls_map);
- if (!strcmp(args[0].from, "none"))
- nls_map = NULL;
- else {
- nls_map = load_nls(args[0].from);
- if (!nls_map) {
- pr_err("JFS: charset not found\n");
- goto cleanup;
- }
+ switch (opt) {
+ case Opt_integrity:
+ if (result.negated)
+ ctx->flag |= JFS_NOINTEGRITY;
+ else
+ ctx->flag &= ~JFS_NOINTEGRITY;
+ break;
+ case Opt_ignore:
+ /* Silently ignore the quota options */
+ /* Don't do anything ;-) */
+ break;
+ case Opt_iocharset:
+ if (ctx->nls_map && ctx->nls_map != (void *) -1) {
+ unload_nls(ctx->nls_map);
+ ctx->nls_map = NULL;
+ }
+ if (!strcmp(param->string, "none"))
+ ctx->nls_map = NULL;
+ else {
+ nls_map = load_nls(param->string);
+ if (!nls_map) {
+ pr_err("JFS: charset not found\n");
+ return -EINVAL;
}
- break;
- case Opt_resize:
- {
- char *resize = args[0].from;
- int rc = kstrtoll(resize, 0, newLVSize);
-
- if (rc)
- goto cleanup;
- break;
+ ctx->nls_map = nls_map;
}
- case Opt_resize_nosize:
- {
- *newLVSize = sb_bdev_nr_blocks(sb);
- if (*newLVSize == 0)
- pr_err("JFS: Cannot determine volume size\n");
- break;
- }
- case Opt_errors:
- {
- char *errors = args[0].from;
- if (!errors || !*errors)
- goto cleanup;
- if (!strcmp(errors, "continue")) {
- *flag &= ~JFS_ERR_REMOUNT_RO;
- *flag &= ~JFS_ERR_PANIC;
- *flag |= JFS_ERR_CONTINUE;
- } else if (!strcmp(errors, "remount-ro")) {
- *flag &= ~JFS_ERR_CONTINUE;
- *flag &= ~JFS_ERR_PANIC;
- *flag |= JFS_ERR_REMOUNT_RO;
- } else if (!strcmp(errors, "panic")) {
- *flag &= ~JFS_ERR_CONTINUE;
- *flag &= ~JFS_ERR_REMOUNT_RO;
- *flag |= JFS_ERR_PANIC;
- } else {
- pr_err("JFS: %s is an invalid error handler\n",
- errors);
- goto cleanup;
- }
- break;
- }
+ break;
+ case Opt_resize:
+ if (!reconfigure)
+ return -EINVAL;
+ ctx->resize = true;
+ ctx->newLVSize = result.uint_64;
+ break;
+ case Opt_resize_nosize:
+ if (!reconfigure)
+ return -EINVAL;
+ ctx->resize = true;
+ break;
+ case Opt_errors:
+ ctx->flag &= ~JFS_ERR_MASK;
+ ctx->flag |= result.uint_32;
+ break;
#ifdef CONFIG_QUOTA
- case Opt_quota:
- case Opt_usrquota:
- *flag |= JFS_USRQUOTA;
- break;
- case Opt_grpquota:
- *flag |= JFS_GRPQUOTA;
- break;
+ case Opt_quota:
+ case Opt_usrquota:
+ ctx->flag |= JFS_USRQUOTA;
+ break;
+ case Opt_grpquota:
+ ctx->flag |= JFS_GRPQUOTA;
+ break;
#else
- case Opt_usrquota:
- case Opt_grpquota:
- case Opt_quota:
- pr_err("JFS: quota operations not supported\n");
- break;
+ case Opt_usrquota:
+ case Opt_grpquota:
+ case Opt_quota:
+ pr_err("JFS: quota operations not supported\n");
+ break;
#endif
- case Opt_uid:
- {
- char *uid = args[0].from;
- uid_t val;
- int rc = kstrtouint(uid, 0, &val);
+ case Opt_uid:
+ ctx->uid = result.uid;
+ break;
- if (rc)
- goto cleanup;
- sbi->uid = make_kuid(current_user_ns(), val);
- if (!uid_valid(sbi->uid))
- goto cleanup;
- break;
+ case Opt_gid:
+ ctx->gid = result.gid;
+ break;
+
+ case Opt_umask:
+ if (result.uint_32 & ~0777) {
+ pr_err("JFS: Invalid value of umask\n");
+ return -EINVAL;
}
+ ctx->umask = result.uint_32;
+ break;
- case Opt_gid:
- {
- char *gid = args[0].from;
- gid_t val;
- int rc = kstrtouint(gid, 0, &val);
+ case Opt_discard:
+ /* if set to 1, even copying files will cause
+ * trimming :O
+ * -> user has more control over the online trimming
+ */
+ ctx->minblks_trim = 64;
+ ctx->flag |= JFS_DISCARD;
+ break;
- if (rc)
- goto cleanup;
- sbi->gid = make_kgid(current_user_ns(), val);
- if (!gid_valid(sbi->gid))
- goto cleanup;
- break;
- }
+ case Opt_nodiscard:
+ ctx->flag &= ~JFS_DISCARD;
+ break;
- case Opt_umask:
- {
- char *umask = args[0].from;
- int rc = kstrtouint(umask, 8, &sbi->umask);
+ case Opt_discard_minblk:
+ ctx->minblks_trim = result.uint_32;
+ ctx->flag |= JFS_DISCARD;
+ break;
- if (rc)
- goto cleanup;
- if (sbi->umask & ~0777) {
- pr_err("JFS: Invalid value of umask\n");
- goto cleanup;
- }
- break;
- }
-
- case Opt_discard:
- /* if set to 1, even copying files will cause
- * trimming :O
- * -> user has more control over the online trimming
- */
- sbi->minblks_trim = 64;
- if (bdev_max_discard_sectors(sb->s_bdev))
- *flag |= JFS_DISCARD;
- else
- pr_err("JFS: discard option not supported on device\n");
- break;
-
- case Opt_nodiscard:
- *flag &= ~JFS_DISCARD;
- break;
-
- case Opt_discard_minblk:
- {
- char *minblks_trim = args[0].from;
- int rc;
- if (bdev_max_discard_sectors(sb->s_bdev)) {
- *flag |= JFS_DISCARD;
- rc = kstrtouint(minblks_trim, 0,
- &sbi->minblks_trim);
- if (rc)
- goto cleanup;
- } else
- pr_err("JFS: discard option not supported on device\n");
- break;
- }
-
- default:
- printk("jfs: Unrecognized mount option \"%s\" or missing value\n",
- p);
- goto cleanup;
- }
+ default:
+ return -EINVAL;
}
- if (nls_map != (void *) -1) {
- /* Discard old (if remount) */
- unload_nls(sbi->nls_tab);
- sbi->nls_tab = nls_map;
- }
- return 1;
-
-cleanup:
- if (nls_map && nls_map != (void *) -1)
- unload_nls(nls_map);
return 0;
}
-static int jfs_remount(struct super_block *sb, int *flags, char *data)
+static int jfs_reconfigure(struct fs_context *fc)
{
- s64 newLVSize = 0;
+ struct jfs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
+ int readonly = fc->sb_flags & SB_RDONLY;
int rc = 0;
- int flag = JFS_SBI(sb)->flag;
+ int flag = ctx->flag;
int ret;
sync_filesystem(sb);
- if (!parse_options(data, sb, &newLVSize, &flag))
- return -EINVAL;
- if (newLVSize) {
+ /* Transfer results of parsing to the sbi */
+ JFS_SBI(sb)->flag = ctx->flag;
+ JFS_SBI(sb)->uid = ctx->uid;
+ JFS_SBI(sb)->gid = ctx->gid;
+ JFS_SBI(sb)->umask = ctx->umask;
+ JFS_SBI(sb)->minblks_trim = ctx->minblks_trim;
+ if (ctx->nls_map != (void *) -1) {
+ unload_nls(JFS_SBI(sb)->nls_tab);
+ JFS_SBI(sb)->nls_tab = ctx->nls_map;
+ }
+ ctx->nls_map = NULL;
+
+ if (ctx->resize) {
if (sb_rdonly(sb)) {
pr_err("JFS: resize requires volume to be mounted read-write\n");
return -EROFS;
}
- rc = jfs_extendfs(sb, newLVSize, 0);
+
+ if (!ctx->newLVSize) {
+ ctx->newLVSize = sb_bdev_nr_blocks(sb);
+ if (ctx->newLVSize == 0)
+ pr_err("JFS: Cannot determine volume size\n");
+ }
+
+ rc = jfs_extendfs(sb, ctx->newLVSize, 0);
if (rc)
return rc;
}
- if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
+ if (sb_rdonly(sb) && !readonly) {
/*
* Invalidate any previously read metadata. fsck may have
* changed the on-disk data since we mounted r/o
@@ -459,7 +414,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
dquot_resume(sb, -1);
return ret;
}
- if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
+ if (!sb_rdonly(sb) && readonly) {
rc = dquot_suspend(sb, -1);
if (rc < 0)
return rc;
@@ -467,7 +422,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
JFS_SBI(sb)->flag = flag;
return rc;
}
- if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
+ if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) {
if (!sb_rdonly(sb)) {
rc = jfs_umount_rw(sb);
if (rc)
@@ -477,18 +432,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
ret = jfs_mount_rw(sb, 1);
return ret;
}
+ }
JFS_SBI(sb)->flag = flag;
return 0;
}
-static int jfs_fill_super(struct super_block *sb, void *data, int silent)
+static int jfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct jfs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
struct jfs_sb_info *sbi;
struct inode *inode;
int rc;
- s64 newLVSize = 0;
- int flag, ret = -EINVAL;
+ int ret = -EINVAL;
jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
@@ -501,24 +458,34 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_min = 0;
sb->s_time_max = U32_MAX;
sbi->sb = sb;
- sbi->uid = INVALID_UID;
- sbi->gid = INVALID_GID;
- sbi->umask = -1;
- /* initialize the mount flag and determine the default error handler */
- flag = JFS_ERR_REMOUNT_RO;
+ /* Transfer results of parsing to the sbi */
+ sbi->flag = ctx->flag;
+ sbi->uid = ctx->uid;
+ sbi->gid = ctx->gid;
+ sbi->umask = ctx->umask;
+ if (ctx->nls_map != (void *) -1) {
+ unload_nls(sbi->nls_tab);
+ sbi->nls_tab = ctx->nls_map;
+ }
+ ctx->nls_map = NULL;
- if (!parse_options((char *) data, sb, &newLVSize, &flag))
- goto out_kfree;
- sbi->flag = flag;
+ if (sbi->flag & JFS_DISCARD) {
+ if (!bdev_max_discard_sectors(sb->s_bdev)) {
+ pr_err("JFS: discard option not supported on device\n");
+ sbi->flag &= ~JFS_DISCARD;
+ } else {
+ sbi->minblks_trim = ctx->minblks_trim;
+ }
+ }
#ifdef CONFIG_JFS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
#endif
- if (newLVSize) {
+ if (ctx->resize) {
pr_err("resize option for remount only\n");
- goto out_kfree;
+ goto out_unload;
}
/*
@@ -608,7 +575,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
sbi->direct_inode = NULL;
out_unload:
unload_nls(sbi->nls_tab);
-out_kfree:
kfree(sbi);
return ret;
}
@@ -664,10 +630,9 @@ static int jfs_unfreeze(struct super_block *sb)
return rc;
}
-static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int jfs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
+ return get_tree_bdev(fc, jfs_fill_super);
}
static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -886,7 +851,6 @@ static const struct super_operations jfs_super_operations = {
.freeze_fs = jfs_freeze,
.unfreeze_fs = jfs_unfreeze,
.statfs = jfs_statfs,
- .remount_fs = jfs_remount,
.show_options = jfs_show_options,
#ifdef CONFIG_QUOTA
.quota_read = jfs_quota_read,
@@ -902,12 +866,71 @@ static const struct export_operations jfs_export_operations = {
.get_parent = jfs_get_parent,
};
+static void jfs_init_options(struct fs_context *fc, struct jfs_context *ctx)
+{
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+
+ /* Copy over current option values and mount flags */
+ ctx->uid = JFS_SBI(sb)->uid;
+ ctx->gid = JFS_SBI(sb)->gid;
+ ctx->umask = JFS_SBI(sb)->umask;
+ ctx->nls_map = (void *)-1;
+ ctx->minblks_trim = JFS_SBI(sb)->minblks_trim;
+ ctx->flag = JFS_SBI(sb)->flag;
+
+ } else {
+ /*
+ * Initialize the mount flag and determine the default
+ * error handler
+ */
+ ctx->flag = JFS_ERR_REMOUNT_RO;
+ ctx->uid = INVALID_UID;
+ ctx->gid = INVALID_GID;
+ ctx->umask = -1;
+ ctx->nls_map = (void *)-1;
+ }
+}
+
+static void jfs_free_fc(struct fs_context *fc)
+{
+ struct jfs_context *ctx = fc->fs_private;
+
+ if (ctx->nls_map != (void *) -1)
+ unload_nls(ctx->nls_map);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations jfs_context_ops = {
+ .parse_param = jfs_parse_param,
+ .get_tree = jfs_get_tree,
+ .reconfigure = jfs_reconfigure,
+ .free = jfs_free_fc,
+};
+
+static int jfs_init_fs_context(struct fs_context *fc)
+{
+ struct jfs_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ jfs_init_options(fc, ctx);
+
+ fc->fs_private = ctx;
+ fc->ops = &jfs_context_ops;
+
+ return 0;
+}
+
static struct file_system_type jfs_fs_type = {
.owner = THIS_MODULE,
.name = "jfs",
- .mount = jfs_do_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = jfs_init_fs_context,
+ .parameters = jfs_param_spec,
};
MODULE_ALIAS_FS("jfs");
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 9ff37ae6..de32c95 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -175,15 +175,11 @@ ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
size_t buf_size, size_t *file_size,
enum kernel_read_file_id id)
{
- struct fd f = fdget(fd);
- ssize_t ret = -EBADF;
+ CLASS(fd, f)(fd);
- if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ))
- goto out;
+ if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ return -EBADF;
- ret = kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id);
-out:
- fdput(f);
- return ret;
+ return kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id);
}
EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
diff --git a/fs/libfs.c b/fs/libfs.c
index 46966fd..748ac59 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -77,6 +77,10 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
return ERR_PTR(-ENAMETOOLONG);
if (!dentry->d_sb->s_d_op)
d_set_d_op(dentry, &simple_dentry_operations);
+
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ return NULL;
+
d_add(dentry, NULL);
return NULL;
}
@@ -1711,15 +1715,6 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(-ENOENT);
}
-static int empty_dir_getattr(struct mnt_idmap *idmap,
- const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
-{
- struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- return 0;
-}
-
static int empty_dir_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *attr)
{
@@ -1733,9 +1728,7 @@ static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t siz
static const struct inode_operations empty_dir_inode_operations = {
.lookup = empty_dir_lookup,
- .permission = generic_permission,
.setattr = empty_dir_setattr,
- .getattr = empty_dir_getattr,
.listxattr = empty_dir_listxattr,
};
@@ -1791,8 +1784,8 @@ bool is_empty_dir_inode(struct inode *inode)
*
* Return: 0 if names match, 1 if mismatch, or -ERRNO
*/
-static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name)
+int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name)
{
const struct dentry *parent;
const struct inode *dir;
@@ -1835,6 +1828,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
}
+EXPORT_SYMBOL(generic_ci_d_compare);
/**
* generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
@@ -1843,7 +1837,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
*
* Return: 0 if hash was successful or unchanged, and -EINVAL on error
*/
-static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct inode *dir = READ_ONCE(dentry->d_inode);
struct super_block *sb = dentry->d_sb;
@@ -1858,6 +1852,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
return -EINVAL;
return 0;
}
+EXPORT_SYMBOL(generic_ci_d_hash);
static const struct dentry_operations generic_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 1f2149d..2359347 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -30,7 +30,6 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/nlm.h>
#include <linux/lockd/lockd.h>
-#include <linux/exportfs.h>
#define NLMDBG_FACILITY NLMDBG_SVCLOCK
@@ -481,7 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
struct nlm_cookie *cookie, int reclaim)
{
- struct inode *inode = nlmsvc_file_inode(file);
+ struct inode *inode __maybe_unused = nlmsvc_file_inode(file);
struct nlm_block *block = NULL;
int error;
int mode;
@@ -496,7 +495,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end,
wait);
- if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) {
+ if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) {
async_block = wait;
wait = 0;
}
@@ -550,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
* requests on the underlaying ->lock() implementation but
* only one nlm_block to being granted by lm_grant().
*/
- if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) &&
+ if (locks_can_async_lock(nlmsvc_file_file(file)->f_op) &&
!list_empty(&block->b_list)) {
spin_unlock(&nlm_blocked_lock);
ret = nlm_lck_blocked;
diff --git a/fs/locks.c b/fs/locks.c
index 2048476..25afc8d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2136,7 +2136,6 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
int can_sleep, error, type;
struct file_lock fl;
- struct fd f;
/*
* LOCK_MAND locks were broken for a long time in that they never
@@ -2155,19 +2154,18 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
if (type < 0)
return type;
- error = -EBADF;
- f = fdget(fd);
- if (!fd_file(f))
- return error;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE)))
- goto out_putf;
+ return -EBADF;
flock_make_lock(fd_file(f), &fl, type);
error = security_file_lock(fd_file(f), fl.c.flc_type);
if (error)
- goto out_putf;
+ return error;
can_sleep = !(cmd & LOCK_NB);
if (can_sleep)
@@ -2181,9 +2179,6 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
error = locks_lock_file_wait(fd_file(f), &fl);
locks_release_private(&fl);
- out_putf:
- fdput(f);
-
return error;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index b5b5ddf..82aecf3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -606,7 +606,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
* the confused fail path above (OOM) will be very confused when
* it finds all bh marked clean (i.e. it will not write anything)
*/
- wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
+ wbc_account_cgroup_owner(wbc, folio, folio_size(folio));
length = first_unmapped << blkbits;
if (!bio_add_folio(bio, folio, length, 0)) {
bio = mpage_bio_submit_write(bio);
diff --git a/fs/namei.c b/fs/namei.c
index 4a4a22a..9d30c7a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -211,22 +211,38 @@ getname_flags(const char __user *filename, int flags)
return result;
}
-struct filename *
-getname_uflags(const char __user *filename, int uflags)
+struct filename *getname_uflags(const char __user *filename, int uflags)
{
int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
return getname_flags(filename, flags);
}
-struct filename *
-getname(const char __user * filename)
+struct filename *getname(const char __user * filename)
{
return getname_flags(filename, 0);
}
-struct filename *
-getname_kernel(const char * filename)
+struct filename *__getname_maybe_null(const char __user *pathname)
+{
+ struct filename *name;
+ char c;
+
+ /* try to save on allocations; loss on um, though */
+ if (get_user(c, pathname))
+ return ERR_PTR(-EFAULT);
+ if (!c)
+ return NULL;
+
+ name = getname_flags(pathname, LOOKUP_EMPTY);
+ if (!IS_ERR(name) && !(name->name[0])) {
+ putname(name);
+ name = NULL;
+ }
+ return name;
+}
+
+struct filename *getname_kernel(const char * filename)
{
struct filename *result;
int len = strlen(filename) + 1;
@@ -264,7 +280,7 @@ EXPORT_SYMBOL(getname_kernel);
void putname(struct filename *name)
{
- if (IS_ERR(name))
+ if (IS_ERR_OR_NULL(name))
return;
if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
@@ -326,6 +342,25 @@ static int check_acl(struct mnt_idmap *idmap,
return -EAGAIN;
}
+/*
+ * Very quick optimistic "we know we have no ACL's" check.
+ *
+ * Note that this is purely for ACL_TYPE_ACCESS, and purely
+ * for the "we have cached that there are no ACLs" case.
+ *
+ * If this returns true, we know there are no ACLs. But if
+ * it returns false, we might still not have ACLs (it could
+ * be the is_uncached_acl() case).
+ */
+static inline bool no_acl_inode(struct inode *inode)
+{
+#ifdef CONFIG_FS_POSIX_ACL
+ return likely(!READ_ONCE(inode->i_acl));
+#else
+ return true;
+#endif
+}
+
/**
* acl_permission_check - perform basic UNIX permission checking
* @idmap: idmap of the mount the inode was found from
@@ -348,6 +383,28 @@ static int acl_permission_check(struct mnt_idmap *idmap,
unsigned int mode = inode->i_mode;
vfsuid_t vfsuid;
+ /*
+ * Common cheap case: everybody has the requested
+ * rights, and there are no ACLs to check. No need
+ * to do any owner/group checks in that case.
+ *
+ * - 'mask&7' is the requested permission bit set
+ * - multiplying by 0111 spreads them out to all of ugo
+ * - '& ~mode' looks for missing inode permission bits
+ * - the '!' is for "no missing permissions"
+ *
+ * After that, we just need to check that there are no
+ * ACL's on the inode - do the 'IS_POSIXACL()' check last
+ * because it will dereference the ->i_sb pointer and we
+ * want to avoid that if at all possible.
+ */
+ if (!((mask & 7) * 0111 & ~mode)) {
+ if (no_acl_inode(inode))
+ return 0;
+ if (!IS_POSIXACL(inode))
+ return 0;
+ }
+
/* Are we the owner? If so, ACL's don't matter */
vfsuid = i_uid_into_vfsuid(idmap, inode);
if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
@@ -588,6 +645,7 @@ struct nameidata {
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
+ const char *pathname;
struct nameidata *saved;
unsigned root_seq;
int dfd;
@@ -606,6 +664,7 @@ static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
p->depth = 0;
p->dfd = dfd;
p->name = name;
+ p->pathname = likely(name) ? name->name : "";
p->path.mnt = NULL;
p->path.dentry = NULL;
p->total_link_count = old ? old->total_link_count : 0;
@@ -2439,7 +2498,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
static const char *path_init(struct nameidata *nd, unsigned flags)
{
int error;
- const char *s = nd->name->name;
+ const char *s = nd->pathname;
/* LOOKUP_CACHED requires RCU, ask caller to retry */
if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
@@ -2503,26 +2562,22 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
} else {
/* Caller must check execute permissions on the starting path component */
- struct fd f = fdget_raw(nd->dfd);
+ CLASS(fd_raw, f)(nd->dfd);
struct dentry *dentry;
- if (!fd_file(f))
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
if (flags & LOOKUP_LINKAT_EMPTY) {
if (fd_file(f)->f_cred != current_cred() &&
- !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
- fdput(f);
+ !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH))
return ERR_PTR(-ENOENT);
- }
}
dentry = fd_file(f)->f_path.dentry;
- if (*s && unlikely(!d_can_lookup(dentry))) {
- fdput(f);
+ if (*s && unlikely(!d_can_lookup(dentry)))
return ERR_PTR(-ENOTDIR);
- }
nd->path = fd_file(f)->f_path;
if (flags & LOOKUP_RCU) {
@@ -2532,7 +2587,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
path_get(&nd->path);
nd->inode = nd->path.dentry->d_inode;
}
- fdput(f);
}
/* For scoped-lookups we need to set the root to the dirfd as well. */
diff --git a/fs/namespace.c b/fs/namespace.c
index d26f5e6..6b0a174 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3901,7 +3901,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
}
new_ns->ns.ops = &mntns_operations;
if (!anon)
- new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
+ new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
@@ -4107,7 +4107,6 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
struct file *file;
struct path newmount;
struct mount *mnt;
- struct fd f;
unsigned int mnt_flags = 0;
long ret;
@@ -4135,19 +4134,18 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
return -EINVAL;
}
- f = fdget(fs_fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fs_fd);
+ if (fd_empty(f))
return -EBADF;
- ret = -EINVAL;
if (fd_file(f)->f_op != &fscontext_fops)
- goto err_fsfd;
+ return -EINVAL;
fc = fd_file(f)->private_data;
ret = mutex_lock_interruptible(&fc->uapi_mutex);
if (ret < 0)
- goto err_fsfd;
+ return ret;
/* There must be a valid superblock or we can't mount it */
ret = -EINVAL;
@@ -4214,8 +4212,6 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
path_put(&newmount);
err_unlock:
mutex_unlock(&fc->uapi_mutex);
-err_fsfd:
- fdput(f);
return ret;
}
@@ -4670,10 +4666,8 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
struct mount_kattr *kattr, unsigned int flags)
{
- int err = 0;
struct ns_common *ns;
struct user_namespace *mnt_userns;
- struct fd f;
if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
return 0;
@@ -4689,20 +4683,16 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
if (attr->userns_fd > INT_MAX)
return -EINVAL;
- f = fdget(attr->userns_fd);
- if (!fd_file(f))
+ CLASS(fd, f)(attr->userns_fd);
+ if (fd_empty(f))
return -EBADF;
- if (!proc_ns_file(fd_file(f))) {
- err = -EINVAL;
- goto out_fput;
- }
+ if (!proc_ns_file(fd_file(f)))
+ return -EINVAL;
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWUSER) {
- err = -EINVAL;
- goto out_fput;
- }
+ if (ns->ops->type != CLONE_NEWUSER)
+ return -EINVAL;
/*
* The initial idmapping cannot be used to create an idmapped
@@ -4713,22 +4703,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
* result.
*/
mnt_userns = container_of(ns, struct user_namespace, ns);
- if (mnt_userns == &init_user_ns) {
- err = -EPERM;
- goto out_fput;
- }
+ if (mnt_userns == &init_user_ns)
+ return -EPERM;
/* We're not controlling the target namespace. */
- if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) {
- err = -EPERM;
- goto out_fput;
- }
+ if (!ns_capable(mnt_userns, CAP_SYS_ADMIN))
+ return -EPERM;
kattr->mnt_userns = get_user_ns(mnt_userns);
-
-out_fput:
- fdput(f);
- return err;
+ return 0;
}
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
@@ -5006,6 +4989,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+
+ if (sb->s_subtype)
+ seq_puts(seq, sb->s_subtype);
+}
+
+static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+ struct mount *r = real_mount(s->mnt);
+
+ if (sb->s_op->show_devname) {
+ size_t start = seq->count;
+ int ret;
+
+ ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
+ if (ret)
+ return ret;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ /* Unescape the result */
+ seq->buf[seq->count] = '\0';
+ seq->count = start;
+ seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
+ } else if (r->mnt_devname) {
+ seq_puts(seq, r->mnt_devname);
+ }
+ return 0;
+}
+
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
s->sm.mask |= STATMOUNT_MNT_NS_ID;
@@ -5040,35 +5057,134 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static inline int statmount_opt_unescape(struct seq_file *seq, char *buf_start)
+{
+ char *buf_end, *opt_start, *opt_end;
+ int count = 0;
+
+ buf_end = seq->buf + seq->count;
+ *buf_end = '\0';
+ for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) {
+ opt_end = strchrnul(opt_start, ',');
+ *opt_end = '\0';
+ buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1;
+ if (WARN_ON_ONCE(++count == INT_MAX))
+ return -EOVERFLOW;
+ }
+ seq->count = buf_start - 1 - seq->buf;
+ return count;
+}
+
+static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct super_block *sb = mnt->mnt_sb;
+ size_t start = seq->count;
+ char *buf_start;
+ int err;
+
+ if (!sb->s_op->show_options)
+ return 0;
+
+ buf_start = seq->buf + start;
+ err = sb->s_op->show_options(seq, mnt->mnt_root);
+ if (err)
+ return err;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ if (seq->count == start)
+ return 0;
+
+ err = statmount_opt_unescape(seq, buf_start);
+ if (err < 0)
+ return err;
+
+ s->sm.opt_num = err;
+ return 0;
+}
+
+static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct super_block *sb = mnt->mnt_sb;
+ size_t start = seq->count;
+ char *buf_start;
+ int err;
+
+ buf_start = seq->buf + start;
+
+ err = security_sb_show_options(seq, sb);
+ if (!err)
+ return err;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ if (seq->count == start)
+ return 0;
+
+ err = statmount_opt_unescape(seq, buf_start);
+ if (err < 0)
+ return err;
+
+ s->sm.opt_sec_num = err;
+ return 0;
+}
+
static int statmount_string(struct kstatmount *s, u64 flag)
{
- int ret;
+ int ret = 0;
size_t kbufsize;
struct seq_file *seq = &s->seq;
struct statmount *sm = &s->sm;
+ u32 start = seq->count;
switch (flag) {
case STATMOUNT_FS_TYPE:
- sm->fs_type = seq->count;
+ sm->fs_type = start;
ret = statmount_fs_type(s, seq);
break;
case STATMOUNT_MNT_ROOT:
- sm->mnt_root = seq->count;
+ sm->mnt_root = start;
ret = statmount_mnt_root(s, seq);
break;
case STATMOUNT_MNT_POINT:
- sm->mnt_point = seq->count;
+ sm->mnt_point = start;
ret = statmount_mnt_point(s, seq);
break;
case STATMOUNT_MNT_OPTS:
- sm->mnt_opts = seq->count;
+ sm->mnt_opts = start;
ret = statmount_mnt_opts(s, seq);
break;
+ case STATMOUNT_OPT_ARRAY:
+ sm->opt_array = start;
+ ret = statmount_opt_array(s, seq);
+ break;
+ case STATMOUNT_OPT_SEC_ARRAY:
+ sm->opt_sec_array = start;
+ ret = statmount_opt_sec_array(s, seq);
+ break;
+ case STATMOUNT_FS_SUBTYPE:
+ sm->fs_subtype = start;
+ statmount_fs_subtype(s, seq);
+ break;
+ case STATMOUNT_SB_SOURCE:
+ sm->sb_source = start;
+ ret = statmount_sb_source(s, seq);
+ break;
default:
WARN_ON_ONCE(true);
return -EINVAL;
}
+ /*
+ * If nothing was emitted, return to avoid setting the flag
+ * and terminating the buffer.
+ */
+ if (seq->count == start)
+ return ret;
if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
return -EOVERFLOW;
if (kbufsize >= s->bufsize)
@@ -5203,6 +5319,18 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!err && s->mask & STATMOUNT_MNT_OPTS)
err = statmount_string(s, STATMOUNT_MNT_OPTS);
+ if (!err && s->mask & STATMOUNT_OPT_ARRAY)
+ err = statmount_string(s, STATMOUNT_OPT_ARRAY);
+
+ if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
+ err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
+
+ if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
+ err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
+
+ if (!err && s->mask & STATMOUNT_SB_SOURCE)
+ err = statmount_string(s, STATMOUNT_SB_SOURCE);
+
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
statmount_mnt_ns_id(s, ns);
@@ -5224,7 +5352,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
}
#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
- STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS)
+ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
+ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
+ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY)
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
struct statmount __user *buf, size_t bufsize,
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index af46a59..7ac3455 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -627,7 +627,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
if (unlikely(always_fill)) {
if (pos - offset + len <= i_size)
return false; /* Page entirely before EOF */
- zero_user_segment(&folio->page, 0, plen);
+ folio_zero_segment(folio, 0, plen);
folio_mark_uptodate(folio);
return true;
}
@@ -646,7 +646,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
return false;
zero_out:
- zero_user_segments(&folio->page, 0, offset, offset + len, plen);
+ folio_zero_segments(folio, 0, offset, offset + len, plen);
return true;
}
@@ -713,7 +713,7 @@ int netfs_write_begin(struct netfs_inode *ctx,
if (folio_test_uptodate(folio))
goto have_folio;
- /* If the page is beyond the EOF, we want to clear it - unless it's
+ /* If the folio is beyond the EOF, we want to clear it - unless it's
* within the cache granule containing the EOF, in which case we need
* to preload the granule.
*/
@@ -773,7 +773,7 @@ int netfs_write_begin(struct netfs_inode *ctx,
EXPORT_SYMBOL(netfs_write_begin);
/*
- * Preload the data into a page we're proposing to write into.
+ * Preload the data into a folio we're proposing to write into.
*/
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len)
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index b3910df..b482636 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -83,13 +83,13 @@ static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
* netfs_perform_write - Copy data into the pagecache.
* @iocb: The operation parameters
* @iter: The source buffer
- * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
*
- * Copy data into pagecache pages attached to the inode specified by @iocb.
+ * Copy data into pagecache folios attached to the inode specified by @iocb.
* The caller must hold appropriate inode locks.
*
- * Dirty pages are tagged with a netfs_folio struct if they're not up to date
- * to indicate the range modified. Dirty pages may also be tagged with a
+ * Dirty folios are tagged with a netfs_folio struct if they're not up to date
+ * to indicate the range modified. Dirty folios may also be tagged with a
* netfs-specific grouping such that data from an old group gets flushed before
* a new one is started.
*/
@@ -223,11 +223,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
* we try to read it.
*/
if (fpos >= ctx->zero_point) {
- zero_user_segment(&folio->page, 0, offset);
+ folio_zero_segment(folio, 0, offset);
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
- zero_user_segment(&folio->page, offset + copied, flen);
+ folio_zero_segment(folio, offset + copied, flen);
__netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
trace_netfs_folio(folio, netfs_modify_and_clear);
@@ -407,7 +407,7 @@ EXPORT_SYMBOL(netfs_perform_write);
* netfs_buffered_write_iter_locked - write data to a file
* @iocb: IO state structure (file, offset, etc.)
* @from: iov_iter with data to write
- * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
*
* This function does all the work needed for actually writing data to a
* file. It does all basic checks, removes SUID from the file, updates
@@ -491,7 +491,9 @@ EXPORT_SYMBOL(netfs_file_write_iter);
/*
* Notification that a previously read-only page is about to become writable.
- * Note that the caller indicates a single page of a multipage folio.
+ * The caller indicates the precise page that needs to be written to, but
+ * we only track group on a per-folio basis, so we block more often than
+ * we might otherwise.
*/
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
@@ -501,7 +503,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file);
struct netfs_inode *ictx = netfs_inode(inode);
- vm_fault_t ret = VM_FAULT_RETRY;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
int err;
_enter("%lx", folio->index);
@@ -510,21 +512,15 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
if (folio_lock_killable(folio) < 0)
goto out;
- if (folio->mapping != mapping) {
- folio_unlock(folio);
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- if (folio_wait_writeback_killable(folio)) {
- ret = VM_FAULT_LOCKED;
- goto out;
- }
+ if (folio->mapping != mapping)
+ goto unlock;
+ if (folio_wait_writeback_killable(folio) < 0)
+ goto unlock;
/* Can we see a streaming write here? */
if (WARN_ON(!folio_test_uptodate(folio))) {
- ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
- goto out;
+ ret = VM_FAULT_SIGBUS;
+ goto unlock;
}
group = netfs_folio_group(folio);
@@ -559,5 +555,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
out:
sb_end_pagefault(inode->i_sb);
return ret;
+unlock:
+ folio_unlock(folio);
+ goto out;
}
EXPORT_SYMBOL(netfs_page_mkwrite);
diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c
index cb75c07..ced14ac 100644
--- a/fs/netfs/fscache_volume.c
+++ b/fs/netfs/fscache_volume.c
@@ -322,8 +322,7 @@ void fscache_create_volume(struct fscache_volume *volume, bool wait)
}
return;
no_wait:
- clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags);
- wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING);
+ clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags);
}
/*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 551d295..d80406f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -8001,9 +8001,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
- if (nfsd4_has_session(cstate) ||
- exportfs_lock_op_is_async(sb->s_export_op))
- flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
spin_lock(&fp->fi_lock);
@@ -8014,9 +8011,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
- if (nfsd4_has_session(cstate) ||
- exportfs_lock_op_is_async(sb->s_export_op))
- flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
spin_lock(&fp->fi_lock);
@@ -8036,15 +8030,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- /*
- * Most filesystems with their own ->lock operations will block
- * the nfsd thread waiting to acquire the lock. That leads to
- * deadlocks (we don't want every nfsd thread tied up waiting
- * for file locks), so don't attempt blocking lock notifications
- * on those filesystems:
- */
- if (!exportfs_lock_op_is_async(sb->s_export_op))
- flags &= ~FL_SLEEP;
+ if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) &&
+ nfsd4_has_session(cstate) &&
+ locks_can_async_lock(nf->nf_file->f_op))
+ flags |= FL_SLEEP;
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
if (!nbl) {
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 57b4af5..501ad7b 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -68,7 +68,6 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
goto failed;
}
memset(bh->b_data, 0, i_blocksize(inode));
- bh->b_bdev = inode->i_sb->s_bdev;
bh->b_blocknr = blocknr;
set_buffer_mapped(bh);
set_buffer_uptodate(bh);
@@ -133,7 +132,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
goto found;
}
set_buffer_mapped(bh);
- bh->b_bdev = inode->i_sb->s_bdev;
bh->b_blocknr = pblocknr; /* set block address for read */
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1c9ae36..ace2225 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -83,10 +83,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
goto out;
}
- if (!buffer_mapped(bh)) {
- bh->b_bdev = inode->i_sb->s_bdev;
+ if (!buffer_mapped(bh))
set_buffer_mapped(bh);
- }
bh->b_blocknr = pbn;
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index ceb7dc0..2db6350 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -89,7 +89,6 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
if (buffer_uptodate(bh))
goto failed_bh;
- bh->b_bdev = sb->s_bdev;
err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
if (likely(!err)) {
get_bh(bh);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 10def4b..9a84939 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -39,7 +39,6 @@ static struct buffer_head *__nilfs_get_folio_block(struct folio *folio,
first_block = (unsigned long)index << (PAGE_SHIFT - blkbits);
bh = get_nth_bh(bh, block - first_block);
- touch_buffer(bh);
wait_on_buffer(bh);
return bh;
}
@@ -64,6 +63,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
folio_put(folio);
return NULL;
}
+ bh->b_bdev = inode->i_sb->s_bdev;
return bh;
}
@@ -99,16 +99,16 @@ void nilfs_forget_buffer(struct buffer_head *bh)
*/
void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
{
- void *kaddr0, *kaddr1;
+ void *saddr, *daddr;
unsigned long bits;
- struct page *spage = sbh->b_page, *dpage = dbh->b_page;
+ struct folio *sfolio = sbh->b_folio, *dfolio = dbh->b_folio;
struct buffer_head *bh;
- kaddr0 = kmap_local_page(spage);
- kaddr1 = kmap_local_page(dpage);
- memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
- kunmap_local(kaddr1);
- kunmap_local(kaddr0);
+ saddr = kmap_local_folio(sfolio, bh_offset(sbh));
+ daddr = kmap_local_folio(dfolio, bh_offset(dbh));
+ memcpy(daddr, saddr, sbh->b_size);
+ kunmap_local(daddr);
+ kunmap_local(saddr);
dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
dbh->b_blocknr = sbh->b_blocknr;
@@ -122,13 +122,13 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
unlock_buffer(bh);
}
if (bits & BIT(BH_Uptodate))
- SetPageUptodate(dpage);
+ folio_mark_uptodate(dfolio);
else
- ClearPageUptodate(dpage);
+ folio_clear_uptodate(dfolio);
if (bits & BIT(BH_Mapped))
- SetPageMappedToDisk(dpage);
+ folio_set_mappedtodisk(dfolio);
else
- ClearPageMappedToDisk(dpage);
+ folio_clear_mappedtodisk(dfolio);
}
/**
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index d5dbef7..6004dfd 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -16,7 +16,6 @@
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
-#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>
static int dir_notify_enable __read_mostly = 1;
@@ -347,9 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
new_fsn_mark = NULL;
}
- rcu_read_lock();
- f = lookup_fdget_rcu(fd);
- rcu_read_unlock();
+ f = fget_raw(fd);
/* if (f != filp) means that we lost a race and another task/thread
* actually closed the fd we are still playing with before we grabbed
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 224bcca..24c7c5d 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
-#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>
#include <linux/init.h>
#include <linux/jiffies.h>
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9644bc7..35159fa 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fcntl.h>
-#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
@@ -1003,22 +1002,17 @@ static int fanotify_find_path(int dfd, const char __user *filename,
dfd, filename, flags);
if (filename == NULL) {
- struct fd f = fdget(dfd);
+ CLASS(fd, f)(dfd);
- ret = -EBADF;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
- ret = -ENOTDIR;
if ((flags & FAN_MARK_ONLYDIR) &&
- !(S_ISDIR(file_inode(fd_file(f))->i_mode))) {
- fdput(f);
- goto out;
- }
+ !(S_ISDIR(file_inode(fd_file(f))->i_mode)))
+ return -ENOTDIR;
*path = fd_file(f)->f_path;
path_get(path);
- fdput(f);
} else {
unsigned int lookup_flags = 0;
@@ -1682,7 +1676,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct inode *inode = NULL;
struct vfsmount *mnt = NULL;
struct fsnotify_group *group;
- struct fd f;
struct path path;
struct fan_fsid __fsid, *fsid = NULL;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
@@ -1752,14 +1745,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
umask = FANOTIFY_EVENT_FLAGS;
}
- f = fdget(fanotify_fd);
- if (unlikely(!fd_file(f)))
+ CLASS(fd, f)(fanotify_fd);
+ if (fd_empty(f))
return -EBADF;
/* verify that this is indeed an fanotify instance */
- ret = -EINVAL;
if (unlikely(fd_file(f)->f_op != &fanotify_fops))
- goto fput_and_out;
+ return -EINVAL;
group = fd_file(f)->private_data;
/*
@@ -1767,23 +1759,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* marks. This also includes setting up such marks by a group that
* was initialized by an unprivileged user.
*/
- ret = -EPERM;
if ((!capable(CAP_SYS_ADMIN) ||
FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
mark_type != FAN_MARK_INODE)
- goto fput_and_out;
+ return -EPERM;
/*
* Permission events require minimum priority FAN_CLASS_CONTENT.
*/
- ret = -EINVAL;
if (mask & FANOTIFY_PERM_EVENTS &&
group->priority < FSNOTIFY_PRIO_CONTENT)
- goto fput_and_out;
+ return -EINVAL;
if (mask & FAN_FS_ERROR &&
mark_type != FAN_MARK_FILESYSTEM)
- goto fput_and_out;
+ return -EINVAL;
/*
* Evictable is only relevant for inode marks, because only inode object
@@ -1791,7 +1781,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
*/
if (flags & FAN_MARK_EVICTABLE &&
mark_type != FAN_MARK_INODE)
- goto fput_and_out;
+ return -EINVAL;
/*
* Events that do not carry enough information to report
@@ -1803,7 +1793,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
(!fid_mode || mark_type == FAN_MARK_MOUNT))
- goto fput_and_out;
+ return -EINVAL;
/*
* FAN_RENAME uses special info type records to report the old and
@@ -1811,23 +1801,22 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* useful and was not implemented.
*/
if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
- goto fput_and_out;
+ return -EINVAL;
if (mark_cmd == FAN_MARK_FLUSH) {
- ret = 0;
if (mark_type == FAN_MARK_MOUNT)
fsnotify_clear_vfsmount_marks_by_group(group);
else if (mark_type == FAN_MARK_FILESYSTEM)
fsnotify_clear_sb_marks_by_group(group);
else
fsnotify_clear_inode_marks_by_group(group);
- goto fput_and_out;
+ return 0;
}
ret = fanotify_find_path(dfd, pathname, &path, flags,
(mask & ALL_FSNOTIFY_EVENTS), obj_type);
if (ret)
- goto fput_and_out;
+ return ret;
if (mark_cmd == FAN_MARK_ADD) {
ret = fanotify_events_supported(group, &path, mask, flags);
@@ -1906,8 +1895,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
path_put_and_out:
path_put(&path);
-fput_and_out:
- fdput(f);
return ret;
}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 0794dca..e0c4895 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -732,7 +732,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
struct fsnotify_group *group;
struct inode *inode;
struct path path;
- struct fd f;
int ret;
unsigned flags = 0;
@@ -752,21 +751,17 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
if (unlikely(!(mask & ALL_INOTIFY_BITS)))
return -EINVAL;
- f = fdget(fd);
- if (unlikely(!fd_file(f)))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
/* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */
- if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) {
- ret = -EINVAL;
- goto fput_and_out;
- }
+ if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE)))
+ return -EINVAL;
/* verify that this is indeed an inotify instance */
- if (unlikely(fd_file(f)->f_op != &inotify_fops)) {
- ret = -EINVAL;
- goto fput_and_out;
- }
+ if (unlikely(fd_file(f)->f_op != &inotify_fops))
+ return -EINVAL;
if (!(mask & IN_DONT_FOLLOW))
flags |= LOOKUP_FOLLOW;
@@ -776,7 +771,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
ret = inotify_find_inode(pathname, &path, flags,
(mask & IN_ALL_EVENTS));
if (ret)
- goto fput_and_out;
+ return ret;
/* inode held in place by reference to path; group by fget on fd */
inode = path.dentry->d_inode;
@@ -785,8 +780,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
/* create/update an inode mark */
ret = inotify_update_watch(group, inode, mask);
path_put(&path);
-fput_and_out:
- fdput(f);
return ret;
}
@@ -794,33 +787,26 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
{
struct fsnotify_group *group;
struct inotify_inode_mark *i_mark;
- struct fd f;
- int ret = -EINVAL;
+ CLASS(fd, f)(fd);
- f = fdget(fd);
- if (unlikely(!fd_file(f)))
+ if (fd_empty(f))
return -EBADF;
/* verify that this is indeed an inotify instance */
if (unlikely(fd_file(f)->f_op != &inotify_fops))
- goto out;
+ return -EINVAL;
group = fd_file(f)->private_data;
i_mark = inotify_idr_find(group, wd);
if (unlikely(!i_mark))
- goto out;
-
- ret = 0;
+ return -EINVAL;
fsnotify_destroy_mark(&i_mark->fsn_mark, group);
/* match ref taken by inotify_idr_find */
fsnotify_put_mark(&i_mark->fsn_mark);
-
-out:
- fdput(f);
- return ret;
+ return 0;
}
/*
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4b9f45d..4200a03 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1765,42 +1765,41 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
long fd;
int sectsize;
char *p = (char *)page;
- struct fd f;
ssize_t ret = -EINVAL;
int live_threshold;
if (reg->hr_bdev_file)
- goto out;
+ return -EINVAL;
/* We can't heartbeat without having had our node number
* configured yet. */
if (o2nm_this_node() == O2NM_MAX_NODES)
- goto out;
+ return -EINVAL;
fd = simple_strtol(p, &p, 0);
if (!p || (*p && (*p != '\n')))
- goto out;
+ return -EINVAL;
if (fd < 0 || fd >= INT_MAX)
- goto out;
+ return -EINVAL;
- f = fdget(fd);
- if (fd_file(f) == NULL)
- goto out;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EINVAL;
if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
reg->hr_block_bytes == 0)
- goto out2;
+ return -EINVAL;
if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode))
- goto out2;
+ return -EINVAL;
reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev,
BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
if (IS_ERR(reg->hr_bdev_file)) {
ret = PTR_ERR(reg->hr_bdev_file);
reg->hr_bdev_file = NULL;
- goto out2;
+ return ret;
}
sectsize = bdev_logical_block_size(reg_bdev(reg));
@@ -1906,9 +1905,6 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
fput(reg->hr_bdev_file);
reg->hr_bdev_file = NULL;
}
-out2:
- fdput(f);
-out:
return ret;
}
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 96b6847..b95724b 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -280,5 +280,4 @@ const struct export_operations ocfs2_export_ops = {
.fh_to_dentry = ocfs2_fh_to_dentry,
.fh_to_parent = ocfs2_fh_to_parent,
.get_parent = ocfs2_get_parent,
- .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 06af219..4fa6c84 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2812,6 +2812,7 @@ const struct file_operations ocfs2_fops = {
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
.remap_file_range = ocfs2_remap_file_range,
+ .fop_flags = FOP_ASYNC_LOCK,
};
WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
@@ -2828,6 +2829,7 @@ const struct file_operations ocfs2_dops = {
#endif
.lock = ocfs2_lock,
.flock = ocfs2_flock,
+ .fop_flags = FOP_ASYNC_LOCK,
};
/*
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index c4a4016..b0733c0 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -574,6 +574,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
ocfs2_commit_trans(osb, handle);
out_free_group_bh:
+ if (ret < 0)
+ ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh);
brelse(group_bh);
out_unlock:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3d40462..c79b429 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2319,6 +2319,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
struct ocfs2_blockcheck_stats *stats)
{
int status = -EAGAIN;
+ u32 blksz_bits;
if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
@@ -2333,11 +2334,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
goto out;
}
status = -EINVAL;
- if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
+ /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. */
+ blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
+ if (blksz_bits < 9 || blksz_bits > 12) {
mlog(ML_ERROR, "found superblock with incorrect block "
- "size: found %u, should be %u\n",
- 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
- blksz);
+ "size bits: found %u, should be 9, 10, 11, or 12\n",
+ blksz_bits);
+ } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) {
+ mlog(ML_ERROR, "found superblock with incorrect block "
+ "size: found %u, should be %u\n", 1 << blksz_bits, blksz);
} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
OCFS2_MAJOR_REV_LEVEL ||
le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
diff --git a/fs/open.c b/fs/open.c
index 5da4df2..4b37c59 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -187,19 +187,13 @@ long do_ftruncate(struct file *file, loff_t length, int small)
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
- struct fd f;
- int error;
-
if (length < 0)
return -EINVAL;
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- error = do_ftruncate(fd_file(f), length, small);
-
- fdput(f);
- return error;
+ return do_ftruncate(fd_file(f), length, small);
}
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
@@ -349,14 +343,12 @@ EXPORT_SYMBOL_GPL(vfs_fallocate);
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
- struct fd f = fdget(fd);
- int error = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- error = vfs_fallocate(fd_file(f), mode, offset, len);
- fdput(f);
- }
- return error;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fallocate(fd_file(f), mode, offset, len);
}
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
@@ -580,23 +572,18 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
- struct fd f = fdget_raw(fd);
+ CLASS(fd_raw, f)(fd);
int error;
- error = -EBADF;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
- error = -ENOTDIR;
if (!d_can_lookup(fd_file(f)->f_path.dentry))
- goto out_putf;
+ return -ENOTDIR;
error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &fd_file(f)->f_path);
-out_putf:
- fdput(f);
-out:
return error;
}
@@ -671,14 +658,12 @@ int vfs_fchmod(struct file *file, umode_t mode)
SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
- struct fd f = fdget(fd);
- int err = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- err = vfs_fchmod(fd_file(f), mode);
- fdput(f);
- }
- return err;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fchmod(fd_file(f), mode);
}
static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
@@ -865,14 +850,12 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group)
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
- struct fd f = fdget(fd);
- int error = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- error = vfs_fchown(fd_file(f), user, group);
- fdput(f);
- }
- return error;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fchown(fd_file(f), user, group);
}
SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
@@ -1576,23 +1559,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
return retval;
}
-/**
- * sys_close_range() - Close all file descriptors in a given range.
- *
- * @fd: starting file descriptor to close
- * @max_fd: last file descriptor to close
- * @flags: reserved for future extensions
- *
- * This closes a range of file descriptors. All file descriptors
- * from @fd up to and including @max_fd are closed.
- * Currently, errors to close a given file descriptor are ignored.
- */
-SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
- unsigned int, flags)
-{
- return __close_range(fd, max_fd, flags);
-}
-
/*
* This routine simulates a hangup on the tty, to arrange that users
* are given clean terminals at login time.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 2ed6ad6..ee2cbd0 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -16,7 +16,6 @@
#include <linux/sched/signal.h>
#include <linux/cred.h>
#include <linux/namei.h>
-#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include <linux/exportfs.h>
#include "overlayfs.h"
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 35fd3e3..8b31f44 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -170,7 +170,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
type = ovl_path_real(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_do_getattr(&realpath, stat, request_mask, flags);
+ err = vfs_getattr_nosec(&realpath, stat, request_mask, flags);
if (err)
goto out;
@@ -195,8 +195,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
(!is_dir ? STATX_NLINK : 0);
ovl_path_lower(dentry, &realpath);
- err = ovl_do_getattr(&realpath, &lowerstat, lowermask,
- flags);
+ err = vfs_getattr_nosec(&realpath, &lowerstat, lowermask,
+ flags);
if (err)
goto out;
@@ -248,8 +248,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
ovl_path_lowerdata(dentry, &realpath);
if (realpath.dentry) {
- err = ovl_do_getattr(&realpath, &lowerdatastat,
- lowermask, flags);
+ err = vfs_getattr_nosec(&realpath, &lowerdatastat,
+ lowermask, flags);
if (err)
goto out;
} else {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0bfe35d..910dbbb 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -412,14 +412,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
}
-static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
-{
- if (flags & AT_GETATTR_NOSEC)
- return vfs_getattr_nosec(path, stat, request_mask, flags);
- return vfs_getattr(path, stat, request_mask, flags);
-}
-
/* util.c */
int ovl_get_write_access(struct dentry *dentry);
void ovl_put_write_access(struct dentry *dentry);
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index e42546c..1115c22 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -141,10 +141,10 @@ static int ovl_verity_mode_def(void)
const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_string_empty("lowerdir", Opt_lowerdir),
- fsparam_string("lowerdir+", Opt_lowerdir_add),
- fsparam_string("datadir+", Opt_datadir_add),
- fsparam_string("upperdir", Opt_upperdir),
- fsparam_string("workdir", Opt_workdir),
+ fsparam_file_or_string("lowerdir+", Opt_lowerdir_add),
+ fsparam_file_or_string("datadir+", Opt_datadir_add),
+ fsparam_file_or_string("upperdir", Opt_upperdir),
+ fsparam_file_or_string("workdir", Opt_workdir),
fsparam_flag("default_permissions", Opt_default_permissions),
fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir),
fsparam_enum("index", Opt_index, ovl_parameter_bool),
@@ -367,40 +367,100 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer,
}
}
-static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer)
+static inline bool is_upper_layer(enum ovl_opt layer)
{
- char *name = kstrdup(layer_name, GFP_KERNEL);
- bool upper = (layer == Opt_upperdir || layer == Opt_workdir);
- struct path path;
+ return layer == Opt_upperdir || layer == Opt_workdir;
+}
+
+/* Handle non-file descriptor-based layer options that require path lookup. */
+static inline int ovl_kern_path(const char *layer_name, struct path *layer_path,
+ enum ovl_opt layer)
+{
int err;
+ switch (layer) {
+ case Opt_upperdir:
+ fallthrough;
+ case Opt_workdir:
+ fallthrough;
+ case Opt_lowerdir:
+ err = ovl_mount_dir(layer_name, layer_path);
+ break;
+ case Opt_lowerdir_add:
+ fallthrough;
+ case Opt_datadir_add:
+ err = ovl_mount_dir_noesc(layer_name, layer_path);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static int ovl_do_parse_layer(struct fs_context *fc, const char *layer_name,
+ struct path *layer_path, enum ovl_opt layer)
+{
+ char *name __free(kfree) = kstrdup(layer_name, GFP_KERNEL);
+ bool upper;
+ int err = 0;
+
if (!name)
return -ENOMEM;
- if (upper || layer == Opt_lowerdir)
- err = ovl_mount_dir(name, &path);
- else
- err = ovl_mount_dir_noesc(name, &path);
+ upper = is_upper_layer(layer);
+ err = ovl_mount_dir_check(fc, layer_path, layer, name, upper);
if (err)
- goto out_free;
-
- err = ovl_mount_dir_check(fc, &path, layer, name, upper);
- if (err)
- goto out_put;
+ return err;
if (!upper) {
err = ovl_ctx_realloc_lower(fc);
if (err)
- goto out_put;
+ return err;
}
/* Store the user provided path string in ctx to show in mountinfo */
- ovl_add_layer(fc, layer, &path, &name);
+ ovl_add_layer(fc, layer, layer_path, &name);
+ return err;
+}
-out_put:
- path_put(&path);
-out_free:
- kfree(name);
+static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param,
+ enum ovl_opt layer)
+{
+ struct path layer_path __free(path_put) = {};
+ int err = 0;
+
+ switch (param->type) {
+ case fs_value_is_string:
+ err = ovl_kern_path(param->string, &layer_path, layer);
+ if (err)
+ return err;
+ err = ovl_do_parse_layer(fc, param->string, &layer_path, layer);
+ break;
+ case fs_value_is_file: {
+ char *buf __free(kfree);
+ char *layer_name;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
+ if (!buf)
+ return -ENOMEM;
+
+ layer_path = param->file->f_path;
+ path_get(&layer_path);
+
+ layer_name = d_path(&layer_path, buf, PATH_MAX);
+ if (IS_ERR(layer_name))
+ return PTR_ERR(layer_name);
+
+ err = ovl_do_parse_layer(fc, layer_name, &layer_path, layer);
+ break;
+ }
+ default:
+ WARN_ON_ONCE(true);
+ err = -EINVAL;
+ }
+
return err;
}
@@ -474,7 +534,13 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
iter = dup;
for (nr = 0; nr < nr_lower; nr++) {
- err = ovl_parse_layer(fc, iter, Opt_lowerdir);
+ struct path path __free(path_put) = {};
+
+ err = ovl_kern_path(iter, &path, Opt_lowerdir);
+ if (err)
+ goto out_err;
+
+ err = ovl_do_parse_layer(fc, iter, &path, Opt_lowerdir);
if (err)
goto out_err;
@@ -555,7 +621,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_datadir_add:
case Opt_upperdir:
case Opt_workdir:
- err = ovl_parse_layer(fc, param->string, opt);
+ err = ovl_parse_layer(fc, param, opt);
break;
case Opt_default_permissions:
config->default_permissions = true;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 80675b6..618abb1 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -2,6 +2,7 @@
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/cgroup.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pid.h>
@@ -114,6 +115,81 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
return poll_flags;
}
+static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+ struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+ size_t usize = _IOC_SIZE(cmd);
+ struct pidfd_info kinfo = {};
+ struct user_namespace *user_ns;
+ const struct cred *c;
+ __u64 mask;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+#endif
+
+ if (!uinfo)
+ return -EINVAL;
+ if (usize < PIDFD_INFO_SIZE_VER0)
+ return -EINVAL; /* First version, no smaller struct possible */
+
+ if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
+ return -EFAULT;
+
+ c = get_task_cred(task);
+ if (!c)
+ return -ESRCH;
+
+ /* Unconditionally return identifiers and credentials, the rest only on request */
+
+ user_ns = current_user_ns();
+ kinfo.ruid = from_kuid_munged(user_ns, c->uid);
+ kinfo.rgid = from_kgid_munged(user_ns, c->gid);
+ kinfo.euid = from_kuid_munged(user_ns, c->euid);
+ kinfo.egid = from_kgid_munged(user_ns, c->egid);
+ kinfo.suid = from_kuid_munged(user_ns, c->suid);
+ kinfo.sgid = from_kgid_munged(user_ns, c->sgid);
+ kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid);
+ kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid);
+ kinfo.mask |= PIDFD_INFO_CREDS;
+ put_cred(c);
+
+#ifdef CONFIG_CGROUPS
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(task);
+ kinfo.cgroupid = cgroup_id(cgrp);
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+ rcu_read_unlock();
+#endif
+
+ /*
+ * Copy pid/tgid last, to reduce the chances the information might be
+ * stale. Note that it is not possible to ensure it will be valid as the
+ * task might return as soon as the copy_to_user finishes, but that's ok
+ * and userspace expects that might happen and can act accordingly, so
+ * this is just best-effort. What we can do however is checking that all
+ * the fields are set correctly, or return ESRCH to avoid providing
+ * incomplete information. */
+
+ kinfo.ppid = task_ppid_nr_ns(task, NULL);
+ kinfo.tgid = task_tgid_vnr(task);
+ kinfo.pid = task_pid_vnr(task);
+ kinfo.mask |= PIDFD_INFO_PID;
+
+ if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
+ return -ESRCH;
+
+ /*
+ * If userspace and the kernel have the same struct size it can just
+ * be copied. If userspace provides an older struct, only the bits that
+ * userspace knows about will be copied. If userspace provides a new
+ * struct, only the bits that the kernel knows about will be copied.
+ */
+ if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
+ return -EFAULT;
+
+ return 0;
+}
+
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
@@ -122,13 +198,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns;
- if (arg)
- return -EINVAL;
-
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
+ /* Extensible IOCTL that does not open namespace FDs, take a shortcut */
+ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
+ return pidfd_info(task, cmd, arg);
+
+ if (arg)
+ return -EINVAL;
+
scoped_guard(task_lock, task) {
nsp = task->nsproxy;
if (nsp)
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 6c66a37..4050942 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init);
* Allocate a new ACL with the specified number of entries.
*/
struct posix_acl *
-posix_acl_alloc(int count, gfp_t flags)
+posix_acl_alloc(unsigned int count, gfp_t flags)
{
- const size_t size = sizeof(struct posix_acl) +
- count * sizeof(struct posix_acl_entry);
- struct posix_acl *acl = kmalloc(size, flags);
+ struct posix_acl *acl;
+
+ acl = kmalloc(struct_size(acl, a_entries, count), flags);
if (acl)
posix_acl_init(acl, count);
return acl;
@@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
struct posix_acl *clone = NULL;
if (acl) {
- int size = sizeof(struct posix_acl) + acl->a_count *
- sizeof(struct posix_acl_entry);
- clone = kmemdup(acl, size, flags);
+ clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count),
+ flags);
if (clone)
refcount_set(&clone->a_refcount, 1);
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b31283d..e9d7ddc 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -58,7 +58,6 @@
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 5e391cb..24baf23 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -116,9 +116,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
{
struct file *file;
- rcu_read_lock();
- file = task_lookup_fdget_rcu(task, fd);
- rcu_read_unlock();
+ file = fget_task(task, fd);
if (file) {
*mode = file->f_mode;
fput(file);
@@ -258,19 +256,17 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx))
goto out;
- rcu_read_lock();
for (fd = ctx->pos - 2;; fd++) {
struct file *f;
struct fd_data data;
char name[10 + 1];
unsigned int len;
- f = task_lookup_next_fdget_rcu(p, &fd);
+ f = fget_task_next(p, &fd);
ctx->pos = fd + 2LL;
if (!f)
break;
data.mode = f->f_mode;
- rcu_read_unlock();
fput(f);
data.fd = fd;
@@ -278,11 +274,9 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
&data))
- goto out;
+ break;
cond_resched();
- rcu_read_lock();
}
- rcu_read_unlock();
out:
put_task_struct(p);
return 0;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e52bd96..38a5a3e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -978,7 +978,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
[ilog2(VM_UFFD_MINOR)] = "ui",
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-#ifdef CONFIG_X86_USER_SHADOW_STACK
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
@@ -2665,8 +2665,10 @@ static int pagemap_scan_get_args(struct pm_scan_arg *arg,
return -EFAULT;
if (!arg->vec && arg->vec_len)
return -EINVAL;
+ if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX)
+ return -EINVAL;
if (arg->vec && !access_ok((void __user *)(long)arg->vec,
- arg->vec_len * sizeof(struct page_region)))
+ size_mul(arg->vec_len, sizeof(struct page_region))))
return -EFAULT;
/* Fixup default values */
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 290157b..7c2b75a 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -976,21 +976,19 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
struct super_block *sb;
unsigned int cmds = cmd >> SUBCMDSHIFT;
unsigned int type = cmd & SUBCMDMASK;
- struct fd f;
+ CLASS(fd_raw, f)(fd);
int ret;
- f = fdget_raw(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- ret = -EINVAL;
if (type >= MAXQUOTAS)
- goto out;
+ return -EINVAL;
if (quotactl_cmd_write(cmds)) {
ret = mnt_want_write(fd_file(f)->f_path.mnt);
if (ret)
- goto out;
+ return ret;
}
sb = fd_file(f)->f_path.mnt->mnt_sb;
@@ -1008,7 +1006,5 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
if (quotactl_cmd_write(cmds))
mnt_drop_write(fd_file(f)->f_path.mnt);
-out:
- fdput(f);
return ret;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 64dc24a..a613324 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -386,8 +386,8 @@ EXPORT_SYMBOL(vfs_llseek);
static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
off_t retval;
- struct fd f = fdget_pos(fd);
- if (!fd_file(f))
+ CLASS(fd_pos, f)(fd);
+ if (fd_empty(f))
return -EBADF;
retval = -EINVAL;
@@ -397,7 +397,6 @@ static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
}
- fdput_pos(f);
return retval;
}
@@ -420,15 +419,14 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
unsigned int, whence)
{
int retval;
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
loff_t offset;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- retval = -EINVAL;
if (whence > SEEK_MAX)
- goto out_putf;
+ return -EINVAL;
offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
whence);
@@ -439,8 +437,6 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
if (!copy_to_user(result, &offset, sizeof(offset)))
retval = 0;
}
-out_putf:
- fdput_pos(f);
return retval;
}
#endif
@@ -700,10 +696,10 @@ static inline loff_t *file_ppos(struct file *file)
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -712,7 +708,6 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
ret = vfs_read(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
return ret;
}
@@ -724,10 +719,10 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -736,7 +731,6 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
ret = vfs_write(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
return ret;
@@ -751,21 +745,17 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
loff_t pos)
{
- struct fd f;
- ssize_t ret = -EBADF;
-
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
- ret = -ESPIPE;
- if (fd_file(f)->f_mode & FMODE_PREAD)
- ret = vfs_read(fd_file(f), buf, count, &pos);
- fdput(f);
- }
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- return ret;
+ if (fd_file(f)->f_mode & FMODE_PREAD)
+ return vfs_read(fd_file(f), buf, count, &pos);
+
+ return -ESPIPE;
}
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
@@ -785,21 +775,17 @@ COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
size_t count, loff_t pos)
{
- struct fd f;
- ssize_t ret = -EBADF;
-
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
- ret = -ESPIPE;
- if (fd_file(f)->f_mode & FMODE_PWRITE)
- ret = vfs_write(fd_file(f), buf, count, &pos);
- fdput(f);
- }
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- return ret;
+ if (fd_file(f)->f_mode & FMODE_PWRITE)
+ return vfs_write(fd_file(f), buf, count, &pos);
+
+ return -ESPIPE;
}
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
@@ -1075,10 +1061,10 @@ static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, rwf_t flags)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -1087,7 +1073,6 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
if (ret > 0)
@@ -1099,10 +1084,10 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, rwf_t flags)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -1111,7 +1096,6 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
if (ret > 0)
@@ -1129,18 +1113,16 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, loff_t pos, rwf_t flags)
{
- struct fd f;
ssize_t ret = -EBADF;
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
+ CLASS(fd, f)(fd);
+ if (!fd_empty(f)) {
ret = -ESPIPE;
if (fd_file(f)->f_mode & FMODE_PREAD)
ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
- fdput(f);
}
if (ret > 0)
@@ -1152,18 +1134,16 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, loff_t pos, rwf_t flags)
{
- struct fd f;
ssize_t ret = -EBADF;
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
+ CLASS(fd, f)(fd);
+ if (!fd_empty(f)) {
ret = -ESPIPE;
if (fd_file(f)->f_mode & FMODE_PWRITE)
ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
- fdput(f);
}
if (ret > 0)
@@ -1315,7 +1295,6 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
size_t count, loff_t max)
{
- struct fd in, out;
struct inode *in_inode, *out_inode;
struct pipe_inode_info *opipe;
loff_t pos;
@@ -1326,35 +1305,32 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
/*
* Get input file, and verify that it is ok..
*/
- retval = -EBADF;
- in = fdget(in_fd);
- if (!fd_file(in))
- goto out;
+ CLASS(fd, in)(in_fd);
+ if (fd_empty(in))
+ return -EBADF;
if (!(fd_file(in)->f_mode & FMODE_READ))
- goto fput_in;
- retval = -ESPIPE;
+ return -EBADF;
if (!ppos) {
pos = fd_file(in)->f_pos;
} else {
pos = *ppos;
if (!(fd_file(in)->f_mode & FMODE_PREAD))
- goto fput_in;
+ return -ESPIPE;
}
retval = rw_verify_area(READ, fd_file(in), &pos, count);
if (retval < 0)
- goto fput_in;
+ return retval;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
/*
* Get output file, and verify that it is ok..
*/
- retval = -EBADF;
- out = fdget(out_fd);
- if (!fd_file(out))
- goto fput_in;
+ CLASS(fd, out)(out_fd);
+ if (fd_empty(out))
+ return -EBADF;
if (!(fd_file(out)->f_mode & FMODE_WRITE))
- goto fput_out;
+ return -EBADF;
in_inode = file_inode(fd_file(in));
out_inode = file_inode(fd_file(out));
out_pos = fd_file(out)->f_pos;
@@ -1363,9 +1339,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
if (unlikely(pos + count > max)) {
- retval = -EOVERFLOW;
if (pos >= max)
- goto fput_out;
+ return -EOVERFLOW;
count = max - pos;
}
@@ -1384,7 +1359,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (!opipe) {
retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
if (retval < 0)
- goto fput_out;
+ return retval;
retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
count, fl);
} else {
@@ -1410,12 +1385,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
inc_syscw(current);
if (pos > max)
retval = -EOVERFLOW;
-
-fput_out:
- fdput(out);
-fput_in:
- fdput(in);
-out:
return retval;
}
@@ -1671,36 +1640,32 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
{
loff_t pos_in;
loff_t pos_out;
- struct fd f_in;
- struct fd f_out;
ssize_t ret = -EBADF;
- f_in = fdget(fd_in);
- if (!fd_file(f_in))
- goto out2;
+ CLASS(fd, f_in)(fd_in);
+ if (fd_empty(f_in))
+ return -EBADF;
- f_out = fdget(fd_out);
- if (!fd_file(f_out))
- goto out1;
+ CLASS(fd, f_out)(fd_out);
+ if (fd_empty(f_out))
+ return -EBADF;
- ret = -EFAULT;
if (off_in) {
if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
- goto out;
+ return -EFAULT;
} else {
pos_in = fd_file(f_in)->f_pos;
}
if (off_out) {
if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
- goto out;
+ return -EFAULT;
} else {
pos_out = fd_file(f_out)->f_pos;
}
- ret = -EINVAL;
if (flags != 0)
- goto out;
+ return -EINVAL;
ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len,
flags);
@@ -1722,12 +1687,6 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
fd_file(f_out)->f_pos = pos_out;
}
}
-
-out:
- fdput(f_out);
-out1:
- fdput(f_in);
-out2:
return ret;
}
@@ -1830,18 +1789,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
return 0;
}
-bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
+int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
{
size_t len = iov_iter_count(iter);
if (!iter_is_ubuf(iter))
- return false;
+ return -EINVAL;
if (!is_power_of_2(len))
- return false;
+ return -EINVAL;
- if (!IS_ALIGNED(pos, len))
- return false;
+ if (!IS_ALIGNED(iocb->ki_pos, len))
+ return -EINVAL;
- return true;
+ if (!(iocb->ki_flags & IOCB_DIRECT))
+ return -EOPNOTSUPP;
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(generic_atomic_write_valid);
diff --git a/fs/readdir.c b/fs/readdir.c
index 6d29cab..0038efd 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -219,20 +219,19 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
struct old_linux_dirent __user *, dirent, unsigned int, count)
{
int error;
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
.dirent = dirent
};
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
if (buf.result)
error = buf.result;
- fdput_pos(f);
return error;
}
@@ -309,7 +308,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct linux_dirent __user *, dirent, unsigned int, count)
{
- struct fd f;
+ CLASS(fd_pos, f)(fd);
struct getdents_callback buf = {
.ctx.actor = filldir,
.count = count,
@@ -317,8 +316,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
};
int error;
- f = fdget_pos(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
@@ -333,7 +331,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
else
error = count - buf.count;
}
- fdput_pos(f);
return error;
}
@@ -392,7 +389,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
SYSCALL_DEFINE3(getdents64, unsigned int, fd,
struct linux_dirent64 __user *, dirent, unsigned int, count)
{
- struct fd f;
+ CLASS(fd_pos, f)(fd);
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
.count = count,
@@ -400,8 +397,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
};
int error;
- f = fdget_pos(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
@@ -417,7 +413,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
else
error = count - buf.count;
}
- fdput_pos(f);
return error;
}
@@ -477,20 +472,19 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
{
int error;
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
.dirent = dirent
};
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
if (buf.result)
error = buf.result;
- fdput_pos(f);
return error;
}
@@ -560,7 +554,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct compat_linux_dirent __user *, dirent, unsigned int, count)
{
- struct fd f;
+ CLASS(fd_pos, f)(fd);
struct compat_getdents_callback buf = {
.ctx.actor = compat_filldir,
.current_dir = dirent,
@@ -568,8 +562,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
};
int error;
- f = fdget_pos(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
@@ -584,7 +577,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
else
error = count - buf.count;
}
- fdput_pos(f);
return error;
}
#endif
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 4403d5c..26afbbb 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -536,20 +536,19 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
}
for (i = 0, info = same->info; i < count; i++, info++) {
- struct fd dst_fd = fdget(info->dest_fd);
- struct file *dst_file = fd_file(dst_fd);
+ CLASS(fd, dst_fd)(info->dest_fd);
- if (!dst_file) {
+ if (fd_empty(dst_fd)) {
info->status = -EBADF;
goto next_loop;
}
if (info->reserved) {
info->status = -EINVAL;
- goto next_fdput;
+ goto next_loop;
}
- deduped = vfs_dedupe_file_range_one(file, off, dst_file,
+ deduped = vfs_dedupe_file_range_one(file, off, fd_file(dst_fd),
info->dest_offset, len,
REMAP_FILE_CAN_SHORTEN);
if (deduped == -EBADE)
@@ -559,8 +558,6 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
else
info->bytes_deduped = len;
-next_fdput:
- fdput(dst_fd);
next_loop:
if (fatal_signal_pending(current))
break;
diff --git a/fs/select.c b/fs/select.c
index a77907f..e223d1f 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -462,15 +462,22 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds)
EPOLLNVAL)
#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)
-static inline void wait_key_set(poll_table *wait, unsigned long in,
+static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in,
unsigned long out, unsigned long bit,
__poll_t ll_flag)
{
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return EPOLLNVAL;
+
wait->_key = POLLEX_SET | ll_flag;
if (in & bit)
wait->_key |= POLLIN_SET;
if (out & bit)
wait->_key |= POLLOUT_SET;
+
+ return vfs_poll(fd_file(f), wait);
}
static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
@@ -522,20 +529,12 @@ static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec
}
for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
- struct fd f;
if (i >= n)
break;
if (!(bit & all_bits))
continue;
- mask = EPOLLNVAL;
- f = fdget(i);
- if (fd_file(f)) {
- wait_key_set(wait, in, out, bit,
- busy_flag);
- mask = vfs_poll(fd_file(f), wait);
-
- fdput(f);
- }
+ mask = select_poll_one(i, wait, in, out, bit,
+ busy_flag);
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;
@@ -856,15 +855,14 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
__poll_t busy_flag)
{
int fd = pollfd->fd;
- __poll_t mask = 0, filter;
- struct fd f;
+ __poll_t mask, filter;
if (fd < 0)
- goto out;
- mask = EPOLLNVAL;
- f = fdget(fd);
- if (!fd_file(f))
- goto out;
+ return 0;
+
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return EPOLLNVAL;
/* userland u16 ->events contains POLL... bitmap */
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
@@ -872,13 +870,7 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
mask = vfs_poll(fd_file(f), pwait);
if (mask & busy_flag)
*can_busy_poll = true;
- mask &= filter; /* Mask out unneeded events. */
- fdput(f);
-
-out:
- /* ... and so does ->revents */
- pollfd->revents = mangle_poll(mask);
- return mask;
+ return mask & filter; /* Mask out unneeded events. */
}
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
@@ -910,6 +902,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
+ __poll_t mask;
/*
* Fish for events. If we found one, record it
* and kill poll_table->_qproc, so we don't
@@ -917,8 +910,9 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
* this. They'll get immediately deregistered
* when we break out and return.
*/
- if (do_pollfd(pfd, pt, &can_busy_loop,
- busy_flag)) {
+ mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag);
+ pfd->revents = mangle_poll(mask);
+ if (mask) {
count++;
pt->_qproc = NULL;
/* found something, stop busy polling */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e676c8b..8bbb1ad 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -343,8 +343,8 @@ EXPORT_SYMBOL(seq_lseek);
/**
* seq_release - free the structures associated with sequential file.
- * @file: file in question
* @inode: its inode
+ * @file: file in question
*
* Frees the structures associated with sequential file; can be used
* as ->f_op->release() if you don't have private data to destroy.
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 736bebf..d1a5f43 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -288,20 +288,17 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
fd_install(ufd, file);
} else {
- struct fd f = fdget(ufd);
- if (!fd_file(f))
+ CLASS(fd, f)(ufd);
+ if (fd_empty(f))
return -EBADF;
ctx = fd_file(f)->private_data;
- if (fd_file(f)->f_op != &signalfd_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &signalfd_fops)
return -EINVAL;
- }
spin_lock_irq(¤t->sighand->siglock);
ctx->sigmask = *mask;
spin_unlock_irq(¤t->sighand->siglock);
wake_up(¤t->sighand->signalfd_wqh);
- fdput(f);
}
return ufd;
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index 2ce1936..56439da 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -72,7 +72,6 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
unsigned long srcfd)
{
int rc;
- struct fd src_file;
struct inode *src_inode;
cifs_dbg(FYI, "ioctl copychunk range\n");
@@ -89,8 +88,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
return rc;
}
- src_file = fdget(srcfd);
- if (!fd_file(src_file)) {
+ CLASS(fd, src_file)(srcfd);
+ if (fd_empty(src_file)) {
rc = -EBADF;
goto out_drop_write;
}
@@ -98,20 +97,18 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
if (fd_file(src_file)->f_op->unlocked_ioctl != cifs_ioctl) {
rc = -EBADF;
cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
- goto out_fput;
+ goto out_drop_write;
}
src_inode = file_inode(fd_file(src_file));
rc = -EINVAL;
if (S_ISDIR(src_inode->i_mode))
- goto out_fput;
+ goto out_drop_write;
rc = cifs_file_copychunk_range(xid, fd_file(src_file), 0, dst_file, 0,
src_inode->i_size, 0);
if (rc > 0)
rc = 0;
-out_fput:
- fdput(src_file);
out_drop_write:
mnt_drop_write_file(dst_file);
return rc;
diff --git a/fs/splice.c b/fs/splice.c
index 06232d7..2898fa1 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1564,21 +1564,6 @@ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
return ret;
}
-static int vmsplice_type(struct fd f, int *type)
-{
- if (!fd_file(f))
- return -EBADF;
- if (fd_file(f)->f_mode & FMODE_WRITE) {
- *type = ITER_SOURCE;
- } else if (fd_file(f)->f_mode & FMODE_READ) {
- *type = ITER_DEST;
- } else {
- fdput(f);
- return -EBADF;
- }
- return 0;
-}
-
/*
* Note that vmsplice only really supports true splicing _from_ user memory
* to a pipe, not the other way around. Splicing from user memory is a simple
@@ -1602,21 +1587,25 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
struct iovec *iov = iovstack;
struct iov_iter iter;
ssize_t error;
- struct fd f;
int type;
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
- f = fdget(fd);
- error = vmsplice_type(f, &type);
- if (error)
- return error;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+ if (fd_file(f)->f_mode & FMODE_WRITE)
+ type = ITER_SOURCE;
+ else if (fd_file(f)->f_mode & FMODE_READ)
+ type = ITER_DEST;
+ else
+ return -EBADF;
error = import_iovec(type, uiov, nr_segs,
ARRAY_SIZE(iovstack), &iov, &iter);
if (error < 0)
- goto out_fdput;
+ return error;
if (!iov_iter_count(&iter))
error = 0;
@@ -1626,8 +1615,6 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
error = vmsplice_to_user(fd_file(f), &iter, flags);
kfree(iov);
-out_fdput:
- fdput(f);
return error;
}
@@ -1635,27 +1622,22 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
int, fd_out, loff_t __user *, off_out,
size_t, len, unsigned int, flags)
{
- struct fd in, out;
- ssize_t error;
-
if (unlikely(!len))
return 0;
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
- error = -EBADF;
- in = fdget(fd_in);
- if (fd_file(in)) {
- out = fdget(fd_out);
- if (fd_file(out)) {
- error = __do_splice(fd_file(in), off_in, fd_file(out), off_out,
+ CLASS(fd, in)(fd_in);
+ if (fd_empty(in))
+ return -EBADF;
+
+ CLASS(fd, out)(fd_out);
+ if (fd_empty(out))
+ return -EBADF;
+
+ return __do_splice(fd_file(in), off_in, fd_file(out), off_out,
len, flags);
- fdput(out);
- }
- fdput(in);
- }
- return error;
}
/*
@@ -2005,25 +1987,19 @@ ssize_t do_tee(struct file *in, struct file *out, size_t len,
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
- struct fd in, out;
- ssize_t error;
-
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
if (unlikely(!len))
return 0;
- error = -EBADF;
- in = fdget(fdin);
- if (fd_file(in)) {
- out = fdget(fdout);
- if (fd_file(out)) {
- error = do_tee(fd_file(in), fd_file(out), len, flags);
- fdput(out);
- }
- fdput(in);
- }
+ CLASS(fd, in)(fdin);
+ if (fd_empty(in))
+ return -EBADF;
- return error;
+ CLASS(fd, out)(fdout);
+ if (fd_empty(out))
+ return -EBADF;
+
+ return do_tee(fd_file(in), fd_file(out), len, flags);
}
diff --git a/fs/stat.c b/fs/stat.c
index 41e5983..0870e96 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -23,10 +23,46 @@
#include <linux/uaccess.h>
#include <asm/unistd.h>
+#include <trace/events/timestamp.h>
+
#include "internal.h"
#include "mount.h"
/**
+ * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
+ * @stat: where to store the resulting values
+ * @request_mask: STATX_* values requested
+ * @inode: inode from which to grab the c/mtime
+ *
+ * Given @inode, grab the ctime and mtime out if it and store the result
+ * in @stat. When fetching the value, flag it as QUERIED (if not already)
+ * so the next write will record a distinct timestamp.
+ *
+ * NB: The QUERIED flag is tracked in the ctime, but we set it there even
+ * if only the mtime was requested, as that ensures that the next mtime
+ * change will be distinct.
+ */
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
+{
+ atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec;
+
+ /* If neither time was requested, then don't report them */
+ if (!(request_mask & (STATX_CTIME|STATX_MTIME))) {
+ stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
+ return;
+ }
+
+ stat->mtime = inode_get_mtime(inode);
+ stat->ctime.tv_sec = inode->i_ctime_sec;
+ stat->ctime.tv_nsec = (u32)atomic_read(pcn);
+ if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED))
+ stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn));
+ stat->ctime.tv_nsec &= ~I_CTIME_QUERIED;
+ trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime);
+}
+EXPORT_SYMBOL(fill_mg_cmtime);
+
+/**
* generic_fillattr - Fill in the basic attributes from the inode struct
* @idmap: idmap of the mount the inode was found from
* @request_mask: statx request_mask
@@ -58,8 +94,14 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
stat->atime = inode_get_atime(inode);
- stat->mtime = inode_get_mtime(inode);
- stat->ctime = inode_get_ctime(inode);
+
+ if (is_mgtime(inode)) {
+ fill_mg_cmtime(stat, request_mask, inode);
+ } else {
+ stat->ctime = inode_get_ctime(inode);
+ stat->mtime = inode_get_mtime(inode);
+ }
+
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
@@ -165,7 +207,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
if (inode->i_op->getattr)
return inode->i_op->getattr(idmap, path, stat,
request_mask,
- query_flags | AT_GETATTR_NOSEC);
+ query_flags);
generic_fillattr(idmap, request_mask, inode, stat);
return 0;
@@ -198,9 +240,6 @@ int vfs_getattr(const struct path *path, struct kstat *stat,
{
int retval;
- if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC))
- return -EPERM;
-
retval = security_inode_getattr(path);
if (retval)
return retval;
@@ -220,18 +259,13 @@ EXPORT_SYMBOL(vfs_getattr);
*/
int vfs_fstat(int fd, struct kstat *stat)
{
- struct fd f;
- int error;
-
- f = fdget_raw(fd);
- if (!fd_file(f))
+ CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- error = vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0);
- fdput(f);
- return error;
+ return vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0);
}
-int getname_statx_lookup_flags(int flags)
+static int statx_lookup_flags(int flags)
{
int lookup_flags = 0;
@@ -239,8 +273,6 @@ int getname_statx_lookup_flags(int flags)
lookup_flags |= LOOKUP_FOLLOW;
if (!(flags & AT_NO_AUTOMOUNT))
lookup_flags |= LOOKUP_AUTOMOUNT;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
return lookup_flags;
}
@@ -277,7 +309,7 @@ static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
u32 request_mask)
{
CLASS(fd_raw, f)(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
return vfs_statx_path(&fd_file(f)->f_path, flags, stat, request_mask);
}
@@ -301,7 +333,7 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
struct kstat *stat, u32 request_mask)
{
struct path path;
- unsigned int lookup_flags = getname_statx_lookup_flags(flags);
+ unsigned int lookup_flags = statx_lookup_flags(flags);
int error;
if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
@@ -326,18 +358,11 @@ int vfs_fstatat(int dfd, const char __user *filename,
{
int ret;
int statx_flags = flags | AT_NO_AUTOMOUNT;
- struct filename *name;
+ struct filename *name = getname_maybe_null(filename, flags);
- /*
- * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH)
- *
- * If AT_EMPTY_PATH is set, we expect the common case to be that
- * empty path, and avoid doing all the extra pathname work.
- */
- if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+ if (!name && dfd >= 0)
return vfs_fstat(dfd, stat);
- name = getname_flags(filename, getname_statx_lookup_flags(statx_flags));
ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
putname(name);
@@ -774,24 +799,11 @@ SYSCALL_DEFINE5(statx,
struct statx __user *, buffer)
{
int ret;
- unsigned lflags;
- struct filename *name;
+ struct filename *name = getname_maybe_null(filename, flags);
- /*
- * Short-circuit handling of NULL and "" paths.
- *
- * For a NULL path we require and accept only the AT_EMPTY_PATH flag
- * (possibly |'d with AT_STATX flags).
- *
- * However, glibc on 32-bit architectures implements fstatat as statx
- * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags.
- * Supporting this results in the uglification below.
- */
- lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE);
- if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+ if (!name && dfd >= 0)
return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer);
- name = getname_flags(filename, getname_statx_lookup_flags(flags));
ret = do_statx(dfd, name, flags, mask, buffer);
putname(name);
diff --git a/fs/statfs.c b/fs/statfs.c
index 9c7bb27..a45ac85 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -114,13 +114,11 @@ int user_statfs(const char __user *pathname, struct kstatfs *st)
int fd_statfs(int fd, struct kstatfs *st)
{
- struct fd f = fdget_raw(fd);
- int error = -EBADF;
- if (fd_file(f)) {
- error = vfs_statfs(&fd_file(f)->f_path, st);
- fdput(f);
- }
- return error;
+ CLASS(fd_raw, f)(fd);
+
+ if (fd_empty(f))
+ return -EBADF;
+ return vfs_statfs(&fd_file(f)->f_path, st);
}
static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
diff --git a/fs/sync.c b/fs/sync.c
index 67df255..2955cd4 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -148,11 +148,11 @@ void emergency_sync(void)
*/
SYSCALL_DEFINE1(syncfs, int, fd)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
struct super_block *sb;
int ret, ret2;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
sb = fd_file(f)->f_path.dentry->d_sb;
@@ -162,7 +162,6 @@ SYSCALL_DEFINE1(syncfs, int, fd)
ret2 = errseq_check_and_advance(&sb->s_wb_err, &fd_file(f)->f_sb_err);
- fdput(f);
return ret ? ret : ret2;
}
@@ -205,14 +204,12 @@ EXPORT_SYMBOL(vfs_fsync);
static int do_fsync(unsigned int fd, int datasync)
{
- struct fd f = fdget(fd);
- int ret = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- ret = vfs_fsync(fd_file(f), datasync);
- fdput(f);
- }
- return ret;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fsync(fd_file(f), datasync);
}
SYSCALL_DEFINE1(fsync, unsigned int, fd)
@@ -355,16 +352,12 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
unsigned int flags)
{
- int ret;
- struct fd f;
+ CLASS(fd, f)(fd);
- ret = -EBADF;
- f = fdget(fd);
- if (fd_file(f))
- ret = sync_file_range(fd_file(f), offset, nbytes, flags);
+ if (fd_empty(f))
+ return -EBADF;
- fdput(f);
- return ret;
+ return sync_file_range(fd_file(f), offset, nbytes, flags);
}
SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 137523e..4c32244 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -394,19 +394,6 @@ static const struct file_operations timerfd_fops = {
.unlocked_ioctl = timerfd_ioctl,
};
-static int timerfd_fget(int fd, struct fd *p)
-{
- struct fd f = fdget(fd);
- if (!fd_file(f))
- return -EBADF;
- if (fd_file(f)->f_op != &timerfd_fops) {
- fdput(f);
- return -EINVAL;
- }
- *p = f;
- return 0;
-}
-
SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
{
int ufd;
@@ -471,7 +458,6 @@ static int do_timerfd_settime(int ufd, int flags,
const struct itimerspec64 *new,
struct itimerspec64 *old)
{
- struct fd f;
struct timerfd_ctx *ctx;
int ret;
@@ -479,15 +465,17 @@ static int do_timerfd_settime(int ufd, int flags,
!itimerspec64_valid(new))
return -EINVAL;
- ret = timerfd_fget(ufd, &f);
- if (ret)
- return ret;
+ CLASS(fd, f)(ufd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ if (fd_file(f)->f_op != &timerfd_fops)
+ return -EINVAL;
+
ctx = fd_file(f)->private_data;
- if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) {
- fdput(f);
+ if (isalarm(ctx) && !capable(CAP_WAKE_ALARM))
return -EPERM;
- }
timerfd_setup_cancel(ctx, flags);
@@ -535,17 +523,18 @@ static int do_timerfd_settime(int ufd, int flags,
ret = timerfd_setup(ctx, flags, new);
spin_unlock_irq(&ctx->wqh.lock);
- fdput(f);
return ret;
}
static int do_timerfd_gettime(int ufd, struct itimerspec64 *t)
{
- struct fd f;
struct timerfd_ctx *ctx;
- int ret = timerfd_fget(ufd, &f);
- if (ret)
- return ret;
+ CLASS(fd, f)(ufd);
+
+ if (fd_empty(f))
+ return -EBADF;
+ if (fd_file(f)->f_op != &timerfd_fops)
+ return -EINVAL;
ctx = fd_file(f)->private_data;
spin_lock_irq(&ctx->wqh.lock);
@@ -567,7 +556,6 @@ static int do_timerfd_gettime(int ufd, struct itimerspec64 *t)
t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx));
t->it_interval = ktime_to_timespec64(ctx->tintv);
spin_unlock_irq(&ctx->wqh.lock);
- fdput(f);
return 0;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 2915830..3fb308b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -19,9 +19,9 @@
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/kthread.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/seq_file.h>
-#include <linux/mount.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include "ubifs.h"
@@ -981,177 +981,120 @@ enum {
Opt_auth_key,
Opt_auth_hash_name,
Opt_ignore,
- Opt_err,
};
-static const match_table_t tokens = {
- {Opt_fast_unmount, "fast_unmount"},
- {Opt_norm_unmount, "norm_unmount"},
- {Opt_bulk_read, "bulk_read"},
- {Opt_no_bulk_read, "no_bulk_read"},
- {Opt_chk_data_crc, "chk_data_crc"},
- {Opt_no_chk_data_crc, "no_chk_data_crc"},
- {Opt_override_compr, "compr=%s"},
- {Opt_auth_key, "auth_key=%s"},
- {Opt_auth_hash_name, "auth_hash_name=%s"},
- {Opt_ignore, "ubi=%s"},
- {Opt_ignore, "vol=%s"},
- {Opt_assert, "assert=%s"},
- {Opt_err, NULL},
+static const struct constant_table ubifs_param_compr[] = {
+ { "none", UBIFS_COMPR_NONE },
+ { "lzo", UBIFS_COMPR_LZO },
+ { "zlib", UBIFS_COMPR_ZLIB },
+ { "zstd", UBIFS_COMPR_ZSTD },
+ {}
+};
+
+static const struct constant_table ubifs_param_assert[] = {
+ { "report", ASSACT_REPORT },
+ { "read-only", ASSACT_RO },
+ { "panic", ASSACT_PANIC },
+ {}
+};
+
+static const struct fs_parameter_spec ubifs_fs_param_spec[] = {
+ fsparam_flag ("fast_unmount", Opt_fast_unmount),
+ fsparam_flag ("norm_unmount", Opt_norm_unmount),
+ fsparam_flag ("bulk_read", Opt_bulk_read),
+ fsparam_flag ("no_bulk_read", Opt_no_bulk_read),
+ fsparam_flag ("chk_data_crc", Opt_chk_data_crc),
+ fsparam_flag ("no_chk_data_crc", Opt_no_chk_data_crc),
+ fsparam_enum ("compr", Opt_override_compr, ubifs_param_compr),
+ fsparam_enum ("assert", Opt_assert, ubifs_param_assert),
+ fsparam_string ("auth_key", Opt_auth_key),
+ fsparam_string ("auth_hash_name", Opt_auth_hash_name),
+ fsparam_string ("ubi", Opt_ignore),
+ fsparam_string ("vol", Opt_ignore),
+ {}
+};
+
+struct ubifs_fs_context {
+ struct ubifs_mount_opts mount_opts;
+ char *auth_key_name;
+ char *auth_hash_name;
+ unsigned int no_chk_data_crc:1;
+ unsigned int bulk_read:1;
+ unsigned int default_compr:2;
+ unsigned int assert_action:2;
};
/**
- * parse_standard_option - parse a standard mount option.
- * @option: the option to parse
- *
- * Normally, standard mount options like "sync" are passed to file-systems as
- * flags. However, when a "rootflags=" kernel boot parameter is used, they may
- * be present in the options string. This function tries to deal with this
- * situation and parse standard options. Returns 0 if the option was not
- * recognized, and the corresponding integer flag if it was.
- *
- * UBIFS is only interested in the "sync" option, so do not check for anything
- * else.
- */
-static int parse_standard_option(const char *option)
-{
-
- pr_notice("UBIFS: parse %s\n", option);
- if (!strcmp(option, "sync"))
- return SB_SYNCHRONOUS;
- return 0;
-}
-
-/**
- * ubifs_parse_options - parse mount parameters.
- * @c: UBIFS file-system description object
- * @options: parameters to parse
- * @is_remount: non-zero if this is FS re-mount
+ * ubifs_parse_param - parse a parameter.
+ * @fc: the filesystem context
+ * @param: the parameter to parse
*
* This function parses UBIFS mount options and returns zero in case success
* and a negative error code in case of failure.
*/
-static int ubifs_parse_options(struct ubifs_info *c, char *options,
- int is_remount)
+static int ubifs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
+ struct ubifs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE);
+ int opt;
- if (!options)
- return 0;
+ opt = fs_parse(fc, ubifs_fs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
- while ((p = strsep(&options, ","))) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
+ switch (opt) {
/*
* %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
* We accept them in order to be backward-compatible. But this
* should be removed at some point.
*/
- case Opt_fast_unmount:
- c->mount_opts.unmount_mode = 2;
- break;
- case Opt_norm_unmount:
- c->mount_opts.unmount_mode = 1;
- break;
- case Opt_bulk_read:
- c->mount_opts.bulk_read = 2;
- c->bulk_read = 1;
- break;
- case Opt_no_bulk_read:
- c->mount_opts.bulk_read = 1;
- c->bulk_read = 0;
- break;
- case Opt_chk_data_crc:
- c->mount_opts.chk_data_crc = 2;
- c->no_chk_data_crc = 0;
- break;
- case Opt_no_chk_data_crc:
- c->mount_opts.chk_data_crc = 1;
- c->no_chk_data_crc = 1;
- break;
- case Opt_override_compr:
- {
- char *name = match_strdup(&args[0]);
-
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "none"))
- c->mount_opts.compr_type = UBIFS_COMPR_NONE;
- else if (!strcmp(name, "lzo"))
- c->mount_opts.compr_type = UBIFS_COMPR_LZO;
- else if (!strcmp(name, "zlib"))
- c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
- else if (!strcmp(name, "zstd"))
- c->mount_opts.compr_type = UBIFS_COMPR_ZSTD;
- else {
- ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready?
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- c->mount_opts.override_compr = 1;
- c->default_compr = c->mount_opts.compr_type;
- break;
+ case Opt_fast_unmount:
+ ctx->mount_opts.unmount_mode = 2;
+ break;
+ case Opt_norm_unmount:
+ ctx->mount_opts.unmount_mode = 1;
+ break;
+ case Opt_bulk_read:
+ ctx->mount_opts.bulk_read = 2;
+ ctx->bulk_read = 1;
+ break;
+ case Opt_no_bulk_read:
+ ctx->mount_opts.bulk_read = 1;
+ ctx->bulk_read = 0;
+ break;
+ case Opt_chk_data_crc:
+ ctx->mount_opts.chk_data_crc = 2;
+ ctx->no_chk_data_crc = 0;
+ break;
+ case Opt_no_chk_data_crc:
+ ctx->mount_opts.chk_data_crc = 1;
+ ctx->no_chk_data_crc = 1;
+ break;
+ case Opt_override_compr:
+ ctx->mount_opts.compr_type = result.uint_32;
+ ctx->mount_opts.override_compr = 1;
+ ctx->default_compr = ctx->mount_opts.compr_type;
+ break;
+ case Opt_assert:
+ ctx->assert_action = result.uint_32;
+ break;
+ case Opt_auth_key:
+ if (!is_remount) {
+ kfree(ctx->auth_key_name);
+ ctx->auth_key_name = param->string;
+ param->string = NULL;
}
- case Opt_assert:
- {
- char *act = match_strdup(&args[0]);
-
- if (!act)
- return -ENOMEM;
- if (!strcmp(act, "report"))
- c->assert_action = ASSACT_REPORT;
- else if (!strcmp(act, "read-only"))
- c->assert_action = ASSACT_RO;
- else if (!strcmp(act, "panic"))
- c->assert_action = ASSACT_PANIC;
- else {
- ubifs_err(c, "unknown assert action \"%s\"", act);
- kfree(act);
- return -EINVAL;
- }
- kfree(act);
- break;
+ break;
+ case Opt_auth_hash_name:
+ if (!is_remount) {
+ kfree(ctx->auth_hash_name);
+ ctx->auth_hash_name = param->string;
+ param->string = NULL;
}
- case Opt_auth_key:
- if (!is_remount) {
- c->auth_key_name = kstrdup(args[0].from,
- GFP_KERNEL);
- if (!c->auth_key_name)
- return -ENOMEM;
- }
- break;
- case Opt_auth_hash_name:
- if (!is_remount) {
- c->auth_hash_name = kstrdup(args[0].from,
- GFP_KERNEL);
- if (!c->auth_hash_name)
- return -ENOMEM;
- }
- break;
- case Opt_ignore:
- break;
- default:
- {
- unsigned long flag;
- struct super_block *sb = c->vfs_sb;
-
- flag = parse_standard_option(p);
- if (!flag) {
- ubifs_err(c, "unrecognized mount option \"%s\" or missing value",
- p);
- return -EINVAL;
- }
- sb->s_flags |= flag;
- break;
- }
- }
+ break;
+ case Opt_ignore:
+ break;
}
return 0;
@@ -2003,21 +1946,27 @@ static void ubifs_put_super(struct super_block *sb)
mutex_unlock(&c->umount_mutex);
}
-static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
+static int ubifs_reconfigure(struct fs_context *fc)
{
+ struct ubifs_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
int err;
struct ubifs_info *c = sb->s_fs_info;
sync_filesystem(sb);
- dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
+ dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, fc->sb_flags);
- err = ubifs_parse_options(c, data, 1);
- if (err) {
- ubifs_err(c, "invalid or unknown remount parameter");
- return err;
- }
+ /*
+ * Apply the mount option changes.
+ * auth_key_name and auth_hash_name are ignored on remount.
+ */
+ c->mount_opts = ctx->mount_opts;
+ c->bulk_read = ctx->bulk_read;
+ c->no_chk_data_crc = ctx->no_chk_data_crc;
+ c->default_compr = ctx->default_compr;
+ c->assert_action = ctx->assert_action;
- if (c->ro_mount && !(*flags & SB_RDONLY)) {
+ if (c->ro_mount && !(fc->sb_flags & SB_RDONLY)) {
if (c->ro_error) {
ubifs_msg(c, "cannot re-mount R/W due to prior errors");
return -EROFS;
@@ -2029,7 +1978,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
err = ubifs_remount_rw(c);
if (err)
return err;
- } else if (!c->ro_mount && (*flags & SB_RDONLY)) {
+ } else if (!c->ro_mount && (fc->sb_flags & SB_RDONLY)) {
if (c->ro_error) {
ubifs_msg(c, "cannot re-mount R/O due to prior errors");
return -EROFS;
@@ -2062,14 +2011,13 @@ const struct super_operations ubifs_super_operations = {
.evict_inode = ubifs_evict_inode,
.statfs = ubifs_statfs,
.dirty_inode = ubifs_dirty_inode,
- .remount_fs = ubifs_remount_fs,
.show_options = ubifs_show_options,
.sync_fs = ubifs_sync_fs,
};
/**
* open_ubi - parse UBI device name string and open the UBI device.
- * @name: UBI volume name
+ * @fc: The filesystem context
* @mode: UBI volume open mode
*
* The primary method of mounting UBIFS is by specifying the UBI volume
@@ -2086,15 +2034,13 @@ const struct super_operations ubifs_super_operations = {
* returns UBI volume description object in case of success and a negative
* error code in case of failure.
*/
-static struct ubi_volume_desc *open_ubi(const char *name, int mode)
+static struct ubi_volume_desc *open_ubi(struct fs_context *fc, int mode)
{
struct ubi_volume_desc *ubi;
+ const char *name = fc->source;
int dev, vol;
char *endptr;
- if (!name || !*name)
- return ERR_PTR(-EINVAL);
-
/* First, try to open using the device node path method */
ubi = ubi_open_volume_path(name, mode);
if (!IS_ERR(ubi))
@@ -2102,14 +2048,14 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
/* Try the "nodev" method */
if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
- return ERR_PTR(-EINVAL);
+ goto invalid_source;
/* ubi:NAME method */
if ((name[3] == ':' || name[3] == '!') && name[4] != '\0')
return ubi_open_volume_nm(0, name + 4, mode);
if (!isdigit(name[3]))
- return ERR_PTR(-EINVAL);
+ goto invalid_source;
dev = simple_strtoul(name + 3, &endptr, 0);
@@ -2121,7 +2067,7 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
if (*endptr == '_' && isdigit(endptr[1])) {
vol = simple_strtoul(endptr + 1, &endptr, 0);
if (*endptr != '\0')
- return ERR_PTR(-EINVAL);
+ goto invalid_source;
return ubi_open_volume(dev, vol, mode);
}
@@ -2129,7 +2075,8 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0')
return ubi_open_volume_nm(dev, ++endptr, mode);
- return ERR_PTR(-EINVAL);
+invalid_source:
+ return ERR_PTR(invalf(fc, "Invalid source name"));
}
static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
@@ -2181,9 +2128,10 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
return c;
}
-static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
+static int ubifs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct ubifs_info *c = sb->s_fs_info;
+ struct ubifs_fs_context *ctx = fc->fs_private;
struct inode *root;
int err;
@@ -2195,9 +2143,18 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
}
- err = ubifs_parse_options(c, data, 0);
- if (err)
- goto out_close;
+ /* Copy in parsed mount options */
+ c->mount_opts = ctx->mount_opts;
+ c->auth_key_name = ctx->auth_key_name;
+ c->auth_hash_name = ctx->auth_hash_name;
+ c->no_chk_data_crc = ctx->no_chk_data_crc;
+ c->bulk_read = ctx->bulk_read;
+ c->default_compr = ctx->default_compr;
+ c->assert_action = ctx->assert_action;
+
+ /* ubifs_info owns auth strings now */
+ ctx->auth_key_name = NULL;
+ ctx->auth_hash_name = NULL;
/*
* UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
@@ -2264,41 +2221,38 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
return err;
}
-static int sb_test(struct super_block *sb, void *data)
+static int sb_test(struct super_block *sb, struct fs_context *fc)
{
- struct ubifs_info *c1 = data;
+ struct ubifs_info *c1 = fc->s_fs_info;
struct ubifs_info *c = sb->s_fs_info;
return c->vi.cdev == c1->vi.cdev;
}
-static int sb_set(struct super_block *sb, void *data)
-{
- sb->s_fs_info = data;
- return set_anon_super(sb, NULL);
-}
-
-static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
- const char *name, void *data)
+static int ubifs_get_tree(struct fs_context *fc)
{
struct ubi_volume_desc *ubi;
struct ubifs_info *c;
struct super_block *sb;
int err;
- dbg_gen("name %s, flags %#x", name, flags);
+ if (!fc->source || !*fc->source)
+ return invalf(fc, "No source specified");
+
+ dbg_gen("name %s, flags %#x", fc->source, fc->sb_flags);
/*
* Get UBI device number and volume ID. Mount it read-only so far
* because this might be a new mount point, and UBI allows only one
* read-write user at a time.
*/
- ubi = open_ubi(name, UBI_READONLY);
+ ubi = open_ubi(fc, UBI_READONLY);
if (IS_ERR(ubi)) {
- if (!(flags & SB_SILENT))
+ err = PTR_ERR(ubi);
+ if (!(fc->sb_flags & SB_SILENT))
pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
- current->pid, name, (int)PTR_ERR(ubi));
- return ERR_CAST(ubi);
+ current->pid, fc->source, err);
+ return err;
}
c = alloc_ubifs_info(ubi);
@@ -2306,10 +2260,11 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
err = -ENOMEM;
goto out_close;
}
+ fc->s_fs_info = c;
dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
- sb = sget(fs_type, sb_test, sb_set, flags, c);
+ sb = sget_fc(fc, sb_test, set_anon_super_fc);
if (IS_ERR(sb)) {
err = PTR_ERR(sb);
kfree(c);
@@ -2321,12 +2276,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
kfree(c);
/* A new mount point for already mounted UBIFS */
dbg_gen("this ubi volume is already mounted");
- if (!!(flags & SB_RDONLY) != c1->ro_mount) {
+ if (!!(fc->sb_flags & SB_RDONLY) != c1->ro_mount) {
err = -EBUSY;
goto out_deact;
}
} else {
- err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
+ err = ubifs_fill_super(sb, fc);
if (err)
goto out_deact;
/* We do not support atime */
@@ -2340,13 +2295,14 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
/* 'fill_super()' opens ubi again so we must close it here */
ubi_close_volume(ubi);
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+ return 0;
out_deact:
deactivate_locked_super(sb);
out_close:
ubi_close_volume(ubi);
- return ERR_PTR(err);
+ return err;
}
static void kill_ubifs_super(struct super_block *s)
@@ -2356,10 +2312,61 @@ static void kill_ubifs_super(struct super_block *s)
kfree(c);
}
+static void ubifs_free_fc(struct fs_context *fc)
+{
+ struct ubifs_fs_context *ctx = fc->fs_private;
+
+ if (ctx) {
+ kfree(ctx->auth_key_name);
+ kfree(ctx->auth_hash_name);
+ kfree(ctx);
+ }
+}
+
+static const struct fs_context_operations ubifs_context_ops = {
+ .free = ubifs_free_fc,
+ .parse_param = ubifs_parse_param,
+ .get_tree = ubifs_get_tree,
+ .reconfigure = ubifs_reconfigure,
+};
+
+static int ubifs_init_fs_context(struct fs_context *fc)
+{
+ struct ubifs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct ubifs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
+ /* Iniitialize for first mount */
+ ctx->no_chk_data_crc = 1;
+ ctx->assert_action = ASSACT_RO;
+ } else {
+ struct ubifs_info *c = fc->root->d_sb->s_fs_info;
+
+ /*
+ * Preserve existing options across remounts.
+ * auth_key_name and auth_hash_name are not remountable.
+ */
+ ctx->mount_opts = c->mount_opts;
+ ctx->bulk_read = c->bulk_read;
+ ctx->no_chk_data_crc = c->no_chk_data_crc;
+ ctx->default_compr = c->default_compr;
+ ctx->assert_action = c->assert_action;
+ }
+
+ fc->ops = &ubifs_context_ops;
+ fc->fs_private = ctx;
+
+ return 0;
+}
+
static struct file_system_type ubifs_fs_type = {
.name = "ubifs",
.owner = THIS_MODULE,
- .mount = ubifs_mount,
+ .init_fs_context = ubifs_init_fs_context,
+ .parameters = ubifs_fs_param_spec,
.kill_sb = kill_ubifs_super,
};
MODULE_ALIAS_FS("ubifs");
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 53c11be..194ed3a 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -33,6 +33,29 @@ static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *
static unsigned char ufs_fragtable_8fpb[], ufs_fragtable_other[];
static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, unsigned, int);
+static void adjust_free_blocks(struct super_block *sb,
+ struct ufs_cylinder_group *ucg,
+ struct ufs_cg_private_info *ucpi,
+ unsigned fragment, int delta)
+{
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+
+ if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
+ ufs_clusteracct(sb, ucpi, fragment, delta);
+
+ fs32_add(sb, &ucg->cg_cs.cs_nbfree, delta);
+ uspi->cs_total.cs_nbfree += delta;
+ fs32_add(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, delta);
+
+ if (uspi->fs_magic != UFS2_MAGIC) {
+ unsigned cylno = ufs_cbtocylno(fragment);
+
+ fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
+ ufs_cbtorpos(fragment)), delta);
+ fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), delta);
+ }
+}
+
/*
* Free 'count' fragments from fragment number 'fragment'
*/
@@ -43,7 +66,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned cgno, bit, end_bit, bbase, blkmap, i;
- u64 blkno;
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
@@ -51,7 +73,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
UFSD("ENTER, fragment %llu, count %u\n",
(unsigned long long)fragment, count);
- if (ufs_fragnum(fragment) + count > uspi->s_fpg)
+ if (ufs_fragnum(fragment) + count > uspi->s_fpb)
ufs_error (sb, "ufs_free_fragments", "internal error");
mutex_lock(&UFS_SB(sb)->s_lock);
@@ -94,23 +116,11 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
/*
* Trying to reassemble free fragments into block
*/
- blkno = ufs_fragstoblks (bbase);
- if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
+ if (ubh_isblockset(uspi, ucpi, bbase)) {
fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
uspi->cs_total.cs_nffree -= uspi->s_fpb;
fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
- if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
- ufs_clusteracct (sb, ucpi, blkno, 1);
- fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
- uspi->cs_total.cs_nbfree++;
- fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
- if (uspi->fs_magic != UFS2_MAGIC) {
- unsigned cylno = ufs_cbtocylno (bbase);
-
- fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
- ufs_cbtorpos(bbase)), 1);
- fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
- }
+ adjust_free_blocks(sb, ucg, ucpi, bbase, 1);
}
ubh_mark_buffer_dirty (USPI_UBH(uspi));
@@ -139,7 +149,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned overflow, cgno, bit, end_bit, i;
- u64 blkno;
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
@@ -181,26 +190,12 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
}
for (i = bit; i < end_bit; i += uspi->s_fpb) {
- blkno = ufs_fragstoblks(i);
- if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
+ if (ubh_isblockset(uspi, ucpi, i)) {
ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
}
- ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
+ ubh_setblock(uspi, ucpi, i);
inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift);
- if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
- ufs_clusteracct (sb, ucpi, blkno, 1);
-
- fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
- uspi->cs_total.cs_nbfree++;
- fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
-
- if (uspi->fs_magic != UFS2_MAGIC) {
- unsigned cylno = ufs_cbtocylno(i);
-
- fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
- ufs_cbtorpos(i)), 1);
- fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
- }
+ adjust_free_blocks(sb, ucg, ucpi, i, 1);
}
ubh_mark_buffer_dirty (USPI_UBH(uspi));
@@ -234,13 +229,13 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
* situated at the end of file.
*
* We can come here from ufs_writepage or ufs_prepare_write,
- * locked_page is argument of these functions, so we already lock it.
+ * locked_folio is argument of these functions, so we already lock it.
*/
static void ufs_change_blocknr(struct inode *inode, sector_t beg,
unsigned int count, sector_t oldb,
- sector_t newb, struct page *locked_page)
+ sector_t newb, struct folio *locked_folio)
{
- struct folio *folio, *locked_folio = page_folio(locked_page);
+ struct folio *folio;
const unsigned blks_per_page =
1 << (PAGE_SHIFT - inode->i_blkbits);
const unsigned mask = blks_per_page - 1;
@@ -337,7 +332,7 @@ static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n,
u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
u64 goal, unsigned count, int *err,
- struct page *locked_page)
+ struct folio *locked_folio)
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
@@ -417,7 +412,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
result = ufs_alloc_fragments (inode, cgno, goal, count, err);
if (result) {
ufs_clear_frags(inode, result + oldcount,
- newcount - oldcount, locked_page != NULL);
+ newcount - oldcount, locked_folio != NULL);
*err = 0;
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
@@ -441,7 +436,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
fragment + count);
read_sequnlock_excl(&UFS_I(inode)->meta_lock);
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
- locked_page != NULL);
+ locked_folio != NULL);
mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
@@ -462,11 +457,11 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
result = ufs_alloc_fragments (inode, cgno, goal, request, err);
if (result) {
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
- locked_page != NULL);
+ locked_folio != NULL);
mutex_unlock(&UFS_SB(sb)->s_lock);
ufs_change_blocknr(inode, fragment - oldcount, oldcount,
uspi->s_sbbase + tmp,
- uspi->s_sbbase + result, locked_page);
+ uspi->s_sbbase + result, locked_folio);
*err = 0;
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
@@ -698,7 +693,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
struct super_block * sb;
struct ufs_sb_private_info * uspi;
struct ufs_cylinder_group * ucg;
- u64 result, blkno;
+ u64 result;
UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
@@ -716,7 +711,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
/*
* If the requested block is available, use it.
*/
- if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) {
+ if (ubh_isblockset(uspi, ucpi, goal)) {
result = goal;
goto gotit;
}
@@ -729,22 +724,8 @@ static u64 ufs_alloccg_block(struct inode *inode,
gotit:
if (!try_add_frags(inode, uspi->s_fpb))
return 0;
- blkno = ufs_fragstoblks(result);
- ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
- if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
- ufs_clusteracct (sb, ucpi, blkno, -1);
-
- fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
- uspi->cs_total.cs_nbfree--;
- fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1);
-
- if (uspi->fs_magic != UFS2_MAGIC) {
- unsigned cylno = ufs_cbtocylno((unsigned)result);
-
- fs16_sub(sb, &ubh_cg_blks(ucpi, cylno,
- ufs_cbtorpos((unsigned)result)), 1);
- fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1);
- }
+ ubh_clrblock(uspi, ucpi, result);
+ adjust_free_blocks(sb, ucg, ucpi, result, -1);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
@@ -863,12 +844,12 @@ static u64 ufs_bitmap_search(struct super_block *sb,
}
static void ufs_clusteracct(struct super_block * sb,
- struct ufs_cg_private_info * ucpi, unsigned blkno, int cnt)
+ struct ufs_cg_private_info * ucpi, unsigned frag, int cnt)
{
- struct ufs_sb_private_info * uspi;
+ struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
int i, start, end, forw, back;
+ unsigned blkno = ufs_fragstoblks(frag);
- uspi = UFS_SB(sb)->s_uspi;
if (uspi->s_contigsumsize <= 0)
return;
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 1abe545..a281327 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -26,7 +26,7 @@
* Read cylinder group into cache. The memory space for ufs_cg_private_info
* structure is already allocated during ufs_read_super.
*/
-static void ufs_read_cylinder (struct super_block * sb,
+static bool ufs_read_cylinder(struct super_block *sb,
unsigned cgno, unsigned bitmap_nr)
{
struct ufs_sb_info * sbi = UFS_SB(sb);
@@ -46,9 +46,11 @@ static void ufs_read_cylinder (struct super_block * sb,
* We have already the first fragment of cylinder group block in buffer
*/
UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno];
- for (i = 1; i < UCPI_UBH(ucpi)->count; i++)
- if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i)))
+ for (i = 1; i < UCPI_UBH(ucpi)->count; i++) {
+ UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i);
+ if (!UCPI_UBH(ucpi)->bh[i])
goto failed;
+ }
sbi->s_cgno[bitmap_nr] = cgno;
ucpi->c_cgx = fs32_to_cpu(sb, ucg->cg_cgx);
@@ -67,13 +69,14 @@ static void ufs_read_cylinder (struct super_block * sb,
ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff);
ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks);
UFSD("EXIT\n");
- return;
+ return true;
failed:
for (j = 1; j < i; j++)
- brelse (sbi->s_ucg[j]);
+ brelse(UCPI_UBH(ucpi)->bh[j]);
sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
ufs_error (sb, "ufs_read_cylinder", "can't read cylinder group block %u", cgno);
+ return false;
}
/*
@@ -156,15 +159,14 @@ struct ufs_cg_private_info * ufs_load_cylinder (
UFSD("EXIT (FAILED)\n");
return NULL;
}
- else {
- UFSD("EXIT\n");
- return sbi->s_ucpi[cgno];
- }
} else {
- ufs_read_cylinder (sb, cgno, cgno);
- UFSD("EXIT\n");
- return sbi->s_ucpi[cgno];
+ if (unlikely(!ufs_read_cylinder (sb, cgno, cgno))) {
+ UFSD("EXIT (FAILED)\n");
+ return NULL;
+ }
}
+ UFSD("EXIT\n");
+ return sbi->s_ucpi[cgno];
}
/*
* Cylinder group number cg is in cache but it was not last used,
@@ -195,7 +197,10 @@ struct ufs_cg_private_info * ufs_load_cylinder (
sbi->s_ucpi[j] = sbi->s_ucpi[j-1];
}
sbi->s_ucpi[0] = ucpi;
- ufs_read_cylinder (sb, cgno, 0);
+ if (unlikely(!ufs_read_cylinder (sb, cgno, 0))) {
+ UFSD("EXIT (FAILED)\n");
+ return NULL;
+ }
}
UFSD("EXIT\n");
return sbi->s_ucpi[0];
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index d6e6a21..88d0062 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -81,10 +81,9 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
}
-/* Releases the page */
-void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- struct folio *folio, struct inode *inode,
- bool update_times)
+int ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+ struct folio *folio, struct inode *inode,
+ bool update_times)
{
loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen);
@@ -92,17 +91,19 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
folio_lock(folio);
err = ufs_prepare_chunk(folio, pos, len);
- BUG_ON(err);
+ if (unlikely(err)) {
+ folio_unlock(folio);
+ return err;
+ }
de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
ufs_set_de_type(dir->i_sb, de, inode->i_mode);
ufs_commit_chunk(folio, pos, len);
- folio_release_kmap(folio, de);
if (update_times)
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
- ufs_handle_dirsync(dir);
+ return ufs_handle_dirsync(dir);
}
static bool ufs_check_folio(struct folio *folio, char *kaddr)
@@ -505,8 +506,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
if (de->d_reclen == 0) {
ufs_error(inode->i_sb, __func__,
"zero-length directory entry");
- err = -EIO;
- goto out;
+ return -EIO;
}
pde = de;
de = ufs_next_entry(sb, de);
@@ -516,18 +516,17 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
pos = folio_pos(folio) + from;
folio_lock(folio);
err = ufs_prepare_chunk(folio, pos, to - from);
- BUG_ON(err);
+ if (unlikely(err)) {
+ folio_unlock(folio);
+ return err;
+ }
if (pde)
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
ufs_commit_chunk(folio, pos, to - from);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
- err = ufs_handle_dirsync(inode);
-out:
- folio_release_kmap(folio, kaddr);
- UFSD("EXIT\n");
- return err;
+ return ufs_handle_dirsync(inode);
}
int ufs_make_empty(struct inode * inode, struct inode *dir)
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 6558882..487ad1f 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -42,4 +42,5 @@ const struct file_operations ufs_file_operations = {
.open = generic_file_open,
.fsync = generic_file_fsync,
.splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
};
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 5331ae7..7dc38fd 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -220,7 +220,7 @@ static u64 ufs_frag_map(struct inode *inode, unsigned offsets[4], int depth)
*/
static bool
ufs_extend_tail(struct inode *inode, u64 writes_to,
- int *err, struct page *locked_page)
+ int *err, struct folio *locked_folio)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
@@ -239,7 +239,7 @@ ufs_extend_tail(struct inode *inode, u64 writes_to,
p = ufs_get_direct_data_ptr(uspi, ufsi, block);
tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p),
new_size - (lastfrag & uspi->s_fpbmask), err,
- locked_page);
+ locked_folio);
return tmp != 0;
}
@@ -250,12 +250,11 @@ ufs_extend_tail(struct inode *inode, u64 writes_to,
* @new_fragment: number of new allocated fragment(s)
* @err: we set it if something wrong
* @new: we set it if we allocate new block
- * @locked_page: for ufs_new_fragments()
+ * @locked_folio: for ufs_new_fragments()
*/
-static u64
-ufs_inode_getfrag(struct inode *inode, unsigned index,
+static u64 ufs_inode_getfrag(struct inode *inode, unsigned index,
sector_t new_fragment, int *err,
- int *new, struct page *locked_page)
+ int *new, struct folio *locked_folio)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
@@ -264,11 +263,6 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
unsigned nfrags = uspi->s_fpb;
void *p;
- /* TODO : to be done for write support
- if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
- goto ufs2;
- */
-
p = ufs_get_direct_data_ptr(uspi, ufsi, index);
tmp = ufs_data_ptr_to_cpu(sb, p);
if (tmp)
@@ -288,7 +282,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
goal += uspi->s_fpb;
}
tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment),
- goal, nfrags, err, locked_page);
+ goal, nfrags, err, locked_folio);
if (!tmp) {
*err = -ENOSPC;
@@ -303,21 +297,6 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
mark_inode_dirty(inode);
out:
return tmp + uspi->s_sbbase;
-
- /* This part : To be implemented ....
- Required only for writing, not required for READ-ONLY.
-ufs2:
-
- u2_block = ufs_fragstoblks(fragment);
- u2_blockoff = ufs_fragnum(fragment);
- p = ufsi->i_u1.u2_i_data + block;
- goal = 0;
-
-repeat2:
- tmp = fs32_to_cpu(sb, *p);
- lastfrag = ufsi->i_lastfrag;
-
- */
}
/**
@@ -329,12 +308,11 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
* (block will hold this fragment and also uspi->s_fpb-1)
* @err: see ufs_inode_getfrag()
* @new: see ufs_inode_getfrag()
- * @locked_page: see ufs_inode_getfrag()
+ * @locked_folio: see ufs_inode_getfrag()
*/
-static u64
-ufs_inode_getblock(struct inode *inode, u64 ind_block,
- unsigned index, sector_t new_fragment, int *err,
- int *new, struct page *locked_page)
+static u64 ufs_inode_getblock(struct inode *inode, u64 ind_block,
+ unsigned index, sector_t new_fragment, int *err,
+ int *new, struct folio *locked_folio)
{
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
@@ -369,7 +347,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block,
else
goal = bh->b_blocknr + uspi->s_fpb;
tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
- uspi->s_fpb, err, locked_page);
+ uspi->s_fpb, err, locked_folio);
if (!tmp)
goto out;
@@ -434,14 +412,14 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff
unsigned tailfrags = lastfrag & uspi->s_fpbmask;
if (tailfrags && fragment >= lastfrag) {
if (!ufs_extend_tail(inode, fragment,
- &err, bh_result->b_page))
+ &err, bh_result->b_folio))
goto out;
}
}
if (depth == 1) {
phys64 = ufs_inode_getfrag(inode, offsets[0], fragment,
- &err, &new, bh_result->b_page);
+ &err, &new, bh_result->b_folio);
} else {
int i;
phys64 = ufs_inode_getfrag(inode, offsets[0], fragment,
@@ -450,7 +428,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff
phys64 = ufs_inode_getblock(inode, phys64, offsets[i],
fragment, &err, NULL, NULL);
phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1],
- fragment, &err, &new, bh_result->b_page);
+ fragment, &err, &new, bh_result->b_folio);
}
out:
if (phys64) {
@@ -898,91 +876,84 @@ static inline void free_data(struct to_free *ctx, u64 from, unsigned count)
#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
+/*
+ * used only for truncation down to direct blocks.
+ */
static void ufs_trunc_direct(struct inode *inode)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
- struct super_block * sb;
- struct ufs_sb_private_info * uspi;
- void *p;
- u64 frag1, frag2, frag3, frag4, block1, block2;
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned int new_frags, old_frags;
+ unsigned int old_slot, new_slot;
+ unsigned int old_tail, new_tail;
struct to_free ctx = {.inode = inode};
- unsigned i, tmp;
UFSD("ENTER: ino %lu\n", inode->i_ino);
- sb = inode->i_sb;
- uspi = UFS_SB(sb)->s_uspi;
+ new_frags = DIRECT_FRAGMENT;
+ // new_frags = first fragment past the new EOF
+ old_frags = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
+ // old_frags = first fragment past the old EOF or covered by indirects
- frag1 = DIRECT_FRAGMENT;
- frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
- frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1);
- frag3 = frag4 & ~uspi->s_fpbmask;
- block1 = block2 = 0;
- if (frag2 > frag3) {
- frag2 = frag4;
- frag3 = frag4 = 0;
- } else if (frag2 < frag3) {
- block1 = ufs_fragstoblks (frag2);
- block2 = ufs_fragstoblks (frag3);
- }
+ if (new_frags >= old_frags) // expanding - nothing to free
+ goto done;
- UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu,"
- " frag3 %llu, frag4 %llu\n", inode->i_ino,
- (unsigned long long)frag1, (unsigned long long)frag2,
- (unsigned long long)block1, (unsigned long long)block2,
- (unsigned long long)frag3, (unsigned long long)frag4);
+ old_tail = ufs_fragnum(old_frags);
+ old_slot = ufs_fragstoblks(old_frags);
+ new_tail = ufs_fragnum(new_frags);
+ new_slot = ufs_fragstoblks(new_frags);
- if (frag1 >= frag2)
- goto next1;
-
- /*
- * Free first free fragments
- */
- p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1));
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp )
- ufs_panic (sb, "ufs_trunc_direct", "internal error");
- frag2 -= frag1;
- frag1 = ufs_fragnum (frag1);
-
- ufs_free_fragments(inode, tmp + frag1, frag2);
-
-next1:
- /*
- * Free whole blocks
- */
- for (i = block1 ; i < block2; i++) {
- p = ufs_get_direct_data_ptr(uspi, ufsi, i);
- tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (old_slot == new_slot) { // old_tail > 0
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, old_slot);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
if (!tmp)
- continue;
- write_seqlock(&ufsi->meta_lock);
- ufs_data_ptr_clear(uspi, p);
- write_sequnlock(&ufsi->meta_lock);
+ ufs_panic(sb, __func__, "internal error");
+ if (!new_tail) {
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+ }
+ ufs_free_fragments(inode, tmp + new_tail, old_tail - new_tail);
+ } else {
+ unsigned int slot = new_slot;
- free_data(&ctx, tmp, uspi->s_fpb);
+ if (new_tail) {
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot++);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ ufs_panic(sb, __func__, "internal error");
+
+ ufs_free_fragments(inode, tmp + new_tail,
+ uspi->s_fpb - new_tail);
+ }
+ while (slot < old_slot) {
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot++);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ continue;
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+
+ free_data(&ctx, tmp, uspi->s_fpb);
+ }
+
+ free_data(&ctx, 0, 0);
+
+ if (old_tail) {
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ ufs_panic(sb, __func__, "internal error");
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+
+ ufs_free_fragments(inode, tmp, old_tail);
+ }
}
-
- free_data(&ctx, 0, 0);
-
- if (frag3 >= frag4)
- goto next3;
-
- /*
- * Free last free fragments
- */
- p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3));
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp )
- ufs_panic(sb, "ufs_truncate_direct", "internal error");
- frag4 = ufs_fragnum (frag4);
- write_seqlock(&ufsi->meta_lock);
- ufs_data_ptr_clear(uspi, p);
- write_sequnlock(&ufsi->meta_lock);
-
- ufs_free_fragments (inode, tmp, frag4);
- next3:
-
+done:
UFSD("EXIT: ino %lu\n", inode->i_ino);
}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index c839097..38a024c 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -210,20 +210,18 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
struct inode * inode = d_inode(dentry);
struct ufs_dir_entry *de;
struct folio *folio;
- int err = -ENOENT;
+ int err;
de = ufs_find_entry(dir, &dentry->d_name, &folio);
if (!de)
- goto out;
+ return -ENOENT;
err = ufs_delete_entry(dir, de, folio);
- if (err)
- goto out;
-
- inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
- inode_dec_link_count(inode);
- err = 0;
-out:
+ if (!err) {
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
+ inode_dec_link_count(inode);
+ }
+ folio_release_kmap(folio, de);
return err;
}
@@ -253,14 +251,14 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct ufs_dir_entry * dir_de = NULL;
struct folio *old_folio;
struct ufs_dir_entry *old_de;
- int err = -ENOENT;
+ int err;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
if (!old_de)
- goto out;
+ return -ENOENT;
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
@@ -281,7 +279,10 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_folio);
if (!new_de)
goto out_dir;
- ufs_set_link(new_dir, new_de, new_folio, old_inode, 1);
+ err = ufs_set_link(new_dir, new_de, new_folio, old_inode, 1);
+ folio_release_kmap(new_folio, new_de);
+ if (err)
+ goto out_dir;
inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
@@ -299,26 +300,20 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
* rename.
*/
inode_set_ctime_current(old_inode);
-
- ufs_delete_entry(old_dir, old_de, old_folio);
mark_inode_dirty(old_inode);
- if (dir_de) {
+ err = ufs_delete_entry(old_dir, old_de, old_folio);
+ if (!err && dir_de) {
if (old_dir != new_dir)
- ufs_set_link(old_inode, dir_de, dir_folio, new_dir, 0);
- else
- folio_release_kmap(dir_folio, dir_de);
+ err = ufs_set_link(old_inode, dir_de, dir_folio,
+ new_dir, 0);
inode_dec_link_count(old_dir);
}
- return 0;
-
-
out_dir:
if (dir_de)
folio_release_kmap(dir_folio, dir_de);
out_old:
folio_release_kmap(old_folio, old_de);
-out:
return err;
}
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index bc62578..762699c 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -505,7 +505,6 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
{
struct ufs_sb_info *sbi = UFS_SB(sb);
struct ufs_sb_private_info *uspi = sbi->s_uspi;
- struct ufs_buffer_head * ubh;
unsigned char * base, * space;
unsigned size, blks, i;
@@ -521,21 +520,13 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
if (!base)
goto failed;
sbi->s_csp = (struct ufs_csum *)space;
- for (i = 0; i < blks; i += uspi->s_fpb) {
- size = uspi->s_bsize;
- if (i + uspi->s_fpb > blks)
- size = (blks - i) * uspi->s_fsize;
-
- ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
-
- if (!ubh)
+ for (i = 0; i < blks; i++) {
+ struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i);
+ if (!bh)
goto failed;
-
- ubh_ubhcpymem (space, ubh, size);
-
- space += size;
- ubh_brelse (ubh);
- ubh = NULL;
+ memcpy(space, bh->b_data, uspi->s_fsize);
+ space += uspi->s_fsize;
+ brelse (bh);
}
/*
@@ -645,7 +636,6 @@ static void ufs_put_super_internal(struct super_block *sb)
{
struct ufs_sb_info *sbi = UFS_SB(sb);
struct ufs_sb_private_info *uspi = sbi->s_uspi;
- struct ufs_buffer_head * ubh;
unsigned char * base, * space;
unsigned blks, size, i;
@@ -656,18 +646,17 @@ static void ufs_put_super_internal(struct super_block *sb)
size = uspi->s_cssize;
blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
base = space = (char*) sbi->s_csp;
- for (i = 0; i < blks; i += uspi->s_fpb) {
- size = uspi->s_bsize;
- if (i + uspi->s_fpb > blks)
- size = (blks - i) * uspi->s_fsize;
+ for (i = 0; i < blks; i++, space += uspi->s_fsize) {
+ struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i);
- ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
-
- ubh_memcpyubh (ubh, space, size);
- space += size;
- ubh_mark_buffer_uptodate (ubh, 1);
- ubh_mark_buffer_dirty (ubh);
- ubh_brelse (ubh);
+ if (unlikely(!bh)) { // better than an oops...
+ ufs_panic(sb, __func__,
+ "can't write part of cylinder group summary");
+ continue;
+ }
+ memcpy(bh->b_data, space, uspi->s_fsize);
+ mark_buffer_dirty(bh);
+ brelse(bh);
}
for (i = 0; i < sbi->s_cg_loaded; i++) {
ufs_put_cylinder (sb, i);
@@ -1240,11 +1229,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
else
uspi->s_apbshift = uspi->s_bshift - 2;
- uspi->s_2apbshift = uspi->s_apbshift * 2;
- uspi->s_3apbshift = uspi->s_apbshift * 3;
uspi->s_apb = 1 << uspi->s_apbshift;
- uspi->s_2apb = 1 << uspi->s_2apbshift;
- uspi->s_3apb = 1 << uspi->s_3apbshift;
uspi->s_apbmask = uspi->s_apb - 1;
uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS;
uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index a2c762cb..e7df65dd 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -88,10 +88,10 @@ struct ufs_inode_info {
#endif
/* balloc.c */
-extern void ufs_free_fragments (struct inode *, u64, unsigned);
-extern void ufs_free_blocks (struct inode *, u64, unsigned);
-extern u64 ufs_new_fragments(struct inode *, void *, u64, u64,
- unsigned, int *, struct page *);
+void ufs_free_fragments (struct inode *, u64 fragment, unsigned count);
+void ufs_free_blocks (struct inode *, u64 fragment, unsigned count);
+u64 ufs_new_fragments(struct inode *, void *, u64 fragment, u64 goal,
+ unsigned count, int *err, struct folio *);
/* cylinder.c */
extern struct ufs_cg_private_info * ufs_load_cylinder (struct super_block *, unsigned);
@@ -108,8 +108,8 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *,
int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct folio *);
int ufs_empty_dir(struct inode *);
struct ufs_dir_entry *ufs_dotdot(struct inode *, struct folio **);
-void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- struct folio *folio, struct inode *inode, bool update_times);
+int ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+ struct folio *folio, struct inode *inode, bool update_times);
/* file.c */
extern const struct inode_operations ufs_file_inode_operations;
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index ef9ead4..0905f9a 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -775,12 +775,8 @@ struct ufs_sb_private_info {
__u32 s_fpbmask; /* fragments per block mask */
__u32 s_apb; /* address per block */
- __u32 s_2apb; /* address per block^2 */
- __u32 s_3apb; /* address per block^3 */
__u32 s_apbmask; /* address per block mask */
__u32 s_apbshift; /* address per block shift */
- __u32 s_2apbshift; /* address per block shift * 2 */
- __u32 s_3apbshift; /* address per block shift * 3 */
__u32 s_nspfshift; /* number of sector per fragment shift */
__u32 s_nspb; /* number of sector per block */
__u32 s_inopf; /* inodes per fragment */
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 2acf191..f0e906a 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -99,20 +99,6 @@ void ubh_mark_buffer_dirty (struct ufs_buffer_head * ubh)
mark_buffer_dirty (ubh->bh[i]);
}
-void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
-{
- unsigned i;
- if (!ubh)
- return;
- if (flag) {
- for ( i = 0; i < ubh->count; i++ )
- set_buffer_uptodate (ubh->bh[i]);
- } else {
- for ( i = 0; i < ubh->count; i++ )
- clear_buffer_uptodate (ubh->bh[i]);
- }
-}
-
void ubh_sync_block(struct ufs_buffer_head *ubh)
{
if (ubh) {
@@ -146,38 +132,6 @@ int ubh_buffer_dirty (struct ufs_buffer_head * ubh)
return result;
}
-void _ubh_ubhcpymem_(struct ufs_sb_private_info * uspi,
- unsigned char * mem, struct ufs_buffer_head * ubh, unsigned size)
-{
- unsigned len, bhno;
- if (size > (ubh->count << uspi->s_fshift))
- size = ubh->count << uspi->s_fshift;
- bhno = 0;
- while (size) {
- len = min_t(unsigned int, size, uspi->s_fsize);
- memcpy (mem, ubh->bh[bhno]->b_data, len);
- mem += uspi->s_fsize;
- size -= len;
- bhno++;
- }
-}
-
-void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned char * mem, unsigned size)
-{
- unsigned len, bhno;
- if (size > (ubh->count << uspi->s_fshift))
- size = ubh->count << uspi->s_fshift;
- bhno = 0;
- while (size) {
- len = min_t(unsigned int, size, uspi->s_fsize);
- memcpy (ubh->bh[bhno]->b_data, mem, len);
- mem += uspi->s_fsize;
- size -= len;
- bhno++;
- }
-}
-
dev_t
ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
{
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index bf708b68..391bb4f 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -263,14 +263,9 @@ extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, str
extern void ubh_brelse (struct ufs_buffer_head *);
extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
-extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
extern void ubh_sync_block(struct ufs_buffer_head *);
extern void ubh_bforget (struct ufs_buffer_head *);
extern int ubh_buffer_dirty (struct ufs_buffer_head *);
-#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
-extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struct ufs_buffer_head *, unsigned);
-#define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size)
-extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned);
/* This functions works with cache pages*/
struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index);
@@ -455,65 +450,69 @@ static inline unsigned _ubh_find_last_zero_bit_(
return (base << uspi->s_bpfshift) + pos - begin;
}
-#define ubh_isblockclear(ubh,begin,block) (!_ubh_isblockset_(uspi,ubh,begin,block))
-
-#define ubh_isblockset(ubh,begin,block) _ubh_isblockset_(uspi,ubh,begin,block)
-static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+static inline int ubh_isblockset(struct ufs_sb_private_info *uspi,
+ struct ufs_cg_private_info *ucpi, unsigned int frag)
{
+ struct ufs_buffer_head *ubh = UCPI_UBH(ucpi);
+ u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3));
u8 mask;
+
switch (uspi->s_fpb) {
case 8:
- return (*ubh_get_addr (ubh, begin + block) == 0xff);
+ return *p == 0xff;
case 4:
- mask = 0x0f << ((block & 0x01) << 2);
- return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask;
+ mask = 0x0f << (frag & 4);
+ return (*p & mask) == mask;
case 2:
- mask = 0x03 << ((block & 0x03) << 1);
- return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask;
+ mask = 0x03 << (frag & 6);
+ return (*p & mask) == mask;
case 1:
- mask = 0x01 << (block & 0x07);
- return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask;
+ mask = 0x01 << (frag & 7);
+ return (*p & mask) == mask;
}
return 0;
}
-#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block)
-static inline void _ubh_clrblock_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+static inline void ubh_clrblock(struct ufs_sb_private_info *uspi,
+ struct ufs_cg_private_info *ucpi, unsigned int frag)
{
+ struct ufs_buffer_head *ubh = UCPI_UBH(ucpi);
+ u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3));
+
switch (uspi->s_fpb) {
case 8:
- *ubh_get_addr (ubh, begin + block) = 0x00;
+ *p = 0x00;
return;
case 4:
- *ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f << ((block & 0x01) << 2));
+ *p &= ~(0x0f << (frag & 4));
return;
case 2:
- *ubh_get_addr (ubh, begin + (block >> 2)) &= ~(0x03 << ((block & 0x03) << 1));
+ *p &= ~(0x03 << (frag & 6));
return;
case 1:
- *ubh_get_addr (ubh, begin + (block >> 3)) &= ~(0x01 << ((block & 0x07)));
+ *p &= ~(0x01 << (frag & 7));
return;
}
}
-#define ubh_setblock(ubh,begin,block) _ubh_setblock_(uspi,ubh,begin,block)
-static inline void _ubh_setblock_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+static inline void ubh_setblock(struct ufs_sb_private_info * uspi,
+ struct ufs_cg_private_info *ucpi, unsigned int frag)
{
+ struct ufs_buffer_head *ubh = UCPI_UBH(ucpi);
+ u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3));
+
switch (uspi->s_fpb) {
case 8:
- *ubh_get_addr(ubh, begin + block) = 0xff;
+ *p = 0xff;
return;
case 4:
- *ubh_get_addr(ubh, begin + (block >> 1)) |= (0x0f << ((block & 0x01) << 2));
+ *p |= 0x0f << (frag & 4);
return;
case 2:
- *ubh_get_addr(ubh, begin + (block >> 2)) |= (0x03 << ((block & 0x03) << 1));
+ *p |= 0x03 << (frag & 6);
return;
case 1:
- *ubh_get_addr(ubh, begin + (block >> 3)) |= (0x01 << ((block & 0x07)));
+ *p |= 0x01 << (frag & 7);
return;
}
}
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index 8395066..7f7cb14 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -214,3 +214,29 @@ void utf8_unload(struct unicode_map *um)
}
EXPORT_SYMBOL(utf8_unload);
+/**
+ * utf8_parse_version - Parse a UTF-8 version number from a string
+ *
+ * @version: input string
+ *
+ * Returns the parsed version on success, negative code on error
+ */
+int utf8_parse_version(char *version)
+{
+ substring_t args[3];
+ unsigned int maj, min, rev;
+ static const struct match_token token[] = {
+ {1, "%d.%d.%d"},
+ {0, NULL}
+ };
+
+ if (match_token(version, token, args) != 1)
+ return -EINVAL;
+
+ if (match_int(&args[0], &maj) || match_int(&args[1], &min) ||
+ match_int(&args[2], &rev))
+ return -EINVAL;
+
+ return UNICODE_AGE(maj, min, rev);
+}
+EXPORT_SYMBOL(utf8_parse_version);
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index 600e15e..5ddaf27 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -17,9 +17,6 @@
static unsigned int failed_tests;
static unsigned int total_tests;
-/* Tests will be based on this version. */
-#define UTF8_LATEST UNICODE_AGE(12, 1, 0)
-
#define _test(cond, func, line, fmt, ...) do { \
total_tests++; \
if (!cond) { \
diff --git a/fs/utimes.c b/fs/utimes.c
index 99b26f7..c7c7958 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -108,18 +108,13 @@ static int do_utimes_path(int dfd, const char __user *filename,
static int do_utimes_fd(int fd, struct timespec64 *times, int flags)
{
- struct fd f;
- int error;
-
if (flags)
return -EINVAL;
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- error = vfs_utimes(&fd_file(f)->f_path, times);
- fdput(f);
- return error;
+ return vfs_utimes(&fd_file(f)->f_path, times);
}
/*
diff --git a/fs/xattr.c b/fs/xattr.c
index 05ec7e7..02bee14 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -586,25 +586,32 @@ vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
EXPORT_SYMBOL_GPL(vfs_removexattr);
+int import_xattr_name(struct xattr_name *kname, const char __user *name)
+{
+ int error = strncpy_from_user(kname->name, name,
+ sizeof(kname->name));
+ if (error == 0 || error == sizeof(kname->name))
+ return -ERANGE;
+ if (error < 0)
+ return error;
+ return 0;
+}
+
/*
* Extended attribute SET operations
*/
-int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
+int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx)
{
int error;
if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
return -EINVAL;
- error = strncpy_from_user(ctx->kname->name, name,
- sizeof(ctx->kname->name));
- if (error == 0 || error == sizeof(ctx->kname->name))
- return -ERANGE;
- if (error < 0)
+ error = import_xattr_name(ctx->kname, name);
+ if (error)
return error;
- error = 0;
if (ctx->size) {
if (ctx->size > XATTR_SIZE_MAX)
return -E2BIG;
@@ -619,8 +626,8 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
return error;
}
-int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct xattr_ctx *ctx)
+static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct kernel_xattr_ctx *ctx)
{
if (is_posix_acl_xattr(ctx->kname->name))
return do_set_acl(idmap, dentry, ctx->kname->name,
@@ -630,32 +637,32 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
ctx->kvalue, ctx->size, ctx->flags);
}
-static int path_setxattr(const char __user *pathname,
- const char __user *name, const void __user *value,
- size_t size, int flags, unsigned int lookup_flags)
+int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx)
{
- struct xattr_name kname;
- struct xattr_ctx ctx = {
- .cvalue = value,
- .kvalue = NULL,
- .size = size,
- .kname = &kname,
- .flags = flags,
- };
+ int error = mnt_want_write_file(f);
+
+ if (!error) {
+ audit_file(f);
+ error = do_setxattr(file_mnt_idmap(f), f->f_path.dentry, ctx);
+ mnt_drop_write_file(f);
+ }
+ return error;
+}
+
+/* unconditionally consumes filename */
+int filename_setxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx)
+{
struct path path;
int error;
- error = setxattr_copy(name, &ctx);
- if (error)
- return error;
-
retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
goto out;
error = mnt_want_write(path.mnt);
if (!error) {
- error = do_setxattr(mnt_idmap(path.mnt), path.dentry, &ctx);
+ error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx);
mnt_drop_write(path.mnt);
}
path_put(&path);
@@ -665,80 +672,121 @@ static int path_setxattr(const char __user *pathname,
}
out:
+ putname(filename);
+ return error;
+}
+
+static int path_setxattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, const char __user *name,
+ const void __user *value, size_t size, int flags)
+{
+ struct xattr_name kname;
+ struct kernel_xattr_ctx ctx = {
+ .cvalue = value,
+ .kvalue = NULL,
+ .size = size,
+ .kname = &kname,
+ .flags = flags,
+ };
+ struct filename *filename;
+ unsigned int lookup_flags = 0;
+ int error;
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags = LOOKUP_FOLLOW;
+
+ error = setxattr_copy(name, &ctx);
+ if (error)
+ return error;
+
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ error = -EBADF;
+ else
+ error = file_setxattr(fd_file(f), &ctx);
+ } else {
+ error = filename_setxattr(dfd, filename, lookup_flags, &ctx);
+ }
kvfree(ctx.kvalue);
return error;
}
+SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
+ const char __user *, name, const struct xattr_args __user *, uargs,
+ size_t, usize)
+{
+ struct xattr_args args = {};
+ int error;
+
+ BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);
+
+ if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
+ return -EINVAL;
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+
+ error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+ if (error)
+ return error;
+
+ return path_setxattrat(dfd, pathname, at_flags, name,
+ u64_to_user_ptr(args.value), args.size,
+ args.flags);
+}
+
SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
const char __user *, name, const void __user *, value,
size_t, size, int, flags)
{
- return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW);
+ return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags);
}
SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
const char __user *, name, const void __user *, value,
size_t, size, int, flags)
{
- return path_setxattr(pathname, name, value, size, flags, 0);
+ return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
+ value, size, flags);
}
SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
const void __user *,value, size_t, size, int, flags)
{
- struct xattr_name kname;
- struct xattr_ctx ctx = {
- .cvalue = value,
- .kvalue = NULL,
- .size = size,
- .kname = &kname,
- .flags = flags,
- };
- int error;
-
- CLASS(fd, f)(fd);
- if (!fd_file(f))
- return -EBADF;
-
- audit_file(fd_file(f));
- error = setxattr_copy(name, &ctx);
- if (error)
- return error;
-
- error = mnt_want_write_file(fd_file(f));
- if (!error) {
- error = do_setxattr(file_mnt_idmap(fd_file(f)),
- fd_file(f)->f_path.dentry, &ctx);
- mnt_drop_write_file(fd_file(f));
- }
- kvfree(ctx.kvalue);
- return error;
+ return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name,
+ value, size, flags);
}
/*
* Extended attribute GET operations
*/
-ssize_t
+static ssize_t
do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
- struct xattr_ctx *ctx)
+ struct kernel_xattr_ctx *ctx)
{
ssize_t error;
char *kname = ctx->kname->name;
+ void *kvalue = NULL;
if (ctx->size) {
if (ctx->size > XATTR_SIZE_MAX)
ctx->size = XATTR_SIZE_MAX;
- ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL);
- if (!ctx->kvalue)
+ kvalue = kvzalloc(ctx->size, GFP_KERNEL);
+ if (!kvalue)
return -ENOMEM;
}
- if (is_posix_acl_xattr(ctx->kname->name))
- error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size);
+ if (is_posix_acl_xattr(kname))
+ error = do_get_acl(idmap, d, kname, kvalue, ctx->size);
else
- error = vfs_getxattr(idmap, d, kname, ctx->kvalue, ctx->size);
+ error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size);
if (error > 0) {
- if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
+ if (ctx->size && copy_to_user(ctx->value, kvalue, error))
error = -EFAULT;
} else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
/* The file system tried to returned a value bigger
@@ -746,79 +794,114 @@ do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
error = -E2BIG;
}
+ kvfree(kvalue);
return error;
}
-static ssize_t
-getxattr(struct mnt_idmap *idmap, struct dentry *d,
- const char __user *name, void __user *value, size_t size)
+ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx)
{
- ssize_t error;
- struct xattr_name kname;
- struct xattr_ctx ctx = {
- .value = value,
- .kvalue = NULL,
- .size = size,
- .kname = &kname,
- .flags = 0,
- };
-
- error = strncpy_from_user(kname.name, name, sizeof(kname.name));
- if (error == 0 || error == sizeof(kname.name))
- error = -ERANGE;
- if (error < 0)
- return error;
-
- error = do_getxattr(idmap, d, &ctx);
-
- kvfree(ctx.kvalue);
- return error;
+ audit_file(f);
+ return do_getxattr(file_mnt_idmap(f), f->f_path.dentry, ctx);
}
-static ssize_t path_getxattr(const char __user *pathname,
- const char __user *name, void __user *value,
- size_t size, unsigned int lookup_flags)
+/* unconditionally consumes filename */
+ssize_t filename_getxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx)
{
struct path path;
ssize_t error;
retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
- return error;
- error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size);
+ goto out;
+ error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out:
+ putname(filename);
return error;
}
+static ssize_t path_getxattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, const char __user *name,
+ void __user *value, size_t size)
+{
+ struct xattr_name kname;
+ struct kernel_xattr_ctx ctx = {
+ .value = value,
+ .size = size,
+ .kname = &kname,
+ .flags = 0,
+ };
+ struct filename *filename;
+ ssize_t error;
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ error = import_xattr_name(&kname, name);
+ if (error)
+ return error;
+
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+ return file_getxattr(fd_file(f), &ctx);
+ } else {
+ int lookup_flags = 0;
+ if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags = LOOKUP_FOLLOW;
+ return filename_getxattr(dfd, filename, lookup_flags, &ctx);
+ }
+}
+
+SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
+ const char __user *, name, struct xattr_args __user *, uargs, size_t, usize)
+{
+ struct xattr_args args = {};
+ int error;
+
+ BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);
+
+ if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
+ return -EINVAL;
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+
+ error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+ if (error)
+ return error;
+
+ if (args.flags != 0)
+ return -EINVAL;
+
+ return path_getxattrat(dfd, pathname, at_flags, name,
+ u64_to_user_ptr(args.value), args.size);
+}
+
SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
const char __user *, name, void __user *, value, size_t, size)
{
- return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW);
+ return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size);
}
SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
const char __user *, name, void __user *, value, size_t, size)
{
- return path_getxattr(pathname, name, value, size, 0);
+ return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
+ value, size);
}
SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
void __user *, value, size_t, size)
{
- struct fd f = fdget(fd);
- ssize_t error = -EBADF;
-
- if (!fd_file(f))
- return error;
- audit_file(fd_file(f));
- error = getxattr(file_mnt_idmap(fd_file(f)), fd_file(f)->f_path.dentry,
- name, value, size);
- fdput(f);
- return error;
+ return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size);
}
/*
@@ -853,47 +936,80 @@ listxattr(struct dentry *d, char __user *list, size_t size)
return error;
}
-static ssize_t path_listxattr(const char __user *pathname, char __user *list,
- size_t size, unsigned int lookup_flags)
+static
+ssize_t file_listxattr(struct file *f, char __user *list, size_t size)
+{
+ audit_file(f);
+ return listxattr(f->f_path.dentry, list, size);
+}
+
+/* unconditionally consumes filename */
+static
+ssize_t filename_listxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags,
+ char __user *list, size_t size)
{
struct path path;
ssize_t error;
retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
- return error;
+ goto out;
error = listxattr(path.dentry, list, size);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out:
+ putname(filename);
return error;
}
+static ssize_t path_listxattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, char __user *list,
+ size_t size)
+{
+ struct filename *filename;
+ int lookup_flags;
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+ return file_listxattr(fd_file(f), list, size);
+ }
+
+ lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+ return filename_listxattr(dfd, filename, lookup_flags, list, size);
+}
+
+SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname,
+ unsigned int, at_flags,
+ char __user *, list, size_t, size)
+{
+ return path_listxattrat(dfd, pathname, at_flags, list, size);
+}
+
SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
size_t, size)
{
- return path_listxattr(pathname, list, size, LOOKUP_FOLLOW);
+ return path_listxattrat(AT_FDCWD, pathname, 0, list, size);
}
SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
size_t, size)
{
- return path_listxattr(pathname, list, size, 0);
+ return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size);
}
SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
{
- struct fd f = fdget(fd);
- ssize_t error = -EBADF;
-
- if (!fd_file(f))
- return error;
- audit_file(fd_file(f));
- error = listxattr(fd_file(f)->f_path.dentry, list, size);
- fdput(f);
- return error;
+ return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size);
}
/*
@@ -907,25 +1023,33 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name)
return vfs_removexattr(idmap, d, name);
}
-static int path_removexattr(const char __user *pathname,
- const char __user *name, unsigned int lookup_flags)
+static int file_removexattr(struct file *f, struct xattr_name *kname)
+{
+ int error = mnt_want_write_file(f);
+
+ if (!error) {
+ audit_file(f);
+ error = removexattr(file_mnt_idmap(f),
+ f->f_path.dentry, kname->name);
+ mnt_drop_write_file(f);
+ }
+ return error;
+}
+
+/* unconditionally consumes filename */
+static int filename_removexattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct xattr_name *kname)
{
struct path path;
int error;
- char kname[XATTR_NAME_MAX + 1];
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
- if (error < 0)
- return error;
retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
- return error;
+ goto out;
error = mnt_want_write(path.mnt);
if (!error) {
- error = removexattr(mnt_idmap(path.mnt), path.dentry, kname);
+ error = removexattr(mnt_idmap(path.mnt), path.dentry, kname->name);
mnt_drop_write(path.mnt);
}
path_put(&path);
@@ -933,45 +1057,58 @@ static int path_removexattr(const char __user *pathname,
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out:
+ putname(filename);
return error;
}
+static int path_removexattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, const char __user *name)
+{
+ struct xattr_name kname;
+ struct filename *filename;
+ unsigned int lookup_flags;
+ int error;
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ error = import_xattr_name(&kname, name);
+ if (error)
+ return error;
+
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+ return file_removexattr(fd_file(f), &kname);
+ }
+ lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+ return filename_removexattr(dfd, filename, lookup_flags, &kname);
+}
+
+SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname,
+ unsigned int, at_flags, const char __user *, name)
+{
+ return path_removexattrat(dfd, pathname, at_flags, name);
+}
+
SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
const char __user *, name)
{
- return path_removexattr(pathname, name, LOOKUP_FOLLOW);
+ return path_removexattrat(AT_FDCWD, pathname, 0, name);
}
SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
const char __user *, name)
{
- return path_removexattr(pathname, name, 0);
+ return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name);
}
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
{
- struct fd f = fdget(fd);
- char kname[XATTR_NAME_MAX + 1];
- int error = -EBADF;
-
- if (!fd_file(f))
- return error;
- audit_file(fd_file(f));
-
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
- if (error < 0)
- return error;
-
- error = mnt_want_write_file(fd_file(f));
- if (!error) {
- error = removexattr(file_mnt_idmap(fd_file(f)),
- fd_file(f)->f_path.dentry, kname);
- mnt_drop_write_file(fd_file(f));
- }
- fdput(f);
- return error;
+ return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name);
}
int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
@@ -1005,9 +1142,10 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr;
ssize_t remaining_size = buffer_size;
- int err = 0;
for_each_xattr_handler(handlers, handler) {
+ int err;
+
if (!handler->name || (handler->list && !handler->list(dentry)))
continue;
err = xattr_list_one(&buffer, &remaining_size, handler->name);
@@ -1015,7 +1153,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return err;
}
- return err ? err : buffer_size - remaining_size;
+ return buffer_size - remaining_size;
}
EXPORT_SYMBOL(generic_listxattr);
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 3c40f37..c962ad6 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -62,12 +62,12 @@ xfs_trans_ichgtime(
ASSERT(tp);
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- tv = current_time(inode);
+ /* If the mtime changes, then ctime must also change */
+ ASSERT(flags & XFS_ICHGTIME_CHG);
+ tv = inode_set_ctime_current(inode);
if (flags & XFS_ICHGTIME_MOD)
inode_set_mtime_to_ts(inode, tv);
- if (flags & XFS_ICHGTIME_CHG)
- inode_set_ctime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_ACCESS)
inode_set_atime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index aa4dbda..e8196f5 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2115,6 +2115,13 @@ xfs_alloc_buftarg(
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
+ if (bdev_can_atomic_write(btp->bt_bdev)) {
+ btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes(
+ btp->bt_bdev);
+ btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes(
+ btp->bt_bdev);
+ }
+
/*
* When allocating the buftargs we have not yet read the super block and
* thus don't know the file system sector size yet.
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 209a389..3d56bc7 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -124,6 +124,10 @@ struct xfs_buftarg {
struct percpu_counter bt_io_count;
struct ratelimit_state bt_ioerror_rl;
+ /* Atomic write unit values */
+ unsigned int bt_bdev_awu_min;
+ unsigned int bt_bdev_awu_max;
+
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
};
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index 75cb53f..fa29c8b 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -813,8 +813,6 @@ xfs_ioc_exchange_range(
.file2 = file,
};
struct xfs_exchange_range args;
- struct fd file1;
- int error;
if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
@@ -828,14 +826,12 @@ xfs_ioc_exchange_range(
fxr.length = args.length;
fxr.flags = args.flags;
- file1 = fdget(args.file1_fd);
- if (!fd_file(file1))
+ CLASS(fd, file1)(args.file1_fd);
+ if (fd_empty(file1))
return -EBADF;
fxr.file1 = fd_file(file1);
- error = xfs_exchange_range(&fxr);
- fdput(file1);
- return error;
+ return xfs_exchange_range(&fxr);
}
/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
@@ -909,8 +905,6 @@ xfs_ioc_commit_range(
struct xfs_commit_range_fresh *kern_f;
struct xfs_inode *ip2 = XFS_I(file_inode(file));
struct xfs_mount *mp = ip2->i_mount;
- struct fd file1;
- int error;
kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
@@ -934,12 +928,10 @@ xfs_ioc_commit_range(
fxr.file2_ctime.tv_sec = kern_f->file2_ctime;
fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
- file1 = fdget(args.file1_fd);
+ CLASS(fd, file1)(args.file1_fd);
if (fd_empty(file1))
return -EBADF;
fxr.file1 = fd_file(file1);
- error = xfs_exchange_range(&fxr);
- fdput(file1);
- return error;
+ return xfs_exchange_range(&fxr);
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b19916b..ca47cae 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -852,6 +852,20 @@ xfs_file_write_iter(
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ /*
+ * Currently only atomic writing of a single FS block is
+ * supported. It would be possible to atomic write smaller than
+ * a FS block, but there is no requirement to support this.
+ * Note that iomap also does not support this yet.
+ */
+ if (ocount != ip->i_mount->m_sb.sb_blocksize)
+ return -EINVAL;
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@@ -1239,6 +1253,8 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+ if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+ file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index 49e5e5f..f19fce5 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -85,22 +85,23 @@ xfs_find_handle(
int hsize;
xfs_handle_t handle;
struct inode *inode;
- struct fd f = EMPTY_FD;
struct path path;
int error;
struct xfs_inode *ip;
if (cmd == XFS_IOC_FD_TO_HANDLE) {
- f = fdget(hreq->fd);
- if (!fd_file(f))
+ CLASS(fd, f)(hreq->fd);
+
+ if (fd_empty(f))
return -EBADF;
- inode = file_inode(fd_file(f));
+ path = fd_file(f)->f_path;
+ path_get(&path);
} else {
error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
if (error)
return error;
- inode = d_inode(path.dentry);
}
+ inode = d_inode(path.dentry);
ip = XFS_I(inode);
/*
@@ -134,10 +135,7 @@ xfs_find_handle(
error = 0;
out_put:
- if (cmd == XFS_IOC_FD_TO_HANDLE)
- fdput(f);
- else
- path_put(&path);
+ path_put(&path);
return error;
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 03944b6..a2a6b5f 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -332,6 +332,21 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
+static inline bool
+xfs_inode_can_atomicwrite(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
+ if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
+ return false;
+ if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
+ return false;
+
+ return true;
+}
+
/*
* In-core inode flags.
*/
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 2567fd2..af1bb5d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -881,41 +881,29 @@ xfs_ioc_swapext(
xfs_swapext_t *sxp)
{
xfs_inode_t *ip, *tip;
- struct fd f, tmp;
- int error = 0;
/* Pull information for the target fd */
- f = fdget((int)sxp->sx_fdtarget);
- if (!fd_file(f)) {
- error = -EINVAL;
- goto out;
- }
+ CLASS(fd, f)((int)sxp->sx_fdtarget);
+ if (fd_empty(f))
+ return -EINVAL;
if (!(fd_file(f)->f_mode & FMODE_WRITE) ||
!(fd_file(f)->f_mode & FMODE_READ) ||
- (fd_file(f)->f_flags & O_APPEND)) {
- error = -EBADF;
- goto out_put_file;
- }
+ (fd_file(f)->f_flags & O_APPEND))
+ return -EBADF;
- tmp = fdget((int)sxp->sx_fdtmp);
- if (!fd_file(tmp)) {
- error = -EINVAL;
- goto out_put_file;
- }
+ CLASS(fd, tmp)((int)sxp->sx_fdtmp);
+ if (fd_empty(tmp))
+ return -EINVAL;
if (!(fd_file(tmp)->f_mode & FMODE_WRITE) ||
!(fd_file(tmp)->f_mode & FMODE_READ) ||
- (fd_file(tmp)->f_flags & O_APPEND)) {
- error = -EBADF;
- goto out_put_tmp_file;
- }
+ (fd_file(tmp)->f_flags & O_APPEND))
+ return -EBADF;
if (IS_SWAPFILE(file_inode(fd_file(f))) ||
- IS_SWAPFILE(file_inode(fd_file(tmp)))) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
+ IS_SWAPFILE(file_inode(fd_file(tmp))))
+ return -EINVAL;
/*
* We need to ensure that the fds passed in point to XFS inodes
@@ -923,37 +911,22 @@ xfs_ioc_swapext(
* control over what the user passes us here.
*/
if (fd_file(f)->f_op != &xfs_file_operations ||
- fd_file(tmp)->f_op != &xfs_file_operations) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
+ fd_file(tmp)->f_op != &xfs_file_operations)
+ return -EINVAL;
ip = XFS_I(file_inode(fd_file(f)));
tip = XFS_I(file_inode(fd_file(tmp)));
- if (ip->i_mount != tip->i_mount) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
+ if (ip->i_mount != tip->i_mount)
+ return -EINVAL;
- if (ip->i_ino == tip->i_ino) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
+ if (ip->i_ino == tip->i_ino)
+ return -EINVAL;
- if (xfs_is_shutdown(ip->i_mount)) {
- error = -EIO;
- goto out_put_tmp_file;
- }
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
- error = xfs_swap_extents(ip, tip, sxp);
-
- out_put_tmp_file:
- fdput(tmp);
- out_put_file:
- fdput(f);
- out:
- return error;
+ return xfs_swap_extents(ip, tip, sxp);
}
static int
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ee79cf1..4084d26f 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -570,6 +570,20 @@ xfs_stat_blksize(
return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
}
+static void
+xfs_get_atomic_write_attr(
+ struct xfs_inode *ip,
+ unsigned int *unit_min,
+ unsigned int *unit_max)
+{
+ if (!xfs_inode_can_atomicwrite(ip)) {
+ *unit_min = *unit_max = 0;
+ return;
+ }
+
+ *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
+}
+
STATIC int
xfs_vn_getattr(
struct mnt_idmap *idmap,
@@ -597,8 +611,9 @@ xfs_vn_getattr(
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode_get_atime(inode);
- stat->mtime = inode_get_mtime(inode);
- stat->ctime = inode_get_ctime(inode);
+
+ fill_mg_cmtime(stat, request_mask, inode);
+
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
if (xfs_has_v3inodes(mp)) {
@@ -608,11 +623,6 @@ xfs_vn_getattr(
}
}
- if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
- stat->change_cookie = inode_query_iversion(inode);
- stat->result_mask |= STATX_CHANGE_COOKIE;
- }
-
/*
* Note: If you add another clause to set an attribute flag, please
* update attributes_mask below.
@@ -643,6 +653,14 @@ xfs_vn_getattr(
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
stat->dio_offset_align = bdev_logical_block_size(bdev);
}
+ if (request_mask & STATX_WRITE_ATOMIC) {
+ unsigned int unit_min, unit_max;
+
+ xfs_get_atomic_write_attr(ip, &unit_min,
+ &unit_max);
+ generic_fill_statx_atomic_writes(stat,
+ unit_min, unit_max);
+ }
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fbb3a15..fda75db 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2063,7 +2063,7 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = xfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("xfs");
diff --git a/include/asm-generic/audit_change_attr.h b/include/asm-generic/audit_change_attr.h
index 3316708..cc84053 100644
--- a/include/asm-generic/audit_change_attr.h
+++ b/include/asm-generic/audit_change_attr.h
@@ -11,9 +11,15 @@ __NR_lchown,
__NR_fchown,
#endif
__NR_setxattr,
+#ifdef __NR_setxattrat
+__NR_setxattrat,
+#endif
__NR_lsetxattr,
__NR_fsetxattr,
__NR_removexattr,
+#ifdef __NR_removexattrat
+__NR_removexattrat,
+#endif
__NR_lremovexattr,
__NR_fremovexattr,
#ifdef __NR_fchownat
diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h
index 18a10ca..cdf7da7 100644
--- a/include/crypto/akcipher.h
+++ b/include/crypto/akcipher.h
@@ -12,24 +12,19 @@
#include <linux/crypto.h>
/**
- * struct akcipher_request - public key request
+ * struct akcipher_request - public key cipher request
*
* @base: Common attributes for async crypto requests
* @src: Source data
- * For verify op this is signature + digest, in that case
- * total size of @src is @src_len + @dst_len.
- * @dst: Destination data (Should be NULL for verify op)
+ * @dst: Destination data
* @src_len: Size of the input buffer
- * For verify op it's size of signature part of @src, this part
- * is supposed to be operated by cipher.
- * @dst_len: Size of @dst buffer (for all ops except verify).
+ * @dst_len: Size of @dst buffer
* It needs to be at least as big as the expected result
* depending on the operation.
* After operation it will be updated with the actual size of the
* result.
* In case of error where the dst sgl size was insufficient,
* it will be updated to the size required for the operation.
- * For verify op this is size of digest part in @src.
* @__ctx: Start of private context data
*/
struct akcipher_request {
@@ -55,15 +50,8 @@ struct crypto_akcipher {
};
/**
- * struct akcipher_alg - generic public key algorithm
+ * struct akcipher_alg - generic public key cipher algorithm
*
- * @sign: Function performs a sign operation as defined by public key
- * algorithm. In case of error, where the dst_len was insufficient,
- * the req->dst_len will be updated to the size required for the
- * operation
- * @verify: Function performs a complete verify operation as defined by
- * public key algorithm, returning verification status. Requires
- * digest value as input parameter.
* @encrypt: Function performs an encrypt operation as defined by public key
* algorithm. In case of error, where the dst_len was insufficient,
* the req->dst_len will be updated to the size required for the
@@ -94,8 +82,6 @@ struct crypto_akcipher {
* @base: Common crypto API algorithm data structure
*/
struct akcipher_alg {
- int (*sign)(struct akcipher_request *req);
- int (*verify)(struct akcipher_request *req);
int (*encrypt)(struct akcipher_request *req);
int (*decrypt)(struct akcipher_request *req);
int (*set_pub_key)(struct crypto_akcipher *tfm, const void *key,
@@ -110,9 +96,9 @@ struct akcipher_alg {
};
/**
- * DOC: Generic Public Key API
+ * DOC: Generic Public Key Cipher API
*
- * The Public Key API is used with the algorithms of type
+ * The Public Key Cipher API is used with the algorithms of type
* CRYPTO_ALG_TYPE_AKCIPHER (listed as type "akcipher" in /proc/crypto)
*/
@@ -243,10 +229,9 @@ static inline void akcipher_request_set_callback(struct akcipher_request *req,
*
* @req: public key request
* @src: ptr to input scatter list
- * @dst: ptr to output scatter list or NULL for verify op
+ * @dst: ptr to output scatter list
* @src_len: size of the src input scatter list to be processed
- * @dst_len: size of the dst output scatter list or size of signature
- * portion in @src for verify op
+ * @dst_len: size of the dst output scatter list
*/
static inline void akcipher_request_set_crypt(struct akcipher_request *req,
struct scatterlist *src,
@@ -348,44 +333,6 @@ int crypto_akcipher_sync_decrypt(struct crypto_akcipher *tfm,
void *dst, unsigned int dlen);
/**
- * crypto_akcipher_sign() - Invoke public key sign operation
- *
- * Function invokes the specific public key sign operation for a given
- * public key algorithm
- *
- * @req: asymmetric key request
- *
- * Return: zero on success; error code in case of error
- */
-static inline int crypto_akcipher_sign(struct akcipher_request *req)
-{
- struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-
- return crypto_akcipher_alg(tfm)->sign(req);
-}
-
-/**
- * crypto_akcipher_verify() - Invoke public key signature verification
- *
- * Function invokes the specific public key signature verification operation
- * for a given public key algorithm.
- *
- * @req: asymmetric key request
- *
- * Note: req->dst should be NULL, req->src should point to SG of size
- * (req->src_size + req->dst_size), containing signature (of req->src_size
- * length) with appended digest (of req->dst_size length).
- *
- * Return: zero on verification success; error code in case of error.
- */
-static inline int crypto_akcipher_verify(struct akcipher_request *req)
-{
- struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-
- return crypto_akcipher_alg(tfm)->verify(req);
-}
-
-/**
* crypto_akcipher_set_pub_key() - Invoke set public key operation
*
* Function invokes the algorithm specific set key function, which knows
diff --git a/include/crypto/internal/akcipher.h b/include/crypto/internal/akcipher.h
index a0fba4b..14ee62b 100644
--- a/include/crypto/internal/akcipher.h
+++ b/include/crypto/internal/akcipher.h
@@ -124,7 +124,7 @@ static inline struct akcipher_alg *crypto_spawn_akcipher_alg(
/**
* crypto_register_akcipher() -- Register public key algorithm
*
- * Function registers an implementation of a public key verify algorithm
+ * Function registers an implementation of a public key cipher algorithm
*
* @alg: algorithm definition
*
@@ -135,7 +135,7 @@ int crypto_register_akcipher(struct akcipher_alg *alg);
/**
* crypto_unregister_akcipher() -- Unregister public key algorithm
*
- * Function unregisters an implementation of a public key verify algorithm
+ * Function unregisters an implementation of a public key cipher algorithm
*
* @alg: algorithm definition
*/
diff --git a/include/crypto/internal/ecc.h b/include/crypto/internal/ecc.h
index 065f00e..57cd752 100644
--- a/include/crypto/internal/ecc.h
+++ b/include/crypto/internal/ecc.h
@@ -42,6 +42,18 @@
#define ECC_POINT_INIT(x, y, ndigits) (struct ecc_point) { x, y, ndigits }
+/*
+ * The integers r and s making up the signature are expected to be
+ * formatted as two consecutive u64 arrays of size ECC_MAX_BYTES.
+ * The bytes within each u64 digit are in native endianness,
+ * but the order of the u64 digits themselves is little endian.
+ * This format allows direct use by internal vli_*() functions.
+ */
+struct ecdsa_raw_sig {
+ u64 r[ECC_MAX_DIGITS];
+ u64 s[ECC_MAX_DIGITS];
+};
+
/**
* ecc_swap_digits() - Copy ndigits from big endian array to native array
* @in: Input array
@@ -293,4 +305,6 @@ void ecc_point_mult_shamir(const struct ecc_point *result,
const u64 *y, const struct ecc_point *q,
const struct ecc_curve *curve);
+extern struct crypto_template ecdsa_x962_tmpl;
+extern struct crypto_template ecdsa_p1363_tmpl;
#endif
diff --git a/include/crypto/internal/rsa.h b/include/crypto/internal/rsa.h
index e870133..071a195 100644
--- a/include/crypto/internal/rsa.h
+++ b/include/crypto/internal/rsa.h
@@ -8,6 +8,7 @@
#ifndef _RSA_HELPER_
#define _RSA_HELPER_
#include <linux/types.h>
+#include <crypto/akcipher.h>
/**
* rsa_key - RSA key structure
@@ -53,5 +54,33 @@ int rsa_parse_pub_key(struct rsa_key *rsa_key, const void *key,
int rsa_parse_priv_key(struct rsa_key *rsa_key, const void *key,
unsigned int key_len);
+#define RSA_PUB (true)
+#define RSA_PRIV (false)
+
+static inline int rsa_set_key(struct crypto_akcipher *child,
+ unsigned int *key_size, bool is_pubkey,
+ const void *key, unsigned int keylen)
+{
+ int err;
+
+ *key_size = 0;
+
+ if (is_pubkey)
+ err = crypto_akcipher_set_pub_key(child, key, keylen);
+ else
+ err = crypto_akcipher_set_priv_key(child, key, keylen);
+ if (err)
+ return err;
+
+ /* Find out new modulus size from rsa implementation */
+ err = crypto_akcipher_maxsize(child);
+ if (err > PAGE_SIZE)
+ return -ENOTSUPP;
+
+ *key_size = err;
+ return 0;
+}
+
extern struct crypto_template rsa_pkcs1pad_tmpl;
+extern struct crypto_template rsassa_pkcs1_tmpl;
#endif
diff --git a/include/crypto/internal/sig.h b/include/crypto/internal/sig.h
index 97cb26e..b16648c 100644
--- a/include/crypto/internal/sig.h
+++ b/include/crypto/internal/sig.h
@@ -10,8 +10,88 @@
#include <crypto/algapi.h>
#include <crypto/sig.h>
+struct sig_instance {
+ void (*free)(struct sig_instance *inst);
+ union {
+ struct {
+ char head[offsetof(struct sig_alg, base)];
+ struct crypto_instance base;
+ };
+ struct sig_alg alg;
+ };
+};
+
+struct crypto_sig_spawn {
+ struct crypto_spawn base;
+};
+
static inline void *crypto_sig_ctx(struct crypto_sig *tfm)
{
return crypto_tfm_ctx(&tfm->base);
}
+
+/**
+ * crypto_register_sig() -- Register public key signature algorithm
+ *
+ * Function registers an implementation of a public key signature algorithm
+ *
+ * @alg: algorithm definition
+ *
+ * Return: zero on success; error code in case of error
+ */
+int crypto_register_sig(struct sig_alg *alg);
+
+/**
+ * crypto_unregister_sig() -- Unregister public key signature algorithm
+ *
+ * Function unregisters an implementation of a public key signature algorithm
+ *
+ * @alg: algorithm definition
+ */
+void crypto_unregister_sig(struct sig_alg *alg);
+
+int sig_register_instance(struct crypto_template *tmpl,
+ struct sig_instance *inst);
+
+static inline struct sig_instance *sig_instance(struct crypto_instance *inst)
+{
+ return container_of(&inst->alg, struct sig_instance, alg.base);
+}
+
+static inline struct sig_instance *sig_alg_instance(struct crypto_sig *tfm)
+{
+ return sig_instance(crypto_tfm_alg_instance(&tfm->base));
+}
+
+static inline struct crypto_instance *sig_crypto_instance(struct sig_instance
+ *inst)
+{
+ return container_of(&inst->alg.base, struct crypto_instance, alg);
+}
+
+static inline void *sig_instance_ctx(struct sig_instance *inst)
+{
+ return crypto_instance_ctx(sig_crypto_instance(inst));
+}
+
+int crypto_grab_sig(struct crypto_sig_spawn *spawn,
+ struct crypto_instance *inst,
+ const char *name, u32 type, u32 mask);
+
+static inline struct crypto_sig *crypto_spawn_sig(struct crypto_sig_spawn
+ *spawn)
+{
+ return crypto_spawn_tfm2(&spawn->base);
+}
+
+static inline void crypto_drop_sig(struct crypto_sig_spawn *spawn)
+{
+ crypto_drop_spawn(&spawn->base);
+}
+
+static inline struct sig_alg *crypto_spawn_sig_alg(struct crypto_sig_spawn
+ *spawn)
+{
+ return container_of(spawn->base.alg, struct sig_alg, base);
+}
#endif
diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index b7f3089..81098e0 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -104,9 +104,6 @@ static inline int restrict_link_by_digsig(struct key *dest_keyring,
extern int query_asymmetric_key(const struct kernel_pkey_params *,
struct kernel_pkey_query *);
-extern int encrypt_blob(struct kernel_pkey_params *, const void *, void *);
-extern int decrypt_blob(struct kernel_pkey_params *, const void *, void *);
-extern int create_signature(struct kernel_pkey_params *, const void *, void *);
extern int verify_signature(const struct key *,
const struct public_key_signature *);
diff --git a/include/crypto/sig.h b/include/crypto/sig.h
index d25186b..cff41ad 100644
--- a/include/crypto/sig.h
+++ b/include/crypto/sig.h
@@ -20,6 +20,56 @@ struct crypto_sig {
};
/**
+ * struct sig_alg - generic public key signature algorithm
+ *
+ * @sign: Function performs a sign operation as defined by public key
+ * algorithm. Optional.
+ * @verify: Function performs a complete verify operation as defined by
+ * public key algorithm, returning verification status. Optional.
+ * @set_pub_key: Function invokes the algorithm specific set public key
+ * function, which knows how to decode and interpret
+ * the BER encoded public key and parameters. Mandatory.
+ * @set_priv_key: Function invokes the algorithm specific set private key
+ * function, which knows how to decode and interpret
+ * the BER encoded private key and parameters. Optional.
+ * @key_size: Function returns key size. Mandatory.
+ * @digest_size: Function returns maximum digest size. Optional.
+ * @max_size: Function returns maximum signature size. Optional.
+ * @init: Initialize the cryptographic transformation object.
+ * This function is used to initialize the cryptographic
+ * transformation object. This function is called only once at
+ * the instantiation time, right after the transformation context
+ * was allocated. In case the cryptographic hardware has some
+ * special requirements which need to be handled by software, this
+ * function shall check for the precise requirement of the
+ * transformation and put any software fallbacks in place.
+ * @exit: Deinitialize the cryptographic transformation object. This is a
+ * counterpart to @init, used to remove various changes set in
+ * @init.
+ *
+ * @base: Common crypto API algorithm data structure
+ */
+struct sig_alg {
+ int (*sign)(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ void *dst, unsigned int dlen);
+ int (*verify)(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ const void *digest, unsigned int dlen);
+ int (*set_pub_key)(struct crypto_sig *tfm,
+ const void *key, unsigned int keylen);
+ int (*set_priv_key)(struct crypto_sig *tfm,
+ const void *key, unsigned int keylen);
+ unsigned int (*key_size)(struct crypto_sig *tfm);
+ unsigned int (*digest_size)(struct crypto_sig *tfm);
+ unsigned int (*max_size)(struct crypto_sig *tfm);
+ int (*init)(struct crypto_sig *tfm);
+ void (*exit)(struct crypto_sig *tfm);
+
+ struct crypto_alg base;
+};
+
+/**
* DOC: Generic Public Key Signature API
*
* The Public Key Signature API is used with the algorithms of type
@@ -47,6 +97,21 @@ static inline struct crypto_tfm *crypto_sig_tfm(struct crypto_sig *tfm)
return &tfm->base;
}
+static inline struct crypto_sig *__crypto_sig_tfm(struct crypto_tfm *tfm)
+{
+ return container_of(tfm, struct crypto_sig, base);
+}
+
+static inline struct sig_alg *__crypto_sig_alg(struct crypto_alg *alg)
+{
+ return container_of(alg, struct sig_alg, base);
+}
+
+static inline struct sig_alg *crypto_sig_alg(struct crypto_sig *tfm)
+{
+ return __crypto_sig_alg(crypto_sig_tfm(tfm)->__crt_alg);
+}
+
/**
* crypto_free_sig() - free signature tfm handle
*
@@ -60,16 +125,55 @@ static inline void crypto_free_sig(struct crypto_sig *tfm)
}
/**
- * crypto_sig_maxsize() - Get len for output buffer
+ * crypto_sig_keysize() - Get key size
*
- * Function returns the dest buffer size required for a given key.
+ * Function returns the key size in bytes.
* Function assumes that the key is already set in the transformation. If this
- * function is called without a setkey or with a failed setkey, you will end up
+ * function is called without a setkey or with a failed setkey, you may end up
* in a NULL dereference.
*
* @tfm: signature tfm handle allocated with crypto_alloc_sig()
*/
-int crypto_sig_maxsize(struct crypto_sig *tfm);
+static inline unsigned int crypto_sig_keysize(struct crypto_sig *tfm)
+{
+ struct sig_alg *alg = crypto_sig_alg(tfm);
+
+ return alg->key_size(tfm);
+}
+
+/**
+ * crypto_sig_digestsize() - Get maximum digest size
+ *
+ * Function returns the maximum digest size in bytes.
+ * Function assumes that the key is already set in the transformation. If this
+ * function is called without a setkey or with a failed setkey, you may end up
+ * in a NULL dereference.
+ *
+ * @tfm: signature tfm handle allocated with crypto_alloc_sig()
+ */
+static inline unsigned int crypto_sig_digestsize(struct crypto_sig *tfm)
+{
+ struct sig_alg *alg = crypto_sig_alg(tfm);
+
+ return alg->digest_size(tfm);
+}
+
+/**
+ * crypto_sig_maxsize() - Get maximum signature size
+ *
+ * Function returns the maximum signature size in bytes.
+ * Function assumes that the key is already set in the transformation. If this
+ * function is called without a setkey or with a failed setkey, you may end up
+ * in a NULL dereference.
+ *
+ * @tfm: signature tfm handle allocated with crypto_alloc_sig()
+ */
+static inline unsigned int crypto_sig_maxsize(struct crypto_sig *tfm)
+{
+ struct sig_alg *alg = crypto_sig_alg(tfm);
+
+ return alg->max_size(tfm);
+}
/**
* crypto_sig_sign() - Invoke signing operation
@@ -84,9 +188,14 @@ int crypto_sig_maxsize(struct crypto_sig *tfm);
*
* Return: zero on success; error code in case of error
*/
-int crypto_sig_sign(struct crypto_sig *tfm,
- const void *src, unsigned int slen,
- void *dst, unsigned int dlen);
+static inline int crypto_sig_sign(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ void *dst, unsigned int dlen)
+{
+ struct sig_alg *alg = crypto_sig_alg(tfm);
+
+ return alg->sign(tfm, src, slen, dst, dlen);
+}
/**
* crypto_sig_verify() - Invoke signature verification
@@ -102,9 +211,14 @@ int crypto_sig_sign(struct crypto_sig *tfm,
*
* Return: zero on verification success; error code in case of error.
*/
-int crypto_sig_verify(struct crypto_sig *tfm,
- const void *src, unsigned int slen,
- const void *digest, unsigned int dlen);
+static inline int crypto_sig_verify(struct crypto_sig *tfm,
+ const void *src, unsigned int slen,
+ const void *digest, unsigned int dlen)
+{
+ struct sig_alg *alg = crypto_sig_alg(tfm);
+
+ return alg->verify(tfm, src, slen, digest, dlen);
+}
/**
* crypto_sig_set_pubkey() - Invoke set public key operation
@@ -119,8 +233,13 @@ int crypto_sig_verify(struct crypto_sig *tfm,
*
* Return: zero on success; error code in case of error
*/
-int crypto_sig_set_pubkey(struct crypto_sig *tfm,
- const void *key, unsigned int keylen);
+static inline int crypto_sig_set_pubkey(struct crypto_sig *tfm,
+ const void *key, unsigned int keylen)
+{
+ struct sig_alg *alg = crypto_sig_alg(tfm);
+
+ return alg->set_pub_key(tfm, key, keylen);
+}
/**
* crypto_sig_set_privkey() - Invoke set private key operation
@@ -135,6 +254,11 @@ int crypto_sig_set_pubkey(struct crypto_sig *tfm,
*
* Return: zero on success; error code in case of error
*/
-int crypto_sig_set_privkey(struct crypto_sig *tfm,
- const void *key, unsigned int keylen);
+static inline int crypto_sig_set_privkey(struct crypto_sig *tfm,
+ const void *key, unsigned int keylen)
+{
+ struct sig_alg *alg = crypto_sig_alg(tfm);
+
+ return alg->set_priv_key(tfm, key, keylen);
+}
#endif
diff --git a/include/drm/intel/i915_pciids.h b/include/drm/intel/i915_pciids.h
index 2bf03eb..f355345 100644
--- a/include/drm/intel/i915_pciids.h
+++ b/include/drm/intel/i915_pciids.h
@@ -771,13 +771,24 @@
INTEL_ATS_M150_IDS(MACRO__, ## __VA_ARGS__), \
INTEL_ATS_M75_IDS(MACRO__, ## __VA_ARGS__)
-/* MTL */
-#define INTEL_ARL_IDS(MACRO__, ...) \
- MACRO__(0x7D41, ## __VA_ARGS__), \
+/* ARL */
+#define INTEL_ARL_H_IDS(MACRO__, ...) \
MACRO__(0x7D51, ## __VA_ARGS__), \
- MACRO__(0x7D67, ## __VA_ARGS__), \
MACRO__(0x7DD1, ## __VA_ARGS__)
+#define INTEL_ARL_U_IDS(MACRO__, ...) \
+ MACRO__(0x7D41, ## __VA_ARGS__) \
+
+#define INTEL_ARL_S_IDS(MACRO__, ...) \
+ MACRO__(0x7D67, ## __VA_ARGS__), \
+ MACRO__(0xB640, ## __VA_ARGS__)
+
+#define INTEL_ARL_IDS(MACRO__, ...) \
+ INTEL_ARL_H_IDS(MACRO__, ## __VA_ARGS__), \
+ INTEL_ARL_U_IDS(MACRO__, ## __VA_ARGS__), \
+ INTEL_ARL_S_IDS(MACRO__, ## __VA_ARGS__)
+
+/* MTL */
#define INTEL_MTL_IDS(MACRO__, ...) \
INTEL_ARL_IDS(MACRO__, ## __VA_ARGS__), \
MACRO__(0x7D40, ## __VA_ARGS__), \
diff --git a/include/linux/asn1_decoder.h b/include/linux/asn1_decoder.h
index 83f9c6e..b41bce8 100644
--- a/include/linux/asn1_decoder.h
+++ b/include/linux/asn1_decoder.h
@@ -9,6 +9,7 @@
#define _LINUX_ASN1_DECODER_H
#include <linux/asn1.h>
+#include <linux/types.h>
struct asn1_decoder;
diff --git a/include/linux/asn1_encoder.h b/include/linux/asn1_encoder.h
index 08cd0c2..d17484d 100644
--- a/include/linux/asn1_encoder.h
+++ b/include/linux/asn1_encoder.h
@@ -6,7 +6,6 @@
#include <linux/types.h>
#include <linux/asn1.h>
#include <linux/asn1_ber_bytecode.h>
-#include <linux/bug.h>
#define asn1_oid_len(oid) (sizeof(oid)/sizeof(u32))
unsigned char *
diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h
index dd831c2..dbf0f74 100644
--- a/include/linux/bio-integrity.h
+++ b/include/linux/bio-integrity.h
@@ -72,7 +72,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp,
unsigned int nr);
int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len,
unsigned int offset);
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed);
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len);
void bio_integrity_unmap_user(struct bio *bio);
bool bio_integrity_prep(struct bio *bio);
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done);
@@ -99,7 +99,7 @@ static inline void bioset_integrity_free(struct bio_set *bs)
}
static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
- ssize_t len, u32 seed)
+ ssize_t len)
{
return -EINVAL;
}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index faceadb..60830a6 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -418,8 +418,6 @@ bool __must_check bio_add_folio(struct bio *bio, struct folio *folio,
size_t len, size_t off);
extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
unsigned int, unsigned int);
-int bio_add_zone_append_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset);
void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
@@ -677,6 +675,23 @@ static inline void bio_clear_polled(struct bio *bio)
bio->bi_opf &= ~REQ_POLLED;
}
+/**
+ * bio_is_zone_append - is this a zone append bio?
+ * @bio: bio to check
+ *
+ * Check if @bio is a zone append operation. Core block layer code and end_io
+ * handlers must use this instead of an open coded REQ_OP_ZONE_APPEND check
+ * because the block layer can rewrite REQ_OP_ZONE_APPEND to REQ_OP_WRITE if
+ * it is not natively supported.
+ */
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+ return false;
+ return bio_op(bio) == REQ_OP_ZONE_APPEND ||
+ bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
+}
+
struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
unsigned int nr_pages, blk_opf_t opf, gfp_t gfp);
struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new);
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index 676f8f8..c7eae0b 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -28,7 +28,7 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t,
int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
- ssize_t bytes, u32 seed);
+ ssize_t bytes);
static inline bool
blk_integrity_queue_supports_integrity(struct request_queue *q)
@@ -104,8 +104,7 @@ static inline int blk_rq_map_integrity_sg(struct request *q,
}
static inline int blk_rq_integrity_map_user(struct request *rq,
void __user *ubuf,
- ssize_t bytes,
- u32 seed)
+ ssize_t bytes)
{
return -EINVAL;
}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4fecf46..c596e0e 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -156,9 +156,6 @@ struct request {
struct blk_crypto_keyslot *crypt_keyslot;
#endif
- enum rw_hint write_hint;
- unsigned short ioprio;
-
enum mq_rq_state state;
atomic_t ref;
@@ -222,7 +219,9 @@ static inline bool blk_rq_is_passthrough(struct request *rq)
static inline unsigned short req_get_ioprio(struct request *req)
{
- return req->ioprio;
+ if (req->bio)
+ return req->bio->bi_ioprio;
+ return 0;
}
#define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ)
@@ -230,62 +229,61 @@ static inline unsigned short req_get_ioprio(struct request *req)
#define rq_dma_dir(rq) \
(op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)
-#define rq_list_add(listptr, rq) do { \
- (rq)->rq_next = *(listptr); \
- *(listptr) = rq; \
-} while (0)
-
-#define rq_list_add_tail(lastpptr, rq) do { \
- (rq)->rq_next = NULL; \
- **(lastpptr) = rq; \
- *(lastpptr) = &rq->rq_next; \
-} while (0)
-
-#define rq_list_pop(listptr) \
-({ \
- struct request *__req = NULL; \
- if ((listptr) && *(listptr)) { \
- __req = *(listptr); \
- *(listptr) = __req->rq_next; \
- } \
- __req; \
-})
-
-#define rq_list_peek(listptr) \
-({ \
- struct request *__req = NULL; \
- if ((listptr) && *(listptr)) \
- __req = *(listptr); \
- __req; \
-})
-
-#define rq_list_for_each(listptr, pos) \
- for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos))
-
-#define rq_list_for_each_safe(listptr, pos, nxt) \
- for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos); \
- pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL)
-
-#define rq_list_next(rq) (rq)->rq_next
-#define rq_list_empty(list) ((list) == (struct request *) NULL)
-
-/**
- * rq_list_move() - move a struct request from one list to another
- * @src: The source list @rq is currently in
- * @dst: The destination list that @rq will be appended to
- * @rq: The request to move
- * @prev: The request preceding @rq in @src (NULL if @rq is the head)
- */
-static inline void rq_list_move(struct request **src, struct request **dst,
- struct request *rq, struct request *prev)
+static inline int rq_list_empty(const struct rq_list *rl)
{
- if (prev)
- prev->rq_next = rq->rq_next;
- else
- *src = rq->rq_next;
- rq_list_add(dst, rq);
+ return rl->head == NULL;
}
+static inline void rq_list_init(struct rq_list *rl)
+{
+ rl->head = NULL;
+ rl->tail = NULL;
+}
+
+static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq)
+{
+ rq->rq_next = NULL;
+ if (rl->tail)
+ rl->tail->rq_next = rq;
+ else
+ rl->head = rq;
+ rl->tail = rq;
+}
+
+static inline void rq_list_add_head(struct rq_list *rl, struct request *rq)
+{
+ rq->rq_next = rl->head;
+ rl->head = rq;
+ if (!rl->tail)
+ rl->tail = rq;
+}
+
+static inline struct request *rq_list_pop(struct rq_list *rl)
+{
+ struct request *rq = rl->head;
+
+ if (rq) {
+ rl->head = rl->head->rq_next;
+ if (!rl->head)
+ rl->tail = NULL;
+ rq->rq_next = NULL;
+ }
+
+ return rq;
+}
+
+static inline struct request *rq_list_peek(struct rq_list *rl)
+{
+ return rl->head;
+}
+
+#define rq_list_for_each(rl, pos) \
+ for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next)
+
+#define rq_list_for_each_safe(rl, pos, nxt) \
+ for (pos = rq_list_peek((rl)), nxt = pos->rq_next; \
+ pos; pos = nxt, nxt = pos ? pos->rq_next : NULL)
+
/**
* enum blk_eh_timer_return - How the timeout handler should proceed
* @BLK_EH_DONE: The block driver completed the command or will complete it at
@@ -577,7 +575,7 @@ struct blk_mq_ops {
* empty the @rqlist completely, then the rest will be queued
* individually by the block layer upon return.
*/
- void (*queue_rqs)(struct request **rqlist);
+ void (*queue_rqs)(struct rq_list *rqlist);
/**
* @get_budget: Reserve budget before queue request, once .queue_rq is
@@ -857,12 +855,6 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib);
*/
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
- /*
- * passthrough io doesn't use iostat accounting, cgroup stats
- * and io scheduler functionalities.
- */
- if (blk_rq_is_passthrough(rq))
- return false;
return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}
@@ -892,7 +884,7 @@ static inline bool blk_mq_add_to_batch(struct request *req,
else if (iob->complete != complete)
return false;
iob->need_ts |= blk_mq_need_time_stamp(req);
- rq_list_add(&iob->req_list, req);
+ rq_list_add_tail(&iob->req_list, req);
return true;
}
@@ -925,6 +917,8 @@ void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
unsigned long timeout);
+void blk_mq_unfreeze_queue_non_owner(struct request_queue *q);
+void blk_freeze_queue_start_non_owner(struct request_queue *q);
void blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
@@ -989,7 +983,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
rq->nr_phys_segments = nr_segs;
rq->__data_len = bio->bi_iter.bi_size;
rq->bio = rq->biotail = bio;
- rq->ioprio = bio_prio(bio);
}
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50c3b95..a1fd0dd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -25,6 +25,7 @@
#include <linux/uuid.h>
#include <linux/xarray.h>
#include <linux/file.h>
+#include <linux/lockdep.h>
struct module;
struct request_queue;
@@ -194,7 +195,7 @@ struct gendisk {
unsigned int nr_zones;
unsigned int zone_capacity;
unsigned int last_zone_capacity;
- unsigned long *conv_zones_bitmap;
+ unsigned long __rcu *conv_zones_bitmap;
unsigned int zone_wplugs_hash_bits;
spinlock_t zone_wplugs_lock;
struct mempool_s *zone_wplugs_pool;
@@ -349,6 +350,9 @@ typedef unsigned int __bitwise blk_flags_t;
/* I/O topology is misaligned */
#define BLK_FLAG_MISALIGNED ((__force blk_flags_t)(1u << 1))
+/* passthrough command IO accounting */
+#define BLK_FLAG_IOSTATS_PASSTHROUGH ((__force blk_flags_t)(1u << 2))
+
struct queue_limits {
blk_features_t features;
blk_flags_t flags;
@@ -371,6 +375,7 @@ struct queue_limits {
unsigned int max_user_discard_sectors;
unsigned int max_secure_erase_sectors;
unsigned int max_write_zeroes_sectors;
+ unsigned int max_hw_zone_append_sectors;
unsigned int max_zone_append_sectors;
unsigned int discard_granularity;
unsigned int discard_alignment;
@@ -471,6 +476,11 @@ struct request_queue {
struct xarray hctx_table;
struct percpu_ref q_usage_counter;
+ struct lock_class_key io_lock_cls_key;
+ struct lockdep_map io_lockdep_map;
+
+ struct lock_class_key q_lock_cls_key;
+ struct lockdep_map q_lockdep_map;
struct request *last_merge;
@@ -566,6 +576,10 @@ struct request_queue {
struct throtl_data *td;
#endif
struct rcu_head rcu_head;
+#ifdef CONFIG_LOCKDEP
+ struct task_struct *mq_freeze_owner;
+ int mq_freeze_owner_depth;
+#endif
wait_queue_head_t mq_freeze_wq;
/*
* Protect concurrent access to q_usage_counter by
@@ -617,6 +631,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
#define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL))
#define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT)
+#define blk_queue_passthrough_stat(q) \
+ ((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH)
#define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX)
#define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA)
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
@@ -725,6 +741,9 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
#define for_each_bio(_bio) \
for (; _bio; _bio = _bio->bi_next)
+int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups,
+ struct fwnode_handle *fwnode);
int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups);
static inline int __must_check add_disk(struct gendisk *disk)
@@ -929,6 +948,7 @@ queue_limits_start_update(struct request_queue *q)
int queue_limits_commit_update(struct request_queue *q,
struct queue_limits *lim);
int queue_limits_set(struct request_queue *q, struct queue_limits *lim);
+int blk_validate_limits(struct queue_limits *lim);
/**
* queue_limits_cancel_update - cancel an atomic update of queue limits
@@ -986,6 +1006,11 @@ extern void blk_put_queue(struct request_queue *);
void blk_mark_disk_dead(struct gendisk *disk);
+struct rq_list {
+ struct request *head;
+ struct request *tail;
+};
+
#ifdef CONFIG_BLOCK
/*
* blk_plug permits building a queue of related requests by holding the I/O
@@ -999,10 +1024,10 @@ void blk_mark_disk_dead(struct gendisk *disk);
* blk_flush_plug() is called.
*/
struct blk_plug {
- struct request *mq_list; /* blk-mq requests */
+ struct rq_list mq_list; /* blk-mq requests */
/* if ios_left is > 1, we can batch tag/rq allocations */
- struct request *cached_rq;
+ struct rq_list cached_rqs;
u64 cur_ktime;
unsigned short nr_ios;
@@ -1145,6 +1170,11 @@ enum blk_default_limits {
*/
#define BLK_DEF_MAX_SECTORS_CAP 2560u
+static inline struct queue_limits *bdev_limits(struct block_device *bdev)
+{
+ return &bdev_get_queue(bdev)->limits;
+}
+
static inline unsigned long queue_segment_boundary(const struct request_queue *q)
{
return q->limits.seg_boundary_mask;
@@ -1185,25 +1215,9 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
return q->limits.max_segment_size;
}
-static inline unsigned int
-queue_limits_max_zone_append_sectors(const struct queue_limits *l)
-{
- unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
-
- return min_not_zero(l->max_zone_append_sectors, max_sectors);
-}
-
-static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
-{
- if (!blk_queue_is_zoned(q))
- return 0;
-
- return queue_limits_max_zone_append_sectors(&q->limits);
-}
-
static inline bool queue_emulates_zone_append(struct request_queue *q)
{
- return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+ return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors;
}
static inline bool bdev_emulates_zone_append(struct block_device *bdev)
@@ -1214,7 +1228,7 @@ static inline bool bdev_emulates_zone_append(struct block_device *bdev)
static inline unsigned int
bdev_max_zone_append_sectors(struct block_device *bdev)
{
- return queue_max_zone_append_sectors(bdev_get_queue(bdev));
+ return bdev_limits(bdev)->max_zone_append_sectors;
}
static inline unsigned int bdev_max_segments(struct block_device *bdev)
@@ -1279,23 +1293,23 @@ unsigned int bdev_discard_alignment(struct block_device *bdev);
static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev)
{
- return bdev_get_queue(bdev)->limits.max_discard_sectors;
+ return bdev_limits(bdev)->max_discard_sectors;
}
static inline unsigned int bdev_discard_granularity(struct block_device *bdev)
{
- return bdev_get_queue(bdev)->limits.discard_granularity;
+ return bdev_limits(bdev)->discard_granularity;
}
static inline unsigned int
bdev_max_secure_erase_sectors(struct block_device *bdev)
{
- return bdev_get_queue(bdev)->limits.max_secure_erase_sectors;
+ return bdev_limits(bdev)->max_secure_erase_sectors;
}
static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
{
- return bdev_get_queue(bdev)->limits.max_write_zeroes_sectors;
+ return bdev_limits(bdev)->max_write_zeroes_sectors;
}
static inline bool bdev_nonrot(struct block_device *bdev)
@@ -1331,7 +1345,7 @@ static inline bool bdev_write_cache(struct block_device *bdev)
static inline bool bdev_fua(struct block_device *bdev)
{
- return bdev_get_queue(bdev)->limits.features & BLK_FEAT_FUA;
+ return bdev_limits(bdev)->features & BLK_FEAT_FUA;
}
static inline bool bdev_nowait(struct block_device *bdev)
@@ -1376,6 +1390,33 @@ static inline bool bdev_is_zone_start(struct block_device *bdev,
return bdev_offset_from_zone_start(bdev, sector) == 0;
}
+/**
+ * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
+ * @bdev: block device to check
+ * @sector: sector number
+ *
+ * Check if @sector on @bdev is contained in a sequential write required zone.
+ */
+static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
+{
+ bool is_seq = false;
+
+#if IS_ENABLED(CONFIG_BLK_DEV_ZONED)
+ if (bdev_is_zoned(bdev)) {
+ struct gendisk *disk = bdev->bd_disk;
+ unsigned long *bitmap;
+
+ rcu_read_lock();
+ bitmap = rcu_dereference(disk->conv_zones_bitmap);
+ is_seq = !bitmap ||
+ !test_bit(disk_zone_no(disk, sector), bitmap);
+ rcu_read_unlock();
+ }
+#endif
+
+ return is_seq;
+}
+
static inline int queue_dma_alignment(const struct request_queue *q)
{
return q->limits.dma_alignment;
@@ -1648,7 +1689,7 @@ int bdev_thaw(struct block_device *bdev);
void bdev_fput(struct file *bdev_file);
struct io_comp_batch {
- struct request *req_list;
+ struct rq_list req_list;
bool need_ts;
void (*complete)(struct io_comp_batch *);
};
@@ -1674,6 +1715,22 @@ static inline bool bdev_can_atomic_write(struct block_device *bdev)
return true;
}
+static inline unsigned int
+bdev_atomic_write_unit_min_bytes(struct block_device *bdev)
+{
+ if (!bdev_can_atomic_write(bdev))
+ return 0;
+ return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev));
+}
+
+static inline unsigned int
+bdev_atomic_write_unit_max_bytes(struct block_device *bdev)
+{
+ if (!bdev_can_atomic_write(bdev))
+ return 0;
+ return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev));
+}
+
#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { }
#endif /* _LINUX_BLKDEV_H */
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index 038b2d5..875c998 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -234,7 +234,7 @@ const volatile void * __must_check_fn(const volatile void *val)
* DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd)
*
* CLASS(fdget, f)(fd);
- * if (!fd_file(f))
+ * if (fd_empty(f))
* return -EBADF;
*
* // use 'f' without concern
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 2361ed4..61d9a66 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -227,6 +227,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_ARM_APM_XGENE_ONLINE,
CPUHP_AP_PERF_ARM_CAVIUM_TX2_UNCORE_ONLINE,
CPUHP_AP_PERF_ARM_MARVELL_CN10K_DDR_ONLINE,
+ CPUHP_AP_PERF_ARM_MRVL_PEM_ONLINE,
CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 1ff5202..752e0b2 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -55,6 +55,8 @@ struct em_perf_table {
* struct em_perf_domain - Performance domain
* @em_table: Pointer to the runtime modifiable em_perf_table
* @nr_perf_states: Number of performance states
+ * @min_perf_state: Minimum allowed Performance State index
+ * @max_perf_state: Maximum allowed Performance State index
* @flags: See "em_perf_domain flags"
* @cpus: Cpumask covering the CPUs of the domain. It's here
* for performance reasons to avoid potential cache
@@ -70,6 +72,8 @@ struct em_perf_table {
struct em_perf_domain {
struct em_perf_table __rcu *em_table;
int nr_perf_states;
+ int min_perf_state;
+ int max_perf_state;
unsigned long flags;
unsigned long cpus[];
};
@@ -173,13 +177,14 @@ void em_table_free(struct em_perf_table __rcu *table);
int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
int nr_states);
int em_dev_update_chip_binning(struct device *dev);
+int em_update_performance_limits(struct em_perf_domain *pd,
+ unsigned long freq_min_khz, unsigned long freq_max_khz);
/**
* em_pd_get_efficient_state() - Get an efficient performance state from the EM
* @table: List of performance states, in ascending order
- * @nr_perf_states: Number of performance states
+ * @pd: performance domain for which this must be done
* @max_util: Max utilization to map with the EM
- * @pd_flags: Performance Domain flags
*
* It is called from the scheduler code quite frequently and as a consequence
* doesn't implement any check.
@@ -188,13 +193,16 @@ int em_dev_update_chip_binning(struct device *dev);
* requirement.
*/
static inline int
-em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
- unsigned long max_util, unsigned long pd_flags)
+em_pd_get_efficient_state(struct em_perf_state *table,
+ struct em_perf_domain *pd, unsigned long max_util)
{
+ unsigned long pd_flags = pd->flags;
+ int min_ps = pd->min_perf_state;
+ int max_ps = pd->max_perf_state;
struct em_perf_state *ps;
int i;
- for (i = 0; i < nr_perf_states; i++) {
+ for (i = min_ps; i <= max_ps; i++) {
ps = &table[i];
if (ps->performance >= max_util) {
if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
@@ -204,7 +212,7 @@ em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
}
}
- return nr_perf_states - 1;
+ return max_ps;
}
/**
@@ -253,8 +261,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
* requested performance.
*/
em_table = rcu_dereference(pd->em_table);
- i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
- max_util, pd->flags);
+ i = em_pd_get_efficient_state(em_table->state, pd, max_util);
ps = &em_table->state[i];
/*
@@ -391,6 +398,12 @@ static inline int em_dev_update_chip_binning(struct device *dev)
{
return -EINVAL;
}
+static inline
+int em_update_performance_limits(struct em_perf_domain *pd,
+ unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+ return -EINVAL;
+}
#endif
#endif
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 3337745..0c0d00f 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -42,7 +42,7 @@ static inline void eventpoll_release(struct file *file)
* because the file in on the way to be removed and nobody ( but
* eventpoll ) has still a reference to this file.
*/
- if (likely(!file->f_ep))
+ if (likely(!READ_ONCE(file->f_ep)))
return;
/*
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 893a1d2..1ab165c 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -250,19 +250,6 @@ struct export_operations {
unsigned long flags;
};
-/**
- * exportfs_lock_op_is_async() - export op supports async lock operation
- * @export_ops: the nfs export operations to check
- *
- * Returns true if the nfs export_operations structure has
- * EXPORT_OP_ASYNC_LOCK in their flags set
- */
-static inline bool
-exportfs_lock_op_is_async(const struct export_operations *export_ops)
-{
- return export_ops->flags & EXPORT_OP_ASYNC_LOCK;
-}
-
extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
int *max_len, struct inode *parent,
int flags);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index b1c5722..c45306a 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -92,10 +92,6 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
return files_lookup_fd_raw(files, fd);
}
-struct file *lookup_fdget_rcu(unsigned int fd);
-struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
-struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);
-
static inline bool close_on_exec(unsigned int fd, const struct files_struct *files)
{
return test_bit(fd, files_fdtable(files)->close_on_exec);
@@ -115,7 +111,6 @@ int iterate_fd(struct files_struct *, unsigned,
const void *);
extern int close_fd(unsigned int fd);
-extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
extern struct file *file_close_fd(unsigned int fd);
extern struct kmem_cache *files_cachep;
diff --git a/include/linux/file.h b/include/linux/file.h
index f98de14..302f113 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -30,12 +30,6 @@ extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount
extern struct file *alloc_file_clone(struct file *, int flags,
const struct file_operations *);
-static inline void fput_light(struct file *file, int fput_needed)
-{
- if (fput_needed)
- fput(file);
-}
-
/* either a reference to struct file + flags
* (cloned vs. borrowed, pos locked), with
* flags stored in lower bits of value,
@@ -72,6 +66,7 @@ static inline void fdput(struct fd fd)
extern struct file *fget(unsigned int fd);
extern struct file *fget_raw(unsigned int fd);
extern struct file *fget_task(struct task_struct *task, unsigned int fd);
+extern struct file *fget_task_next(struct task_struct *task, unsigned int *fd);
extern void __f_unlock_pos(struct file *);
struct fd fdget(unsigned int fd);
@@ -87,6 +82,7 @@ static inline void fdput_pos(struct fd f)
DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd)
DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd)
+DEFINE_CLASS(fd_pos, struct fd, fdput_pos(_T), fdget_pos(fd), int fd)
extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h
new file mode 100644
index 0000000..9b3a8d9
--- /dev/null
+++ b/include/linux/file_ref.h
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _LINUX_FILE_REF_H
+#define _LINUX_FILE_REF_H
+
+#include <linux/atomic.h>
+#include <linux/preempt.h>
+#include <linux/types.h>
+
+/*
+ * file_ref is a reference count implementation specifically for use by
+ * files. It takes inspiration from rcuref but differs in key aspects
+ * such as support for SLAB_TYPESAFE_BY_RCU type caches.
+ *
+ * FILE_REF_ONEREF FILE_REF_MAXREF
+ * 0x0000000000000000UL 0x7FFFFFFFFFFFFFFFUL
+ * <-------------------valid ------------------->
+ *
+ * FILE_REF_SATURATED
+ * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL
+ * <-----------------------saturation zone---------------------->
+ *
+ * FILE_REF_RELEASED FILE_REF_DEAD
+ * 0xC000000000000000UL 0xE000000000000000UL
+ * <-------------------dead zone------------------->
+ *
+ * FILE_REF_NOREF
+ * 0xFFFFFFFFFFFFFFFFUL
+ */
+
+#ifdef CONFIG_64BIT
+#define FILE_REF_ONEREF 0x0000000000000000UL
+#define FILE_REF_MAXREF 0x7FFFFFFFFFFFFFFFUL
+#define FILE_REF_SATURATED 0xA000000000000000UL
+#define FILE_REF_RELEASED 0xC000000000000000UL
+#define FILE_REF_DEAD 0xE000000000000000UL
+#define FILE_REF_NOREF 0xFFFFFFFFFFFFFFFFUL
+#else
+#define FILE_REF_ONEREF 0x00000000U
+#define FILE_REF_MAXREF 0x7FFFFFFFU
+#define FILE_REF_SATURATED 0xA0000000U
+#define FILE_REF_RELEASED 0xC0000000U
+#define FILE_REF_DEAD 0xE0000000U
+#define FILE_REF_NOREF 0xFFFFFFFFU
+#endif
+
+typedef struct {
+#ifdef CONFIG_64BIT
+ atomic64_t refcnt;
+#else
+ atomic_t refcnt;
+#endif
+} file_ref_t;
+
+/**
+ * file_ref_init - Initialize a file reference count
+ * @ref: Pointer to the reference count
+ * @cnt: The initial reference count typically '1'
+ */
+static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
+{
+ atomic_long_set(&ref->refcnt, cnt - 1);
+}
+
+bool __file_ref_put(file_ref_t *ref, unsigned long cnt);
+
+/**
+ * file_ref_get - Acquire one reference on a file
+ * @ref: Pointer to the reference count
+ *
+ * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF.
+ *
+ * Provides full memory ordering.
+ *
+ * Return: False if the attempt to acquire a reference failed. This happens
+ * when the last reference has been put already. True if a reference
+ * was successfully acquired
+ */
+static __always_inline __must_check bool file_ref_get(file_ref_t *ref)
+{
+ /*
+ * Unconditionally increase the reference count with full
+ * ordering. The saturation and dead zones provide enough
+ * tolerance for this.
+ *
+ * If this indicates negative the file in question the fail can
+ * be freed and immediately reused due to SLAB_TYPSAFE_BY_RCU.
+ * Hence, unconditionally altering the file reference count to
+ * e.g., reset the file reference count back to the middle of
+ * the deadzone risk end up marking someone else's file as dead
+ * behind their back.
+ *
+ * It would be possible to do a careful:
+ *
+ * cnt = atomic_long_inc_return();
+ * if (likely(cnt >= 0))
+ * return true;
+ *
+ * and then something like:
+ *
+ * if (cnt >= FILE_REF_RELEASE)
+ * atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD),
+ *
+ * to set the value back to the middle of the deadzone. But it's
+ * practically impossible to go from FILE_REF_DEAD to
+ * FILE_REF_ONEREF. It would need 2305843009213693952/2^61
+ * file_ref_get()s to resurrect such a dead file.
+ */
+ return !atomic_long_add_negative(1, &ref->refcnt);
+}
+
+/**
+ * file_ref_inc - Acquire one reference on a file
+ * @ref: Pointer to the reference count
+ *
+ * Acquire an additional reference on a file. Warns if the caller didn't
+ * already hold a reference.
+ */
+static __always_inline void file_ref_inc(file_ref_t *ref)
+{
+ long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt);
+ WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference");
+}
+
+/**
+ * file_ref_put -- Release a file reference
+ * @ref: Pointer to the reference count
+ *
+ * Provides release memory ordering, such that prior loads and stores
+ * are done before, and provides an acquire ordering on success such
+ * that free() must come after.
+ *
+ * Return: True if this was the last reference with no future references
+ * possible. This signals the caller that it can safely release
+ * the object which is protected by the reference counter.
+ * False if there are still active references or the put() raced
+ * with a concurrent get()/put() pair. Caller is not allowed to
+ * release the protected object.
+ */
+static __always_inline __must_check bool file_ref_put(file_ref_t *ref)
+{
+ long cnt;
+
+ /*
+ * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put()
+ * calls don't risk UAFs when a file is recyclyed, it is still
+ * vulnerable to UAFs caused by freeing the whole slab page once
+ * it becomes unused. Prevent file_ref_put() from being
+ * preempted protects against this.
+ */
+ guard(preempt)();
+ /*
+ * Unconditionally decrease the reference count. The saturation
+ * and dead zones provide enough tolerance for this. If this
+ * fails then we need to handle the last reference drop and
+ * cases inside the saturation and dead zones.
+ */
+ cnt = atomic_long_dec_return(&ref->refcnt);
+ if (cnt >= 0)
+ return false;
+ return __file_ref_put(ref, cnt);
+}
+
+/**
+ * file_ref_read - Read the number of file references
+ * @ref: Pointer to the reference count
+ *
+ * Return: The number of held references (0 ... N)
+ */
+static inline unsigned long file_ref_read(file_ref_t *ref)
+{
+ unsigned long c = atomic_long_read(&ref->refcnt);
+
+ /* Return 0 if within the DEAD zone. */
+ return c >= FILE_REF_RELEASED ? 0 : c + 1;
+}
+
+#endif
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index bb44224..c412ded 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -180,6 +180,11 @@ static inline void locks_wake_up(struct file_lock *fl)
wake_up(&fl->c.flc_wait);
}
+static inline bool locks_can_async_lock(const struct file_operations *fops)
+{
+ return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK;
+}
+
/* fs/locks.c */
void locks_free_lock_context(struct inode *inode);
void locks_free_lock(struct file_lock *fl);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3559446..7e29433 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -45,6 +45,8 @@
#include <linux/slab.h>
#include <linux/maple_tree.h>
#include <linux/rw_hint.h>
+#include <linux/file_ref.h>
+#include <linux/unicode.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -623,6 +625,7 @@ is_uncached_acl(struct posix_acl *acl)
#define IOP_NOFOLLOW 0x0004
#define IOP_XATTR 0x0008
#define IOP_DEFAULT_READLINK 0x0010
+#define IOP_MGTIME 0x0020
/*
* Keep mostly read-only and often accessed (especially for
@@ -1005,7 +1008,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
/**
* struct file - Represents a file
- * @f_count: reference count
+ * @f_ref: reference count
* @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
* @f_mode: FMODE_* flags often used in hotpaths
* @f_op: file operations
@@ -1030,7 +1033,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
* @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
*/
struct file {
- atomic_long_t f_count;
+ file_ref_t f_ref;
spinlock_t f_lock;
fmode_t f_mode;
const struct file_operations *f_op;
@@ -1078,15 +1081,14 @@ struct file_handle {
static inline struct file *get_file(struct file *f)
{
- long prior = atomic_long_fetch_inc_relaxed(&f->f_count);
- WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
+ file_ref_inc(&f->f_ref);
return f;
}
struct file *get_file_rcu(struct file __rcu **f);
struct file *get_file_active(struct file **f);
-#define file_count(x) atomic_long_read(&(x)->f_count)
+#define file_count(f) file_ref_read(&(f)->f_ref)
#define MAX_NON_LFS ((1UL<<31) - 1)
@@ -1584,6 +1586,8 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
struct timespec64 current_time(struct inode *inode);
struct timespec64 inode_set_ctime_current(struct inode *inode);
+struct timespec64 inode_set_ctime_deleg(struct inode *inode,
+ struct timespec64 update);
static inline time64_t inode_get_atime_sec(const struct inode *inode)
{
@@ -1653,6 +1657,17 @@ static inline struct timespec64 inode_set_mtime(struct inode *inode,
return inode_set_mtime_to_ts(inode, ts);
}
+/*
+ * Multigrain timestamps
+ *
+ * Conditionally use fine-grained ctime and mtime timestamps when there
+ * are users actively observing them via getattr. The primary use-case
+ * for this is NFS clients that use the ctime to distinguish between
+ * different states of the file, and that are often fooled by multiple
+ * operations that occur in the same coarse-grained timer tick.
+ */
+#define I_CTIME_QUERIED ((u32)BIT(31))
+
static inline time64_t inode_get_ctime_sec(const struct inode *inode)
{
return inode->i_ctime_sec;
@@ -1660,7 +1675,7 @@ static inline time64_t inode_get_ctime_sec(const struct inode *inode)
static inline long inode_get_ctime_nsec(const struct inode *inode)
{
- return inode->i_ctime_nsec;
+ return inode->i_ctime_nsec & ~I_CTIME_QUERIED;
}
static inline struct timespec64 inode_get_ctime(const struct inode *inode)
@@ -1671,13 +1686,7 @@ static inline struct timespec64 inode_get_ctime(const struct inode *inode)
return ts;
}
-static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
- struct timespec64 ts)
-{
- inode->i_ctime_sec = ts.tv_sec;
- inode->i_ctime_nsec = ts.tv_nsec;
- return ts;
-}
+struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts);
/**
* inode_set_ctime - set the ctime in the inode
@@ -2116,6 +2125,8 @@ struct file_operations {
#define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4))
/* Treat loff_t as unsigned (e.g., /dev/mem) */
#define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5))
+/* Supports asynchronous lock callbacks */
+#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6))
/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,
@@ -2542,6 +2553,7 @@ struct file_system_type {
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
#define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */
+#define FS_MGTIME 64 /* FS uses multigrain timestamps */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
@@ -2565,6 +2577,17 @@ struct file_system_type {
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
+/**
+ * is_mgtime: is this inode using multigrain timestamps
+ * @inode: inode to test for multigrain timestamps
+ *
+ * Return true if the inode uses multigrain timestamps, false otherwise.
+ */
+static inline bool is_mgtime(const struct inode *inode)
+{
+ return inode->i_opflags & IOP_MGTIME;
+}
+
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int));
@@ -2766,6 +2789,16 @@ extern struct filename *getname_flags(const char __user *, int);
extern struct filename *getname_uflags(const char __user *, int);
extern struct filename *getname(const char __user *);
extern struct filename *getname_kernel(const char *);
+extern struct filename *__getname_maybe_null(const char __user *);
+static inline struct filename *getname_maybe_null(const char __user *name, int flags)
+{
+ if (!(flags & AT_EMPTY_PATH))
+ return getname(name);
+
+ if (!name)
+ return NULL;
+ return __getname_maybe_null(name);
+}
extern void putname(struct filename *name);
extern int finish_open(struct file *file, struct dentry *dentry,
@@ -3326,6 +3359,7 @@ extern void page_put_link(void *);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
void generic_fill_statx_atomic_writes(struct kstat *stat,
@@ -3456,6 +3490,54 @@ extern int generic_ci_match(const struct inode *parent,
const struct qstr *folded_name,
const u8 *de_name, u32 de_name_len);
+#if IS_ENABLED(CONFIG_UNICODE)
+int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
+int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name);
+
+/**
+ * generic_ci_validate_strict_name - Check if a given name is suitable
+ * for a directory
+ *
+ * This functions checks if the proposed filename is valid for the
+ * parent directory. That means that only valid UTF-8 filenames will be
+ * accepted for casefold directories from filesystems created with the
+ * strict encoding flag. That also means that any name will be
+ * accepted for directories that doesn't have casefold enabled, or
+ * aren't being strict with the encoding.
+ *
+ * @dir: inode of the directory where the new file will be created
+ * @name: name of the new file
+ *
+ * Return:
+ * * True: if the filename is suitable for this directory. It can be
+ * true if a given name is not suitable for a strict encoding
+ * directory, but the directory being used isn't strict
+ * * False if the filename isn't suitable for this directory. This only
+ * happens when a directory is casefolded and the filesystem is strict
+ * about its encoding.
+ */
+static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
+{
+ if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb))
+ return true;
+
+ /*
+ * A casefold dir must have a encoding set, unless the filesystem
+ * is corrupted
+ */
+ if (WARN_ON_ONCE(!dir->i_sb->s_encoding))
+ return true;
+
+ return !utf8_validate(dir->i_sb->s_encoding, name);
+}
+#else
+static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
+{
+ return true;
+}
+#endif
+
static inline bool sb_has_encoding(const struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
@@ -3726,6 +3808,6 @@ static inline bool vfs_empty_path(int dfd, const char __user *path)
return !c;
}
-bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos);
+int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
#endif /* _LINUX_FS_H */
diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h
index 6cf713a..3cef566 100644
--- a/include/linux/fs_parser.h
+++ b/include/linux/fs_parser.h
@@ -28,7 +28,8 @@ typedef int fs_param_type(struct p_log *,
*/
fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64,
fs_param_is_enum, fs_param_is_string, fs_param_is_blob, fs_param_is_blockdev,
- fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid;
+ fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid,
+ fs_param_is_file_or_string;
/*
* Specification of the type of value a parameter wants.
@@ -133,6 +134,8 @@ static inline bool fs_validate_description(const char *name,
#define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL)
#define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL)
#define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL)
+#define fsparam_file_or_string(NAME, OPT) \
+ __fsparam(fs_param_is_file_or_string, NAME, OPT, 0, NULL)
#define fsparam_uid(NAME, OPT) __fsparam(fs_param_is_uid, NAME, OPT, 0, NULL)
#define fsparam_gid(NAME, OPT) __fsparam(fs_param_is_gid, NAME, OPT, 0, NULL)
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
index 9d7754a..6dbd0d4 100644
--- a/include/linux/hisi_acc_qm.h
+++ b/include/linux/hisi_acc_qm.h
@@ -229,6 +229,12 @@ struct hisi_qm_status {
struct hisi_qm;
+enum acc_err_result {
+ ACC_ERR_NONE,
+ ACC_ERR_NEED_RESET,
+ ACC_ERR_RECOVERED,
+};
+
struct hisi_qm_err_info {
char *acpi_rst;
u32 msi_wr_port;
@@ -257,9 +263,9 @@ struct hisi_qm_err_ini {
void (*close_axi_master_ooo)(struct hisi_qm *qm);
void (*open_sva_prefetch)(struct hisi_qm *qm);
void (*close_sva_prefetch)(struct hisi_qm *qm);
- void (*log_dev_hw_err)(struct hisi_qm *qm, u32 err_sts);
void (*show_last_dfx_regs)(struct hisi_qm *qm);
void (*err_info_init)(struct hisi_qm *qm);
+ enum acc_err_result (*get_err_result)(struct hisi_qm *qm);
};
struct hisi_qm_cap_info {
@@ -274,13 +280,25 @@ struct hisi_qm_cap_info {
u32 v3_val;
};
+struct hisi_qm_cap_query_info {
+ u32 type;
+ const char *name;
+ u32 offset;
+ u32 v1_val;
+ u32 v2_val;
+ u32 v3_val;
+};
+
struct hisi_qm_cap_record {
u32 type;
+ const char *name;
u32 cap_val;
};
struct hisi_qm_cap_tables {
+ u32 qm_cap_size;
struct hisi_qm_cap_record *qm_cap_table;
+ u32 dev_cap_size;
struct hisi_qm_cap_record *dev_cap_table;
};
@@ -436,37 +454,6 @@ struct hisi_qp {
struct uacce_queue *uacce_q;
};
-static inline int q_num_set(const char *val, const struct kernel_param *kp,
- unsigned int device)
-{
- struct pci_dev *pdev;
- u32 n, q_num;
- int ret;
-
- if (!val)
- return -EINVAL;
-
- pdev = pci_get_device(PCI_VENDOR_ID_HUAWEI, device, NULL);
- if (!pdev) {
- q_num = min_t(u32, QM_QNUM_V1, QM_QNUM_V2);
- pr_info("No device found currently, suppose queue number is %u\n",
- q_num);
- } else {
- if (pdev->revision == QM_HW_V1)
- q_num = QM_QNUM_V1;
- else
- q_num = QM_QNUM_V2;
-
- pci_dev_put(pdev);
- }
-
- ret = kstrtou32(val, 10, &n);
- if (ret || n < QM_MIN_QNUM || n > q_num)
- return -EINVAL;
-
- return param_set_int(val, kp);
-}
-
static inline int vfs_num_set(const char *val, const struct kernel_param *kp)
{
u32 n;
@@ -526,6 +513,8 @@ static inline void hisi_qm_del_list(struct hisi_qm *qm, struct hisi_qm_list *qm_
mutex_unlock(&qm_list->lock);
}
+int hisi_qm_q_num_set(const char *val, const struct kernel_param *kp,
+ unsigned int device);
int hisi_qm_init(struct hisi_qm *qm);
void hisi_qm_uninit(struct hisi_qm *qm);
int hisi_qm_start(struct hisi_qm *qm);
@@ -583,6 +572,9 @@ void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset);
u32 hisi_qm_get_hw_info(struct hisi_qm *qm,
const struct hisi_qm_cap_info *info_table,
u32 index, bool is_read);
+u32 hisi_qm_get_cap_value(struct hisi_qm *qm,
+ const struct hisi_qm_cap_query_info *info_table,
+ u32 index, bool is_read);
int hisi_qm_set_algs(struct hisi_qm *qm, u64 alg_msk, const struct qm_dev_alg *dev_algs,
u32 dev_algs_size);
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index c189d36..578a3fd 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -110,7 +110,7 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
{
- return cmd_to_io_kiocb(cmd)->task;
+ return cmd_to_io_kiocb(cmd)->tctx->task;
}
#endif /* _LINUX_IO_URING_CMD_H */
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 4b9ba52..593c10a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -37,6 +37,7 @@ enum io_uring_cmd_flags {
/* set when uring wants to cancel a previously issued command */
IO_URING_F_CANCEL = (1 << 11),
IO_URING_F_COMPAT = (1 << 12),
+ IO_URING_F_TASK_DEAD = (1 << 13),
};
struct io_wq_work_node {
@@ -55,19 +56,18 @@ struct io_wq_work {
int cancel_seq;
};
-struct io_fixed_file {
- /* file * with additional FFS_* flags */
- unsigned long file_ptr;
+struct io_rsrc_data {
+ unsigned int nr;
+ struct io_rsrc_node **nodes;
};
struct io_file_table {
- struct io_fixed_file *files;
+ struct io_rsrc_data data;
unsigned long *bitmap;
unsigned int alloc_hint;
};
struct io_hash_bucket {
- spinlock_t lock;
struct hlist_head list;
} ____cacheline_aligned_in_smp;
@@ -76,6 +76,12 @@ struct io_hash_table {
unsigned hash_bits;
};
+struct io_mapped_region {
+ struct page **pages;
+ void *vmap_ptr;
+ size_t nr_pages;
+};
+
/*
* Arbitrary limit, can be raised if need be
*/
@@ -85,6 +91,7 @@ struct io_uring_task {
/* submission side */
int cached_refs;
const struct io_ring_ctx *last;
+ struct task_struct *task;
struct io_wq *io_wq;
struct file *registered_rings[IO_RINGFD_REG_MAX];
@@ -270,7 +277,6 @@ struct io_ring_ctx {
* Fixed resources fast path, should be accessed only under
* uring_lock, and updated through io_uring_register(2)
*/
- struct io_rsrc_node *rsrc_node;
atomic_t cancel_seq;
/*
@@ -283,15 +289,13 @@ struct io_ring_ctx {
struct io_wq_work_list iopoll_list;
struct io_file_table file_table;
- struct io_mapped_ubuf **user_bufs;
- unsigned nr_user_files;
- unsigned nr_user_bufs;
+ struct io_rsrc_data buf_table;
struct io_submit_state submit_state;
struct xarray io_bl_xa;
- struct io_hash_table cancel_table_locked;
+ struct io_hash_table cancel_table;
struct io_alloc_cache apoll_cache;
struct io_alloc_cache netmsg_cache;
struct io_alloc_cache rw_cache;
@@ -302,6 +306,11 @@ struct io_ring_ctx {
* ->uring_cmd() by io_uring_cmd_insert_cancelable()
*/
struct hlist_head cancelable_uring_cmd;
+ /*
+ * For Hybrid IOPOLL, runtime in hybrid polling, without
+ * scheduling time
+ */
+ u64 hybrid_poll_time;
} ____cacheline_aligned_in_smp;
struct {
@@ -316,6 +325,9 @@ struct io_ring_ctx {
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
unsigned cq_extra;
+
+ void *cq_wait_arg;
+ size_t cq_wait_size;
} ____cacheline_aligned_in_smp;
/*
@@ -342,7 +354,6 @@ struct io_ring_ctx {
struct list_head io_buffers_comp;
struct list_head cq_overflow_list;
- struct io_hash_table cancel_table;
struct hlist_head waitid_list;
@@ -366,16 +377,6 @@ struct io_ring_ctx {
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
- /* slow path rsrc auxilary data, used by update/register */
- struct io_rsrc_data *file_data;
- struct io_rsrc_data *buf_data;
-
- /* protected by ->uring_lock */
- struct list_head rsrc_ref_list;
- struct io_alloc_cache rsrc_node_cache;
- struct wait_queue_head rsrc_quiesce_wq;
- unsigned rsrc_quiesce;
-
u32 pers_next;
struct xarray personalities;
@@ -409,7 +410,7 @@ struct io_ring_ctx {
/* napi busy poll default timeout */
ktime_t napi_busy_poll_dt;
bool napi_prefer_busy_poll;
- bool napi_enabled;
+ u8 napi_track_mode;
DECLARE_HASHTABLE(napi_ht, 4);
#endif
@@ -418,6 +419,13 @@ struct io_ring_ctx {
unsigned evfd_last_cq_tail;
/*
+ * Protection for resize vs mmap races - both the mmap and resize
+ * side will need to grab this lock, to prevent either side from
+ * being run concurrently with the other.
+ */
+ struct mutex resize_lock;
+
+ /*
* If IORING_SETUP_NO_MMAP is used, then the below holds
* the gup'ed pages for the two rings, and the sqes.
*/
@@ -425,6 +433,9 @@ struct io_ring_ctx {
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
+
+ /* used for optimised request parameter and wait argument passing */
+ struct io_mapped_region param_region;
};
struct io_tw_state {
@@ -447,6 +458,7 @@ enum {
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
+ REQ_F_HYBRID_IOPOLL_STATE_BIT,
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_BUFFER_RING_BIT,
REQ_F_REISSUE_BIT,
@@ -459,7 +471,6 @@ enum {
REQ_F_DOUBLE_POLL_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT,
- REQ_F_HASH_LOCKED_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
@@ -468,6 +479,7 @@ enum {
REQ_F_BL_EMPTY_BIT,
REQ_F_BL_NO_RECYCLE_BIT,
REQ_F_BUFFERS_COMMIT_BIT,
+ REQ_F_BUF_NODE_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -506,6 +518,8 @@ enum {
REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
/* already went through poll handler */
REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT),
+ /* every req only blocks once in hybrid poll */
+ REQ_F_IOPOLL_STATE = IO_REQ_FLAG(REQ_F_HYBRID_IOPOLL_STATE_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
/* buffer selected from ring, needs commit */
@@ -534,8 +548,6 @@ enum {
REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
/* recvmsg special flag, clear EPOLLIN */
REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
- /* hashed into ->cancel_hash_locked, protected by ->uring_lock */
- REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
/* don't use lazy poll wake for this request */
REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
/* file is pollable */
@@ -546,6 +558,8 @@ enum {
REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
/* buffer ring head needs incrementing on put */
REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
+ /* buf node is valid */
+ REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -615,12 +629,9 @@ struct io_kiocb {
struct io_cqe cqe;
struct io_ring_ctx *ctx;
- struct task_struct *task;
+ struct io_uring_task *tctx;
union {
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
-
/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
struct io_buffer *kbuf;
@@ -629,6 +640,8 @@ struct io_kiocb {
* REQ_F_BUFFER_RING is set.
*/
struct io_buffer_list *buf_list;
+
+ struct io_rsrc_node *buf_node;
};
union {
@@ -638,13 +651,20 @@ struct io_kiocb {
__poll_t apoll_events;
};
- struct io_rsrc_node *rsrc_node;
+ struct io_rsrc_node *file_node;
atomic_t refs;
bool cancel_seq_set;
struct io_task_work io_task_work;
- /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
- struct hlist_node hash_node;
+ union {
+ /*
+ * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
+ * poll
+ */
+ struct hlist_node hash_node;
+ /* For IOPOLL setup queues, with hybrid polling */
+ u64 iopoll_start;
+ };
/* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
@@ -667,4 +687,9 @@ struct io_overflow_cqe {
struct io_uring_cqe cqe;
};
+static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
+{
+ return ctx->flags & IORING_SETUP_CQE32;
+}
+
#endif
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index f61407e..27048ec 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -178,6 +178,7 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
+#define IOMAP_ATOMIC (1 << 9)
struct iomap_ops {
/*
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 8aef9bb..50f7ea8 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1796,22 +1796,21 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal)
static inline u32 jbd2_chksum(journal_t *journal, u32 crc,
const void *address, unsigned int length)
{
- struct {
- struct shash_desc shash;
- char ctx[JBD_MAX_CHECKSUM_SIZE];
- } desc;
+ DEFINE_RAW_FLEX(struct shash_desc, desc, __ctx,
+ DIV_ROUND_UP(JBD_MAX_CHECKSUM_SIZE,
+ sizeof(*((struct shash_desc *)0)->__ctx)));
int err;
BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) >
JBD_MAX_CHECKSUM_SIZE);
- desc.shash.tfm = journal->j_chksum_driver;
- *(u32 *)desc.ctx = crc;
+ desc->tfm = journal->j_chksum_driver;
+ *(u32 *)desc->__ctx = crc;
- err = crypto_shash_update(&desc.shash, address, length);
+ err = crypto_shash_update(desc, address, length);
BUG_ON(err);
- return *(u32 *)desc.ctx;
+ return *(u32 *)desc->__ctx;
}
/* Return most recent uncommitted transaction */
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 9b4a6ff..c1a85d4 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -270,9 +270,7 @@ enum {
/* bits 24:31 of host->flags are reserved for LLD specific flags */
- /* various lengths of time */
- ATA_TMOUT_BOOT = 30000, /* heuristic */
- ATA_TMOUT_BOOT_QUICK = 7000, /* heuristic */
+ /* Various lengths of time */
ATA_TMOUT_INTERNAL_QUICK = 5000,
ATA_TMOUT_MAX_PARK = 30000,
diff --git a/include/linux/lsm/apparmor.h b/include/linux/lsm/apparmor.h
new file mode 100644
index 0000000..612cbfa
--- /dev/null
+++ b/include/linux/lsm/apparmor.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linux Security Module interface to other subsystems.
+ * AppArmor presents single pointer to an aa_label structure.
+ */
+#ifndef __LINUX_LSM_APPARMOR_H
+#define __LINUX_LSM_APPARMOR_H
+
+struct aa_label;
+
+struct lsm_prop_apparmor {
+#ifdef CONFIG_SECURITY_APPARMOR
+ struct aa_label *label;
+#endif
+};
+
+#endif /* ! __LINUX_LSM_APPARMOR_H */
diff --git a/include/linux/lsm/bpf.h b/include/linux/lsm/bpf.h
new file mode 100644
index 0000000..8106e20
--- /dev/null
+++ b/include/linux/lsm/bpf.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linux Security Module interface to other subsystems.
+ * BPF may present a single u32 value.
+ */
+#ifndef __LINUX_LSM_BPF_H
+#define __LINUX_LSM_BPF_H
+#include <linux/types.h>
+
+struct lsm_prop_bpf {
+#ifdef CONFIG_BPF_LSM
+ u32 secid;
+#endif
+};
+
+#endif /* ! __LINUX_LSM_BPF_H */
diff --git a/include/linux/lsm/selinux.h b/include/linux/lsm/selinux.h
new file mode 100644
index 0000000..9455a6b
--- /dev/null
+++ b/include/linux/lsm/selinux.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linux Security Module interface to other subsystems.
+ * SELinux presents a single u32 value which is known as a secid.
+ */
+#ifndef __LINUX_LSM_SELINUX_H
+#define __LINUX_LSM_SELINUX_H
+#include <linux/types.h>
+
+struct lsm_prop_selinux {
+#ifdef CONFIG_SECURITY_SELINUX
+ u32 secid;
+#endif
+};
+
+#endif /* ! __LINUX_LSM_SELINUX_H */
diff --git a/include/linux/lsm/smack.h b/include/linux/lsm/smack.h
new file mode 100644
index 0000000..ff730dd
--- /dev/null
+++ b/include/linux/lsm/smack.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Linux Security Module interface to other subsystems.
+ * Smack presents a pointer into the global Smack label list.
+ */
+#ifndef __LINUX_LSM_SMACK_H
+#define __LINUX_LSM_SMACK_H
+
+struct smack_known;
+
+struct lsm_prop_smack {
+#ifdef CONFIG_SECURITY_SMACK
+ struct smack_known *skp;
+#endif
+};
+
+#endif /* ! __LINUX_LSM_SMACK_H */
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 9eca013..eb29375 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -176,7 +176,8 @@ LSM_HOOK(int, -EOPNOTSUPP, inode_setsecurity, struct inode *inode,
const char *name, const void *value, size_t size, int flags)
LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer,
size_t buffer_size)
-LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, inode_getlsmprop, struct inode *inode,
+ struct lsm_prop *prop)
LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new)
LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, struct dentry *src,
const char *name)
@@ -217,6 +218,8 @@ LSM_HOOK(int, 0, cred_prepare, struct cred *new, const struct cred *old,
LSM_HOOK(void, LSM_RET_VOID, cred_transfer, struct cred *new,
const struct cred *old)
LSM_HOOK(void, LSM_RET_VOID, cred_getsecid, const struct cred *c, u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, cred_getlsmprop, const struct cred *c,
+ struct lsm_prop *prop)
LSM_HOOK(int, 0, kernel_act_as, struct cred *new, u32 secid)
LSM_HOOK(int, 0, kernel_create_files_as, struct cred *new, struct inode *inode)
LSM_HOOK(int, 0, kernel_module_request, char *kmod_name)
@@ -235,9 +238,9 @@ LSM_HOOK(int, 0, task_fix_setgroups, struct cred *new, const struct cred * old)
LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid)
LSM_HOOK(int, 0, task_getpgid, struct task_struct *p)
LSM_HOOK(int, 0, task_getsid, struct task_struct *p)
-LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid)
-LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj,
- struct task_struct *p, u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, current_getlsmprop_subj, struct lsm_prop *prop)
+LSM_HOOK(void, LSM_RET_VOID, task_getlsmprop_obj,
+ struct task_struct *p, struct lsm_prop *prop)
LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice)
LSM_HOOK(int, 0, task_setioprio, struct task_struct *p, int ioprio)
LSM_HOOK(int, 0, task_getioprio, struct task_struct *p)
@@ -256,8 +259,8 @@ LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p,
struct inode *inode)
LSM_HOOK(int, 0, userns_create, const struct cred *cred)
LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag)
-LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp,
- u32 *secid)
+LSM_HOOK(void, LSM_RET_VOID, ipc_getlsmprop, struct kern_ipc_perm *ipcp,
+ struct lsm_prop *prop)
LSM_HOOK(int, 0, msg_msg_alloc_security, struct msg_msg *msg)
LSM_HOOK(void, LSM_RET_VOID, msg_msg_free_security, struct msg_msg *msg)
LSM_HOOK(int, 0, msg_queue_alloc_security, struct kern_ipc_perm *perm)
@@ -294,6 +297,8 @@ LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size)
LSM_HOOK(int, 0, ismaclabel, const char *name)
LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata,
u32 *seclen)
+LSM_HOOK(int, -EOPNOTSUPP, lsmprop_to_secctx, struct lsm_prop *prop,
+ char **secdata, u32 *seclen)
LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid)
LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen)
LSM_HOOK(void, LSM_RET_VOID, inode_invalidate_secctx, struct inode *inode)
@@ -416,7 +421,8 @@ LSM_HOOK(void, LSM_RET_VOID, key_post_create_or_update, struct key *keyring,
LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr,
void **lsmrule, gfp_t gfp)
LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule)
-LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule)
+LSM_HOOK(int, 0, audit_rule_match, struct lsm_prop *prop, u32 field, u32 op,
+ void *lsmrule)
LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule)
#endif /* CONFIG_AUDIT */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 34d2da0..e1b4155 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1760,8 +1760,9 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg)
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);
-static inline void count_objcg_event(struct obj_cgroup *objcg,
- enum vm_event_item idx)
+static inline void count_objcg_events(struct obj_cgroup *objcg,
+ enum vm_event_item idx,
+ unsigned long count)
{
struct mem_cgroup *memcg;
@@ -1770,7 +1771,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg,
rcu_read_lock();
memcg = obj_cgroup_memcg(objcg);
- count_memcg_events(memcg, idx, 1);
+ count_memcg_events(memcg, idx, count);
rcu_read_unlock();
}
@@ -1825,8 +1826,9 @@ static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
return NULL;
}
-static inline void count_objcg_event(struct obj_cgroup *objcg,
- enum vm_event_item idx)
+static inline void count_objcg_events(struct obj_cgroup *objcg,
+ enum vm_event_item idx,
+ unsigned long count)
{
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 61fff5d..feb5c80 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -329,12 +329,14 @@ extern unsigned int kobjsize(const void *objp);
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
+#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -365,7 +367,17 @@ extern unsigned int kobjsize(const void *objp);
* for more details on the guard size.
*/
# define VM_SHADOW_STACK VM_HIGH_ARCH_5
-#else
+#endif
+
+#if defined(CONFIG_ARM64_GCS)
+/*
+ * arm64's Guarded Control Stack implements similar functionality and
+ * has similar constraints to shadow stacks.
+ */
+# define VM_SHADOW_STACK VM_HIGH_ARCH_6
+#endif
+
+#ifndef VM_SHADOW_STACK
# define VM_SHADOW_STACK VM_NONE
#endif
@@ -4220,4 +4232,8 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old)
}
#endif /* CONFIG_MEM_ALLOC_PROFILING */
+int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status);
+int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
+int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);
+
#endif /* _LINUX_MM_H */
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index b332c20..a48a308 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -239,7 +239,7 @@ int netlink_register_notifier(struct notifier_block *nb);
int netlink_unregister_notifier(struct notifier_block *nb);
/* finegrained unicast helpers: */
-struct sock *netlink_getsockbyfilp(struct file *filp);
+struct sock *netlink_getsockbyfd(int fd);
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
long *timeo, struct sock *ssk);
void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b58d940..0a6e220 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -327,7 +327,8 @@ struct nvme_id_ctrl {
__le32 sanicap;
__le32 hmminds;
__le16 hmmaxd;
- __u8 rsvd338[4];
+ __le16 nvmsetidmax;
+ __le16 endgidmax;
__u8 anatt;
__u8 anacap;
__le32 anagrpmax;
@@ -522,6 +523,7 @@ enum {
NVME_ID_CNS_NS_DESC_LIST = 0x03,
NVME_ID_CNS_CS_NS = 0x05,
NVME_ID_CNS_CS_CTRL = 0x06,
+ NVME_ID_CNS_NS_ACTIVE_LIST_CS = 0x07,
NVME_ID_CNS_NS_CS_INDEP = 0x08,
NVME_ID_CNS_NS_PRESENT_LIST = 0x10,
NVME_ID_CNS_NS_PRESENT = 0x11,
@@ -530,6 +532,7 @@ enum {
NVME_ID_CNS_SCNDRY_CTRL_LIST = 0x15,
NVME_ID_CNS_NS_GRANULARITY = 0x16,
NVME_ID_CNS_UUID_LIST = 0x17,
+ NVME_ID_CNS_ENDGRP_LIST = 0x19,
};
enum {
@@ -560,6 +563,8 @@ enum {
NVME_NS_FLBAS_LBA_SHIFT = 1,
NVME_NS_FLBAS_META_EXT = 0x10,
NVME_NS_NMIC_SHARED = 1 << 0,
+ NVME_NS_ROTATIONAL = 1 << 4,
+ NVME_NS_VWC_NOT_PRESENT = 1 << 5,
NVME_LBAF_RP_BEST = 0,
NVME_LBAF_RP_BETTER = 1,
NVME_LBAF_RP_GOOD = 2,
@@ -617,6 +622,40 @@ enum {
NVME_NIDT_CSI = 0x04,
};
+struct nvme_endurance_group_log {
+ __u8 egcw;
+ __u8 egfeat;
+ __u8 rsvd2;
+ __u8 avsp;
+ __u8 avspt;
+ __u8 pused;
+ __le16 did;
+ __u8 rsvd8[24];
+ __u8 ee[16];
+ __u8 dur[16];
+ __u8 duw[16];
+ __u8 muw[16];
+ __u8 hrc[16];
+ __u8 hwc[16];
+ __u8 mdie[16];
+ __u8 neile[16];
+ __u8 tegcap[16];
+ __u8 uegcap[16];
+ __u8 rsvd192[320];
+};
+
+struct nvme_rotational_media_log {
+ __le16 endgid;
+ __le16 numa;
+ __le16 nrs;
+ __u8 rsvd6[2];
+ __le32 spinc;
+ __le32 fspinc;
+ __le32 ldc;
+ __le32 fldc;
+ __u8 rsvd24[488];
+};
+
struct nvme_smart_log {
__u8 critical_warning;
__u8 temperature[2];
@@ -1244,6 +1283,7 @@ enum {
NVME_FEAT_WRITE_PROTECT = 0x84,
NVME_FEAT_VENDOR_START = 0xC0,
NVME_FEAT_VENDOR_END = 0xFF,
+ NVME_LOG_SUPPORTED = 0x00,
NVME_LOG_ERROR = 0x01,
NVME_LOG_SMART = 0x02,
NVME_LOG_FW_SLOT = 0x03,
@@ -1254,6 +1294,8 @@ enum {
NVME_LOG_TELEMETRY_CTRL = 0x08,
NVME_LOG_ENDURANCE_GROUP = 0x09,
NVME_LOG_ANA = 0x0c,
+ NVME_LOG_FEATURES = 0x12,
+ NVME_LOG_RMI = 0x16,
NVME_LOG_DISC = 0x70,
NVME_LOG_RESERVATION = 0x80,
NVME_FWACT_REPL = (0 << 3),
@@ -1261,6 +1303,24 @@ enum {
NVME_FWACT_ACTV = (2 << 3),
};
+struct nvme_supported_log {
+ __le32 lids[256];
+};
+
+enum {
+ NVME_LIDS_LSUPP = 1 << 0,
+};
+
+struct nvme_supported_features_log {
+ __le32 fis[256];
+};
+
+enum {
+ NVME_FIS_FSUPP = 1 << 0,
+ NVME_FIS_NSCPE = 1 << 20,
+ NVME_FIS_CSCPE = 1 << 21,
+};
+
/* NVMe Namespace Write Protect State */
enum {
NVME_NS_NO_WRITE_PROTECT = 0,
@@ -1281,7 +1341,8 @@ struct nvme_identify {
__u8 cns;
__u8 rsvd3;
__le16 ctrlid;
- __u8 rsvd11[3];
+ __le16 cnssid;
+ __u8 rsvd11;
__u8 csi;
__u32 rsvd12[4];
};
@@ -1389,7 +1450,7 @@ struct nvme_get_log_page_command {
__u8 lsp; /* upper 4 bits reserved */
__le16 numdl;
__le16 numdu;
- __u16 rsvd11;
+ __le16 lsi;
union {
struct {
__le32 lpol;
@@ -2037,4 +2098,72 @@ struct nvme_completion {
#define NVME_MINOR(ver) (((ver) >> 8) & 0xff)
#define NVME_TERTIARY(ver) ((ver) & 0xff)
+enum {
+ NVME_AEN_RESV_LOG_PAGE_AVALIABLE = 0x00,
+};
+
+enum {
+ NVME_PR_LOG_EMPTY_LOG_PAGE = 0x00,
+ NVME_PR_LOG_REGISTRATION_PREEMPTED = 0x01,
+ NVME_PR_LOG_RESERVATION_RELEASED = 0x02,
+ NVME_PR_LOG_RESERVATOIN_PREEMPTED = 0x03,
+};
+
+enum {
+ NVME_PR_NOTIFY_BIT_REG_PREEMPTED = 1,
+ NVME_PR_NOTIFY_BIT_RESV_RELEASED = 2,
+ NVME_PR_NOTIFY_BIT_RESV_PREEMPTED = 3,
+};
+
+struct nvme_pr_log {
+ __le64 count;
+ __u8 type;
+ __u8 nr_pages;
+ __u8 rsvd1[2];
+ __le32 nsid;
+ __u8 rsvd2[48];
+};
+
+struct nvmet_pr_register_data {
+ __le64 crkey;
+ __le64 nrkey;
+};
+
+struct nvmet_pr_acquire_data {
+ __le64 crkey;
+ __le64 prkey;
+};
+
+struct nvmet_pr_release_data {
+ __le64 crkey;
+};
+
+enum nvme_pr_capabilities {
+ NVME_PR_SUPPORT_PTPL = 1,
+ NVME_PR_SUPPORT_WRITE_EXCLUSIVE = 1 << 1,
+ NVME_PR_SUPPORT_EXCLUSIVE_ACCESS = 1 << 2,
+ NVME_PR_SUPPORT_WRITE_EXCLUSIVE_REG_ONLY = 1 << 3,
+ NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_REG_ONLY = 1 << 4,
+ NVME_PR_SUPPORT_WRITE_EXCLUSIVE_ALL_REGS = 1 << 5,
+ NVME_PR_SUPPORT_EXCLUSIVE_ACCESS_ALL_REGS = 1 << 6,
+ NVME_PR_SUPPORT_IEKEY_VER_1_3_DEF = 1 << 7,
+};
+
+enum nvme_pr_register_action {
+ NVME_PR_REGISTER_ACT_REG = 0,
+ NVME_PR_REGISTER_ACT_UNREG = 1,
+ NVME_PR_REGISTER_ACT_REPLACE = 1 << 1,
+};
+
+enum nvme_pr_acquire_action {
+ NVME_PR_ACQUIRE_ACT_ACQUIRE = 0,
+ NVME_PR_ACQUIRE_ACT_PREEMPT = 1,
+ NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT = 1 << 1,
+};
+
+enum nvme_pr_release_action {
+ NVME_PR_RELEASE_ACT_RELEASE = 0,
+ NVME_PR_RELEASE_ACT_CLEAR = 1,
+};
+
#endif /* _LINUX_NVME_H */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index cc839e4..908ee0a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -543,7 +543,7 @@ FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE)
* - PG_private and PG_private_2 cause release_folio() and co to be invoked
*/
PAGEFLAG(Private, private, PF_ANY)
-PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY)
+FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE)
/* owner_2 can be set on tail pages for anon memory */
FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE)
@@ -554,7 +554,7 @@ FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE)
*/
TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
-PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
+FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE)
/* PG_readahead is only used for reads; PG_reclaim is only for writes */
PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h
index 3372c1b..d698efb 100644
--- a/include/linux/perf/arm_pmuv3.h
+++ b/include/linux/perf/arm_pmuv3.h
@@ -257,6 +257,7 @@
#define ARMV8_PMU_USERENR_SW (1 << 1) /* PMSWINC can be written at EL0 */
#define ARMV8_PMU_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */
#define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */
+#define ARMV8_PMU_USERENR_UEN (1 << 4) /* Fine grained per counter access at EL0 */
/* Mask for writable bits */
#define ARMV8_PMU_USERENR_MASK (ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \
ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER)
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index b637ec1..cf4b11b 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -92,6 +92,10 @@ struct dev_pm_domain_list {
* GENPD_FLAG_OPP_TABLE_FW: The genpd provider supports performance states,
* but its corresponding OPP tables are not
* described in DT, but are given directly by FW.
+ *
+ * GENPD_FLAG_DEV_NAME_FW: Instructs genpd to generate an unique device name
+ * using ida. It is used by genpd providers which
+ * get their genpd-names directly from FW.
*/
#define GENPD_FLAG_PM_CLK (1U << 0)
#define GENPD_FLAG_IRQ_SAFE (1U << 1)
@@ -101,6 +105,7 @@ struct dev_pm_domain_list {
#define GENPD_FLAG_RPM_ALWAYS_ON (1U << 5)
#define GENPD_FLAG_MIN_RESIDENCY (1U << 6)
#define GENPD_FLAG_OPP_TABLE_FW (1U << 7)
+#define GENPD_FLAG_DEV_NAME_FW (1U << 8)
enum gpd_status {
GENPD_STATE_ON = 0, /* PM domain is on */
@@ -163,6 +168,7 @@ struct generic_pm_domain {
atomic_t sd_count; /* Number of subdomains with power "on" */
enum gpd_status status; /* Current state of the domain */
unsigned int device_count; /* Number of devices */
+ unsigned int device_id; /* unique device id */
unsigned int suspended_count; /* System suspend device counter */
unsigned int prepared_count; /* Suspend counter of prepared devices */
unsigned int performance_state; /* Aggregated max performance state */
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 0e65b3d..e2d47eb 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -28,9 +28,9 @@ struct posix_acl_entry {
struct posix_acl {
refcount_t a_refcount;
- struct rcu_head a_rcu;
unsigned int a_count;
- struct posix_acl_entry a_entries[];
+ struct rcu_head a_rcu;
+ struct posix_acl_entry a_entries[] __counted_by(a_count);
};
#define FOREACH_ACL_ENTRY(pa, acl, pe) \
@@ -62,7 +62,7 @@ posix_acl_release(struct posix_acl *acl)
/* posix_acl.c */
extern void posix_acl_init(struct posix_acl *, int);
-extern struct posix_acl *posix_acl_alloc(int, gfp_t);
+extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags);
extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
diff --git a/include/linux/prandom.h b/include/linux/prandom.h
index f7f1e52..f2ed5b7 100644
--- a/include/linux/prandom.h
+++ b/include/linux/prandom.h
@@ -10,6 +10,7 @@
#include <linux/types.h>
#include <linux/once.h>
+#include <linux/percpu.h>
#include <linux/random.h>
struct rnd_state {
diff --git a/include/linux/random.h b/include/linux/random.h
index b0a940a..333cecf 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -145,13 +145,6 @@ declare_get_random_var_wait(u64, u32)
declare_get_random_var_wait(long, unsigned long)
#undef declare_get_random_var
-/*
- * This is designed to be standalone for just prandom
- * users, but for now we include it from <linux/random.h>
- * for legacy reasons.
- */
-#include <linux/prandom.h>
-
#ifdef CONFIG_SMP
int random_prepare_cpu(unsigned int cpu);
int random_online_cpu(unsigned int cpu);
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index bf10bdb..6c2fef8 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/magic.h>
#include <linux/refcount.h>
+#include <linux/kasan.h>
#ifdef CONFIG_THREAD_INFO_IN_TASK
@@ -89,6 +90,7 @@ static inline int object_is_on_stack(const void *obj)
{
void *stack = task_stack_page(current);
+ obj = kasan_reset_tag(obj);
return (obj >= stack) && (obj < (stack + THREAD_SIZE));
}
diff --git a/include/linux/security.h b/include/linux/security.h
index 2ec8f30..cbdba43 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -34,6 +34,10 @@
#include <linux/sockptr.h>
#include <linux/bpf.h>
#include <uapi/linux/lsm.h>
+#include <linux/lsm/selinux.h>
+#include <linux/lsm/smack.h>
+#include <linux/lsm/apparmor.h>
+#include <linux/lsm/bpf.h>
struct linux_binprm;
struct cred;
@@ -152,6 +156,16 @@ enum lockdown_reason {
LOCKDOWN_CONFIDENTIALITY_MAX,
};
+/*
+ * Data exported by the security modules
+ */
+struct lsm_prop {
+ struct lsm_prop_selinux selinux;
+ struct lsm_prop_smack smack;
+ struct lsm_prop_apparmor apparmor;
+ struct lsm_prop_bpf bpf;
+};
+
extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1];
extern u32 lsm_active_cnt;
extern const struct lsm_id *lsm_idlist[];
@@ -269,8 +283,32 @@ static inline const char *kernel_load_data_id_str(enum kernel_load_data_id id)
return kernel_load_data_str[id];
}
+/**
+ * lsmprop_init - initialize a lsm_prop structure
+ * @prop: Pointer to the data to initialize
+ *
+ * Set all secid for all modules to the specified value.
+ */
+static inline void lsmprop_init(struct lsm_prop *prop)
+{
+ memset(prop, 0, sizeof(*prop));
+}
+
#ifdef CONFIG_SECURITY
+/**
+ * lsmprop_is_set - report if there is a value in the lsm_prop
+ * @prop: Pointer to the exported LSM data
+ *
+ * Returns true if there is a value set, false otherwise
+ */
+static inline bool lsmprop_is_set(struct lsm_prop *prop)
+{
+ const struct lsm_prop empty = {};
+
+ return !!memcmp(prop, &empty, sizeof(*prop));
+}
+
int call_blocking_lsm_notifier(enum lsm_event event, void *data);
int register_blocking_lsm_notifier(struct notifier_block *nb);
int unregister_blocking_lsm_notifier(struct notifier_block *nb);
@@ -408,7 +446,7 @@ int security_inode_getsecurity(struct mnt_idmap *idmap,
void **buffer, bool alloc);
int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags);
int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size);
-void security_inode_getsecid(struct inode *inode, u32 *secid);
+void security_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop);
int security_inode_copy_up(struct dentry *src, struct cred **new);
int security_inode_copy_up_xattr(struct dentry *src, const char *name);
int security_inode_setintegrity(const struct inode *inode,
@@ -444,6 +482,7 @@ void security_cred_free(struct cred *cred);
int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp);
void security_transfer_creds(struct cred *new, const struct cred *old);
void security_cred_getsecid(const struct cred *c, u32 *secid);
+void security_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop);
int security_kernel_act_as(struct cred *new, u32 secid);
int security_kernel_create_files_as(struct cred *new, struct inode *inode);
int security_kernel_module_request(char *kmod_name);
@@ -463,8 +502,8 @@ int security_task_fix_setgroups(struct cred *new, const struct cred *old);
int security_task_setpgid(struct task_struct *p, pid_t pgid);
int security_task_getpgid(struct task_struct *p);
int security_task_getsid(struct task_struct *p);
-void security_current_getsecid_subj(u32 *secid);
-void security_task_getsecid_obj(struct task_struct *p, u32 *secid);
+void security_current_getlsmprop_subj(struct lsm_prop *prop);
+void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop);
int security_task_setnice(struct task_struct *p, int nice);
int security_task_setioprio(struct task_struct *p, int ioprio);
int security_task_getioprio(struct task_struct *p);
@@ -482,7 +521,7 @@ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
void security_task_to_inode(struct task_struct *p, struct inode *inode);
int security_create_user_ns(const struct cred *cred);
int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag);
-void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid);
+void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop);
int security_msg_msg_alloc(struct msg_msg *msg);
void security_msg_msg_free(struct msg_msg *msg);
int security_msg_queue_alloc(struct kern_ipc_perm *msq);
@@ -515,6 +554,7 @@ int security_setprocattr(int lsmid, const char *name, void *value, size_t size);
int security_netlink_send(struct sock *sk, struct sk_buff *skb);
int security_ismaclabel(const char *name);
int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
+int security_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata, u32 *seclen);
int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid);
void security_release_secctx(char *secdata, u32 seclen);
void security_inode_invalidate_secctx(struct inode *inode);
@@ -531,6 +571,17 @@ int security_bdev_setintegrity(struct block_device *bdev,
size_t size);
#else /* CONFIG_SECURITY */
+/**
+ * lsmprop_is_set - report if there is a value in the lsm_prop
+ * @prop: Pointer to the exported LSM data
+ *
+ * Returns true if there is a value set, false otherwise
+ */
+static inline bool lsmprop_is_set(struct lsm_prop *prop)
+{
+ return false;
+}
+
static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data)
{
return 0;
@@ -1020,9 +1071,10 @@ static inline int security_inode_listsecurity(struct inode *inode, char *buffer,
return 0;
}
-static inline void security_inode_getsecid(struct inode *inode, u32 *secid)
+static inline void security_inode_getlsmprop(struct inode *inode,
+ struct lsm_prop *prop)
{
- *secid = 0;
+ lsmprop_init(prop);
}
static inline int security_inode_copy_up(struct dentry *src, struct cred **new)
@@ -1172,6 +1224,10 @@ static inline void security_cred_getsecid(const struct cred *c, u32 *secid)
*secid = 0;
}
+static inline void security_cred_getlsmprop(const struct cred *c,
+ struct lsm_prop *prop)
+{ }
+
static inline int security_kernel_act_as(struct cred *cred, u32 secid)
{
return 0;
@@ -1249,14 +1305,15 @@ static inline int security_task_getsid(struct task_struct *p)
return 0;
}
-static inline void security_current_getsecid_subj(u32 *secid)
+static inline void security_current_getlsmprop_subj(struct lsm_prop *prop)
{
- *secid = 0;
+ lsmprop_init(prop);
}
-static inline void security_task_getsecid_obj(struct task_struct *p, u32 *secid)
+static inline void security_task_getlsmprop_obj(struct task_struct *p,
+ struct lsm_prop *prop)
{
- *secid = 0;
+ lsmprop_init(prop);
}
static inline int security_task_setnice(struct task_struct *p, int nice)
@@ -1332,9 +1389,10 @@ static inline int security_ipc_permission(struct kern_ipc_perm *ipcp,
return 0;
}
-static inline void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
+static inline void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp,
+ struct lsm_prop *prop)
{
- *secid = 0;
+ lsmprop_init(prop);
}
static inline int security_msg_msg_alloc(struct msg_msg *msg)
@@ -1468,7 +1526,14 @@ static inline int security_ismaclabel(const char *name)
return 0;
}
-static inline int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
+static inline int security_secid_to_secctx(u32 secid, char **secdata,
+ u32 *seclen)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int security_lsmprop_to_secctx(struct lsm_prop *prop,
+ char **secdata, u32 *seclen)
{
return -EOPNOTSUPP;
}
@@ -2095,7 +2160,8 @@ static inline void security_key_post_create_or_update(struct key *keyring,
int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule,
gfp_t gfp);
int security_audit_rule_known(struct audit_krule *krule);
-int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule);
+int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
+ void *lsmrule);
void security_audit_rule_free(void *lsmrule);
#else
@@ -2111,8 +2177,8 @@ static inline int security_audit_rule_known(struct audit_krule *krule)
return 0;
}
-static inline int security_audit_rule_match(u32 secid, u32 field, u32 op,
- void *lsmrule)
+static inline int security_audit_rule_match(struct lsm_prop *prop, u32 field,
+ u32 op, void *lsmrule)
{
return 0;
}
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 2ac5082..80f33a93f 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -52,6 +52,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
case IOC_OPAL_GET_GEOMETRY:
case IOC_OPAL_DISCOVERY:
case IOC_OPAL_REVERT_LSP:
+ case IOC_OPAL_SET_SID_PW:
return true;
}
return false;
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 515a9a6..018da28 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -42,10 +42,10 @@ struct shmem_inode_info {
struct inode vfs_inode;
};
-#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE
+#define SHMEM_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | FS_CASEFOLD_FL)
#define SHMEM_FL_USER_MODIFIABLE \
- (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL)
-#define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL)
+ (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL)
+#define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL)
struct shmem_quota_limits {
qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index b35e2db..0268ea7 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -448,6 +448,7 @@ void kfree_sensitive(const void *objp);
size_t __ksize(const void *objp);
DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))
+DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T))
/**
* ksize - Report actual allocation size of associated object
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index fc5a206..195debe 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -77,7 +77,9 @@ static inline int copy_safe_from_sockptr(void *dst, size_t ksize,
{
if (optlen < ksize)
return -EINVAL;
- return copy_from_sockptr(dst, optval, ksize);
+ if (copy_from_sockptr(dst, optval, ksize))
+ return -EFAULT;
+ return 0;
}
static inline int copy_struct_from_sockptr(void *dst, size_t ksize,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 5758104..c633320 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -77,6 +77,7 @@ struct cachestat_range;
struct cachestat;
struct statmount;
struct mnt_id_req;
+struct xattr_args;
#include <linux/types.h>
#include <linux/aio_abi.h>
@@ -338,23 +339,35 @@ asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
void __user *arg, unsigned int nr_args);
asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
const void __user *value, size_t size, int flags);
+asmlinkage long sys_setxattrat(int dfd, const char __user *path, unsigned int at_flags,
+ const char __user *name,
+ const struct xattr_args __user *args, size_t size);
asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name,
const void __user *value, size_t size, int flags);
asmlinkage long sys_fsetxattr(int fd, const char __user *name,
const void __user *value, size_t size, int flags);
asmlinkage long sys_getxattr(const char __user *path, const char __user *name,
void __user *value, size_t size);
+asmlinkage long sys_getxattrat(int dfd, const char __user *path, unsigned int at_flags,
+ const char __user *name,
+ struct xattr_args __user *args, size_t size);
asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name,
void __user *value, size_t size);
asmlinkage long sys_fgetxattr(int fd, const char __user *name,
void __user *value, size_t size);
asmlinkage long sys_listxattr(const char __user *path, char __user *list,
size_t size);
+asmlinkage long sys_listxattrat(int dfd, const char __user *path,
+ unsigned int at_flags,
+ char __user *list, size_t size);
asmlinkage long sys_llistxattr(const char __user *path, char __user *list,
size_t size);
asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size);
asmlinkage long sys_removexattr(const char __user *path,
const char __user *name);
+asmlinkage long sys_removexattrat(int dfd, const char __user *path,
+ unsigned int at_flags,
+ const char __user *name);
asmlinkage long sys_lremovexattr(const char __user *path,
const char __user *name);
asmlinkage long sys_fremovexattr(int fd, const char __user *name);
diff --git a/include/linux/sysfb.h b/include/linux/sysfb.h
index bef5f06..07cbab5 100644
--- a/include/linux/sysfb.h
+++ b/include/linux/sysfb.h
@@ -60,12 +60,19 @@ struct efifb_dmi_info {
void sysfb_disable(struct device *dev);
+bool sysfb_handles_screen_info(void);
+
#else /* CONFIG_SYSFB */
static inline void sysfb_disable(struct device *dev)
{
}
+static inline bool sysfb_handles_screen_info(void)
+{
+ return false;
+}
+
#endif /* CONFIG_SYSFB */
#ifdef CONFIG_EFI
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 25ea8fe..75480247 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -56,6 +56,9 @@ enum thermal_notify_event {
THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */
THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */
THERMAL_TZ_RESUME, /* Thermal zone is resuming after system sleep */
+ THERMAL_TZ_ADD_THRESHOLD, /* Threshold added */
+ THERMAL_TZ_DEL_THRESHOLD, /* Threshold deleted */
+ THERMAL_TZ_FLUSH_THRESHOLDS, /* All thresholds deleted */
};
/**
@@ -137,6 +140,9 @@ struct thermal_cooling_device {
#endif
};
+DEFINE_GUARD(cooling_dev, struct thermal_cooling_device *, mutex_lock(&_T->lock),
+ mutex_unlock(&_T->lock))
+
/* Structure to define Thermal Zone parameters */
struct thermal_zone_params {
const char *governor_name;
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index fc12a9b..84a035e 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -45,6 +45,11 @@ extern void ktime_get_real_ts64(struct timespec64 *tv);
extern void ktime_get_coarse_ts64(struct timespec64 *ts);
extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);
+/* Multigrain timestamp interfaces */
+extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
+extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
+extern unsigned long timekeeping_get_mg_floor_swaps(void);
+
void getboottime64(struct timespec64 *ts);
/*
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 587b96b..20a40ad 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -421,6 +421,7 @@ void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value);
u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset);
u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset);
u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset);
+void tpm_buf_append_handle(struct tpm_chip *chip, struct tpm_buf *buf, u32 handle);
/*
* Check if TPM device is in the firmware upgrade mode.
@@ -505,6 +506,8 @@ void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
u8 attributes, u8 *passphrase,
int passphraselen);
+void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf,
+ u8 attributes, u8 *passphrase, int passphraselen);
static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip,
struct tpm_buf *buf,
u8 attributes,
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 43844510..e9c702c 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -403,6 +403,103 @@ copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
return 0;
}
+/**
+ * copy_struct_to_user: copy a struct to userspace
+ * @dst: Destination address, in userspace. This buffer must be @ksize
+ * bytes long.
+ * @usize: (Alleged) size of @dst struct.
+ * @src: Source address, in kernel space.
+ * @ksize: Size of @src struct.
+ * @ignored_trailing: Set to %true if there was a non-zero byte in @src that
+ * userspace cannot see because they are using an smaller struct.
+ *
+ * Copies a struct from kernel space to userspace, in a way that guarantees
+ * backwards-compatibility for struct syscall arguments (as long as future
+ * struct extensions are made such that all new fields are *appended* to the
+ * old struct, and zeroed-out new fields have the same meaning as the old
+ * struct).
+ *
+ * Some syscalls may wish to make sure that userspace knows about everything in
+ * the struct, and if there is a non-zero value that userspce doesn't know
+ * about, they want to return an error (such as -EMSGSIZE) or have some other
+ * fallback (such as adding a "you're missing some information" flag). If
+ * @ignored_trailing is non-%NULL, it will be set to %true if there was a
+ * non-zero byte that could not be copied to userspace (ie. was past @usize).
+ *
+ * While unconditionally returning an error in this case is the simplest
+ * solution, for maximum backward compatibility you should try to only return
+ * -EMSGSIZE if the user explicitly requested the data that couldn't be copied.
+ * Note that structure sizes can change due to header changes and simple
+ * recompilations without code changes(!), so if you care about
+ * @ignored_trailing you probably want to make sure that any new field data is
+ * associated with a flag. Otherwise you might assume that a program knows
+ * about data it does not.
+ *
+ * @ksize is just sizeof(*src), and @usize should've been passed by userspace.
+ * The recommended usage is something like the following:
+ *
+ * SYSCALL_DEFINE2(foobar, struct foo __user *, uarg, size_t, usize)
+ * {
+ * int err;
+ * bool ignored_trailing;
+ * struct foo karg = {};
+ *
+ * if (usize > PAGE_SIZE)
+ * return -E2BIG;
+ * if (usize < FOO_SIZE_VER0)
+ * return -EINVAL;
+ *
+ * // ... modify karg somehow ...
+ *
+ * err = copy_struct_to_user(uarg, usize, &karg, sizeof(karg),
+ * &ignored_trailing);
+ * if (err)
+ * return err;
+ * if (ignored_trailing)
+ * return -EMSGSIZE:
+ *
+ * // ...
+ * }
+ *
+ * There are three cases to consider:
+ * * If @usize == @ksize, then it's copied verbatim.
+ * * If @usize < @ksize, then the kernel is trying to pass userspace a newer
+ * struct than it supports. Thus we only copy the interoperable portions
+ * (@usize) and ignore the rest (but @ignored_trailing is set to %true if
+ * any of the trailing (@ksize - @usize) bytes are non-zero).
+ * * If @usize > @ksize, then the kernel is trying to pass userspace an older
+ * struct than userspace supports. In order to make sure the
+ * unknown-to-the-kernel fields don't contain garbage values, we zero the
+ * trailing (@usize - @ksize) bytes.
+ *
+ * Returns (in all cases, some data may have been copied):
+ * * -EFAULT: access to userspace failed.
+ */
+static __always_inline __must_check int
+copy_struct_to_user(void __user *dst, size_t usize, const void *src,
+ size_t ksize, bool *ignored_trailing)
+{
+ size_t size = min(ksize, usize);
+ size_t rest = max(ksize, usize) - size;
+
+ /* Double check if ksize is larger than a known object size. */
+ if (WARN_ON_ONCE(ksize > __builtin_object_size(src, 1)))
+ return -E2BIG;
+
+ /* Deal with trailing bytes. */
+ if (usize > ksize) {
+ if (clear_user(dst + size, rest))
+ return -EFAULT;
+ }
+ if (ignored_trailing)
+ *ignored_trailing = ksize < usize &&
+ memchr_inv(src + size, 0, rest) != NULL;
+ /* Copy the interoperable parts of the struct. */
+ if (copy_to_user(dst, src, size))
+ return -EFAULT;
+ return 0;
+}
+
bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size);
long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index 4d39e6e..5e6b212 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -16,6 +16,8 @@ struct utf8data_table;
((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
((unsigned int)(REV)))
+#define UTF8_LATEST UNICODE_AGE(12, 1, 0)
+
static inline u8 unicode_major(unsigned int age)
{
return (age >> UNICODE_MAJ_SHIFT) & 0xff;
@@ -76,4 +78,6 @@ int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
struct unicode_map *utf8_load(unsigned int version);
void utf8_unload(struct unicode_map *um);
+int utf8_parse_version(char *version);
+
#endif /* _LINUX_UNICODE_H */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index aed952d..f70d095 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -134,6 +134,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#ifdef CONFIG_SWAP
SWAP_RA,
SWAP_RA_HIT,
+ SWPIN_ZERO,
+ SWPOUT_ZERO,
#ifdef CONFIG_KSM
KSM_SWPIN_COPY,
#endif
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 8aa3372..2b322a9b 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -221,6 +221,7 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head);
#define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0)
+#define wake_up_sync(x) __wake_up_sync(x, TASK_NORMAL)
#define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d6db822e..d11b903 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -213,11 +213,8 @@ static inline void wait_on_inode(struct inode *inode)
#include <linux/bio.h>
void __inode_attach_wb(struct inode *inode, struct folio *folio);
-void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
- struct inode *inode)
- __releases(&inode->i_lock);
void wbc_detach_inode(struct writeback_control *wbc);
-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
size_t bytes);
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
enum wb_reason reason, struct wb_completion *done);
@@ -254,22 +251,8 @@ static inline void inode_detach_wb(struct inode *inode)
}
}
-/**
- * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
- * @wbc: writeback_control of interest
- * @inode: target inode
- *
- * This function is to be used by __filemap_fdatawrite_range(), which is an
- * alternative entry point into writeback code, and first ensures @inode is
- * associated with a bdi_writeback and attaches it to @wbc.
- */
-static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
- struct inode *inode)
-{
- spin_lock(&inode->i_lock);
- inode_attach_wb(inode, NULL);
- wbc_attach_and_unlock_inode(wbc, inode);
-}
+void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+ struct inode *inode);
/**
* wbc_init_bio - writeback specific initializtion of bio
@@ -303,13 +286,6 @@ static inline void inode_detach_wb(struct inode *inode)
{
}
-static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
- struct inode *inode)
- __releases(&inode->i_lock)
-{
- spin_unlock(&inode->i_lock);
-}
-
static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
struct inode *inode)
{
@@ -324,7 +300,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
}
static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
- struct page *page, size_t bytes)
+ struct folio *folio, size_t bytes)
{
}
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index d200518..86b0d47 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -19,6 +19,10 @@
#include <linux/user_namespace.h>
#include <uapi/linux/xattr.h>
+/* List of all open_how "versions". */
+#define XATTR_ARGS_SIZE_VER0 16 /* sizeof first published struct */
+#define XATTR_ARGS_SIZE_LATEST XATTR_ARGS_SIZE_VER0
+
struct inode;
struct dentry;
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 473a014..18687cc 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -161,5 +161,7 @@ void bond_option_arp_ip_targets_clear(struct bonding *bond);
#if IS_ENABLED(CONFIG_IPV6)
void bond_option_ns_ip6_targets_clear(struct bonding *bond);
#endif
+void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave);
+void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave);
#endif /* _NET_BOND_OPTIONS_H */
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index 529160f..8de8344 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -97,7 +97,7 @@ struct calipso_doi;
/* NetLabel audit information */
struct netlbl_audit {
- u32 secid;
+ struct lsm_prop prop;
kuid_t loginuid;
unsigned int sessionid;
};
diff --git a/include/net/tls.h b/include/net/tls.h
index 3a33924..61fef28 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -390,8 +390,12 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx)
static inline bool tls_sw_has_ctx_tx(const struct sock *sk)
{
- struct tls_context *ctx = tls_get_ctx(sk);
+ struct tls_context *ctx;
+ if (!sk_is_inet(sk) || !inet_test_bit(IS_ICSK, sk))
+ return false;
+
+ ctx = tls_get_ctx(sk);
if (!ctx)
return false;
return !!tls_sw_ctx_tx(ctx);
@@ -399,8 +403,12 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk)
static inline bool tls_sw_has_ctx_rx(const struct sock *sk)
{
- struct tls_context *ctx = tls_get_ctx(sk);
+ struct tls_context *ctx;
+ if (!sk_is_inet(sk) || !inet_test_bit(IS_ICSK, sk))
+ return false;
+
+ ctx = tls_get_ctx(sk);
if (!ctx)
return false;
return !!tls_sw_ctx_rx(ctx);
diff --git a/include/scsi/libfcoe.h b/include/scsi/libfcoe.h
index 3c58992..6616348 100644
--- a/include/scsi/libfcoe.h
+++ b/include/scsi/libfcoe.h
@@ -15,7 +15,7 @@
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/local_lock.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <scsi/fc/fc_fcoe.h>
#include <scsi/libfc.h>
#include <scsi/fcoe_sysfs.h>
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 1527d5d..bd0ea07 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -99,7 +99,7 @@ TRACE_EVENT(block_rq_requeue,
__entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0;
__entry->sector = blk_rq_trace_sector(rq);
__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
- __entry->ioprio = rq->ioprio;
+ __entry->ioprio = req_get_ioprio(rq);
blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
__get_str(cmd)[0] = '\0';
@@ -136,7 +136,7 @@ DECLARE_EVENT_CLASS(block_rq_completion,
__entry->sector = blk_rq_pos(rq);
__entry->nr_sector = nr_bytes >> 9;
__entry->error = blk_status_to_errno(error);
- __entry->ioprio = rq->ioprio;
+ __entry->ioprio = req_get_ioprio(rq);
blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
__get_str(cmd)[0] = '\0';
@@ -209,7 +209,7 @@ DECLARE_EVENT_CLASS(block_rq,
__entry->sector = blk_rq_trace_sector(rq);
__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
__entry->bytes = blk_rq_bytes(rq);
- __entry->ioprio = rq->ioprio;
+ __entry->ioprio = req_get_ioprio(rq);
blk_fill_rwbs(__entry->rwbs, rq->cmd_flags);
__get_str(cmd)[0] = '\0';
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index af6b382..4df93ca 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1706,9 +1706,10 @@ DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
TP_PROTO(const struct btrfs_fs_info *fs_info,
- const struct btrfs_qgroup_extent_record *rec),
+ const struct btrfs_qgroup_extent_record *rec,
+ u64 bytenr),
- TP_ARGS(fs_info, rec),
+ TP_ARGS(fs_info, rec, bytenr),
TP_STRUCT__entry_btrfs(
__field( u64, bytenr )
@@ -1716,7 +1717,7 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
),
TP_fast_assign_btrfs(fs_info,
- __entry->bytenr = rec->bytenr;
+ __entry->bytenr = bytenr;
__entry->num_bytes = rec->num_bytes;
),
@@ -1727,17 +1728,19 @@ DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents,
TP_PROTO(const struct btrfs_fs_info *fs_info,
- const struct btrfs_qgroup_extent_record *rec),
+ const struct btrfs_qgroup_extent_record *rec,
+ u64 bytenr),
- TP_ARGS(fs_info, rec)
+ TP_ARGS(fs_info, rec, bytenr)
);
DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_trace_extent,
TP_PROTO(const struct btrfs_fs_info *fs_info,
- const struct btrfs_qgroup_extent_record *rec),
+ const struct btrfs_qgroup_extent_record *rec,
+ u64 bytenr),
- TP_ARGS(fs_info, rec)
+ TP_ARGS(fs_info, rec, bytenr)
);
TRACE_EVENT(qgroup_num_dirty_extents,
@@ -2341,7 +2344,6 @@ DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_unlock_blocking);
DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_read);
DEFINE_BTRFS_LOCK_EVENT(btrfs_set_lock_blocking_write);
DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_read_lock);
-DEFINE_BTRFS_LOCK_EVENT(btrfs_try_tree_write_lock);
DEFINE_BTRFS_LOCK_EVENT(btrfs_tree_read_lock_atomic);
DECLARE_EVENT_CLASS(btrfs__space_info_update,
@@ -2553,10 +2555,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count,
TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
- TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr,
- u64 last_root_id, u64 last_ino),
+ TP_PROTO(const struct btrfs_fs_info *fs_info, long nr),
- TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino),
+ TP_ARGS(fs_info, nr),
TP_STRUCT__entry_btrfs(
__field( long, nr_to_scan )
@@ -2566,10 +2567,11 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
),
TP_fast_assign_btrfs(fs_info,
- __entry->nr_to_scan = nr_to_scan;
+ __entry->nr_to_scan = \
+ atomic64_read(&fs_info->em_shrinker_nr_to_scan);
__entry->nr = nr;
- __entry->last_root_id = last_root_id;
- __entry->last_ino = last_ino;
+ __entry->last_root_id = fs_info->em_shrinker_last_root;
+ __entry->last_ino = fs_info->em_shrinker_last_ino;
),
TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
@@ -2579,10 +2581,9 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
- TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr,
- u64 last_root_id, u64 last_ino),
+ TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),
- TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino),
+ TP_ARGS(fs_info, nr_dropped, nr),
TP_STRUCT__entry_btrfs(
__field( long, nr_dropped )
@@ -2594,8 +2595,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
TP_fast_assign_btrfs(fs_info,
__entry->nr_dropped = nr_dropped;
__entry->nr = nr;
- __entry->last_root_id = last_root_id;
- __entry->last_ino = last_ino;
+ __entry->last_root_id = fs_info->em_shrinker_last_root;
+ __entry->last_ino = fs_info->em_shrinker_last_ino;
),
TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
diff --git a/include/trace/events/hugetlbfs.h b/include/trace/events/hugetlbfs.h
new file mode 100644
index 0000000..8331c90
--- /dev/null
+++ b/include/trace/events/hugetlbfs.h
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hugetlbfs
+
+#if !defined(_TRACE_HUGETLBFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HUGETLBFS_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(hugetlbfs_alloc_inode,
+
+ TP_PROTO(struct inode *inode, struct inode *dir, int mode),
+
+ TP_ARGS(inode, dir, mode),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(ino_t, dir)
+ __field(__u16, mode)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->dir = dir->i_ino;
+ __entry->mode = mode;
+ ),
+
+ TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ (unsigned long) __entry->dir, __entry->mode)
+);
+
+DECLARE_EVENT_CLASS(hugetlbfs__inode,
+
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(__u16, mode)
+ __field(loff_t, size)
+ __field(unsigned int, nlink)
+ __field(unsigned int, seals)
+ __field(blkcnt_t, blocks)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->mode = inode->i_mode;
+ __entry->size = inode->i_size;
+ __entry->nlink = inode->i_nlink;
+ __entry->seals = HUGETLBFS_I(inode)->seals;
+ __entry->blocks = inode->i_blocks;
+ ),
+
+ TP_printk("dev %d,%d ino %lu mode 0%o size %lld nlink %u seals %u blocks %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino,
+ __entry->mode, __entry->size, __entry->nlink, __entry->seals,
+ (unsigned long long)__entry->blocks)
+);
+
+DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_evict_inode,
+
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode)
+);
+
+DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_free_inode,
+
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode)
+);
+
+TRACE_EVENT(hugetlbfs_setattr,
+
+ TP_PROTO(struct inode *inode, struct dentry *dentry,
+ struct iattr *attr),
+
+ TP_ARGS(inode, dentry, attr),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(unsigned int, d_len)
+ __string(d_name, dentry->d_name.name)
+ __field(unsigned int, ia_valid)
+ __field(unsigned int, ia_mode)
+ __field(loff_t, old_size)
+ __field(loff_t, ia_size)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->d_len = dentry->d_name.len;
+ __assign_str(d_name);
+ __entry->ia_valid = attr->ia_valid;
+ __entry->ia_mode = attr->ia_mode;
+ __entry->old_size = inode->i_size;
+ __entry->ia_size = attr->ia_size;
+ ),
+
+ TP_printk("dev %d,%d ino %lu name %.*s valid %#x mode 0%o old_size %lld size %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino,
+ __entry->d_len, __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
+ __entry->old_size, __entry->ia_size)
+);
+
+TRACE_EVENT(hugetlbfs_fallocate,
+
+ TP_PROTO(struct inode *inode, int mode,
+ loff_t offset, loff_t len, int ret),
+
+ TP_ARGS(inode, mode, offset, len, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(int, mode)
+ __field(loff_t, offset)
+ __field(loff_t, len)
+ __field(loff_t, size)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->mode = mode;
+ __entry->offset = offset;
+ __entry->len = len;
+ __entry->size = inode->i_size;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("dev %d,%d ino %lu mode 0%o offset %lld len %lld size %lld ret %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long)__entry->ino, __entry->mode,
+ (unsigned long long)__entry->offset,
+ (unsigned long long)__entry->len,
+ (unsigned long long)__entry->size,
+ __entry->ret)
+);
+
+#endif /* _TRACE_HUGETLBFS_H */
+
+ /* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 412c9c2..fb81c53 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -315,20 +315,14 @@ TRACE_EVENT(io_uring_fail_link,
* io_uring_complete - called when completing an SQE
*
* @ctx: pointer to a ring context structure
- * @req: pointer to a submitted request
- * @user_data: user data associated with the request
- * @res: result of the request
- * @cflags: completion flags
- * @extra1: extra 64-bit data for CQE32
- * @extra2: extra 64-bit data for CQE32
- *
+ * @req: (optional) pointer to a submitted request
+ * @cqe: pointer to the filled in CQE being posted
*/
TRACE_EVENT(io_uring_complete,
- TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags,
- u64 extra1, u64 extra2),
+TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe),
- TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2),
+ TP_ARGS(ctx, req, cqe),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -343,11 +337,11 @@ TRACE_EVENT(io_uring_complete,
TP_fast_assign(
__entry->ctx = ctx;
__entry->req = req;
- __entry->user_data = user_data;
- __entry->res = res;
- __entry->cflags = cflags;
- __entry->extra1 = extra1;
- __entry->extra2 = extra2;
+ __entry->user_data = cqe->user_data;
+ __entry->res = cqe->res;
+ __entry->cflags = cqe->flags;
+ __entry->extra1 = io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0;
+ __entry->extra2 = io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0;
),
TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 69975c9..bf511bc 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -450,7 +450,7 @@ TRACE_EVENT(netfs_folio,
struct address_space *__m = READ_ONCE(folio->mapping);
__entry->ino = __m ? __m->host->i_ino : 0;
__entry->why = why;
- __entry->index = folio_index(folio);
+ __entry->index = folio->index;
__entry->nr = folio_nr_pages(folio);
),
diff --git a/include/trace/events/timestamp.h b/include/trace/events/timestamp.h
new file mode 100644
index 0000000..c9e5ec9
--- /dev/null
+++ b/include/trace/events/timestamp.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM timestamp
+
+#if !defined(_TRACE_TIMESTAMP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TIMESTAMP_H
+
+#include <linux/tracepoint.h>
+#include <linux/fs.h>
+
+#define CTIME_QUERIED_FLAGS \
+ { I_CTIME_QUERIED, "Q" }
+
+DECLARE_EVENT_CLASS(ctime,
+ TP_PROTO(struct inode *inode,
+ struct timespec64 *ctime),
+
+ TP_ARGS(inode, ctime),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(time64_t, ctime_s)
+ __field(u32, ctime_ns)
+ __field(u32, gen)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->gen = inode->i_generation;
+ __entry->ctime_s = ctime->tv_sec;
+ __entry->ctime_ns = ctime->tv_nsec;
+ ),
+
+ TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen,
+ __entry->ctime_s, __entry->ctime_ns
+ )
+);
+
+DEFINE_EVENT(ctime, inode_set_ctime_to_ts,
+ TP_PROTO(struct inode *inode,
+ struct timespec64 *ctime),
+ TP_ARGS(inode, ctime));
+
+DEFINE_EVENT(ctime, ctime_xchg_skip,
+ TP_PROTO(struct inode *inode,
+ struct timespec64 *ctime),
+ TP_ARGS(inode, ctime));
+
+TRACE_EVENT(ctime_ns_xchg,
+ TP_PROTO(struct inode *inode,
+ u32 old,
+ u32 new,
+ u32 cur),
+
+ TP_ARGS(inode, old, new, cur),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(u32, gen)
+ __field(u32, old)
+ __field(u32, new)
+ __field(u32, cur)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->gen = inode->i_generation;
+ __entry->old = old;
+ __entry->new = new;
+ __entry->cur = cur;
+ ),
+
+ TP_printk("ino=%d:%d:%ld:%u old=%u:%s new=%u cur=%u:%s",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen,
+ __entry->old & ~I_CTIME_QUERIED,
+ __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS),
+ __entry->new,
+ __entry->cur & ~I_CTIME_QUERIED,
+ __print_flags(__entry->cur & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS)
+ )
+);
+
+TRACE_EVENT(fill_mg_cmtime,
+ TP_PROTO(struct inode *inode,
+ struct timespec64 *ctime,
+ struct timespec64 *mtime),
+
+ TP_ARGS(inode, ctime, mtime),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(ino_t, ino)
+ __field(time64_t, ctime_s)
+ __field(time64_t, mtime_s)
+ __field(u32, ctime_ns)
+ __field(u32, mtime_ns)
+ __field(u32, gen)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->gen = inode->i_generation;
+ __entry->ctime_s = ctime->tv_sec;
+ __entry->mtime_s = mtime->tv_sec;
+ __entry->ctime_ns = ctime->tv_nsec;
+ __entry->mtime_ns = mtime->tv_nsec;
+ ),
+
+ TP_printk("ino=%d:%d:%ld:%u ctime=%lld.%u mtime=%lld.%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen,
+ __entry->ctime_s, __entry->ctime_ns,
+ __entry->mtime_s, __entry->mtime_ns
+ )
+);
+#endif /* _TRACE_TIMESTAMP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 57e8195..5e3d61d 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -19,4 +19,8 @@
#define MCL_FUTURE 2 /* lock all future mappings */
#define MCL_ONFAULT 4 /* lock all pages that are faulted in */
+#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */
+#define SHADOW_STACK_SET_MARKER (1ULL << 1) /* Set up a top of stack marker in the shadow stack */
+
+
#endif /* __ASM_GENERIC_MMAN_H */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 5bf6148..88dc393 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -841,8 +841,17 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
#define __NR_mseal 462
__SYSCALL(__NR_mseal, sys_mseal)
+#define __NR_setxattrat 463
+__SYSCALL(__NR_setxattrat, sys_setxattrat)
+#define __NR_getxattrat 464
+__SYSCALL(__NR_getxattrat, sys_getxattrat)
+#define __NR_listxattrat 465
+__SYSCALL(__NR_listxattrat, sys_listxattrat)
+#define __NR_removexattrat 466
+__SYSCALL(__NR_removexattrat, sys_removexattrat)
+
#undef __NR_syscalls
-#define __NR_syscalls 463
+#define __NR_syscalls 467
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index cdf6ad8..d3b222d 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -1049,6 +1049,29 @@ struct btrfs_ioctl_encoded_io_args {
#define BTRFS_ENCODED_IO_ENCRYPTION_NONE 0
#define BTRFS_ENCODED_IO_ENCRYPTION_TYPES 1
+/*
+ * Wait for subvolume cleaning process. This queries the kernel queue and it
+ * can change between the calls.
+ *
+ * - FOR_ONE - specify the subvolid
+ * - FOR_QUEUED - wait for all currently queued
+ * - COUNT - count number of queued
+ * - PEEK_FIRST - read which is the first in the queue (to be cleaned or being
+ * cleaned already), or 0 if the queue is empty
+ * - PEEK_LAST - read the last subvolid in the queue, or 0 if the queue is empty
+ */
+struct btrfs_ioctl_subvol_wait {
+ __u64 subvolid;
+ __u32 mode;
+ __u32 count;
+};
+
+#define BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE (0)
+#define BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED (1)
+#define BTRFS_SUBVOL_SYNC_COUNT (2)
+#define BTRFS_SUBVOL_SYNC_PEEK_FIRST (3)
+#define BTRFS_SUBVOL_SYNC_PEEK_LAST (4)
+
/* Error codes as returned by the kernel */
enum btrfs_err_code {
BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
@@ -1181,6 +1204,8 @@ enum btrfs_err_code {
struct btrfs_ioctl_encoded_io_args)
#define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \
struct btrfs_ioctl_encoded_io_args)
+#define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \
+ struct btrfs_ioctl_subvol_wait)
#ifdef __cplusplus
}
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
index 20a6c0f..db05e041 100644
--- a/include/uapi/linux/cryptouser.h
+++ b/include/uapi/linux/cryptouser.h
@@ -64,6 +64,7 @@ enum crypto_attr_type_t {
CRYPTOCFGA_STAT_AKCIPHER, /* No longer supported, do not use. */
CRYPTOCFGA_STAT_KPP, /* No longer supported, do not use. */
CRYPTOCFGA_STAT_ACOMP, /* No longer supported, do not use. */
+ CRYPTOCFGA_REPORT_SIG, /* struct crypto_report_sig */
__CRYPTOCFGA_MAX
#define CRYPTOCFGA_MAX (__CRYPTOCFGA_MAX - 1)
@@ -207,6 +208,10 @@ struct crypto_report_acomp {
char type[CRYPTO_MAX_NAME];
};
+struct crypto_report_sig {
+ char type[CRYPTO_MAX_NAME];
+};
+
#define CRYPTO_REPORT_MAXSIZE (sizeof(struct crypto_user_alg) + \
sizeof(struct crypto_report_blkcipher))
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b993598..9adc218 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -443,6 +443,7 @@ typedef struct elf64_shdr {
#define NT_ARM_ZT 0x40d /* ARM SME ZT registers */
#define NT_ARM_FPMR 0x40e /* ARM floating point mode register */
#define NT_ARM_POE 0x40f /* ARM POE registers */
+#define NT_ARM_GCS 0x410 /* ARM GCS state */
#define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */
#define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */
#define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 87e2dec..a40833b 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -154,8 +154,4 @@
usable with open_by_handle_at(2). */
#define AT_HANDLE_MNT_ID_UNIQUE 0x001 /* Return the u64 unique mount ID. */
-#if defined(__KERNEL__)
-#define AT_GETATTR_NOSEC 0x80000000
-#endif
-
#endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1fe79e7..4418d019 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -200,6 +200,9 @@ enum io_uring_sqe_flags_bit {
*/
#define IORING_SETUP_NO_SQARRAY (1U << 16)
+/* Use hybrid poll in iopoll process */
+#define IORING_SETUP_HYBRID_IOPOLL (1U << 17)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -416,6 +419,9 @@ enum io_uring_msg_ring_flags {
* IORING_NOP_INJECT_RESULT Inject result from sqe->result
*/
#define IORING_NOP_INJECT_RESULT (1U << 0)
+#define IORING_NOP_FILE (1U << 1)
+#define IORING_NOP_FIXED_FILE (1U << 2)
+#define IORING_NOP_FIXED_BUFFER (1U << 3)
/*
* IO completion data structure (Completion Queue Entry)
@@ -518,6 +524,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4)
#define IORING_ENTER_ABS_TIMER (1U << 5)
+#define IORING_ENTER_EXT_ARG_REG (1U << 6)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -612,6 +619,16 @@ enum io_uring_register_op {
/* clone registered buffers from source ring to current ring */
IORING_REGISTER_CLONE_BUFFERS = 30,
+ /* send MSG_RING without having a ring */
+ IORING_REGISTER_SEND_MSG_RING = 31,
+
+ /* 32 reserved for zc rx */
+
+ /* resize CQ ring */
+ IORING_REGISTER_RESIZE_RINGS = 33,
+
+ IORING_REGISTER_MEM_REGION = 34,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -632,6 +649,31 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
+enum {
+ /* initialise with user provided memory pointed by user_addr */
+ IORING_MEM_REGION_TYPE_USER = 1,
+};
+
+struct io_uring_region_desc {
+ __u64 user_addr;
+ __u64 size;
+ __u32 flags;
+ __u32 id;
+ __u64 mmap_offset;
+ __u64 __resv[4];
+};
+
+enum {
+ /* expose the region as registered wait arguments */
+ IORING_MEM_REGION_REG_WAIT_ARG = 1,
+};
+
+struct io_uring_mem_region_reg {
+ __u64 region_uptr; /* struct io_uring_region_desc * */
+ __u64 flags;
+ __u64 __resv[2];
+};
+
/*
* Register a fully sparse file space, rather than pass in an array of all
* -1 file descriptors.
@@ -698,13 +740,17 @@ struct io_uring_clock_register {
};
enum {
- IORING_REGISTER_SRC_REGISTERED = 1,
+ IORING_REGISTER_SRC_REGISTERED = (1U << 0),
+ IORING_REGISTER_DST_REPLACE = (1U << 1),
};
struct io_uring_clone_buffers {
__u32 src_fd;
__u32 flags;
- __u32 pad[6];
+ __u32 src_off;
+ __u32 dst_off;
+ __u32 nr;
+ __u32 pad[3];
};
struct io_uring_buf {
@@ -768,12 +814,40 @@ struct io_uring_buf_status {
__u32 resv[8];
};
+enum io_uring_napi_op {
+ /* register/ungister backward compatible opcode */
+ IO_URING_NAPI_REGISTER_OP = 0,
+
+ /* opcodes to update napi_list when static tracking is used */
+ IO_URING_NAPI_STATIC_ADD_ID = 1,
+ IO_URING_NAPI_STATIC_DEL_ID = 2
+};
+
+enum io_uring_napi_tracking_strategy {
+ /* value must be 0 for backward compatibility */
+ IO_URING_NAPI_TRACKING_DYNAMIC = 0,
+ IO_URING_NAPI_TRACKING_STATIC = 1,
+ IO_URING_NAPI_TRACKING_INACTIVE = 255
+};
+
/* argument for IORING_(UN)REGISTER_NAPI */
struct io_uring_napi {
__u32 busy_poll_to;
__u8 prefer_busy_poll;
- __u8 pad[3];
- __u64 resv;
+
+ /* a io_uring_napi_op value */
+ __u8 opcode;
+ __u8 pad[2];
+
+ /*
+ * for IO_URING_NAPI_REGISTER_OP, it is a
+ * io_uring_napi_tracking_strategy value.
+ *
+ * for IO_URING_NAPI_STATIC_ADD_ID/IO_URING_NAPI_STATIC_DEL_ID
+ * it is the napi id to add/del from napi_list.
+ */
+ __u32 op_param;
+ __u32 resv;
};
/*
@@ -795,6 +869,43 @@ enum io_uring_register_restriction_op {
IORING_RESTRICTION_LAST
};
+enum {
+ IORING_REG_WAIT_TS = (1U << 0),
+};
+
+/*
+ * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
+ * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
+ * called rather than pass in a wait argument structure separately.
+ */
+struct io_uring_cqwait_reg_arg {
+ __u32 flags;
+ __u32 struct_size;
+ __u32 nr_entries;
+ __u32 pad;
+ __u64 user_addr;
+ __u64 pad2[3];
+};
+
+/*
+ * Argument for io_uring_enter(2) with
+ * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
+ * is an index into a previously registered fixed wait region described by
+ * the below structure.
+ */
+struct io_uring_reg_wait {
+ struct __kernel_timespec ts;
+ __u32 min_wait_usec;
+ __u32 flags;
+ __u64 sigmask;
+ __u32 sigmask_sz;
+ __u32 pad[3];
+ __u64 pad2[2];
+};
+
+/*
+ * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
+ */
struct io_uring_getevents_arg {
__u64 sigmask;
__u32 sigmask_sz;
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index 225bc36..c070088 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -154,7 +154,7 @@ struct mount_attr {
*/
struct statmount {
__u32 size; /* Total size, including strings */
- __u32 mnt_opts; /* [str] Mount options of the mount */
+ __u32 mnt_opts; /* [str] Options (comma separated, escaped) */
__u64 mask; /* What results were written */
__u32 sb_dev_major; /* Device ID */
__u32 sb_dev_minor;
@@ -173,7 +173,13 @@ struct statmount {
__u32 mnt_root; /* [str] Root of mount relative to root of fs */
__u32 mnt_point; /* [str] Mountpoint relative to current root */
__u64 mnt_ns_id; /* ID of the mount namespace */
- __u64 __spare2[49];
+ __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */
+ __u32 sb_source; /* [str] Source string of the mount */
+ __u32 opt_num; /* Number of fs options */
+ __u32 opt_array; /* [str] Array of nul terminated fs options */
+ __u32 opt_sec_num; /* Number of security options */
+ __u32 opt_sec_array; /* [str] Array of nul terminated security options */
+ __u64 __spare2[46];
char str[]; /* Variable size part containing strings */
};
@@ -207,6 +213,10 @@ struct mnt_id_req {
#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */
#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */
#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */
+#define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */
+#define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */
+#define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */
+#define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */
/*
* Special @mnt_id values that can be passed to listmount
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 565fc06..4540f63 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -16,6 +16,55 @@
#define PIDFD_SIGNAL_THREAD_GROUP (1UL << 1)
#define PIDFD_SIGNAL_PROCESS_GROUP (1UL << 2)
+/* Flags for pidfd_info. */
+#define PIDFD_INFO_PID (1UL << 0) /* Always returned, even if not requested */
+#define PIDFD_INFO_CREDS (1UL << 1) /* Always returned, even if not requested */
+#define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */
+
+#define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */
+
+struct pidfd_info {
+ /*
+ * This mask is similar to the request_mask in statx(2).
+ *
+ * Userspace indicates what extensions or expensive-to-calculate fields
+ * they want by setting the corresponding bits in mask. The kernel
+ * will ignore bits that it does not know about.
+ *
+ * When filling the structure, the kernel will only set bits
+ * corresponding to the fields that were actually filled by the kernel.
+ * This also includes any future extensions that might be automatically
+ * filled. If the structure size is too small to contain a field
+ * (requested or not), to avoid confusion the mask will not
+ * contain a bit for that field.
+ *
+ * As such, userspace MUST verify that mask contains the
+ * corresponding flags after the ioctl(2) returns to ensure that it is
+ * using valid data.
+ */
+ __u64 mask;
+ /*
+ * The information contained in the following fields might be stale at the
+ * time it is received, as the target process might have exited as soon as
+ * the IOCTL was processed, and there is no way to avoid that. However, it
+ * is guaranteed that if the call was successful, then the information was
+ * correct and referred to the intended process at the time the work was
+ * performed. */
+ __u64 cgroupid;
+ __u32 pid;
+ __u32 tgid;
+ __u32 ppid;
+ __u32 ruid;
+ __u32 rgid;
+ __u32 euid;
+ __u32 egid;
+ __u32 suid;
+ __u32 sgid;
+ __u32 fsuid;
+ __u32 fsgid;
+ __u32 spare0[1];
+};
+
#define PIDFS_IOCTL_MAGIC 0xFF
#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1)
@@ -28,5 +77,6 @@
#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8)
#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9)
#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10)
+#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info)
#endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 3579179..557a3d2 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -328,4 +328,26 @@ struct prctl_mm_map {
# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
# define PR_PPC_DEXCR_CTRL_MASK 0x1f
+/*
+ * Get the current shadow stack configuration for the current thread,
+ * this will be the value configured via PR_SET_SHADOW_STACK_STATUS.
+ */
+#define PR_GET_SHADOW_STACK_STATUS 74
+
+/*
+ * Set the current shadow stack configuration. Enabling the shadow
+ * stack will cause a shadow stack to be allocated for the thread.
+ */
+#define PR_SET_SHADOW_STACK_STATUS 75
+# define PR_SHADOW_STACK_ENABLE (1UL << 0)
+# define PR_SHADOW_STACK_WRITE (1UL << 1)
+# define PR_SHADOW_STACK_PUSH (1UL << 2)
+
+/*
+ * Prevent further changes to the specified shadow stack
+ * configuration. All bits may be locked via this call, including
+ * undefined bits.
+ */
+#define PR_LOCK_SHADOW_STACK_STATUS 76
+
#endif /* _LINUX_PRCTL_H */
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index d3994b7..9025dd5 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -215,5 +215,6 @@ struct opal_revert_lsp {
#define IOC_OPAL_GET_GEOMETRY _IOR('p', 238, struct opal_geometry)
#define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery)
#define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp)
+#define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw)
#endif /* _UAPI_SED_OPAL_H */
diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h
index fc78bf3..ba8604b 100644
--- a/include/uapi/linux/thermal.h
+++ b/include/uapi/linux/thermal.h
@@ -3,6 +3,8 @@
#define _UAPI_LINUX_THERMAL_H
#define THERMAL_NAME_LENGTH 20
+#define THERMAL_THRESHOLD_WAY_UP BIT(0)
+#define THERMAL_THRESHOLD_WAY_DOWN BIT(1)
enum thermal_device_mode {
THERMAL_DEVICE_DISABLED = 0,
@@ -18,7 +20,7 @@ enum thermal_trip_type {
/* Adding event notification support elements */
#define THERMAL_GENL_FAMILY_NAME "thermal"
-#define THERMAL_GENL_VERSION 0x01
+#define THERMAL_GENL_VERSION 0x02
#define THERMAL_GENL_SAMPLING_GROUP_NAME "sampling"
#define THERMAL_GENL_EVENT_GROUP_NAME "event"
@@ -28,6 +30,7 @@ enum thermal_genl_attr {
THERMAL_GENL_ATTR_TZ,
THERMAL_GENL_ATTR_TZ_ID,
THERMAL_GENL_ATTR_TZ_TEMP,
+ THERMAL_GENL_ATTR_TZ_PREV_TEMP,
THERMAL_GENL_ATTR_TZ_TRIP,
THERMAL_GENL_ATTR_TZ_TRIP_ID,
THERMAL_GENL_ATTR_TZ_TRIP_TYPE,
@@ -48,6 +51,9 @@ enum thermal_genl_attr {
THERMAL_GENL_ATTR_CPU_CAPABILITY_ID,
THERMAL_GENL_ATTR_CPU_CAPABILITY_PERFORMANCE,
THERMAL_GENL_ATTR_CPU_CAPABILITY_EFFICIENCY,
+ THERMAL_GENL_ATTR_THRESHOLD,
+ THERMAL_GENL_ATTR_THRESHOLD_TEMP,
+ THERMAL_GENL_ATTR_THRESHOLD_DIRECTION,
__THERMAL_GENL_ATTR_MAX,
};
#define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1)
@@ -75,6 +81,11 @@ enum thermal_genl_event {
THERMAL_GENL_EVENT_CDEV_STATE_UPDATE, /* Cdev state updated */
THERMAL_GENL_EVENT_TZ_GOV_CHANGE, /* Governor policy changed */
THERMAL_GENL_EVENT_CPU_CAPABILITY_CHANGE, /* CPU capability changed */
+ THERMAL_GENL_EVENT_THRESHOLD_ADD, /* A thresold has been added */
+ THERMAL_GENL_EVENT_THRESHOLD_DELETE, /* A thresold has been deleted */
+ THERMAL_GENL_EVENT_THRESHOLD_FLUSH, /* All thresolds have been deleted */
+ THERMAL_GENL_EVENT_THRESHOLD_UP, /* A thresold has been crossed the way up */
+ THERMAL_GENL_EVENT_THRESHOLD_DOWN, /* A thresold has been crossed the way down */
__THERMAL_GENL_EVENT_MAX,
};
#define THERMAL_GENL_EVENT_MAX (__THERMAL_GENL_EVENT_MAX - 1)
@@ -82,12 +93,16 @@ enum thermal_genl_event {
/* Commands supported by the thermal_genl_family */
enum thermal_genl_cmd {
THERMAL_GENL_CMD_UNSPEC,
- THERMAL_GENL_CMD_TZ_GET_ID, /* List of thermal zones id */
- THERMAL_GENL_CMD_TZ_GET_TRIP, /* List of thermal trips */
- THERMAL_GENL_CMD_TZ_GET_TEMP, /* Get the thermal zone temperature */
- THERMAL_GENL_CMD_TZ_GET_GOV, /* Get the thermal zone governor */
- THERMAL_GENL_CMD_TZ_GET_MODE, /* Get the thermal zone mode */
- THERMAL_GENL_CMD_CDEV_GET, /* List of cdev id */
+ THERMAL_GENL_CMD_TZ_GET_ID, /* List of thermal zones id */
+ THERMAL_GENL_CMD_TZ_GET_TRIP, /* List of thermal trips */
+ THERMAL_GENL_CMD_TZ_GET_TEMP, /* Get the thermal zone temperature */
+ THERMAL_GENL_CMD_TZ_GET_GOV, /* Get the thermal zone governor */
+ THERMAL_GENL_CMD_TZ_GET_MODE, /* Get the thermal zone mode */
+ THERMAL_GENL_CMD_CDEV_GET, /* List of cdev id */
+ THERMAL_GENL_CMD_THRESHOLD_GET, /* List of thresholds */
+ THERMAL_GENL_CMD_THRESHOLD_ADD, /* Add a threshold */
+ THERMAL_GENL_CMD_THRESHOLD_DELETE, /* Delete a threshold */
+ THERMAL_GENL_CMD_THRESHOLD_FLUSH, /* Flush all the thresholds */
__THERMAL_GENL_CMD_MAX,
};
#define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1)
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 1287363..a8bc98b 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -147,8 +147,18 @@
*/
#define UBLK_F_NEED_GET_DATA (1UL << 2)
+/*
+ * - Block devices are recoverable if ublk server exits and restarts
+ * - Outstanding I/O when ublk server exits is met with errors
+ * - I/O issued while there is no ublk server queues
+ */
#define UBLK_F_USER_RECOVERY (1UL << 3)
+/*
+ * - Block devices are recoverable if ublk server exits and restarts
+ * - Outstanding I/O when ublk server exits is reissued
+ * - I/O issued while there is no ublk server queues
+ */
#define UBLK_F_USER_RECOVERY_REISSUE (1UL << 4)
/*
@@ -190,10 +200,18 @@
*/
#define UBLK_F_ZONED (1ULL << 8)
+/*
+ * - Block devices are recoverable if ublk server exits and restarts
+ * - Outstanding I/O when ublk server exits is met with errors
+ * - I/O issued while there is no ublk server is met with errors
+ */
+#define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
#define UBLK_S_DEV_QUIESCED 2
+#define UBLK_S_DEV_FAIL_IO 3
/* shipped via sqe->cmd of io_uring command */
struct ublksrv_ctrl_cmd {
diff --git a/include/uapi/linux/virtio_crypto.h b/include/uapi/linux/virtio_crypto.h
index 71a54a6..2fccb64 100644
--- a/include/uapi/linux/virtio_crypto.h
+++ b/include/uapi/linux/virtio_crypto.h
@@ -329,6 +329,7 @@ struct virtio_crypto_op_header {
VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x00)
#define VIRTIO_CRYPTO_AKCIPHER_DECRYPT \
VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x01)
+ /* akcipher sign/verify opcodes are deprecated */
#define VIRTIO_CRYPTO_AKCIPHER_SIGN \
VIRTIO_CRYPTO_OPCODE(VIRTIO_CRYPTO_SERVICE_AKCIPHER, 0x02)
#define VIRTIO_CRYPTO_AKCIPHER_VERIFY \
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index 9463db2..9854f9c 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -11,6 +11,7 @@
*/
#include <linux/libc-compat.h>
+#include <linux/types.h>
#ifndef _UAPI_LINUX_XATTR_H
#define _UAPI_LINUX_XATTR_H
@@ -20,6 +21,12 @@
#define XATTR_CREATE 0x1 /* set value, fail if attr already exists */
#define XATTR_REPLACE 0x2 /* set value, fail if attr does not exist */
+
+struct xattr_args {
+ __aligned_u64 __user value;
+ __u32 size;
+ __u32 flags;
+};
#endif
/* Namespaces */
diff --git a/init/initramfs.c b/init/initramfs.c
index bc911e4..b2f7583 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -360,6 +360,15 @@ static int __init do_name(void)
{
state = SkipIt;
next_state = Reset;
+
+ /* name_len > 0 && name_len <= PATH_MAX checked in do_header */
+ if (collected[name_len - 1] != '\0') {
+ pr_err("initramfs name without nulterm: %.*s\n",
+ (int)name_len, collected);
+ error("malformed archive");
+ return 1;
+ }
+
if (strcmp(collected, "TRAILER!!!") == 0) {
free_hash();
return 0;
@@ -424,6 +433,12 @@ static int __init do_copy(void)
static int __init do_symlink(void)
{
+ if (collected[name_len - 1] != '\0') {
+ pr_err("initramfs symlink without nulterm: %.*s\n",
+ (int)name_len, collected);
+ error("malformed archive");
+ return 1;
+ }
collected[N_ALIGN(name_len) + body_len] = '\0';
clean_path(collected, 0);
init_symlink(collected + N_ALIGN(name_len), collected);
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index a6e58a2..4841935 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -205,7 +205,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
.opcode = cancel->opcode,
.seq = atomic_inc_return(&req->ctx->cancel_seq),
};
- struct io_uring_task *tctx = req->task->io_uring;
+ struct io_uring_task *tctx = req->tctx;
int ret;
if (cd.flags & IORING_ASYNC_CANCEL_FD) {
@@ -232,16 +232,6 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK;
}
-void init_hash_table(struct io_hash_table *table, unsigned size)
-{
- unsigned int i;
-
- for (i = 0; i < size; i++) {
- spin_lock_init(&table->hbs[i].lock);
- INIT_HLIST_HEAD(&table->hbs[i].list);
- }
-}
-
static int __io_sync_cancel(struct io_uring_task *tctx,
struct io_cancel_data *cd, int fd)
{
@@ -250,10 +240,12 @@ static int __io_sync_cancel(struct io_uring_task *tctx,
/* fixed must be grabbed every time since we drop the uring_lock */
if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
(cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
- if (unlikely(fd >= ctx->nr_user_files))
+ struct io_rsrc_node *node;
+
+ node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
+ if (unlikely(!node))
return -EBADF;
- fd = array_index_nospec(fd, ctx->nr_user_files);
- cd->file = io_file_from_index(&ctx->file_table, fd);
+ cd->file = io_slot_file(node);
if (!cd->file)
return -EBADF;
}
diff --git a/io_uring/cancel.h b/io_uring/cancel.h
index b33995e..bbfea2c 100644
--- a/io_uring/cancel.h
+++ b/io_uring/cancel.h
@@ -20,7 +20,6 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
unsigned int issue_flags);
-void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index e37fddd..fab936d3 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -13,10 +13,12 @@
struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
- unsigned int eventfd_async: 1;
- struct rcu_head rcu;
+ unsigned int eventfd_async;
+ /* protected by ->completion_lock */
+ unsigned last_cq_tail;
refcount_t refs;
atomic_t ops;
+ struct rcu_head rcu;
};
enum {
@@ -41,14 +43,58 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
io_eventfd_free(rcu);
}
-void io_eventfd_signal(struct io_ring_ctx *ctx)
+static void io_eventfd_put(struct io_ev_fd *ev_fd)
{
- struct io_ev_fd *ev_fd = NULL;
+ if (refcount_dec_and_test(&ev_fd->refs))
+ call_rcu(&ev_fd->rcu, io_eventfd_free);
+}
+
+static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
+{
+ if (put_ref)
+ io_eventfd_put(ev_fd);
+ rcu_read_unlock();
+}
+
+/*
+ * Returns true if the caller should put the ev_fd reference, false if not.
+ */
+static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
+{
+ if (eventfd_signal_allowed()) {
+ eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
+ return true;
+ }
+ if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
+ call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Trigger if eventfd_async isn't set, or if it's set and the caller is
+ * an async worker. If ev_fd isn't valid, obviously return false.
+ */
+static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
+{
+ if (ev_fd)
+ return !ev_fd->eventfd_async || io_wq_current_is_worker();
+ return false;
+}
+
+/*
+ * On success, returns with an ev_fd reference grabbed and the RCU read
+ * lock held.
+ */
+static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
+{
+ struct io_ev_fd *ev_fd;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return;
+ return NULL;
- guard(rcu)();
+ rcu_read_lock();
/*
* rcu_dereference ctx->io_ev_fd once and use it for both for checking
@@ -57,51 +103,53 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
- * Check again if ev_fd exists incase an io_eventfd_unregister call
+ * Check again if ev_fd exists in case an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
- if (unlikely(!ev_fd))
- return;
- if (!refcount_inc_not_zero(&ev_fd->refs))
- return;
- if (ev_fd->eventfd_async && !io_wq_current_is_worker())
- goto out;
+ if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
+ return ev_fd;
- if (likely(eventfd_signal_allowed())) {
- eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
- } else {
- if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
- call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
- return;
- }
- }
-out:
- if (refcount_dec_and_test(&ev_fd->refs))
- call_rcu(&ev_fd->rcu, io_eventfd_free);
+ rcu_read_unlock();
+ return NULL;
+}
+
+void io_eventfd_signal(struct io_ring_ctx *ctx)
+{
+ struct io_ev_fd *ev_fd;
+
+ ev_fd = io_eventfd_grab(ctx);
+ if (ev_fd)
+ io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
}
void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
- bool skip;
+ struct io_ev_fd *ev_fd;
- spin_lock(&ctx->completion_lock);
+ ev_fd = io_eventfd_grab(ctx);
+ if (ev_fd) {
+ bool skip, put_ref = true;
- /*
- * Eventfd should only get triggered when at least one event has been
- * posted. Some applications rely on the eventfd notification count
- * only changing IFF a new CQE has been added to the CQ ring. There's
- * no depedency on 1:1 relationship between how many times this
- * function is called (and hence the eventfd count) and number of CQEs
- * posted to the CQ ring.
- */
- skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
- ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
- spin_unlock(&ctx->completion_lock);
- if (skip)
- return;
+ /*
+ * Eventfd should only get triggered when at least one event
+ * has been posted. Some applications rely on the eventfd
+ * notification count only changing IFF a new CQE has been
+ * added to the CQ ring. There's no dependency on 1:1
+ * relationship between how many times this function is called
+ * (and hence the eventfd count) and number of CQEs posted to
+ * the CQ ring.
+ */
+ spin_lock(&ctx->completion_lock);
+ skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
+ ev_fd->last_cq_tail = ctx->cached_cq_tail;
+ spin_unlock(&ctx->completion_lock);
- io_eventfd_signal(ctx);
+ if (!skip)
+ put_ref = __io_eventfd_signal(ev_fd);
+
+ io_eventfd_release(ev_fd, put_ref);
+ }
}
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -132,7 +180,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
}
spin_lock(&ctx->completion_lock);
- ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
+ ev_fd->last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
ev_fd->eventfd_async = eventfd_async;
@@ -152,8 +200,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
if (ev_fd) {
ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
- if (refcount_dec_and_test(&ev_fd->refs))
- call_rcu(&ev_fd->rcu, io_eventfd_free);
+ io_eventfd_put(ev_fd);
return 0;
}
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 6b12476..b214e5a 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -46,6 +46,46 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0;
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
+ struct seq_file *m,
+ const char *tracking_strategy)
+{
+ seq_puts(m, "NAPI:\tenabled\n");
+ seq_printf(m, "napi tracking:\t%s\n", tracking_strategy);
+ seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
+ if (ctx->napi_prefer_busy_poll)
+ seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
+ else
+ seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
+}
+
+static __cold void napi_show_fdinfo(struct io_ring_ctx *ctx,
+ struct seq_file *m)
+{
+ unsigned int mode = READ_ONCE(ctx->napi_track_mode);
+
+ switch (mode) {
+ case IO_URING_NAPI_TRACKING_INACTIVE:
+ seq_puts(m, "NAPI:\tdisabled\n");
+ break;
+ case IO_URING_NAPI_TRACKING_DYNAMIC:
+ common_tracking_show_fdinfo(ctx, m, "dynamic");
+ break;
+ case IO_URING_NAPI_TRACKING_STATIC:
+ common_tracking_show_fdinfo(ctx, m, "static");
+ break;
+ default:
+ seq_printf(m, "NAPI:\tunknown mode (%u)\n", mode);
+ }
+}
+#else
+static inline void napi_show_fdinfo(struct io_ring_ctx *ctx,
+ struct seq_file *m)
+{
+}
+#endif
+
/*
* Caller holds a reference to the file already, we don't need to do
* anything else to get an extra reference.
@@ -165,20 +205,27 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
- seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
- for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
- struct file *f = io_file_from_index(&ctx->file_table, i);
+ seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
+ for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) {
+ struct file *f = NULL;
+ if (ctx->file_table.data.nodes[i])
+ f = io_slot_file(ctx->file_table.data.nodes[i]);
if (f)
seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
else
seq_printf(m, "%5u: <none>\n", i);
}
- seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
- for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *buf = ctx->user_bufs[i];
+ seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
+ for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
+ struct io_mapped_ubuf *buf = NULL;
- seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
+ if (ctx->buf_table.nodes[i])
+ buf = ctx->buf_table.nodes[i]->buf;
+ if (buf)
+ seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
+ else
+ seq_printf(m, "%5u: <none>\n", i);
}
if (has_lock && !xa_empty(&ctx->personalities)) {
unsigned long index;
@@ -190,22 +237,13 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
}
seq_puts(m, "PollList:\n");
- for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
+ for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) {
struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
- struct io_hash_bucket *hbl = &ctx->cancel_table_locked.hbs[i];
struct io_kiocb *req;
- spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node)
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
- task_work_pending(req->task));
- spin_unlock(&hb->lock);
-
- if (!has_lock)
- continue;
- hlist_for_each_entry(req, &hbl->list, hash_node)
- seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
- task_work_pending(req->task));
+ task_work_pending(req->tctx->task));
}
if (has_lock)
@@ -221,18 +259,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
}
spin_unlock(&ctx->completion_lock);
-
-#ifdef CONFIG_NET_RX_BUSY_POLL
- if (ctx->napi_enabled) {
- seq_puts(m, "NAPI:\tenabled\n");
- seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
- if (ctx->napi_prefer_busy_poll)
- seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
- else
- seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
- } else {
- seq_puts(m, "NAPI:\tdisabled\n");
- }
-#endif
+ napi_show_fdinfo(ctx, m);
}
#endif
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 997c56d..a21660e 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -36,27 +36,22 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
return -ENFILE;
}
-bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
+bool io_alloc_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table,
+ unsigned nr_files)
{
- table->files = kvcalloc(nr_files, sizeof(table->files[0]),
- GFP_KERNEL_ACCOUNT);
- if (unlikely(!table->files))
+ if (io_rsrc_data_alloc(&table->data, nr_files))
return false;
-
table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
- if (unlikely(!table->bitmap)) {
- kvfree(table->files);
- return false;
- }
-
- return true;
+ if (table->bitmap)
+ return true;
+ io_rsrc_data_free(ctx, &table->data);
+ return false;
}
-void io_free_file_tables(struct io_file_table *table)
+void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table)
{
- kvfree(table->files);
+ io_rsrc_data_free(ctx, &table->data);
bitmap_free(table->bitmap);
- table->files = NULL;
table->bitmap = NULL;
}
@@ -64,32 +59,24 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
u32 slot_index)
__must_hold(&req->ctx->uring_lock)
{
- struct io_fixed_file *file_slot;
- int ret;
+ struct io_rsrc_node *node;
if (io_is_uring_fops(file))
return -EBADF;
- if (!ctx->file_data)
+ if (!ctx->file_table.data.nr)
return -ENXIO;
- if (slot_index >= ctx->nr_user_files)
+ if (slot_index >= ctx->file_table.data.nr)
return -EINVAL;
- slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+ if (!node)
+ return -ENOMEM;
- if (file_slot->file_ptr) {
- ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
- io_slot_file(file_slot));
- if (ret)
- return ret;
-
- file_slot->file_ptr = 0;
- } else {
+ if (!io_reset_rsrc_node(ctx, &ctx->file_table.data, slot_index))
io_file_bitmap_set(&ctx->file_table, slot_index);
- }
- *io_get_tag_slot(ctx->file_data, slot_index) = 0;
- io_fixed_file_set(file_slot, file);
+ ctx->file_table.data.nodes[slot_index] = node;
+ io_fixed_file_set(node, file);
return 0;
}
@@ -134,25 +121,17 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
{
- struct io_fixed_file *file_slot;
- int ret;
+ struct io_rsrc_node *node;
- if (unlikely(!ctx->file_data))
+ if (unlikely(!ctx->file_table.data.nr))
return -ENXIO;
- if (offset >= ctx->nr_user_files)
+ if (offset >= ctx->file_table.data.nr)
return -EINVAL;
- offset = array_index_nospec(offset, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, offset);
- if (!file_slot->file_ptr)
+ node = io_rsrc_node_lookup(&ctx->file_table.data, offset);
+ if (!node)
return -EBADF;
-
- ret = io_queue_rsrc_removal(ctx->file_data, offset,
- io_slot_file(file_slot));
- if (ret)
- return ret;
-
- file_slot->file_ptr = 0;
+ io_reset_rsrc_node(ctx, &ctx->file_table.data, offset);
io_file_bitmap_clear(&ctx->file_table, offset);
return 0;
}
@@ -167,7 +146,7 @@ int io_register_file_alloc_range(struct io_ring_ctx *ctx,
return -EFAULT;
if (check_add_overflow(range.off, range.len, &end))
return -EOVERFLOW;
- if (range.resv || end > ctx->nr_user_files)
+ if (range.resv || end > ctx->file_table.data.nr)
return -EINVAL;
io_file_table_set_alloc_range(ctx, range.off, range.len);
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index b2435c4..7717ea9 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -4,9 +4,10 @@
#include <linux/file.h>
#include <linux/io_uring_types.h>
+#include "rsrc.h"
-bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
-void io_free_file_tables(struct io_file_table *table);
+bool io_alloc_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table, unsigned nr_files);
+void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table);
int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
struct file *file, unsigned int file_slot);
@@ -33,50 +34,34 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
table->alloc_hint = bit + 1;
}
-static inline struct io_fixed_file *
-io_fixed_file_slot(struct io_file_table *table, unsigned i)
-{
- return &table->files[i];
-}
-
#define FFS_NOWAIT 0x1UL
#define FFS_ISREG 0x2UL
#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
-static inline unsigned int io_slot_flags(struct io_fixed_file *slot)
+static inline unsigned int io_slot_flags(struct io_rsrc_node *node)
{
- return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
+
+ return (node->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
}
-static inline struct file *io_slot_file(struct io_fixed_file *slot)
+static inline struct file *io_slot_file(struct io_rsrc_node *node)
{
- return (struct file *)(slot->file_ptr & FFS_MASK);
+ return (struct file *)(node->file_ptr & FFS_MASK);
}
-static inline struct file *io_file_from_index(struct io_file_table *table,
- int index)
-{
- return io_slot_file(io_fixed_file_slot(table, index));
-}
-
-static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
+static inline void io_fixed_file_set(struct io_rsrc_node *node,
struct file *file)
{
- file_slot->file_ptr = (unsigned long)file |
+ node->file_ptr = (unsigned long)file |
(io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT);
}
-static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx)
-{
- ctx->file_table.alloc_hint = ctx->file_alloc_start;
-}
-
static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx,
unsigned off, unsigned len)
{
ctx->file_alloc_start = off;
ctx->file_alloc_end = off + len;
- io_reset_alloc_hint(ctx);
+ ctx->file_table.alloc_hint = ctx->file_alloc_start;
}
#endif
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 914848f..e29662f 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -141,7 +141,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
return -ENOENT;
}
-bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
bool cancel_all)
{
struct hlist_node *tmp;
@@ -151,7 +151,7 @@ bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
lockdep_assert_held(&ctx->uring_lock);
hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
- if (!io_match_task_safe(req, task, cancel_all))
+ if (!io_match_task_safe(req, tctx, cancel_all))
continue;
hlist_del_init(&req->hash_node);
__io_futex_cancel(ctx, req);
diff --git a/io_uring/futex.h b/io_uring/futex.h
index b8bb098..d789fcf 100644
--- a/io_uring/futex.h
+++ b/io_uring/futex.h
@@ -11,7 +11,7 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags);
#if defined(CONFIG_FUTEX)
int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned int issue_flags);
-bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
bool cancel_all);
bool io_futex_cache_init(struct io_ring_ctx *ctx);
void io_futex_cache_free(struct io_ring_ctx *ctx);
@@ -23,7 +23,7 @@ static inline int io_futex_cancel(struct io_ring_ctx *ctx,
return 0;
}
static inline bool io_futex_remove_all(struct io_ring_ctx *ctx,
- struct task_struct *task, bool cancel_all)
+ struct io_uring_task *tctx, bool cancel_all)
{
return false;
}
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b2736e3..73af598 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -51,7 +51,6 @@
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
@@ -70,6 +69,7 @@
#include <linux/io_uring/cmd.h>
#include <linux/audit.h>
#include <linux/security.h>
+#include <linux/jump_label.h>
#include <asm/shmparam.h>
#define CREATE_TRACE_POINTS
@@ -104,9 +104,6 @@
#include "alloc_cache.h"
#include "eventfd.h"
-#define IORING_MAX_ENTRIES 32768
-#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
-
#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -144,11 +141,13 @@ struct io_defer_entry {
#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1)
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
+ struct io_uring_task *tctx,
bool cancel_all);
static void io_queue_sqe(struct io_kiocb *req);
+static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
+
struct kmem_cache *req_cachep;
static struct workqueue_struct *iou_wq __ro_after_init;
@@ -201,12 +200,12 @@ static bool io_match_linked(struct io_kiocb *head)
* As io_match_task() but protected against racing with linked timeouts.
* User must not hold timeout_lock.
*/
-bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
+bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx,
bool cancel_all)
{
bool matched;
- if (task && head->task != task)
+ if (tctx && head->tctx != tctx)
return false;
if (cancel_all)
return true;
@@ -261,15 +260,23 @@ static __cold void io_fallback_req_func(struct work_struct *work)
static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
{
- unsigned hash_buckets = 1U << bits;
- size_t hash_size = hash_buckets * sizeof(table->hbs[0]);
+ unsigned int hash_buckets;
+ int i;
- table->hbs = kmalloc(hash_size, GFP_KERNEL);
- if (!table->hbs)
- return -ENOMEM;
+ do {
+ hash_buckets = 1U << bits;
+ table->hbs = kvmalloc_array(hash_buckets, sizeof(table->hbs[0]),
+ GFP_KERNEL_ACCOUNT);
+ if (table->hbs)
+ break;
+ if (bits == 1)
+ return -ENOMEM;
+ bits--;
+ } while (1);
table->hash_bits = bits;
- init_hash_table(table, hash_buckets);
+ for (i = 0; i < hash_buckets; i++)
+ INIT_HLIST_HEAD(&table->hbs[i].list);
return 0;
}
@@ -294,21 +301,18 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
hash_bits = clamp(hash_bits, 1, 8);
if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
goto err;
- if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
- goto err;
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
0, GFP_KERNEL))
goto err;
ctx->flags = p->flags;
+ ctx->hybrid_poll_time = LLONG_MAX;
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
- ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
- sizeof(struct io_rsrc_node));
- ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
+ ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
sizeof(struct async_poll));
ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_msghdr));
@@ -327,7 +331,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->cq_wait);
init_waitqueue_head(&ctx->poll_wq);
- init_waitqueue_head(&ctx->rsrc_quiesce_wq);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
@@ -335,7 +338,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
- INIT_LIST_HEAD(&ctx->rsrc_ref_list);
init_llist_head(&ctx->work_llist);
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
@@ -347,21 +349,20 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
io_napi_init(ctx);
+ mutex_init(&ctx->resize_lock);
return ctx;
free_ref:
percpu_ref_exit(&ctx->refs);
err:
- io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
io_alloc_cache_free(&ctx->apoll_cache, kfree);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_futex_cache_free(ctx);
- kfree(ctx->cancel_table.hbs);
- kfree(ctx->cancel_table_locked.hbs);
+ kvfree(ctx->cancel_table.hbs);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
@@ -405,11 +406,8 @@ static void io_clean_op(struct io_kiocb *req)
kfree(req->apoll);
req->apoll = NULL;
}
- if (req->flags & REQ_F_INFLIGHT) {
- struct io_uring_task *tctx = req->task->io_uring;
-
- atomic_dec(&tctx->inflight_tracked);
- }
+ if (req->flags & REQ_F_INFLIGHT)
+ atomic_dec(&req->tctx->inflight_tracked);
if (req->flags & REQ_F_CREDS)
put_cred(req->creds);
if (req->flags & REQ_F_ASYNC_DATA) {
@@ -423,7 +421,7 @@ static inline void io_req_track_inflight(struct io_kiocb *req)
{
if (!(req->flags & REQ_F_INFLIGHT)) {
req->flags |= REQ_F_INFLIGHT;
- atomic_inc(&req->task->io_uring->inflight_tracked);
+ atomic_inc(&req->tctx->inflight_tracked);
}
}
@@ -512,7 +510,7 @@ static void io_prep_async_link(struct io_kiocb *req)
static void io_queue_iowq(struct io_kiocb *req)
{
struct io_kiocb *link = io_prep_linked_timeout(req);
- struct io_uring_task *tctx = req->task->io_uring;
+ struct io_uring_task *tctx = req->tctx;
BUG_ON(!tctx);
BUG_ON(!tctx->io_wq);
@@ -527,7 +525,7 @@ static void io_queue_iowq(struct io_kiocb *req)
* procedure rather than attempt to run this request (or create a new
* worker for it).
*/
- if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
+ if (WARN_ON_ONCE(!same_thread_group(tctx->task, current)))
atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
@@ -675,30 +673,19 @@ static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
-/* can be called by any task */
-static void io_put_task_remote(struct task_struct *task)
-{
- struct io_uring_task *tctx = task->io_uring;
-
- percpu_counter_sub(&tctx->inflight, 1);
- if (unlikely(atomic_read(&tctx->in_cancel)))
- wake_up(&tctx->wait);
- put_task_struct(task);
-}
-
-/* used by a task to put its own references */
-static void io_put_task_local(struct task_struct *task)
-{
- task->io_uring->cached_refs++;
-}
-
/* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task)
+static inline void io_put_task(struct io_kiocb *req)
{
- if (likely(task == current))
- io_put_task_local(task);
- else
- io_put_task_remote(task);
+ struct io_uring_task *tctx = req->tctx;
+
+ if (likely(tctx->task == current)) {
+ tctx->cached_refs++;
+ } else {
+ percpu_counter_sub(&tctx->inflight, 1);
+ if (unlikely(atomic_read(&tctx->in_cancel)))
+ wake_up(&tctx->wait);
+ put_task_struct(tctx->task);
+ }
}
void io_task_refs_refill(struct io_uring_task *tctx)
@@ -820,8 +807,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
* the ring.
*/
if (likely(io_get_cqe(ctx, &cqe))) {
- trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
-
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
@@ -830,6 +815,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
WRITE_ONCE(cqe->big_cqe[0], 0);
WRITE_ONCE(cqe->big_cqe[1], 0);
}
+
+ trace_io_uring_complete(ctx, NULL, cqe);
return true;
}
return false;
@@ -946,6 +933,8 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
req->ctx = ctx;
+ req->buf_node = NULL;
+ req->file_node = NULL;
req->link = NULL;
req->async_data = NULL;
/* not necessary, but safer to zero */
@@ -1076,23 +1065,8 @@ struct llist_node *io_handle_tw_list(struct llist_node *node,
return node;
}
-/**
- * io_llist_xchg - swap all entries in a lock-less list
- * @head: the head of lock-less list to delete all entries
- * @new: new entry as the head of the list
- *
- * If list is empty, return NULL, otherwise, return the pointer to the first entry.
- * The order of entries returned is from the newest to the oldest added one.
- */
-static inline struct llist_node *io_llist_xchg(struct llist_head *head,
- struct llist_node *new)
+static __cold void __io_fallback_tw(struct llist_node *node, bool sync)
{
- return xchg(&head->first, new);
-}
-
-static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
-{
- struct llist_node *node = llist_del_all(&tctx->task_list);
struct io_ring_ctx *last_ctx = NULL;
struct io_kiocb *req;
@@ -1118,6 +1092,13 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
}
}
+static void io_fallback_tw(struct io_uring_task *tctx, bool sync)
+{
+ struct llist_node *node = llist_del_all(&tctx->task_list);
+
+ __io_fallback_tw(node, sync);
+}
+
struct llist_node *tctx_task_work_run(struct io_uring_task *tctx,
unsigned int max_entries,
unsigned int *count)
@@ -1228,7 +1209,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req,
static void io_req_normal_work_add(struct io_kiocb *req)
{
- struct io_uring_task *tctx = req->task->io_uring;
+ struct io_uring_task *tctx = req->tctx;
struct io_ring_ctx *ctx = req->ctx;
/* task_work already pending, we're done */
@@ -1247,7 +1228,7 @@ static void io_req_normal_work_add(struct io_kiocb *req)
return;
}
- if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
+ if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method)))
return;
io_fallback_tw(tctx, false);
@@ -1271,16 +1252,9 @@ void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
{
- struct llist_node *node;
+ struct llist_node *node = llist_del_all(&ctx->work_llist);
- node = llist_del_all(&ctx->work_llist);
- while (node) {
- struct io_kiocb *req = container_of(node, struct io_kiocb,
- io_task_work.node);
-
- node = node->next;
- io_req_normal_work_add(req);
- }
+ __io_fallback_tw(node, false);
}
static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
@@ -1311,7 +1285,7 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
* llists are in reverse order, flip it back the right way before
* running the pending items.
*/
- node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL));
+ node = llist_reverse_order(llist_del_all(&ctx->work_llist));
while (node) {
struct llist_node *next = node->next;
struct io_kiocb *req = container_of(node, struct io_kiocb,
@@ -1364,8 +1338,7 @@ static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
{
io_tw_lock(req->ctx, ts);
- /* req->task == current here, checking PF_EXITING is safe */
- if (unlikely(req->task->flags & PF_EXITING))
+ if (unlikely(io_should_terminate_tw()))
io_req_defer_failed(req, -EFAULT);
else if (req->flags & REQ_F_FORCE_ASYNC)
io_queue_iowq(req);
@@ -1423,8 +1396,8 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
io_clean_op(req);
}
io_put_file(req);
- io_put_rsrc_node(ctx, req->rsrc_node);
- io_put_task(req->task);
+ io_req_put_rsrc_nodes(req);
+ io_put_task(req);
node = req->comp_list.next;
io_req_add_to_cache(req, ctx);
@@ -1886,20 +1859,16 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- struct io_fixed_file *slot;
+ struct io_rsrc_node *node;
struct file *file = NULL;
io_ring_submit_lock(ctx, issue_flags);
-
- if (unlikely((unsigned int)fd >= ctx->nr_user_files))
- goto out;
- fd = array_index_nospec(fd, ctx->nr_user_files);
- slot = io_fixed_file_slot(&ctx->file_table, fd);
- if (!req->rsrc_node)
- __io_req_set_rsrc_node(req, ctx);
- req->flags |= io_slot_flags(slot);
- file = io_slot_file(slot);
-out:
+ node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
+ if (node) {
+ io_req_assign_rsrc_node(&req->file_node, node);
+ req->flags |= io_slot_flags(node);
+ file = io_slot_file(node);
+ }
io_ring_submit_unlock(ctx, issue_flags);
return file;
}
@@ -2044,8 +2013,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->flags = (__force io_req_flags_t) sqe_flags;
req->cqe.user_data = READ_ONCE(sqe->user_data);
req->file = NULL;
- req->rsrc_node = NULL;
- req->task = current;
+ req->tctx = current->io_uring;
req->cancel_seq_set = false;
if (unlikely(opcode >= IORING_OP_LAST)) {
@@ -2263,7 +2231,8 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
unsigned mask = ctx->sq_entries - 1;
unsigned head = ctx->cached_sq_head++ & mask;
- if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
+ if (static_branch_unlikely(&io_key_has_sqarray) &&
+ (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) {
head = READ_ONCE(ctx->sq_array[head]);
if (unlikely(head >= ctx->sq_entries)) {
/* drop invalid entries */
@@ -2274,6 +2243,7 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
READ_ONCE(ctx->rings->sq_dropped) + 1);
return false;
}
+ head = array_index_nospec(head, ctx->sq_entries);
}
/*
@@ -2502,9 +2472,10 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct ext_arg {
size_t argsz;
- struct __kernel_timespec __user *ts;
+ struct timespec64 ts;
const sigset_t __user *sig;
ktime_t min_time;
+ bool ts_set;
};
/*
@@ -2542,13 +2513,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
iowq.timeout = KTIME_MAX;
start_time = io_get_time(ctx);
- if (ext_arg->ts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, ext_arg->ts))
- return -EFAULT;
-
- iowq.timeout = timespec64_to_ktime(ts);
+ if (ext_arg->ts_set) {
+ iowq.timeout = timespec64_to_ktime(ext_arg->ts);
if (!(flags & IORING_ENTER_ABS_TIMER))
iowq.timeout = ktime_add(iowq.timeout, start_time);
}
@@ -2672,8 +2638,8 @@ static void io_rings_free(struct io_ring_ctx *ctx)
ctx->sq_sqes = NULL;
}
-static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
- unsigned int cq_entries, size_t *sq_offset)
+unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
+ unsigned int cq_entries, size_t *sq_offset)
{
struct io_rings *rings;
size_t off, sq_array_size;
@@ -2681,7 +2647,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
off = struct_size(rings, cqes, cq_entries);
if (off == SIZE_MAX)
return SIZE_MAX;
- if (ctx->flags & IORING_SETUP_CQE32) {
+ if (flags & IORING_SETUP_CQE32) {
if (check_shl_overflow(off, 1, &off))
return SIZE_MAX;
}
@@ -2692,7 +2658,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
return SIZE_MAX;
#endif
- if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
+ if (flags & IORING_SETUP_NO_SQARRAY) {
*sq_offset = SIZE_MAX;
return off;
}
@@ -2729,15 +2695,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
- /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
- if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
- return;
mutex_lock(&ctx->uring_lock);
- if (ctx->buf_data)
- __io_sqe_buffers_unregister(ctx);
- if (ctx->file_data)
- __io_sqe_files_unregister(ctx);
+ io_sqe_buffers_unregister(ctx);
+ io_sqe_files_unregister(ctx);
io_cqring_overflow_kill(ctx);
io_eventfd_unregister(ctx);
io_alloc_cache_free(&ctx->apoll_cache, kfree);
@@ -2747,34 +2708,31 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
+ io_free_region(ctx, &ctx->param_region);
mutex_unlock(&ctx->uring_lock);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
if (ctx->submitter_task)
put_task_struct(ctx->submitter_task);
- /* there are no registered resources left, nobody uses it */
- if (ctx->rsrc_node)
- io_rsrc_node_destroy(ctx, ctx->rsrc_node);
-
- WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
- io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
if (ctx->mm_account) {
mmdrop(ctx->mm_account);
ctx->mm_account = NULL;
}
io_rings_free(ctx);
+ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+ static_branch_dec(&io_key_has_sqarray);
+
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
io_req_caches_free(ctx);
if (ctx->hash_map)
io_wq_put_hash(ctx->hash_map);
io_napi_free(ctx);
- kfree(ctx->cancel_table.hbs);
- kfree(ctx->cancel_table_locked.hbs);
+ kvfree(ctx->cancel_table.hbs);
xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
}
@@ -3013,7 +2971,7 @@ static int io_uring_release(struct inode *inode, struct file *file)
}
struct io_task_cancel {
- struct task_struct *task;
+ struct io_uring_task *tctx;
bool all;
};
@@ -3022,11 +2980,11 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_task_cancel *cancel = data;
- return io_match_task_safe(req, cancel->task, cancel->all);
+ return io_match_task_safe(req, cancel->tctx, cancel->all);
}
static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
- struct task_struct *task,
+ struct io_uring_task *tctx,
bool cancel_all)
{
struct io_defer_entry *de;
@@ -3034,7 +2992,7 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
spin_lock(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
- if (io_match_task_safe(de->req, task, cancel_all)) {
+ if (io_match_task_safe(de->req, tctx, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
@@ -3077,11 +3035,10 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
}
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
- struct task_struct *task,
+ struct io_uring_task *tctx,
bool cancel_all)
{
- struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
- struct io_uring_task *tctx = task ? task->io_uring : NULL;
+ struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, };
enum io_wq_cancel cret;
bool ret = false;
@@ -3095,9 +3052,9 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if (!ctx->rings)
return false;
- if (!task) {
+ if (!tctx) {
ret |= io_uring_try_cancel_iowq(ctx);
- } else if (tctx && tctx->io_wq) {
+ } else if (tctx->io_wq) {
/*
* Cancels requests of all rings, not only @ctx, but
* it's fine as the task is in exit/exec.
@@ -3120,15 +3077,15 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
ret |= io_run_local_work(ctx, INT_MAX) > 0;
- ret |= io_cancel_defer_files(ctx, task, cancel_all);
+ ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
mutex_lock(&ctx->uring_lock);
- ret |= io_poll_remove_all(ctx, task, cancel_all);
- ret |= io_waitid_remove_all(ctx, task, cancel_all);
- ret |= io_futex_remove_all(ctx, task, cancel_all);
- ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
+ ret |= io_poll_remove_all(ctx, tctx, cancel_all);
+ ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
+ ret |= io_futex_remove_all(ctx, tctx, cancel_all);
+ ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all);
mutex_unlock(&ctx->uring_lock);
- ret |= io_kill_timeouts(ctx, task, cancel_all);
- if (task)
+ ret |= io_kill_timeouts(ctx, tctx, cancel_all);
+ if (tctx)
ret |= io_run_task_work() > 0;
else
ret |= flush_delayed_work(&ctx->fallback_work);
@@ -3181,12 +3138,13 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
if (node->ctx->sq_data)
continue;
loop |= io_uring_try_cancel_requests(node->ctx,
- current, cancel_all);
+ current->io_uring,
+ cancel_all);
}
} else {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
loop |= io_uring_try_cancel_requests(ctx,
- current,
+ current->io_uring,
cancel_all);
}
@@ -3233,22 +3191,44 @@ void __io_uring_cancel(bool cancel_all)
io_uring_cancel_generic(cancel_all, NULL);
}
-static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
+static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
+ const struct io_uring_getevents_arg __user *uarg)
{
- if (flags & IORING_ENTER_EXT_ARG) {
- struct io_uring_getevents_arg arg;
+ unsigned long size = sizeof(struct io_uring_reg_wait);
+ unsigned long offset = (uintptr_t)uarg;
+ unsigned long end;
- if (argsz != sizeof(arg))
- return -EINVAL;
- if (copy_from_user(&arg, argp, sizeof(arg)))
- return -EFAULT;
- }
+ if (unlikely(offset % sizeof(long)))
+ return ERR_PTR(-EFAULT);
+
+ /* also protects from NULL ->cq_wait_arg as the size would be 0 */
+ if (unlikely(check_add_overflow(offset, size, &end) ||
+ end > ctx->cq_wait_size))
+ return ERR_PTR(-EFAULT);
+
+ return ctx->cq_wait_arg + offset;
+}
+
+static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+ const void __user *argp, size_t argsz)
+{
+ struct io_uring_getevents_arg arg;
+
+ if (!(flags & IORING_ENTER_EXT_ARG))
+ return 0;
+ if (flags & IORING_ENTER_EXT_ARG_REG)
+ return -EINVAL;
+ if (argsz != sizeof(arg))
+ return -EINVAL;
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
return 0;
}
-static int io_get_ext_arg(unsigned flags, const void __user *argp,
- struct ext_arg *ext_arg)
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+ const void __user *argp, struct ext_arg *ext_arg)
{
+ const struct io_uring_getevents_arg __user *uarg = argp;
struct io_uring_getevents_arg arg;
/*
@@ -3257,7 +3237,28 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp,
*/
if (!(flags & IORING_ENTER_EXT_ARG)) {
ext_arg->sig = (const sigset_t __user *) argp;
- ext_arg->ts = NULL;
+ return 0;
+ }
+
+ if (flags & IORING_ENTER_EXT_ARG_REG) {
+ struct io_uring_reg_wait *w;
+
+ if (ext_arg->argsz != sizeof(struct io_uring_reg_wait))
+ return -EINVAL;
+ w = io_get_ext_arg_reg(ctx, argp);
+ if (IS_ERR(w))
+ return PTR_ERR(w);
+
+ if (w->flags & ~IORING_REG_WAIT_TS)
+ return -EINVAL;
+ ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC;
+ ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask));
+ ext_arg->argsz = READ_ONCE(w->sigmask_sz);
+ if (w->flags & IORING_REG_WAIT_TS) {
+ ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec);
+ ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec);
+ ext_arg->ts_set = true;
+ }
return 0;
}
@@ -3267,13 +3268,32 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp,
*/
if (ext_arg->argsz != sizeof(arg))
return -EINVAL;
- if (copy_from_user(&arg, argp, sizeof(arg)))
+#ifdef CONFIG_64BIT
+ if (!user_access_begin(uarg, sizeof(*uarg)))
return -EFAULT;
+ unsafe_get_user(arg.sigmask, &uarg->sigmask, uaccess_end);
+ unsafe_get_user(arg.sigmask_sz, &uarg->sigmask_sz, uaccess_end);
+ unsafe_get_user(arg.min_wait_usec, &uarg->min_wait_usec, uaccess_end);
+ unsafe_get_user(arg.ts, &uarg->ts, uaccess_end);
+ user_access_end();
+#else
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+#endif
ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC;
ext_arg->sig = u64_to_user_ptr(arg.sigmask);
ext_arg->argsz = arg.sigmask_sz;
- ext_arg->ts = u64_to_user_ptr(arg.ts);
+ if (arg.ts) {
+ if (get_timespec64(&ext_arg->ts, u64_to_user_ptr(arg.ts)))
+ return -EFAULT;
+ ext_arg->ts_set = true;
+ }
return 0;
+#ifdef CONFIG_64BIT
+uaccess_end:
+ user_access_end();
+ return -EFAULT;
+#endif
}
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
@@ -3287,7 +3307,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
IORING_ENTER_REGISTERED_RING |
- IORING_ENTER_ABS_TIMER)))
+ IORING_ENTER_ABS_TIMER |
+ IORING_ENTER_EXT_ARG_REG)))
return -EINVAL;
/*
@@ -3370,7 +3391,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
*/
mutex_lock(&ctx->uring_lock);
iopoll_locked:
- ret2 = io_validate_ext_arg(flags, argp, argsz);
+ ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
if (likely(!ret2)) {
min_complete = min(min_complete,
ctx->cq_entries);
@@ -3380,7 +3401,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
} else {
struct ext_arg ext_arg = { .argsz = argsz };
- ret2 = io_get_ext_arg(flags, argp, &ext_arg);
+ ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg);
if (likely(!ret2)) {
min_complete = min(min_complete,
ctx->cq_entries);
@@ -3437,7 +3458,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
- size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
+ size = rings_size(ctx->flags, p->sq_entries, p->cq_entries,
+ &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
@@ -3503,14 +3525,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
O_RDWR | O_CLOEXEC, NULL);
}
-static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
- struct io_uring_params __user *params)
+int io_uring_fill_params(unsigned entries, struct io_uring_params *p)
{
- struct io_ring_ctx *ctx;
- struct io_uring_task *tctx;
- struct file *file;
- int ret;
-
if (!entries)
return -EINVAL;
if (entries > IORING_MAX_ENTRIES) {
@@ -3552,6 +3568,42 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
p->cq_entries = 2 * p->sq_entries;
}
+ p->sq_off.head = offsetof(struct io_rings, sq.head);
+ p->sq_off.tail = offsetof(struct io_rings, sq.tail);
+ p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
+ p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
+ p->sq_off.flags = offsetof(struct io_rings, sq_flags);
+ p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
+ p->sq_off.resv1 = 0;
+ if (!(p->flags & IORING_SETUP_NO_MMAP))
+ p->sq_off.user_addr = 0;
+
+ p->cq_off.head = offsetof(struct io_rings, cq.head);
+ p->cq_off.tail = offsetof(struct io_rings, cq.tail);
+ p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
+ p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
+ p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
+ p->cq_off.cqes = offsetof(struct io_rings, cqes);
+ p->cq_off.flags = offsetof(struct io_rings, cq_flags);
+ p->cq_off.resv1 = 0;
+ if (!(p->flags & IORING_SETUP_NO_MMAP))
+ p->cq_off.user_addr = 0;
+
+ return 0;
+}
+
+static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
+ struct io_uring_params __user *params)
+{
+ struct io_ring_ctx *ctx;
+ struct io_uring_task *tctx;
+ struct file *file;
+ int ret;
+
+ ret = io_uring_fill_params(entries, p);
+ if (unlikely(ret))
+ return ret;
+
ctx = io_ring_ctx_alloc(p);
if (!ctx)
return -ENOMEM;
@@ -3559,6 +3611,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->clockid = CLOCK_MONOTONIC;
ctx->clock_offset = 0;
+ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+ static_branch_inc(&io_key_has_sqarray);
+
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
!(ctx->flags & IORING_SETUP_IOPOLL) &&
!(ctx->flags & IORING_SETUP_SQPOLL))
@@ -3609,6 +3664,11 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->notify_method = TWA_SIGNAL;
}
+ /* HYBRID_IOPOLL only valid with IOPOLL */
+ if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) ==
+ IORING_SETUP_HYBRID_IOPOLL)
+ goto err;
+
/*
* For DEFER_TASKRUN we require the completion task to be the same as the
* submission task. This implies that there is only one submitter, so enforce
@@ -3632,37 +3692,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
if (ret)
goto err;
+ if (!(p->flags & IORING_SETUP_NO_SQARRAY))
+ p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
+
ret = io_sq_offload_create(ctx, p);
if (ret)
goto err;
- ret = io_rsrc_init(ctx);
- if (ret)
- goto err;
-
- p->sq_off.head = offsetof(struct io_rings, sq.head);
- p->sq_off.tail = offsetof(struct io_rings, sq.tail);
- p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
- p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
- p->sq_off.flags = offsetof(struct io_rings, sq_flags);
- p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
- if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
- p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
- p->sq_off.resv1 = 0;
- if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- p->sq_off.user_addr = 0;
-
- p->cq_off.head = offsetof(struct io_rings, cq.head);
- p->cq_off.tail = offsetof(struct io_rings, cq.tail);
- p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
- p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
- p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
- p->cq_off.cqes = offsetof(struct io_rings, cqes);
- p->cq_off.flags = offsetof(struct io_rings, cq_flags);
- p->cq_off.resv1 = 0;
- if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- p->cq_off.user_addr = 0;
-
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
@@ -3738,7 +3774,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
- IORING_SETUP_NO_SQARRAY))
+ IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -3776,6 +3812,8 @@ static int __init io_uring_init(void)
struct kmem_cache_args kmem_args = {
.useroffset = offsetof(struct io_kiocb, cmd.data),
.usersize = sizeof_field(struct io_kiocb, cmd.data),
+ .freeptr_offset = offsetof(struct io_kiocb, work),
+ .use_freeptr_offset = true,
};
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 70b6675..4070d4c 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -65,6 +65,12 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
+#define IORING_MAX_ENTRIES 32768
+#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
+
+unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
+ unsigned int cq_entries, size_t *sq_offset);
+int io_uring_fill_params(unsigned entries, struct io_uring_params *p);
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
@@ -109,7 +115,7 @@ void io_queue_next(struct io_kiocb *req);
void io_task_refs_refill(struct io_uring_task *tctx);
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
-bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
+bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx,
bool cancel_all);
void io_activate_pollwq(struct io_ring_ctx *ctx);
@@ -130,7 +136,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
* Not from an SQE, as those cannot be submitted, but via
* updating tagged resources.
*/
- if (ctx->submitter_task->flags & PF_EXITING)
+ if (percpu_ref_is_dying(&ctx->refs))
lockdep_assert(current_work());
else
lockdep_assert(current == ctx->submitter_task);
@@ -189,16 +195,15 @@ static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
if (unlikely(!io_get_cqe(ctx, &cqe)))
return false;
- if (trace_io_uring_complete_enabled())
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
- req->cqe.res, req->cqe.flags,
- req->big_cqe.extra1, req->big_cqe.extra2);
memcpy(cqe, &req->cqe, sizeof(*cqe));
if (ctx->flags & IORING_SETUP_CQE32) {
memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}
+
+ if (trace_io_uring_complete_enabled())
+ trace_io_uring_complete(req->ctx, req, cqe);
return true;
}
@@ -421,6 +426,19 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
ctx->submitter_task == current);
}
+/*
+ * Terminate the request if either of these conditions are true:
+ *
+ * 1) It's being executed by the original task, but that task is marked
+ * with PF_EXITING as it's exiting.
+ * 2) PF_KTHREAD is set, in which case the invoker of the task_work is
+ * our fallback task_work.
+ */
+static inline bool io_should_terminate_tw(void)
+{
+ return current->flags & (PF_KTHREAD | PF_EXITING);
+}
+
static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
{
io_req_set_res(req, res, 0);
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index a0f32a2..6e6ee79 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -12,6 +12,7 @@
#include "memmap.h"
#include "kbuf.h"
+#include "rsrc.h"
static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
size_t size, gfp_t gfp)
@@ -140,6 +141,8 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
nr_pages = end - start;
if (WARN_ON_ONCE(!nr_pages))
return ERR_PTR(-EINVAL);
+ if (WARN_ON_ONCE(nr_pages > INT_MAX))
+ return ERR_PTR(-EOVERFLOW);
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
@@ -192,6 +195,74 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
return ERR_PTR(-ENOMEM);
}
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
+{
+ if (mr->pages) {
+ unpin_user_pages(mr->pages, mr->nr_pages);
+ kvfree(mr->pages);
+ }
+ if (mr->vmap_ptr)
+ vunmap(mr->vmap_ptr);
+ if (mr->nr_pages && ctx->user)
+ __io_unaccount_mem(ctx->user, mr->nr_pages);
+
+ memset(mr, 0, sizeof(*mr));
+}
+
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+ struct io_uring_region_desc *reg)
+{
+ int pages_accounted = 0;
+ struct page **pages;
+ int nr_pages, ret;
+ void *vptr;
+ u64 end;
+
+ if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
+ return -EFAULT;
+ if (memchr_inv(®->__resv, 0, sizeof(reg->__resv)))
+ return -EINVAL;
+ if (reg->flags != IORING_MEM_REGION_TYPE_USER)
+ return -EINVAL;
+ if (!reg->user_addr)
+ return -EFAULT;
+ if (!reg->size || reg->mmap_offset || reg->id)
+ return -EINVAL;
+ if ((reg->size >> PAGE_SHIFT) > INT_MAX)
+ return E2BIG;
+ if ((reg->user_addr | reg->size) & ~PAGE_MASK)
+ return -EINVAL;
+ if (check_add_overflow(reg->user_addr, reg->size, &end))
+ return -EOVERFLOW;
+
+ pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ if (ctx->user) {
+ ret = __io_account_mem(ctx->user, nr_pages);
+ if (ret)
+ goto out_free;
+ pages_accounted = nr_pages;
+ }
+
+ vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!vptr) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ mr->pages = pages;
+ mr->vmap_ptr = vptr;
+ mr->nr_pages = nr_pages;
+ return 0;
+out_free:
+ if (pages_accounted)
+ __io_unaccount_mem(ctx->user, pages_accounted);
+ io_pages_free(&pages, nr_pages);
+ return ret;
+}
+
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
size_t sz)
{
@@ -204,11 +275,15 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
+ if (!ctx->rings)
+ return ERR_PTR(-EFAULT);
return ctx->rings;
case IORING_OFF_SQES:
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
+ if (!ctx->sq_sqes)
+ return ERR_PTR(-EFAULT);
return ctx->sq_sqes;
case IORING_OFF_PBUF_RING: {
struct io_buffer_list *bl;
@@ -247,6 +322,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
unsigned int npages;
void *ptr;
+ guard(mutex)(&ctx->resize_lock);
+
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
@@ -270,6 +347,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
+ struct io_ring_ctx *ctx = filp->private_data;
void *ptr;
/*
@@ -280,6 +358,8 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr)
return -EINVAL;
+ guard(mutex)(&ctx->resize_lock);
+
ptr = io_uring_validate_mmap_request(filp, pgoff, len);
if (IS_ERR(ptr))
return -ENOMEM;
@@ -325,8 +405,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
+ struct io_ring_ctx *ctx = file->private_data;
void *ptr;
+ guard(mutex)(&ctx->resize_lock);
+
ptr = io_uring_validate_mmap_request(file, pgoff, len);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index 5cec5b7..f361a63 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -22,4 +22,18 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long flags);
int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+ struct io_uring_region_desc *reg);
+
+static inline void *io_region_get_ptr(struct io_mapped_region *mr)
+{
+ return mr->vmap_ptr;
+}
+
+static inline bool io_region_is_set(struct io_mapped_region *mr)
+{
+ return !!mr->nr_pages;
+}
+
#endif
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 7fd9bad..333c220 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -89,8 +89,8 @@ static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts)
static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
int res, u32 cflags, u64 user_data)
{
- req->task = READ_ONCE(ctx->submitter_task);
- if (!req->task) {
+ req->tctx = READ_ONCE(ctx->submitter_task->io_uring);
+ if (!req->tctx) {
kmem_cache_free(req_cachep, req);
return -EOWNERDEAD;
}
@@ -116,14 +116,13 @@ static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx)
return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
}
-static int io_msg_data_remote(struct io_kiocb *req)
+static int io_msg_data_remote(struct io_ring_ctx *target_ctx,
+ struct io_msg *msg)
{
- struct io_ring_ctx *target_ctx = req->file->private_data;
- struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
struct io_kiocb *target;
u32 flags = 0;
- target = io_msg_get_kiocb(req->ctx);
+ target = io_msg_get_kiocb(target_ctx);
if (unlikely(!target))
return -ENOMEM;
@@ -134,10 +133,9 @@ static int io_msg_data_remote(struct io_kiocb *req)
msg->user_data);
}
-static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_msg_ring_data(struct io_ring_ctx *target_ctx,
+ struct io_msg *msg, unsigned int issue_flags)
{
- struct io_ring_ctx *target_ctx = req->file->private_data;
- struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
u32 flags = 0;
int ret;
@@ -149,7 +147,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
return -EBADFD;
if (io_msg_need_remote(target_ctx))
- return io_msg_data_remote(req);
+ return io_msg_data_remote(target_ctx, msg);
if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
flags = msg->cqe_flags;
@@ -166,22 +164,32 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
return ret;
}
-static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
+static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *target_ctx = req->file->private_data;
+ struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+
+ return __io_msg_ring_data(target_ctx, msg, issue_flags);
+}
+
+static int io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
struct io_ring_ctx *ctx = req->ctx;
- struct file *file = NULL;
- int idx = msg->src_fd;
+ struct io_rsrc_node *node;
+ int ret = -EBADF;
io_ring_submit_lock(ctx, issue_flags);
- if (likely(idx < ctx->nr_user_files)) {
- idx = array_index_nospec(idx, ctx->nr_user_files);
- file = io_file_from_index(&ctx->file_table, idx);
- if (file)
- get_file(file);
+ node = io_rsrc_node_lookup(&ctx->file_table.data, msg->src_fd);
+ if (node) {
+ msg->src_file = io_slot_file(node);
+ if (msg->src_file)
+ get_file(msg->src_file);
+ req->flags |= REQ_F_NEED_CLEANUP;
+ ret = 0;
}
io_ring_submit_unlock(ctx, issue_flags);
- return file;
+ return ret;
}
static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags)
@@ -250,7 +258,6 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
struct io_ring_ctx *target_ctx = req->file->private_data;
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
struct io_ring_ctx *ctx = req->ctx;
- struct file *src_file = msg->src_file;
if (msg->len)
return -EINVAL;
@@ -258,12 +265,10 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
return -EINVAL;
if (target_ctx->flags & IORING_SETUP_R_DISABLED)
return -EBADFD;
- if (!src_file) {
- src_file = io_msg_grab_file(req, issue_flags);
- if (!src_file)
- return -EBADF;
- msg->src_file = src_file;
- req->flags |= REQ_F_NEED_CLEANUP;
+ if (!msg->src_file) {
+ int ret = io_msg_grab_file(req, issue_flags);
+ if (unlikely(ret))
+ return ret;
}
if (io_msg_need_remote(target_ctx))
@@ -271,10 +276,8 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
return io_msg_install_complete(req, issue_flags);
}
-int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_msg_ring_prep(struct io_msg *msg, const struct io_uring_sqe *sqe)
{
- struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
-
if (unlikely(sqe->buf_index || sqe->personality))
return -EINVAL;
@@ -291,6 +294,11 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
+int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ return __io_msg_ring_prep(io_kiocb_to_cmd(req, struct io_msg), sqe);
+}
+
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
@@ -322,6 +330,31 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK;
}
+int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
+{
+ struct io_msg io_msg = { };
+ int ret;
+
+ ret = __io_msg_ring_prep(&io_msg, sqe);
+ if (unlikely(ret))
+ return ret;
+
+ /*
+ * Only data sending supported, not IORING_MSG_SEND_FD as that one
+ * doesn't make sense without a source ring to send files from.
+ */
+ if (io_msg.cmd != IORING_MSG_DATA)
+ return -EINVAL;
+
+ CLASS(fd, f)(sqe->fd);
+ if (fd_empty(f))
+ return -EBADF;
+ if (!io_is_uring_fops(fd_file(f)))
+ return -EBADFD;
+ return __io_msg_ring_data(fd_file(f)->private_data,
+ &io_msg, IO_URING_F_UNLOCKED);
+}
+
void io_msg_cache_free(const void *entry)
{
struct io_kiocb *req = (struct io_kiocb *) entry;
diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h
index 3030f39..38e7f8f 100644
--- a/io_uring/msg_ring.h
+++ b/io_uring/msg_ring.h
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+int io_uring_sync_msg_ring(struct io_uring_sqe *sqe);
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
void io_msg_ring_cleanup(struct io_kiocb *req);
diff --git a/io_uring/napi.c b/io_uring/napi.c
index d0cf694..b1ade3f 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -38,67 +38,88 @@ static inline ktime_t net_to_ktime(unsigned long t)
return ns_to_ktime(t << 10);
}
-void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
+int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
{
struct hlist_head *hash_list;
- unsigned int napi_id;
- struct sock *sk;
struct io_napi_entry *e;
- sk = sock->sk;
- if (!sk)
- return;
-
- napi_id = READ_ONCE(sk->sk_napi_id);
-
/* Non-NAPI IDs can be rejected. */
if (napi_id < MIN_NAPI_ID)
- return;
+ return -EINVAL;
hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
- rcu_read_lock();
- e = io_napi_hash_find(hash_list, napi_id);
- if (e) {
- e->timeout = jiffies + NAPI_TIMEOUT;
- rcu_read_unlock();
- return;
+ scoped_guard(rcu) {
+ e = io_napi_hash_find(hash_list, napi_id);
+ if (e) {
+ WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT);
+ return -EEXIST;
+ }
}
- rcu_read_unlock();
e = kmalloc(sizeof(*e), GFP_NOWAIT);
if (!e)
- return;
+ return -ENOMEM;
e->napi_id = napi_id;
e->timeout = jiffies + NAPI_TIMEOUT;
+ /*
+ * guard(spinlock) is not used to manually unlock it before calling
+ * kfree()
+ */
spin_lock(&ctx->napi_lock);
if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
spin_unlock(&ctx->napi_lock);
kfree(e);
- return;
+ return -EEXIST;
}
hlist_add_tail_rcu(&e->node, hash_list);
- list_add_tail(&e->list, &ctx->napi_list);
+ list_add_tail_rcu(&e->list, &ctx->napi_list);
spin_unlock(&ctx->napi_lock);
+ return 0;
+}
+
+static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+{
+ struct hlist_head *hash_list;
+ struct io_napi_entry *e;
+
+ /* Non-NAPI IDs can be rejected. */
+ if (napi_id < MIN_NAPI_ID)
+ return -EINVAL;
+
+ hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
+ guard(spinlock)(&ctx->napi_lock);
+ e = io_napi_hash_find(hash_list, napi_id);
+ if (!e)
+ return -ENOENT;
+
+ list_del_rcu(&e->list);
+ hash_del_rcu(&e->node);
+ kfree_rcu(e, rcu);
+ return 0;
}
static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
struct io_napi_entry *e;
- unsigned int i;
- spin_lock(&ctx->napi_lock);
- hash_for_each(ctx->napi_ht, i, e, node) {
- if (time_after(jiffies, e->timeout)) {
- list_del(&e->list);
+ guard(spinlock)(&ctx->napi_lock);
+ /*
+ * list_for_each_entry_safe() is not required as long as:
+ * 1. list_del_rcu() does not reset the deleted node next pointer
+ * 2. kfree_rcu() delays the memory freeing until the next quiescent
+ * state
+ */
+ list_for_each_entry(e, &ctx->napi_list, list) {
+ if (time_after(jiffies, READ_ONCE(e->timeout))) {
+ list_del_rcu(&e->list);
hash_del_rcu(&e->node);
kfree_rcu(e, rcu);
}
}
- spin_unlock(&ctx->napi_lock);
}
static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
@@ -136,45 +157,73 @@ static bool io_napi_busy_loop_should_end(void *data,
return false;
}
-static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
- void *loop_end_arg)
+/*
+ * never report stale entries
+ */
+static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg)
{
struct io_napi_entry *e;
- bool (*loop_end)(void *, unsigned long) = NULL;
- bool is_stale = false;
- if (loop_end_arg)
- loop_end = io_napi_busy_loop_should_end;
+ list_for_each_entry_rcu(e, &ctx->napi_list, list)
+ napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
+ ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
+ return false;
+}
+
+static bool
+dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg)
+{
+ struct io_napi_entry *e;
+ bool is_stale = false;
list_for_each_entry_rcu(e, &ctx->napi_list, list) {
napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
- if (time_after(jiffies, e->timeout))
+ if (time_after(jiffies, READ_ONCE(e->timeout)))
is_stale = true;
}
return is_stale;
}
+static inline bool
+__io_napi_do_busy_loop(struct io_ring_ctx *ctx,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg)
+{
+ if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
+ return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+ return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+}
+
static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
unsigned long start_time = busy_loop_current_time();
+ bool (*loop_end)(void *, unsigned long) = NULL;
void *loop_end_arg = NULL;
bool is_stale = false;
/* Singular lists use a different napi loop end check function and are
* only executed once.
*/
- if (list_is_singular(&ctx->napi_list))
+ if (list_is_singular(&ctx->napi_list)) {
+ loop_end = io_napi_busy_loop_should_end;
loop_end_arg = iowq;
+ }
- rcu_read_lock();
- do {
- is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
- } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ do {
+ is_stale = __io_napi_do_busy_loop(ctx, loop_end,
+ loop_end_arg);
+ } while (!io_napi_busy_loop_should_end(iowq, start_time) &&
+ !loop_end_arg);
+ }
io_napi_remove_stale(ctx, is_stale);
}
@@ -193,6 +242,7 @@ void io_napi_init(struct io_ring_ctx *ctx)
spin_lock_init(&ctx->napi_lock);
ctx->napi_prefer_busy_poll = false;
ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
+ ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
}
/*
@@ -204,14 +254,31 @@ void io_napi_init(struct io_ring_ctx *ctx)
void io_napi_free(struct io_ring_ctx *ctx)
{
struct io_napi_entry *e;
- unsigned int i;
- spin_lock(&ctx->napi_lock);
- hash_for_each(ctx->napi_ht, i, e, node) {
+ guard(spinlock)(&ctx->napi_lock);
+ list_for_each_entry(e, &ctx->napi_list, list) {
hash_del_rcu(&e->node);
kfree_rcu(e, rcu);
}
- spin_unlock(&ctx->napi_lock);
+ INIT_LIST_HEAD_RCU(&ctx->napi_list);
+}
+
+static int io_napi_register_napi(struct io_ring_ctx *ctx,
+ struct io_uring_napi *napi)
+{
+ switch (napi->op_param) {
+ case IO_URING_NAPI_TRACKING_DYNAMIC:
+ case IO_URING_NAPI_TRACKING_STATIC:
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* clean the napi list for new settings */
+ io_napi_free(ctx);
+ WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
+ WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
+ WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
+ return 0;
}
/*
@@ -225,7 +292,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
const struct io_uring_napi curr = {
.busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt),
- .prefer_busy_poll = ctx->napi_prefer_busy_poll
+ .prefer_busy_poll = ctx->napi_prefer_busy_poll,
+ .op_param = ctx->napi_track_mode
};
struct io_uring_napi napi;
@@ -233,16 +301,26 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
return -EINVAL;
if (copy_from_user(&napi, arg, sizeof(napi)))
return -EFAULT;
- if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
+ if (napi.pad[0] || napi.pad[1] || napi.resv)
return -EINVAL;
if (copy_to_user(arg, &curr, sizeof(curr)))
return -EFAULT;
- WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
- WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
- WRITE_ONCE(ctx->napi_enabled, true);
- return 0;
+ switch (napi.opcode) {
+ case IO_URING_NAPI_REGISTER_OP:
+ return io_napi_register_napi(ctx, &napi);
+ case IO_URING_NAPI_STATIC_ADD_ID:
+ if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+ return -EINVAL;
+ return __io_napi_add_id(ctx, napi.op_param);
+ case IO_URING_NAPI_STATIC_DEL_ID:
+ if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+ return -EINVAL;
+ return __io_napi_del_id(ctx, napi.op_param);
+ default:
+ return -EINVAL;
+ }
}
/*
@@ -265,7 +343,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
- WRITE_ONCE(ctx->napi_enabled, false);
+ WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
return 0;
}
@@ -307,9 +385,9 @@ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
if (list_empty_careful(&ctx->napi_list))
return 0;
- rcu_read_lock();
- is_stale = __io_napi_do_busy_loop(ctx, NULL);
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL);
+ }
io_napi_remove_stale(ctx, is_stale);
return 1;
diff --git a/io_uring/napi.h b/io_uring/napi.h
index fd275ef..fa742f4 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -15,7 +15,7 @@ void io_napi_free(struct io_ring_ctx *ctx);
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
-void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);
+int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id);
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
@@ -44,12 +44,12 @@ static inline void io_napi_add(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
struct socket *sock;
- if (!READ_ONCE(ctx->napi_enabled))
+ if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC)
return;
sock = sock_from_file(req->file);
- if (sock)
- __io_napi_add(ctx, sock);
+ if (sock && sock->sk)
+ __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id));
}
#else
diff --git a/io_uring/net.c b/io_uring/net.c
index 1850765..df1f7dc 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -74,9 +74,8 @@ struct io_sr_msg {
unsigned nr_multishot_loops;
u16 flags;
/* initialised and used only by !msg send variants */
- u16 addr_len;
u16 buf_group;
- void __user *addr;
+ u16 buf_index;
void __user *msg_control;
/* used only for send zerocopy */
struct io_kiocb *notif;
@@ -263,6 +262,7 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
struct user_msghdr *msg, int ddir)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct user_msghdr __user *umsg = sr->umsg;
struct iovec *iov;
int ret, nr_segs;
@@ -274,16 +274,16 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
nr_segs = 1;
}
- if (!user_access_begin(sr->umsg, sizeof(*sr->umsg)))
+ if (!user_access_begin(umsg, sizeof(*umsg)))
return -EFAULT;
ret = -EFAULT;
- unsafe_get_user(msg->msg_name, &sr->umsg->msg_name, ua_end);
- unsafe_get_user(msg->msg_namelen, &sr->umsg->msg_namelen, ua_end);
- unsafe_get_user(msg->msg_iov, &sr->umsg->msg_iov, ua_end);
- unsafe_get_user(msg->msg_iovlen, &sr->umsg->msg_iovlen, ua_end);
- unsafe_get_user(msg->msg_control, &sr->umsg->msg_control, ua_end);
- unsafe_get_user(msg->msg_controllen, &sr->umsg->msg_controllen, ua_end);
+ unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end);
+ unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end);
+ unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end);
+ unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end);
+ unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end);
+ unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end);
msg->msg_flags = 0;
if (req->flags & REQ_F_BUFFER_SELECT) {
@@ -356,24 +356,33 @@ void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
io_netmsg_iovec_free(io);
}
-static int io_send_setup(struct io_kiocb *req)
+static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg = req->async_data;
+ void __user *addr;
+ u16 addr_len;
int ret;
+ sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
+
+ if (READ_ONCE(sqe->__pad3[0]))
+ return -EINVAL;
+
kmsg->msg.msg_name = NULL;
kmsg->msg.msg_namelen = 0;
kmsg->msg.msg_control = NULL;
kmsg->msg.msg_controllen = 0;
kmsg->msg.msg_ubuf = NULL;
- if (sr->addr) {
- ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr);
+ addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ addr_len = READ_ONCE(sqe->addr_len);
+ if (addr) {
+ ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr);
if (unlikely(ret < 0))
return ret;
kmsg->msg.msg_name = &kmsg->addr;
- kmsg->msg.msg_namelen = sr->addr_len;
+ kmsg->msg.msg_namelen = addr_len;
}
if (!io_do_buffer_select(req)) {
ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
@@ -384,16 +393,14 @@ static int io_send_setup(struct io_kiocb *req)
return 0;
}
-static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg)
+static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_async_msghdr *kmsg;
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct io_async_msghdr *kmsg = req->async_data;
int ret;
- kmsg = io_msg_alloc_async(req);
- if (unlikely(!kmsg))
- return -ENOMEM;
- if (!is_msg)
- return io_send_setup(req);
+ sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+
ret = io_sendmsg_copy_hdr(req, kmsg);
if (!ret)
req->flags |= REQ_F_NEED_CLEANUP;
@@ -408,16 +415,11 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->done_io = 0;
- if (req->opcode == IORING_OP_SEND) {
- if (READ_ONCE(sqe->__pad3[0]))
+ if (req->opcode != IORING_OP_SEND) {
+ if (sqe->addr2 || sqe->file_index)
return -EINVAL;
- sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- sr->addr_len = READ_ONCE(sqe->addr_len);
- } else if (sqe->addr2 || sqe->file_index) {
- return -EINVAL;
}
- sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~SENDMSG_FLAGS)
@@ -439,7 +441,11 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
- return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG);
+ if (unlikely(!io_msg_alloc_async(req)))
+ return -ENOMEM;
+ if (req->opcode != IORING_OP_SENDMSG)
+ return io_send_setup(req, sqe);
+ return io_sendmsg_setup(req, sqe);
}
static void io_req_msg_cleanup(struct io_kiocb *req,
@@ -1254,31 +1260,16 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
}
- if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
- unsigned idx = READ_ONCE(sqe->buf_index);
-
- if (unlikely(idx >= ctx->nr_user_bufs))
- return -EFAULT;
- idx = array_index_nospec(idx, ctx->nr_user_bufs);
- req->imu = READ_ONCE(ctx->user_bufs[idx]);
- io_req_set_rsrc_node(notif, ctx, 0);
- }
-
- if (req->opcode == IORING_OP_SEND_ZC) {
- if (READ_ONCE(sqe->__pad3[0]))
- return -EINVAL;
- zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
- zc->addr_len = READ_ONCE(sqe->addr_len);
- } else {
+ if (req->opcode != IORING_OP_SEND_ZC) {
if (unlikely(sqe->addr2 || sqe->file_index))
return -EINVAL;
if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
return -EINVAL;
}
- zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
zc->len = READ_ONCE(sqe->len);
zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY;
+ zc->buf_index = READ_ONCE(sqe->buf_index);
if (zc->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -1286,7 +1277,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->ctx->compat)
zc->msg_flags |= MSG_CMSG_COMPAT;
#endif
- return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC);
+ if (unlikely(!io_msg_alloc_async(req)))
+ return -ENOMEM;
+ if (req->opcode != IORING_OP_SENDMSG_ZC)
+ return io_send_setup(req, sqe);
+ return io_sendmsg_setup(req, sqe);
}
static int io_sg_from_iter_iovec(struct sk_buff *skb,
@@ -1339,14 +1334,31 @@ static int io_sg_from_iter(struct sk_buff *skb,
return ret;
}
-static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg)
+static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct io_async_msghdr *kmsg = req->async_data;
int ret;
if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
- ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu,
- (u64)(uintptr_t)sr->buf, sr->len);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_rsrc_node *node;
+
+ ret = -EFAULT;
+ io_ring_submit_lock(ctx, issue_flags);
+ node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index);
+ if (node) {
+ io_req_assign_buf_node(sr->notif, node);
+ ret = 0;
+ }
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ if (unlikely(ret))
+ return ret;
+
+ ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter,
+ node->buf, (u64)(uintptr_t)sr->buf,
+ sr->len);
if (unlikely(ret))
return ret;
kmsg->msg.sg_from_iter = io_sg_from_iter;
@@ -1382,7 +1394,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
if (!zc->done_io) {
- ret = io_send_zc_import(req, kmsg);
+ ret = io_send_zc_import(req, issue_flags);
if (unlikely(ret))
return ret;
}
diff --git a/io_uring/nop.c b/io_uring/nop.c
index a5bcf3d..6d470d4 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -8,35 +8,72 @@
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
+#include "rsrc.h"
#include "nop.h"
struct io_nop {
/* NOTE: kiocb has the file as the first member, so don't do it here */
struct file *file;
int result;
+ int fd;
+ int buffer;
+ unsigned int flags;
};
+#define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \
+ IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE)
+
int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- unsigned int flags;
struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop);
- flags = READ_ONCE(sqe->nop_flags);
- if (flags & ~IORING_NOP_INJECT_RESULT)
+ nop->flags = READ_ONCE(sqe->nop_flags);
+ if (nop->flags & ~NOP_FLAGS)
return -EINVAL;
- if (flags & IORING_NOP_INJECT_RESULT)
+ if (nop->flags & IORING_NOP_INJECT_RESULT)
nop->result = READ_ONCE(sqe->len);
else
nop->result = 0;
+ if (nop->flags & IORING_NOP_FIXED_FILE)
+ nop->fd = READ_ONCE(sqe->fd);
+ if (nop->flags & IORING_NOP_FIXED_BUFFER)
+ nop->buffer = READ_ONCE(sqe->buf_index);
return 0;
}
int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop);
+ int ret = nop->result;
- if (nop->result < 0)
+ if (nop->flags & IORING_NOP_FILE) {
+ if (nop->flags & IORING_NOP_FIXED_FILE) {
+ req->file = io_file_get_fixed(req, nop->fd, issue_flags);
+ req->flags |= REQ_F_FIXED_FILE;
+ } else {
+ req->file = io_file_get_normal(req, nop->fd);
+ }
+ if (!req->file) {
+ ret = -EBADF;
+ goto done;
+ }
+ }
+ if (nop->flags & IORING_NOP_FIXED_BUFFER) {
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_rsrc_node *node;
+
+ ret = -EFAULT;
+ io_ring_submit_lock(ctx, issue_flags);
+ node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer);
+ if (node) {
+ io_req_assign_buf_node(req, node);
+ ret = 0;
+ }
+ io_ring_submit_unlock(ctx, issue_flags);
+ }
+done:
+ if (ret < 0)
req_set_fail(req);
io_req_set_res(req, nop->result, 0);
return IOU_OK;
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 28859ae..ee3a335 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -89,7 +89,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg)
/* make sure all noifications can be finished in the same task_work */
if (unlikely(notif->ctx != prev_notif->ctx ||
- notif->task != prev_notif->task))
+ notif->tctx != prev_notif->tctx))
return -EEXIST;
nd->head = prev_nd->head;
@@ -115,9 +115,10 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
notif->opcode = IORING_OP_NOP;
notif->flags = 0;
notif->file = NULL;
- notif->task = current;
+ notif->tctx = current->io_uring;
io_get_task_refs(1);
- notif->rsrc_node = NULL;
+ notif->file_node = NULL;
+ notif->buf_node = NULL;
nd = io_notif_to_data(notif);
nd->zc_report = false;
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index a2be3bb..3de75ec 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -641,6 +641,7 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_SPLICE] = {
.name = "SPLICE",
+ .cleanup = io_splice_cleanup,
},
[IORING_OP_PROVIDE_BUFFERS] = {
.name = "PROVIDE_BUFFERS",
@@ -650,6 +651,7 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_TEE] = {
.name = "TEE",
+ .cleanup = io_splice_cleanup,
},
[IORING_OP_SHUTDOWN] = {
.name = "SHUTDOWN",
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 1f63b60..bced9ed 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -122,53 +122,12 @@ static void io_poll_req_insert(struct io_kiocb *req)
{
struct io_hash_table *table = &req->ctx->cancel_table;
u32 index = hash_long(req->cqe.user_data, table->hash_bits);
- struct io_hash_bucket *hb = &table->hbs[index];
-
- spin_lock(&hb->lock);
- hlist_add_head(&req->hash_node, &hb->list);
- spin_unlock(&hb->lock);
-}
-
-static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx)
-{
- struct io_hash_table *table = &req->ctx->cancel_table;
- u32 index = hash_long(req->cqe.user_data, table->hash_bits);
- spinlock_t *lock = &table->hbs[index].lock;
-
- spin_lock(lock);
- hash_del(&req->hash_node);
- spin_unlock(lock);
-}
-
-static void io_poll_req_insert_locked(struct io_kiocb *req)
-{
- struct io_hash_table *table = &req->ctx->cancel_table_locked;
- u32 index = hash_long(req->cqe.user_data, table->hash_bits);
lockdep_assert_held(&req->ctx->uring_lock);
hlist_add_head(&req->hash_node, &table->hbs[index].list);
}
-static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts)
-{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (req->flags & REQ_F_HASH_LOCKED) {
- /*
- * ->cancel_table_locked is protected by ->uring_lock in
- * contrast to per bucket spinlocks. Likely, tctx_task_work()
- * already grabbed the mutex for us, but there is a chance it
- * failed.
- */
- io_tw_lock(ctx, ts);
- hash_del(&req->hash_node);
- req->flags &= ~REQ_F_HASH_LOCKED;
- } else {
- io_poll_req_delete(req, ctx);
- }
-}
-
static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
{
poll->head = NULL;
@@ -265,8 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
{
int v;
- /* req->task == current here, checking PF_EXITING is safe */
- if (unlikely(req->task->flags & PF_EXITING))
+ if (unlikely(io_should_terminate_tw()))
return -ECANCELED;
do {
@@ -363,7 +321,8 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts)
return;
}
io_poll_remove_entries(req);
- io_poll_tw_hash_eject(req, ts);
+ /* task_work always has ->uring_lock held */
+ hash_del(&req->hash_node);
if (req->opcode == IORING_OP_POLL_ADD) {
if (ret == IOU_POLL_DONE) {
@@ -563,12 +522,13 @@ static bool io_poll_can_finish_inline(struct io_kiocb *req,
return pt->owning || io_poll_get_ownership(req);
}
-static void io_poll_add_hash(struct io_kiocb *req)
+static void io_poll_add_hash(struct io_kiocb *req, unsigned int issue_flags)
{
- if (req->flags & REQ_F_HASH_LOCKED)
- io_poll_req_insert_locked(req);
- else
- io_poll_req_insert(req);
+ struct io_ring_ctx *ctx = req->ctx;
+
+ io_ring_submit_lock(ctx, issue_flags);
+ io_poll_req_insert(req);
+ io_ring_submit_unlock(ctx, issue_flags);
}
/*
@@ -605,11 +565,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
ipt->owning = issue_flags & IO_URING_F_UNLOCKED;
atomic_set(&req->poll_refs, (int)ipt->owning);
- /* io-wq doesn't hold uring_lock */
- if (issue_flags & IO_URING_F_UNLOCKED)
- req->flags &= ~REQ_F_HASH_LOCKED;
-
-
/*
* Exclusive waits may only wake a limited amount of entries
* rather than all of them, this may interfere with lazy
@@ -638,7 +593,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
if (mask &&
((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) {
if (!io_poll_can_finish_inline(req, ipt)) {
- io_poll_add_hash(req);
+ io_poll_add_hash(req, issue_flags);
return 0;
}
io_poll_remove_entries(req);
@@ -647,7 +602,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
return 1;
}
- io_poll_add_hash(req);
+ io_poll_add_hash(req, issue_flags);
if (mask && (poll->events & EPOLLET) &&
io_poll_can_finish_inline(req, ipt)) {
@@ -720,12 +675,6 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
__poll_t mask = POLLPRI | POLLERR | EPOLLET;
int ret;
- /*
- * apoll requests already grab the mutex to complete in the tw handler,
- * so removal from the mutex-backed hash is free, use it by default.
- */
- req->flags |= REQ_F_HASH_LOCKED;
-
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
if (!io_file_can_poll(req))
@@ -761,58 +710,41 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
return IO_APOLL_OK;
}
-static __cold bool io_poll_remove_all_table(struct task_struct *tsk,
- struct io_hash_table *table,
- bool cancel_all)
+/*
+ * Returns true if we found and killed one or more poll requests
+ */
+__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
+ bool cancel_all)
{
- unsigned nr_buckets = 1U << table->hash_bits;
+ unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits;
struct hlist_node *tmp;
struct io_kiocb *req;
bool found = false;
int i;
- for (i = 0; i < nr_buckets; i++) {
- struct io_hash_bucket *hb = &table->hbs[i];
+ lockdep_assert_held(&ctx->uring_lock);
- spin_lock(&hb->lock);
+ for (i = 0; i < nr_buckets; i++) {
+ struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
+
hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) {
- if (io_match_task_safe(req, tsk, cancel_all)) {
+ if (io_match_task_safe(req, tctx, cancel_all)) {
hlist_del_init(&req->hash_node);
io_poll_cancel_req(req);
found = true;
}
}
- spin_unlock(&hb->lock);
}
return found;
}
-/*
- * Returns true if we found and killed one or more poll requests
- */
-__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
- bool cancel_all)
- __must_hold(&ctx->uring_lock)
-{
- bool ret;
-
- ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all);
- ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all);
- return ret;
-}
-
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
- struct io_cancel_data *cd,
- struct io_hash_table *table,
- struct io_hash_bucket **out_bucket)
+ struct io_cancel_data *cd)
{
struct io_kiocb *req;
- u32 index = hash_long(cd->data, table->hash_bits);
- struct io_hash_bucket *hb = &table->hbs[index];
+ u32 index = hash_long(cd->data, ctx->cancel_table.hash_bits);
+ struct io_hash_bucket *hb = &ctx->cancel_table.hbs[index];
- *out_bucket = NULL;
-
- spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node) {
if (cd->data != req->cqe.user_data)
continue;
@@ -822,35 +754,25 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
if (io_cancel_match_sequence(req, cd->seq))
continue;
}
- *out_bucket = hb;
return req;
}
- spin_unlock(&hb->lock);
return NULL;
}
static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
- struct io_cancel_data *cd,
- struct io_hash_table *table,
- struct io_hash_bucket **out_bucket)
+ struct io_cancel_data *cd)
{
- unsigned nr_buckets = 1U << table->hash_bits;
+ unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits;
struct io_kiocb *req;
int i;
- *out_bucket = NULL;
-
for (i = 0; i < nr_buckets; i++) {
- struct io_hash_bucket *hb = &table->hbs[i];
+ struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
- spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node) {
- if (io_cancel_req_match(req, cd)) {
- *out_bucket = hb;
+ if (io_cancel_req_match(req, cd))
return req;
- }
}
- spin_unlock(&hb->lock);
}
return NULL;
}
@@ -866,23 +788,21 @@ static int io_poll_disarm(struct io_kiocb *req)
return 0;
}
-static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
- struct io_hash_table *table)
+static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
{
- struct io_hash_bucket *bucket;
struct io_kiocb *req;
if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
IORING_ASYNC_CANCEL_ANY))
- req = io_poll_file_find(ctx, cd, table, &bucket);
+ req = io_poll_file_find(ctx, cd);
else
- req = io_poll_find(ctx, false, cd, table, &bucket);
+ req = io_poll_find(ctx, false, cd);
- if (req)
+ if (req) {
io_poll_cancel_req(req);
- if (bucket)
- spin_unlock(&bucket->lock);
- return req ? 0 : -ENOENT;
+ return 0;
+ }
+ return -ENOENT;
}
int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
@@ -890,12 +810,8 @@ int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
{
int ret;
- ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table);
- if (ret != -ENOENT)
- return ret;
-
io_ring_submit_lock(ctx, issue_flags);
- ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked);
+ ret = __io_poll_cancel(ctx, cd);
io_ring_submit_unlock(ctx, issue_flags);
return ret;
}
@@ -972,13 +888,6 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
ipt.pt._qproc = io_poll_queue_proc;
- /*
- * If sqpoll or single issuer, there is no contention for ->uring_lock
- * and we'll end up holding it in tw handlers anyway.
- */
- if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER))
- req->flags |= REQ_F_HASH_LOCKED;
-
ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
if (ret > 0) {
io_req_set_res(req, ipt.result_mask, 0);
@@ -992,32 +901,16 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
struct io_ring_ctx *ctx = req->ctx;
struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, };
- struct io_hash_bucket *bucket;
struct io_kiocb *preq;
int ret2, ret = 0;
io_ring_submit_lock(ctx, issue_flags);
- preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
+ preq = io_poll_find(ctx, true, &cd);
ret2 = io_poll_disarm(preq);
- if (bucket)
- spin_unlock(&bucket->lock);
- if (!ret2)
- goto found;
- if (ret2 != -ENOENT) {
- ret = ret2;
- goto out;
- }
-
- preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket);
- ret2 = io_poll_disarm(preq);
- if (bucket)
- spin_unlock(&bucket->lock);
if (ret2) {
ret = ret2;
goto out;
}
-
-found:
if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) {
ret = -EFAULT;
goto out;
diff --git a/io_uring/poll.h b/io_uring/poll.h
index b0e3745..04ede93 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -40,7 +40,7 @@ struct io_cancel_data;
int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned issue_flags);
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
-bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
+bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
bool cancel_all);
void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/io_uring/register.c b/io_uring/register.c
index eca26d4..1a60f49 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -28,6 +28,8 @@
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
+#include "msg_ring.h"
+#include "memmap.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -360,6 +362,259 @@ static int io_register_clock(struct io_ring_ctx *ctx,
return 0;
}
+/*
+ * State to maintain until we can swap. Both new and old state, used for
+ * either mapping or freeing.
+ */
+struct io_ring_ctx_rings {
+ unsigned short n_ring_pages;
+ unsigned short n_sqe_pages;
+ struct page **ring_pages;
+ struct page **sqe_pages;
+ struct io_uring_sqe *sq_sqes;
+ struct io_rings *rings;
+};
+
+static void io_register_free_rings(struct io_uring_params *p,
+ struct io_ring_ctx_rings *r)
+{
+ if (!(p->flags & IORING_SETUP_NO_MMAP)) {
+ io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
+ true);
+ io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
+ true);
+ } else {
+ io_pages_free(&r->ring_pages, r->n_ring_pages);
+ io_pages_free(&r->sqe_pages, r->n_sqe_pages);
+ vunmap(r->rings);
+ vunmap(r->sq_sqes);
+ }
+}
+
+#define swap_old(ctx, o, n, field) \
+ do { \
+ (o).field = (ctx)->field; \
+ (ctx)->field = (n).field; \
+ } while (0)
+
+#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
+#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
+ IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+
+static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
+ size_t size, sq_array_offset;
+ struct io_uring_params p;
+ unsigned i, tail;
+ void *ptr;
+ int ret;
+
+ /* for single issuer, must be owner resizing */
+ if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
+ current != ctx->submitter_task)
+ return -EEXIST;
+ if (copy_from_user(&p, arg, sizeof(p)))
+ return -EFAULT;
+ if (p.flags & ~RESIZE_FLAGS)
+ return -EINVAL;
+
+ /* properties that are always inherited */
+ p.flags |= (ctx->flags & COPY_FLAGS);
+
+ ret = io_uring_fill_params(p.sq_entries, &p);
+ if (unlikely(ret))
+ return ret;
+
+ /* nothing to do, but copy params back */
+ if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
+ if (copy_to_user(arg, &p, sizeof(p)))
+ return -EFAULT;
+ return 0;
+ }
+
+ size = rings_size(p.flags, p.sq_entries, p.cq_entries,
+ &sq_array_offset);
+ if (size == SIZE_MAX)
+ return -EOVERFLOW;
+
+ if (!(p.flags & IORING_SETUP_NO_MMAP))
+ n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
+ else
+ n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
+ p.cq_off.user_addr, size);
+ if (IS_ERR(n.rings))
+ return PTR_ERR(n.rings);
+
+ n.rings->sq_ring_mask = p.sq_entries - 1;
+ n.rings->cq_ring_mask = p.cq_entries - 1;
+ n.rings->sq_ring_entries = p.sq_entries;
+ n.rings->cq_ring_entries = p.cq_entries;
+
+ if (copy_to_user(arg, &p, sizeof(p))) {
+ io_register_free_rings(&p, &n);
+ return -EFAULT;
+ }
+
+ if (p.flags & IORING_SETUP_SQE128)
+ size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
+ else
+ size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
+ if (size == SIZE_MAX) {
+ io_register_free_rings(&p, &n);
+ return -EOVERFLOW;
+ }
+
+ if (!(p.flags & IORING_SETUP_NO_MMAP))
+ ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
+ else
+ ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
+ p.sq_off.user_addr,
+ size);
+ if (IS_ERR(ptr)) {
+ io_register_free_rings(&p, &n);
+ return PTR_ERR(ptr);
+ }
+
+ /*
+ * If using SQPOLL, park the thread
+ */
+ if (ctx->sq_data) {
+ mutex_unlock(&ctx->uring_lock);
+ io_sq_thread_park(ctx->sq_data);
+ mutex_lock(&ctx->uring_lock);
+ }
+
+ /*
+ * We'll do the swap. Grab the ctx->resize_lock, which will exclude
+ * any new mmap's on the ring fd. Clear out existing mappings to prevent
+ * mmap from seeing them, as we'll unmap them. Any attempt to mmap
+ * existing rings beyond this point will fail. Not that it could proceed
+ * at this point anyway, as the io_uring mmap side needs go grab the
+ * ctx->resize_lock as well. Likewise, hold the completion lock over the
+ * duration of the actual swap.
+ */
+ mutex_lock(&ctx->resize_lock);
+ spin_lock(&ctx->completion_lock);
+ o.rings = ctx->rings;
+ ctx->rings = NULL;
+ o.sq_sqes = ctx->sq_sqes;
+ ctx->sq_sqes = NULL;
+
+ /*
+ * Now copy SQ and CQ entries, if any. If either of the destination
+ * rings can't hold what is already there, then fail the operation.
+ */
+ n.sq_sqes = ptr;
+ tail = o.rings->sq.tail;
+ if (tail - o.rings->sq.head > p.sq_entries)
+ goto overflow;
+ for (i = o.rings->sq.head; i < tail; i++) {
+ unsigned src_head = i & (ctx->sq_entries - 1);
+ unsigned dst_head = i & n.rings->sq_ring_mask;
+
+ n.sq_sqes[dst_head] = o.sq_sqes[src_head];
+ }
+ n.rings->sq.head = o.rings->sq.head;
+ n.rings->sq.tail = o.rings->sq.tail;
+
+ tail = o.rings->cq.tail;
+ if (tail - o.rings->cq.head > p.cq_entries) {
+overflow:
+ /* restore old rings, and return -EOVERFLOW via cleanup path */
+ ctx->rings = o.rings;
+ ctx->sq_sqes = o.sq_sqes;
+ to_free = &n;
+ ret = -EOVERFLOW;
+ goto out;
+ }
+ for (i = o.rings->cq.head; i < tail; i++) {
+ unsigned src_head = i & (ctx->cq_entries - 1);
+ unsigned dst_head = i & n.rings->cq_ring_mask;
+
+ n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+ }
+ n.rings->cq.head = o.rings->cq.head;
+ n.rings->cq.tail = o.rings->cq.tail;
+ /* invalidate cached cqe refill */
+ ctx->cqe_cached = ctx->cqe_sentinel = NULL;
+
+ n.rings->sq_dropped = o.rings->sq_dropped;
+ n.rings->sq_flags = o.rings->sq_flags;
+ n.rings->cq_flags = o.rings->cq_flags;
+ n.rings->cq_overflow = o.rings->cq_overflow;
+
+ /* all done, store old pointers and assign new ones */
+ if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+ ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
+
+ ctx->sq_entries = p.sq_entries;
+ ctx->cq_entries = p.cq_entries;
+
+ ctx->rings = n.rings;
+ ctx->sq_sqes = n.sq_sqes;
+ swap_old(ctx, o, n, n_ring_pages);
+ swap_old(ctx, o, n, n_sqe_pages);
+ swap_old(ctx, o, n, ring_pages);
+ swap_old(ctx, o, n, sqe_pages);
+ to_free = &o;
+ ret = 0;
+out:
+ spin_unlock(&ctx->completion_lock);
+ mutex_unlock(&ctx->resize_lock);
+ io_register_free_rings(&p, to_free);
+
+ if (ctx->sq_data)
+ io_sq_thread_unpark(ctx->sq_data);
+
+ return ret;
+}
+
+static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
+{
+ struct io_uring_mem_region_reg __user *reg_uptr = uarg;
+ struct io_uring_mem_region_reg reg;
+ struct io_uring_region_desc __user *rd_uptr;
+ struct io_uring_region_desc rd;
+ int ret;
+
+ if (io_region_is_set(&ctx->param_region))
+ return -EBUSY;
+ if (copy_from_user(®, reg_uptr, sizeof(reg)))
+ return -EFAULT;
+ rd_uptr = u64_to_user_ptr(reg.region_uptr);
+ if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
+ return -EFAULT;
+
+ if (memchr_inv(®.__resv, 0, sizeof(reg.__resv)))
+ return -EINVAL;
+ if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
+ return -EINVAL;
+
+ /*
+ * This ensures there are no waiters. Waiters are unlocked and it's
+ * hard to synchronise with them, especially if we need to initialise
+ * the region.
+ */
+ if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
+ !(ctx->flags & IORING_SETUP_R_DISABLED))
+ return -EINVAL;
+
+ ret = io_create_region(ctx, &ctx->param_region, &rd);
+ if (ret)
+ return ret;
+ if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
+ io_free_region(ctx, &ctx->param_region);
+ return -EFAULT;
+ }
+
+ if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
+ ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
+ ctx->cq_wait_size = rd.size;
+ }
+ return 0;
+}
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -548,6 +803,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_clone_buffers(ctx, arg);
break;
+ case IORING_REGISTER_RESIZE_RINGS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_resize_rings(ctx, arg);
+ break;
+ case IORING_REGISTER_MEM_REGION:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_mem_region(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
@@ -588,6 +855,32 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered)
return ERR_PTR(-EOPNOTSUPP);
}
+/*
+ * "blind" registration opcodes are ones where there's no ring given, and
+ * hence the source fd must be -1.
+ */
+static int io_uring_register_blind(unsigned int opcode, void __user *arg,
+ unsigned int nr_args)
+{
+ switch (opcode) {
+ case IORING_REGISTER_SEND_MSG_RING: {
+ struct io_uring_sqe sqe;
+
+ if (!arg || nr_args != 1)
+ return -EINVAL;
+ if (copy_from_user(&sqe, arg, sizeof(sqe)))
+ return -EFAULT;
+ /* no flags supported */
+ if (sqe.flags)
+ return -EINVAL;
+ if (sqe.opcode == IORING_OP_MSG_RING)
+ return io_uring_sync_msg_ring(&sqe);
+ }
+ }
+
+ return -EINVAL;
+}
+
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args)
{
@@ -602,6 +895,9 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
if (opcode >= IORING_REGISTER_LAST)
return -EINVAL;
+ if (fd == -1)
+ return io_uring_register_blind(opcode, arg, nr_args);
+
file = io_uring_register_get_file(fd, use_registered_ring);
if (IS_ERR(file))
return PTR_ERR(file);
@@ -610,7 +906,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
+ trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
+ ctx->buf_table.nr, ret);
if (!use_registered_ring)
fput(file);
return ret;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 6f3b6de..adaae86 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -13,7 +13,6 @@
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
-#include "alloc_cache.h"
#include "openclose.h"
#include "rsrc.h"
#include "memmap.h"
@@ -26,21 +25,13 @@ struct io_rsrc_update {
u32 offset;
};
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
- struct io_mapped_ubuf **pimu,
- struct page **last_hpage);
+static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
+ struct iovec *iov, struct page **last_hpage);
/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)
-static const struct io_mapped_ubuf dummy_ubuf = {
- /* set invalid range, so io_import_fixed() fails meeting it */
- .ubuf = -1UL,
- .len = UINT_MAX,
-};
-
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
unsigned long page_limit, cur_pages, new_pages;
@@ -110,13 +101,13 @@ static int io_buffer_validate(struct iovec *iov)
return 0;
}
-static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
- struct io_mapped_ubuf *imu = *slot;
unsigned int i;
- *slot = NULL;
- if (imu != &dummy_ubuf) {
+ if (node->buf) {
+ struct io_mapped_ubuf *imu = node->buf;
+
if (!refcount_dec_and_test(&imu->refs))
return;
for (i = 0; i < imu->nr_bvecs; i++)
@@ -127,205 +118,40 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
}
}
-static void io_rsrc_put_work(struct io_rsrc_node *node)
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
{
- struct io_rsrc_put *prsrc = &node->item;
+ struct io_rsrc_node *node;
- if (prsrc->tag)
- io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);
-
- switch (node->type) {
- case IORING_RSRC_FILE:
- fput(prsrc->file);
- break;
- case IORING_RSRC_BUFFER:
- io_rsrc_buf_put(node->ctx, prsrc);
- break;
- default:
- WARN_ON_ONCE(1);
- break;
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (node) {
+ node->type = type;
+ node->refs = 1;
}
+ return node;
}
-void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
+__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
{
- if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
- kfree(node);
-}
-
-void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
- __must_hold(&node->ctx->uring_lock)
-{
- struct io_ring_ctx *ctx = node->ctx;
-
- while (!list_empty(&ctx->rsrc_ref_list)) {
- node = list_first_entry(&ctx->rsrc_ref_list,
- struct io_rsrc_node, node);
- /* recycle ref nodes in order */
- if (node->refs)
- break;
- list_del(&node->node);
-
- if (likely(!node->empty))
- io_rsrc_put_work(node);
- io_rsrc_node_destroy(ctx, node);
+ if (!data->nr)
+ return;
+ while (data->nr--) {
+ if (data->nodes[data->nr])
+ io_put_rsrc_node(ctx, data->nodes[data->nr]);
}
- if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
- wake_up_all(&ctx->rsrc_quiesce_wq);
+ kvfree(data->nodes);
+ data->nodes = NULL;
+ data->nr = 0;
}
-struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
+__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
{
- struct io_rsrc_node *ref_node;
-
- ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
- if (!ref_node) {
- ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
- if (!ref_node)
- return NULL;
- }
-
- ref_node->ctx = ctx;
- ref_node->empty = 0;
- ref_node->refs = 1;
- return ref_node;
-}
-
-__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
- struct io_ring_ctx *ctx)
-{
- struct io_rsrc_node *backup;
- DEFINE_WAIT(we);
- int ret;
-
- /* As We may drop ->uring_lock, other task may have started quiesce */
- if (data->quiesce)
- return -ENXIO;
-
- backup = io_rsrc_node_alloc(ctx);
- if (!backup)
- return -ENOMEM;
- ctx->rsrc_node->empty = true;
- ctx->rsrc_node->type = -1;
- list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
- io_put_rsrc_node(ctx, ctx->rsrc_node);
- ctx->rsrc_node = backup;
-
- if (list_empty(&ctx->rsrc_ref_list))
+ data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (data->nodes) {
+ data->nr = nr;
return 0;
-
- if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
- atomic_set(&ctx->cq_wait_nr, 1);
- smp_mb();
}
-
- ctx->rsrc_quiesce++;
- data->quiesce = true;
- do {
- prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
- mutex_unlock(&ctx->uring_lock);
-
- ret = io_run_task_work_sig(ctx);
- if (ret < 0) {
- finish_wait(&ctx->rsrc_quiesce_wq, &we);
- mutex_lock(&ctx->uring_lock);
- if (list_empty(&ctx->rsrc_ref_list))
- ret = 0;
- break;
- }
-
- schedule();
- mutex_lock(&ctx->uring_lock);
- ret = 0;
- } while (!list_empty(&ctx->rsrc_ref_list));
-
- finish_wait(&ctx->rsrc_quiesce_wq, &we);
- data->quiesce = false;
- ctx->rsrc_quiesce--;
-
- if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
- atomic_set(&ctx->cq_wait_nr, 0);
- smp_mb();
- }
- return ret;
-}
-
-static void io_free_page_table(void **table, size_t size)
-{
- unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
-
- for (i = 0; i < nr_tables; i++)
- kfree(table[i]);
- kfree(table);
-}
-
-static void io_rsrc_data_free(struct io_rsrc_data *data)
-{
- size_t size = data->nr * sizeof(data->tags[0][0]);
-
- if (data->tags)
- io_free_page_table((void **)data->tags, size);
- kfree(data);
-}
-
-static __cold void **io_alloc_page_table(size_t size)
-{
- unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
- size_t init_size = size;
- void **table;
-
- table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
- if (!table)
- return NULL;
-
- for (i = 0; i < nr_tables; i++) {
- unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
-
- table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
- if (!table[i]) {
- io_free_page_table(table, init_size);
- return NULL;
- }
- size -= this_size;
- }
- return table;
-}
-
-__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
- u64 __user *utags,
- unsigned nr, struct io_rsrc_data **pdata)
-{
- struct io_rsrc_data *data;
- int ret = 0;
- unsigned i;
-
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- return -ENOMEM;
- data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
- if (!data->tags) {
- kfree(data);
- return -ENOMEM;
- }
-
- data->nr = nr;
- data->ctx = ctx;
- data->rsrc_type = type;
- if (utags) {
- ret = -EFAULT;
- for (i = 0; i < nr; i++) {
- u64 *tag_slot = io_get_tag_slot(data, i);
-
- if (copy_from_user(tag_slot, &utags[i],
- sizeof(*tag_slot)))
- goto fail;
- }
- }
- *pdata = data;
- return 0;
-fail:
- io_rsrc_data_free(data);
- return ret;
+ return -ENOMEM;
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -334,14 +160,12 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
{
u64 __user *tags = u64_to_user_ptr(up->tags);
__s32 __user *fds = u64_to_user_ptr(up->data);
- struct io_rsrc_data *data = ctx->file_data;
- struct io_fixed_file *file_slot;
int fd, i, err = 0;
unsigned int done;
- if (!ctx->file_data)
+ if (!ctx->file_table.data.nr)
return -ENXIO;
- if (up->offset + nr_args > ctx->nr_user_files)
+ if (up->offset + nr_args > ctx->file_table.data.nr)
return -EINVAL;
for (done = 0; done < nr_args; done++) {
@@ -359,19 +183,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (fd == IORING_REGISTER_FILES_SKIP)
continue;
- i = array_index_nospec(up->offset + done, ctx->nr_user_files);
- file_slot = io_fixed_file_slot(&ctx->file_table, i);
-
- if (file_slot->file_ptr) {
- err = io_queue_rsrc_removal(data, i,
- io_slot_file(file_slot));
- if (err)
- break;
- file_slot->file_ptr = 0;
+ i = up->offset + done;
+ if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
io_file_bitmap_clear(&ctx->file_table, i);
- }
+
if (fd != -1) {
struct file *file = fget(fd);
+ struct io_rsrc_node *node;
if (!file) {
err = -EBADF;
@@ -385,8 +203,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- *io_get_tag_slot(data, i) = tag;
- io_fixed_file_set(file_slot, file);
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+ if (!node) {
+ err = -ENOMEM;
+ fput(file);
+ break;
+ }
+ ctx->file_table.data.nodes[i] = node;
+ if (tag)
+ node->tag = tag;
+ io_fixed_file_set(node, file);
io_file_bitmap_set(&ctx->file_table, i);
}
}
@@ -405,13 +231,13 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
__u32 done;
int i, err;
- if (!ctx->buf_data)
+ if (!ctx->buf_table.nr)
return -ENXIO;
- if (up->offset + nr_args > ctx->nr_user_bufs)
+ if (up->offset + nr_args > ctx->buf_table.nr)
return -EINVAL;
for (done = 0; done < nr_args; done++) {
- struct io_mapped_ubuf *imu;
+ struct io_rsrc_node *node;
u64 tag = 0;
uvec = u64_to_user_ptr(user_data);
@@ -427,27 +253,21 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
err = io_buffer_validate(iov);
if (err)
break;
- if (!iov->iov_base && tag) {
- err = -EINVAL;
+ node = io_sqe_buffer_register(ctx, iov, &last_hpage);
+ if (IS_ERR(node)) {
+ err = PTR_ERR(node);
break;
}
- err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
- if (err)
- break;
-
- i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
- if (ctx->user_bufs[i] != &dummy_ubuf) {
- err = io_queue_rsrc_removal(ctx->buf_data, i,
- ctx->user_bufs[i]);
- if (unlikely(err)) {
- io_buffer_unmap(ctx, &imu);
+ if (tag) {
+ if (!node) {
+ err = -EINVAL;
break;
}
- ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
+ node->tag = tag;
}
-
- ctx->user_bufs[i] = imu;
- *io_get_tag_slot(ctx->buf_data, i) = tag;
+ i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
+ io_reset_rsrc_node(ctx, &ctx->buf_table, i);
+ ctx->buf_table.nodes[i] = node;
if (ctx->compat)
user_data += sizeof(struct compat_iovec);
else
@@ -563,7 +383,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
struct file *file;
int ret, fd;
- if (!req->ctx->file_data)
+ if (!req->ctx->file_table.data.nr)
return -ENXIO;
for (done = 0; done < up->nr_args; done++) {
@@ -622,65 +442,38 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK;
}
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
+void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
- struct io_ring_ctx *ctx = data->ctx;
- struct io_rsrc_node *node = ctx->rsrc_node;
- u64 *tag_slot = io_get_tag_slot(data, idx);
+ lockdep_assert_held(&ctx->uring_lock);
- ctx->rsrc_node = io_rsrc_node_alloc(ctx);
- if (unlikely(!ctx->rsrc_node)) {
- ctx->rsrc_node = node;
- return -ENOMEM;
+ if (node->tag)
+ io_post_aux_cqe(ctx, node->tag, 0, 0);
+
+ switch (node->type) {
+ case IORING_RSRC_FILE:
+ if (io_slot_file(node))
+ fput(io_slot_file(node));
+ break;
+ case IORING_RSRC_BUFFER:
+ if (node->buf)
+ io_buffer_unmap(ctx, node);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
}
- node->item.rsrc = rsrc;
- node->type = data->rsrc_type;
- node->item.tag = *tag_slot;
- *tag_slot = 0;
- list_add_tail(&node->node, &ctx->rsrc_ref_list);
- io_put_rsrc_node(ctx, node);
- return 0;
-}
-
-void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
- int i;
-
- for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file = io_file_from_index(&ctx->file_table, i);
-
- if (!file)
- continue;
- io_file_bitmap_clear(&ctx->file_table, i);
- fput(file);
- }
-
- io_free_file_tables(&ctx->file_table);
- io_file_table_set_alloc_range(ctx, 0, 0);
- io_rsrc_data_free(ctx->file_data);
- ctx->file_data = NULL;
- ctx->nr_user_files = 0;
+ kfree(node);
}
int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
- unsigned nr = ctx->nr_user_files;
- int ret;
-
- if (!ctx->file_data)
+ if (!ctx->file_table.data.nr)
return -ENXIO;
- /*
- * Quiesce may unlock ->uring_lock, and while it's not held
- * prevent new requests using the table.
- */
- ctx->nr_user_files = 0;
- ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
- ctx->nr_user_files = nr;
- if (!ret)
- __io_sqe_files_unregister(ctx);
- return ret;
+ io_free_file_tables(ctx, &ctx->file_table);
+ io_file_table_set_alloc_range(ctx, 0, 0);
+ return 0;
}
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -691,7 +484,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
int fd, ret;
unsigned i;
- if (ctx->file_data)
+ if (ctx->file_table.data.nr)
return -EBUSY;
if (!nr_args)
return -EINVAL;
@@ -699,28 +492,22 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -EMFILE;
if (nr_args > rlimit(RLIMIT_NOFILE))
return -EMFILE;
- ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
- &ctx->file_data);
- if (ret)
- return ret;
-
- if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
- io_rsrc_data_free(ctx->file_data);
- ctx->file_data = NULL;
+ if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args))
return -ENOMEM;
- }
- for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
- struct io_fixed_file *file_slot;
+ for (i = 0; i < nr_args; i++) {
+ struct io_rsrc_node *node;
+ u64 tag = 0;
- if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
- ret = -EFAULT;
+ ret = -EFAULT;
+ if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
goto fail;
- }
+ if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
+ goto fail;
/* allow sparse sets */
if (!fds || fd == -1) {
ret = -EINVAL;
- if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
+ if (tag)
goto fail;
continue;
}
@@ -737,56 +524,33 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
fput(file);
goto fail;
}
- file_slot = io_fixed_file_slot(&ctx->file_table, i);
- io_fixed_file_set(file_slot, file);
+ ret = -ENOMEM;
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+ if (!node) {
+ fput(file);
+ goto fail;
+ }
+ if (tag)
+ node->tag = tag;
+ ctx->file_table.data.nodes[i] = node;
+ io_fixed_file_set(node, file);
io_file_bitmap_set(&ctx->file_table, i);
}
/* default it to the whole table */
- io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
+ io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
return 0;
fail:
- __io_sqe_files_unregister(ctx);
+ io_sqe_files_unregister(ctx);
return ret;
}
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
- io_buffer_unmap(ctx, &prsrc->buf);
- prsrc->buf = NULL;
-}
-
-void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
-{
- unsigned int i;
-
- for (i = 0; i < ctx->nr_user_bufs; i++)
- io_buffer_unmap(ctx, &ctx->user_bufs[i]);
- kfree(ctx->user_bufs);
- io_rsrc_data_free(ctx->buf_data);
- ctx->user_bufs = NULL;
- ctx->buf_data = NULL;
- ctx->nr_user_bufs = 0;
-}
-
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
{
- unsigned nr = ctx->nr_user_bufs;
- int ret;
-
- if (!ctx->buf_data)
+ if (!ctx->buf_table.nr)
return -ENXIO;
-
- /*
- * Quiesce may unlock ->uring_lock, and while it's not held
- * prevent new requests using the table.
- */
- ctx->nr_user_bufs = 0;
- ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
- ctx->nr_user_bufs = nr;
- if (!ret)
- __io_sqe_buffers_unregister(ctx);
- return ret;
+ io_rsrc_data_free(ctx, &ctx->buf_table);
+ return 0;
}
/*
@@ -812,9 +576,13 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
}
/* check previously registered pages */
- for (i = 0; i < ctx->nr_user_bufs; i++) {
- struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+ for (i = 0; i < ctx->buf_table.nr; i++) {
+ struct io_rsrc_node *node = ctx->buf_table.nodes[i];
+ struct io_mapped_ubuf *imu;
+ if (!node)
+ continue;
+ imu = node->buf;
for (j = 0; j < imu->nr_bvecs; j++) {
if (!PageCompound(imu->bvec[j].bv_page))
continue;
@@ -950,21 +718,26 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
}
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
- struct io_mapped_ubuf **pimu,
- struct page **last_hpage)
+static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
+ struct iovec *iov,
+ struct page **last_hpage)
{
struct io_mapped_ubuf *imu = NULL;
struct page **pages = NULL;
+ struct io_rsrc_node *node;
unsigned long off;
size_t size;
int ret, nr_pages, i;
struct io_imu_folio_data data;
bool coalesced;
- *pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
if (!iov->iov_base)
- return 0;
+ return NULL;
+
+ node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ node->buf = NULL;
ret = -ENOMEM;
pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
@@ -998,7 +771,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
imu->folio_shift = data.folio_shift;
refcount_set(&imu->refs, 1);
off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
- *pimu = imu;
+ node->buf = imu;
ret = 0;
for (i = 0; i < nr_pages; i++) {
@@ -1010,46 +783,42 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
size -= vec_len;
}
done:
- if (ret)
+ if (ret) {
kvfree(imu);
+ if (node)
+ io_put_rsrc_node(ctx, node);
+ node = ERR_PTR(ret);
+ }
kvfree(pages);
- return ret;
-}
-
-static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
-{
- ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
- return ctx->user_bufs ? 0 : -ENOMEM;
+ return node;
}
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args, u64 __user *tags)
{
struct page *last_hpage = NULL;
- struct io_rsrc_data *data;
+ struct io_rsrc_data data;
struct iovec fast_iov, *iov = &fast_iov;
const struct iovec __user *uvec;
int i, ret;
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
- if (ctx->user_bufs)
+ if (ctx->buf_table.nr)
return -EBUSY;
if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
return -EINVAL;
- ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
+ ret = io_rsrc_data_alloc(&data, nr_args);
if (ret)
return ret;
- ret = io_buffers_map_alloc(ctx, nr_args);
- if (ret) {
- io_rsrc_data_free(data);
- return ret;
- }
if (!arg)
memset(iov, 0, sizeof(*iov));
- for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+ for (i = 0; i < nr_args; i++) {
+ struct io_rsrc_node *node;
+ u64 tag = 0;
+
if (arg) {
uvec = (struct iovec __user *) arg;
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
@@ -1066,22 +835,31 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
arg += sizeof(struct iovec);
}
- if (!iov->iov_base && *io_get_tag_slot(data, i)) {
- ret = -EINVAL;
- break;
+ if (tags) {
+ if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
+ ret = -EFAULT;
+ break;
+ }
}
- ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
- &last_hpage);
- if (ret)
+ node = io_sqe_buffer_register(ctx, iov, &last_hpage);
+ if (IS_ERR(node)) {
+ ret = PTR_ERR(node);
break;
+ }
+ if (tag) {
+ if (!node) {
+ ret = -EINVAL;
+ break;
+ }
+ node->tag = tag;
+ }
+ data.nodes[i] = node;
}
- WARN_ON_ONCE(ctx->buf_data);
-
- ctx->buf_data = data;
+ ctx->buf_table = data;
if (ret)
- __io_sqe_buffers_unregister(ctx);
+ io_sqe_buffers_unregister(ctx);
return ret;
}
@@ -1127,7 +905,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
const struct bio_vec *bvec = imu->bvec;
if (offset < bvec->bv_len) {
- iter->bvec = bvec;
iter->count -= offset;
iter->iov_offset = offset;
} else {
@@ -1137,7 +914,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
offset -= bvec->bv_len;
seg_skip = 1 + (offset >> imu->folio_shift);
- iter->bvec = bvec + seg_skip;
+ iter->bvec += seg_skip;
iter->nr_segs -= seg_skip;
iter->count -= bvec->bv_len + offset;
iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
@@ -1147,11 +924,43 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
return 0;
}
-static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
+static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
+ struct io_uring_clone_buffers *arg)
{
- struct io_mapped_ubuf **user_bufs;
- struct io_rsrc_data *data;
- int i, ret, nbufs;
+ struct io_rsrc_data data;
+ int i, ret, off, nr;
+ unsigned int nbufs;
+
+ /* if offsets are given, must have nr specified too */
+ if (!arg->nr && (arg->dst_off || arg->src_off))
+ return -EINVAL;
+ /* not allowed unless REPLACE is set */
+ if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
+ return -EBUSY;
+
+ nbufs = READ_ONCE(src_ctx->buf_table.nr);
+ if (!arg->nr)
+ arg->nr = nbufs;
+ else if (arg->nr > nbufs)
+ return -EINVAL;
+ else if (arg->nr > IORING_MAX_REG_BUFFERS)
+ return -EINVAL;
+ if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
+ return -EOVERFLOW;
+
+ ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
+ if (ret)
+ return ret;
+
+ /* Fill entries in data from dst that won't overlap with src */
+ for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) {
+ struct io_rsrc_node *src_node = ctx->buf_table.nodes[i];
+
+ if (src_node) {
+ data.nodes[i] = src_node;
+ src_node->refs++;
+ }
+ }
/*
* Drop our own lock here. We'll setup the data we need and reference
@@ -1161,45 +970,77 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
mutex_lock(&src_ctx->uring_lock);
ret = -ENXIO;
- nbufs = src_ctx->nr_user_bufs;
+ nbufs = src_ctx->buf_table.nr;
if (!nbufs)
goto out_unlock;
- ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
- if (ret)
+ ret = -EINVAL;
+ if (!arg->nr)
+ arg->nr = nbufs;
+ else if (arg->nr > nbufs)
+ goto out_unlock;
+ ret = -EOVERFLOW;
+ if (check_add_overflow(arg->nr, arg->src_off, &off))
+ goto out_unlock;
+ if (off > nbufs)
goto out_unlock;
- ret = -ENOMEM;
- user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
- if (!user_bufs)
- goto out_free_data;
+ off = arg->dst_off;
+ i = arg->src_off;
+ nr = arg->nr;
+ while (nr--) {
+ struct io_rsrc_node *dst_node, *src_node;
- for (i = 0; i < nbufs; i++) {
- struct io_mapped_ubuf *src = src_ctx->user_bufs[i];
+ src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i);
+ if (!src_node) {
+ dst_node = NULL;
+ } else {
+ dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
+ if (!dst_node) {
+ ret = -ENOMEM;
+ goto out_put_free;
+ }
- if (src != &dummy_ubuf)
- refcount_inc(&src->refs);
- user_bufs[i] = src;
+ refcount_inc(&src_node->buf->refs);
+ dst_node->buf = src_node->buf;
+ }
+ data.nodes[off++] = dst_node;
+ i++;
}
/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
mutex_unlock(&src_ctx->uring_lock);
mutex_lock(&ctx->uring_lock);
- if (!ctx->user_bufs) {
- ctx->user_bufs = user_bufs;
- ctx->buf_data = data;
- ctx->nr_user_bufs = nbufs;
+
+ /*
+ * If asked for replace, put the old table. data->nodes[] holds both
+ * old and new nodes at this point.
+ */
+ if (arg->flags & IORING_REGISTER_DST_REPLACE)
+ io_rsrc_data_free(ctx, &ctx->buf_table);
+
+ /*
+ * ctx->buf_table should be empty now - either the contents are being
+ * replaced and we just freed the table, or someone raced setting up
+ * a buffer table while the clone was happening. If not empty, fall
+ * through to failure handling.
+ */
+ if (!ctx->buf_table.nr) {
+ ctx->buf_table = data;
return 0;
}
+ mutex_unlock(&ctx->uring_lock);
+ mutex_lock(&src_ctx->uring_lock);
/* someone raced setting up buffers, dump ours */
- for (i = 0; i < nbufs; i++)
- io_buffer_unmap(ctx, &user_bufs[i]);
- io_rsrc_data_free(data);
- kfree(user_bufs);
- return -EBUSY;
-out_free_data:
- io_rsrc_data_free(data);
+ ret = -EBUSY;
+out_put_free:
+ i = data.nr;
+ while (i--) {
+ io_buffer_unmap(src_ctx, data.nodes[i]);
+ kfree(data.nodes[i]);
+ }
out_unlock:
+ io_rsrc_data_free(ctx, &data);
mutex_unlock(&src_ctx->uring_lock);
mutex_lock(&ctx->uring_lock);
return ret;
@@ -1219,12 +1060,12 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
struct file *file;
int ret;
- if (ctx->user_bufs || ctx->nr_user_bufs)
- return -EBUSY;
if (copy_from_user(&buf, arg, sizeof(buf)))
return -EFAULT;
- if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
+ if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE))
return -EINVAL;
+ if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr)
+ return -EBUSY;
if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
return -EINVAL;
@@ -1232,7 +1073,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
file = io_uring_register_get_file(buf.src_fd, registered_src);
if (IS_ERR(file))
return PTR_ERR(file);
- ret = io_clone_buffers(ctx, file->private_data);
+ ret = io_clone_buffers(ctx, file->private_data, &buf);
if (!registered_src)
fput(file);
return ret;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 8ed5880..7a4668d 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -13,33 +13,17 @@ enum {
IORING_RSRC_BUFFER = 1,
};
-struct io_rsrc_put {
+struct io_rsrc_node {
+ unsigned char type;
+ int refs;
+
u64 tag;
union {
- void *rsrc;
- struct file *file;
+ unsigned long file_ptr;
struct io_mapped_ubuf *buf;
};
};
-struct io_rsrc_data {
- struct io_ring_ctx *ctx;
-
- u64 **tags;
- unsigned int nr;
- u16 rsrc_type;
- bool quiesce;
-};
-
-struct io_rsrc_node {
- struct io_ring_ctx *ctx;
- int refs;
- bool empty;
- u16 type;
- struct list_head node;
- struct io_rsrc_put item;
-};
-
struct io_mapped_ubuf {
u64 ubuf;
unsigned int len;
@@ -58,21 +42,19 @@ struct io_imu_folio_data {
unsigned int folio_shift;
};
-void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
-void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
-struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc);
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
+void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
+void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
+int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
int io_import_fixed(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu,
u64 buf_addr, size_t len);
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
-void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int nr_args, u64 __user *tags);
-void __io_sqe_files_unregister(struct io_ring_ctx *ctx);
int io_sqe_files_unregister(struct io_ring_ctx *ctx);
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags);
@@ -84,51 +66,56 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
+static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
+ int index)
+{
+ if (index < data->nr)
+ return data->nodes[array_index_nospec(index, data->nr)];
+ return NULL;
+}
+
static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
- lockdep_assert_held(&ctx->uring_lock);
-
if (node && !--node->refs)
- io_rsrc_node_ref_zero(node);
+ io_free_rsrc_node(ctx, node);
}
-static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
- struct io_rsrc_node *node)
+static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
+ struct io_rsrc_data *data, int index)
{
- node->refs++;
+ struct io_rsrc_node *node = data->nodes[index];
+
+ if (!node)
+ return false;
+ io_put_rsrc_node(ctx, node);
+ data->nodes[index] = NULL;
+ return true;
}
-static inline void __io_req_set_rsrc_node(struct io_kiocb *req,
- struct io_ring_ctx *ctx)
+static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
{
- lockdep_assert_held(&ctx->uring_lock);
- req->rsrc_node = ctx->rsrc_node;
- io_charge_rsrc_node(ctx, ctx->rsrc_node);
-}
-
-static inline void io_req_set_rsrc_node(struct io_kiocb *req,
- struct io_ring_ctx *ctx,
- unsigned int issue_flags)
-{
- if (!req->rsrc_node) {
- io_ring_submit_lock(ctx, issue_flags);
- __io_req_set_rsrc_node(req, ctx);
- io_ring_submit_unlock(ctx, issue_flags);
+ if (req->file_node) {
+ io_put_rsrc_node(req->ctx, req->file_node);
+ req->file_node = NULL;
+ }
+ if (req->flags & REQ_F_BUF_NODE) {
+ io_put_rsrc_node(req->ctx, req->buf_node);
+ req->buf_node = NULL;
}
}
-static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
+static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node,
+ struct io_rsrc_node *node)
{
- unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
- unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
-
- return &data->tags[table_idx][off];
+ node->refs++;
+ *dst_node = node;
}
-static inline int io_rsrc_init(struct io_ring_ctx *ctx)
+static inline void io_req_assign_buf_node(struct io_kiocb *req,
+ struct io_rsrc_node *node)
{
- ctx->rsrc_node = io_rsrc_node_alloc(ctx);
- return ctx->rsrc_node ? 0 : -ENOMEM;
+ io_req_assign_rsrc_node(&req->buf_node, node);
+ req->flags |= REQ_F_BUF_NODE;
}
int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 155938f..cce8bc2 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -330,22 +330,21 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct io_ring_ctx *ctx = req->ctx;
+ struct io_rsrc_node *node;
struct io_async_rw *io;
- u16 index;
int ret;
ret = io_prep_rw(req, sqe, ddir, false);
if (unlikely(ret))
return ret;
- if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+ node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
+ if (!node)
return -EFAULT;
- index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
- req->imu = ctx->user_bufs[index];
- io_req_set_rsrc_node(req, ctx, 0);
+ io_req_assign_buf_node(req, node);
io = req->async_data;
- ret = io_import_fixed(ddir, &io->iter, req->imu, rw->addr, rw->len);
+ ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
iov_iter_save_state(&io->iter, &io->iter_state);
return ret;
}
@@ -435,7 +434,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
* Play it safe and assume not safe to re-import and reissue if we're
* not in the original thread group (or in task context).
*/
- if (!same_thread_group(req->task, current) || !in_task())
+ if (!same_thread_group(req->tctx->task, current) || !in_task())
return false;
return true;
}
@@ -818,6 +817,11 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
+ if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
+ /* make sure every req only blocks once*/
+ req->flags &= ~REQ_F_IOPOLL_STATE;
+ req->iopoll_start = ktime_get_ns();
+ }
} else {
if (kiocb->ki_flags & IOCB_HIPRI)
return -EINVAL;
@@ -1135,6 +1139,78 @@ void io_rw_fail(struct io_kiocb *req)
io_req_set_res(req, res, req->cqe.flags);
}
+static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob,
+ unsigned int poll_flags)
+{
+ struct file *file = req->file;
+
+ if (req->opcode == IORING_OP_URING_CMD) {
+ struct io_uring_cmd *ioucmd;
+
+ ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags);
+ } else {
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+
+ return file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+ }
+}
+
+static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ struct hrtimer_sleeper timer;
+ enum hrtimer_mode mode;
+ ktime_t kt;
+ u64 sleep_time;
+
+ if (req->flags & REQ_F_IOPOLL_STATE)
+ return 0;
+
+ if (ctx->hybrid_poll_time == LLONG_MAX)
+ return 0;
+
+ /* Using half the running time to do schedule */
+ sleep_time = ctx->hybrid_poll_time / 2;
+
+ kt = ktime_set(0, sleep_time);
+ req->flags |= REQ_F_IOPOLL_STATE;
+
+ mode = HRTIMER_MODE_REL;
+ hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+ hrtimer_set_expires(&timer.timer, kt);
+ set_current_state(TASK_INTERRUPTIBLE);
+ hrtimer_sleeper_start_expires(&timer, mode);
+
+ if (timer.task)
+ io_schedule();
+
+ hrtimer_cancel(&timer.timer);
+ __set_current_state(TASK_RUNNING);
+ destroy_hrtimer_on_stack(&timer.timer);
+ return sleep_time;
+}
+
+static int io_uring_hybrid_poll(struct io_kiocb *req,
+ struct io_comp_batch *iob, unsigned int poll_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ u64 runtime, sleep_time;
+ int ret;
+
+ sleep_time = io_hybrid_iopoll_delay(ctx, req);
+ ret = io_uring_classic_poll(req, iob, poll_flags);
+ runtime = ktime_get_ns() - req->iopoll_start - sleep_time;
+
+ /*
+ * Use minimum sleep time if we're polling devices with different
+ * latencies. We could get more completions from the faster ones.
+ */
+ if (ctx->hybrid_poll_time > runtime)
+ ctx->hybrid_poll_time = runtime;
+
+ return ret;
+}
+
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
struct io_wq_work_node *pos, *start, *prev;
@@ -1151,7 +1227,6 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
wq_list_for_each(pos, start, &ctx->iopoll_list) {
struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
- struct file *file = req->file;
int ret;
/*
@@ -1162,29 +1237,23 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (READ_ONCE(req->iopoll_completed))
break;
- if (req->opcode == IORING_OP_URING_CMD) {
- struct io_uring_cmd *ioucmd;
+ if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL)
+ ret = io_uring_hybrid_poll(req, &iob, poll_flags);
+ else
+ ret = io_uring_classic_poll(req, &iob, poll_flags);
- ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
- ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob,
- poll_flags);
- } else {
- struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
-
- ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
- }
if (unlikely(ret < 0))
return ret;
else if (ret)
poll_flags |= BLK_POLL_ONESHOT;
/* iopoll may have completed current req */
- if (!rq_list_empty(iob.req_list) ||
+ if (!rq_list_empty(&iob.req_list) ||
READ_ONCE(req->iopoll_completed))
break;
}
- if (!rq_list_empty(iob.req_list))
+ if (!rq_list_empty(&iob.req_list))
iob.complete(&iob);
else if (!pos)
return 0;
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 3b659cd..5b84f16 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -21,6 +21,7 @@ struct io_splice {
u64 len;
int splice_fd_in;
unsigned int flags;
+ struct io_rsrc_node *rsrc_node;
};
static int __io_splice_prep(struct io_kiocb *req,
@@ -34,6 +35,7 @@ static int __io_splice_prep(struct io_kiocb *req,
if (unlikely(sp->flags & ~valid_flags))
return -EINVAL;
sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
+ sp->rsrc_node = NULL;
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@@ -45,6 +47,36 @@ int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_splice_prep(req, sqe);
}
+void io_splice_cleanup(struct io_kiocb *req)
+{
+ struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
+
+ io_put_rsrc_node(req->ctx, sp->rsrc_node);
+}
+
+static struct file *io_splice_get_file(struct io_kiocb *req,
+ unsigned int issue_flags)
+{
+ struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_rsrc_node *node;
+ struct file *file = NULL;
+
+ if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
+ return io_file_get_normal(req, sp->splice_fd_in);
+
+ io_ring_submit_lock(ctx, issue_flags);
+ node = io_rsrc_node_lookup(&ctx->file_table.data, sp->splice_fd_in);
+ if (node) {
+ node->refs++;
+ sp->rsrc_node = node;
+ file = io_slot_file(node);
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
+ io_ring_submit_unlock(ctx, issue_flags);
+ return file;
+}
+
int io_tee(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
@@ -55,10 +87,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
- if (sp->flags & SPLICE_F_FD_IN_FIXED)
- in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
- else
- in = io_file_get_normal(req, sp->splice_fd_in);
+ in = io_splice_get_file(req, issue_flags);
if (!in) {
ret = -EBADF;
goto done;
@@ -96,10 +125,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
- if (sp->flags & SPLICE_F_FD_IN_FIXED)
- in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
- else
- in = io_file_get_normal(req, sp->splice_fd_in);
+ in = io_splice_get_file(req, issue_flags);
if (!in) {
ret = -EBADF;
goto done;
diff --git a/io_uring/splice.h b/io_uring/splice.h
index 542f941..b9b2848 100644
--- a/io_uring/splice.h
+++ b/io_uring/splice.h
@@ -3,5 +3,6 @@
int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_tee(struct io_kiocb *req, unsigned int issue_flags);
+void io_splice_cleanup(struct io_kiocb *req);
int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_splice(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index a265939..6df5e64 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -40,6 +40,7 @@ void io_sq_thread_unpark(struct io_sq_data *sqd)
if (atomic_dec_return(&sqd->park_pending))
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
mutex_unlock(&sqd->lock);
+ wake_up(&sqd->wait);
}
void io_sq_thread_park(struct io_sq_data *sqd)
@@ -106,29 +107,21 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
{
struct io_ring_ctx *ctx_attach;
struct io_sq_data *sqd;
- struct fd f;
+ CLASS(fd, f)(p->wq_fd);
- f = fdget(p->wq_fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return ERR_PTR(-ENXIO);
- if (!io_is_uring_fops(fd_file(f))) {
- fdput(f);
+ if (!io_is_uring_fops(fd_file(f)))
return ERR_PTR(-EINVAL);
- }
ctx_attach = fd_file(f)->private_data;
sqd = ctx_attach->sq_data;
- if (!sqd) {
- fdput(f);
+ if (!sqd)
return ERR_PTR(-EINVAL);
- }
- if (sqd->task_tgid != current->tgid) {
- fdput(f);
+ if (sqd->task_tgid != current->tgid)
return ERR_PTR(-EPERM);
- }
refcount_inc(&sqd->refs);
- fdput(f);
return sqd;
}
@@ -215,7 +208,7 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd)
mutex_unlock(&sqd->lock);
if (signal_pending(current))
did_sig = get_signal(&ksig);
- cond_resched();
+ wait_event(sqd->wait, !atomic_read(&sqd->park_pending));
mutex_lock(&sqd->lock);
sqd->sq_cpu = raw_smp_processor_id();
}
@@ -417,16 +410,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
/* Retain compatibility with failing for an invalid attach attempt */
if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
IORING_SETUP_ATTACH_WQ) {
- struct fd f;
-
- f = fdget(p->wq_fd);
- if (!fd_file(f))
+ CLASS(fd, f)(p->wq_fd);
+ if (fd_empty(f))
return -ENXIO;
- if (!io_is_uring_fops(fd_file(f))) {
- fdput(f);
+ if (!io_is_uring_fops(fd_file(f)))
return -EINVAL;
- }
- fdput(f);
}
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct task_struct *tsk;
diff --git a/io_uring/statx.c b/io_uring/statx.c
index f7f9b20..6bc4651 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -36,8 +36,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
sx->flags = READ_ONCE(sqe->statx_flags);
- sx->filename = getname_flags(path,
- getname_statx_lookup_flags(sx->flags));
+ sx->filename = getname_uflags(path, sx->flags);
if (IS_ERR(sx->filename)) {
int ret = PTR_ERR(sx->filename);
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index c043fe9..503f3ff 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -81,6 +81,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
return ret;
}
+ tctx->task = task;
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_cancel, 0);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 9973876..5b12bd6 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -300,16 +300,18 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t
{
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_kiocb *prev = timeout->prev;
- int ret = -ENOENT;
+ int ret;
if (prev) {
- if (!(req->task->flags & PF_EXITING)) {
+ if (!io_should_terminate_tw()) {
struct io_cancel_data cd = {
.ctx = req->ctx,
.data = prev->cqe.user_data,
};
- ret = io_try_cancel(req->task->io_uring, &cd, 0);
+ ret = io_try_cancel(req->tctx, &cd, 0);
+ } else {
+ ret = -ECANCELED;
}
io_req_set_res(req, ret ?: -ETIME, 0);
io_req_task_complete(req, ts);
@@ -637,13 +639,13 @@ void io_queue_linked_timeout(struct io_kiocb *req)
io_put_req(req);
}
-static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
+static bool io_match_task(struct io_kiocb *head, struct io_uring_task *tctx,
bool cancel_all)
__must_hold(&head->ctx->timeout_lock)
{
struct io_kiocb *req;
- if (task && head->task != task)
+ if (tctx && head->tctx != tctx)
return false;
if (cancel_all)
return true;
@@ -656,7 +658,7 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
}
/* Returns true if we found and killed one or more timeouts */
-__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
bool cancel_all)
{
struct io_timeout *timeout, *tmp;
@@ -671,7 +673,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) {
struct io_kiocb *req = cmd_to_io_kiocb(timeout);
- if (io_match_task(req, tsk, cancel_all) &&
+ if (io_match_task(req, tctx, cancel_all) &&
io_kill_timeout(req, -ECANCELED))
canceled++;
}
diff --git a/io_uring/timeout.h b/io_uring/timeout.h
index a6939f1..e91b324 100644
--- a/io_uring/timeout.h
+++ b/io_uring/timeout.h
@@ -24,7 +24,7 @@ static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
__cold void io_flush_timeouts(struct io_ring_ctx *ctx);
struct io_cancel_data;
int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd);
-__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
+__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
bool cancel_all);
void io_queue_linked_timeout(struct io_kiocb *req);
void io_disarm_next(struct io_kiocb *req);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 39c3c81..d9fb214 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -47,7 +47,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
}
bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
- struct task_struct *task, bool cancel_all)
+ struct io_uring_task *tctx, bool cancel_all)
{
struct hlist_node *tmp;
struct io_kiocb *req;
@@ -61,7 +61,7 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
struct io_uring_cmd);
struct file *file = req->file;
- if (!cancel_all && req->task != task)
+ if (!cancel_all && req->tctx != tctx)
continue;
if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
@@ -119,9 +119,13 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+ unsigned int flags = IO_URING_F_COMPLETE_DEFER;
+
+ if (current->flags & (PF_EXITING | PF_KTHREAD))
+ flags |= IO_URING_F_TASK_DEAD;
/* task_work executor checks the deffered list completion */
- ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER);
+ ioucmd->task_work_cb(ioucmd, flags);
}
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
@@ -209,14 +213,18 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (ioucmd->flags & IORING_URING_CMD_FIXED) {
struct io_ring_ctx *ctx = req->ctx;
- u16 index;
+ struct io_rsrc_node *node;
+ u16 index = READ_ONCE(sqe->buf_index);
- req->buf_index = READ_ONCE(sqe->buf_index);
- if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+ node = io_rsrc_node_lookup(&ctx->buf_table, index);
+ if (unlikely(!node))
return -EFAULT;
- index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
- req->imu = ctx->user_bufs[index];
- io_req_set_rsrc_node(req, ctx, 0);
+ /*
+ * Pi node upfront, prior to io_uring_cmd_import_fixed()
+ * being called. This prevents destruction of the mapped buffer
+ * we'll need at actual import time.
+ */
+ io_req_assign_buf_node(req, node);
}
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
@@ -272,8 +280,13 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ struct io_rsrc_node *node = req->buf_node;
- return io_import_fixed(rw, iter, req->imu, ubuf, len);
+ /* Must have had rsrc_node assigned at prep time */
+ if (node)
+ return io_import_fixed(rw, iter, node->buf, ubuf, len);
+
+ return -EFAULT;
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index a361f98..7dba0f1 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -8,4 +8,4 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
- struct task_struct *task, bool cancel_all);
+ struct io_uring_task *tctx, bool cancel_all);
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index 6362ec20..daef5dd 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -184,7 +184,7 @@ int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
return -ENOENT;
}
-bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
bool cancel_all)
{
struct hlist_node *tmp;
@@ -194,7 +194,7 @@ bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
lockdep_assert_held(&ctx->uring_lock);
hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
- if (!io_match_task_safe(req, task, cancel_all))
+ if (!io_match_task_safe(req, tctx, cancel_all))
continue;
hlist_del_init(&req->hash_node);
__io_waitid_cancel(ctx, req);
@@ -331,7 +331,7 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
hlist_add_head(&req->hash_node, &ctx->waitid_list);
init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait);
- iwa->wo.child_wait.private = req->task;
+ iwa->wo.child_wait.private = req->tctx->task;
iw->head = ¤t->signal->wait_chldexit;
add_wait_queue(iw->head, &iwa->wo.child_wait);
diff --git a/io_uring/waitid.h b/io_uring/waitid.h
index 956a8ad..d5544aa 100644
--- a/io_uring/waitid.h
+++ b/io_uring/waitid.h
@@ -11,5 +11,5 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_waitid(struct io_kiocb *req, unsigned int issue_flags);
int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
unsigned int issue_flags);
-bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx,
bool cancel_all);
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index 6cf41c3..de5064f 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -18,7 +18,7 @@
struct io_xattr {
struct file *file;
- struct xattr_ctx ctx;
+ struct kernel_xattr_ctx ctx;
struct filename *filename;
};
@@ -48,13 +48,10 @@ static int __io_getxattr_prep(struct io_kiocb *req,
const char __user *name;
int ret;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
ix->filename = NULL;
ix->ctx.kvalue = NULL;
name = u64_to_user_ptr(READ_ONCE(sqe->addr));
- ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ ix->ctx.value = u64_to_user_ptr(READ_ONCE(sqe->addr2));
ix->ctx.size = READ_ONCE(sqe->len);
ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
@@ -65,11 +62,8 @@ static int __io_getxattr_prep(struct io_kiocb *req,
if (!ix->ctx.kname)
return -ENOMEM;
- ret = strncpy_from_user(ix->ctx.kname->name, name,
- sizeof(ix->ctx.kname->name));
- if (!ret || ret == sizeof(ix->ctx.kname->name))
- ret = -ERANGE;
- if (ret < 0) {
+ ret = import_xattr_name(ix->ctx.kname, name);
+ if (ret) {
kfree(ix->ctx.kname);
return ret;
}
@@ -90,19 +84,20 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
const char __user *path;
int ret;
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
ret = __io_getxattr_prep(req, sqe);
if (ret)
return ret;
path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
- ix->filename = getname_flags(path, LOOKUP_FOLLOW);
- if (IS_ERR(ix->filename)) {
- ret = PTR_ERR(ix->filename);
- ix->filename = NULL;
- }
+ ix->filename = getname(path);
+ if (IS_ERR(ix->filename))
+ return PTR_ERR(ix->filename);
- return ret;
+ return 0;
}
int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
@@ -112,10 +107,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
- ret = do_getxattr(file_mnt_idmap(req->file),
- req->file->f_path.dentry,
- &ix->ctx);
-
+ ret = file_getxattr(req->file, &ix->ctx);
io_xattr_finish(req, ret);
return IOU_OK;
}
@@ -123,24 +115,12 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
- unsigned int lookup_flags = LOOKUP_FOLLOW;
- struct path path;
int ret;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
-retry:
- ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
- if (!ret) {
- ret = do_getxattr(mnt_idmap(path.mnt), path.dentry, &ix->ctx);
-
- path_put(&path);
- if (retry_estale(ret, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- }
-
+ ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx);
+ ix->filename = NULL;
io_xattr_finish(req, ret);
return IOU_OK;
}
@@ -152,9 +132,6 @@ static int __io_setxattr_prep(struct io_kiocb *req,
const char __user *name;
int ret;
- if (unlikely(req->flags & REQ_F_FIXED_FILE))
- return -EBADF;
-
ix->filename = NULL;
name = u64_to_user_ptr(READ_ONCE(sqe->addr));
ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2));
@@ -183,19 +160,20 @@ int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
const char __user *path;
int ret;
+ if (unlikely(req->flags & REQ_F_FIXED_FILE))
+ return -EBADF;
+
ret = __io_setxattr_prep(req, sqe);
if (ret)
return ret;
path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
- ix->filename = getname_flags(path, LOOKUP_FOLLOW);
- if (IS_ERR(ix->filename)) {
- ret = PTR_ERR(ix->filename);
- ix->filename = NULL;
- }
+ ix->filename = getname(path);
+ if (IS_ERR(ix->filename))
+ return PTR_ERR(ix->filename);
- return ret;
+ return 0;
}
int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -203,28 +181,14 @@ int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_setxattr_prep(req, sqe);
}
-static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
- const struct path *path)
+int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
int ret;
- ret = mnt_want_write(path->mnt);
- if (!ret) {
- ret = do_setxattr(mnt_idmap(path->mnt), path->dentry, &ix->ctx);
- mnt_drop_write(path->mnt);
- }
-
- return ret;
-}
-
-int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
-{
- int ret;
-
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
- ret = __io_setxattr(req, issue_flags, &req->file->f_path);
+ ret = file_setxattr(req->file, &ix->ctx);
io_xattr_finish(req, ret);
return IOU_OK;
}
@@ -232,23 +196,12 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
- unsigned int lookup_flags = LOOKUP_FOLLOW;
- struct path path;
int ret;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
-retry:
- ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
- if (!ret) {
- ret = __io_setxattr(req, issue_flags, &path);
- path_put(&path);
- if (retry_estale(ret, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- }
-
+ ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx);
+ ix->filename = NULL;
io_xattr_finish(req, ret);
return IOU_OK;
}
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 34fa0bd..35b4f86 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1063,7 +1063,6 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
size_t msg_len, unsigned int msg_prio,
struct timespec64 *ts)
{
- struct fd f;
struct inode *inode;
struct ext_wait_queue wait;
struct ext_wait_queue *receiver;
@@ -1084,37 +1083,27 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts);
- f = fdget(mqdes);
- if (unlikely(!fd_file(f))) {
- ret = -EBADF;
- goto out;
- }
+ CLASS(fd, f)(mqdes);
+ if (fd_empty(f))
+ return -EBADF;
inode = file_inode(fd_file(f));
- if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
- ret = -EBADF;
- goto out_fput;
- }
+ if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
+ return -EBADF;
info = MQUEUE_I(inode);
audit_file(fd_file(f));
- if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE))) {
- ret = -EBADF;
- goto out_fput;
- }
+ if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE)))
+ return -EBADF;
- if (unlikely(msg_len > info->attr.mq_msgsize)) {
- ret = -EMSGSIZE;
- goto out_fput;
- }
+ if (unlikely(msg_len > info->attr.mq_msgsize))
+ return -EMSGSIZE;
/* First try to allocate memory, before doing anything with
* existing queues. */
msg_ptr = load_msg(u_msg_ptr, msg_len);
- if (IS_ERR(msg_ptr)) {
- ret = PTR_ERR(msg_ptr);
- goto out_fput;
- }
+ if (IS_ERR(msg_ptr))
+ return PTR_ERR(msg_ptr);
msg_ptr->m_ts = msg_len;
msg_ptr->m_type = msg_prio;
@@ -1172,9 +1161,6 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
out_free:
if (ret)
free_msg(msg_ptr);
-out_fput:
- fdput(f);
-out:
return ret;
}
@@ -1184,7 +1170,6 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
{
ssize_t ret;
struct msg_msg *msg_ptr;
- struct fd f;
struct inode *inode;
struct mqueue_inode_info *info;
struct ext_wait_queue wait;
@@ -1198,30 +1183,22 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
audit_mq_sendrecv(mqdes, msg_len, 0, ts);
- f = fdget(mqdes);
- if (unlikely(!fd_file(f))) {
- ret = -EBADF;
- goto out;
- }
+ CLASS(fd, f)(mqdes);
+ if (fd_empty(f))
+ return -EBADF;
inode = file_inode(fd_file(f));
- if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
- ret = -EBADF;
- goto out_fput;
- }
+ if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
+ return -EBADF;
info = MQUEUE_I(inode);
audit_file(fd_file(f));
- if (unlikely(!(fd_file(f)->f_mode & FMODE_READ))) {
- ret = -EBADF;
- goto out_fput;
- }
+ if (unlikely(!(fd_file(f)->f_mode & FMODE_READ)))
+ return -EBADF;
/* checks if buffer is big enough */
- if (unlikely(msg_len < info->attr.mq_msgsize)) {
- ret = -EMSGSIZE;
- goto out_fput;
- }
+ if (unlikely(msg_len < info->attr.mq_msgsize))
+ return -EMSGSIZE;
/*
* msg_insert really wants us to have a valid, spare node struct so
@@ -1275,9 +1252,6 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
}
free_msg(msg_ptr);
}
-out_fput:
- fdput(f);
-out:
return ret;
}
@@ -1317,7 +1291,6 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
int ret;
- struct fd f;
struct sock *sock;
struct inode *inode;
struct mqueue_inode_info *info;
@@ -1347,39 +1320,31 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
if (copy_from_user(nc->data,
notification->sigev_value.sival_ptr,
NOTIFY_COOKIE_LEN)) {
- ret = -EFAULT;
- goto free_skb;
+ kfree_skb(nc);
+ return -EFAULT;
}
/* TODO: add a header? */
skb_put(nc, NOTIFY_COOKIE_LEN);
/* and attach it to the socket */
retry:
- f = fdget(notification->sigev_signo);
- if (!fd_file(f)) {
- ret = -EBADF;
- goto out;
- }
- sock = netlink_getsockbyfilp(fd_file(f));
- fdput(f);
+ sock = netlink_getsockbyfd(notification->sigev_signo);
if (IS_ERR(sock)) {
- ret = PTR_ERR(sock);
- goto free_skb;
+ kfree_skb(nc);
+ return PTR_ERR(sock);
}
timeo = MAX_SCHEDULE_TIMEOUT;
ret = netlink_attachskb(sock, nc, &timeo, NULL);
- if (ret == 1) {
- sock = NULL;
+ if (ret == 1)
goto retry;
- }
if (ret)
return ret;
}
}
- f = fdget(mqdes);
- if (!fd_file(f)) {
+ CLASS(fd, f)(mqdes);
+ if (fd_empty(f)) {
ret = -EBADF;
goto out;
}
@@ -1387,7 +1352,7 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
inode = file_inode(fd_file(f));
if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
ret = -EBADF;
- goto out_fput;
+ goto out;
}
info = MQUEUE_I(inode);
@@ -1426,15 +1391,9 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
}
spin_unlock(&info->lock);
-out_fput:
- fdput(f);
out:
if (sock)
netlink_detachskb(sock, nc);
- else
-free_skb:
- dev_kfree_skb(nc);
-
return ret;
}
@@ -1452,21 +1411,18 @@ SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
{
- struct fd f;
struct inode *inode;
struct mqueue_inode_info *info;
if (new && (new->mq_flags & (~O_NONBLOCK)))
return -EINVAL;
- f = fdget(mqdes);
- if (!fd_file(f))
+ CLASS(fd, f)(mqdes);
+ if (fd_empty(f))
return -EBADF;
- if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
- fdput(f);
+ if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
return -EBADF;
- }
inode = file_inode(fd_file(f));
info = MQUEUE_I(inode);
@@ -1490,7 +1446,6 @@ static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
}
spin_unlock(&info->lock);
- fdput(f);
return 0;
}
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 6c34e63..4d111f8 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -97,7 +97,7 @@
config CRASH_DUMP
bool "kernel crash dumps"
- default y
+ default ARCH_DEFAULT_CRASH_DUMP
depends on ARCH_SUPPORTS_CRASH_DUMP
depends on KEXEC_CORE
select VMCORE_INFO
diff --git a/kernel/audit.c b/kernel/audit.c
index 1edaa48..6a95a60 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -123,7 +123,7 @@ static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
/* The identity of the user shutting down the audit system. */
static kuid_t audit_sig_uid = INVALID_UID;
static pid_t audit_sig_pid = -1;
-static u32 audit_sig_sid;
+static struct lsm_prop audit_sig_lsm;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -1473,20 +1473,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
}
case AUDIT_SIGNAL_INFO:
len = 0;
- if (audit_sig_sid) {
- err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
+ if (lsmprop_is_set(&audit_sig_lsm)) {
+ err = security_lsmprop_to_secctx(&audit_sig_lsm, &ctx,
+ &len);
if (err)
return err;
}
sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
- if (audit_sig_sid)
+ if (lsmprop_is_set(&audit_sig_lsm))
security_release_secctx(ctx, len);
return -ENOMEM;
}
sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
- if (audit_sig_sid) {
+ if (lsmprop_is_set(&audit_sig_lsm)) {
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
@@ -2102,8 +2103,8 @@ bool audit_string_contains_control(const char *string, size_t len)
/**
* audit_log_n_untrustedstring - log a string that may contain random characters
* @ab: audit_buffer
- * @len: length of string (not including trailing null)
* @string: string to be logged
+ * @len: length of string (not including trailing null)
*
* This code will escape a string that is passed to it if the string
* contains a control character, unprintable character, double quote mark,
@@ -2178,16 +2179,16 @@ void audit_log_key(struct audit_buffer *ab, char *key)
int audit_log_task_context(struct audit_buffer *ab)
{
+ struct lsm_prop prop;
char *ctx = NULL;
unsigned len;
int error;
- u32 sid;
- security_current_getsecid_subj(&sid);
- if (!sid)
+ security_current_getlsmprop_subj(&prop);
+ if (!lsmprop_is_set(&prop))
return 0;
- error = security_secid_to_secctx(sid, &ctx, &len);
+ error = security_lsmprop_to_secctx(&prop, &ctx, &len);
if (error) {
if (error != -EINVAL)
goto error_path;
@@ -2404,7 +2405,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_current_getsecid_subj(&audit_sig_sid);
+ security_current_getlsmprop_subj(&audit_sig_lsm);
}
return audit_signal_info_syscall(t);
diff --git a/kernel/audit.h b/kernel/audit.h
index a60d2840..0211cb3 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -11,6 +11,7 @@
#include <linux/fs.h>
#include <linux/audit.h>
+#include <linux/security.h>
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
#include <linux/tty.h>
@@ -81,7 +82,7 @@ struct audit_names {
kuid_t uid;
kgid_t gid;
dev_t rdev;
- u32 osid;
+ struct lsm_prop oprop;
struct audit_cap_data fcap;
unsigned int fcap_ver;
unsigned char type; /* record type */
@@ -143,7 +144,7 @@ struct audit_context {
kuid_t target_auid;
kuid_t target_uid;
unsigned int target_sessionid;
- u32 target_sid;
+ struct lsm_prop target_ref;
char target_comm[TASK_COMM_LEN];
struct audit_tree_refs *trees, *first_trees;
@@ -160,7 +161,7 @@ struct audit_context {
kuid_t uid;
kgid_t gid;
umode_t mode;
- u32 osid;
+ struct lsm_prop oprop;
int has_perm;
uid_t perm_uid;
gid_t perm_gid;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 470041c..bceb9f5 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1339,8 +1339,8 @@ int audit_filter(int msgtype, unsigned int listtype)
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
+ struct lsm_prop prop = { };
pid_t pid;
- u32 sid;
switch (f->type) {
case AUDIT_PID:
@@ -1370,9 +1370,10 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_current_getsecid_subj(&sid);
- result = security_audit_rule_match(sid,
- f->type, f->op, f->lsm_rule);
+ security_current_getlsmprop_subj(&prop);
+ result = security_audit_rule_match(
+ &prop, f->type, f->op,
+ f->lsm_rule);
}
break;
case AUDIT_EXE:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cd57053..91afdd0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -100,7 +100,7 @@ struct audit_aux_data_pids {
kuid_t target_auid[AUDIT_AUX_PIDS];
kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
- u32 target_sid[AUDIT_AUX_PIDS];
+ struct lsm_prop target_ref[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
int pid_count;
};
@@ -470,7 +470,7 @@ static int audit_filter_rules(struct task_struct *tsk,
{
const struct cred *cred;
int i, need_sid = 1;
- u32 sid;
+ struct lsm_prop prop = { };
unsigned int sessionid;
if (ctx && rule->prio <= ctx->prio)
@@ -674,14 +674,16 @@ static int audit_filter_rules(struct task_struct *tsk,
* fork()/copy_process() in which case
* the new @tsk creds are still a dup
* of @current's creds so we can still
- * use security_current_getsecid_subj()
+ * use
+ * security_current_getlsmprop_subj()
* here even though it always refs
* @current's creds
*/
- security_current_getsecid_subj(&sid);
+ security_current_getlsmprop_subj(&prop);
need_sid = 0;
}
- result = security_audit_rule_match(sid, f->type,
+ result = security_audit_rule_match(&prop,
+ f->type,
f->op,
f->lsm_rule);
}
@@ -697,14 +699,14 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find files that match */
if (name) {
result = security_audit_rule_match(
- name->osid,
+ &name->oprop,
f->type,
f->op,
f->lsm_rule);
} else if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
if (security_audit_rule_match(
- n->osid,
+ &n->oprop,
f->type,
f->op,
f->lsm_rule)) {
@@ -716,7 +718,7 @@ static int audit_filter_rules(struct task_struct *tsk,
/* Find ipc objects that match */
if (!ctx || ctx->type != AUDIT_IPC)
break;
- if (security_audit_rule_match(ctx->ipc.osid,
+ if (security_audit_rule_match(&ctx->ipc.oprop,
f->type, f->op,
f->lsm_rule))
++result;
@@ -1017,7 +1019,7 @@ static void audit_reset_context(struct audit_context *ctx)
ctx->target_pid = 0;
ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
ctx->target_sessionid = 0;
- ctx->target_sid = 0;
+ lsmprop_init(&ctx->target_ref);
ctx->target_comm[0] = '\0';
unroll_tree_refs(ctx, NULL, 0);
WARN_ON(!list_empty(&ctx->killed_trees));
@@ -1091,8 +1093,9 @@ static inline void audit_free_context(struct audit_context *context)
}
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
- kuid_t auid, kuid_t uid, unsigned int sessionid,
- u32 sid, char *comm)
+ kuid_t auid, kuid_t uid,
+ unsigned int sessionid, struct lsm_prop *prop,
+ char *comm)
{
struct audit_buffer *ab;
char *ctx = NULL;
@@ -1106,8 +1109,8 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid), sessionid);
- if (sid) {
- if (security_secid_to_secctx(sid, &ctx, &len)) {
+ if (lsmprop_is_set(prop)) {
+ if (security_lsmprop_to_secctx(prop, &ctx, &len)) {
audit_log_format(ab, " obj=(none)");
rc = 1;
} else {
@@ -1384,19 +1387,17 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, " a%d=%lx", i,
context->socketcall.args[i]);
break; }
- case AUDIT_IPC: {
- u32 osid = context->ipc.osid;
-
+ case AUDIT_IPC:
audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
from_kuid(&init_user_ns, context->ipc.uid),
from_kgid(&init_user_ns, context->ipc.gid),
context->ipc.mode);
- if (osid) {
+ if (lsmprop_is_set(&context->ipc.oprop)) {
char *ctx = NULL;
u32 len;
- if (security_secid_to_secctx(osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", osid);
+ if (security_lsmprop_to_secctx(&context->ipc.oprop,
+ &ctx, &len)) {
*call_panic = 1;
} else {
audit_log_format(ab, " obj=%s", ctx);
@@ -1416,7 +1417,7 @@ static void show_special(struct audit_context *context, int *call_panic)
context->ipc.perm_gid,
context->ipc.perm_mode);
}
- break; }
+ break;
case AUDIT_MQ_OPEN:
audit_log_format(ab,
"oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
@@ -1558,13 +1559,11 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
from_kgid(&init_user_ns, n->gid),
MAJOR(n->rdev),
MINOR(n->rdev));
- if (n->osid != 0) {
+ if (lsmprop_is_set(&n->oprop)) {
char *ctx = NULL;
u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
+ if (security_lsmprop_to_secctx(&n->oprop, &ctx, &len)) {
if (call_panic)
*call_panic = 2;
} else {
@@ -1653,8 +1652,8 @@ static void audit_log_uring(struct audit_context *ctx)
audit_log_format(ab, "uring_op=%d", ctx->uring_op);
if (ctx->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
- (ctx->return_valid == AUDITSC_SUCCESS ?
- "yes" : "no"),
+ str_yes_no(ctx->return_valid ==
+ AUDITSC_SUCCESS),
ctx->return_code);
audit_log_format(ab,
" items=%d"
@@ -1696,8 +1695,8 @@ static void audit_log_exit(void)
audit_log_format(ab, " per=%lx", context->personality);
if (context->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
- (context->return_valid == AUDITSC_SUCCESS ?
- "yes" : "no"),
+ str_yes_no(context->return_valid ==
+ AUDITSC_SUCCESS),
context->return_code);
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
@@ -1780,7 +1779,7 @@ static void audit_log_exit(void)
axs->target_auid[i],
axs->target_uid[i],
axs->target_sessionid[i],
- axs->target_sid[i],
+ &axs->target_ref[i],
axs->target_comm[i]))
call_panic = 1;
}
@@ -1789,7 +1788,7 @@ static void audit_log_exit(void)
audit_log_pid_context(context, context->target_pid,
context->target_auid, context->target_uid,
context->target_sessionid,
- context->target_sid, context->target_comm))
+ &context->target_ref, context->target_comm))
call_panic = 1;
if (context->pwd.dentry && context->pwd.mnt) {
@@ -2278,7 +2277,7 @@ static void audit_copy_inode(struct audit_names *name,
name->uid = inode->i_uid;
name->gid = inode->i_gid;
name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
+ security_inode_getlsmprop(inode, &name->oprop);
if (flags & AUDIT_INODE_NOEVAL) {
name->fcap_ver = -1;
return;
@@ -2632,7 +2631,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
context->ipc.gid = ipcp->gid;
context->ipc.mode = ipcp->mode;
context->ipc.has_perm = 0;
- security_ipc_getsecid(ipcp, &context->ipc.osid);
+ security_ipc_getlsmprop(ipcp, &context->ipc.oprop);
context->type = AUDIT_IPC;
}
@@ -2729,7 +2728,7 @@ void __audit_ptrace(struct task_struct *t)
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid_obj(t, &context->target_sid);
+ security_task_getlsmprop_obj(t, &context->target_ref);
memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
}
@@ -2756,7 +2755,7 @@ int audit_signal_info_syscall(struct task_struct *t)
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid_obj(t, &ctx->target_sid);
+ security_task_getlsmprop_obj(t, &ctx->target_ref);
memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
return 0;
}
@@ -2777,7 +2776,7 @@ int audit_signal_info_syscall(struct task_struct *t)
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
- security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]);
+ security_task_getlsmprop_obj(t, &axp->target_ref[axp->pid_count]);
memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
axp->pid_count++;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 29da6d3..e16e79f8 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -16,7 +16,6 @@
#include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
-#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index adf6dfe..1eb9852 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -16,7 +16,6 @@
#include <linux/filter.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
-#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5e77c58..e303626 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -21,7 +21,7 @@
#include <linux/filter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/objtool.h>
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 5af9e13..98d9b4c 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -5,7 +5,6 @@
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/fs.h>
-#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/btf_ids.h>
@@ -286,17 +285,14 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
curr_fd = 0;
}
- rcu_read_lock();
- f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
+ f = fget_task_next(curr_task, &curr_fd);
if (f) {
/* set info->fd */
info->fd = curr_fd;
- rcu_read_unlock();
return f;
}
/* the current task is done, go to the next task */
- rcu_read_unlock();
put_task_struct(curr_task);
if (info->common.type == BPF_TASK_ITER_TID) {
diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c
index dcbec1a..26057aa 100644
--- a/kernel/bpf/token.c
+++ b/kernel/bpf/token.c
@@ -1,6 +1,5 @@
#include <linux/bpf.h>
#include <linux/vmalloc.h>
-#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/kernel.h>
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 044c7ba..9bc4a84 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -6476,7 +6476,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
struct cgroup *dst_cgrp = NULL;
struct css_set *cset;
struct super_block *sb;
- struct file *f;
if (kargs->flags & CLONE_INTO_CGROUP)
cgroup_lock();
@@ -6493,14 +6492,14 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
return 0;
}
- f = fget_raw(kargs->cgroup);
- if (!f) {
+ CLASS(fd_raw, f)(kargs->cgroup);
+ if (fd_empty(f)) {
ret = -EBADF;
goto err;
}
- sb = f->f_path.dentry->d_sb;
+ sb = fd_file(f)->f_path.dentry->d_sb;
- dst_cgrp = cgroup_get_from_file(f);
+ dst_cgrp = cgroup_get_from_file(fd_file(f));
if (IS_ERR(dst_cgrp)) {
ret = PTR_ERR(dst_cgrp);
dst_cgrp = NULL;
@@ -6548,15 +6547,12 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
}
put_css_set(cset);
- fput(f);
kargs->cgrp = dst_cgrp;
return ret;
err:
cgroup_threadgroup_change_end(current);
cgroup_unlock();
- if (f)
- fput(f);
if (dst_cgrp)
cgroup_put(dst_cgrp);
put_css_set(cset);
@@ -6966,14 +6962,11 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
*/
struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
- struct cgroup *cgrp;
- struct fd f = fdget_raw(fd);
- if (!fd_file(f))
+ CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- cgrp = cgroup_v1v2_get_from_file(fd_file(f));
- fdput(f);
- return cgrp;
+ return cgroup_v1v2_get_from_file(fd_file(f));
}
/**
diff --git a/kernel/events/core.c b/kernel/events/core.c
index df27d08..b65446b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -966,22 +966,20 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
{
struct perf_cgroup *cgrp;
struct cgroup_subsys_state *css;
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int ret = 0;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
&perf_event_cgrp_subsys);
- if (IS_ERR(css)) {
- ret = PTR_ERR(css);
- goto out;
- }
+ if (IS_ERR(css))
+ return PTR_ERR(css);
ret = perf_cgroup_ensure_storage(event, css);
if (ret)
- goto out;
+ return ret;
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
@@ -995,8 +993,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
perf_detach_cgroup(event);
ret = -EINVAL;
}
-out:
- fdput(f);
return ret;
}
@@ -5998,18 +5994,9 @@ EXPORT_SYMBOL_GPL(perf_event_period);
static const struct file_operations perf_fops;
-static inline int perf_fget_light(int fd, struct fd *p)
+static inline bool is_perf_file(struct fd f)
{
- struct fd f = fdget(fd);
- if (!fd_file(f))
- return -EBADF;
-
- if (fd_file(f)->f_op != &perf_fops) {
- fdput(f);
- return -EBADF;
- }
- *p = f;
- return 0;
+ return !fd_empty(f) && fd_file(f)->f_op == &perf_fops;
}
static int perf_event_set_output(struct perf_event *event,
@@ -6057,20 +6044,14 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_OUTPUT:
{
- int ret;
+ CLASS(fd, output)(arg); // arg == -1 => empty
+ struct perf_event *output_event = NULL;
if (arg != -1) {
- struct perf_event *output_event;
- struct fd output;
- ret = perf_fget_light(arg, &output);
- if (ret)
- return ret;
+ if (!is_perf_file(output))
+ return -EBADF;
output_event = fd_file(output)->private_data;
- ret = perf_event_set_output(event, output_event);
- fdput(output);
- } else {
- ret = perf_event_set_output(event, NULL);
}
- return ret;
+ return perf_event_set_output(event, output_event);
}
case PERF_EVENT_IOC_SET_FILTER:
@@ -12664,7 +12645,6 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr attr;
struct perf_event_context *ctx;
struct file *event_file = NULL;
- struct fd group = EMPTY_FD;
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
@@ -12735,10 +12715,12 @@ SYSCALL_DEFINE5(perf_event_open,
if (event_fd < 0)
return event_fd;
+ CLASS(fd, group)(group_fd); // group_fd == -1 => empty
if (group_fd != -1) {
- err = perf_fget_light(group_fd, &group);
- if (err)
+ if (!is_perf_file(group)) {
+ err = -EBADF;
goto err_fd;
+ }
group_leader = fd_file(group)->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
@@ -12750,7 +12732,7 @@ SYSCALL_DEFINE5(perf_event_open,
task = find_lively_task_by_vpid(pid);
if (IS_ERR(task)) {
err = PTR_ERR(task);
- goto err_group_fd;
+ goto err_fd;
}
}
@@ -13017,12 +12999,11 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(¤t->perf_event_mutex);
/*
- * Drop the reference on the group_event after placing the
- * new event on the sibling_list. This ensures destruction
- * of the group leader will find the pointer to itself in
- * perf_group_detach().
+ * File reference in group guarantees that group_leader has been
+ * kept alive until we place the new event on the sibling_list.
+ * This ensures destruction of the group leader will find
+ * the pointer to itself in perf_group_detach().
*/
- fdput(group);
fd_install(event_fd, event_file);
return event_fd;
@@ -13041,8 +13022,6 @@ SYSCALL_DEFINE5(perf_event_open,
err_task:
if (task)
put_task_struct(task);
-err_group_fd:
- fdput(group);
err_fd:
put_unused_fd(event_fd);
return err;
diff --git a/kernel/exit.c b/kernel/exit.c
index 619f001..1dcddfe 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,7 +25,6 @@
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index b0639f2..2c59685 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -63,9 +63,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{
struct file *file;
- rcu_read_lock();
- file = task_lookup_fdget_rcu(task, idx);
- rcu_read_unlock();
+ file = fget_task(task, idx);
if (file)
fput(file);
diff --git a/kernel/module/dups.c b/kernel/module/dups.c
index 9a92f2f..bd2149f 100644
--- a/kernel/module/dups.c
+++ b/kernel/module/dups.c
@@ -18,7 +18,6 @@
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
diff --git a/kernel/module/kmod.c b/kernel/module/kmod.c
index 0800d98..25f2538 100644
--- a/kernel/module/kmod.c
+++ b/kernel/module/kmod.c
@@ -15,7 +15,6 @@
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 49b9bca..4490924 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3202,7 +3202,7 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int
{
struct idempotent idem;
- if (!f || !(f->f_mode & FMODE_READ))
+ if (!(f->f_mode & FMODE_READ))
return -EBADF;
/* Are we the winners of the race and get to do this? */
@@ -3219,10 +3219,7 @@ static int idempotent_init_module(struct file *f, const char __user * uargs, int
SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
{
- int err;
- struct fd f;
-
- err = may_init_module();
+ int err = may_init_module();
if (err)
return err;
@@ -3233,10 +3230,10 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
|MODULE_INIT_COMPRESSED_FILE))
return -EINVAL;
- f = fdget(fd);
- err = idempotent_init_module(fd_file(f), uargs, flags);
- fdput(f);
- return err;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+ return idempotent_init_module(fd_file(f), uargs, flags);
}
/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index dc952c3..c9d97ed 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -545,12 +545,12 @@ static void commit_nsset(struct nsset *nsset)
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
struct ns_common *ns = NULL;
struct nsset nsset = {};
int err = 0;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
if (proc_ns_file(fd_file(f))) {
@@ -580,7 +580,6 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
}
put_nsset(&nsset);
out:
- fdput(f);
return err;
}
diff --git a/kernel/padata.c b/kernel/padata.c
index d899f34..d51bbc7 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -521,13 +521,6 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
ps.chunk_size = max(ps.chunk_size, 1ul);
ps.chunk_size = roundup(ps.chunk_size, job->align);
- /*
- * chunk_size can be 0 if the caller sets min_chunk to 0. So force it
- * to at least 1 to prevent divide-by-0 panic in padata_mt_helper().`
- */
- if (!ps.chunk_size)
- ps.chunk_size = 1U;
-
list_for_each_entry(pw, &works, pw_list)
if (job->numa_aware) {
int old_node = atomic_read(&last_used_nid);
diff --git a/kernel/pid.c b/kernel/pid.c
index 2715afb..115448e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -536,11 +536,10 @@ EXPORT_SYMBOL_GPL(find_ge_pid);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
- struct fd f;
+ CLASS(fd, f)(fd);
struct pid *pid;
- f = fdget(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
pid = pidfd_pid(fd_file(f));
@@ -548,8 +547,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
get_pid(pid);
*flags = fd_file(f)->f_flags;
}
-
- fdput(f);
return pid;
}
@@ -747,23 +744,18 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
unsigned int, flags)
{
struct pid *pid;
- struct fd f;
- int ret;
/* flags is currently unused - make sure it's unset */
if (flags)
return -EINVAL;
- f = fdget(pidfd);
- if (!fd_file(f))
+ CLASS(fd, f)(pidfd);
+ if (fd_empty(f))
return -EBADF;
pid = pidfd_pid(fd_file(f));
if (IS_ERR(pid))
- ret = PTR_ERR(pid);
- else
- ret = pidfd_getfd(pid, fd);
+ return PTR_ERR(pid);
- fdput(f);
- return ret;
+ return pidfd_getfd(pid, fd);
}
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 927cc55..d07faf4 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
goto unlock;
dev->em_pd->flags |= flags;
+ dev->em_pd->min_perf_state = 0;
+ dev->em_pd->max_perf_state = nr_states - 1;
em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
@@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev)
return em_recalc_and_update(dev, pd, em_table);
}
EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
+
+
+/**
+ * em_update_performance_limits() - Update Energy Model with performance
+ * limits information.
+ * @pd : Performance Domain with EM that has to be updated.
+ * @freq_min_khz : New minimum allowed frequency for this device.
+ * @freq_max_khz : New maximum allowed frequency for this device.
+ *
+ * This function allows to update the EM with information about available
+ * performance levels. It takes the minimum and maximum frequency in kHz
+ * and does internal translation to performance levels.
+ * Returns 0 on success or -EINVAL when failed.
+ */
+int em_update_performance_limits(struct em_perf_domain *pd,
+ unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+ struct em_perf_state *table;
+ int min_ps = -1;
+ int max_ps = -1;
+ int i;
+
+ if (!pd)
+ return -EINVAL;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ if (freq_min_khz == table[i].frequency)
+ min_ps = i;
+ if (freq_max_khz == table[i].frequency)
+ max_ps = i;
+ }
+ rcu_read_unlock();
+
+ /* Only update when both are found and sane */
+ if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
+ return -EINVAL;
+
+
+ /* Guard simultaneous updates and make them atomic */
+ mutex_lock(&em_pd_mutex);
+ pd->min_perf_state = min_ps;
+ pd->max_perf_state = max_ps;
+ mutex_unlock(&em_pd_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(em_update_performance_limits);
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 44e83a6..d86d2d9 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -97,6 +97,7 @@ struct scf_statistics {
static struct scf_statistics *scf_stats_p;
static struct task_struct *scf_torture_stats_task;
static DEFINE_PER_CPU(long long, scf_invoked_count);
+static DEFINE_PER_CPU(struct llist_head, scf_free_pool);
// Data for random primitive selection
#define SCF_PRIM_RESCHED 0
@@ -133,6 +134,7 @@ struct scf_check {
bool scfc_wait;
bool scfc_rpc;
struct completion scfc_completion;
+ struct llist_node scf_node;
};
// Use to wait for all threads to start.
@@ -148,6 +150,33 @@ static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand);
extern void resched_cpu(int cpu); // An alternative IPI vector.
+static void scf_add_to_free_list(struct scf_check *scfcp)
+{
+ struct llist_head *pool;
+ unsigned int cpu;
+
+ if (!scfcp)
+ return;
+ cpu = raw_smp_processor_id() % nthreads;
+ pool = &per_cpu(scf_free_pool, cpu);
+ llist_add(&scfcp->scf_node, pool);
+}
+
+static void scf_cleanup_free_list(unsigned int cpu)
+{
+ struct llist_head *pool;
+ struct llist_node *node;
+ struct scf_check *scfcp;
+
+ pool = &per_cpu(scf_free_pool, cpu);
+ node = llist_del_all(pool);
+ while (node) {
+ scfcp = llist_entry(node, struct scf_check, scf_node);
+ node = node->next;
+ kfree(scfcp);
+ }
+}
+
// Print torture statistics. Caller must ensure serialization.
static void scf_torture_stats_print(void)
{
@@ -296,7 +325,7 @@ static void scf_handler(void *scfc_in)
if (scfcp->scfc_rpc)
complete(&scfcp->scfc_completion);
} else {
- kfree(scfcp);
+ scf_add_to_free_list(scfcp);
}
}
@@ -320,10 +349,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
struct scf_check *scfcp = NULL;
struct scf_selector *scfsp = scf_sel_rand(trsp);
- if (use_cpus_read_lock)
- cpus_read_lock();
- else
- preempt_disable();
if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) {
scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC);
if (!scfcp) {
@@ -337,6 +362,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
scfcp->scfc_rpc = false;
}
}
+ if (use_cpus_read_lock)
+ cpus_read_lock();
+ else
+ preempt_disable();
switch (scfsp->scfs_prim) {
case SCF_PRIM_RESCHED:
if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) {
@@ -363,7 +392,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
scfp->n_single_wait_ofl++;
else
scfp->n_single_ofl++;
- kfree(scfcp);
+ scf_add_to_free_list(scfcp);
scfcp = NULL;
}
break;
@@ -391,7 +420,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
preempt_disable();
} else {
scfp->n_single_rpc_ofl++;
- kfree(scfcp);
+ scf_add_to_free_list(scfcp);
scfcp = NULL;
}
break;
@@ -428,7 +457,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim);
atomic_inc(&n_mb_out_errs); // Leak rather than trash!
} else {
- kfree(scfcp);
+ scf_add_to_free_list(scfcp);
}
barrier(); // Prevent race-reduction compiler optimizations.
}
@@ -463,7 +492,7 @@ static int scftorture_invoker(void *arg)
// Make sure that the CPU is affinitized appropriately during testing.
curcpu = raw_smp_processor_id();
- WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids,
+ WARN_ONCE(curcpu != cpu,
"%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n",
__func__, scfp->cpu, curcpu, nr_cpu_ids);
@@ -479,6 +508,8 @@ static int scftorture_invoker(void *arg)
VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu);
do {
+ scf_cleanup_free_list(cpu);
+
scftorture_invoke_one(scfp, &rand);
while (cpu_is_offline(cpu) && !torture_must_stop()) {
schedule_timeout_interruptible(HZ / 5);
@@ -523,12 +554,15 @@ static void scf_torture_cleanup(void)
torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task);
else
goto end;
- smp_call_function(scf_cleanup_handler, NULL, 0);
+ smp_call_function(scf_cleanup_handler, NULL, 1);
torture_stop_kthread(scf_torture_stats, scf_torture_stats_task);
scf_torture_stats_print(); // -After- the stats thread is stopped!
kfree(scf_stats_p); // -After- the last stats print has completed!
scf_stats_p = NULL;
+ for (i = 0; i < nr_cpu_ids; i++)
+ scf_cleanup_free_list(i);
+
if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs))
scftorture_print_module_parms("End of test: FAILURE");
else if (torture_onoff_failures())
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 719e0ed..a1c353a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5920,12 +5920,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
#ifdef CONFIG_SCHED_CLASS_EXT
/*
- * SCX requires a balance() call before every pick_next_task() including
- * when waking up from SCHED_IDLE. If @start_class is below SCX, start
- * from SCX instead.
+ * SCX requires a balance() call before every pick_task() including when
+ * waking up from SCHED_IDLE. If @start_class is below SCX, start from
+ * SCX instead. Also, set a flag to detect missing balance() call.
*/
- if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
- start_class = &ext_sched_class;
+ if (scx_enabled()) {
+ rq->scx.flags |= SCX_RQ_BAL_PENDING;
+ if (sched_class_above(&ext_sched_class, start_class))
+ start_class = &ext_sched_class;
+ }
#endif
/*
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index c6ba153..28c7790 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -783,9 +783,8 @@ static int sugov_init(struct cpufreq_policy *policy)
if (ret)
goto fail;
- sugov_eas_rebuild_sd();
-
out:
+ sugov_eas_rebuild_sd();
mutex_unlock(&global_tunables_lock);
return 0;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b5f4b1a..751d73d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2634,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE;
- rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
+ rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
unlikely(rq->scx.cpu_released)) {
@@ -2645,7 +2645,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* emitted in scx_next_task_picked().
*/
if (SCX_HAS_OP(cpu_acquire))
- SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL);
+ SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
rq->scx.cpu_released = false;
}
@@ -2948,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
{
struct task_struct *prev = rq->curr;
struct task_struct *p;
+ bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+ bool kick_idle = false;
/*
- * If balance_scx() is telling us to keep running @prev, replenish slice
- * if necessary and keep running @prev. Otherwise, pop the first one
- * from the local DSQ.
- *
* WORKAROUND:
*
* %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@@ -2962,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* which then ends up calling pick_task_scx() without preceding
* balance_scx().
*
- * For now, ignore cases where $prev is not on SCX. This isn't great and
- * can theoretically lead to stalls. However, for switch_all cases, this
- * happens only while a BPF scheduler is being loaded or unloaded, and,
- * for partial cases, fair will likely keep triggering this CPU.
+ * Keep running @prev if possible and avoid stalling from entering idle
+ * without balancing.
*
- * Once fair is fixed, restore WARN_ON_ONCE().
+ * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
+ * if pick_task_scx() is called without preceding balance_scx().
*/
- if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
- prev->sched_class == &ext_sched_class) {
+ if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
+ if (prev_on_scx) {
+ keep_prev = true;
+ } else {
+ keep_prev = false;
+ kick_idle = true;
+ }
+ } else if (unlikely(keep_prev && !prev_on_scx)) {
+ /* only allowed during transitions */
+ WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
+ keep_prev = false;
+ }
+
+ /*
+ * If balance_scx() is telling us to keep running @prev, replenish slice
+ * if necessary and keep running @prev. Otherwise, pop the first one
+ * from the local DSQ.
+ */
+ if (keep_prev) {
p = prev;
if (!p->scx.slice)
p->scx.slice = SCX_SLICE_DFL;
} else {
p = first_local_task(rq);
- if (!p)
+ if (!p) {
+ if (kick_idle)
+ scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
return NULL;
+ }
if (unlikely(!p->scx.slice)) {
if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
@@ -4979,7 +4997,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
- pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
return -EINVAL;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6c54a57..c03b3d7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -751,8 +751,9 @@ enum scx_rq_flags {
*/
SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1,
- SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */
- SCX_RQ_BYPASSING = 1 << 3,
+ SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
+ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
+ SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17,
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 24f9f90..f9bed5e 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -1081,45 +1081,6 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
}
-/*
- * Copy the kernel size attribute structure (which might be larger
- * than what user-space knows about) to user-space.
- *
- * Note that all cases are valid: user-space buffer can be larger or
- * smaller than the kernel-space buffer. The usual case is that both
- * have the same size.
- */
-static int
-sched_attr_copy_to_user(struct sched_attr __user *uattr,
- struct sched_attr *kattr,
- unsigned int usize)
-{
- unsigned int ksize = sizeof(*kattr);
-
- if (!access_ok(uattr, usize))
- return -EFAULT;
-
- /*
- * sched_getattr() ABI forwards and backwards compatibility:
- *
- * If usize == ksize then we just copy everything to user-space and all is good.
- *
- * If usize < ksize then we only copy as much as user-space has space for,
- * this keeps ABI compatibility as well. We skip the rest.
- *
- * If usize > ksize then user-space is using a newer version of the ABI,
- * which part the kernel doesn't know about. Just ignore it - tooling can
- * detect the kernel's knowledge of attributes from the attr->size value
- * which is set to ksize in this case.
- */
- kattr->size = min(usize, ksize);
-
- if (copy_to_user(uattr, kattr, kattr->size))
- return -EFAULT;
-
- return 0;
-}
-
/**
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
@@ -1164,7 +1125,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
#endif
}
- return sched_attr_copy_to_user(uattr, &kattr, usize);
+ kattr.size = min(usize, sizeof(kattr));
+ return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL);
}
#ifdef CONFIG_SMP
diff --git a/kernel/signal.c b/kernel/signal.c
index cbabb2d..65fd233 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3909,7 +3909,6 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
siginfo_t __user *, info, unsigned int, flags)
{
int ret;
- struct fd f;
struct pid *pid;
kernel_siginfo_t kinfo;
enum pid_type type;
@@ -3922,20 +3921,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
return -EINVAL;
- f = fdget(pidfd);
- if (!fd_file(f))
+ CLASS(fd, f)(pidfd);
+ if (fd_empty(f))
return -EBADF;
/* Is this a pidfd? */
pid = pidfd_to_pid(fd_file(f));
- if (IS_ERR(pid)) {
- ret = PTR_ERR(pid);
- goto err;
- }
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
- ret = -EINVAL;
if (!access_pidfd_pidns(pid))
- goto err;
+ return -EINVAL;
switch (flags) {
case 0:
@@ -3959,28 +3955,23 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (info) {
ret = copy_siginfo_from_user_any(&kinfo, info);
if (unlikely(ret))
- goto err;
+ return ret;
- ret = -EINVAL;
if (unlikely(sig != kinfo.si_signo))
- goto err;
+ return -EINVAL;
/* Only allow sending arbitrary signals to yourself. */
- ret = -EPERM;
if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
(kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
- goto err;
+ return -EPERM;
} else {
prepare_kill_siginfo(sig, &kinfo, type);
}
if (type == PIDTYPE_PGID)
- ret = kill_pgrp_info(sig, &kinfo, pid);
+ return kill_pgrp_info(sig, &kinfo, pid);
else
- ret = kill_pid_info_type(sig, &kinfo, pid, type);
-err:
- fdput(f);
- return ret;
+ return kill_pid_info_type(sig, &kinfo, pid, type);
}
static int
diff --git a/kernel/smp.c b/kernel/smp.c
index f25e206..27dc31a 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -246,7 +246,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in
return true;
}
- ts2 = sched_clock();
+ ts2 = ktime_get_mono_fast_ns();
/* How long since we last checked for a stuck CSD lock.*/
ts_delta = ts2 - *ts1;
if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) *
@@ -321,7 +321,7 @@ static void __csd_lock_wait(call_single_data_t *csd)
int bug_id = 0;
u64 ts0, ts1;
- ts1 = ts0 = sched_clock();
+ ts1 = ts0 = ktime_get_mono_fast_ns();
for (;;) {
if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages))
break;
diff --git a/kernel/sys.c b/kernel/sys.c
index 4da31f2..c4c701c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1911,12 +1911,11 @@ SYSCALL_DEFINE1(umask, int, mask)
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
- struct fd exe;
+ CLASS(fd, exe)(fd);
struct inode *inode;
int err;
- exe = fdget(fd);
- if (!fd_file(exe))
+ if (fd_empty(exe))
return -EBADF;
inode = file_inode(fd_file(exe));
@@ -1926,18 +1925,14 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
* sure that this one is executable as well, to avoid breaking an
* overall picture.
*/
- err = -EACCES;
if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path))
- goto exit;
+ return -EACCES;
err = file_permission(fd_file(exe), MAY_EXEC);
if (err)
- goto exit;
+ return err;
- err = replace_mm_exe_file(mm, fd_file(exe));
-exit:
- fdput(exe);
- return err;
+ return replace_mm_exe_file(mm, fd_file(exe));
}
/*
@@ -2324,6 +2319,21 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
return -EINVAL;
}
+int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status)
+{
+ return -EINVAL;
+}
+
+int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status)
+{
+ return -EINVAL;
+}
+
+int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status)
+{
+ return -EINVAL;
+}
+
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
#ifdef CONFIG_ANON_VMA_NAME
@@ -2784,6 +2794,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_RISCV_SET_ICACHE_FLUSH_CTX:
error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
break;
+ case PR_GET_SHADOW_STACK_STATUS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2);
+ break;
+ case PR_SET_SHADOW_STACK_STATUS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_set_shadow_stack_status(me, arg2);
+ break;
+ case PR_LOCK_SHADOW_STACK_STATUS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_lock_shadow_stack_status(me, arg2);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 0700f40..0cd680c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -411,15 +411,14 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
struct nlattr *na;
size_t size;
u32 fd;
- struct fd f;
na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
if (!na)
return -EINVAL;
fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return 0;
size = nla_total_size(sizeof(struct cgroupstats));
@@ -427,14 +426,13 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
size);
if (rc < 0)
- goto err;
+ return rc;
na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
sizeof(struct cgroupstats));
if (na == NULL) {
nlmsg_free(rep_skb);
- rc = -EMSGSIZE;
- goto err;
+ return -EMSGSIZE;
}
stats = nla_data(na);
@@ -443,14 +441,10 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
rc = cgroupstats_build(stats, fd_file(f)->f_path.dentry);
if (rc < 0) {
nlmsg_free(rep_skb);
- goto err;
+ return rc;
}
- rc = send_reply(rep_skb, info);
-
-err:
- fdput(f);
- return rc;
+ return send_reply(rep_skb, info);
}
static int cmd_attr_register_cpumask(struct genl_info *info)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7e6f409..962b2a31f 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -114,6 +114,23 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = {
.base[1] = FAST_TK_INIT,
};
+/*
+ * Multigrain timestamps require tracking the latest fine-grained timestamp
+ * that has been issued, and never returning a coarse-grained timestamp that is
+ * earlier than that value.
+ *
+ * mg_floor represents the latest fine-grained time that has been handed out as
+ * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
+ * converted to a realtime clock value on an as-needed basis.
+ *
+ * Maintaining mg_floor ensures the multigrain interfaces never issue a
+ * timestamp earlier than one that has been previously issued.
+ *
+ * The exception to this rule is when there is a backward realtime clock jump. If
+ * such an event occurs, a timestamp can appear to be earlier than a previous one.
+ */
+static __cacheline_aligned_in_smp atomic64_t mg_floor;
+
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -2394,6 +2411,94 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
+/**
+ * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
+ * @ts: timespec64 to be filled
+ *
+ * Fetch the global mg_floor value, convert it to realtime and compare it
+ * to the current coarse-grained time. Fill @ts with whichever is
+ * latest. Note that this is a filesystem-specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ u64 floor = atomic64_read(&mg_floor);
+ ktime_t f_real, offset, coarse;
+ unsigned int seq;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ *ts = tk_xtime(tk);
+ offset = tk_core.timekeeper.offs_real;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ coarse = timespec64_to_ktime(*ts);
+ f_real = ktime_add(floor, offset);
+ if (ktime_after(f_real, coarse))
+ *ts = ktime_to_timespec64(f_real);
+}
+
+/**
+ * ktime_get_real_ts64_mg - attempt to update floor value and return result
+ * @ts: pointer to the timespec to be set
+ *
+ * Get a monotonic fine-grained time value and attempt to swap it into
+ * mg_floor. If that succeeds then accept the new floor value. If it fails
+ * then another task raced in during the interim time and updated the
+ * floor. Since any update to the floor must be later than the previous
+ * floor, either outcome is acceptable.
+ *
+ * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
+ * and determining that the resulting coarse-grained timestamp did not effect
+ * a change in ctime. Any more recent floor value would effect a change to
+ * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
+ *
+ * @ts will be filled with the latest floor value, regardless of the outcome of
+ * the cmpxchg. Note that this is a filesystem specific interface and should be
+ * avoided outside of that context.
+ */
+void ktime_get_real_ts64_mg(struct timespec64 *ts)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ ktime_t old = atomic64_read(&mg_floor);
+ ktime_t offset, mono;
+ unsigned int seq;
+ u64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ ts->tv_sec = tk->xtime_sec;
+ mono = tk->tkr_mono.base;
+ nsecs = timekeeping_get_ns(&tk->tkr_mono);
+ offset = tk_core.timekeeper.offs_real;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ mono = ktime_add_ns(mono, nsecs);
+
+ /*
+ * Attempt to update the floor with the new time value. As any
+ * update must be later then the existing floor, and would effect
+ * a change to ctime from the perspective of the current task,
+ * accept the resulting floor value regardless of the outcome of
+ * the swap.
+ */
+ if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
+ timekeeping_inc_mg_floor_swaps();
+ } else {
+ /*
+ * Another task changed mg_floor since "old" was fetched.
+ * "old" has been updated with the latest value of "mg_floor".
+ * That value is newer than the previous floor value, which
+ * is enough to effect a change to ctime. Accept it.
+ */
+ *ts = ktime_to_timespec64(ktime_add(old, offset));
+ }
+}
+
void ktime_get_coarse_ts64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index b73e885..badeb22 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -17,6 +17,9 @@
#define NUM_BINS 32
+/* Incremented every time mg_floor is updated */
+DEFINE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
static unsigned int sleep_time_bin[NUM_BINS] = {0};
static int tk_debug_sleep_time_show(struct seq_file *s, void *data)
@@ -53,3 +56,13 @@ void tk_debug_account_sleep_time(const struct timespec64 *t)
(s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
+unsigned long timekeeping_get_mg_floor_swaps(void)
+{
+ unsigned long sum = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ sum += data_race(per_cpu(timekeeping_mg_floor_swaps, cpu));
+
+ return sum;
+}
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 4ca2787..0bbae82 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -10,9 +10,24 @@
* timekeeping debug functions
*/
#ifdef CONFIG_DEBUG_FS
+
+DECLARE_PER_CPU(unsigned long, timekeeping_mg_floor_swaps);
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+ this_cpu_inc(timekeeping_mg_floor_swaps);
+}
+
extern void tk_debug_account_sleep_time(const struct timespec64 *t);
+
#else
+
#define tk_debug_account_sleep_time(x)
+
+static inline void timekeeping_inc_mg_floor_swaps(void)
+{
+}
+
#endif
#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3ea4f7b..5807116 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2337,12 +2337,9 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
if (!buffer->buffers[cpu])
goto fail_free_buffers;
- /* If already mapped, do not hook to CPU hotplug */
- if (!start) {
- ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
- if (ret < 0)
- goto fail_free_buffers;
- }
+ ret = cpuhp_state_add_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
+ if (ret < 0)
+ goto fail_free_buffers;
mutex_init(&buffer->mutex);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2b64b3e..6a891e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2386,6 +2386,25 @@ void tracing_reset_online_cpus(struct array_buffer *buf)
ring_buffer_record_enable(buffer);
}
+static void tracing_reset_all_cpus(struct array_buffer *buf)
+{
+ struct trace_buffer *buffer = buf->buffer;
+
+ if (!buffer)
+ return;
+
+ ring_buffer_record_disable(buffer);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buf->time_start = buffer_ftrace_now(buf, buf->cpu);
+
+ ring_buffer_reset(buffer);
+
+ ring_buffer_record_enable(buffer);
+}
+
/* Must have trace_types_lock held */
void tracing_reset_all_online_cpus_unlocked(void)
{
@@ -6145,8 +6164,13 @@ static void update_last_data(struct trace_array *tr)
if (!tr->text_delta && !tr->data_delta)
return;
- /* Clear old data */
- tracing_reset_online_cpus(&tr->array_buffer);
+ /*
+ * Need to clear all CPU buffers as there cannot be events
+ * from the previous boot mixed with events with this boot
+ * as that will cause a confusing trace. Need to clear all
+ * CPU buffers, even for those that may currently be offline.
+ */
+ tracing_reset_all_cpus(&tr->array_buffer);
/* Using current data now */
tr->text_delta = 0;
diff --git a/kernel/umh.c b/kernel/umh.c
index ff1f13a..be92342 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -13,7 +13,6 @@
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/fs_struct.h>
#include <linux/workqueue.h>
#include <linux/security.h>
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index d36242f..1895fbc 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -663,16 +663,14 @@ struct watch_queue *get_watch_queue(int fd)
{
struct pipe_inode_info *pipe;
struct watch_queue *wqueue = ERR_PTR(-EINVAL);
- struct fd f;
+ CLASS(fd, f)(fd);
- f = fdget(fd);
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
pipe = get_pipe_info(fd_file(f), false);
if (pipe && pipe->watch_queue) {
wqueue = pipe->watch_queue;
kref_get(&wqueue->usage);
}
- fdput(f);
}
return wqueue;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 7312ae7..fcad505 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1905,7 +1905,7 @@
bool "Filter access to /dev/mem"
depends on MMU && DEVMEM
depends on ARCH_HAS_DEVMEM_IS_ALLOWED || GENERIC_LIB_DEVMEM_IS_ALLOWED
- default y if PPC || X86 || ARM64
+ default y if PPC || X86 || ARM64 || S390
help
If this option is disabled, you allow userspace (root) access to all
of memory, including kernel and userspace memory. Accidental
diff --git a/lib/crc32.c b/lib/crc32.c
index 5649847d..ff587fe 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -205,7 +205,11 @@ EXPORT_SYMBOL(crc32_le);
EXPORT_SYMBOL(__crc32c_le);
u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le);
+EXPORT_SYMBOL(crc32_le_base);
+
u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le);
+EXPORT_SYMBOL(__crc32c_le_base);
+
u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be);
/*
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 969baab..01fac1c 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -58,3 +58,5 @@
endif
obj-$(CONFIG_MPILIB) += mpi/
+
+obj-$(CONFIG_CRYPTO_MANAGER_EXTRA_TESTS) += simd.o
diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c
index 835a2f0..934d813 100644
--- a/lib/crypto/mpi/mpi-bit.c
+++ b/lib/crypto/mpi/mpi-bit.c
@@ -95,6 +95,7 @@ int mpi_set_bit(MPI a, unsigned int n)
a->d[limbno] |= (A_LIMB_1<<bitno);
return 0;
}
+EXPORT_SYMBOL_GPL(mpi_set_bit);
/*
* Shift A by N bits to the right.
diff --git a/lib/crypto/simd.c b/lib/crypto/simd.c
new file mode 100644
index 0000000..9c36cb3
--- /dev/null
+++ b/lib/crypto/simd.c
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SIMD testing utility functions
+ *
+ * Copyright 2024 Google LLC
+ */
+
+#include <crypto/internal/simd.h>
+
+DEFINE_PER_CPU(bool, crypto_simd_disabled_for_test);
+EXPORT_PER_CPU_SYMBOL_GPL(crypto_simd_disabled_for_test);
diff --git a/lib/interval_tree_test.c b/lib/interval_tree_test.c
index f37f4d4..837064b 100644
--- a/lib/interval_tree_test.c
+++ b/lib/interval_tree_test.c
@@ -2,7 +2,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/interval_tree.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/slab.h>
#include <asm/timex.h>
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 908e75a..9ec806f 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1682,8 +1682,8 @@ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
}
/*
- * Extract a list of contiguous pages from an ITER_BVEC iterator. This does
- * not get references on the pages, nor does it get a pin on them.
+ * Extract a list of virtually contiguous pages from an ITER_BVEC iterator.
+ * This does not get references on the pages, nor does it get a pin on them.
*/
static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
struct page ***pages, size_t maxsize,
@@ -1691,35 +1691,59 @@ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
iov_iter_extraction_t extraction_flags,
size_t *offset0)
{
- struct page **p, *page;
- size_t skip = i->iov_offset, offset, size;
- int k;
+ size_t skip = i->iov_offset, size = 0;
+ struct bvec_iter bi;
+ int k = 0;
- for (;;) {
- if (i->nr_segs == 0)
- return 0;
- size = min(maxsize, i->bvec->bv_len - skip);
- if (size)
- break;
+ if (i->nr_segs == 0)
+ return 0;
+
+ if (i->iov_offset == i->bvec->bv_len) {
i->iov_offset = 0;
i->nr_segs--;
i->bvec++;
skip = 0;
}
+ bi.bi_idx = 0;
+ bi.bi_size = maxsize;
+ bi.bi_bvec_done = skip;
- skip += i->bvec->bv_offset;
- page = i->bvec->bv_page + skip / PAGE_SIZE;
- offset = skip % PAGE_SIZE;
- *offset0 = offset;
+ maxpages = want_pages_array(pages, maxsize, skip, maxpages);
- maxpages = want_pages_array(pages, size, offset, maxpages);
- if (!maxpages)
- return -ENOMEM;
- p = *pages;
- for (k = 0; k < maxpages; k++)
- p[k] = page + k;
+ while (bi.bi_size && bi.bi_idx < i->nr_segs) {
+ struct bio_vec bv = bvec_iter_bvec(i->bvec, bi);
- size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
+ /*
+ * The iov_iter_extract_pages interface only allows an offset
+ * into the first page. Break out of the loop if we see an
+ * offset into subsequent pages, the caller will have to call
+ * iov_iter_extract_pages again for the reminder.
+ */
+ if (k) {
+ if (bv.bv_offset)
+ break;
+ } else {
+ *offset0 = bv.bv_offset;
+ }
+
+ (*pages)[k++] = bv.bv_page;
+ size += bv.bv_len;
+
+ if (k >= maxpages)
+ break;
+
+ /*
+ * We are done when the end of the bvec doesn't align to a page
+ * boundary as that would create a hole in the returned space.
+ * The caller will handle this with another call to
+ * iov_iter_extract_pages.
+ */
+ if (bv.bv_offset + bv.bv_len != PAGE_SIZE)
+ break;
+
+ bvec_iter_advance_single(i->bvec, &bi, bv.bv_len);
+ }
+
iov_iter_advance(i, size);
return size;
}
diff --git a/lib/kunit/string-stream-test.c b/lib/kunit/string-stream-test.c
index 7511442e..7734e33 100644
--- a/lib/kunit/string-stream-test.c
+++ b/lib/kunit/string-stream-test.c
@@ -9,6 +9,7 @@
#include <kunit/static_stub.h>
#include <kunit/test.h>
#include <linux/ktime.h>
+#include <linux/prandom.h>
#include <linux/slab.h>
#include <linux/timekeeping.h>
diff --git a/lib/random32.c b/lib/random32.c
index 0a5a0e3..24e7acd 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -36,7 +36,7 @@
#include <linux/percpu.h>
#include <linux/export.h>
#include <linux/jiffies.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/slab.h>
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
index 41ae3c7..8655a76 100644
--- a/lib/rbtree_test.c
+++ b/lib/rbtree_test.c
@@ -2,7 +2,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/rbtree_augmented.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/slab.h>
#include <asm/timex.h>
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index fa5edd6..2eed1ad 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -14,7 +14,7 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/highmem.h>
#include <linux/sched.h>
diff --git a/lib/test_parman.c b/lib/test_parman.c
index 35e3224..f9b9742 100644
--- a/lib/test_parman.c
+++ b/lib/test_parman.c
@@ -39,7 +39,7 @@
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/err.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/parman.h>
#define TEST_PARMAN_PRIO_SHIFT 7 /* defines number of prios for testing */
diff --git a/lib/test_scanf.c b/lib/test_scanf.c
index 7257b176..44f8508 100644
--- a/lib/test_scanf.c
+++ b/lib/test_scanf.c
@@ -11,7 +11,7 @@
#include <linux/module.h>
#include <linux/overflow.h>
#include <linux/printk.h>
-#include <linux/random.h>
+#include <linux/prandom.h>
#include <linux/slab.h>
#include <linux/string.h>
diff --git a/mm/Kconfig b/mm/Kconfig
index 33fa51d..84000b0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1295,6 +1295,12 @@
into virtual nodes when booted with "numa=fake=N", where N is the
number of nodes. This is only useful for debugging.
+config ARCH_HAS_USER_SHADOW_STACK
+ bool
+ help
+ The architecture has hardware support for userspace shadow call
+ stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss).
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 532dee2..588fe76 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -190,16 +190,12 @@ EXPORT_SYMBOL(vfs_fadvise);
int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
- struct fd f = fdget(fd);
- int ret;
+ CLASS(fd, f)(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- ret = vfs_fadvise(fd_file(f), offset, len, advice);
-
- fdput(f);
- return ret;
+ return vfs_fadvise(fd_file(f), offset, len, advice);
}
SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
diff --git a/mm/filemap.c b/mm/filemap.c
index 56fa431..196779e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2620,6 +2620,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
loff_t isize, end_offset;
loff_t last_pos = ra->prev_pos;
+ if (unlikely(iocb->ki_pos < 0))
+ return -EINVAL;
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
if (unlikely(!iov_iter_count(iter)))
@@ -4421,31 +4423,25 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
struct cachestat_range __user *, cstat_range,
struct cachestat __user *, cstat, unsigned int, flags)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
struct address_space *mapping;
struct cachestat_range csr;
struct cachestat cs;
pgoff_t first_index, last_index;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
if (copy_from_user(&csr, cstat_range,
- sizeof(struct cachestat_range))) {
- fdput(f);
+ sizeof(struct cachestat_range)))
return -EFAULT;
- }
/* hugetlbfs is not supported */
- if (is_file_hugepages(fd_file(f))) {
- fdput(f);
+ if (is_file_hugepages(fd_file(f)))
return -EOPNOTSUPP;
- }
- if (flags != 0) {
- fdput(f);
+ if (flags != 0)
return -EINVAL;
- }
first_index = csr.off >> PAGE_SHIFT;
last_index =
@@ -4453,7 +4449,6 @@ SYSCALL_DEFINE4(cachestat, unsigned int, fd,
memset(&cs, 0, sizeof(struct cachestat));
mapping = fd_file(f)->f_mapping;
filemap_cachestat(mapping, first_index, last_index, &cs);
- fdput(f);
if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
return -EFAULT;
diff --git a/mm/gup.c b/mm/gup.c
index 4637dab..ad0c892 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2273,20 +2273,57 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */
#ifdef CONFIG_MIGRATION
+
+/*
+ * An array of either pages or folios ("pofs"). Although it may seem tempting to
+ * avoid this complication, by simply interpreting a list of folios as a list of
+ * pages, that approach won't work in the longer term, because eventually the
+ * layouts of struct page and struct folio will become completely different.
+ * Furthermore, this pof approach avoids excessive page_folio() calls.
+ */
+struct pages_or_folios {
+ union {
+ struct page **pages;
+ struct folio **folios;
+ void **entries;
+ };
+ bool has_folios;
+ long nr_entries;
+};
+
+static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
+{
+ if (pofs->has_folios)
+ return pofs->folios[i];
+ return page_folio(pofs->pages[i]);
+}
+
+static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
+{
+ pofs->entries[i] = NULL;
+}
+
+static void pofs_unpin(struct pages_or_folios *pofs)
+{
+ if (pofs->has_folios)
+ unpin_folios(pofs->folios, pofs->nr_entries);
+ else
+ unpin_user_pages(pofs->pages, pofs->nr_entries);
+}
+
/*
* Returns the number of collected folios. Return value is always >= 0.
*/
static unsigned long collect_longterm_unpinnable_folios(
- struct list_head *movable_folio_list,
- unsigned long nr_folios,
- struct folio **folios)
+ struct list_head *movable_folio_list,
+ struct pages_or_folios *pofs)
{
unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
bool drain_allow = true;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = folios[i];
+ for (i = 0; i < pofs->nr_entries; i++) {
+ struct folio *folio = pofs_get_folio(pofs, i);
if (folio == prev_folio)
continue;
@@ -2327,16 +2364,15 @@ static unsigned long collect_longterm_unpinnable_folios(
* Returns -EAGAIN if all folios were successfully migrated or -errno for
* failure (or partial success).
*/
-static int migrate_longterm_unpinnable_folios(
- struct list_head *movable_folio_list,
- unsigned long nr_folios,
- struct folio **folios)
+static int
+migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list,
+ struct pages_or_folios *pofs)
{
int ret;
unsigned long i;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = folios[i];
+ for (i = 0; i < pofs->nr_entries; i++) {
+ struct folio *folio = pofs_get_folio(pofs, i);
if (folio_is_device_coherent(folio)) {
/*
@@ -2344,7 +2380,7 @@ static int migrate_longterm_unpinnable_folios(
* convert the pin on the source folio to a normal
* reference.
*/
- folios[i] = NULL;
+ pofs_clear_entry(pofs, i);
folio_get(folio);
gup_put_folio(folio, 1, FOLL_PIN);
@@ -2363,8 +2399,8 @@ static int migrate_longterm_unpinnable_folios(
* calling folio_isolate_lru() which takes a reference so the
* folio won't be freed if it's migrating.
*/
- unpin_folio(folios[i]);
- folios[i] = NULL;
+ unpin_folio(folio);
+ pofs_clear_entry(pofs, i);
}
if (!list_empty(movable_folio_list)) {
@@ -2387,12 +2423,26 @@ static int migrate_longterm_unpinnable_folios(
return -EAGAIN;
err:
- unpin_folios(folios, nr_folios);
+ pofs_unpin(pofs);
putback_movable_pages(movable_folio_list);
return ret;
}
+static long
+check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
+{
+ LIST_HEAD(movable_folio_list);
+ unsigned long collected;
+
+ collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+ pofs);
+ if (!collected)
+ return 0;
+
+ return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
+}
+
/*
* Check whether all folios are *allowed* to be pinned indefinitely (long term).
* Rather confusingly, all folios in the range are required to be pinned via
@@ -2417,16 +2467,13 @@ static int migrate_longterm_unpinnable_folios(
static long check_and_migrate_movable_folios(unsigned long nr_folios,
struct folio **folios)
{
- unsigned long collected;
- LIST_HEAD(movable_folio_list);
+ struct pages_or_folios pofs = {
+ .folios = folios,
+ .has_folios = true,
+ .nr_entries = nr_folios,
+ };
- collected = collect_longterm_unpinnable_folios(&movable_folio_list,
- nr_folios, folios);
- if (!collected)
- return 0;
-
- return migrate_longterm_unpinnable_folios(&movable_folio_list,
- nr_folios, folios);
+ return check_and_migrate_movable_pages_or_folios(&pofs);
}
/*
@@ -2436,22 +2483,13 @@ static long check_and_migrate_movable_folios(unsigned long nr_folios,
static long check_and_migrate_movable_pages(unsigned long nr_pages,
struct page **pages)
{
- struct folio **folios;
- long i, ret;
+ struct pages_or_folios pofs = {
+ .pages = pages,
+ .has_folios = false,
+ .nr_entries = nr_pages,
+ };
- folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL);
- if (!folios) {
- unpin_user_pages(pages, nr_pages);
- return -ENOMEM;
- }
-
- for (i = 0; i < nr_pages; i++)
- folios[i] = page_folio(pages[i]);
-
- ret = check_and_migrate_movable_folios(nr_pages, folios);
-
- kfree(folios);
- return ret;
+ return check_and_migrate_movable_pages_or_folios(&pofs);
}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 03fd4bc..5734d5d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3790,7 +3790,9 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
* in the case it was underused, then consider it used and
* don't add it back to split_queue.
*/
- if (!did_split && !folio_test_partially_mapped(folio)) {
+ if (did_split) {
+ ; /* folio already removed from list */
+ } else if (!folio_test_partially_mapped(folio)) {
list_del_init(&folio->_deferred_list);
removed++;
} else {
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index f8744f5..86527d8 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1936,8 +1936,6 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
struct mem_cgroup_event *event;
struct cgroup_subsys_state *cfile_css;
unsigned int efd, cfd;
- struct fd efile;
- struct fd cfile;
struct dentry *cdentry;
const char *name;
char *endp;
@@ -1961,6 +1959,12 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
else
return -EINVAL;
+ CLASS(fd, efile)(efd);
+ if (fd_empty(efile))
+ return -EBADF;
+
+ CLASS(fd, cfile)(cfd);
+
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
@@ -1971,20 +1975,13 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
init_waitqueue_func_entry(&event->wait, memcg_event_wake);
INIT_WORK(&event->remove, memcg_event_remove);
- efile = fdget(efd);
- if (!fd_file(efile)) {
- ret = -EBADF;
- goto out_kfree;
- }
-
event->eventfd = eventfd_ctx_fileget(fd_file(efile));
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
- goto out_put_efile;
+ goto out_kfree;
}
- cfile = fdget(cfd);
- if (!fd_file(cfile)) {
+ if (fd_empty(cfile)) {
ret = -EBADF;
goto out_put_eventfd;
}
@@ -1993,7 +1990,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
/* AV: shouldn't we check that it's been opened for read instead? */
ret = file_permission(fd_file(cfile), MAY_READ);
if (ret < 0)
- goto out_put_cfile;
+ goto out_put_eventfd;
/*
* The control file must be a regular cgroup1 file. As a regular cgroup
@@ -2002,7 +1999,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
cdentry = fd_file(cfile)->f_path.dentry;
if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) {
ret = -EINVAL;
- goto out_put_cfile;
+ goto out_put_eventfd;
}
/*
@@ -2035,7 +2032,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
event->unregister_event = memsw_cgroup_usage_unregister_event;
} else {
ret = -EINVAL;
- goto out_put_cfile;
+ goto out_put_eventfd;
}
/*
@@ -2047,11 +2044,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
&memory_cgrp_subsys);
ret = -EINVAL;
if (IS_ERR(cfile_css))
- goto out_put_cfile;
- if (cfile_css != css) {
- css_put(cfile_css);
- goto out_put_cfile;
- }
+ goto out_put_eventfd;
+ if (cfile_css != css)
+ goto out_put_css;
ret = event->register_event(memcg, event->eventfd, buf);
if (ret)
@@ -2062,23 +2057,14 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
spin_lock_irq(&memcg->event_list_lock);
list_add(&event->list, &memcg->event_list);
spin_unlock_irq(&memcg->event_list_lock);
-
- fdput(cfile);
- fdput(efile);
-
return nbytes;
out_put_css:
- css_put(css);
-out_put_cfile:
- fdput(cfile);
+ css_put(cfile_css);
out_put_eventfd:
eventfd_ctx_put(event->eventfd);
-out_put_efile:
- fdput(efile);
out_kfree:
kfree(event);
-
return ret;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06df2af9..53db98d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -431,6 +431,10 @@ static const unsigned int memcg_vm_event_stat[] = {
PGDEACTIVATE,
PGLAZYFREE,
PGLAZYFREED,
+#ifdef CONFIG_SWAP
+ SWPIN_ZERO,
+ SWPOUT_ZERO,
+#endif
#ifdef CONFIG_ZSWAP
ZSWPIN,
ZSWPOUT,
diff --git a/mm/migrate.c b/mm/migrate.c
index dfa24e4..47a8d10 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -473,7 +473,7 @@ static int folio_expected_refs(struct address_space *mapping,
* The number of remaining references must be:
* 1 for anonymous folios without a mapping
* 2 for folios with a mapping
- * 3 for folios with a mapping and PagePrivate/PagePrivate2 set.
+ * 3 for folios with a mapping and the private flag set.
*/
static int __folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int expected_count)
@@ -787,7 +787,7 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst,
* @mode: How to migrate the page.
*
* Common logic to directly migrate a single LRU folio suitable for
- * folios that do not use PagePrivate/PagePrivate2.
+ * folios that do not have private data.
*
* Folios are locked upon entry and exit.
*/
diff --git a/mm/mremap.c b/mm/mremap.c
index dda09e9..dee98ff 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -648,7 +648,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
* Prevent negative return values when {old,new}_addr was realigned
* but we broke out of the above loop for the first PMD itself.
*/
- if (len + old_addr < old_end)
+ if (old_addr < old_end - len)
return 0;
return len + old_addr - old_end; /* how much done */
diff --git a/mm/nommu.c b/mm/nommu.c
index e9b5f52..9cb6e99 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -573,7 +573,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start);
vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
- if (vma_iter_prealloc(&vmi, vma)) {
+ if (vma_iter_prealloc(&vmi, NULL)) {
pr_warn("Allocation of vma tree for process %d failed\n",
current->pid);
return -ENOMEM;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fcd4c14..72a5d88 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -54,7 +54,7 @@
#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10))
/*
- * Estimate write bandwidth at 200ms intervals.
+ * Estimate write bandwidth or update dirty limit at 200ms intervals.
*/
#define BANDWIDTH_INTERVAL max(HZ/5, 1)
@@ -586,7 +586,7 @@ static void wb_domain_writeout_add(struct wb_domain *dom,
/* First event after period switching was turned off? */
if (unlikely(!dom->period_time)) {
/*
- * We can race with other __bdi_writeout_inc calls here but
+ * We can race with other wb_domain_writeout_add calls here but
* it does not cause any harm since the resulting time when
* timer will fire and what is in writeout_period_time will be
* roughly the same.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c6c7bb3..b695833 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1048,6 +1048,7 @@ __always_inline bool free_pages_prepare(struct page *page,
bool skip_kasan_poison = should_skip_kasan_poison(page);
bool init = want_init_on_free();
bool compound = PageCompound(page);
+ struct folio *folio = page_folio(page);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1057,6 +1058,20 @@ __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_online() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
+ /*
+ * In rare cases, when truncation or holepunching raced with
+ * munlock after VM_LOCKED was cleared, Mlocked may still be
+ * found set here. This does not indicate a problem, unless
+ * "unevictable_pgs_cleared" appears worryingly large.
+ */
+ if (unlikely(folio_test_mlocked(folio))) {
+ long nr_pages = folio_nr_pages(folio);
+
+ __folio_clear_mlocked(folio);
+ zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
+ count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
+ }
+
if (unlikely(PageHWPoison(page)) && !order) {
/* Do not let hwpoison pages hit pcplists/buddy */
reset_page_owner(page, order);
@@ -4592,7 +4607,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
gfp = alloc_gfp;
/* Find an allowed local zone that meets the low watermark. */
- for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+ z = ac.preferred_zoneref;
+ for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) {
unsigned long mark;
if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
diff --git a/mm/page_io.c b/mm/page_io.c
index 69536a2..01749b9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -204,7 +204,9 @@ static bool is_folio_zero_filled(struct folio *folio)
static void swap_zeromap_folio_set(struct folio *folio)
{
+ struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio);
struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ int nr_pages = folio_nr_pages(folio);
swp_entry_t entry;
unsigned int i;
@@ -212,6 +214,12 @@ static void swap_zeromap_folio_set(struct folio *folio)
entry = page_swap_entry(folio_page(folio, i));
set_bit(swp_offset(entry), sis->zeromap);
}
+
+ count_vm_events(SWPOUT_ZERO, nr_pages);
+ if (objcg) {
+ count_objcg_events(objcg, SWPOUT_ZERO, nr_pages);
+ obj_cgroup_put(objcg);
+ }
}
static void swap_zeromap_folio_clear(struct folio *folio)
@@ -503,6 +511,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
static bool swap_read_folio_zeromap(struct folio *folio)
{
int nr_pages = folio_nr_pages(folio);
+ struct obj_cgroup *objcg;
bool is_zeromap;
/*
@@ -517,6 +526,13 @@ static bool swap_read_folio_zeromap(struct folio *folio)
if (!is_zeromap)
return false;
+ objcg = get_obj_cgroup_from_folio(folio);
+ count_vm_events(SWPIN_ZERO, nr_pages);
+ if (objcg) {
+ count_objcg_events(objcg, SWPIN_ZERO, nr_pages);
+ obj_cgroup_put(objcg);
+ }
+
folio_zero_range(folio, 0, folio_size(folio));
folio_mark_uptodate(folio);
return true;
diff --git a/mm/readahead.c b/mm/readahead.c
index 3dc6c7a..9a80772 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -673,29 +673,22 @@ EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
- ssize_t ret;
- struct fd f;
+ CLASS(fd, f)(fd);
- ret = -EBADF;
- f = fdget(fd);
- if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ))
- goto out;
+ if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ return -EBADF;
/*
* The readahead() syscall is intended to run only on files
* that can execute readahead. If readahead is not possible
* on this file, then we must return -EINVAL.
*/
- ret = -EINVAL;
if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops ||
(!S_ISREG(file_inode(fd_file(f))->i_mode) &&
!S_ISBLK(file_inode(fd_file(f))->i_mode)))
- goto out;
+ return -EINVAL;
- ret = vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
-out:
- fdput(f);
- return ret;
+ return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED);
}
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
diff --git a/mm/shmem.c b/mm/shmem.c
index e87f5d6..c7881e1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -40,6 +40,7 @@
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
#include <linux/iversion.h>
+#include <linux/unicode.h>
#include "swap.h"
static struct vfsmount *shm_mnt __ro_after_init;
@@ -123,6 +124,10 @@ struct shmem_options {
bool noswap;
unsigned short quota_types;
struct shmem_quota_limits qlimits;
+#if IS_ENABLED(CONFIG_UNICODE)
+ struct unicode_map *encoding;
+ bool strict_encoding;
+#endif
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
@@ -1166,9 +1171,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- inode_lock_shared(inode);
generic_fillattr(idmap, request_mask, inode, stat);
- inode_unlock_shared(inode);
if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
@@ -2751,13 +2754,62 @@ static int shmem_file_open(struct inode *inode, struct file *file)
#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+#if IS_ENABLED(CONFIG_UNICODE)
+/*
+ * shmem_inode_casefold_flags - Deal with casefold file attribute flag
+ *
+ * The casefold file attribute needs some special checks. I can just be added to
+ * an empty dir, and can't be removed from a non-empty dir.
+ */
+static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
+ struct dentry *dentry, unsigned int *i_flags)
+{
+ unsigned int old = inode->i_flags;
+ struct super_block *sb = inode->i_sb;
+
+ if (fsflags & FS_CASEFOLD_FL) {
+ if (!(old & S_CASEFOLD)) {
+ if (!sb->s_encoding)
+ return -EOPNOTSUPP;
+
+ if (!S_ISDIR(inode->i_mode))
+ return -ENOTDIR;
+
+ if (dentry && !simple_empty(dentry))
+ return -ENOTEMPTY;
+ }
+
+ *i_flags = *i_flags | S_CASEFOLD;
+ } else if (old & S_CASEFOLD) {
+ if (dentry && !simple_empty(dentry))
+ return -ENOTEMPTY;
+ }
+
+ return 0;
+}
+#else
+static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
+ struct dentry *dentry, unsigned int *i_flags)
+{
+ if (fsflags & FS_CASEFOLD_FL)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+#endif
+
/*
* chattr's fsflags are unrelated to extended attributes,
* but tmpfs has chosen to enable them under the same config option.
*/
-static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
{
unsigned int i_flags = 0;
+ int ret;
+
+ ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags);
+ if (ret)
+ return ret;
if (fsflags & FS_NOATIME_FL)
i_flags |= S_NOATIME;
@@ -2768,10 +2820,12 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
/*
* But FS_NODUMP_FL does not require any action in i_flags.
*/
- inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
+ inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD);
+
+ return 0;
}
#else
-static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
{
}
#define shmem_initxattrs NULL
@@ -2818,7 +2872,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
info->fsflags = (dir == NULL) ? 0 :
SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
if (info->fsflags)
- shmem_set_inode_flags(inode, info->fsflags);
+ shmem_set_inode_flags(inode, info->fsflags, NULL);
INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
simple_xattrs_init(&info->xattrs);
@@ -3564,6 +3618,9 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct inode *inode;
int error;
+ if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
+ return -EINVAL;
+
inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -3583,7 +3640,12 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
dir->i_size += BOGO_DIRENT_SIZE;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
- d_instantiate(dentry, inode);
+
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ d_add(dentry, inode);
+ else
+ d_instantiate(dentry, inode);
+
dget(dentry); /* Extra count - pin the dentry in core */
return error;
@@ -3674,7 +3736,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir,
inc_nlink(inode);
ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
- d_instantiate(dentry, inode);
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ d_add(dentry, inode);
+ else
+ d_instantiate(dentry, inode);
out:
return ret;
}
@@ -3694,6 +3759,14 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - does all the work */
+
+ /*
+ * For now, VFS can't deal with case-insensitive negative dentries, so
+ * we invalidate them
+ */
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ d_invalidate(dentry);
+
return 0;
}
@@ -3838,7 +3911,10 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
dir->i_size += BOGO_DIRENT_SIZE;
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
inode_inc_iversion(dir);
- d_instantiate(dentry, inode);
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ d_add(dentry, inode);
+ else
+ d_instantiate(dentry, inode);
dget(dentry);
return 0;
@@ -3903,16 +3979,23 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
+ int ret, flags;
if (fileattr_has_fsx(fa))
return -EOPNOTSUPP;
if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
return -EOPNOTSUPP;
- info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
+ flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
(fa->flags & SHMEM_FL_USER_MODIFIABLE);
- shmem_set_inode_flags(inode, info->fsflags);
+ ret = shmem_set_inode_flags(inode, flags, dentry);
+
+ if (ret)
+ return ret;
+
+ info->fsflags = flags;
+
inode_set_ctime_current(inode);
inode_inc_iversion(inode);
return 0;
@@ -4191,6 +4274,9 @@ enum shmem_param {
Opt_usrquota_inode_hardlimit,
Opt_grpquota_block_hardlimit,
Opt_grpquota_inode_hardlimit,
+ Opt_casefold_version,
+ Opt_casefold,
+ Opt_strict_encoding,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -4222,9 +4308,54 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
#endif
+ fsparam_string("casefold", Opt_casefold_version),
+ fsparam_flag ("casefold", Opt_casefold),
+ fsparam_flag ("strict_encoding", Opt_strict_encoding),
{}
};
+#if IS_ENABLED(CONFIG_UNICODE)
+static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
+ bool latest_version)
+{
+ struct shmem_options *ctx = fc->fs_private;
+ unsigned int version = UTF8_LATEST;
+ struct unicode_map *encoding;
+ char *version_str = param->string + 5;
+
+ if (!latest_version) {
+ if (strncmp(param->string, "utf8-", 5))
+ return invalfc(fc, "Only UTF-8 encodings are supported "
+ "in the format: utf8-<version number>");
+
+ version = utf8_parse_version(version_str);
+ if (version < 0)
+ return invalfc(fc, "Invalid UTF-8 version: %s", version_str);
+ }
+
+ encoding = utf8_load(version);
+
+ if (IS_ERR(encoding)) {
+ return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n",
+ unicode_major(version), unicode_minor(version),
+ unicode_rev(version));
+ }
+
+ pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n",
+ unicode_major(version), unicode_minor(version), unicode_rev(version));
+
+ ctx->encoding = encoding;
+
+ return 0;
+}
+#else
+static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
+ bool latest_version)
+{
+ return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
+}
+#endif
+
static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
struct shmem_options *ctx = fc->fs_private;
@@ -4383,6 +4514,17 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
"Group quota inode hardlimit too large.");
ctx->qlimits.grpquota_ihardlimit = size;
break;
+ case Opt_casefold_version:
+ return shmem_parse_opt_casefold(fc, param, false);
+ case Opt_casefold:
+ return shmem_parse_opt_casefold(fc, param, true);
+ case Opt_strict_encoding:
+#if IS_ENABLED(CONFIG_UNICODE)
+ ctx->strict_encoding = true;
+ break;
+#else
+ return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
+#endif
}
return 0;
@@ -4612,6 +4754,11 @@ static void shmem_put_super(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+#if IS_ENABLED(CONFIG_UNICODE)
+ if (sb->s_encoding)
+ utf8_unload(sb->s_encoding);
+#endif
+
#ifdef CONFIG_TMPFS_QUOTA
shmem_disable_quotas(sb);
#endif
@@ -4622,6 +4769,14 @@ static void shmem_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
+#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS)
+static const struct dentry_operations shmem_ci_dentry_ops = {
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
+ .d_delete = always_delete_dentry,
+};
+#endif
+
static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct shmem_options *ctx = fc->fs_private;
@@ -4656,9 +4811,25 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
}
sb->s_export_op = &shmem_export_ops;
sb->s_flags |= SB_NOSEC | SB_I_VERSION;
+
+#if IS_ENABLED(CONFIG_UNICODE)
+ if (!ctx->encoding && ctx->strict_encoding) {
+ pr_err("tmpfs: strict_encoding option without encoding is forbidden\n");
+ error = -EINVAL;
+ goto failed;
+ }
+
+ if (ctx->encoding) {
+ sb->s_encoding = ctx->encoding;
+ sb->s_d_op = &shmem_ci_dentry_ops;
+ if (ctx->strict_encoding)
+ sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
+ }
+#endif
+
#else
sb->s_flags |= SB_NOUSER;
-#endif
+#endif /* CONFIG_TMPFS */
sbinfo->max_blocks = ctx->blocks;
sbinfo->max_inodes = ctx->inodes;
sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
@@ -4932,6 +5103,10 @@ int shmem_init_fs_context(struct fs_context *fc)
ctx->uid = current_fsuid();
ctx->gid = current_fsgid();
+#if IS_ENABLED(CONFIG_UNICODE)
+ ctx->encoding = NULL;
+#endif
+
fc->fs_private = ctx;
fc->ops = &shmem_fs_context_ops;
return 0;
@@ -4945,9 +5120,69 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP,
+ .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
};
+#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
+
+#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
+{ \
+ .attr = { .name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+}
+
+#define TMPFS_ATTR_W(_name, _store) \
+ static struct kobj_attribute tmpfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store)
+
+#define TMPFS_ATTR_RW(_name, _show, _store) \
+ static struct kobj_attribute tmpfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
+
+#define TMPFS_ATTR_RO(_name, _show) \
+ static struct kobj_attribute tmpfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a,
+ char *buf)
+{
+ return sysfs_emit(buf, "supported\n");
+}
+TMPFS_ATTR_RO(casefold, casefold_show);
+#endif
+
+static struct attribute *tmpfs_attributes[] = {
+#if IS_ENABLED(CONFIG_UNICODE)
+ &tmpfs_attr_casefold.attr,
+#endif
+ NULL
+};
+
+static const struct attribute_group tmpfs_attribute_group = {
+ .attrs = tmpfs_attributes,
+ .name = "features"
+};
+
+static struct kobject *tmpfs_kobj;
+
+static int __init tmpfs_sysfs_init(void)
+{
+ int ret;
+
+ tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj);
+ if (!tmpfs_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group);
+ if (ret)
+ kobject_put(tmpfs_kobj);
+
+ return ret;
+}
+#endif /* CONFIG_SYSFS && CONFIG_TMPFS */
+
void __init shmem_init(void)
{
int error;
@@ -4971,6 +5206,14 @@ void __init shmem_init(void)
goto out1;
}
+#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
+ error = tmpfs_sysfs_init();
+ if (error) {
+ pr_err("Could not init tmpfs sysfs\n");
+ goto out1;
+ }
+#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
diff --git a/mm/swap.c b/mm/swap.c
index b8e3259..59f30a9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -78,20 +78,6 @@ static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
lruvec_del_folio(*lruvecp, folio);
__folio_clear_lru_flags(folio);
}
-
- /*
- * In rare cases, when truncation or holepunching raced with
- * munlock after VM_LOCKED was cleared, Mlocked may still be
- * found set here. This does not indicate a problem, unless
- * "unevictable_pgs_cleared" appears worryingly large.
- */
- if (unlikely(folio_test_mlocked(folio))) {
- long nr_pages = folio_nr_pages(folio);
-
- __folio_clear_mlocked(folio);
- zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
- count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
- }
}
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 46bd4b1..b0a9071 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -664,12 +664,15 @@ static bool cluster_scan_range(struct swap_info_struct *si,
return true;
}
-static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
+static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
unsigned int start, unsigned char usage,
unsigned int order)
{
unsigned int nr_pages = 1 << order;
+ if (!(si->flags & SWP_WRITEOK))
+ return false;
+
if (cluster_is_free(ci)) {
if (nr_pages < SWAPFILE_CLUSTER) {
list_move_tail(&ci->list, &si->nonfull_clusters[order]);
@@ -690,6 +693,8 @@ static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster
list_move_tail(&ci->list, &si->full_clusters);
ci->flags = CLUSTER_FLAG_FULL;
}
+
+ return true;
}
static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset,
@@ -713,7 +718,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
while (offset <= end) {
if (cluster_scan_range(si, ci, offset, nr_pages)) {
- cluster_alloc_range(si, ci, offset, usage, order);
+ if (!cluster_alloc_range(si, ci, offset, usage, order)) {
+ offset = SWAP_NEXT_INVALID;
+ goto done;
+ }
*foundp = offset;
if (ci->count == SWAPFILE_CLUSTER) {
offset = SWAP_NEXT_INVALID;
@@ -805,7 +813,11 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
if (!list_empty(&si->free_clusters)) {
ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
- VM_BUG_ON(!found);
+ /*
+ * Either we didn't touch the cluster due to swapoff,
+ * or the allocation must success.
+ */
+ VM_BUG_ON((si->flags & SWP_WRITEOK) && !found);
goto done;
}
@@ -929,7 +941,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
si->highest_bit = 0;
del_from_avail_list(si);
- if (vm_swap_full())
+ if (si->cluster_info && vm_swap_full())
schedule_work(&si->reclaim_work);
}
}
@@ -1041,6 +1053,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
VM_BUG_ON(!si->cluster_info);
+ si->flags += SWP_SCANNING;
+
while (n_ret < nr) {
unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
@@ -1049,6 +1063,8 @@ static int cluster_alloc_swap(struct swap_info_struct *si,
slots[n_ret++] = swp_entry(si->type, offset);
}
+ si->flags -= SWP_SCANNING;
+
return n_ret;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 0668cd3..09fa809 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -166,7 +166,6 @@ static void truncate_cleanup_folio(struct folio *folio)
* Hence dirty accounting check is placed after invalidation.
*/
folio_cancel_dirty(folio);
- folio_clear_mappedtodisk(folio);
}
int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
@@ -797,6 +796,21 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
*/
if (folio_mkclean(folio))
folio_mark_dirty(folio);
+
+ /*
+ * The post-eof range of the folio must be zeroed before it is exposed
+ * to the file. Writeback normally does this, but since i_size has been
+ * increased we handle it here.
+ */
+ if (folio_test_dirty(folio)) {
+ unsigned int offset, end;
+
+ offset = from - folio_pos(folio);
+ end = min_t(unsigned int, to - folio_pos(folio),
+ folio_size(folio));
+ folio_zero_segment(folio, offset, end);
+ }
+
folio_unlock(folio);
folio_put(folio);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b5a4cea..ac6a5aa 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1415,6 +1415,8 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_SWAP
"swap_ra",
"swap_ra_hit",
+ "swpin_zero",
+ "swpout_zero",
#ifdef CONFIG_KSM
"ksm_swpin_copy",
#endif
diff --git a/mm/zswap.c b/mm/zswap.c
index 1620139..0030ce8 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1053,7 +1053,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
count_vm_event(ZSWPWB);
if (entry->objcg)
- count_objcg_event(entry->objcg, ZSWPWB);
+ count_objcg_events(entry->objcg, ZSWPWB, 1);
zswap_entry_free(entry);
@@ -1483,7 +1483,7 @@ bool zswap_store(struct folio *folio)
if (objcg) {
obj_cgroup_charge_zswap(objcg, entry->length);
- count_objcg_event(objcg, ZSWPOUT);
+ count_objcg_events(objcg, ZSWPOUT, 1);
}
/*
@@ -1577,7 +1577,7 @@ bool zswap_load(struct folio *folio)
count_vm_event(ZSWPIN);
if (entry->objcg)
- count_objcg_event(entry->objcg, ZSWPIN);
+ count_objcg_events(entry->objcg, ZSWPIN, 1);
if (swapcache) {
zswap_entry_free(entry);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 96d097b..0ac354d 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3788,8 +3788,6 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_handle(hdev, handle);
- if (conn && hci_dev_test_flag(hdev, HCI_MGMT))
- mgmt_device_connected(hdev, conn, NULL, 0);
hci_dev_unlock(hdev);
if (conn) {
diff --git a/net/core/filter.c b/net/core/filter.c
index e31ee8b..fb56567 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2249,7 +2249,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
rcu_read_unlock();
return ret;
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
if (dst)
IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
out_drop:
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e39479f..b231b27 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -694,20 +694,18 @@ EXPORT_SYMBOL_GPL(get_net_ns);
struct net *get_net_ns_by_fd(int fd)
{
- struct fd f = fdget(fd);
- struct net *net = ERR_PTR(-EINVAL);
+ CLASS(fd, f)(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
if (proc_ns_file(fd_file(f))) {
struct ns_common *ns = get_proc_ns(file_inode(fd_file(f)));
if (ns->ops == &netns_operations)
- net = get_net(container_of(ns, struct net, ns));
+ return get_net(container_of(ns, struct net, ns));
}
- fdput(f);
- return net;
+ return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
#endif
diff --git a/net/core/sock.c b/net/core/sock.c
index 039be95..da50df4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1052,32 +1052,34 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
#ifdef CONFIG_PAGE_POOL
-/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in
- * 1 syscall. The limit exists to limit the amount of memory the kernel
- * allocates to copy these tokens.
+/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
+ * in 1 syscall. The limit exists to limit the amount of memory the kernel
+ * allocates to copy these tokens, and to prevent looping over the frags for
+ * too long.
*/
#define MAX_DONTNEED_TOKENS 128
+#define MAX_DONTNEED_FRAGS 1024
static noinline_for_stack int
sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
{
unsigned int num_tokens, i, j, k, netmem_num = 0;
struct dmabuf_token *tokens;
+ int ret = 0, num_frags = 0;
netmem_ref netmems[16];
- int ret = 0;
if (!sk_is_tcp(sk))
return -EBADF;
- if (optlen % sizeof(struct dmabuf_token) ||
+ if (optlen % sizeof(*tokens) ||
optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
return -EINVAL;
- tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL);
+ num_tokens = optlen / sizeof(*tokens);
+ tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
if (!tokens)
return -ENOMEM;
- num_tokens = optlen / sizeof(struct dmabuf_token);
if (copy_from_sockptr(tokens, optval, optlen)) {
kvfree(tokens);
return -EFAULT;
@@ -1086,24 +1088,28 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
xa_lock_bh(&sk->sk_user_frags);
for (i = 0; i < num_tokens; i++) {
for (j = 0; j < tokens[i].token_count; j++) {
+ if (++num_frags > MAX_DONTNEED_FRAGS)
+ goto frag_limit_reached;
+
netmem_ref netmem = (__force netmem_ref)__xa_erase(
&sk->sk_user_frags, tokens[i].token_start + j);
- if (netmem &&
- !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) {
- netmems[netmem_num++] = netmem;
- if (netmem_num == ARRAY_SIZE(netmems)) {
- xa_unlock_bh(&sk->sk_user_frags);
- for (k = 0; k < netmem_num; k++)
- WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
- netmem_num = 0;
- xa_lock_bh(&sk->sk_user_frags);
- }
- ret++;
+ if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+ continue;
+
+ netmems[netmem_num++] = netmem;
+ if (netmem_num == ARRAY_SIZE(netmems)) {
+ xa_unlock_bh(&sk->sk_user_frags);
+ for (k = 0; k < netmem_num; k++)
+ WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+ netmem_num = 0;
+ xa_lock_bh(&sk->sk_user_frags);
}
+ ret++;
}
}
+frag_limit_reached:
xa_unlock_bh(&sk->sk_user_frags);
for (k = 0; k < netmem_num; k++)
WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index da5dba1..d664924 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -618,7 +618,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
by tcp. Feel free to propose better solution.
--ANK (980728)
*/
- if (np->rxopt.all)
+ if (np->rxopt.all && sk->sk_state != DCCP_LISTEN)
opt_skb = skb_clone_and_charge_r(skb, sk);
if (sk->sk_state == DCCP_OPEN) { /* Fast path */
diff --git a/net/handshake/request.c b/net/handshake/request.c
index 94d5cef..274d2c8 100644
--- a/net/handshake/request.c
+++ b/net/handshake/request.c
@@ -13,7 +13,6 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/inet.h>
-#include <linux/fdtable.h>
#include <linux/rhashtable.h>
#include <net/sock.h>
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 271dc03..f0af12a 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -310,7 +310,8 @@ int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
if (filter->filter_set)
flags |= NLM_F_DUMP_FILTERED;
- list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list,
+ lockdep_rtnl_is_held()) {
if (e < s_e)
goto next_entry;
if (filter->dev &&
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d71ab4e..c9de5ef 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1618,7 +1618,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
by tcp. Feel free to propose better solution.
--ANK (980728)
*/
- if (np->rxopt.all)
+ if (np->rxopt.all && sk->sk_state != TCP_LISTEN)
opt_skb = skb_clone_and_charge_r(skb, sk);
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
@@ -1656,8 +1656,6 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (reason)
goto reset;
}
- if (opt_skb)
- __kfree_skb(opt_skb);
return 0;
}
} else
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index db586a5..45a2b5f 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -524,7 +524,8 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
{
struct mptcp_pm_addr_entry *entry;
- list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list,
+ lockdep_is_held(&pernet->lock)) {
if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port))
return entry;
}
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index 56dfea9..e35178f 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -308,14 +308,17 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info)
lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val);
if (!match) {
GENL_SET_ERR_MSG(info, "address with specified id not found");
+ spin_unlock_bh(&msk->pm.lock);
release_sock(sk);
goto out;
}
list_move(&match->list, &free_list);
+ spin_unlock_bh(&msk->pm.lock);
mptcp_pm_remove_addrs(msk, &free_list);
@@ -560,6 +563,7 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info)
struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct net *net = sock_net(skb->sk);
+ struct mptcp_pm_addr_entry *entry;
struct mptcp_sock *msk;
int ret = -EINVAL;
struct sock *sk;
@@ -601,6 +605,17 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info)
if (loc.flags & MPTCP_PM_ADDR_FLAG_BACKUP)
bkup = 1;
+ spin_lock_bh(&msk->pm.lock);
+ list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
+ if (mptcp_addresses_equal(&entry->addr, &loc.addr, false)) {
+ if (bkup)
+ entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
+ else
+ entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
+ }
+ }
+ spin_unlock_bh(&msk->pm.lock);
+
lock_sock(sk);
ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc.addr, &rem.addr, bkup);
release_sock(sk);
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d263091..48d4809 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2082,7 +2082,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
slow = lock_sock_fast(ssk);
WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
- tcp_cleanup_rbuf(ssk, 1);
+ if (tcp_can_send_ack(ssk))
+ tcp_cleanup_rbuf(ssk, 1);
unlock_sock_fast(ssk, slow);
}
}
@@ -2205,7 +2206,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
cmsg_flags = MPTCP_CMSG_INQ;
while (copied < len) {
- int bytes_read;
+ int err, bytes_read;
bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags);
if (unlikely(bytes_read < 0)) {
@@ -2267,9 +2268,16 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
}
pr_debug("block timeout %ld\n", timeo);
- sk_wait_data(sk, &timeo, NULL);
+ mptcp_rcv_space_adjust(msk, copied);
+ err = sk_wait_data(sk, &timeo, NULL);
+ if (err < 0) {
+ err = copied ? : err;
+ goto out_err;
+ }
}
+ mptcp_rcv_space_adjust(msk, copied);
+
out_err:
if (cmsg_flags && copied >= 0) {
if (cmsg_flags & MPTCP_CMSG_TS)
@@ -2285,8 +2293,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n",
msk, skb_queue_empty_lockless(&sk->sk_receive_queue),
skb_queue_empty(&msk->receive_queue), copied);
- if (!(flags & MSG_PEEK))
- mptcp_rcv_space_adjust(msk, copied);
release_sock(sk);
return copied;
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 9996883..1bc2d08 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -1538,7 +1538,7 @@ int __init netlbl_unlabel_defconf(void)
/* Only the kernel is allowed to call this function and the only time
* it is called is at bootup before the audit subsystem is reporting
* messages so don't worry to much about these values. */
- security_current_getsecid_subj(&audit_info.secid);
+ security_current_getlsmprop_subj(&audit_info.prop);
audit_info.loginuid = GLOBAL_ROOT_UID;
audit_info.sessionid = 0;
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index 3ed4fea..81635a1 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -98,10 +98,9 @@ struct audit_buffer *netlbl_audit_start_common(int type,
from_kuid(&init_user_ns, audit_info->loginuid),
audit_info->sessionid);
- if (audit_info->secid != 0 &&
- security_secid_to_secctx(audit_info->secid,
- &secctx,
- &secctx_len) == 0) {
+ if (lsmprop_is_set(&audit_info->prop) &&
+ security_lsmprop_to_secctx(&audit_info->prop, &secctx,
+ &secctx_len) == 0) {
audit_log_format(audit_buf, " subj=%s", secctx);
security_release_secctx(secctx, secctx_len);
}
diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h
index d6c5b31..d4c4349 100644
--- a/net/netlabel/netlabel_user.h
+++ b/net/netlabel/netlabel_user.h
@@ -32,7 +32,7 @@
*/
static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info)
{
- security_current_getsecid_subj(&audit_info->secid);
+ security_current_getlsmprop_subj(&audit_info->prop);
audit_info->loginuid = audit_get_loginuid(current);
audit_info->sessionid = audit_get_sessionid(current);
}
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 0a9287f..8953c4d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -393,15 +393,6 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
static void netlink_sock_destruct(struct sock *sk)
{
- struct netlink_sock *nlk = nlk_sk(sk);
-
- if (nlk->cb_running) {
- if (nlk->cb.done)
- nlk->cb.done(&nlk->cb);
- module_put(nlk->cb.module);
- kfree_skb(nlk->cb.skb);
- }
-
skb_queue_purge(&sk->sk_receive_queue);
if (!sock_flag(sk, SOCK_DEAD)) {
@@ -414,14 +405,6 @@ static void netlink_sock_destruct(struct sock *sk)
WARN_ON(nlk_sk(sk)->groups);
}
-static void netlink_sock_destruct_work(struct work_struct *work)
-{
- struct netlink_sock *nlk = container_of(work, struct netlink_sock,
- work);
-
- sk_free(&nlk->sk);
-}
-
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
* SMP. Look, when several writers sleep and reader wakes them up, all but one
* immediately hit write lock and grab all the cpus. Exclusive sleep solves
@@ -731,12 +714,6 @@ static void deferred_put_nlk_sk(struct rcu_head *head)
if (!refcount_dec_and_test(&sk->sk_refcnt))
return;
- if (nlk->cb_running && nlk->cb.done) {
- INIT_WORK(&nlk->work, netlink_sock_destruct_work);
- schedule_work(&nlk->work);
- return;
- }
-
sk_free(sk);
}
@@ -788,6 +765,14 @@ static int netlink_release(struct socket *sock)
NETLINK_URELEASE, &n);
}
+ /* Terminate any outstanding dump */
+ if (nlk->cb_running) {
+ if (nlk->cb.done)
+ nlk->cb.done(&nlk->cb);
+ module_put(nlk->cb.module);
+ kfree_skb(nlk->cb.skb);
+ }
+
module_put(nlk->module);
if (netlink_is_kernel(sk)) {
@@ -1180,11 +1165,16 @@ static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
return sock;
}
-struct sock *netlink_getsockbyfilp(struct file *filp)
+struct sock *netlink_getsockbyfd(int fd)
{
- struct inode *inode = file_inode(filp);
+ CLASS(fd, f)(fd);
+ struct inode *inode;
struct sock *sock;
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+
+ inode = file_inode(fd_file(f));
if (!S_ISSOCK(inode->i_mode))
return ERR_PTR(-ENOTSOCK);
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 5b0e4e6..778a380 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -4,7 +4,6 @@
#include <linux/rhashtable.h>
#include <linux/atomic.h>
-#include <linux/workqueue.h>
#include <net/sock.h>
/* flags */
@@ -50,7 +49,6 @@ struct netlink_sock {
struct rhash_head node;
struct rcu_head rcu;
- struct work_struct work;
};
static inline struct netlink_sock *nlk_sk(struct sock *sk)
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 9412d88..d3a03c5 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -92,6 +92,16 @@ struct tc_u_common {
long knodes;
};
+static u32 handle2id(u32 h)
+{
+ return ((h & 0x80000000) ? ((h >> 20) & 0x7FF) : h);
+}
+
+static u32 id2handle(u32 id)
+{
+ return (id | 0x800U) << 20;
+}
+
static inline unsigned int u32_hash_fold(__be32 key,
const struct tc_u32_sel *sel,
u8 fshift)
@@ -310,7 +320,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr)
int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 0x7FF, GFP_KERNEL);
if (id < 0)
return 0;
- return (id | 0x800U) << 20;
+ return id2handle(id);
}
static struct hlist_head *tc_u_common_hash;
@@ -360,7 +370,7 @@ static int u32_init(struct tcf_proto *tp)
return -ENOBUFS;
refcount_set(&root_ht->refcnt, 1);
- root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000;
+ root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : id2handle(0);
root_ht->prio = tp->prio;
root_ht->is_root = true;
idr_init(&root_ht->handle_idr);
@@ -612,7 +622,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht,
if (phn == ht) {
u32_clear_hw_hnode(tp, ht, extack);
idr_destroy(&ht->handle_idr);
- idr_remove(&tp_c->handle_idr, ht->handle);
+ idr_remove(&tp_c->handle_idr, handle2id(ht->handle));
RCU_INIT_POINTER(*hn, ht->next);
kfree_rcu(ht, rcu);
return 0;
@@ -989,7 +999,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
err = u32_replace_hw_hnode(tp, ht, userflags, extack);
if (err) {
- idr_remove(&tp_c->handle_idr, handle);
+ idr_remove(&tp_c->handle_idr, handle2id(handle));
kfree(ht);
return err;
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 39382ee..fe6fed2 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -17,6 +17,7 @@
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
+#include <linux/prandom.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index f7b809c0..38e2fbd 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -683,7 +683,7 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
struct sock *sk = &sp->inet.sk;
struct net *net = sock_net(sk);
struct net_device *dev = NULL;
- int type;
+ int type, res, bound_dev_if;
type = ipv6_addr_type(in6);
if (IPV6_ADDR_ANY == type)
@@ -697,14 +697,21 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp)
if (!(type & IPV6_ADDR_UNICAST))
return 0;
- if (sk->sk_bound_dev_if) {
- dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+ rcu_read_lock();
+ bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ if (bound_dev_if) {
+ res = 0;
+ dev = dev_get_by_index_rcu(net, bound_dev_if);
if (!dev)
- return 0;
+ goto out;
}
- return ipv6_can_nonlocal_bind(net, &sp->inet) ||
- ipv6_chk_addr(net, in6, dev, 0);
+ res = ipv6_can_nonlocal_bind(net, &sp->inet) ||
+ ipv6_chk_addr(net, in6, dev, 0);
+
+out:
+ rcu_read_unlock();
+ return res;
}
/* This function checks if the address is a valid address to be used for
diff --git a/net/socket.c b/net/socket.c
index 042451f..87a573a7 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -509,7 +509,7 @@ static int sock_map_fd(struct socket *sock, int flags)
struct socket *sock_from_file(struct file *file)
{
- if (file->f_op == &socket_file_ops)
+ if (likely(file->f_op == &socket_file_ops))
return file->private_data; /* set in sock_alloc_file */
return NULL;
@@ -549,24 +549,6 @@ struct socket *sockfd_lookup(int fd, int *err)
}
EXPORT_SYMBOL(sockfd_lookup);
-static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed)
-{
- struct fd f = fdget(fd);
- struct socket *sock;
-
- *err = -EBADF;
- if (fd_file(f)) {
- sock = sock_from_file(fd_file(f));
- if (likely(sock)) {
- *fput_needed = f.word & FDPUT_FPUT;
- return sock;
- }
- *err = -ENOTSOCK;
- fdput(f);
- }
- return NULL;
-}
-
static ssize_t sockfs_listxattr(struct dentry *dentry, char *buffer,
size_t size)
{
@@ -1858,16 +1840,20 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
{
struct socket *sock;
struct sockaddr_storage address;
- int err, fput_needed;
+ CLASS(fd, f)(fd);
+ int err;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (sock) {
- err = move_addr_to_kernel(umyaddr, addrlen, &address);
- if (!err)
- err = __sys_bind_socket(sock, &address, addrlen);
- fput_light(sock->file, fput_needed);
- }
- return err;
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ err = move_addr_to_kernel(umyaddr, addrlen, &address);
+ if (unlikely(err))
+ return err;
+
+ return __sys_bind_socket(sock, &address, addrlen);
}
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
@@ -1896,15 +1882,16 @@ int __sys_listen_socket(struct socket *sock, int backlog)
int __sys_listen(int fd, int backlog)
{
+ CLASS(fd, f)(fd);
struct socket *sock;
- int err, fput_needed;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (sock) {
- err = __sys_listen_socket(sock, backlog);
- fput_light(sock->file, fput_needed);
- }
- return err;
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ return __sys_listen_socket(sock, backlog);
}
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
@@ -2014,17 +2001,12 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s
int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
int __user *upeer_addrlen, int flags)
{
- int ret = -EBADF;
- struct fd f;
+ CLASS(fd, f)(fd);
- f = fdget(fd);
- if (fd_file(f)) {
- ret = __sys_accept4_file(fd_file(f), upeer_sockaddr,
+ if (fd_empty(f))
+ return -EBADF;
+ return __sys_accept4_file(fd_file(f), upeer_sockaddr,
upeer_addrlen, flags);
- fdput(f);
- }
-
- return ret;
}
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
@@ -2076,20 +2058,18 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address,
int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
{
- int ret = -EBADF;
- struct fd f;
+ struct sockaddr_storage address;
+ CLASS(fd, f)(fd);
+ int ret;
- f = fdget(fd);
- if (fd_file(f)) {
- struct sockaddr_storage address;
+ if (fd_empty(f))
+ return -EBADF;
- ret = move_addr_to_kernel(uservaddr, addrlen, &address);
- if (!ret)
- ret = __sys_connect_file(fd_file(f), &address, addrlen, 0);
- fdput(f);
- }
+ ret = move_addr_to_kernel(uservaddr, addrlen, &address);
+ if (ret)
+ return ret;
- return ret;
+ return __sys_connect_file(fd_file(f), &address, addrlen, 0);
}
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,
@@ -2108,26 +2088,25 @@ int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
{
struct socket *sock;
struct sockaddr_storage address;
- int err, fput_needed;
+ CLASS(fd, f)(fd);
+ int err;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
err = security_socket_getsockname(sock);
if (err)
- goto out_put;
+ return err;
err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0);
if (err < 0)
- goto out_put;
- /* "err" is actually length in this case */
- err = move_addr_to_user(&address, err, usockaddr, usockaddr_len);
+ return err;
-out_put:
- fput_light(sock->file, fput_needed);
-out:
- return err;
+ /* "err" is actually length in this case */
+ return move_addr_to_user(&address, err, usockaddr, usockaddr_len);
}
SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
@@ -2146,26 +2125,25 @@ int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
{
struct socket *sock;
struct sockaddr_storage address;
- int err, fput_needed;
+ CLASS(fd, f)(fd);
+ int err;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (sock != NULL) {
- const struct proto_ops *ops = READ_ONCE(sock->ops);
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
- err = security_socket_getpeername(sock);
- if (err) {
- fput_light(sock->file, fput_needed);
- return err;
- }
+ err = security_socket_getpeername(sock);
+ if (err)
+ return err;
- err = ops->getname(sock, (struct sockaddr *)&address, 1);
- if (err >= 0)
- /* "err" is actually length in this case */
- err = move_addr_to_user(&address, err, usockaddr,
- usockaddr_len);
- fput_light(sock->file, fput_needed);
- }
- return err;
+ err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 1);
+ if (err < 0)
+ return err;
+
+ /* "err" is actually length in this case */
+ return move_addr_to_user(&address, err, usockaddr, usockaddr_len);
}
SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
@@ -2186,14 +2164,17 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
struct sockaddr_storage address;
int err;
struct msghdr msg;
- int fput_needed;
err = import_ubuf(ITER_SOURCE, buff, len, &msg.msg_iter);
if (unlikely(err))
return err;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- goto out;
+
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
msg.msg_name = NULL;
msg.msg_control = NULL;
@@ -2203,7 +2184,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
if (addr) {
err = move_addr_to_kernel(addr, addr_len, &address);
if (err < 0)
- goto out_put;
+ return err;
msg.msg_name = (struct sockaddr *)&address;
msg.msg_namelen = addr_len;
}
@@ -2211,12 +2192,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
msg.msg_flags = flags;
- err = __sock_sendmsg(sock, &msg);
-
-out_put:
- fput_light(sock->file, fput_needed);
-out:
- return err;
+ return __sock_sendmsg(sock, &msg);
}
SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len,
@@ -2251,14 +2227,18 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
};
struct socket *sock;
int err, err2;
- int fput_needed;
err = import_ubuf(ITER_DEST, ubuf, size, &msg.msg_iter);
if (unlikely(err))
return err;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- goto out;
+
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
@@ -2270,9 +2250,6 @@ int __sys_recvfrom(int fd, void __user *ubuf, size_t size, unsigned int flags,
if (err2 < 0)
err = err2;
}
-
- fput_light(sock->file, fput_needed);
-out:
return err;
}
@@ -2347,17 +2324,16 @@ int __sys_setsockopt(int fd, int level, int optname, char __user *user_optval,
{
sockptr_t optval = USER_SOCKPTR(user_optval);
bool compat = in_compat_syscall();
- int err, fput_needed;
struct socket *sock;
+ CLASS(fd, f)(fd);
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- return err;
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
- err = do_sock_setsockopt(sock, compat, level, optname, optval, optlen);
-
- fput_light(sock->file, fput_needed);
- return err;
+ return do_sock_setsockopt(sock, compat, level, optname, optval, optlen);
}
SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
@@ -2413,20 +2389,17 @@ EXPORT_SYMBOL(do_sock_getsockopt);
int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
int __user *optlen)
{
- int err, fput_needed;
struct socket *sock;
- bool compat;
+ CLASS(fd, f)(fd);
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- return err;
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
- compat = in_compat_syscall();
- err = do_sock_getsockopt(sock, compat, level, optname,
+ return do_sock_getsockopt(sock, in_compat_syscall(), level, optname,
USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
-
- fput_light(sock->file, fput_needed);
- return err;
}
SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
@@ -2452,15 +2425,16 @@ int __sys_shutdown_sock(struct socket *sock, int how)
int __sys_shutdown(int fd, int how)
{
- int err, fput_needed;
struct socket *sock;
+ CLASS(fd, f)(fd);
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (sock != NULL) {
- err = __sys_shutdown_sock(sock, how);
- fput_light(sock->file, fput_needed);
- }
- return err;
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ return __sys_shutdown_sock(sock, how);
}
SYSCALL_DEFINE2(shutdown, int, fd, int, how)
@@ -2676,22 +2650,21 @@ long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
bool forbid_cmsg_compat)
{
- int fput_needed, err;
struct msghdr msg_sys;
struct socket *sock;
if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
return -EINVAL;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- goto out;
+ CLASS(fd, f)(fd);
- err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
- fput_light(sock->file, fput_needed);
-out:
- return err;
+ return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
}
SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int, flags)
@@ -2706,7 +2679,7 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct user_msghdr __user *, msg, unsigned int
int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
unsigned int flags, bool forbid_cmsg_compat)
{
- int fput_needed, err, datagrams;
+ int err, datagrams;
struct socket *sock;
struct mmsghdr __user *entry;
struct compat_mmsghdr __user *compat_entry;
@@ -2722,9 +2695,13 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
datagrams = 0;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- return err;
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
used_address.name_len = UINT_MAX;
entry = mmsg;
@@ -2761,8 +2738,6 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
cond_resched();
}
- fput_light(sock->file, fput_needed);
-
/* We only return an error if no datagrams were able to be sent */
if (datagrams != 0)
return datagrams;
@@ -2884,22 +2859,21 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
bool forbid_cmsg_compat)
{
- int fput_needed, err;
struct msghdr msg_sys;
struct socket *sock;
if (forbid_cmsg_compat && (flags & MSG_CMSG_COMPAT))
return -EINVAL;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- goto out;
+ CLASS(fd, f)(fd);
- err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
- fput_light(sock->file, fput_needed);
-out:
- return err;
+ return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
}
SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg,
@@ -2916,7 +2890,7 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
unsigned int vlen, unsigned int flags,
struct timespec64 *timeout)
{
- int fput_needed, err, datagrams;
+ int err = 0, datagrams;
struct socket *sock;
struct mmsghdr __user *entry;
struct compat_mmsghdr __user *compat_entry;
@@ -2931,16 +2905,18 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
datagrams = 0;
- sock = sockfd_lookup_light(fd, &err, &fput_needed);
- if (!sock)
- return err;
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return -EBADF;
+ sock = sock_from_file(fd_file(f));
+ if (unlikely(!sock))
+ return -ENOTSOCK;
if (likely(!(flags & MSG_ERRQUEUE))) {
err = sock_error(sock->sk);
- if (err) {
- datagrams = err;
- goto out_put;
- }
+ if (err)
+ return err;
}
entry = mmsg;
@@ -2997,12 +2973,10 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
}
if (err == 0)
- goto out_put;
+ return datagrams;
- if (datagrams == 0) {
- datagrams = err;
- goto out_put;
- }
+ if (datagrams == 0)
+ return err;
/*
* We may return less entries than requested (vlen) if the
@@ -3017,9 +2991,6 @@ static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg,
*/
WRITE_ONCE(sock->sk->sk_err, -err);
}
-out_put:
- fput_light(sock->file, fput_needed);
-
return datagrams;
}
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 35681ad..dfd2916 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -836,6 +836,9 @@ static void vsock_sk_destruct(struct sock *sk)
{
struct vsock_sock *vsk = vsock_sk(sk);
+ /* Flush MSG_ZEROCOPY leftovers. */
+ __skb_queue_purge(&sk->sk_error_queue);
+
vsock_deassign_transport(vsk);
/* When clearing these addresses, there's no need to set the family and
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index ccbd2bc..9acc13a 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -400,6 +400,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
if (virtio_transport_init_zcopy_skb(vsk, skb,
info->msg,
can_zcopy)) {
+ kfree_skb(skb);
ret = -ENOMEM;
break;
}
@@ -1109,6 +1110,7 @@ void virtio_transport_destruct(struct vsock_sock *vsk)
struct virtio_vsock_sock *vvs = vsk->trans;
kfree(vvs);
+ vsk->trans = NULL;
}
EXPORT_SYMBOL_GPL(virtio_transport_destruct);
@@ -1512,6 +1514,14 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
return -ENOMEM;
}
+ /* __vsock_release() might have already flushed accept_queue.
+ * Subsequent enqueues would lead to a memory leak.
+ */
+ if (sk->sk_shutdown == SHUTDOWN_MASK) {
+ virtio_transport_reset_no_sock(t, skb);
+ return -ESHUTDOWN;
+ }
+
child = vsock_create_connected(sk);
if (!child) {
virtio_transport_reset_no_sock(t, skb);
diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
index ae82e9c..ca13659 100644
--- a/rust/bindings/bindings_helper.h
+++ b/rust/bindings/bindings_helper.h
@@ -10,14 +10,20 @@
#include <linux/blk-mq.h>
#include <linux/blk_types.h>
#include <linux/blkdev.h>
+#include <linux/cred.h>
#include <linux/errname.h>
#include <linux/ethtool.h>
+#include <linux/file.h>
#include <linux/firmware.h>
+#include <linux/fs.h>
#include <linux/jiffies.h>
#include <linux/mdio.h>
#include <linux/phy.h>
+#include <linux/pid_namespace.h>
+#include <linux/poll.h>
#include <linux/refcount.h>
#include <linux/sched.h>
+#include <linux/security.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
diff --git a/rust/helpers/cred.c b/rust/helpers/cred.c
new file mode 100644
index 0000000..fde7ae2
--- /dev/null
+++ b/rust/helpers/cred.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/cred.h>
+
+const struct cred *rust_helper_get_cred(const struct cred *cred)
+{
+ return get_cred(cred);
+}
+
+void rust_helper_put_cred(const struct cred *cred)
+{
+ put_cred(cred);
+}
diff --git a/rust/helpers/fs.c b/rust/helpers/fs.c
new file mode 100644
index 0000000..a75c967
--- /dev/null
+++ b/rust/helpers/fs.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2024 Google LLC.
+ */
+
+#include <linux/fs.h>
+
+struct file *rust_helper_get_file(struct file *f)
+{
+ return get_file(f);
+}
diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c
index 30f4014..62022b1 100644
--- a/rust/helpers/helpers.c
+++ b/rust/helpers/helpers.c
@@ -11,12 +11,15 @@
#include "bug.c"
#include "build_assert.c"
#include "build_bug.c"
+#include "cred.c"
#include "err.c"
+#include "fs.c"
#include "kunit.c"
#include "mutex.c"
#include "page.c"
#include "rbtree.c"
#include "refcount.c"
+#include "security.c"
#include "signal.c"
#include "slab.c"
#include "spinlock.c"
diff --git a/rust/helpers/security.c b/rust/helpers/security.c
new file mode 100644
index 0000000..239e5b4
--- /dev/null
+++ b/rust/helpers/security.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/security.h>
+
+#ifndef CONFIG_SECURITY
+void rust_helper_security_cred_getsecid(const struct cred *c, u32 *secid)
+{
+ security_cred_getsecid(c, secid);
+}
+
+int rust_helper_security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
+{
+ return security_secid_to_secctx(secid, secdata, seclen);
+}
+
+void rust_helper_security_release_secctx(char *secdata, u32 seclen)
+{
+ security_release_secctx(secdata, seclen);
+}
+#endif
diff --git a/rust/helpers/task.c b/rust/helpers/task.c
index 7ac7892..7d66487 100644
--- a/rust/helpers/task.c
+++ b/rust/helpers/task.c
@@ -17,3 +17,41 @@ void rust_helper_put_task_struct(struct task_struct *t)
{
put_task_struct(t);
}
+
+kuid_t rust_helper_task_uid(struct task_struct *task)
+{
+ return task_uid(task);
+}
+
+kuid_t rust_helper_task_euid(struct task_struct *task)
+{
+ return task_euid(task);
+}
+
+#ifndef CONFIG_USER_NS
+uid_t rust_helper_from_kuid(struct user_namespace *to, kuid_t uid)
+{
+ return from_kuid(to, uid);
+}
+#endif /* CONFIG_USER_NS */
+
+bool rust_helper_uid_eq(kuid_t left, kuid_t right)
+{
+ return uid_eq(left, right);
+}
+
+kuid_t rust_helper_current_euid(void)
+{
+ return current_euid();
+}
+
+struct user_namespace *rust_helper_current_user_ns(void)
+{
+ return current_user_ns();
+}
+
+pid_t rust_helper_task_tgid_nr_ns(struct task_struct *tsk,
+ struct pid_namespace *ns)
+{
+ return task_tgid_nr_ns(tsk, ns);
+}
diff --git a/rust/kernel/cred.rs b/rust/kernel/cred.rs
new file mode 100644
index 0000000..81d6778
--- /dev/null
+++ b/rust/kernel/cred.rs
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2024 Google LLC.
+
+//! Credentials management.
+//!
+//! C header: [`include/linux/cred.h`](srctree/include/linux/cred.h).
+//!
+//! Reference: <https://www.kernel.org/doc/html/latest/security/credentials.html>
+
+use crate::{
+ bindings,
+ task::Kuid,
+ types::{AlwaysRefCounted, Opaque},
+};
+
+/// Wraps the kernel's `struct cred`.
+///
+/// Credentials are used for various security checks in the kernel.
+///
+/// Most fields of credentials are immutable. When things have their credentials changed, that
+/// happens by replacing the credential instead of changing an existing credential. See the [kernel
+/// documentation][ref] for more info on this.
+///
+/// # Invariants
+///
+/// Instances of this type are always ref-counted, that is, a call to `get_cred` ensures that the
+/// allocation remains valid at least until the matching call to `put_cred`.
+///
+/// [ref]: https://www.kernel.org/doc/html/latest/security/credentials.html
+#[repr(transparent)]
+pub struct Credential(Opaque<bindings::cred>);
+
+// SAFETY:
+// - `Credential::dec_ref` can be called from any thread.
+// - It is okay to send ownership of `Credential` across thread boundaries.
+unsafe impl Send for Credential {}
+
+// SAFETY: It's OK to access `Credential` through shared references from other threads because
+// we're either accessing properties that don't change or that are properly synchronised by C code.
+unsafe impl Sync for Credential {}
+
+impl Credential {
+ /// Creates a reference to a [`Credential`] from a valid pointer.
+ ///
+ /// # Safety
+ ///
+ /// The caller must ensure that `ptr` is valid and remains valid for the lifetime of the
+ /// returned [`Credential`] reference.
+ pub unsafe fn from_ptr<'a>(ptr: *const bindings::cred) -> &'a Credential {
+ // SAFETY: The safety requirements guarantee the validity of the dereference, while the
+ // `Credential` type being transparent makes the cast ok.
+ unsafe { &*ptr.cast() }
+ }
+
+ /// Get the id for this security context.
+ pub fn get_secid(&self) -> u32 {
+ let mut secid = 0;
+ // SAFETY: The invariants of this type ensures that the pointer is valid.
+ unsafe { bindings::security_cred_getsecid(self.0.get(), &mut secid) };
+ secid
+ }
+
+ /// Returns the effective UID of the given credential.
+ pub fn euid(&self) -> Kuid {
+ // SAFETY: By the type invariant, we know that `self.0` is valid. Furthermore, the `euid`
+ // field of a credential is never changed after initialization, so there is no potential
+ // for data races.
+ Kuid::from_raw(unsafe { (*self.0.get()).euid })
+ }
+}
+
+// SAFETY: The type invariants guarantee that `Credential` is always ref-counted.
+unsafe impl AlwaysRefCounted for Credential {
+ fn inc_ref(&self) {
+ // SAFETY: The existence of a shared reference means that the refcount is nonzero.
+ unsafe { bindings::get_cred(self.0.get()) };
+ }
+
+ unsafe fn dec_ref(obj: core::ptr::NonNull<Credential>) {
+ // SAFETY: The safety requirements guarantee that the refcount is nonzero. The cast is okay
+ // because `Credential` has the same representation as `struct cred`.
+ unsafe { bindings::put_cred(obj.cast().as_ptr()) };
+ }
+}
diff --git a/rust/kernel/fs.rs b/rust/kernel/fs.rs
new file mode 100644
index 0000000..0121b38
--- /dev/null
+++ b/rust/kernel/fs.rs
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Kernel file systems.
+//!
+//! C headers: [`include/linux/fs.h`](srctree/include/linux/fs.h)
+
+pub mod file;
+pub use self::file::{File, LocalFile};
diff --git a/rust/kernel/fs/file.rs b/rust/kernel/fs/file.rs
new file mode 100644
index 0000000..e03dbe1
--- /dev/null
+++ b/rust/kernel/fs/file.rs
@@ -0,0 +1,461 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2024 Google LLC.
+
+//! Files and file descriptors.
+//!
+//! C headers: [`include/linux/fs.h`](srctree/include/linux/fs.h) and
+//! [`include/linux/file.h`](srctree/include/linux/file.h)
+
+use crate::{
+ bindings,
+ cred::Credential,
+ error::{code::*, Error, Result},
+ types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque},
+};
+use core::ptr;
+
+/// Flags associated with a [`File`].
+pub mod flags {
+ /// File is opened in append mode.
+ pub const O_APPEND: u32 = bindings::O_APPEND;
+
+ /// Signal-driven I/O is enabled.
+ pub const O_ASYNC: u32 = bindings::FASYNC;
+
+ /// Close-on-exec flag is set.
+ pub const O_CLOEXEC: u32 = bindings::O_CLOEXEC;
+
+ /// File was created if it didn't already exist.
+ pub const O_CREAT: u32 = bindings::O_CREAT;
+
+ /// Direct I/O is enabled for this file.
+ pub const O_DIRECT: u32 = bindings::O_DIRECT;
+
+ /// File must be a directory.
+ pub const O_DIRECTORY: u32 = bindings::O_DIRECTORY;
+
+ /// Like [`O_SYNC`] except metadata is not synced.
+ pub const O_DSYNC: u32 = bindings::O_DSYNC;
+
+ /// Ensure that this file is created with the `open(2)` call.
+ pub const O_EXCL: u32 = bindings::O_EXCL;
+
+ /// Large file size enabled (`off64_t` over `off_t`).
+ pub const O_LARGEFILE: u32 = bindings::O_LARGEFILE;
+
+ /// Do not update the file last access time.
+ pub const O_NOATIME: u32 = bindings::O_NOATIME;
+
+ /// File should not be used as process's controlling terminal.
+ pub const O_NOCTTY: u32 = bindings::O_NOCTTY;
+
+ /// If basename of path is a symbolic link, fail open.
+ pub const O_NOFOLLOW: u32 = bindings::O_NOFOLLOW;
+
+ /// File is using nonblocking I/O.
+ pub const O_NONBLOCK: u32 = bindings::O_NONBLOCK;
+
+ /// File is using nonblocking I/O.
+ ///
+ /// This is effectively the same flag as [`O_NONBLOCK`] on all architectures
+ /// except SPARC64.
+ pub const O_NDELAY: u32 = bindings::O_NDELAY;
+
+ /// Used to obtain a path file descriptor.
+ pub const O_PATH: u32 = bindings::O_PATH;
+
+ /// Write operations on this file will flush data and metadata.
+ pub const O_SYNC: u32 = bindings::O_SYNC;
+
+ /// This file is an unnamed temporary regular file.
+ pub const O_TMPFILE: u32 = bindings::O_TMPFILE;
+
+ /// File should be truncated to length 0.
+ pub const O_TRUNC: u32 = bindings::O_TRUNC;
+
+ /// Bitmask for access mode flags.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use kernel::fs::file;
+ /// # fn do_something() {}
+ /// # let flags = 0;
+ /// if (flags & file::flags::O_ACCMODE) == file::flags::O_RDONLY {
+ /// do_something();
+ /// }
+ /// ```
+ pub const O_ACCMODE: u32 = bindings::O_ACCMODE;
+
+ /// File is read only.
+ pub const O_RDONLY: u32 = bindings::O_RDONLY;
+
+ /// File is write only.
+ pub const O_WRONLY: u32 = bindings::O_WRONLY;
+
+ /// File can be both read and written.
+ pub const O_RDWR: u32 = bindings::O_RDWR;
+}
+
+/// Wraps the kernel's `struct file`. Thread safe.
+///
+/// This represents an open file rather than a file on a filesystem. Processes generally reference
+/// open files using file descriptors. However, file descriptors are not the same as files. A file
+/// descriptor is just an integer that corresponds to a file, and a single file may be referenced
+/// by multiple file descriptors.
+///
+/// # Refcounting
+///
+/// Instances of this type are reference-counted. The reference count is incremented by the
+/// `fget`/`get_file` functions and decremented by `fput`. The Rust type `ARef<File>` represents a
+/// pointer that owns a reference count on the file.
+///
+/// Whenever a process opens a file descriptor (fd), it stores a pointer to the file in its fd
+/// table (`struct files_struct`). This pointer owns a reference count to the file, ensuring the
+/// file isn't prematurely deleted while the file descriptor is open. In Rust terminology, the
+/// pointers in `struct files_struct` are `ARef<File>` pointers.
+///
+/// ## Light refcounts
+///
+/// Whenever a process has an fd to a file, it may use something called a "light refcount" as a
+/// performance optimization. Light refcounts are acquired by calling `fdget` and released with
+/// `fdput`. The idea behind light refcounts is that if the fd is not closed between the calls to
+/// `fdget` and `fdput`, then the refcount cannot hit zero during that time, as the `struct
+/// files_struct` holds a reference until the fd is closed. This means that it's safe to access the
+/// file even if `fdget` does not increment the refcount.
+///
+/// The requirement that the fd is not closed during a light refcount applies globally across all
+/// threads - not just on the thread using the light refcount. For this reason, light refcounts are
+/// only used when the `struct files_struct` is not shared with other threads, since this ensures
+/// that other unrelated threads cannot suddenly start using the fd and close it. Therefore,
+/// calling `fdget` on a shared `struct files_struct` creates a normal refcount instead of a light
+/// refcount.
+///
+/// Light reference counts must be released with `fdput` before the system call returns to
+/// userspace. This means that if you wait until the current system call returns to userspace, then
+/// all light refcounts that existed at the time have gone away.
+///
+/// ### The file position
+///
+/// Each `struct file` has a position integer, which is protected by the `f_pos_lock` mutex.
+/// However, if the `struct file` is not shared, then the kernel may avoid taking the lock as a
+/// performance optimization.
+///
+/// The condition for avoiding the `f_pos_lock` mutex is different from the condition for using
+/// `fdget`. With `fdget`, you may avoid incrementing the refcount as long as the current fd table
+/// is not shared; it is okay if there are other fd tables that also reference the same `struct
+/// file`. However, `fdget_pos` can only avoid taking the `f_pos_lock` if the entire `struct file`
+/// is not shared, as different processes with an fd to the same `struct file` share the same
+/// position.
+///
+/// To represent files that are not thread safe due to this optimization, the [`LocalFile`] type is
+/// used.
+///
+/// ## Rust references
+///
+/// The reference type `&File` is similar to light refcounts:
+///
+/// * `&File` references don't own a reference count. They can only exist as long as the reference
+/// count stays positive, and can only be created when there is some mechanism in place to ensure
+/// this.
+///
+/// * The Rust borrow-checker normally ensures this by enforcing that the `ARef<File>` from which
+/// a `&File` is created outlives the `&File`.
+///
+/// * Using the unsafe [`File::from_raw_file`] means that it is up to the caller to ensure that the
+/// `&File` only exists while the reference count is positive.
+///
+/// * You can think of `fdget` as using an fd to look up an `ARef<File>` in the `struct
+/// files_struct` and create an `&File` from it. The "fd cannot be closed" rule is like the Rust
+/// rule "the `ARef<File>` must outlive the `&File`".
+///
+/// # Invariants
+///
+/// * All instances of this type are refcounted using the `f_count` field.
+/// * There must not be any active calls to `fdget_pos` on this file that did not take the
+/// `f_pos_lock` mutex.
+#[repr(transparent)]
+pub struct File {
+ inner: Opaque<bindings::file>,
+}
+
+// SAFETY: This file is known to not have any active `fdget_pos` calls that did not take the
+// `f_pos_lock` mutex, so it is safe to transfer it between threads.
+unsafe impl Send for File {}
+
+// SAFETY: This file is known to not have any active `fdget_pos` calls that did not take the
+// `f_pos_lock` mutex, so it is safe to access its methods from several threads in parallel.
+unsafe impl Sync for File {}
+
+// SAFETY: The type invariants guarantee that `File` is always ref-counted. This implementation
+// makes `ARef<File>` own a normal refcount.
+unsafe impl AlwaysRefCounted for File {
+ #[inline]
+ fn inc_ref(&self) {
+ // SAFETY: The existence of a shared reference means that the refcount is nonzero.
+ unsafe { bindings::get_file(self.as_ptr()) };
+ }
+
+ #[inline]
+ unsafe fn dec_ref(obj: ptr::NonNull<File>) {
+ // SAFETY: To call this method, the caller passes us ownership of a normal refcount, so we
+ // may drop it. The cast is okay since `File` has the same representation as `struct file`.
+ unsafe { bindings::fput(obj.cast().as_ptr()) }
+ }
+}
+
+/// Wraps the kernel's `struct file`. Not thread safe.
+///
+/// This type represents a file that is not known to be safe to transfer across thread boundaries.
+/// To obtain a thread-safe [`File`], use the [`assume_no_fdget_pos`] conversion.
+///
+/// See the documentation for [`File`] for more information.
+///
+/// # Invariants
+///
+/// * All instances of this type are refcounted using the `f_count` field.
+/// * If there is an active call to `fdget_pos` that did not take the `f_pos_lock` mutex, then it
+/// must be on the same thread as this file.
+///
+/// [`assume_no_fdget_pos`]: LocalFile::assume_no_fdget_pos
+pub struct LocalFile {
+ inner: Opaque<bindings::file>,
+}
+
+// SAFETY: The type invariants guarantee that `LocalFile` is always ref-counted. This implementation
+// makes `ARef<File>` own a normal refcount.
+unsafe impl AlwaysRefCounted for LocalFile {
+ #[inline]
+ fn inc_ref(&self) {
+ // SAFETY: The existence of a shared reference means that the refcount is nonzero.
+ unsafe { bindings::get_file(self.as_ptr()) };
+ }
+
+ #[inline]
+ unsafe fn dec_ref(obj: ptr::NonNull<LocalFile>) {
+ // SAFETY: To call this method, the caller passes us ownership of a normal refcount, so we
+ // may drop it. The cast is okay since `File` has the same representation as `struct file`.
+ unsafe { bindings::fput(obj.cast().as_ptr()) }
+ }
+}
+
+impl LocalFile {
+ /// Constructs a new `struct file` wrapper from a file descriptor.
+ ///
+ /// The file descriptor belongs to the current process, and there might be active local calls
+ /// to `fdget_pos` on the same file.
+ ///
+ /// To obtain an `ARef<File>`, use the [`assume_no_fdget_pos`] function to convert.
+ ///
+ /// [`assume_no_fdget_pos`]: LocalFile::assume_no_fdget_pos
+ #[inline]
+ pub fn fget(fd: u32) -> Result<ARef<LocalFile>, BadFdError> {
+ // SAFETY: FFI call, there are no requirements on `fd`.
+ let ptr = ptr::NonNull::new(unsafe { bindings::fget(fd) }).ok_or(BadFdError)?;
+
+ // SAFETY: `bindings::fget` created a refcount, and we pass ownership of it to the `ARef`.
+ //
+ // INVARIANT: This file is in the fd table on this thread, so either all `fdget_pos` calls
+ // are on this thread, or the file is shared, in which case `fdget_pos` calls took the
+ // `f_pos_lock` mutex.
+ Ok(unsafe { ARef::from_raw(ptr.cast()) })
+ }
+
+ /// Creates a reference to a [`LocalFile`] from a valid pointer.
+ ///
+ /// # Safety
+ ///
+ /// * The caller must ensure that `ptr` points at a valid file and that the file's refcount is
+ /// positive for the duration of 'a.
+ /// * The caller must ensure that if there is an active call to `fdget_pos` that did not take
+ /// the `f_pos_lock` mutex, then that call is on the current thread.
+ #[inline]
+ pub unsafe fn from_raw_file<'a>(ptr: *const bindings::file) -> &'a LocalFile {
+ // SAFETY: The caller guarantees that the pointer is not dangling and stays valid for the
+ // duration of 'a. The cast is okay because `File` is `repr(transparent)`.
+ //
+ // INVARIANT: The caller guarantees that there are no problematic `fdget_pos` calls.
+ unsafe { &*ptr.cast() }
+ }
+
+ /// Assume that there are no active `fdget_pos` calls that prevent us from sharing this file.
+ ///
+ /// This makes it safe to transfer this file to other threads. No checks are performed, and
+ /// using it incorrectly may lead to a data race on the file position if the file is shared
+ /// with another thread.
+ ///
+ /// This method is intended to be used together with [`LocalFile::fget`] when the caller knows
+ /// statically that there are no `fdget_pos` calls on the current thread. For example, you
+ /// might use it when calling `fget` from an ioctl, since ioctls usually do not touch the file
+ /// position.
+ ///
+ /// # Safety
+ ///
+ /// There must not be any active `fdget_pos` calls on the current thread.
+ #[inline]
+ pub unsafe fn assume_no_fdget_pos(me: ARef<LocalFile>) -> ARef<File> {
+ // INVARIANT: There are no `fdget_pos` calls on the current thread, and by the type
+ // invariants, if there is a `fdget_pos` call on another thread, then it took the
+ // `f_pos_lock` mutex.
+ //
+ // SAFETY: `LocalFile` and `File` have the same layout.
+ unsafe { ARef::from_raw(ARef::into_raw(me).cast()) }
+ }
+
+ /// Returns a raw pointer to the inner C struct.
+ #[inline]
+ pub fn as_ptr(&self) -> *mut bindings::file {
+ self.inner.get()
+ }
+
+ /// Returns the credentials of the task that originally opened the file.
+ pub fn cred(&self) -> &Credential {
+ // SAFETY: It's okay to read the `f_cred` field without synchronization because `f_cred` is
+ // never changed after initialization of the file.
+ let ptr = unsafe { (*self.as_ptr()).f_cred };
+
+ // SAFETY: The signature of this function ensures that the caller will only access the
+ // returned credential while the file is still valid, and the C side ensures that the
+ // credential stays valid at least as long as the file.
+ unsafe { Credential::from_ptr(ptr) }
+ }
+
+ /// Returns the flags associated with the file.
+ ///
+ /// The flags are a combination of the constants in [`flags`].
+ #[inline]
+ pub fn flags(&self) -> u32 {
+ // This `read_volatile` is intended to correspond to a READ_ONCE call.
+ //
+ // SAFETY: The file is valid because the shared reference guarantees a nonzero refcount.
+ //
+ // FIXME(read_once): Replace with `read_once` when available on the Rust side.
+ unsafe { core::ptr::addr_of!((*self.as_ptr()).f_flags).read_volatile() }
+ }
+}
+
+impl File {
+ /// Creates a reference to a [`File`] from a valid pointer.
+ ///
+ /// # Safety
+ ///
+ /// * The caller must ensure that `ptr` points at a valid file and that the file's refcount is
+ /// positive for the duration of 'a.
+ /// * The caller must ensure that if there are active `fdget_pos` calls on this file, then they
+ /// took the `f_pos_lock` mutex.
+ #[inline]
+ pub unsafe fn from_raw_file<'a>(ptr: *const bindings::file) -> &'a File {
+ // SAFETY: The caller guarantees that the pointer is not dangling and stays valid for the
+ // duration of 'a. The cast is okay because `File` is `repr(transparent)`.
+ //
+ // INVARIANT: The caller guarantees that there are no problematic `fdget_pos` calls.
+ unsafe { &*ptr.cast() }
+ }
+}
+
+// Make LocalFile methods available on File.
+impl core::ops::Deref for File {
+ type Target = LocalFile;
+ #[inline]
+ fn deref(&self) -> &LocalFile {
+ // SAFETY: The caller provides a `&File`, and since it is a reference, it must point at a
+ // valid file for the desired duration.
+ //
+ // By the type invariants, there are no `fdget_pos` calls that did not take the
+ // `f_pos_lock` mutex.
+ unsafe { LocalFile::from_raw_file(self as *const File as *const bindings::file) }
+ }
+}
+
+/// A file descriptor reservation.
+///
+/// This allows the creation of a file descriptor in two steps: first, we reserve a slot for it,
+/// then we commit or drop the reservation. The first step may fail (e.g., the current process ran
+/// out of available slots), but commit and drop never fail (and are mutually exclusive).
+///
+/// Dropping the reservation happens in the destructor of this type.
+///
+/// # Invariants
+///
+/// The fd stored in this struct must correspond to a reserved file descriptor of the current task.
+pub struct FileDescriptorReservation {
+ fd: u32,
+ /// Prevent values of this type from being moved to a different task.
+ ///
+ /// The `fd_install` and `put_unused_fd` functions assume that the value of `current` is
+ /// unchanged since the call to `get_unused_fd_flags`. By adding this marker to this type, we
+ /// prevent it from being moved across task boundaries, which ensures that `current` does not
+ /// change while this value exists.
+ _not_send: NotThreadSafe,
+}
+
+impl FileDescriptorReservation {
+ /// Creates a new file descriptor reservation.
+ pub fn get_unused_fd_flags(flags: u32) -> Result<Self> {
+ // SAFETY: FFI call, there are no safety requirements on `flags`.
+ let fd: i32 = unsafe { bindings::get_unused_fd_flags(flags) };
+ if fd < 0 {
+ return Err(Error::from_errno(fd));
+ }
+ Ok(Self {
+ fd: fd as u32,
+ _not_send: NotThreadSafe,
+ })
+ }
+
+ /// Returns the file descriptor number that was reserved.
+ pub fn reserved_fd(&self) -> u32 {
+ self.fd
+ }
+
+ /// Commits the reservation.
+ ///
+ /// The previously reserved file descriptor is bound to `file`. This method consumes the
+ /// [`FileDescriptorReservation`], so it will not be usable after this call.
+ pub fn fd_install(self, file: ARef<File>) {
+ // SAFETY: `self.fd` was previously returned by `get_unused_fd_flags`. We have not yet used
+ // the fd, so it is still valid, and `current` still refers to the same task, as this type
+ // cannot be moved across task boundaries.
+ //
+ // Furthermore, the file pointer is guaranteed to own a refcount by its type invariants,
+ // and we take ownership of that refcount by not running the destructor below.
+ // Additionally, the file is known to not have any non-shared `fdget_pos` calls, so even if
+ // this process starts using the file position, this will not result in a data race on the
+ // file position.
+ unsafe { bindings::fd_install(self.fd, file.as_ptr()) };
+
+ // `fd_install` consumes both the file descriptor and the file reference, so we cannot run
+ // the destructors.
+ core::mem::forget(self);
+ core::mem::forget(file);
+ }
+}
+
+impl Drop for FileDescriptorReservation {
+ fn drop(&mut self) {
+ // SAFETY: By the type invariants of this type, `self.fd` was previously returned by
+ // `get_unused_fd_flags`. We have not yet used the fd, so it is still valid, and `current`
+ // still refers to the same task, as this type cannot be moved across task boundaries.
+ unsafe { bindings::put_unused_fd(self.fd) };
+ }
+}
+
+/// Represents the `EBADF` error code.
+///
+/// Used for methods that can only fail with `EBADF`.
+#[derive(Copy, Clone, Eq, PartialEq)]
+pub struct BadFdError;
+
+impl From<BadFdError> for Error {
+ #[inline]
+ fn from(_: BadFdError) -> Error {
+ EBADF
+ }
+}
+
+impl core::fmt::Debug for BadFdError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ f.pad("EBADF")
+ }
+}
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index b5f4b3c..9843eed 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -30,10 +30,12 @@
#[cfg(CONFIG_BLOCK)]
pub mod block;
mod build_assert;
+pub mod cred;
pub mod device;
pub mod error;
#[cfg(CONFIG_RUST_FW_LOADER_ABSTRACTIONS)]
pub mod firmware;
+pub mod fs;
pub mod init;
pub mod ioctl;
#[cfg(CONFIG_KUNIT)]
@@ -45,6 +47,8 @@
pub mod prelude;
pub mod print;
pub mod rbtree;
+pub mod security;
+pub mod seq_file;
pub mod sizes;
mod static_assert;
#[doc(hidden)]
diff --git a/rust/kernel/security.rs b/rust/kernel/security.rs
new file mode 100644
index 0000000..2522868
--- /dev/null
+++ b/rust/kernel/security.rs
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2024 Google LLC.
+
+//! Linux Security Modules (LSM).
+//!
+//! C header: [`include/linux/security.h`](srctree/include/linux/security.h).
+
+use crate::{
+ bindings,
+ error::{to_result, Result},
+};
+
+/// A security context string.
+///
+/// # Invariants
+///
+/// The `secdata` and `seclen` fields correspond to a valid security context as returned by a
+/// successful call to `security_secid_to_secctx`, that has not yet been destroyed by calling
+/// `security_release_secctx`.
+pub struct SecurityCtx {
+ secdata: *mut core::ffi::c_char,
+ seclen: usize,
+}
+
+impl SecurityCtx {
+ /// Get the security context given its id.
+ pub fn from_secid(secid: u32) -> Result<Self> {
+ let mut secdata = core::ptr::null_mut();
+ let mut seclen = 0u32;
+ // SAFETY: Just a C FFI call. The pointers are valid for writes.
+ to_result(unsafe { bindings::security_secid_to_secctx(secid, &mut secdata, &mut seclen) })?;
+
+ // INVARIANT: If the above call did not fail, then we have a valid security context.
+ Ok(Self {
+ secdata,
+ seclen: seclen as usize,
+ })
+ }
+
+ /// Returns whether the security context is empty.
+ pub fn is_empty(&self) -> bool {
+ self.seclen == 0
+ }
+
+ /// Returns the length of this security context.
+ pub fn len(&self) -> usize {
+ self.seclen
+ }
+
+ /// Returns the bytes for this security context.
+ pub fn as_bytes(&self) -> &[u8] {
+ let ptr = self.secdata;
+ if ptr.is_null() {
+ debug_assert_eq!(self.seclen, 0);
+ // We can't pass a null pointer to `slice::from_raw_parts` even if the length is zero.
+ return &[];
+ }
+
+ // SAFETY: The call to `security_secid_to_secctx` guarantees that the pointer is valid for
+ // `seclen` bytes. Furthermore, if the length is zero, then we have ensured that the
+ // pointer is not null.
+ unsafe { core::slice::from_raw_parts(ptr.cast(), self.seclen) }
+ }
+}
+
+impl Drop for SecurityCtx {
+ fn drop(&mut self) {
+ // SAFETY: By the invariant of `Self`, this frees a pointer that came from a successful
+ // call to `security_secid_to_secctx` and has not yet been destroyed by
+ // `security_release_secctx`.
+ unsafe { bindings::security_release_secctx(self.secdata, self.seclen as u32) };
+ }
+}
diff --git a/rust/kernel/seq_file.rs b/rust/kernel/seq_file.rs
new file mode 100644
index 0000000..6ca29d5
--- /dev/null
+++ b/rust/kernel/seq_file.rs
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Seq file bindings.
+//!
+//! C header: [`include/linux/seq_file.h`](srctree/include/linux/seq_file.h)
+
+use crate::{bindings, c_str, types::NotThreadSafe, types::Opaque};
+
+/// A utility for generating the contents of a seq file.
+#[repr(transparent)]
+pub struct SeqFile {
+ inner: Opaque<bindings::seq_file>,
+ _not_send: NotThreadSafe,
+}
+
+impl SeqFile {
+ /// Creates a new [`SeqFile`] from a raw pointer.
+ ///
+ /// # Safety
+ ///
+ /// The caller must ensure that for the duration of 'a the following is satisfied:
+ /// * The pointer points at a valid `struct seq_file`.
+ /// * The `struct seq_file` is not accessed from any other thread.
+ pub unsafe fn from_raw<'a>(ptr: *mut bindings::seq_file) -> &'a SeqFile {
+ // SAFETY: The caller ensures that the reference is valid for 'a. There's no way to trigger
+ // a data race by using the `&SeqFile` since this is the only thread accessing the seq_file.
+ //
+ // CAST: The layout of `struct seq_file` and `SeqFile` is compatible.
+ unsafe { &*ptr.cast() }
+ }
+
+ /// Used by the [`seq_print`] macro.
+ pub fn call_printf(&self, args: core::fmt::Arguments<'_>) {
+ // SAFETY: Passing a void pointer to `Arguments` is valid for `%pA`.
+ unsafe {
+ bindings::seq_printf(
+ self.inner.get(),
+ c_str!("%pA").as_char_ptr(),
+ &args as *const _ as *const core::ffi::c_void,
+ );
+ }
+ }
+}
+
+/// Write to a [`SeqFile`] with the ordinary Rust formatting syntax.
+#[macro_export]
+macro_rules! seq_print {
+ ($m:expr, $($arg:tt)+) => (
+ $m.call_printf(format_args!($($arg)+))
+ );
+}
+pub use seq_print;
diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs
index 0ab2097..bae4a51 100644
--- a/rust/kernel/sync.rs
+++ b/rust/kernel/sync.rs
@@ -11,6 +11,7 @@
mod condvar;
pub mod lock;
mod locked_by;
+pub mod poll;
pub use arc::{Arc, ArcBorrow, UniqueArc};
pub use condvar::{new_condvar, CondVar, CondVarTimeoutResult};
diff --git a/rust/kernel/sync/lock.rs b/rust/kernel/sync/lock.rs
index f6c34ca..d6e9bab 100644
--- a/rust/kernel/sync/lock.rs
+++ b/rust/kernel/sync/lock.rs
@@ -6,8 +6,13 @@
//! spinlocks, raw spinlocks) to be provided with minimal effort.
use super::LockClassKey;
-use crate::{init::PinInit, pin_init, str::CStr, types::Opaque, types::ScopeGuard};
-use core::{cell::UnsafeCell, marker::PhantomData, marker::PhantomPinned};
+use crate::{
+ init::PinInit,
+ pin_init,
+ str::CStr,
+ types::{NotThreadSafe, Opaque, ScopeGuard},
+};
+use core::{cell::UnsafeCell, marker::PhantomPinned};
use macros::pin_data;
pub mod mutex;
@@ -139,7 +144,7 @@ pub fn lock(&self) -> Guard<'_, T, B> {
pub struct Guard<'a, T: ?Sized, B: Backend> {
pub(crate) lock: &'a Lock<T, B>,
pub(crate) state: B::GuardState,
- _not_send: PhantomData<*mut ()>,
+ _not_send: NotThreadSafe,
}
// SAFETY: `Guard` is sync when the data protected by the lock is also sync.
@@ -191,7 +196,7 @@ pub(crate) unsafe fn new(lock: &'a Lock<T, B>, state: B::GuardState) -> Self {
Self {
lock,
state,
- _not_send: PhantomData,
+ _not_send: NotThreadSafe,
}
}
}
diff --git a/rust/kernel/sync/poll.rs b/rust/kernel/sync/poll.rs
new file mode 100644
index 0000000..d5f1715
--- /dev/null
+++ b/rust/kernel/sync/poll.rs
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0
+
+// Copyright (C) 2024 Google LLC.
+
+//! Utilities for working with `struct poll_table`.
+
+use crate::{
+ bindings,
+ fs::File,
+ prelude::*,
+ sync::{CondVar, LockClassKey},
+ types::Opaque,
+};
+use core::ops::Deref;
+
+/// Creates a [`PollCondVar`] initialiser with the given name and a newly-created lock class.
+#[macro_export]
+macro_rules! new_poll_condvar {
+ ($($name:literal)?) => {
+ $crate::sync::poll::PollCondVar::new(
+ $crate::optional_name!($($name)?), $crate::static_lock_class!()
+ )
+ };
+}
+
+/// Wraps the kernel's `struct poll_table`.
+///
+/// # Invariants
+///
+/// This struct contains a valid `struct poll_table`.
+///
+/// For a `struct poll_table` to be valid, its `_qproc` function must follow the safety
+/// requirements of `_qproc` functions:
+///
+/// * The `_qproc` function is given permission to enqueue a waiter to the provided `poll_table`
+/// during the call. Once the waiter is removed and an rcu grace period has passed, it must no
+/// longer access the `wait_queue_head`.
+#[repr(transparent)]
+pub struct PollTable(Opaque<bindings::poll_table>);
+
+impl PollTable {
+ /// Creates a reference to a [`PollTable`] from a valid pointer.
+ ///
+ /// # Safety
+ ///
+ /// The caller must ensure that for the duration of 'a, the pointer will point at a valid poll
+ /// table (as defined in the type invariants).
+ ///
+ /// The caller must also ensure that the `poll_table` is only accessed via the returned
+ /// reference for the duration of 'a.
+ pub unsafe fn from_ptr<'a>(ptr: *mut bindings::poll_table) -> &'a mut PollTable {
+ // SAFETY: The safety requirements guarantee the validity of the dereference, while the
+ // `PollTable` type being transparent makes the cast ok.
+ unsafe { &mut *ptr.cast() }
+ }
+
+ fn get_qproc(&self) -> bindings::poll_queue_proc {
+ let ptr = self.0.get();
+ // SAFETY: The `ptr` is valid because it originates from a reference, and the `_qproc`
+ // field is not modified concurrently with this call since we have an immutable reference.
+ unsafe { (*ptr)._qproc }
+ }
+
+ /// Register this [`PollTable`] with the provided [`PollCondVar`], so that it can be notified
+ /// using the condition variable.
+ pub fn register_wait(&mut self, file: &File, cv: &PollCondVar) {
+ if let Some(qproc) = self.get_qproc() {
+ // SAFETY: The pointers to `file` and `self` need to be valid for the duration of this
+ // call to `qproc`, which they are because they are references.
+ //
+ // The `cv.wait_queue_head` pointer must be valid until an rcu grace period after the
+ // waiter is removed. The `PollCondVar` is pinned, so before `cv.wait_queue_head` can
+ // be destroyed, the destructor must run. That destructor first removes all waiters,
+ // and then waits for an rcu grace period. Therefore, `cv.wait_queue_head` is valid for
+ // long enough.
+ unsafe { qproc(file.as_ptr() as _, cv.wait_queue_head.get(), self.0.get()) };
+ }
+ }
+}
+
+/// A wrapper around [`CondVar`] that makes it usable with [`PollTable`].
+///
+/// [`CondVar`]: crate::sync::CondVar
+#[pin_data(PinnedDrop)]
+pub struct PollCondVar {
+ #[pin]
+ inner: CondVar,
+}
+
+impl PollCondVar {
+ /// Constructs a new condvar initialiser.
+ pub fn new(name: &'static CStr, key: &'static LockClassKey) -> impl PinInit<Self> {
+ pin_init!(Self {
+ inner <- CondVar::new(name, key),
+ })
+ }
+}
+
+// Make the `CondVar` methods callable on `PollCondVar`.
+impl Deref for PollCondVar {
+ type Target = CondVar;
+
+ fn deref(&self) -> &CondVar {
+ &self.inner
+ }
+}
+
+#[pinned_drop]
+impl PinnedDrop for PollCondVar {
+ fn drop(self: Pin<&mut Self>) {
+ // Clear anything registered using `register_wait`.
+ //
+ // SAFETY: The pointer points at a valid `wait_queue_head`.
+ unsafe { bindings::__wake_up_pollfree(self.inner.wait_queue_head.get()) };
+
+ // Wait for epoll items to be properly removed.
+ //
+ // SAFETY: Just an FFI call.
+ unsafe { bindings::synchronize_rcu() };
+ }
+}
diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs
index 55dff7e..0805990 100644
--- a/rust/kernel/task.rs
+++ b/rust/kernel/task.rs
@@ -4,10 +4,13 @@
//!
//! C header: [`include/linux/sched.h`](srctree/include/linux/sched.h).
-use crate::types::Opaque;
+use crate::{
+ bindings,
+ types::{NotThreadSafe, Opaque},
+};
use core::{
+ cmp::{Eq, PartialEq},
ffi::{c_int, c_long, c_uint},
- marker::PhantomData,
ops::Deref,
ptr,
};
@@ -94,7 +97,22 @@ unsafe impl Sync for Task {}
/// The type of process identifiers (PIDs).
type Pid = bindings::pid_t;
+/// The type of user identifiers (UIDs).
+#[derive(Copy, Clone)]
+pub struct Kuid {
+ kuid: bindings::kuid_t,
+}
+
impl Task {
+ /// Returns a raw pointer to the current task.
+ ///
+ /// It is up to the user to use the pointer correctly.
+ #[inline]
+ pub fn current_raw() -> *mut bindings::task_struct {
+ // SAFETY: Getting the current pointer is always safe.
+ unsafe { bindings::get_current() }
+ }
+
/// Returns a task reference for the currently executing task/thread.
///
/// The recommended way to get the current task/thread is to use the
@@ -106,7 +124,7 @@ impl Task {
pub unsafe fn current() -> impl Deref<Target = Task> {
struct TaskRef<'a> {
task: &'a Task,
- _not_send: PhantomData<*mut ()>,
+ _not_send: NotThreadSafe,
}
impl Deref for TaskRef<'_> {
@@ -117,23 +135,27 @@ fn deref(&self) -> &Self::Target {
}
}
- // SAFETY: Just an FFI call with no additional safety requirements.
- let ptr = unsafe { bindings::get_current() };
-
+ let current = Task::current_raw();
TaskRef {
// SAFETY: If the current thread is still running, the current task is valid. Given
// that `TaskRef` is not `Send`, we know it cannot be transferred to another thread
// (where it could potentially outlive the caller).
- task: unsafe { &*ptr.cast() },
- _not_send: PhantomData,
+ task: unsafe { &*current.cast() },
+ _not_send: NotThreadSafe,
}
}
+ /// Returns a raw pointer to the task.
+ #[inline]
+ pub fn as_ptr(&self) -> *mut bindings::task_struct {
+ self.0.get()
+ }
+
/// Returns the group leader of the given task.
pub fn group_leader(&self) -> &Task {
- // SAFETY: By the type invariant, we know that `self.0` is a valid task. Valid tasks always
- // have a valid `group_leader`.
- let ptr = unsafe { *ptr::addr_of!((*self.0.get()).group_leader) };
+ // SAFETY: The group leader of a task never changes after initialization, so reading this
+ // field is not a data race.
+ let ptr = unsafe { *ptr::addr_of!((*self.as_ptr()).group_leader) };
// SAFETY: The lifetime of the returned task reference is tied to the lifetime of `self`,
// and given that a task has a reference to its group leader, we know it must be valid for
@@ -143,23 +165,41 @@ pub fn group_leader(&self) -> &Task {
/// Returns the PID of the given task.
pub fn pid(&self) -> Pid {
- // SAFETY: By the type invariant, we know that `self.0` is a valid task. Valid tasks always
- // have a valid pid.
- unsafe { *ptr::addr_of!((*self.0.get()).pid) }
+ // SAFETY: The pid of a task never changes after initialization, so reading this field is
+ // not a data race.
+ unsafe { *ptr::addr_of!((*self.as_ptr()).pid) }
+ }
+
+ /// Returns the UID of the given task.
+ pub fn uid(&self) -> Kuid {
+ // SAFETY: It's always safe to call `task_uid` on a valid task.
+ Kuid::from_raw(unsafe { bindings::task_uid(self.as_ptr()) })
+ }
+
+ /// Returns the effective UID of the given task.
+ pub fn euid(&self) -> Kuid {
+ // SAFETY: It's always safe to call `task_euid` on a valid task.
+ Kuid::from_raw(unsafe { bindings::task_euid(self.as_ptr()) })
}
/// Determines whether the given task has pending signals.
pub fn signal_pending(&self) -> bool {
- // SAFETY: By the type invariant, we know that `self.0` is valid.
- unsafe { bindings::signal_pending(self.0.get()) != 0 }
+ // SAFETY: It's always safe to call `signal_pending` on a valid task.
+ unsafe { bindings::signal_pending(self.as_ptr()) != 0 }
+ }
+
+ /// Returns the given task's pid in the current pid namespace.
+ pub fn pid_in_current_ns(&self) -> Pid {
+ // SAFETY: It's valid to pass a null pointer as the namespace (defaults to current
+ // namespace). The task pointer is also valid.
+ unsafe { bindings::task_tgid_nr_ns(self.as_ptr(), ptr::null_mut()) }
}
/// Wakes up the task.
pub fn wake_up(&self) {
- // SAFETY: By the type invariant, we know that `self.0.get()` is non-null and valid.
- // And `wake_up_process` is safe to be called for any valid task, even if the task is
+ // SAFETY: It's always safe to call `signal_pending` on a valid task, even if the task
// running.
- unsafe { bindings::wake_up_process(self.0.get()) };
+ unsafe { bindings::wake_up_process(self.as_ptr()) };
}
}
@@ -167,7 +207,7 @@ pub fn wake_up(&self) {
unsafe impl crate::types::AlwaysRefCounted for Task {
fn inc_ref(&self) {
// SAFETY: The existence of a shared reference means that the refcount is nonzero.
- unsafe { bindings::get_task_struct(self.0.get()) };
+ unsafe { bindings::get_task_struct(self.as_ptr()) };
}
unsafe fn dec_ref(obj: ptr::NonNull<Self>) {
@@ -175,3 +215,43 @@ unsafe fn dec_ref(obj: ptr::NonNull<Self>) {
unsafe { bindings::put_task_struct(obj.cast().as_ptr()) }
}
}
+
+impl Kuid {
+ /// Get the current euid.
+ #[inline]
+ pub fn current_euid() -> Kuid {
+ // SAFETY: Just an FFI call.
+ Self::from_raw(unsafe { bindings::current_euid() })
+ }
+
+ /// Create a `Kuid` given the raw C type.
+ #[inline]
+ pub fn from_raw(kuid: bindings::kuid_t) -> Self {
+ Self { kuid }
+ }
+
+ /// Turn this kuid into the raw C type.
+ #[inline]
+ pub fn into_raw(self) -> bindings::kuid_t {
+ self.kuid
+ }
+
+ /// Converts this kernel UID into a userspace UID.
+ ///
+ /// Uses the namespace of the current task.
+ #[inline]
+ pub fn into_uid_in_current_ns(self) -> bindings::uid_t {
+ // SAFETY: Just an FFI call.
+ unsafe { bindings::from_kuid(bindings::current_user_ns(), self.kuid) }
+ }
+}
+
+impl PartialEq for Kuid {
+ #[inline]
+ fn eq(&self, other: &Kuid) -> bool {
+ // SAFETY: Just an FFI call.
+ unsafe { bindings::uid_eq(self.kuid, other.kuid) }
+ }
+}
+
+impl Eq for Kuid {}
diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs
index 9e7ca06..3238ffaa 100644
--- a/rust/kernel/types.rs
+++ b/rust/kernel/types.rs
@@ -532,3 +532,24 @@ unsafe impl AsBytes for str {}
// does not have any uninitialized portions either.
unsafe impl<T: AsBytes> AsBytes for [T] {}
unsafe impl<T: AsBytes, const N: usize> AsBytes for [T; N] {}
+
+/// Zero-sized type to mark types not [`Send`].
+///
+/// Add this type as a field to your struct if your type should not be sent to a different task.
+/// Since [`Send`] is an auto trait, adding a single field that is `!Send` will ensure that the
+/// whole type is `!Send`.
+///
+/// If a type is `!Send` it is impossible to give control over an instance of the type to another
+/// task. This is useful to include in types that store or reference task-local information. A file
+/// descriptor is an example of such task-local information.
+///
+/// This type also makes the type `!Sync`, which prevents immutable access to the value from
+/// several threads in parallel.
+pub type NotThreadSafe = PhantomData<*mut ()>;
+
+/// Used to construct instances of type [`NotThreadSafe`] similar to how `PhantomData` is
+/// constructed.
+///
+/// [`NotThreadSafe`]: type@NotThreadSafe
+#[allow(non_upper_case_globals)]
+pub const NotThreadSafe: NotThreadSafe = PhantomData;
diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c
index f847e83..57565dfd 100644
--- a/samples/landlock/sandboxer.c
+++ b/samples/landlock/sandboxer.c
@@ -60,6 +60,25 @@ static inline int landlock_restrict_self(const int ruleset_fd,
#define ENV_SCOPED_NAME "LL_SCOPED"
#define ENV_DELIMITER ":"
+static int str2num(const char *numstr, __u64 *num_dst)
+{
+ char *endptr = NULL;
+ int err = 0;
+ __u64 num;
+
+ errno = 0;
+ num = strtoull(numstr, &endptr, 10);
+ if (errno != 0)
+ err = errno;
+ /* Was the string empty, or not entirely parsed successfully? */
+ else if ((*numstr == '\0') || (*endptr != '\0'))
+ err = EINVAL;
+ else
+ *num_dst = num;
+
+ return err;
+}
+
static int parse_path(char *env_path, const char ***const path_list)
{
int i, num_paths = 0;
@@ -160,7 +179,6 @@ static int populate_ruleset_net(const char *const env_var, const int ruleset_fd,
char *env_port_name, *env_port_name_next, *strport;
struct landlock_net_port_attr net_port = {
.allowed_access = allowed_access,
- .port = 0,
};
env_port_name = getenv(env_var);
@@ -171,7 +189,17 @@ static int populate_ruleset_net(const char *const env_var, const int ruleset_fd,
env_port_name_next = env_port_name;
while ((strport = strsep(&env_port_name_next, ENV_DELIMITER))) {
- net_port.port = atoi(strport);
+ __u64 port;
+
+ if (strcmp(strport, "") == 0)
+ continue;
+
+ if (str2num(strport, &port)) {
+ fprintf(stderr, "Failed to parse port at \"%s\"\n",
+ strport);
+ goto out_free_name;
+ }
+ net_port.port = port;
if (landlock_add_rule(ruleset_fd, LANDLOCK_RULE_NET_PORT,
&net_port, 0)) {
fprintf(stderr,
@@ -262,6 +290,44 @@ static bool check_ruleset_scope(const char *const env_var,
#define LANDLOCK_ABI_LAST 6
+#define XSTR(s) #s
+#define STR(s) XSTR(s)
+
+/* clang-format off */
+
+static const char help[] =
+ "usage: " ENV_FS_RO_NAME "=\"...\" " ENV_FS_RW_NAME "=\"...\" "
+ "[other environment variables] %1$s <cmd> [args]...\n"
+ "\n"
+ "Execute the given command in a restricted environment.\n"
+ "Multi-valued settings (lists of ports, paths, scopes) are colon-delimited.\n"
+ "\n"
+ "Mandatory settings:\n"
+ "* " ENV_FS_RO_NAME ": paths allowed to be used in a read-only way\n"
+ "* " ENV_FS_RW_NAME ": paths allowed to be used in a read-write way\n"
+ "\n"
+ "Optional settings (when not set, their associated access check "
+ "is always allowed, which is different from an empty string which "
+ "means an empty list):\n"
+ "* " ENV_TCP_BIND_NAME ": ports allowed to bind (server)\n"
+ "* " ENV_TCP_CONNECT_NAME ": ports allowed to connect (client)\n"
+ "* " ENV_SCOPED_NAME ": actions denied on the outside of the landlock domain\n"
+ " - \"a\" to restrict opening abstract unix sockets\n"
+ " - \"s\" to restrict sending signals\n"
+ "\n"
+ "Example:\n"
+ ENV_FS_RO_NAME "=\"${PATH}:/lib:/usr:/proc:/etc:/dev/urandom\" "
+ ENV_FS_RW_NAME "=\"/dev/null:/dev/full:/dev/zero:/dev/pts:/tmp\" "
+ ENV_TCP_BIND_NAME "=\"9418\" "
+ ENV_TCP_CONNECT_NAME "=\"80:443\" "
+ ENV_SCOPED_NAME "=\"a:s\" "
+ "%1$s bash -i\n"
+ "\n"
+ "This sandboxer can use Landlock features up to ABI version "
+ STR(LANDLOCK_ABI_LAST) ".\n";
+
+/* clang-format on */
+
int main(const int argc, char *const argv[], char *const *const envp)
{
const char *cmd_path;
@@ -280,47 +346,7 @@ int main(const int argc, char *const argv[], char *const *const envp)
};
if (argc < 2) {
- fprintf(stderr,
- "usage: %s=\"...\" %s=\"...\" %s=\"...\" %s=\"...\" %s=\"...\" %s "
- "<cmd> [args]...\n\n",
- ENV_FS_RO_NAME, ENV_FS_RW_NAME, ENV_TCP_BIND_NAME,
- ENV_TCP_CONNECT_NAME, ENV_SCOPED_NAME, argv[0]);
- fprintf(stderr,
- "Execute a command in a restricted environment.\n\n");
- fprintf(stderr,
- "Environment variables containing paths and ports "
- "each separated by a colon:\n");
- fprintf(stderr,
- "* %s: list of paths allowed to be used in a read-only way.\n",
- ENV_FS_RO_NAME);
- fprintf(stderr,
- "* %s: list of paths allowed to be used in a read-write way.\n\n",
- ENV_FS_RW_NAME);
- fprintf(stderr,
- "Environment variables containing ports are optional "
- "and could be skipped.\n");
- fprintf(stderr,
- "* %s: list of ports allowed to bind (server).\n",
- ENV_TCP_BIND_NAME);
- fprintf(stderr,
- "* %s: list of ports allowed to connect (client).\n",
- ENV_TCP_CONNECT_NAME);
- fprintf(stderr, "* %s: list of scoped IPCs.\n",
- ENV_SCOPED_NAME);
- fprintf(stderr,
- "\nexample:\n"
- "%s=\"${PATH}:/lib:/usr:/proc:/etc:/dev/urandom\" "
- "%s=\"/dev/null:/dev/full:/dev/zero:/dev/pts:/tmp\" "
- "%s=\"9418\" "
- "%s=\"80:443\" "
- "%s=\"a:s\" "
- "%s bash -i\n\n",
- ENV_FS_RO_NAME, ENV_FS_RW_NAME, ENV_TCP_BIND_NAME,
- ENV_TCP_CONNECT_NAME, ENV_SCOPED_NAME, argv[0]);
- fprintf(stderr,
- "This sandboxer can use Landlock features "
- "up to ABI version %d.\n",
- LANDLOCK_ABI_LAST);
+ fprintf(stderr, help, argv[0]);
return 1;
}
diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh
index cdb9f497..66cb707 100755
--- a/samples/pktgen/pktgen_sample01_simple.sh
+++ b/samples/pktgen/pktgen_sample01_simple.sh
@@ -76,7 +76,7 @@
pg_set $DEV "udp_dst_max $UDP_DST_MAX"
fi
-[ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM"
+[ ! -z "$UDP_CSUM" ] && pg_set $DEV "flag UDPCSUM"
# Setup random UDP port src range
pg_set $DEV "flag UDPSRC_RND"
diff --git a/scripts/remove-stale-files b/scripts/remove-stale-files
index 8fc55a7..6e39fa8 100755
--- a/scripts/remove-stale-files
+++ b/scripts/remove-stale-files
@@ -20,6 +20,9 @@
# yard. Stale files stay in this file for a while (for some release cycles?),
# then will be really dead and removed from the code base entirely.
+# moved to security/selinux/genheaders
+rm -f scripts/selinux/genheaders/genheaders
+
rm -f *.spec
rm -f lib/test_fortify.log
diff --git a/scripts/selinux/Makefile b/scripts/selinux/Makefile
index 59494e1..4b1308f 100644
--- a/scripts/selinux/Makefile
+++ b/scripts/selinux/Makefile
@@ -1,2 +1,2 @@
# SPDX-License-Identifier: GPL-2.0-only
-subdir-y := mdp genheaders
+subdir-y := mdp
diff --git a/scripts/selinux/genheaders/.gitignore b/scripts/selinux/genheaders/.gitignore
deleted file mode 100644
index 5fcadd3..0000000
--- a/scripts/selinux/genheaders/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-genheaders
diff --git a/scripts/selinux/genheaders/Makefile b/scripts/selinux/genheaders/Makefile
deleted file mode 100644
index 1faf7f0..0000000
--- a/scripts/selinux/genheaders/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-hostprogs-always-y += genheaders
-HOST_EXTRACFLAGS += \
- -I$(srctree)/include/uapi -I$(srctree)/include \
- -I$(srctree)/security/selinux/include
diff --git a/scripts/selinux/mdp/Makefile b/scripts/selinux/mdp/Makefile
index d61058d..673782e 100644
--- a/scripts/selinux/mdp/Makefile
+++ b/scripts/selinux/mdp/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
hostprogs-always-y += mdp
HOST_EXTRACFLAGS += \
- -I$(srctree)/include/uapi -I$(srctree)/include \
+ -I$(srctree)/include \
-I$(srctree)/security/selinux/include -I$(objtree)/include
clean-files := policy.* file_contexts
diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c
index 1415604..5236592 100644
--- a/scripts/selinux/mdp/mdp.c
+++ b/scripts/selinux/mdp/mdp.c
@@ -11,10 +11,6 @@
* Authors: Serge E. Hallyn <serue@us.ibm.com>
*/
-
-/* NOTE: we really do want to use the kernel headers here */
-#define __EXPORTED_HEADERS__
-
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index 845e24e..ebbdb3c 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -403,3 +403,7 @@
460 common lsm_set_self_attr sys_lsm_set_self_attr
461 common lsm_list_modules sys_lsm_list_modules
462 common mseal sys_mseal
+463 common setxattrat sys_setxattrat
+464 common getxattrat sys_getxattrat
+465 common listxattrat sys_listxattrat
+466 common removexattrat sys_removexattrat
diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c
index 6b5181c..73087d7 100644
--- a/security/apparmor/audit.c
+++ b/security/apparmor/audit.c
@@ -264,13 +264,13 @@ int aa_audit_rule_known(struct audit_krule *rule)
return 0;
}
-int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
+int aa_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule)
{
struct aa_audit_rule *rule = vrule;
struct aa_label *label;
int found = 0;
- label = aa_secid_to_label(sid);
+ label = prop->apparmor.label;
if (!label)
return -ENOENT;
diff --git a/security/apparmor/domain.c b/security/apparmor/domain.c
index 571158e..2bc34dc 100644
--- a/security/apparmor/domain.c
+++ b/security/apparmor/domain.c
@@ -9,7 +9,6 @@
*/
#include <linux/errno.h>
-#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mount.h>
diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h
index 0c8cc86..e272293 100644
--- a/security/apparmor/include/audit.h
+++ b/security/apparmor/include/audit.h
@@ -202,6 +202,6 @@ static inline int complain_error(int error)
void aa_audit_rule_free(void *vrule);
int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp);
int aa_audit_rule_known(struct audit_krule *rule);
-int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule);
+int aa_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule);
#endif /* __AA_AUDIT_H */
diff --git a/security/apparmor/include/secid.h b/security/apparmor/include/secid.h
index a912a5d..cc6d1c9 100644
--- a/security/apparmor/include/secid.h
+++ b/security/apparmor/include/secid.h
@@ -26,6 +26,8 @@ extern int apparmor_display_secid_mode;
struct aa_label *aa_secid_to_label(u32 secid);
int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen);
+int apparmor_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
+ u32 *seclen);
int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid);
void apparmor_release_secctx(char *secdata, u32 seclen);
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index f5d0529..1edc128 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -982,17 +982,20 @@ static void apparmor_bprm_committed_creds(const struct linux_binprm *bprm)
return;
}
-static void apparmor_current_getsecid_subj(u32 *secid)
+static void apparmor_current_getlsmprop_subj(struct lsm_prop *prop)
{
struct aa_label *label = __begin_current_label_crit_section();
- *secid = label->secid;
+
+ prop->apparmor.label = label;
__end_current_label_crit_section(label);
}
-static void apparmor_task_getsecid_obj(struct task_struct *p, u32 *secid)
+static void apparmor_task_getlsmprop_obj(struct task_struct *p,
+ struct lsm_prop *prop)
{
struct aa_label *label = aa_get_task_label(p);
- *secid = label->secid;
+
+ prop->apparmor.label = label;
aa_put_label(label);
}
@@ -1503,8 +1506,9 @@ static struct security_hook_list apparmor_hooks[] __ro_after_init = {
LSM_HOOK_INIT(task_free, apparmor_task_free),
LSM_HOOK_INIT(task_alloc, apparmor_task_alloc),
- LSM_HOOK_INIT(current_getsecid_subj, apparmor_current_getsecid_subj),
- LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid_obj),
+ LSM_HOOK_INIT(current_getlsmprop_subj,
+ apparmor_current_getlsmprop_subj),
+ LSM_HOOK_INIT(task_getlsmprop_obj, apparmor_task_getlsmprop_obj),
LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit),
LSM_HOOK_INIT(task_kill, apparmor_task_kill),
LSM_HOOK_INIT(userns_create, apparmor_userns_create),
@@ -1517,6 +1521,7 @@ static struct security_hook_list apparmor_hooks[] __ro_after_init = {
#endif
LSM_HOOK_INIT(secid_to_secctx, apparmor_secid_to_secctx),
+ LSM_HOOK_INIT(lsmprop_to_secctx, apparmor_lsmprop_to_secctx),
LSM_HOOK_INIT(secctx_to_secid, apparmor_secctx_to_secid),
LSM_HOOK_INIT(release_secctx, apparmor_release_secctx),
diff --git a/security/apparmor/secid.c b/security/apparmor/secid.c
index 83d3d1e..6350d10 100644
--- a/security/apparmor/secid.c
+++ b/security/apparmor/secid.c
@@ -61,10 +61,10 @@ struct aa_label *aa_secid_to_label(u32 secid)
return xa_load(&aa_secids, secid);
}
-int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
+static int apparmor_label_to_secctx(struct aa_label *label, char **secdata,
+ u32 *seclen)
{
/* TODO: cache secctx and ref count so we don't have to recreate */
- struct aa_label *label = aa_secid_to_label(secid);
int flags = FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED | FLAG_ABS_ROOT;
int len;
@@ -90,6 +90,23 @@ int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
return 0;
}
+int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
+{
+ struct aa_label *label = aa_secid_to_label(secid);
+
+ return apparmor_label_to_secctx(label, secdata, seclen);
+}
+
+int apparmor_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
+ u32 *seclen)
+{
+ struct aa_label *label;
+
+ label = prop->apparmor.label;
+
+ return apparmor_label_to_secctx(label, secdata, seclen);
+}
+
int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
struct aa_label *label;
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 6924ed5..377e57e 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -1084,7 +1084,8 @@ static void evm_file_release(struct file *file)
if (!S_ISREG(inode->i_mode) || !(mode & FMODE_WRITE))
return;
- if (iint && atomic_read(&inode->i_writecount) == 1)
+ if (iint && iint->flags & EVM_NEW_FILE &&
+ atomic_read(&inode->i_writecount) == 1)
iint->flags &= ~EVM_NEW_FILE;
}
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 3c323ca..c0d3b71 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -369,7 +369,7 @@ static inline void ima_process_queued_keys(void) {}
/* LIM API function definitions */
int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
- const struct cred *cred, u32 secid, int mask,
+ const struct cred *cred, struct lsm_prop *prop, int mask,
enum ima_hooks func, int *pcr,
struct ima_template_desc **template_desc,
const char *func_data, unsigned int *allowed_algos);
@@ -400,8 +400,8 @@ const char *ima_d_path(const struct path *path, char **pathbuf, char *filename);
/* IMA policy related functions */
int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode,
- const struct cred *cred, u32 secid, enum ima_hooks func,
- int mask, int flags, int *pcr,
+ const struct cred *cred, struct lsm_prop *prop,
+ enum ima_hooks func, int mask, int flags, int *pcr,
struct ima_template_desc **template_desc,
const char *func_data, unsigned int *allowed_algos);
void ima_init_policy(void);
@@ -555,7 +555,7 @@ static inline void ima_filter_rule_free(void *lsmrule)
{
}
-static inline int ima_filter_rule_match(u32 secid, u32 field, u32 op,
+static inline int ima_filter_rule_match(struct lsm_prop *prop, u32 field, u32 op,
void *lsmrule)
{
return -EINVAL;
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index 984e861..c35ea61 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -165,7 +165,7 @@ void ima_add_violation(struct file *file, const unsigned char *filename,
* @idmap: idmap of the mount the inode was found from
* @inode: pointer to the inode associated with the object being validated
* @cred: pointer to credentials structure to validate
- * @secid: secid of the task being validated
+ * @prop: properties of the task being validated
* @mask: contains the permission mask (MAY_READ, MAY_WRITE, MAY_EXEC,
* MAY_APPEND)
* @func: caller identifier
@@ -187,7 +187,7 @@ void ima_add_violation(struct file *file, const unsigned char *filename,
*
*/
int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
- const struct cred *cred, u32 secid, int mask,
+ const struct cred *cred, struct lsm_prop *prop, int mask,
enum ima_hooks func, int *pcr,
struct ima_template_desc **template_desc,
const char *func_data, unsigned int *allowed_algos)
@@ -196,7 +196,7 @@ int ima_get_action(struct mnt_idmap *idmap, struct inode *inode,
flags &= ima_policy_flag;
- return ima_match_policy(idmap, inode, cred, secid, func, mask,
+ return ima_match_policy(idmap, inode, cred, prop, func, mask,
flags, pcr, template_desc, func_data,
allowed_algos);
}
diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index 656c709..884a353 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -73,13 +73,13 @@ bool is_ima_appraise_enabled(void)
int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode,
int mask, enum ima_hooks func)
{
- u32 secid;
+ struct lsm_prop prop;
if (!ima_appraise)
return 0;
- security_current_getsecid_subj(&secid);
- return ima_match_policy(idmap, inode, current_cred(), secid,
+ security_current_getlsmprop_subj(&prop);
+ return ima_match_policy(idmap, inode, current_cred(), &prop,
func, mask, IMA_APPRAISE | IMA_HASH, NULL,
NULL, NULL, NULL);
}
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 06132cf..9b87556 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -206,8 +206,8 @@ static void ima_file_free(struct file *file)
}
static int process_measurement(struct file *file, const struct cred *cred,
- u32 secid, char *buf, loff_t size, int mask,
- enum ima_hooks func)
+ struct lsm_prop *prop, char *buf, loff_t size,
+ int mask, enum ima_hooks func)
{
struct inode *real_inode, *inode = file_inode(file);
struct ima_iint_cache *iint = NULL;
@@ -232,7 +232,7 @@ static int process_measurement(struct file *file, const struct cred *cred,
* bitmask based on the appraise/audit/measurement policy.
* Included is the appraise submask.
*/
- action = ima_get_action(file_mnt_idmap(file), inode, cred, secid,
+ action = ima_get_action(file_mnt_idmap(file), inode, cred, prop,
mask, func, &pcr, &template_desc, NULL,
&allowed_algos);
violation_check = ((func == FILE_CHECK || func == MMAP_CHECK ||
@@ -443,23 +443,23 @@ static int process_measurement(struct file *file, const struct cred *cred,
static int ima_file_mmap(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags)
{
- u32 secid;
+ struct lsm_prop prop;
int ret;
if (!file)
return 0;
- security_current_getsecid_subj(&secid);
+ security_current_getlsmprop_subj(&prop);
if (reqprot & PROT_EXEC) {
- ret = process_measurement(file, current_cred(), secid, NULL,
+ ret = process_measurement(file, current_cred(), &prop, NULL,
0, MAY_EXEC, MMAP_CHECK_REQPROT);
if (ret)
return ret;
}
if (prot & PROT_EXEC)
- return process_measurement(file, current_cred(), secid, NULL,
+ return process_measurement(file, current_cred(), &prop, NULL,
0, MAY_EXEC, MMAP_CHECK);
return 0;
@@ -488,9 +488,9 @@ static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
char *pathbuf = NULL;
const char *pathname = NULL;
struct inode *inode;
+ struct lsm_prop prop;
int result = 0;
int action;
- u32 secid;
int pcr;
/* Is mprotect making an mmap'ed file executable? */
@@ -498,13 +498,13 @@ static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
!(prot & PROT_EXEC) || (vma->vm_flags & VM_EXEC))
return 0;
- security_current_getsecid_subj(&secid);
+ security_current_getlsmprop_subj(&prop);
inode = file_inode(vma->vm_file);
action = ima_get_action(file_mnt_idmap(vma->vm_file), inode,
- current_cred(), secid, MAY_EXEC, MMAP_CHECK,
+ current_cred(), &prop, MAY_EXEC, MMAP_CHECK,
&pcr, &template, NULL, NULL);
action |= ima_get_action(file_mnt_idmap(vma->vm_file), inode,
- current_cred(), secid, MAY_EXEC,
+ current_cred(), &prop, MAY_EXEC,
MMAP_CHECK_REQPROT, &pcr, &template, NULL,
NULL);
@@ -541,16 +541,16 @@ static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
static int ima_bprm_check(struct linux_binprm *bprm)
{
int ret;
- u32 secid;
+ struct lsm_prop prop;
- security_current_getsecid_subj(&secid);
- ret = process_measurement(bprm->file, current_cred(), secid, NULL, 0,
- MAY_EXEC, BPRM_CHECK);
+ security_current_getlsmprop_subj(&prop);
+ ret = process_measurement(bprm->file, current_cred(),
+ &prop, NULL, 0, MAY_EXEC, BPRM_CHECK);
if (ret)
return ret;
- security_cred_getsecid(bprm->cred, &secid);
- return process_measurement(bprm->file, bprm->cred, secid, NULL, 0,
+ security_cred_getlsmprop(bprm->cred, &prop);
+ return process_measurement(bprm->file, bprm->cred, &prop, NULL, 0,
MAY_EXEC, CREDS_CHECK);
}
@@ -566,10 +566,10 @@ static int ima_bprm_check(struct linux_binprm *bprm)
*/
static int ima_file_check(struct file *file, int mask)
{
- u32 secid;
+ struct lsm_prop prop;
- security_current_getsecid_subj(&secid);
- return process_measurement(file, current_cred(), secid, NULL, 0,
+ security_current_getlsmprop_subj(&prop);
+ return process_measurement(file, current_cred(), &prop, NULL, 0,
mask & (MAY_READ | MAY_WRITE | MAY_EXEC |
MAY_APPEND), FILE_CHECK);
}
@@ -768,7 +768,7 @@ static int ima_read_file(struct file *file, enum kernel_read_file_id read_id,
bool contents)
{
enum ima_hooks func;
- u32 secid;
+ struct lsm_prop prop;
/*
* Do devices using pre-allocated memory run the risk of the
@@ -788,9 +788,9 @@ static int ima_read_file(struct file *file, enum kernel_read_file_id read_id,
/* Read entire file for all partial reads. */
func = read_idmap[read_id] ?: FILE_CHECK;
- security_current_getsecid_subj(&secid);
- return process_measurement(file, current_cred(), secid, NULL,
- 0, MAY_READ, func);
+ security_current_getlsmprop_subj(&prop);
+ return process_measurement(file, current_cred(), &prop, NULL, 0,
+ MAY_READ, func);
}
const int read_idmap[READING_MAX_ID] = {
@@ -818,7 +818,7 @@ static int ima_post_read_file(struct file *file, char *buf, loff_t size,
enum kernel_read_file_id read_id)
{
enum ima_hooks func;
- u32 secid;
+ struct lsm_prop prop;
/* permit signed certs */
if (!file && read_id == READING_X509_CERTIFICATE)
@@ -831,8 +831,8 @@ static int ima_post_read_file(struct file *file, char *buf, loff_t size,
}
func = read_idmap[read_id] ?: FILE_CHECK;
- security_current_getsecid_subj(&secid);
- return process_measurement(file, current_cred(), secid, buf, size,
+ security_current_getlsmprop_subj(&prop);
+ return process_measurement(file, current_cred(), &prop, buf, size,
MAY_READ, func);
}
@@ -967,7 +967,7 @@ int process_buffer_measurement(struct mnt_idmap *idmap,
int digest_hash_len = hash_digest_size[ima_hash_algo];
int violation = 0;
int action = 0;
- u32 secid;
+ struct lsm_prop prop;
if (digest && digest_len < digest_hash_len)
return -EINVAL;
@@ -990,9 +990,9 @@ int process_buffer_measurement(struct mnt_idmap *idmap,
* buffer measurements.
*/
if (func) {
- security_current_getsecid_subj(&secid);
+ security_current_getlsmprop_subj(&prop);
action = ima_get_action(idmap, inode, current_cred(),
- secid, 0, func, &pcr, &template,
+ &prop, 0, func, &pcr, &template,
func_data, NULL);
if (!(action & IMA_MEASURE) && !digest)
return -ENOENT;
@@ -1062,19 +1062,16 @@ int process_buffer_measurement(struct mnt_idmap *idmap,
*/
void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
{
- struct fd f;
-
if (!buf || !size)
return;
- f = fdget(kernel_fd);
- if (!fd_file(f))
+ CLASS(fd, f)(kernel_fd);
+ if (fd_empty(f))
return;
process_buffer_measurement(file_mnt_idmap(fd_file(f)), file_inode(fd_file(f)),
buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0,
NULL, false, NULL, 0);
- fdput(f);
}
/**
@@ -1114,7 +1111,7 @@ EXPORT_SYMBOL_GPL(ima_measure_critical_data);
#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS
/**
- * ima_kernel_module_request - Prevent crypto-pkcs1pad(rsa,*) requests
+ * ima_kernel_module_request - Prevent crypto-pkcs1(rsa,*) requests
* @kmod_name: kernel module name
*
* Avoid a verification loop where verifying the signature of the modprobe
@@ -1128,7 +1125,7 @@ EXPORT_SYMBOL_GPL(ima_measure_critical_data);
* algorithm on the fly, but crypto_larval_lookup() will try to use alg_name
* in order to load a kernel module with same name.
*
- * Since we don't have any real "crypto-pkcs1pad(rsa,*)" kernel modules,
+ * Since we don't have any real "crypto-pkcs1(rsa,*)" kernel modules,
* we are safe to fail such module request from crypto_larval_lookup(), and
* avoid the verification loop.
*
@@ -1136,7 +1133,7 @@ EXPORT_SYMBOL_GPL(ima_measure_critical_data);
*/
static int ima_kernel_module_request(char *kmod_name)
{
- if (strncmp(kmod_name, "crypto-pkcs1pad(rsa,", 20) == 0)
+ if (strncmp(kmod_name, "crypto-pkcs1(rsa,", 17) == 0)
return -EINVAL;
return 0;
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 09da8e6..dbfd554 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -557,7 +557,7 @@ static bool ima_match_rule_data(struct ima_rule_entry *rule,
* @idmap: idmap of the mount the inode was found from
* @inode: a pointer to an inode
* @cred: a pointer to a credentials structure for user validation
- * @secid: the secid of the task to be validated
+ * @prop: LSM properties of the task to be validated
* @func: LIM hook identifier
* @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
* @func_data: func specific data, may be NULL
@@ -567,7 +567,7 @@ static bool ima_match_rule_data(struct ima_rule_entry *rule,
static bool ima_match_rules(struct ima_rule_entry *rule,
struct mnt_idmap *idmap,
struct inode *inode, const struct cred *cred,
- u32 secid, enum ima_hooks func, int mask,
+ struct lsm_prop *prop, enum ima_hooks func, int mask,
const char *func_data)
{
int i;
@@ -635,7 +635,7 @@ static bool ima_match_rules(struct ima_rule_entry *rule,
return false;
for (i = 0; i < MAX_LSM_RULES; i++) {
int rc = 0;
- u32 osid;
+ struct lsm_prop prop = { };
if (!lsm_rule->lsm[i].rule) {
if (!lsm_rule->lsm[i].args_p)
@@ -649,15 +649,15 @@ static bool ima_match_rules(struct ima_rule_entry *rule,
case LSM_OBJ_USER:
case LSM_OBJ_ROLE:
case LSM_OBJ_TYPE:
- security_inode_getsecid(inode, &osid);
- rc = ima_filter_rule_match(osid, lsm_rule->lsm[i].type,
+ security_inode_getlsmprop(inode, &prop);
+ rc = ima_filter_rule_match(&prop, lsm_rule->lsm[i].type,
Audit_equal,
lsm_rule->lsm[i].rule);
break;
case LSM_SUBJ_USER:
case LSM_SUBJ_ROLE:
case LSM_SUBJ_TYPE:
- rc = ima_filter_rule_match(secid, lsm_rule->lsm[i].type,
+ rc = ima_filter_rule_match(&prop, lsm_rule->lsm[i].type,
Audit_equal,
lsm_rule->lsm[i].rule);
break;
@@ -720,7 +720,7 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
* @inode: pointer to an inode for which the policy decision is being made
* @cred: pointer to a credentials structure for which the policy decision is
* being made
- * @secid: LSM secid of the task to be validated
+ * @prop: LSM properties of the task to be validated
* @func: IMA hook identifier
* @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC)
* @flags: IMA actions to consider (e.g. IMA_MEASURE | IMA_APPRAISE)
@@ -737,8 +737,8 @@ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func)
* than writes so ima_match_policy() is classical RCU candidate.
*/
int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode,
- const struct cred *cred, u32 secid, enum ima_hooks func,
- int mask, int flags, int *pcr,
+ const struct cred *cred, struct lsm_prop *prop,
+ enum ima_hooks func, int mask, int flags, int *pcr,
struct ima_template_desc **template_desc,
const char *func_data, unsigned int *allowed_algos)
{
@@ -756,7 +756,7 @@ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode,
if (!(entry->action & actmask))
continue;
- if (!ima_match_rules(entry, idmap, inode, cred, secid,
+ if (!ima_match_rules(entry, idmap, inode, cred, prop,
func, mask, func_data))
continue;
diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c
index 4183956..0e627ea 100644
--- a/security/integrity/ima/ima_template_lib.c
+++ b/security/integrity/ima/ima_template_lib.c
@@ -318,15 +318,21 @@ static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize,
hash_algo_name[hash_algo]);
}
- if (digest)
+ if (digest) {
memcpy(buffer + offset, digest, digestsize);
- else
+ } else {
/*
* If digest is NULL, the event being recorded is a violation.
* Make room for the digest by increasing the offset by the
- * hash algorithm digest size.
+ * hash algorithm digest size. If the hash algorithm is not
+ * specified increase the offset by IMA_DIGEST_SIZE which
+ * fits SHA1 or MD5
*/
- offset += hash_digest_size[hash_algo];
+ if (hash_algo < HASH_ALGO__LAST)
+ offset += hash_digest_size[hash_algo];
+ else
+ offset += IMA_DIGEST_SIZE;
+ }
return ima_write_template_field_data(buffer, offset + digestsize,
fmt, field_data);
diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h
index 660f76c..c2c2da6 100644
--- a/security/integrity/integrity.h
+++ b/security/integrity/integrity.h
@@ -37,6 +37,8 @@ struct evm_ima_xattr_data {
);
u8 data[];
} __packed;
+static_assert(offsetof(struct evm_ima_xattr_data, data) == sizeof(struct evm_ima_xattr_data_hdr),
+ "struct member likely outside of __struct_group()");
/* Only used in the EVM HMAC code. */
struct evm_xattr {
@@ -65,6 +67,8 @@ struct ima_digest_data {
);
u8 digest[];
} __packed;
+static_assert(offsetof(struct ima_digest_data, digest) == sizeof(struct ima_digest_data_hdr),
+ "struct member likely outside of __struct_group()");
/*
* Instead of wrapping the ima_digest_data struct inside a local structure
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index 7d79fc8..e31b97a 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -389,37 +389,21 @@ static bool is_nouser_or_private(const struct dentry *dentry)
}
static access_mask_t
-get_raw_handled_fs_accesses(const struct landlock_ruleset *const domain)
-{
- access_mask_t access_dom = 0;
- size_t layer_level;
-
- for (layer_level = 0; layer_level < domain->num_layers; layer_level++)
- access_dom |=
- landlock_get_raw_fs_access_mask(domain, layer_level);
- return access_dom;
-}
-
-static access_mask_t
get_handled_fs_accesses(const struct landlock_ruleset *const domain)
{
/* Handles all initially denied by default access rights. */
- return get_raw_handled_fs_accesses(domain) |
+ return landlock_union_access_masks(domain).fs |
LANDLOCK_ACCESS_FS_INITIALLY_DENIED;
}
-static const struct landlock_ruleset *
-get_fs_domain(const struct landlock_ruleset *const domain)
-{
- if (!domain || !get_raw_handled_fs_accesses(domain))
- return NULL;
-
- return domain;
-}
+static const struct access_masks any_fs = {
+ .fs = ~0,
+};
static const struct landlock_ruleset *get_current_fs_domain(void)
{
- return get_fs_domain(landlock_get_current_domain());
+ return landlock_get_applicable_domain(landlock_get_current_domain(),
+ any_fs);
}
/*
@@ -1517,7 +1501,8 @@ static int hook_file_open(struct file *const file)
access_mask_t open_access_request, full_access_request, allowed_access,
optional_access;
const struct landlock_ruleset *const dom =
- get_fs_domain(landlock_cred(file->f_cred)->domain);
+ landlock_get_applicable_domain(
+ landlock_cred(file->f_cred)->domain, any_fs);
if (!dom)
return 0;
diff --git a/security/landlock/net.c b/security/landlock/net.c
index c8bcd29..d5dcc440 100644
--- a/security/landlock/net.c
+++ b/security/landlock/net.c
@@ -39,27 +39,9 @@ int landlock_append_net_rule(struct landlock_ruleset *const ruleset,
return err;
}
-static access_mask_t
-get_raw_handled_net_accesses(const struct landlock_ruleset *const domain)
-{
- access_mask_t access_dom = 0;
- size_t layer_level;
-
- for (layer_level = 0; layer_level < domain->num_layers; layer_level++)
- access_dom |= landlock_get_net_access_mask(domain, layer_level);
- return access_dom;
-}
-
-static const struct landlock_ruleset *get_current_net_domain(void)
-{
- const struct landlock_ruleset *const dom =
- landlock_get_current_domain();
-
- if (!dom || !get_raw_handled_net_accesses(dom))
- return NULL;
-
- return dom;
-}
+static const struct access_masks any_net = {
+ .net = ~0,
+};
static int current_check_access_socket(struct socket *const sock,
struct sockaddr *const address,
@@ -72,7 +54,9 @@ static int current_check_access_socket(struct socket *const sock,
struct landlock_id id = {
.type = LANDLOCK_KEY_NET_PORT,
};
- const struct landlock_ruleset *const dom = get_current_net_domain();
+ const struct landlock_ruleset *const dom =
+ landlock_get_applicable_domain(landlock_get_current_domain(),
+ any_net);
if (!dom)
return 0;
diff --git a/security/landlock/ruleset.h b/security/landlock/ruleset.h
index 61bdbc5..631e24d 100644
--- a/security/landlock/ruleset.h
+++ b/security/landlock/ruleset.h
@@ -11,6 +11,7 @@
#include <linux/bitops.h>
#include <linux/build_bug.h>
+#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/refcount.h>
@@ -47,6 +48,15 @@ struct access_masks {
access_mask_t scope : LANDLOCK_NUM_SCOPE;
};
+union access_masks_all {
+ struct access_masks masks;
+ u32 all;
+};
+
+/* Makes sure all fields are covered. */
+static_assert(sizeof(typeof_member(union access_masks_all, masks)) ==
+ sizeof(typeof_member(union access_masks_all, all)));
+
typedef u16 layer_mask_t;
/* Makes sure all layers can be checked. */
static_assert(BITS_PER_TYPE(layer_mask_t) >= LANDLOCK_MAX_NUM_LAYERS);
@@ -260,6 +270,61 @@ static inline void landlock_get_ruleset(struct landlock_ruleset *const ruleset)
refcount_inc(&ruleset->usage);
}
+/**
+ * landlock_union_access_masks - Return all access rights handled in the
+ * domain
+ *
+ * @domain: Landlock ruleset (used as a domain)
+ *
+ * Returns: an access_masks result of the OR of all the domain's access masks.
+ */
+static inline struct access_masks
+landlock_union_access_masks(const struct landlock_ruleset *const domain)
+{
+ union access_masks_all matches = {};
+ size_t layer_level;
+
+ for (layer_level = 0; layer_level < domain->num_layers; layer_level++) {
+ union access_masks_all layer = {
+ .masks = domain->access_masks[layer_level],
+ };
+
+ matches.all |= layer.all;
+ }
+
+ return matches.masks;
+}
+
+/**
+ * landlock_get_applicable_domain - Return @domain if it applies to (handles)
+ * at least one of the access rights specified
+ * in @masks
+ *
+ * @domain: Landlock ruleset (used as a domain)
+ * @masks: access masks
+ *
+ * Returns: @domain if any access rights specified in @masks is handled, or
+ * NULL otherwise.
+ */
+static inline const struct landlock_ruleset *
+landlock_get_applicable_domain(const struct landlock_ruleset *const domain,
+ const struct access_masks masks)
+{
+ const union access_masks_all masks_all = {
+ .masks = masks,
+ };
+ union access_masks_all merge = {};
+
+ if (!domain)
+ return NULL;
+
+ merge.masks = landlock_union_access_masks(domain);
+ if (merge.all & masks_all.all)
+ return domain;
+
+ return NULL;
+}
+
static inline void
landlock_add_fs_access_mask(struct landlock_ruleset *const ruleset,
const access_mask_t fs_access_mask,
@@ -296,18 +361,11 @@ landlock_add_scope_mask(struct landlock_ruleset *const ruleset,
}
static inline access_mask_t
-landlock_get_raw_fs_access_mask(const struct landlock_ruleset *const ruleset,
- const u16 layer_level)
-{
- return ruleset->access_masks[layer_level].fs;
-}
-
-static inline access_mask_t
landlock_get_fs_access_mask(const struct landlock_ruleset *const ruleset,
const u16 layer_level)
{
/* Handles all initially denied by default access rights. */
- return landlock_get_raw_fs_access_mask(ruleset, layer_level) |
+ return ruleset->access_masks[layer_level].fs |
LANDLOCK_ACCESS_FS_INITIALLY_DENIED;
}
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index f5a0e71..4ed8e70 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -241,31 +241,21 @@ SYSCALL_DEFINE3(landlock_create_ruleset,
static struct landlock_ruleset *get_ruleset_from_fd(const int fd,
const fmode_t mode)
{
- struct fd ruleset_f;
+ CLASS(fd, ruleset_f)(fd);
struct landlock_ruleset *ruleset;
- ruleset_f = fdget(fd);
- if (!fd_file(ruleset_f))
+ if (fd_empty(ruleset_f))
return ERR_PTR(-EBADF);
/* Checks FD type and access right. */
- if (fd_file(ruleset_f)->f_op != &ruleset_fops) {
- ruleset = ERR_PTR(-EBADFD);
- goto out_fdput;
- }
- if (!(fd_file(ruleset_f)->f_mode & mode)) {
- ruleset = ERR_PTR(-EPERM);
- goto out_fdput;
- }
+ if (fd_file(ruleset_f)->f_op != &ruleset_fops)
+ return ERR_PTR(-EBADFD);
+ if (!(fd_file(ruleset_f)->f_mode & mode))
+ return ERR_PTR(-EPERM);
ruleset = fd_file(ruleset_f)->private_data;
- if (WARN_ON_ONCE(ruleset->num_layers != 1)) {
- ruleset = ERR_PTR(-EINVAL);
- goto out_fdput;
- }
+ if (WARN_ON_ONCE(ruleset->num_layers != 1))
+ return ERR_PTR(-EINVAL);
landlock_get_ruleset(ruleset);
-
-out_fdput:
- fdput(ruleset_f);
return ruleset;
}
@@ -276,15 +266,12 @@ static struct landlock_ruleset *get_ruleset_from_fd(const int fd,
*/
static int get_path_from_fd(const s32 fd, struct path *const path)
{
- struct fd f;
- int err = 0;
+ CLASS(fd_raw, f)(fd);
BUILD_BUG_ON(!__same_type(
fd, ((struct landlock_path_beneath_attr *)NULL)->parent_fd));
- /* Handles O_PATH. */
- f = fdget_raw(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
/*
* Forbids ruleset FDs, internal filesystems (e.g. nsfs), including
@@ -295,16 +282,12 @@ static int get_path_from_fd(const s32 fd, struct path *const path)
(fd_file(f)->f_path.mnt->mnt_flags & MNT_INTERNAL) ||
(fd_file(f)->f_path.dentry->d_sb->s_flags & SB_NOUSER) ||
d_is_negative(fd_file(f)->f_path.dentry) ||
- IS_PRIVATE(d_backing_inode(fd_file(f)->f_path.dentry))) {
- err = -EBADFD;
- goto out_fdput;
- }
+ IS_PRIVATE(d_backing_inode(fd_file(f)->f_path.dentry)))
+ return -EBADFD;
+
*path = fd_file(f)->f_path;
path_get(path);
-
-out_fdput:
- fdput(f);
- return err;
+ return 0;
}
static int add_rule_path_beneath(struct landlock_ruleset *const ruleset,
@@ -329,7 +312,7 @@ static int add_rule_path_beneath(struct landlock_ruleset *const ruleset,
return -ENOMSG;
/* Checks that allowed_access matches the @ruleset constraints. */
- mask = landlock_get_raw_fs_access_mask(ruleset, 0);
+ mask = ruleset->access_masks[0].fs;
if ((path_beneath_attr.allowed_access | mask) != mask)
return -EINVAL;
diff --git a/security/landlock/task.c b/security/landlock/task.c
index 4acbd7c..dc7dab7 100644
--- a/security/landlock/task.c
+++ b/security/landlock/task.c
@@ -204,12 +204,17 @@ static bool is_abstract_socket(struct sock *const sock)
return false;
}
+static const struct access_masks unix_scope = {
+ .scope = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET,
+};
+
static int hook_unix_stream_connect(struct sock *const sock,
struct sock *const other,
struct sock *const newsk)
{
const struct landlock_ruleset *const dom =
- landlock_get_current_domain();
+ landlock_get_applicable_domain(landlock_get_current_domain(),
+ unix_scope);
/* Quick return for non-landlocked tasks. */
if (!dom)
@@ -225,7 +230,8 @@ static int hook_unix_may_send(struct socket *const sock,
struct socket *const other)
{
const struct landlock_ruleset *const dom =
- landlock_get_current_domain();
+ landlock_get_applicable_domain(landlock_get_current_domain(),
+ unix_scope);
if (!dom)
return 0;
@@ -243,6 +249,10 @@ static int hook_unix_may_send(struct socket *const sock,
return 0;
}
+static const struct access_masks signal_scope = {
+ .scope = LANDLOCK_SCOPE_SIGNAL,
+};
+
static int hook_task_kill(struct task_struct *const p,
struct kernel_siginfo *const info, const int sig,
const struct cred *const cred)
@@ -256,6 +266,7 @@ static int hook_task_kill(struct task_struct *const p,
} else {
dom = landlock_get_current_domain();
}
+ dom = landlock_get_applicable_domain(dom, signal_scope);
/* Quick return for non-landlocked tasks. */
if (!dom)
@@ -279,7 +290,8 @@ static int hook_file_send_sigiotask(struct task_struct *tsk,
/* Lock already held by send_sigio() and send_sigurg(). */
lockdep_assert_held(&fown->lock);
- dom = landlock_file(fown->file)->fown_domain;
+ dom = landlock_get_applicable_domain(
+ landlock_file(fown->file)->fown_domain, signal_scope);
/* Quick return for unowned socket. */
if (!dom)
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index 02144ec..6825245 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -283,7 +283,6 @@ enum loadpin_securityfs_interface_index {
static int read_trusted_verity_root_digests(unsigned int fd)
{
- struct fd f;
void *data;
int rc;
char *p, *d;
@@ -295,8 +294,8 @@ static int read_trusted_verity_root_digests(unsigned int fd)
if (!list_empty(&dm_verity_loadpin_trusted_root_digests))
return -EPERM;
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EINVAL;
data = kzalloc(SZ_4K, GFP_KERNEL);
@@ -359,7 +358,6 @@ static int read_trusted_verity_root_digests(unsigned int fd)
}
kfree(data);
- fdput(f);
return 0;
@@ -379,8 +377,6 @@ static int read_trusted_verity_root_digests(unsigned int fd)
/* disallow further attempts after reading a corrupt/invalid file */
deny_reading_verity_digests = true;
- fdput(f);
-
return rc;
}
diff --git a/security/security.c b/security/security.c
index c5981e5..e2d47dd 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2726,16 +2726,15 @@ int security_inode_listsecurity(struct inode *inode,
EXPORT_SYMBOL(security_inode_listsecurity);
/**
- * security_inode_getsecid() - Get an inode's secid
+ * security_inode_getlsmprop() - Get an inode's LSM data
* @inode: inode
- * @secid: secid to return
+ * @prop: lsm specific information to return
*
- * Get the secid associated with the node. In case of failure, @secid will be
- * set to zero.
+ * Get the lsm specific information associated with the node.
*/
-void security_inode_getsecid(struct inode *inode, u32 *secid)
+void security_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
{
- call_void_hook(inode_getsecid, inode, secid);
+ call_void_hook(inode_getlsmprop, inode, prop);
}
/**
@@ -3276,6 +3275,21 @@ void security_cred_getsecid(const struct cred *c, u32 *secid)
EXPORT_SYMBOL(security_cred_getsecid);
/**
+ * security_cred_getlsmprop() - Get the LSM data from a set of credentials
+ * @c: credentials
+ * @prop: destination for the LSM data
+ *
+ * Retrieve the security data of the cred structure @c. In case of
+ * failure, @prop will be cleared.
+ */
+void security_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop)
+{
+ lsmprop_init(prop);
+ call_void_hook(cred_getlsmprop, c, prop);
+}
+EXPORT_SYMBOL(security_cred_getlsmprop);
+
+/**
* security_kernel_act_as() - Set the kernel credentials to act as secid
* @new: credentials
* @secid: secid
@@ -3494,33 +3508,33 @@ int security_task_getsid(struct task_struct *p)
}
/**
- * security_current_getsecid_subj() - Get the current task's subjective secid
- * @secid: secid value
+ * security_current_getlsmprop_subj() - Current task's subjective LSM data
+ * @prop: lsm specific information
*
* Retrieve the subjective security identifier of the current task and return
- * it in @secid. In case of failure, @secid will be set to zero.
+ * it in @prop.
*/
-void security_current_getsecid_subj(u32 *secid)
+void security_current_getlsmprop_subj(struct lsm_prop *prop)
{
- *secid = 0;
- call_void_hook(current_getsecid_subj, secid);
+ lsmprop_init(prop);
+ call_void_hook(current_getlsmprop_subj, prop);
}
-EXPORT_SYMBOL(security_current_getsecid_subj);
+EXPORT_SYMBOL(security_current_getlsmprop_subj);
/**
- * security_task_getsecid_obj() - Get a task's objective secid
+ * security_task_getlsmprop_obj() - Get a task's objective LSM data
* @p: target task
- * @secid: secid value
+ * @prop: lsm specific information
*
* Retrieve the objective security identifier of the task_struct in @p and
- * return it in @secid. In case of failure, @secid will be set to zero.
+ * return it in @prop.
*/
-void security_task_getsecid_obj(struct task_struct *p, u32 *secid)
+void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop)
{
- *secid = 0;
- call_void_hook(task_getsecid_obj, p, secid);
+ lsmprop_init(prop);
+ call_void_hook(task_getlsmprop_obj, p, prop);
}
-EXPORT_SYMBOL(security_task_getsecid_obj);
+EXPORT_SYMBOL(security_task_getlsmprop_obj);
/**
* security_task_setnice() - Check if setting a task's nice value is allowed
@@ -3732,17 +3746,17 @@ int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
}
/**
- * security_ipc_getsecid() - Get the sysv ipc object's secid
+ * security_ipc_getlsmprop() - Get the sysv ipc object LSM data
* @ipcp: ipc permission structure
- * @secid: secid pointer
+ * @prop: pointer to lsm information
*
- * Get the secid associated with the ipc object. In case of failure, @secid
- * will be set to zero.
+ * Get the lsm information associated with the ipc object.
*/
-void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
+
+void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop)
{
- *secid = 0;
- call_void_hook(ipc_getsecid, ipcp, secid);
+ lsmprop_init(prop);
+ call_void_hook(ipc_getlsmprop, ipcp, prop);
}
/**
@@ -4314,6 +4328,27 @@ int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
EXPORT_SYMBOL(security_secid_to_secctx);
/**
+ * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx
+ * @prop: lsm specific information
+ * @secdata: secctx
+ * @seclen: secctx length
+ *
+ * Convert a @prop entry to security context. If @secdata is NULL the
+ * length of the result will be returned in @seclen, but no @secdata
+ * will be returned. This does mean that the length could change between
+ * calls to check the length and the next call which actually allocates
+ * and returns the @secdata.
+ *
+ * Return: Return 0 on success, error on failure.
+ */
+int security_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
+ u32 *seclen)
+{
+ return call_int_hook(lsmprop_to_secctx, prop, secdata, seclen);
+}
+EXPORT_SYMBOL(security_lsmprop_to_secctx);
+
+/**
* security_secctx_to_secid() - Convert a secctx to a secid
* @secdata: secctx
* @seclen: length of secctx
@@ -5572,7 +5607,7 @@ void security_audit_rule_free(void *lsmrule)
/**
* security_audit_rule_match() - Check if a label matches an audit rule
- * @secid: security label
+ * @prop: security label
* @field: LSM audit field
* @op: matching operator
* @lsmrule: audit rule
@@ -5583,9 +5618,10 @@ void security_audit_rule_free(void *lsmrule)
* Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on
* failure.
*/
-int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule)
+int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
+ void *lsmrule)
{
- return call_int_hook(audit_rule_match, secid, field, op, lsmrule);
+ return call_int_hook(audit_rule_match, prop, field, op, lsmrule);
}
#endif /* CONFIG_AUDIT */
diff --git a/security/selinux/.gitignore b/security/selinux/.gitignore
index 168fae1..01c0df8 100644
--- a/security/selinux/.gitignore
+++ b/security/selinux/.gitignore
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
av_permissions.h
flask.h
+/genheaders
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index c47519e..86f0575 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -36,7 +36,10 @@
# see the note above, replace the $targets and 'flask.h' rule with the lines
# below:
# targets += $(genhdrs)
-# $(addprefix $(obj)/,$(genhdrs)) &: scripts/selinux/...
+# $(addprefix $(obj)/,$(genhdrs)) &: $(obj)/genheaders FORCE
targets += flask.h
-$(obj)/flask.h: scripts/selinux/genheaders/genheaders FORCE
+$(obj)/flask.h: $(obj)/genheaders FORCE
$(call if_changed,genhdrs)
+
+hostprogs := genheaders
+HOST_EXTRACFLAGS += -I$(srctree)/security/selinux/include
diff --git a/scripts/selinux/genheaders/genheaders.c b/security/selinux/genheaders.c
similarity index 97%
rename from scripts/selinux/genheaders/genheaders.c
rename to security/selinux/genheaders.c
index 1552080..3834d7e 100644
--- a/scripts/selinux/genheaders/genheaders.c
+++ b/security/selinux/genheaders.c
@@ -1,8 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/* NOTE: we really do want to use the kernel headers here */
-#define __EXPORTED_HEADERS__
-
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index fc926d3..f5a08f9 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3503,15 +3503,16 @@ static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t
return len;
}
-static void selinux_inode_getsecid(struct inode *inode, u32 *secid)
+static void selinux_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
{
struct inode_security_struct *isec = inode_security_novalidate(inode);
- *secid = isec->sid;
+
+ prop->selinux.secid = isec->sid;
}
static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
{
- u32 sid;
+ struct lsm_prop prop;
struct task_security_struct *tsec;
struct cred *new_creds = *new;
@@ -3523,8 +3524,8 @@ static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
tsec = selinux_cred(new_creds);
/* Get label from overlay inode and set it in create_sid */
- selinux_inode_getsecid(d_inode(src), &sid);
- tsec->create_sid = sid;
+ selinux_inode_getlsmprop(d_inode(src), &prop);
+ tsec->create_sid = prop.selinux.secid;
*new = new_creds;
return 0;
}
@@ -4034,6 +4035,11 @@ static void selinux_cred_getsecid(const struct cred *c, u32 *secid)
*secid = cred_sid(c);
}
+static void selinux_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop)
+{
+ prop->selinux.secid = cred_sid(c);
+}
+
/*
* set the security data for a kernel service
* - all the creation contexts are set to unlabelled
@@ -4169,14 +4175,15 @@ static int selinux_task_getsid(struct task_struct *p)
PROCESS__GETSESSION, NULL);
}
-static void selinux_current_getsecid_subj(u32 *secid)
+static void selinux_current_getlsmprop_subj(struct lsm_prop *prop)
{
- *secid = current_sid();
+ prop->selinux.secid = current_sid();
}
-static void selinux_task_getsecid_obj(struct task_struct *p, u32 *secid)
+static void selinux_task_getlsmprop_obj(struct task_struct *p,
+ struct lsm_prop *prop)
{
- *secid = task_sid_obj(p);
+ prop->selinux.secid = task_sid_obj(p);
}
static int selinux_task_setnice(struct task_struct *p, int nice)
@@ -4590,14 +4597,10 @@ static int socket_sockcreate_sid(const struct task_security_struct *tsec,
secclass, NULL, socksid);
}
-static int sock_has_perm(struct sock *sk, u32 perms)
+static bool sock_skip_has_perm(u32 sid)
{
- struct sk_security_struct *sksec = selinux_sock(sk);
- struct common_audit_data ad;
- struct lsm_network_audit net;
-
- if (sksec->sid == SECINITSID_KERNEL)
- return 0;
+ if (sid == SECINITSID_KERNEL)
+ return true;
/*
* Before POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT, sockets that
@@ -4611,7 +4614,19 @@ static int sock_has_perm(struct sock *sk, u32 perms)
* setting.
*/
if (!selinux_policycap_userspace_initial_context() &&
- sksec->sid == SECINITSID_INIT)
+ sid == SECINITSID_INIT)
+ return true;
+ return false;
+}
+
+
+static int sock_has_perm(struct sock *sk, u32 perms)
+{
+ struct sk_security_struct *sksec = sk->sk_security;
+ struct common_audit_data ad;
+ struct lsm_network_audit net;
+
+ if (sock_skip_has_perm(sksec->sid))
return 0;
ad_net_init_from_sk(&ad, &net, sk);
@@ -5920,6 +5935,26 @@ static unsigned int selinux_ip_postroute(void *priv,
}
#endif /* CONFIG_NETFILTER */
+static int nlmsg_sock_has_extended_perms(struct sock *sk, u32 perms, u16 nlmsg_type)
+{
+ struct sk_security_struct *sksec = sk->sk_security;
+ struct common_audit_data ad;
+ struct lsm_network_audit net;
+ u8 driver;
+ u8 xperm;
+
+ if (sock_skip_has_perm(sksec->sid))
+ return 0;
+
+ ad_net_init_from_sk(&ad, &net, sk);
+
+ driver = nlmsg_type >> 8;
+ xperm = nlmsg_type & 0xff;
+
+ return avc_has_extended_perms(current_sid(), sksec->sid, sksec->sclass,
+ perms, driver, xperm, &ad);
+}
+
static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
{
int rc = 0;
@@ -5945,7 +5980,12 @@ static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
rc = selinux_nlmsg_lookup(sclass, nlh->nlmsg_type, &perm);
if (rc == 0) {
- rc = sock_has_perm(sk, perm);
+ if (selinux_policycap_netlink_xperm()) {
+ rc = nlmsg_sock_has_extended_perms(
+ sk, perm, nlh->nlmsg_type);
+ } else {
+ rc = sock_has_perm(sk, perm);
+ }
if (rc)
return rc;
} else if (rc == -EINVAL) {
@@ -6319,10 +6359,11 @@ static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
return ipc_has_perm(ipcp, av);
}
-static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid)
+static void selinux_ipc_getlsmprop(struct kern_ipc_perm *ipcp,
+ struct lsm_prop *prop)
{
struct ipc_security_struct *isec = selinux_ipc(ipcp);
- *secid = isec->sid;
+ prop->selinux.secid = isec->sid;
}
static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode)
@@ -6601,8 +6642,13 @@ static int selinux_ismaclabel(const char *name)
static int selinux_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
{
- return security_sid_to_context(secid,
- secdata, seclen);
+ return security_sid_to_context(secid, secdata, seclen);
+}
+
+static int selinux_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
+ u32 *seclen)
+{
+ return selinux_secid_to_secctx(prop->selinux.secid, secdata, seclen);
}
static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
@@ -7155,7 +7201,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
LSM_HOOK_INIT(inode_getsecurity, selinux_inode_getsecurity),
LSM_HOOK_INIT(inode_setsecurity, selinux_inode_setsecurity),
LSM_HOOK_INIT(inode_listsecurity, selinux_inode_listsecurity),
- LSM_HOOK_INIT(inode_getsecid, selinux_inode_getsecid),
+ LSM_HOOK_INIT(inode_getlsmprop, selinux_inode_getlsmprop),
LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up),
LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr),
LSM_HOOK_INIT(path_notify, selinux_path_notify),
@@ -7181,6 +7227,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare),
LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer),
LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid),
+ LSM_HOOK_INIT(cred_getlsmprop, selinux_cred_getlsmprop),
LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as),
LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as),
LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request),
@@ -7189,8 +7236,8 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid),
LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid),
LSM_HOOK_INIT(task_getsid, selinux_task_getsid),
- LSM_HOOK_INIT(current_getsecid_subj, selinux_current_getsecid_subj),
- LSM_HOOK_INIT(task_getsecid_obj, selinux_task_getsecid_obj),
+ LSM_HOOK_INIT(current_getlsmprop_subj, selinux_current_getlsmprop_subj),
+ LSM_HOOK_INIT(task_getlsmprop_obj, selinux_task_getlsmprop_obj),
LSM_HOOK_INIT(task_setnice, selinux_task_setnice),
LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio),
LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio),
@@ -7204,7 +7251,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
LSM_HOOK_INIT(userns_create, selinux_userns_create),
LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission),
- LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid),
+ LSM_HOOK_INIT(ipc_getlsmprop, selinux_ipc_getlsmprop),
LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate),
LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl),
@@ -7347,6 +7394,7 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = {
LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security),
LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security),
LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx),
+ LSM_HOOK_INIT(lsmprop_to_secctx, selinux_lsmprop_to_secctx),
LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx),
LSM_HOOK_INIT(sk_alloc_security, selinux_sk_alloc_security),
LSM_HOOK_INIT(tun_dev_alloc_security, selinux_tun_dev_alloc_security),
diff --git a/security/selinux/include/audit.h b/security/selinux/include/audit.h
index 168d17b..d5b0425 100644
--- a/security/selinux/include/audit.h
+++ b/security/selinux/include/audit.h
@@ -41,7 +41,7 @@ void selinux_audit_rule_free(void *rule);
/**
* selinux_audit_rule_match - determine if a context ID matches a rule.
- * @sid: the context ID to check
+ * @prop: includes the context ID to check
* @field: the field this rule refers to
* @op: the operator the rule uses
* @rule: pointer to the audit rule to check against
@@ -49,7 +49,8 @@ void selinux_audit_rule_free(void *rule);
* Returns 1 if the context id matches the rule, 0 if it does not, and
* -errno on failure.
*/
-int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *rule);
+int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
+ void *rule);
/**
* selinux_audit_rule_known - check to see if rule contains selinux fields.
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 7229c9b..2bc2013 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -1,8 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/capability.h>
-#include <linux/socket.h>
-
#define COMMON_FILE_SOCK_PERMS \
"ioctl", "read", "write", "create", "getattr", "setattr", "lock", \
"relabelfrom", "relabelto", "append", "map"
@@ -36,9 +33,13 @@
"mac_override", "mac_admin", "syslog", "wake_alarm", "block_suspend", \
"audit_read", "perfmon", "bpf", "checkpoint_restore"
+#ifdef __KERNEL__ /* avoid this check when building host programs */
+#include <linux/capability.h>
+
#if CAP_LAST_CAP > CAP_CHECKPOINT_RESTORE
#error New capability defined, please update COMMON_CAP2_PERMS.
#endif
+#endif
/*
* Note: The name for any socket class should be suffixed by "socket",
@@ -96,17 +97,17 @@ const struct security_class_mapping secclass_map[] = {
{ "shm", { COMMON_IPC_PERMS, "lock", NULL } },
{ "ipc", { COMMON_IPC_PERMS, NULL } },
{ "netlink_route_socket",
- { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", NULL } },
+ { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", "nlmsg", NULL } },
{ "netlink_tcpdiag_socket",
- { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", NULL } },
+ { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", "nlmsg", NULL } },
{ "netlink_nflog_socket", { COMMON_SOCK_PERMS, NULL } },
{ "netlink_xfrm_socket",
- { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", NULL } },
+ { COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", "nlmsg", NULL } },
{ "netlink_selinux_socket", { COMMON_SOCK_PERMS, NULL } },
{ "netlink_iscsi_socket", { COMMON_SOCK_PERMS, NULL } },
{ "netlink_audit_socket",
{ COMMON_SOCK_PERMS, "nlmsg_read", "nlmsg_write", "nlmsg_relay",
- "nlmsg_readpriv", "nlmsg_tty_audit", NULL } },
+ "nlmsg_readpriv", "nlmsg_tty_audit", "nlmsg", NULL } },
{ "netlink_fib_lookup_socket", { COMMON_SOCK_PERMS, NULL } },
{ "netlink_connector_socket", { COMMON_SOCK_PERMS, NULL } },
{ "netlink_netfilter_socket", { COMMON_SOCK_PERMS, NULL } },
@@ -181,6 +182,10 @@ const struct security_class_mapping secclass_map[] = {
{ NULL }
};
+#ifdef __KERNEL__ /* avoid this check when building host programs */
+#include <linux/socket.h>
+
#if PF_MAX > 46
#error New address family defined, please update secclass_map.
#endif
+#endif
diff --git a/security/selinux/include/initial_sid_to_string.h b/security/selinux/include/initial_sid_to_string.h
index 99b353b..d7ba60b 100644
--- a/security/selinux/include/initial_sid_to_string.h
+++ b/security/selinux/include/initial_sid_to_string.h
@@ -1,6 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifdef __KERNEL__
#include <linux/stddef.h>
+#else
+#include <stddef.h>
+#endif
static const char *const initial_sid_to_string[] = {
NULL, /* zero placeholder, not used */
diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h
index dc3674e..079679f 100644
--- a/security/selinux/include/policycap.h
+++ b/security/selinux/include/policycap.h
@@ -14,6 +14,7 @@ enum {
POLICYDB_CAP_GENFS_SECLABEL_SYMLINKS,
POLICYDB_CAP_IOCTL_SKIP_CLOEXEC,
POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT,
+ POLICYDB_CAP_NETLINK_XPERM,
__POLICYDB_CAP_MAX
};
#define POLICYDB_CAP_MAX (__POLICYDB_CAP_MAX - 1)
diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h
index 2cffcc1..e080827 100644
--- a/security/selinux/include/policycap_names.h
+++ b/security/selinux/include/policycap_names.h
@@ -17,6 +17,7 @@ const char *const selinux_policycap_names[__POLICYDB_CAP_MAX] = {
"genfs_seclabel_symlinks",
"ioctl_skip_cloexec",
"userspace_initial_context",
+ "netlink_xperm",
};
/* clang-format on */
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 289bf92..c7f2731 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -195,6 +195,12 @@ static inline bool selinux_policycap_userspace_initial_context(void)
selinux_state.policycap[POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT]);
}
+static inline bool selinux_policycap_netlink_xperm(void)
+{
+ return READ_ONCE(
+ selinux_state.policycap[POLICYDB_CAP_NETLINK_XPERM]);
+}
+
struct selinux_policy_convert_data;
struct selinux_load_state {
diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index 8ff670c..3a95986 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -21,142 +21,142 @@
#include "security.h"
struct nlmsg_perm {
- u16 nlmsg_type;
- u32 perm;
+ u16 nlmsg_type;
+ u32 perm;
};
static const struct nlmsg_perm nlmsg_route_perms[] = {
- { RTM_NEWLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETLINK, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_SETLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_NEWADDR, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELADDR, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETADDR, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWROUTE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELROUTE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETROUTE, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWRULE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELRULE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETRULE, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWQDISC, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELQDISC, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETQDISC, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWACTION, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELACTION, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETACTION, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWPREFIX, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETMULTICAST, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_GETANYCAST, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_GETNEIGHTBL, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_SETNEIGHTBL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_NEWADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_GETDCB, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_SETDCB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_NEWNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETMDB, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWNSID, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELNSID, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_GETNSID, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_GETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_SETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_NEWCACHEREPORT, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_NEWVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETVLAN, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ },
- { RTM_NEWTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_DELTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
- { RTM_GETTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETLINK, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_SETLINK, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_NEWADDR, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELADDR, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETADDR, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWROUTE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELROUTE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETROUTE, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETNEIGH, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWRULE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELRULE, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETRULE, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWQDISC, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELQDISC, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETQDISC, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETTCLASS, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETTFILTER, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWACTION, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELACTION, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETACTION, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWPREFIX, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETMULTICAST, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_GETANYCAST, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_GETNEIGHTBL, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_SETNEIGHTBL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_NEWADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETADDRLABEL, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_GETDCB, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_SETDCB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_NEWNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETMDB, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWNSID, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELNSID, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_GETNSID, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_GETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_SETSTATS, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_NEWCACHEREPORT, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETCHAIN, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETNEXTHOP, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELLINKPROP, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_NEWVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETVLAN, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ },
+ { RTM_NEWTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_DELTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
+ { RTM_GETTUNNEL, NETLINK_ROUTE_SOCKET__NLMSG_READ },
};
static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = {
- { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
- { DCCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
- { SOCK_DIAG_BY_FAMILY, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
- { SOCK_DESTROY, NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE },
+ { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
+ { DCCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
+ { SOCK_DIAG_BY_FAMILY, NETLINK_TCPDIAG_SOCKET__NLMSG_READ },
+ { SOCK_DESTROY, NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE },
};
static const struct nlmsg_perm nlmsg_xfrm_perms[] = {
- { XFRM_MSG_NEWSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_DELSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_GETSA, NETLINK_XFRM_SOCKET__NLMSG_READ },
- { XFRM_MSG_NEWPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_DELPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_GETPOLICY, NETLINK_XFRM_SOCKET__NLMSG_READ },
- { XFRM_MSG_ALLOCSPI, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_ACQUIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_EXPIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_UPDPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_UPDSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_POLEXPIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_FLUSHSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_FLUSHPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_NEWAE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_GETAE, NETLINK_XFRM_SOCKET__NLMSG_READ },
- { XFRM_MSG_REPORT, NETLINK_XFRM_SOCKET__NLMSG_READ },
- { XFRM_MSG_MIGRATE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_NEWSADINFO, NETLINK_XFRM_SOCKET__NLMSG_READ },
- { XFRM_MSG_GETSADINFO, NETLINK_XFRM_SOCKET__NLMSG_READ },
- { XFRM_MSG_NEWSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_GETSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_READ },
- { XFRM_MSG_MAPPING, NETLINK_XFRM_SOCKET__NLMSG_READ },
- { XFRM_MSG_SETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
- { XFRM_MSG_GETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_READ },
+ { XFRM_MSG_NEWSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_DELSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_GETSA, NETLINK_XFRM_SOCKET__NLMSG_READ },
+ { XFRM_MSG_NEWPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_DELPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_GETPOLICY, NETLINK_XFRM_SOCKET__NLMSG_READ },
+ { XFRM_MSG_ALLOCSPI, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_ACQUIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_EXPIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_UPDPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_UPDSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_POLEXPIRE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_FLUSHSA, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_FLUSHPOLICY, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_NEWAE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_GETAE, NETLINK_XFRM_SOCKET__NLMSG_READ },
+ { XFRM_MSG_REPORT, NETLINK_XFRM_SOCKET__NLMSG_READ },
+ { XFRM_MSG_MIGRATE, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_NEWSADINFO, NETLINK_XFRM_SOCKET__NLMSG_READ },
+ { XFRM_MSG_GETSADINFO, NETLINK_XFRM_SOCKET__NLMSG_READ },
+ { XFRM_MSG_NEWSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_GETSPDINFO, NETLINK_XFRM_SOCKET__NLMSG_READ },
+ { XFRM_MSG_MAPPING, NETLINK_XFRM_SOCKET__NLMSG_READ },
+ { XFRM_MSG_SETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_WRITE },
+ { XFRM_MSG_GETDEFAULT, NETLINK_XFRM_SOCKET__NLMSG_READ },
};
static const struct nlmsg_perm nlmsg_audit_perms[] = {
- { AUDIT_GET, NETLINK_AUDIT_SOCKET__NLMSG_READ },
- { AUDIT_SET, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
- { AUDIT_LIST, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
- { AUDIT_ADD, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
- { AUDIT_DEL, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
- { AUDIT_LIST_RULES, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
- { AUDIT_ADD_RULE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
- { AUDIT_DEL_RULE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
- { AUDIT_USER, NETLINK_AUDIT_SOCKET__NLMSG_RELAY },
- { AUDIT_SIGNAL_INFO, NETLINK_AUDIT_SOCKET__NLMSG_READ },
- { AUDIT_TRIM, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
- { AUDIT_MAKE_EQUIV, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
- { AUDIT_TTY_GET, NETLINK_AUDIT_SOCKET__NLMSG_READ },
- { AUDIT_TTY_SET, NETLINK_AUDIT_SOCKET__NLMSG_TTY_AUDIT },
- { AUDIT_GET_FEATURE, NETLINK_AUDIT_SOCKET__NLMSG_READ },
- { AUDIT_SET_FEATURE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
+ { AUDIT_GET, NETLINK_AUDIT_SOCKET__NLMSG_READ },
+ { AUDIT_SET, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
+ { AUDIT_LIST, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
+ { AUDIT_ADD, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
+ { AUDIT_DEL, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
+ { AUDIT_LIST_RULES, NETLINK_AUDIT_SOCKET__NLMSG_READPRIV },
+ { AUDIT_ADD_RULE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
+ { AUDIT_DEL_RULE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
+ { AUDIT_USER, NETLINK_AUDIT_SOCKET__NLMSG_RELAY },
+ { AUDIT_SIGNAL_INFO, NETLINK_AUDIT_SOCKET__NLMSG_READ },
+ { AUDIT_TRIM, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
+ { AUDIT_MAKE_EQUIV, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
+ { AUDIT_TTY_GET, NETLINK_AUDIT_SOCKET__NLMSG_READ },
+ { AUDIT_TTY_SET, NETLINK_AUDIT_SOCKET__NLMSG_TTY_AUDIT },
+ { AUDIT_GET_FEATURE, NETLINK_AUDIT_SOCKET__NLMSG_READ },
+ { AUDIT_SET_FEATURE, NETLINK_AUDIT_SOCKET__NLMSG_WRITE },
};
-
-static int nlmsg_perm(u16 nlmsg_type, u32 *perm, const struct nlmsg_perm *tab, size_t tabsize)
+static int nlmsg_perm(u16 nlmsg_type, u32 *perm, const struct nlmsg_perm *tab,
+ size_t tabsize)
{
unsigned int i;
int err = -EINVAL;
- for (i = 0; i < tabsize/sizeof(struct nlmsg_perm); i++)
+ for (i = 0; i < tabsize / sizeof(struct nlmsg_perm); i++)
if (nlmsg_type == tab[i].nlmsg_type) {
*perm = tab[i].perm;
err = 0;
@@ -168,7 +168,12 @@ static int nlmsg_perm(u16 nlmsg_type, u32 *perm, const struct nlmsg_perm *tab, s
int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
{
- int err = 0;
+ /* While it is possible to add a similar permission to other netlink
+ * classes, note that the extended permission value is matched against
+ * the nlmsg_type field. Notably, SECCLASS_NETLINK_GENERIC_SOCKET uses
+ * dynamic values for this field, which means that it cannot be added
+ * as-is.
+ */
switch (sclass) {
case SECCLASS_NETLINK_ROUTE_SOCKET:
@@ -178,42 +183,52 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
* before updating the BUILD_BUG_ON() macro!
*/
BUILD_BUG_ON(RTM_MAX != (RTM_NEWTUNNEL + 3));
- err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
- sizeof(nlmsg_route_perms));
- break;
+ if (selinux_policycap_netlink_xperm()) {
+ *perm = NETLINK_ROUTE_SOCKET__NLMSG;
+ return 0;
+ }
+ return nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
+ sizeof(nlmsg_route_perms));
+ break;
case SECCLASS_NETLINK_TCPDIAG_SOCKET:
- err = nlmsg_perm(nlmsg_type, perm, nlmsg_tcpdiag_perms,
- sizeof(nlmsg_tcpdiag_perms));
+ if (selinux_policycap_netlink_xperm()) {
+ *perm = NETLINK_TCPDIAG_SOCKET__NLMSG;
+ return 0;
+ }
+ return nlmsg_perm(nlmsg_type, perm, nlmsg_tcpdiag_perms,
+ sizeof(nlmsg_tcpdiag_perms));
break;
-
case SECCLASS_NETLINK_XFRM_SOCKET:
/* If the BUILD_BUG_ON() below fails you must update the
* structures at the top of this file with the new mappings
* before updating the BUILD_BUG_ON() macro!
*/
BUILD_BUG_ON(XFRM_MSG_MAX != XFRM_MSG_GETDEFAULT);
- err = nlmsg_perm(nlmsg_type, perm, nlmsg_xfrm_perms,
- sizeof(nlmsg_xfrm_perms));
- break;
- case SECCLASS_NETLINK_AUDIT_SOCKET:
- if ((nlmsg_type >= AUDIT_FIRST_USER_MSG &&
- nlmsg_type <= AUDIT_LAST_USER_MSG) ||
- (nlmsg_type >= AUDIT_FIRST_USER_MSG2 &&
- nlmsg_type <= AUDIT_LAST_USER_MSG2)) {
- *perm = NETLINK_AUDIT_SOCKET__NLMSG_RELAY;
- } else {
- err = nlmsg_perm(nlmsg_type, perm, nlmsg_audit_perms,
- sizeof(nlmsg_audit_perms));
+ if (selinux_policycap_netlink_xperm()) {
+ *perm = NETLINK_XFRM_SOCKET__NLMSG;
+ return 0;
}
+ return nlmsg_perm(nlmsg_type, perm, nlmsg_xfrm_perms,
+ sizeof(nlmsg_xfrm_perms));
break;
-
- /* No messaging from userspace, or class unknown/unhandled */
- default:
- err = -ENOENT;
+ case SECCLASS_NETLINK_AUDIT_SOCKET:
+ if (selinux_policycap_netlink_xperm()) {
+ *perm = NETLINK_AUDIT_SOCKET__NLMSG;
+ return 0;
+ } else if ((nlmsg_type >= AUDIT_FIRST_USER_MSG &&
+ nlmsg_type <= AUDIT_LAST_USER_MSG) ||
+ (nlmsg_type >= AUDIT_FIRST_USER_MSG2 &&
+ nlmsg_type <= AUDIT_LAST_USER_MSG2)) {
+ *perm = NETLINK_AUDIT_SOCKET__NLMSG_RELAY;
+ return 0;
+ }
+ return nlmsg_perm(nlmsg_type, perm, nlmsg_audit_perms,
+ sizeof(nlmsg_audit_perms));
break;
}
- return err;
+ /* No messaging from userspace, or class unknown/unhandled */
+ return -ENOENT;
}
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index e172f18..234f478 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1069,6 +1069,10 @@ static ssize_t sel_write_user(struct file *file, char *buf, size_t size)
int rc;
u32 i, len, nsids;
+ pr_warn_ratelimited("SELinux: %s (%d) wrote to /sys/fs/selinux/user!"
+ " This will not be supported in the future; please update your"
+ " userspace.\n", current->comm, current->pid);
+
length = avc_has_perm(current_sid(), SECINITSID_SECURITY,
SECCLASS_SECURITY, SECURITY__COMPUTE_USER,
NULL);
diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h
index 8e88204..f440718 100644
--- a/security/selinux/ss/avtab.h
+++ b/security/selinux/ss/avtab.h
@@ -53,8 +53,9 @@ struct avtab_key {
*/
struct avtab_extended_perms {
/* These are not flags. All 256 values may be used */
-#define AVTAB_XPERMS_IOCTLFUNCTION 0x01
-#define AVTAB_XPERMS_IOCTLDRIVER 0x02
+#define AVTAB_XPERMS_IOCTLFUNCTION 0x01
+#define AVTAB_XPERMS_IOCTLDRIVER 0x02
+#define AVTAB_XPERMS_NLMSG 0x03
/* extension of the avtab_key specified */
u8 specified; /* ioctl, netfilter, ... */
/*
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index a9830fb..971c45d 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -582,8 +582,7 @@ static void type_attribute_bounds_av(struct policydb *policydb,
}
/*
- * flag which drivers have permissions
- * only looking for ioctl based extended permissions
+ * Flag which drivers have permissions.
*/
void services_compute_xperms_drivers(
struct extended_perms *xperms,
@@ -591,14 +590,18 @@ void services_compute_xperms_drivers(
{
unsigned int i;
- if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+ switch (node->datum.u.xperms->specified) {
+ case AVTAB_XPERMS_IOCTLDRIVER:
/* if one or more driver has all permissions allowed */
for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++)
xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i];
- } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+ break;
+ case AVTAB_XPERMS_IOCTLFUNCTION:
+ case AVTAB_XPERMS_NLMSG:
/* if allowing permissions within a driver */
security_xperm_set(xperms->drivers.p,
node->datum.u.xperms->driver);
+ break;
}
xperms->len = 1;
@@ -942,55 +945,58 @@ static void avd_init(struct selinux_policy *policy, struct av_decision *avd)
avd->flags = 0;
}
-void services_compute_xperms_decision(struct extended_perms_decision *xpermd,
- struct avtab_node *node)
+static void update_xperms_extended_data(u8 specified,
+ struct extended_perms_data *from,
+ struct extended_perms_data *xp_data)
{
unsigned int i;
- if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
+ switch (specified) {
+ case AVTAB_XPERMS_IOCTLDRIVER:
+ memset(xp_data->p, 0xff, sizeof(xp_data->p));
+ break;
+ case AVTAB_XPERMS_IOCTLFUNCTION:
+ case AVTAB_XPERMS_NLMSG:
+ for (i = 0; i < ARRAY_SIZE(xp_data->p); i++)
+ xp_data->p[i] |= from->p[i];
+ break;
+ }
+
+}
+
+void services_compute_xperms_decision(struct extended_perms_decision *xpermd,
+ struct avtab_node *node)
+{
+ switch (node->datum.u.xperms->specified) {
+ case AVTAB_XPERMS_IOCTLFUNCTION:
+ case AVTAB_XPERMS_NLMSG:
if (xpermd->driver != node->datum.u.xperms->driver)
return;
- } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
+ break;
+ case AVTAB_XPERMS_IOCTLDRIVER:
if (!security_xperm_test(node->datum.u.xperms->perms.p,
xpermd->driver))
return;
- } else {
+ break;
+ default:
BUG();
}
if (node->key.specified == AVTAB_XPERMS_ALLOWED) {
xpermd->used |= XPERMS_ALLOWED;
- if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
- memset(xpermd->allowed->p, 0xff,
- sizeof(xpermd->allowed->p));
- }
- if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
- for (i = 0; i < ARRAY_SIZE(xpermd->allowed->p); i++)
- xpermd->allowed->p[i] |=
- node->datum.u.xperms->perms.p[i];
- }
+ update_xperms_extended_data(node->datum.u.xperms->specified,
+ &node->datum.u.xperms->perms,
+ xpermd->allowed);
} else if (node->key.specified == AVTAB_XPERMS_AUDITALLOW) {
xpermd->used |= XPERMS_AUDITALLOW;
- if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
- memset(xpermd->auditallow->p, 0xff,
- sizeof(xpermd->auditallow->p));
- }
- if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
- for (i = 0; i < ARRAY_SIZE(xpermd->auditallow->p); i++)
- xpermd->auditallow->p[i] |=
- node->datum.u.xperms->perms.p[i];
- }
+ update_xperms_extended_data(node->datum.u.xperms->specified,
+ &node->datum.u.xperms->perms,
+ xpermd->auditallow);
} else if (node->key.specified == AVTAB_XPERMS_DONTAUDIT) {
xpermd->used |= XPERMS_DONTAUDIT;
- if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) {
- memset(xpermd->dontaudit->p, 0xff,
- sizeof(xpermd->dontaudit->p));
- }
- if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) {
- for (i = 0; i < ARRAY_SIZE(xpermd->dontaudit->p); i++)
- xpermd->dontaudit->p[i] |=
- node->datum.u.xperms->perms.p[i];
- }
+ update_xperms_extended_data(node->datum.u.xperms->specified,
+ &node->datum.u.xperms->perms,
+ xpermd->dontaudit);
} else {
BUG();
}
@@ -3635,7 +3641,7 @@ int selinux_audit_rule_known(struct audit_krule *rule)
return 0;
}
-int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
+int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule)
{
struct selinux_state *state = &selinux_state;
struct selinux_policy *policy;
@@ -3661,10 +3667,10 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
goto out;
}
- ctxt = sidtab_search(policy->sidtab, sid);
+ ctxt = sidtab_search(policy->sidtab, prop->selinux.secid);
if (unlikely(!ctxt)) {
WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %d\n",
- sid);
+ prop->selinux.secid);
match = -ENOENT;
goto out;
}
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 370fd59..0c47628 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1649,15 +1649,13 @@ static int smack_inode_listsecurity(struct inode *inode, char *buffer,
}
/**
- * smack_inode_getsecid - Extract inode's security id
+ * smack_inode_getlsmprop - Extract inode's security id
* @inode: inode to extract the info from
- * @secid: where result will be saved
+ * @prop: where result will be saved
*/
-static void smack_inode_getsecid(struct inode *inode, u32 *secid)
+static void smack_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
{
- struct smack_known *skp = smk_of_inode(inode);
-
- *secid = skp->smk_secid;
+ prop->smack.skp = smk_of_inode(inode);
}
/*
@@ -2149,6 +2147,21 @@ static void smack_cred_getsecid(const struct cred *cred, u32 *secid)
}
/**
+ * smack_cred_getlsmprop - get the Smack label for a creds structure
+ * @cred: the object creds
+ * @prop: where to put the data
+ *
+ * Sets the Smack part of the ref
+ */
+static void smack_cred_getlsmprop(const struct cred *cred,
+ struct lsm_prop *prop)
+{
+ rcu_read_lock();
+ prop->smack.skp = smk_of_task(smack_cred(cred));
+ rcu_read_unlock();
+}
+
+/**
* smack_kernel_act_as - Set the subjective context in a set of credentials
* @new: points to the set of credentials to be modified.
* @secid: specifies the security ID to be set
@@ -2239,30 +2252,27 @@ static int smack_task_getsid(struct task_struct *p)
}
/**
- * smack_current_getsecid_subj - get the subjective secid of the current task
- * @secid: where to put the result
+ * smack_current_getlsmprop_subj - get the subjective secid of the current task
+ * @prop: where to put the result
*
* Sets the secid to contain a u32 version of the task's subjective smack label.
*/
-static void smack_current_getsecid_subj(u32 *secid)
+static void smack_current_getlsmprop_subj(struct lsm_prop *prop)
{
- struct smack_known *skp = smk_of_current();
-
- *secid = skp->smk_secid;
+ prop->smack.skp = smk_of_current();
}
/**
- * smack_task_getsecid_obj - get the objective secid of the task
+ * smack_task_getlsmprop_obj - get the objective data of the task
* @p: the task
- * @secid: where to put the result
+ * @prop: where to put the result
*
* Sets the secid to contain a u32 version of the task's objective smack label.
*/
-static void smack_task_getsecid_obj(struct task_struct *p, u32 *secid)
+static void smack_task_getlsmprop_obj(struct task_struct *p,
+ struct lsm_prop *prop)
{
- struct smack_known *skp = smk_of_task_struct_obj(p);
-
- *secid = skp->smk_secid;
+ prop->smack.skp = smk_of_task_struct_obj(p);
}
/**
@@ -3435,16 +3445,15 @@ static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
}
/**
- * smack_ipc_getsecid - Extract smack security id
+ * smack_ipc_getlsmprop - Extract smack security data
* @ipp: the object permissions
- * @secid: where result will be saved
+ * @prop: where result will be saved
*/
-static void smack_ipc_getsecid(struct kern_ipc_perm *ipp, u32 *secid)
+static void smack_ipc_getlsmprop(struct kern_ipc_perm *ipp, struct lsm_prop *prop)
{
- struct smack_known **blob = smack_ipc(ipp);
- struct smack_known *iskp = *blob;
+ struct smack_known **iskpp = smack_ipc(ipp);
- *secid = iskp->smk_secid;
+ prop->smack.skp = *iskpp;
}
/**
@@ -4757,7 +4766,7 @@ static int smack_audit_rule_known(struct audit_krule *krule)
/**
* smack_audit_rule_match - Audit given object ?
- * @secid: security id for identifying the object to test
+ * @prop: security id for identifying the object to test
* @field: audit rule flags given from user-space
* @op: required testing operator
* @vrule: smack internal rule presentation
@@ -4765,9 +4774,10 @@ static int smack_audit_rule_known(struct audit_krule *krule)
* The core Audit hook. It's used to take the decision of
* whether to audit or not to audit a given object.
*/
-static int smack_audit_rule_match(u32 secid, u32 field, u32 op, void *vrule)
+static int smack_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
+ void *vrule)
{
- struct smack_known *skp;
+ struct smack_known *skp = prop->smack.skp;
char *rule = vrule;
if (unlikely(!rule)) {
@@ -4778,8 +4788,6 @@ static int smack_audit_rule_match(u32 secid, u32 field, u32 op, void *vrule)
if (field != AUDIT_SUBJ_USER && field != AUDIT_OBJ_USER)
return 0;
- skp = smack_from_secid(secid);
-
/*
* No need to do string comparisons. If a match occurs,
* both pointers will point to the same smack_known
@@ -4809,7 +4817,6 @@ static int smack_ismaclabel(const char *name)
return (strcmp(name, XATTR_SMACK_SUFFIX) == 0);
}
-
/**
* smack_secid_to_secctx - return the smack label for a secid
* @secid: incoming integer
@@ -4829,6 +4836,25 @@ static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
}
/**
+ * smack_lsmprop_to_secctx - return the smack label
+ * @prop: includes incoming Smack data
+ * @secdata: destination
+ * @seclen: how long it is
+ *
+ * Exists for audit code.
+ */
+static int smack_lsmprop_to_secctx(struct lsm_prop *prop, char **secdata,
+ u32 *seclen)
+{
+ struct smack_known *skp = prop->smack.skp;
+
+ if (secdata)
+ *secdata = skp->smk_known;
+ *seclen = strlen(skp->smk_known);
+ return 0;
+}
+
+/**
* smack_secctx_to_secid - return the secid for a smack label
* @secdata: smack label
* @seclen: how long result is
@@ -5078,7 +5104,7 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
LSM_HOOK_INIT(inode_getsecurity, smack_inode_getsecurity),
LSM_HOOK_INIT(inode_setsecurity, smack_inode_setsecurity),
LSM_HOOK_INIT(inode_listsecurity, smack_inode_listsecurity),
- LSM_HOOK_INIT(inode_getsecid, smack_inode_getsecid),
+ LSM_HOOK_INIT(inode_getlsmprop, smack_inode_getlsmprop),
LSM_HOOK_INIT(file_alloc_security, smack_file_alloc_security),
LSM_HOOK_INIT(file_ioctl, smack_file_ioctl),
@@ -5098,13 +5124,14 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
LSM_HOOK_INIT(cred_prepare, smack_cred_prepare),
LSM_HOOK_INIT(cred_transfer, smack_cred_transfer),
LSM_HOOK_INIT(cred_getsecid, smack_cred_getsecid),
+ LSM_HOOK_INIT(cred_getlsmprop, smack_cred_getlsmprop),
LSM_HOOK_INIT(kernel_act_as, smack_kernel_act_as),
LSM_HOOK_INIT(kernel_create_files_as, smack_kernel_create_files_as),
LSM_HOOK_INIT(task_setpgid, smack_task_setpgid),
LSM_HOOK_INIT(task_getpgid, smack_task_getpgid),
LSM_HOOK_INIT(task_getsid, smack_task_getsid),
- LSM_HOOK_INIT(current_getsecid_subj, smack_current_getsecid_subj),
- LSM_HOOK_INIT(task_getsecid_obj, smack_task_getsecid_obj),
+ LSM_HOOK_INIT(current_getlsmprop_subj, smack_current_getlsmprop_subj),
+ LSM_HOOK_INIT(task_getlsmprop_obj, smack_task_getlsmprop_obj),
LSM_HOOK_INIT(task_setnice, smack_task_setnice),
LSM_HOOK_INIT(task_setioprio, smack_task_setioprio),
LSM_HOOK_INIT(task_getioprio, smack_task_getioprio),
@@ -5115,7 +5142,7 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
LSM_HOOK_INIT(task_to_inode, smack_task_to_inode),
LSM_HOOK_INIT(ipc_permission, smack_ipc_permission),
- LSM_HOOK_INIT(ipc_getsecid, smack_ipc_getsecid),
+ LSM_HOOK_INIT(ipc_getlsmprop, smack_ipc_getlsmprop),
LSM_HOOK_INIT(msg_msg_alloc_security, smack_msg_msg_alloc_security),
@@ -5187,6 +5214,7 @@ static struct security_hook_list smack_hooks[] __ro_after_init = {
LSM_HOOK_INIT(ismaclabel, smack_ismaclabel),
LSM_HOOK_INIT(secid_to_secctx, smack_secid_to_secctx),
+ LSM_HOOK_INIT(lsmprop_to_secctx, smack_lsmprop_to_secctx),
LSM_HOOK_INIT(secctx_to_secid, smack_secctx_to_secid),
LSM_HOOK_INIT(inode_notifysecctx, smack_inode_notifysecctx),
LSM_HOOK_INIT(inode_setsecctx, smack_inode_setsecctx),
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index 5dd1e16..1401412 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -182,11 +182,9 @@ static inline void smack_catset_bit(unsigned int cat, char *catsetp)
*/
static void smk_netlabel_audit_set(struct netlbl_audit *nap)
{
- struct smack_known *skp = smk_of_current();
-
nap->loginuid = audit_get_loginuid(current);
nap->sessionid = audit_get_sessionid(current);
- nap->secid = skp->smk_secid;
+ nap->prop.smack.skp = smk_of_current();
}
/*
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index b465fb6..3320cce35 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2250,7 +2250,7 @@ static int snd_pcm_link(struct snd_pcm_substream *substream, int fd)
bool nonatomic = substream->pcm->nonatomic;
CLASS(fd, f)(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADFD;
if (!is_pcm_file(fd_file(f)))
return -EBADFD;
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 571fa8a..24b4fe9 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -7450,7 +7450,6 @@ static void alc287_alc1318_playback_pcm_hook(struct hda_pcm_stream *hinfo,
struct snd_pcm_substream *substream,
int action)
{
- alc_write_coef_idx(codec, 0x10, 0x8806); /* Change MLK to GPIO3 */
switch (action) {
case HDA_GEN_PCM_ACT_OPEN:
alc_write_coefex_idx(codec, 0x5a, 0x00, 0x954f); /* write gpio3 to high */
@@ -7464,7 +7463,6 @@ static void alc287_alc1318_playback_pcm_hook(struct hda_pcm_stream *hinfo,
static void alc287_s4_power_gpio3_default(struct hda_codec *codec)
{
if (is_s4_suspend(codec)) {
- alc_write_coef_idx(codec, 0x10, 0x8806); /* Change MLK to GPIO3 */
alc_write_coefex_idx(codec, 0x5a, 0x00, 0x554f); /* write gpio3 as default value */
}
}
@@ -7473,9 +7471,17 @@ static void alc287_fixup_lenovo_thinkpad_with_alc1318(struct hda_codec *codec,
const struct hda_fixup *fix, int action)
{
struct alc_spec *spec = codec->spec;
+ static const struct coef_fw coefs[] = {
+ WRITE_COEF(0x24, 0x0013), WRITE_COEF(0x25, 0x0000), WRITE_COEF(0x26, 0xC300),
+ WRITE_COEF(0x28, 0x0001), WRITE_COEF(0x29, 0xb023),
+ WRITE_COEF(0x24, 0x0013), WRITE_COEF(0x25, 0x0000), WRITE_COEF(0x26, 0xC301),
+ WRITE_COEF(0x28, 0x0001), WRITE_COEF(0x29, 0xb023),
+ };
if (action != HDA_FIXUP_ACT_PRE_PROBE)
return;
+ alc_update_coef_idx(codec, 0x10, 1<<11, 1<<11);
+ alc_process_coef_fw(codec, coefs);
spec->power_hook = alc287_s4_power_gpio3_default;
spec->gen.pcm_playback_hook = alc287_alc1318_playback_pcm_hook;
}
@@ -10496,6 +10502,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
SND_PCI_QUIRK(0x103c, 0x8b59, "HP Elite mt645 G7 Mobile Thin Client U89", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
SND_PCI_QUIRK(0x103c, 0x8b5d, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
SND_PCI_QUIRK(0x103c, 0x8b5e, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
+ SND_PCI_QUIRK(0x103c, 0x8b5f, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
SND_PCI_QUIRK(0x103c, 0x8b63, "HP Elite Dragonfly 13.5 inch G4", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED),
SND_PCI_QUIRK(0x103c, 0x8b65, "HP ProBook 455 15.6 inch G10 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
SND_PCI_QUIRK(0x103c, 0x8b66, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF),
@@ -11673,6 +11680,8 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = {
{0x1a, 0x40000000}),
SND_HDA_PIN_QUIRK(0x10ec0256, 0x1043, "ASUS", ALC2XX_FIXUP_HEADSET_MIC,
{0x19, 0x40000000}),
+ SND_HDA_PIN_QUIRK(0x10ec0255, 0x1558, "Clevo", ALC2XX_FIXUP_HEADSET_MIC,
+ {0x19, 0x40000000}),
{}
};
diff --git a/sound/soc/codecs/max9768.c b/sound/soc/codecs/max9768.c
index e4793a5..8af3c7e 100644
--- a/sound/soc/codecs/max9768.c
+++ b/sound/soc/codecs/max9768.c
@@ -54,10 +54,17 @@ static int max9768_set_gpio(struct snd_kcontrol *kcontrol,
{
struct snd_soc_component *c = snd_soc_kcontrol_component(kcontrol);
struct max9768 *max9768 = snd_soc_component_get_drvdata(c);
+ bool val = !ucontrol->value.integer.value[0];
+ int ret;
- gpiod_set_value_cansleep(max9768->mute, !ucontrol->value.integer.value[0]);
+ if (val != gpiod_get_value_cansleep(max9768->mute))
+ ret = 1;
+ else
+ ret = 0;
- return 0;
+ gpiod_set_value_cansleep(max9768->mute, val);
+
+ return ret;
}
static const DECLARE_TLV_DB_RANGE(volume_tlv,
diff --git a/sound/soc/generic/audio-graph-card2.c b/sound/soc/generic/audio-graph-card2.c
index 4ad3d1b..93eee40 100644
--- a/sound/soc/generic/audio-graph-card2.c
+++ b/sound/soc/generic/audio-graph-card2.c
@@ -270,16 +270,19 @@ static enum graph_type __graph_get_type(struct device_node *lnk)
if (of_node_name_eq(np, GRAPH_NODENAME_MULTI)) {
ret = GRAPH_MULTI;
+ fw_devlink_purge_absent_suppliers(&np->fwnode);
goto out_put;
}
if (of_node_name_eq(np, GRAPH_NODENAME_DPCM)) {
ret = GRAPH_DPCM;
+ fw_devlink_purge_absent_suppliers(&np->fwnode);
goto out_put;
}
if (of_node_name_eq(np, GRAPH_NODENAME_C2C)) {
ret = GRAPH_C2C;
+ fw_devlink_purge_absent_suppliers(&np->fwnode);
goto out_put;
}
diff --git a/sound/soc/intel/boards/sof_sdw.c b/sound/soc/intel/boards/sof_sdw.c
index 35d707d..4a0ab50 100644
--- a/sound/soc/intel/boards/sof_sdw.c
+++ b/sound/soc/intel/boards/sof_sdw.c
@@ -594,6 +594,14 @@ static const struct dmi_system_id sof_sdw_quirk_table[] = {
.callback = sof_sdw_quirk_cb,
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
+ DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "0CF1")
+ },
+ .driver_data = (void *)(SOC_SDW_CODEC_SPKR),
+ },
+ {
+ .callback = sof_sdw_quirk_cb,
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
DMI_EXACT_MATCH(DMI_PRODUCT_SKU, "0CF7")
},
.driver_data = (void *)(SOC_SDW_CODEC_SPKR),
diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h
index 24c981c..199d0603 100644
--- a/sound/usb/quirks-table.h
+++ b/sound/usb/quirks-table.h
@@ -324,7 +324,6 @@ YAMAHA_DEVICE(0x105a, NULL),
YAMAHA_DEVICE(0x105b, NULL),
YAMAHA_DEVICE(0x105c, NULL),
YAMAHA_DEVICE(0x105d, NULL),
-YAMAHA_DEVICE(0x1718, "P-125"),
{
USB_DEVICE(0x0499, 0x1503),
QUIRK_DRIVER_INFO {
@@ -391,6 +390,19 @@ YAMAHA_DEVICE(0x1718, "P-125"),
}
}
},
+{
+ USB_DEVICE(0x0499, 0x1718),
+ QUIRK_DRIVER_INFO {
+ /* .vendor_name = "Yamaha", */
+ /* .product_name = "P-125", */
+ QUIRK_DATA_COMPOSITE {
+ { QUIRK_DATA_STANDARD_AUDIO(1) },
+ { QUIRK_DATA_STANDARD_AUDIO(2) },
+ { QUIRK_DATA_MIDI_YAMAHA(3) },
+ QUIRK_COMPOSITE_END
+ }
+ }
+},
YAMAHA_DEVICE(0x2000, "DGP-7"),
YAMAHA_DEVICE(0x2001, "DGP-5"),
YAMAHA_DEVICE(0x2002, NULL),
diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h
index 2ec13d8..f9ab83a 100644
--- a/tools/include/nolibc/arch-s390.h
+++ b/tools/include/nolibc/arch-s390.h
@@ -10,6 +10,7 @@
#include "compiler.h"
#include "crt.h"
+#include "std.h"
/* Syscalls for s390:
* - registers are 64-bit
diff --git a/tools/include/nolibc/compiler.h b/tools/include/nolibc/compiler.h
index 9bc6a70..fa1f547 100644
--- a/tools/include/nolibc/compiler.h
+++ b/tools/include/nolibc/compiler.h
@@ -32,4 +32,10 @@
# define __no_stack_protector __attribute__((__optimize__("-fno-stack-protector")))
#endif /* __nolibc_has_attribute(no_stack_protector) */
+#if __nolibc_has_attribute(fallthrough)
+# define __nolibc_fallthrough do { } while (0); __attribute__((fallthrough))
+#else
+# define __nolibc_fallthrough do { } while (0)
+#endif /* __nolibc_has_attribute(fallthrough) */
+
#endif /* _NOLIBC_COMPILER_H */
diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h
index c968dbb..3892034 100644
--- a/tools/include/nolibc/stdio.h
+++ b/tools/include/nolibc/stdio.h
@@ -15,6 +15,7 @@
#include "stdarg.h"
#include "stdlib.h"
#include "string.h"
+#include "compiler.h"
#ifndef EOF
#define EOF (-1)
@@ -264,7 +265,7 @@ int vfprintf(FILE *stream, const char *fmt, va_list args)
case 'p':
*(out++) = '0';
*(out++) = 'x';
- /* fall through */
+ __nolibc_fallthrough;
default: /* 'x' and 'p' above */
u64toh_r(v, out);
break;
diff --git a/tools/lib/thermal/commands.c b/tools/lib/thermal/commands.c
index 73d4d4e..4998cec 100644
--- a/tools/lib/thermal/commands.c
+++ b/tools/lib/thermal/commands.c
@@ -5,6 +5,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#include <limits.h>
#include <thermal.h>
#include "thermal_nl.h"
@@ -33,6 +34,11 @@ static struct nla_policy thermal_genl_policy[THERMAL_GENL_ATTR_MAX + 1] = {
[THERMAL_GENL_ATTR_CDEV_CUR_STATE] = { .type = NLA_U32 },
[THERMAL_GENL_ATTR_CDEV_MAX_STATE] = { .type = NLA_U32 },
[THERMAL_GENL_ATTR_CDEV_NAME] = { .type = NLA_STRING },
+
+ /* Thresholds */
+ [THERMAL_GENL_ATTR_THRESHOLD] = { .type = NLA_NESTED },
+ [THERMAL_GENL_ATTR_THRESHOLD_TEMP] = { .type = NLA_U32 },
+ [THERMAL_GENL_ATTR_THRESHOLD_DIRECTION] = { .type = NLA_U32 },
};
static int parse_tz_get(struct genl_info *info, struct thermal_zone **tz)
@@ -182,6 +188,48 @@ static int parse_tz_get_gov(struct genl_info *info, struct thermal_zone *tz)
return THERMAL_SUCCESS;
}
+static int parse_threshold_get(struct genl_info *info, struct thermal_zone *tz)
+{
+ struct nlattr *attr;
+ struct thermal_threshold *__tt = NULL;
+ size_t size = 0;
+ int rem;
+
+ /*
+ * The size contains the size of the array and we want to
+ * access the last element, size - 1.
+ *
+ * The variable size is initialized to zero but it will be
+ * then incremented by the first if() statement. The message
+ * attributes are ordered, so the first if() statement will be
+ * always called before the second one. If it happens that is
+ * not the case, then it is a kernel bug.
+ */
+ nla_for_each_nested(attr, info->attrs[THERMAL_GENL_ATTR_THRESHOLD], rem) {
+
+ if (nla_type(attr) == THERMAL_GENL_ATTR_THRESHOLD_TEMP) {
+
+ size++;
+
+ __tt = realloc(__tt, sizeof(*__tt) * (size + 2));
+ if (!__tt)
+ return THERMAL_ERROR;
+
+ __tt[size - 1].temperature = nla_get_u32(attr);
+ }
+
+ if (nla_type(attr) == THERMAL_GENL_ATTR_THRESHOLD_DIRECTION)
+ __tt[size - 1].direction = nla_get_u32(attr);
+ }
+
+ if (__tt)
+ __tt[size].temperature = INT_MAX;
+
+ tz->thresholds = __tt;
+
+ return THERMAL_SUCCESS;
+}
+
static int handle_netlink(struct nl_cache_ops *unused,
struct genl_cmd *cmd,
struct genl_info *info, void *arg)
@@ -210,6 +258,10 @@ static int handle_netlink(struct nl_cache_ops *unused,
ret = parse_tz_get_gov(info, arg);
break;
+ case THERMAL_GENL_CMD_THRESHOLD_GET:
+ ret = parse_threshold_get(info, arg);
+ break;
+
default:
return THERMAL_ERROR;
}
@@ -253,6 +305,34 @@ static struct genl_cmd thermal_cmds[] = {
.c_maxattr = THERMAL_GENL_ATTR_MAX,
.c_attr_policy = thermal_genl_policy,
},
+ {
+ .c_id = THERMAL_GENL_CMD_THRESHOLD_GET,
+ .c_name = (char *)"Get thresholds list",
+ .c_msg_parser = handle_netlink,
+ .c_maxattr = THERMAL_GENL_ATTR_MAX,
+ .c_attr_policy = thermal_genl_policy,
+ },
+ {
+ .c_id = THERMAL_GENL_CMD_THRESHOLD_ADD,
+ .c_name = (char *)"Add a threshold",
+ .c_msg_parser = handle_netlink,
+ .c_maxattr = THERMAL_GENL_ATTR_MAX,
+ .c_attr_policy = thermal_genl_policy,
+ },
+ {
+ .c_id = THERMAL_GENL_CMD_THRESHOLD_DELETE,
+ .c_name = (char *)"Delete a threshold",
+ .c_msg_parser = handle_netlink,
+ .c_maxattr = THERMAL_GENL_ATTR_MAX,
+ .c_attr_policy = thermal_genl_policy,
+ },
+ {
+ .c_id = THERMAL_GENL_CMD_THRESHOLD_FLUSH,
+ .c_name = (char *)"Flush the thresholds",
+ .c_msg_parser = handle_netlink,
+ .c_maxattr = THERMAL_GENL_ATTR_MAX,
+ .c_attr_policy = thermal_genl_policy,
+ },
};
static struct genl_ops thermal_cmd_ops = {
@@ -261,9 +341,41 @@ static struct genl_ops thermal_cmd_ops = {
.o_ncmds = ARRAY_SIZE(thermal_cmds),
};
-static thermal_error_t thermal_genl_auto(struct thermal_handler *th, int id, int cmd,
- int flags, void *arg)
+struct cmd_param {
+ int tz_id;
+ int temp;
+ int direction;
+};
+
+typedef int (*cmd_cb_t)(struct nl_msg *, struct cmd_param *);
+
+static int thermal_genl_tz_id_encode(struct nl_msg *msg, struct cmd_param *p)
{
+ if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, p->tz_id))
+ return -1;
+
+ return 0;
+}
+
+static int thermal_genl_threshold_encode(struct nl_msg *msg, struct cmd_param *p)
+{
+ if (thermal_genl_tz_id_encode(msg, p))
+ return -1;
+
+ if (nla_put_u32(msg, THERMAL_GENL_ATTR_THRESHOLD_TEMP, p->temp))
+ return -1;
+
+ if (nla_put_u32(msg, THERMAL_GENL_ATTR_THRESHOLD_DIRECTION, p->direction))
+ return -1;
+
+ return 0;
+}
+
+static thermal_error_t thermal_genl_auto(struct thermal_handler *th, cmd_cb_t cmd_cb,
+ struct cmd_param *param,
+ int cmd, int flags, void *arg)
+{
+ thermal_error_t ret = THERMAL_ERROR;
struct nl_msg *msg;
void *hdr;
@@ -274,45 +386,95 @@ static thermal_error_t thermal_genl_auto(struct thermal_handler *th, int id, int
hdr = genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, thermal_cmd_ops.o_id,
0, flags, cmd, THERMAL_GENL_VERSION);
if (!hdr)
- return THERMAL_ERROR;
+ goto out;
- if (id >= 0 && nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_ID, id))
- return THERMAL_ERROR;
+ if (cmd_cb && cmd_cb(msg, param))
+ goto out;
if (nl_send_msg(th->sk_cmd, th->cb_cmd, msg, genl_handle_msg, arg))
- return THERMAL_ERROR;
+ goto out;
+ ret = THERMAL_SUCCESS;
+out:
nlmsg_free(msg);
- return THERMAL_SUCCESS;
+ return ret;
}
thermal_error_t thermal_cmd_get_tz(struct thermal_handler *th, struct thermal_zone **tz)
{
- return thermal_genl_auto(th, -1, THERMAL_GENL_CMD_TZ_GET_ID,
+ return thermal_genl_auto(th, NULL, NULL, THERMAL_GENL_CMD_TZ_GET_ID,
NLM_F_DUMP | NLM_F_ACK, tz);
}
thermal_error_t thermal_cmd_get_cdev(struct thermal_handler *th, struct thermal_cdev **tc)
{
- return thermal_genl_auto(th, -1, THERMAL_GENL_CMD_CDEV_GET,
+ return thermal_genl_auto(th, NULL, NULL, THERMAL_GENL_CMD_CDEV_GET,
NLM_F_DUMP | NLM_F_ACK, tc);
}
thermal_error_t thermal_cmd_get_trip(struct thermal_handler *th, struct thermal_zone *tz)
{
- return thermal_genl_auto(th, tz->id, THERMAL_GENL_CMD_TZ_GET_TRIP,
- 0, tz);
+ struct cmd_param p = { .tz_id = tz->id };
+
+ return thermal_genl_auto(th, thermal_genl_tz_id_encode, &p,
+ THERMAL_GENL_CMD_TZ_GET_TRIP, 0, tz);
}
thermal_error_t thermal_cmd_get_governor(struct thermal_handler *th, struct thermal_zone *tz)
{
- return thermal_genl_auto(th, tz->id, THERMAL_GENL_CMD_TZ_GET_GOV, 0, tz);
+ struct cmd_param p = { .tz_id = tz->id };
+
+ return thermal_genl_auto(th, thermal_genl_tz_id_encode, &p,
+ THERMAL_GENL_CMD_TZ_GET_GOV, 0, tz);
}
thermal_error_t thermal_cmd_get_temp(struct thermal_handler *th, struct thermal_zone *tz)
{
- return thermal_genl_auto(th, tz->id, THERMAL_GENL_CMD_TZ_GET_TEMP, 0, tz);
+ struct cmd_param p = { .tz_id = tz->id };
+
+ return thermal_genl_auto(th, thermal_genl_tz_id_encode, &p,
+ THERMAL_GENL_CMD_TZ_GET_TEMP, 0, tz);
+}
+
+thermal_error_t thermal_cmd_threshold_get(struct thermal_handler *th,
+ struct thermal_zone *tz)
+{
+ struct cmd_param p = { .tz_id = tz->id };
+
+ return thermal_genl_auto(th, thermal_genl_tz_id_encode, &p,
+ THERMAL_GENL_CMD_THRESHOLD_GET, 0, tz);
+}
+
+thermal_error_t thermal_cmd_threshold_add(struct thermal_handler *th,
+ struct thermal_zone *tz,
+ int temperature,
+ int direction)
+{
+ struct cmd_param p = { .tz_id = tz->id, .temp = temperature, .direction = direction };
+
+ return thermal_genl_auto(th, thermal_genl_threshold_encode, &p,
+ THERMAL_GENL_CMD_THRESHOLD_ADD, 0, tz);
+}
+
+thermal_error_t thermal_cmd_threshold_delete(struct thermal_handler *th,
+ struct thermal_zone *tz,
+ int temperature,
+ int direction)
+{
+ struct cmd_param p = { .tz_id = tz->id, .temp = temperature, .direction = direction };
+
+ return thermal_genl_auto(th, thermal_genl_threshold_encode, &p,
+ THERMAL_GENL_CMD_THRESHOLD_DELETE, 0, tz);
+}
+
+thermal_error_t thermal_cmd_threshold_flush(struct thermal_handler *th,
+ struct thermal_zone *tz)
+{
+ struct cmd_param p = { .tz_id = tz->id };
+
+ return thermal_genl_auto(th, thermal_genl_tz_id_encode, &p,
+ THERMAL_GENL_CMD_THRESHOLD_FLUSH, 0, tz);
}
thermal_error_t thermal_cmd_exit(struct thermal_handler *th)
diff --git a/tools/lib/thermal/events.c b/tools/lib/thermal/events.c
index a7a55d1..bd851c8 100644
--- a/tools/lib/thermal/events.c
+++ b/tools/lib/thermal/events.c
@@ -94,6 +94,30 @@ static int handle_thermal_event(struct nl_msg *n, void *arg)
case THERMAL_GENL_EVENT_TZ_GOV_CHANGE:
return ops->gov_change(nla_get_u32(attrs[THERMAL_GENL_ATTR_TZ_ID]),
nla_get_string(attrs[THERMAL_GENL_ATTR_GOV_NAME]), arg);
+
+ case THERMAL_GENL_EVENT_THRESHOLD_ADD:
+ return ops->threshold_add(nla_get_u32(attrs[THERMAL_GENL_ATTR_TZ_ID]),
+ nla_get_u32(attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP]),
+ nla_get_u32(attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]), arg);
+
+ case THERMAL_GENL_EVENT_THRESHOLD_DELETE:
+ return ops->threshold_delete(nla_get_u32(attrs[THERMAL_GENL_ATTR_TZ_ID]),
+ nla_get_u32(attrs[THERMAL_GENL_ATTR_THRESHOLD_TEMP]),
+ nla_get_u32(attrs[THERMAL_GENL_ATTR_THRESHOLD_DIRECTION]), arg);
+
+ case THERMAL_GENL_EVENT_THRESHOLD_FLUSH:
+ return ops->threshold_flush(nla_get_u32(attrs[THERMAL_GENL_ATTR_TZ_ID]), arg);
+
+ case THERMAL_GENL_EVENT_THRESHOLD_UP:
+ return ops->threshold_up(nla_get_u32(attrs[THERMAL_GENL_ATTR_TZ_ID]),
+ nla_get_u32(attrs[THERMAL_GENL_ATTR_TZ_TEMP]),
+ nla_get_u32(attrs[THERMAL_GENL_ATTR_TZ_PREV_TEMP]), arg);
+
+ case THERMAL_GENL_EVENT_THRESHOLD_DOWN:
+ return ops->threshold_down(nla_get_u32(attrs[THERMAL_GENL_ATTR_TZ_ID]),
+ nla_get_u32(attrs[THERMAL_GENL_ATTR_TZ_TEMP]),
+ nla_get_u32(attrs[THERMAL_GENL_ATTR_TZ_PREV_TEMP]), arg);
+
default:
return -1;
}
@@ -101,19 +125,24 @@ static int handle_thermal_event(struct nl_msg *n, void *arg)
static void thermal_events_ops_init(struct thermal_events_ops *ops)
{
- enabled_ops[THERMAL_GENL_EVENT_TZ_CREATE] = !!ops->tz_create;
- enabled_ops[THERMAL_GENL_EVENT_TZ_DELETE] = !!ops->tz_delete;
- enabled_ops[THERMAL_GENL_EVENT_TZ_DISABLE] = !!ops->tz_disable;
- enabled_ops[THERMAL_GENL_EVENT_TZ_ENABLE] = !!ops->tz_enable;
- enabled_ops[THERMAL_GENL_EVENT_TZ_TRIP_UP] = !!ops->trip_high;
- enabled_ops[THERMAL_GENL_EVENT_TZ_TRIP_DOWN] = !!ops->trip_low;
- enabled_ops[THERMAL_GENL_EVENT_TZ_TRIP_CHANGE] = !!ops->trip_change;
- enabled_ops[THERMAL_GENL_EVENT_TZ_TRIP_ADD] = !!ops->trip_add;
- enabled_ops[THERMAL_GENL_EVENT_TZ_TRIP_DELETE] = !!ops->trip_delete;
- enabled_ops[THERMAL_GENL_EVENT_CDEV_ADD] = !!ops->cdev_add;
- enabled_ops[THERMAL_GENL_EVENT_CDEV_DELETE] = !!ops->cdev_delete;
- enabled_ops[THERMAL_GENL_EVENT_CDEV_STATE_UPDATE] = !!ops->cdev_update;
- enabled_ops[THERMAL_GENL_EVENT_TZ_GOV_CHANGE] = !!ops->gov_change;
+ enabled_ops[THERMAL_GENL_EVENT_TZ_CREATE] = !!ops->tz_create;
+ enabled_ops[THERMAL_GENL_EVENT_TZ_DELETE] = !!ops->tz_delete;
+ enabled_ops[THERMAL_GENL_EVENT_TZ_DISABLE] = !!ops->tz_disable;
+ enabled_ops[THERMAL_GENL_EVENT_TZ_ENABLE] = !!ops->tz_enable;
+ enabled_ops[THERMAL_GENL_EVENT_TZ_TRIP_UP] = !!ops->trip_high;
+ enabled_ops[THERMAL_GENL_EVENT_TZ_TRIP_DOWN] = !!ops->trip_low;
+ enabled_ops[THERMAL_GENL_EVENT_TZ_TRIP_CHANGE] = !!ops->trip_change;
+ enabled_ops[THERMAL_GENL_EVENT_TZ_TRIP_ADD] = !!ops->trip_add;
+ enabled_ops[THERMAL_GENL_EVENT_TZ_TRIP_DELETE] = !!ops->trip_delete;
+ enabled_ops[THERMAL_GENL_EVENT_CDEV_ADD] = !!ops->cdev_add;
+ enabled_ops[THERMAL_GENL_EVENT_CDEV_DELETE] = !!ops->cdev_delete;
+ enabled_ops[THERMAL_GENL_EVENT_CDEV_STATE_UPDATE] = !!ops->cdev_update;
+ enabled_ops[THERMAL_GENL_EVENT_TZ_GOV_CHANGE] = !!ops->gov_change;
+ enabled_ops[THERMAL_GENL_EVENT_THRESHOLD_ADD] = !!ops->threshold_add;
+ enabled_ops[THERMAL_GENL_EVENT_THRESHOLD_DELETE] = !!ops->threshold_delete;
+ enabled_ops[THERMAL_GENL_EVENT_THRESHOLD_FLUSH] = !!ops->threshold_flush;
+ enabled_ops[THERMAL_GENL_EVENT_THRESHOLD_UP] = !!ops->threshold_up;
+ enabled_ops[THERMAL_GENL_EVENT_THRESHOLD_DOWN] = !!ops->threshold_down;
}
thermal_error_t thermal_events_handle(struct thermal_handler *th, void *arg)
diff --git a/tools/lib/thermal/include/thermal.h b/tools/lib/thermal/include/thermal.h
index 1abc560..818ecdf 100644
--- a/tools/lib/thermal/include/thermal.h
+++ b/tools/lib/thermal/include/thermal.h
@@ -4,11 +4,20 @@
#define __LIBTHERMAL_H
#include <linux/thermal.h>
+#include <sys/types.h>
#ifndef LIBTHERMAL_API
#define LIBTHERMAL_API __attribute__((visibility("default")))
#endif
+#ifndef THERMAL_THRESHOLD_WAY_UP
+#define THERMAL_THRESHOLD_WAY_UP 0x1
+#endif
+
+#ifndef THERMAL_THRESHOLD_WAY_DOWN
+#define THERMAL_THRESHOLD_WAY_DOWN 0x2
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -31,6 +40,11 @@ struct thermal_events_ops {
int (*cdev_delete)(int cdev_id, void *arg);
int (*cdev_update)(int cdev_id, int cur_state, void *arg);
int (*gov_change)(int tz_id, const char *gov_name, void *arg);
+ int (*threshold_add)(int tz_id, int temperature, int direction, void *arg);
+ int (*threshold_delete)(int tz_id, int temperature, int direction, void *arg);
+ int (*threshold_flush)(int tz_id, void *arg);
+ int (*threshold_up)(int tz_id, int temp, int prev_temp, void *arg);
+ int (*threshold_down)(int tz_id, int temp, int prev_temp, void *arg);
};
struct thermal_ops {
@@ -45,12 +59,18 @@ struct thermal_trip {
int hyst;
};
+struct thermal_threshold {
+ int temperature;
+ int direction;
+};
+
struct thermal_zone {
int id;
int temp;
char name[THERMAL_NAME_LENGTH];
char governor[THERMAL_NAME_LENGTH];
struct thermal_trip *trip;
+ struct thermal_threshold *thresholds;
};
struct thermal_cdev {
@@ -74,12 +94,16 @@ typedef int (*cb_tt_t)(struct thermal_trip *, void *);
typedef int (*cb_tc_t)(struct thermal_cdev *, void *);
+typedef int (*cb_th_t)(struct thermal_threshold *, void *);
+
LIBTHERMAL_API int for_each_thermal_zone(struct thermal_zone *tz, cb_tz_t cb, void *arg);
LIBTHERMAL_API int for_each_thermal_trip(struct thermal_trip *tt, cb_tt_t cb, void *arg);
LIBTHERMAL_API int for_each_thermal_cdev(struct thermal_cdev *cdev, cb_tc_t cb, void *arg);
+LIBTHERMAL_API int for_each_thermal_threshold(struct thermal_threshold *th, cb_th_t cb, void *arg);
+
LIBTHERMAL_API struct thermal_zone *thermal_zone_find_by_name(struct thermal_zone *tz,
const char *name);
@@ -124,6 +148,22 @@ LIBTHERMAL_API thermal_error_t thermal_cmd_get_governor(struct thermal_handler *
LIBTHERMAL_API thermal_error_t thermal_cmd_get_temp(struct thermal_handler *th,
struct thermal_zone *tz);
+LIBTHERMAL_API thermal_error_t thermal_cmd_threshold_get(struct thermal_handler *th,
+ struct thermal_zone *tz);
+
+LIBTHERMAL_API thermal_error_t thermal_cmd_threshold_add(struct thermal_handler *th,
+ struct thermal_zone *tz,
+ int temperature,
+ int direction);
+
+LIBTHERMAL_API thermal_error_t thermal_cmd_threshold_delete(struct thermal_handler *th,
+ struct thermal_zone *tz,
+ int temperature,
+ int direction);
+
+LIBTHERMAL_API thermal_error_t thermal_cmd_threshold_flush(struct thermal_handler *th,
+ struct thermal_zone *tz);
+
/*
* Netlink thermal samples
*/
diff --git a/tools/lib/thermal/libthermal.map b/tools/lib/thermal/libthermal.map
index d5e7773..d657176 100644
--- a/tools/lib/thermal/libthermal.map
+++ b/tools/lib/thermal/libthermal.map
@@ -4,6 +4,7 @@
for_each_thermal_zone;
for_each_thermal_trip;
for_each_thermal_cdev;
+ for_each_thermal_threshold;
thermal_zone_find_by_name;
thermal_zone_find_by_id;
thermal_zone_discover;
@@ -17,6 +18,10 @@
thermal_cmd_get_trip;
thermal_cmd_get_governor;
thermal_cmd_get_temp;
+ thermal_cmd_threshold_get;
+ thermal_cmd_threshold_add;
+ thermal_cmd_threshold_delete;
+ thermal_cmd_threshold_flush;
thermal_sampling_init;
thermal_sampling_handle;
thermal_sampling_fd;
diff --git a/tools/lib/thermal/thermal.c b/tools/lib/thermal/thermal.c
index 72a76dc..6f02e35 100644
--- a/tools/lib/thermal/thermal.c
+++ b/tools/lib/thermal/thermal.c
@@ -1,10 +1,24 @@
// SPDX-License-Identifier: LGPL-2.1+
// Copyright (C) 2022, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
#include <stdio.h>
+#include <limits.h>
#include <thermal.h>
#include "thermal_nl.h"
+int for_each_thermal_threshold(struct thermal_threshold *th, cb_th_t cb, void *arg)
+{
+ int i, ret = 0;
+
+ if (!th)
+ return 0;
+
+ for (i = 0; th[i].temperature != INT_MAX; i++)
+ ret |= cb(&th[i], arg);
+
+ return ret;
+}
+
int for_each_thermal_cdev(struct thermal_cdev *cdev, cb_tc_t cb, void *arg)
{
int i, ret = 0;
@@ -80,6 +94,9 @@ static int __thermal_zone_discover(struct thermal_zone *tz, void *th)
if (thermal_cmd_get_trip(th, tz) < 0)
return -1;
+ if (thermal_cmd_threshold_get(th, tz))
+ return -1;
+
if (thermal_cmd_get_governor(th, tz))
return -1;
diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c
index 6eb17cc..bcac7eb 100644
--- a/tools/mm/page-types.c
+++ b/tools/mm/page-types.c
@@ -420,7 +420,7 @@ static void show_page(unsigned long voffset, unsigned long offset,
if (opt_file)
printf("%lx\t", voffset);
if (opt_list_cgroup)
- printf("@%" PRIu64 "\t", cgroup)
+ printf("@%" PRIu64 "\t", cgroup);
if (opt_list_mapcnt)
printf("%" PRIu64 "\t", mapcnt);
diff --git a/tools/power/cpupower/.gitignore b/tools/power/cpupower/.gitignore
index 7677329..5113d5a 100644
--- a/tools/power/cpupower/.gitignore
+++ b/tools/power/cpupower/.gitignore
@@ -27,3 +27,6 @@
debug/i386/powernow-k8-decode
debug/x86_64/centrino-decode
debug/x86_64/powernow-k8-decode
+
+# Clang's compilation database file
+compile_commands.json
diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile
index 6c02f40..175004c 100644
--- a/tools/power/cpupower/Makefile
+++ b/tools/power/cpupower/Makefile
@@ -57,7 +57,7 @@
PACKAGE = cpupower
PACKAGE_BUGREPORT = linux-pm@vger.kernel.org
-LANGUAGES = de fr it cs pt ka
+LANGUAGES = de fr it cs pt ka zh_CN
# Directory definitions. These are default and most probably
@@ -86,12 +86,12 @@
# If you are running a cross compiler, you may want to set this
# to something more interesting, like "arm-linux-". If you want
# to compile vs uClibc, that can be done here as well.
-CROSS = #/usr/i386-linux-uclibc/usr/bin/i386-uclibc-
-CC = $(CROSS)gcc
-LD = $(CROSS)gcc
-AR = $(CROSS)ar
-STRIP = $(CROSS)strip
-RANLIB = $(CROSS)ranlib
+CROSS ?= #/usr/i386-linux-uclibc/usr/bin/i386-uclibc-
+CC ?= $(CROSS)gcc
+LD ?= $(CROSS)gcc
+AR ?= $(CROSS)ar
+STRIP ?= $(CROSS)strip
+RANLIB ?= $(CROSS)ranlib
HOSTCC = gcc
MKDIR = mkdir
@@ -218,17 +218,28 @@
endif
$(QUIET) $(STRIPCMD) $@
+ifeq (, $(shell which xgettext))
+$(warning "Install xgettext to extract translatable strings.")
+else
$(OUTPUT)po/$(PACKAGE).pot: $(UTIL_SRC)
$(ECHO) " GETTEXT " $@
$(QUIET) xgettext --default-domain=$(PACKAGE) --add-comments \
--keyword=_ --keyword=N_ $(UTIL_SRC) -p $(@D) -o $(@F)
+endif
+ifeq (, $(shell which msgfmt))
+$(warning "Install msgfmt to generate binary message catalogs.")
+else
$(OUTPUT)po/%.gmo: po/%.po
$(ECHO) " MSGFMT " $@
$(QUIET) msgfmt -o $@ po/$*.po
+endif
create-gmo: ${GMO_FILES}
+ifeq (, $(shell which msgmerge))
+$(warning "Install msgmerge to merge translations.")
+else
update-po: $(OUTPUT)po/$(PACKAGE).pot
$(ECHO) " MSGMRG " $@
$(QUIET) @for HLANG in $(LANGUAGES); do \
@@ -241,6 +252,7 @@
rm -f $(OUTPUT)po/$$HLANG.new.po; \
fi; \
done;
+endif
compile-bench: $(OUTPUT)libcpupower.so.$(LIB_MAJ)
@V=$(V) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT)
diff --git a/tools/power/cpupower/bench/parse.c b/tools/power/cpupower/bench/parse.c
index e63dc11..080678d 100644
--- a/tools/power/cpupower/bench/parse.c
+++ b/tools/power/cpupower/bench/parse.c
@@ -4,6 +4,7 @@
* Copyright (C) 2008 Christian Kornacker <ckornacker@suse.de>
*/
+#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
@@ -165,8 +166,8 @@ int prepare_config(const char *path, struct config *config)
configfile = fopen(path, "r");
if (configfile == NULL) {
- perror("fopen");
- fprintf(stderr, "error: unable to read configfile\n");
+ fprintf(stderr, "error: unable to read configfile: %s, %s\n",
+ path, strerror(errno));
free(config);
return 1;
}
diff --git a/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py b/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py
index 3d6f62b..ca5aa46 100755
--- a/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py
+++ b/tools/power/cpupower/bindings/python/test_raw_pylibcpupower.py
@@ -15,22 +15,38 @@
print(f"cstate count error: return code: {cpu_cstates_count}")
"""
-Disable cstate (will fail if the above is 0, ex: a virtual machine)
+Disable cstate (will fail if the above returns is under 1, ex: a virtual machine)
"""
cstate_disabled = p.cpuidle_state_disable(0, 0, 1)
-if cpu_cstates_count == 0:
- print(f"CPU 0 has {cpu_cstates_count} c-states")
-else:
- print(f"cstate count error: return code: {cpu_cstates_count}")
match cstate_disabled:
case 0:
print(f"CPU state disabled")
case -1:
print(f"Idlestate not available")
+ case -2:
+ print(f"Disabling is not supported by the kernel")
+ case -3:
+ print(f"No write access to disable/enable C-states: try using sudo")
case _:
- print(f"Not documented")
+ print(f"Not documented: {cstate_disabled}")
+"""
+Test cstate is disabled
+"""
+is_cstate_disabled = p.cpuidle_is_state_disabled(0, 0)
+
+match is_cstate_disabled:
+ case 1:
+ print(f"CPU is disabled")
+ case 0:
+ print(f"CPU is enabled")
+ case -1:
+ print(f"Idlestate not available")
+ case -2:
+ print(f"Disabling is not supported by kernel")
+ case _:
+ print(f"Not documented: {is_cstate_disabled}")
# Pointer example
diff --git a/tools/power/cpupower/man/cpupower-set.1 b/tools/power/cpupower/man/cpupower-set.1
index 2bcc696..500653e 100644
--- a/tools/power/cpupower/man/cpupower-set.1
+++ b/tools/power/cpupower/man/cpupower-set.1
@@ -3,7 +3,7 @@
cpupower\-set \- Set processor power related kernel or hardware configurations
.SH SYNOPSIS
.ft B
-.B cpupower set [ \-b VAL ]
+.B cpupower set [ \-b VAL | \-e POLICY | \-m MODE | \-t BOOL ]
.SH DESCRIPTION
@@ -19,7 +19,7 @@
Use \fBcpupower info \fP to read out current settings and whether they are
supported on the system at all.
-.SH Options
+.SH OPTIONS
.PP
\-\-perf-bias, \-b
.RS 4
@@ -56,6 +56,40 @@
This options needs the msr kernel driver (CONFIG_X86_MSR) loaded.
.RE
+.PP
+\-\-epp, \-e
+.RS 4
+Sets the energy performance policy preference on supported Intel or AMD
+processors which use the Intel or AMD P-State cpufreq driver respectively.
+
+Available policies can be found with
+\fBcat /sys/devices/system/cpu/cpufreq/policy0/energy_performance_available_preferences\fP :
+.RS 4
+default performance balance_performance balance_power power
+.RE
+
+.RE
+
+.PP
+\-\-amd\-pstate\-mode, \-m
+.RS 4
+Sets the AMD P-State mode for supported AMD processors.
+Available modes are "active", "guided" or "passive".
+
+Refer to the AMD P-State kernel documentation for further information.
+
+.RE
+
+.PP
+\-\-turbo\-boost, \-t
+.RS 4
+This option is used to enable or disable the turbo boost feature on
+supported Intel and AMD processors.
+
+This option takes as parameter either \fB1\fP to enable, or \fB0\fP to disable the feature.
+
+.RE
+
.SH "SEE ALSO"
cpupower-info(1), cpupower-monitor(1), powertop(1)
.PP
diff --git a/tools/power/cpupower/po/zh_CN.po b/tools/power/cpupower/po/zh_CN.po
new file mode 100644
index 0000000..0489abf
--- /dev/null
+++ b/tools/power/cpupower/po/zh_CN.po
@@ -0,0 +1,942 @@
+# Chinese Simplified translations for cpufrequtils package
+# Copyright (C) 2004 THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the cpufrequtils package.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: cpufrequtils 006\n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2011-03-08 17:03+0100\n"
+"PO-Revision-Date: 2024-05-22 15:36+0000\n"
+"Last-Translator: Kieran Moy <kfatyuip@gmail.com>\n"
+"Language-Team: NONE\n"
+"Language: zh_CN\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: Poedit 3.4.2\n"
+
+#: utils/idle_monitor/nhm_idle.c:36
+msgid "Processor Core C3"
+msgstr "处理器 Core C3"
+
+#: utils/idle_monitor/nhm_idle.c:43
+msgid "Processor Core C6"
+msgstr "处理器 Core C6"
+
+#: utils/idle_monitor/nhm_idle.c:51
+msgid "Processor Package C3"
+msgstr "处理器套件 C3"
+
+#: utils/idle_monitor/nhm_idle.c:58 utils/idle_monitor/amd_fam14h_idle.c:70
+msgid "Processor Package C6"
+msgstr "处理器套件 C6"
+
+#: utils/idle_monitor/snb_idle.c:33
+msgid "Processor Core C7"
+msgstr "处理器 Core C7"
+
+#: utils/idle_monitor/snb_idle.c:40
+msgid "Processor Package C2"
+msgstr "处理器套件 C2"
+
+#: utils/idle_monitor/snb_idle.c:47
+msgid "Processor Package C7"
+msgstr "处理器套件 C7"
+
+#: utils/idle_monitor/amd_fam14h_idle.c:56
+msgid "Package in sleep state (PC1 or deeper)"
+msgstr "Package in sleep state (PC1 或更深)"
+
+#: utils/idle_monitor/amd_fam14h_idle.c:63
+msgid "Processor Package C1"
+msgstr "处理器套件 C1"
+
+#: utils/idle_monitor/amd_fam14h_idle.c:77
+msgid "North Bridge P1 boolean counter (returns 0 or 1)"
+msgstr "北桥 P1 布尔计数器(返回 0 或 1)"
+
+#: utils/idle_monitor/mperf_monitor.c:35
+msgid "Processor Core not idle"
+msgstr "处理器 Core不空闲"
+
+#: utils/idle_monitor/mperf_monitor.c:42
+msgid "Processor Core in an idle state"
+msgstr "处理器 Core处于空闲状态"
+
+#: utils/idle_monitor/mperf_monitor.c:50
+msgid "Average Frequency (including boost) in MHz"
+msgstr "平均频率(包括增加频率),单位 MHz"
+
+#: utils/idle_monitor/cpupower-monitor.c:66
+#, c-format
+msgid ""
+"cpupower monitor: [-h] [ [-t] | [-l] | [-m <mon1>,[<mon2>] ] ] [-i "
+"interval_sec | -c command ...]\n"
+msgstr ""
+"cpupower monitor:[-h] [ [-t] | [-l] | [-m <mon1>,[<mon2>] ] ] [-i "
+"interval_sec | -c command...]\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:69
+#, c-format
+msgid ""
+"cpupower monitor: [-v] [-h] [ [-t] | [-l] | [-m <mon1>,[<mon2>] ] ] [-i "
+"interval_sec | -c command ...]\n"
+msgstr ""
+"cpupower monitor:[-v] [-h] [ [-t] | [-l] | [-m <mon1>,[<mon2>] ] ] [-i "
+"interval_sec | -c command...]\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:71
+#, c-format
+msgid "\t -v: be more verbose\n"
+msgstr "-v:更详细\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:73
+#, c-format
+msgid "\t -h: print this help\n"
+msgstr "-h:打印此帮助\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:74
+#, c-format
+msgid "\t -i: time interval to measure for in seconds (default 1)\n"
+msgstr "-i:测量的时间间隔(以秒为单位)(默认 1)\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:75
+#, c-format
+msgid "\t -t: show CPU topology/hierarchy\n"
+msgstr "-t:显示CPU拓扑/层次结构\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:76
+#, c-format
+msgid "\t -l: list available CPU sleep monitors (for use with -m)\n"
+msgstr "-l:列出可用的 CPU 睡眠监视器(与 -m 一起使用)\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:77
+#, c-format
+msgid "\t -m: show specific CPU sleep monitors only (in same order)\n"
+msgstr "-m:仅显示特定的CPU睡眠监视器(按相同顺序)\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:79
+#, c-format
+msgid ""
+"only one of: -t, -l, -m are allowed\n"
+"If none of them is passed,"
+msgstr ""
+"仅允许以下之一:-t、-l、-m\n"
+"如果都没有通过的话"
+
+#: utils/idle_monitor/cpupower-monitor.c:80
+#, c-format
+msgid " all supported monitors are shown\n"
+msgstr " 显示所有支持的显示器\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:197
+#, c-format
+msgid "Monitor %s, Counter %s has no count function. Implementation error\n"
+msgstr "监视器 %s、计数器 %s 无计数功能。 执行错误\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:207
+#, c-format
+msgid " *is offline\n"
+msgstr " *离线\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:236
+#, c-format
+msgid "%s: max monitor name length (%d) exceeded\n"
+msgstr "%s:超出最大监视器名称长度 (%d)\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:250
+#, c-format
+msgid "No matching monitor found in %s, try -l option\n"
+msgstr "在 %s 中找不到匹配的监视器,请尝试 -l 选项\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:266
+#, c-format
+msgid "Monitor \"%s\" (%d states) - Might overflow after %u s\n"
+msgstr "监视器“%s”(%d 状态)- 可能会在 %u 秒后溢出\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:319
+#, c-format
+msgid "%s took %.5f seconds and exited with status %d\n"
+msgstr "%s 用了 %.5f 秒并退出,状态为 %d\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:406
+#, c-format
+msgid "Cannot read number of available processors\n"
+msgstr "无法读取可用处理器的数量\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:417
+#, c-format
+msgid "Available monitor %s needs root access\n"
+msgstr "可用监视器 %s 需要 root 访问权限\n"
+
+#: utils/idle_monitor/cpupower-monitor.c:428
+#, c-format
+msgid "No HW Cstate monitors found\n"
+msgstr "未找到 HW Cstate 监视器\n"
+
+#: utils/cpupower.c:78
+#, c-format
+msgid "cpupower [ -c cpulist ] subcommand [ARGS]\n"
+msgstr "cpupower [ -c cpulist ] subcommand [ARGS]\n"
+
+#: utils/cpupower.c:79
+#, c-format
+msgid "cpupower --version\n"
+msgstr "cpupower --version\n"
+
+#: utils/cpupower.c:80
+#, c-format
+msgid "Supported subcommands are:\n"
+msgstr "支持的子命令有:\n"
+
+#: utils/cpupower.c:83
+#, c-format
+msgid ""
+"\n"
+"Some subcommands can make use of the -c cpulist option.\n"
+msgstr ""
+"\n"
+"某些子命令可以使用 -c cpulist 选项。\n"
+
+#: utils/cpupower.c:84
+#, c-format
+msgid "Look at the general cpupower manpage how to use it\n"
+msgstr "看看一般的cpupower manpage如何使用它\n"
+
+#: utils/cpupower.c:85
+#, c-format
+msgid "and read up the subcommand's manpage whether it is supported.\n"
+msgstr "并阅读子命令的manpage是否受支持。\n"
+
+#: utils/cpupower.c:86
+#, c-format
+msgid ""
+"\n"
+"Use cpupower help subcommand for getting help for above subcommands.\n"
+msgstr ""
+"\n"
+"使用 cpupower help subcommand获取上述子命令的帮助。\n"
+
+#: utils/cpupower.c:91
+#, c-format
+msgid "Report errors and bugs to %s, please.\n"
+msgstr "请向 %s 报告错误和错误。\n"
+
+#: utils/cpupower.c:114
+#, c-format
+msgid "Error parsing cpu list\n"
+msgstr "解析cpu列表时出错\n"
+
+#: utils/cpupower.c:172
+#, c-format
+msgid "Subcommand %s needs root privileges\n"
+msgstr "子命令 %s 需要 root 权限\n"
+
+#: utils/cpufreq-info.c:31
+#, c-format
+msgid "Couldn't count the number of CPUs (%s: %s), assuming 1\n"
+msgstr "无法计算 CPU 数量(%s:%s),假设为 1\n"
+
+#: utils/cpufreq-info.c:63
+#, c-format
+msgid ""
+" minimum CPU frequency - maximum CPU frequency - governor\n"
+msgstr "最低 CPU 频率 - 最高 CPU 频率 - 调速器\n"
+
+#: utils/cpufreq-info.c:151
+#, c-format
+msgid "Error while evaluating Boost Capabilities on CPU %d -- are you root?\n"
+msgstr "评估 CPU %d 上的 Boost 功能时出错 - 您是 root 吗?\n"
+
+#. P state changes via MSR are identified via cpuid 80000007
+#. on Intel and AMD, but we assume boost capable machines can do that
+#. if (cpuid_eax(0x80000000) >= 0x80000007
+#. && (cpuid_edx(0x80000007) & (1 << 7)))
+#.
+#: utils/cpufreq-info.c:161
+#, c-format
+msgid " boost state support: \n"
+msgstr " 升压状态支持:\n"
+
+#: utils/cpufreq-info.c:163
+#, c-format
+msgid " Supported: %s\n"
+msgstr " 支持:%s\n"
+
+#: utils/cpufreq-info.c:163 utils/cpufreq-info.c:164
+msgid "yes"
+msgstr "是"
+
+#: utils/cpufreq-info.c:163 utils/cpufreq-info.c:164
+msgid "no"
+msgstr "不是"
+
+#: utils/cpufreq-info.c:164
+#, c-format
+msgid " Active: %s\n"
+msgstr " 活跃:%s\n"
+
+#: utils/cpufreq-info.c:177
+#, c-format
+msgid " Boost States: %d\n"
+msgstr " 提升状态:%d\n"
+
+#: utils/cpufreq-info.c:178
+#, c-format
+msgid " Total States: %d\n"
+msgstr " 状态总数:%d\n"
+
+#: utils/cpufreq-info.c:181
+#, c-format
+msgid " Pstate-Pb%d: %luMHz (boost state)\n"
+msgstr " Pstate-Pb%d:%luMHz(升压状态)\n"
+
+#: utils/cpufreq-info.c:184
+#, c-format
+msgid " Pstate-P%d: %luMHz\n"
+msgstr " Pstate-P%d:%luMHz\n"
+
+#: utils/cpufreq-info.c:211
+#, c-format
+msgid " no or unknown cpufreq driver is active on this CPU\n"
+msgstr " 该 CPU 上没有或未知的 cpufreq 驱动程序处于活动状态\n"
+
+#: utils/cpufreq-info.c:213
+#, c-format
+msgid " driver: %s\n"
+msgstr " 驱动程序:%s\n"
+
+#: utils/cpufreq-info.c:219
+#, c-format
+msgid " CPUs which run at the same hardware frequency: "
+msgstr " 以相同硬件频率运行的 CPU:"
+
+#: utils/cpufreq-info.c:230
+#, c-format
+msgid " CPUs which need to have their frequency coordinated by software: "
+msgstr " 需要通过软件协调频率的 CPU:"
+
+#: utils/cpufreq-info.c:241
+#, c-format
+msgid " maximum transition latency: "
+msgstr " 最大转换延迟:"
+
+#: utils/cpufreq-info.c:247
+#, c-format
+msgid " hardware limits: "
+msgstr " 硬件限制:"
+
+#: utils/cpufreq-info.c:256
+#, c-format
+msgid " available frequency steps: "
+msgstr " 可用频率范围:"
+
+#: utils/cpufreq-info.c:269
+#, c-format
+msgid " available cpufreq governors: "
+msgstr " 可用的cpufreq调节器:"
+
+#: utils/cpufreq-info.c:280
+#, c-format
+msgid " current policy: frequency should be within "
+msgstr " 当前政策:频率应在"
+
+#: utils/cpufreq-info.c:282
+#, c-format
+msgid " and "
+msgstr "和"
+
+#: utils/cpufreq-info.c:286
+#, c-format
+msgid ""
+"The governor \"%s\" may decide which speed to use\n"
+" within this range.\n"
+msgstr ""
+"调速器“%s”可以决定使用哪种速度\n"
+" 在这个范围内。\n"
+
+#: utils/cpufreq-info.c:293
+#, c-format
+msgid " current CPU frequency is "
+msgstr " 当前CPU频率是"
+
+#: utils/cpufreq-info.c:296
+#, c-format
+msgid " (asserted by call to hardware)"
+msgstr " (通过调用硬件来断言)"
+
+#: utils/cpufreq-info.c:304
+#, c-format
+msgid " cpufreq stats: "
+msgstr " cpu频率统计:"
+
+#: utils/cpufreq-info.c:472
+#, c-format
+msgid "Usage: cpupower freqinfo [options]\n"
+msgstr "用法:cpupower freqinfo [选项]\n"
+
+#: utils/cpufreq-info.c:473 utils/cpufreq-set.c:26 utils/cpupower-set.c:23
+#: utils/cpupower-info.c:22 utils/cpuidle-info.c:148
+#, c-format
+msgid "Options:\n"
+msgstr "选项:\n"
+
+#: utils/cpufreq-info.c:474
+#, c-format
+msgid " -e, --debug Prints out debug information [default]\n"
+msgstr " -e, --debug 打印出调试信息[默认]\n"
+
+#: utils/cpufreq-info.c:475
+#, c-format
+msgid ""
+" -f, --freq Get frequency the CPU currently runs at, according\n"
+" to the cpufreq core *\n"
+msgstr ""
+" -f, --freq 获取CPU当前运行的频率,根据\n"
+" 到 cpufreq 核心 *\n"
+
+#: utils/cpufreq-info.c:477
+#, c-format
+msgid ""
+" -w, --hwfreq Get frequency the CPU currently runs at, by reading\n"
+" it from hardware (only available to root) *\n"
+msgstr ""
+" -w, --hwfreq 通过读取获取CPU当前运行的频率\n"
+" 它来自硬件(仅适用于root)*\n"
+
+#: utils/cpufreq-info.c:479
+#, c-format
+msgid ""
+" -l, --hwlimits Determine the minimum and maximum CPU frequency "
+"allowed *\n"
+msgstr " -l, --hwlimits 确定允许的最小和最大 CPU 频率 *\n"
+
+#: utils/cpufreq-info.c:480
+#, c-format
+msgid " -d, --driver Determines the used cpufreq kernel driver *\n"
+msgstr " -d, --driver 确定使用的 cpufreq 内核驱动程序 *\n"
+
+#: utils/cpufreq-info.c:481
+#, c-format
+msgid " -p, --policy Gets the currently used cpufreq policy *\n"
+msgstr " -p, --policy 获取当前使用的cpufreq策略 *\n"
+
+#: utils/cpufreq-info.c:482
+#, c-format
+msgid " -g, --governors Determines available cpufreq governors *\n"
+msgstr " -g, --governors 确定可用的 cpufreq 调节器 *\n"
+
+#: utils/cpufreq-info.c:483
+#, c-format
+msgid ""
+" -r, --related-cpus Determines which CPUs run at the same hardware "
+"frequency *\n"
+msgstr " -r, --lated-cpus 确定哪些 CPU 以相同的硬件频率运行 *\n"
+
+#: utils/cpufreq-info.c:484
+#, c-format
+msgid ""
+" -a, --affected-cpus Determines which CPUs need to have their frequency\n"
+" coordinated by software *\n"
+msgstr ""
+" -a, --affected-cpus 确定哪些 CPU 需要其频率\n"
+" 由软件协调*\n"
+
+#: utils/cpufreq-info.c:486
+#, c-format
+msgid " -s, --stats Shows cpufreq statistics if available\n"
+msgstr " -s, --stats 显示 cpufreq 统计信息(如果有)\n"
+
+#: utils/cpufreq-info.c:487
+#, c-format
+msgid ""
+" -y, --latency Determines the maximum latency on CPU frequency "
+"changes *\n"
+msgstr " -y, --latency 确定 CPU 频率变化的最大延迟*\n"
+
+#: utils/cpufreq-info.c:488
+#, c-format
+msgid " -b, --boost Checks for turbo or boost modes *\n"
+msgstr " -b, --boost 检查 Turbo 或 boost 模式 *\n"
+
+#: utils/cpufreq-info.c:489
+#, c-format
+msgid ""
+" -o, --proc Prints out information like provided by the /proc/"
+"cpufreq\n"
+" interface in 2.4. and early 2.6. kernels\n"
+msgstr ""
+" -o, --proc 打印 /proc/cpufreq 提供的信息\n"
+" 2.4 中的接口。 以及 2.6 之前的内核。\n"
+
+#: utils/cpufreq-info.c:491
+#, c-format
+msgid ""
+" -m, --human human-readable output for the -f, -w, -s and -y "
+"parameters\n"
+msgstr " -m, -- human -f, -w, -s 和 -y 参数的人类可读输出\n"
+
+#: utils/cpufreq-info.c:492 utils/cpuidle-info.c:152
+#, c-format
+msgid " -h, --help Prints out this screen\n"
+msgstr " -h, --help 打印此屏幕\n"
+
+#: utils/cpufreq-info.c:495
+#, c-format
+msgid ""
+"If no argument or only the -c, --cpu parameter is given, debug output "
+"about\n"
+"cpufreq is printed which is useful e.g. for reporting bugs.\n"
+msgstr ""
+"screen如果没有参数或仅给出了 -c, --cpu 参数,则调试输出有关\n"
+"cpufreq 被打印出来,这很有用,例如 用于报告错误。\n"
+
+#: utils/cpufreq-info.c:497
+#, c-format
+msgid ""
+"For the arguments marked with *, omitting the -c or --cpu argument is\n"
+"equivalent to setting it to zero\n"
+msgstr ""
+"对于标有 * 的参数,省略 -c 或 --cpu 参数是\n"
+"相当于将其设置为零\n"
+
+#: utils/cpufreq-info.c:580
+#, c-format
+msgid ""
+"The argument passed to this tool can't be combined with passing a --cpu "
+"argument\n"
+msgstr "传递给此工具的参数不能与传递 --cpu 参数结合使用\n"
+
+#: utils/cpufreq-info.c:596
+#, c-format
+msgid ""
+"You can't specify more than one --cpu parameter and/or\n"
+"more than one output-specific argument\n"
+msgstr ""
+"您不能指定多个 --cpu 参数和/或\n"
+"多个特定于输出的参数\n"
+
+#: utils/cpufreq-info.c:600 utils/cpufreq-set.c:82 utils/cpupower-set.c:42
+#: utils/cpupower-info.c:42 utils/cpuidle-info.c:213
+#, c-format
+msgid "invalid or unknown argument\n"
+msgstr "无效或未知的参数\n"
+
+#: utils/cpufreq-info.c:617
+#, c-format
+msgid "couldn't analyze CPU %d as it doesn't seem to be present\n"
+msgstr "无法分析 CPU %d,因为它似乎不存在\n"
+
+#: utils/cpufreq-info.c:620 utils/cpupower-info.c:142
+#, c-format
+msgid "analyzing CPU %d:\n"
+msgstr "分析 CPU %d:\n"
+
+#: utils/cpufreq-set.c:25
+#, c-format
+msgid "Usage: cpupower frequency-set [options]\n"
+msgstr "用法:cpupower frequency-set [选项]\n"
+
+#: utils/cpufreq-set.c:27
+#, c-format
+msgid ""
+" -d FREQ, --min FREQ new minimum CPU frequency the governor may "
+"select\n"
+msgstr " -d FREQ, --min FREQ 调控器可以选择的新的最小 CPU 频率\n"
+
+#: utils/cpufreq-set.c:28
+#, c-format
+msgid ""
+" -u FREQ, --max FREQ new maximum CPU frequency the governor may "
+"select\n"
+msgstr " -u FREQ, --max FREQ 调控器可以选择的新的最大 CPU 频率\n"
+
+#: utils/cpufreq-set.c:29
+#, c-format
+msgid " -g GOV, --governor GOV new cpufreq governor\n"
+msgstr " -g GOV, --governor GOV 新的 cpufreq 调节器\n"
+
+#: utils/cpufreq-set.c:30
+#, c-format
+msgid ""
+" -f FREQ, --freq FREQ specific frequency to be set. Requires "
+"userspace\n"
+" governor to be available and loaded\n"
+msgstr ""
+" -f FREQ, --freq FREQ 要设置的特定频率。 需要用户空间\n"
+" 调速器可用并已加载\n"
+
+#: utils/cpufreq-set.c:32
+#, c-format
+msgid " -r, --related Switches all hardware-related CPUs\n"
+msgstr " -r, --related 切换所有与硬件相关的CPU\n"
+
+#: utils/cpufreq-set.c:33 utils/cpupower-set.c:28 utils/cpupower-info.c:27
+#, c-format
+msgid " -h, --help Prints out this screen\n"
+msgstr " -h, --help 打印此屏幕\n"
+
+#: utils/cpufreq-set.c:35
+#, c-format
+msgid ""
+"Notes:\n"
+"1. Omitting the -c or --cpu argument is equivalent to setting it to "
+"\"all\"\n"
+msgstr ""
+"注意:\n"
+"1.省略-c或--cpu参数相当于将其设置为“all”\n"
+
+#: utils/cpufreq-set.c:37
+#, c-format
+msgid ""
+"2. The -f FREQ, --freq FREQ parameter cannot be combined with any other "
+"parameter\n"
+" except the -c CPU, --cpu CPU parameter\n"
+"3. FREQuencies can be passed in Hz, kHz (default), MHz, GHz, or THz\n"
+" by postfixing the value with the wanted unit name, without any space\n"
+" (FREQuency in kHz =^ Hz * 0.001 =^ MHz * 1000 =^ GHz * 1000000).\n"
+msgstr ""
+"2. -f FREQ、--freq FREQ参数不能与任何其他参数组合使用\n"
+" 除了 -c CPU、--cpu CPU 参数\n"
+"3. 频率可以以 Hz、kHz(默认)、MHz、GHz 或 THz 为单位传递\n"
+" 通过在值后面添加所需的单位名称,不带任何空格\n"
+" (以 kHz 为单位的频率 =^ Hz * 0.001 =^ MHz * 1000 =^ GHz * 1000000)。\n"
+
+#: utils/cpufreq-set.c:57
+#, c-format
+msgid ""
+"Error setting new values. Common errors:\n"
+"- Do you have proper administration rights? (super-user?)\n"
+"- Is the governor you requested available and modprobed?\n"
+"- Trying to set an invalid policy?\n"
+"- Trying to set a specific frequency, but userspace governor is not "
+"available,\n"
+" for example because of hardware which cannot be set to a specific "
+"frequency\n"
+" or because the userspace governor isn't loaded?\n"
+msgstr ""
+"设置新值时出错。 常见错误:\n"
+"- 您有适当的管理权吗? (超级用户?)\n"
+"- 您请求的调控器是否可用并已进行 modprobed?\n"
+"- 尝试设置无效的策略?\n"
+"- 尝试设置特定频率,但用户空间调控器不可用,\n"
+" 例如由于硬件无法设置为特定频率\n"
+" 或者因为用户空间调控器未加载?\n"
+
+#: utils/cpufreq-set.c:170
+#, c-format
+msgid "wrong, unknown or unhandled CPU?\n"
+msgstr "错误、未知或未处理的CPU?\n"
+
+#: utils/cpufreq-set.c:302
+#, c-format
+msgid ""
+"the -f/--freq parameter cannot be combined with -d/--min, -u/--max or\n"
+"-g/--governor parameters\n"
+msgstr ""
+"-f/--freq 参数不能与 -d/--min、-u/--max 或\n"
+"-g/--调速器参数\n"
+
+#: utils/cpufreq-set.c:308
+#, c-format
+msgid ""
+"At least one parameter out of -f/--freq, -d/--min, -u/--max, and\n"
+"-g/--governor must be passed\n"
+msgstr ""
+"-f/--freq、-d/--min、-u/--max 和 -f/--freq 中的至少一个参数\n"
+"-g/--governor 必须通过\n"
+
+#: utils/cpufreq-set.c:347
+#, c-format
+msgid "Setting cpu: %d\n"
+msgstr "设置CPU:%d\n"
+
+#: utils/cpupower-set.c:22
+#, c-format
+msgid "Usage: cpupower set [ -b val ] [ -m val ] [ -s val ]\n"
+msgstr "用法: cpupower set [ -b val ] [ -m val ] [ -s val ]\n"
+
+#: utils/cpupower-set.c:24
+#, c-format
+msgid ""
+" -b, --perf-bias [VAL] Sets CPU's power vs performance policy on some\n"
+" Intel models [0-15], see manpage for details\n"
+msgstr ""
+" -b, --perf-bias [VAL] 设置 CPU 的功耗与性能策略\n"
+" Intel 型号 [0-15],请参阅manpage了解详细信息\n"
+
+#: utils/cpupower-set.c:26
+#, c-format
+msgid ""
+" -m, --sched-mc [VAL] Sets the kernel's multi core scheduler policy.\n"
+msgstr " -m, --sched-mc [VAL] 设置内核的多核调度程序策略。\n"
+
+#: utils/cpupower-set.c:27
+#, c-format
+msgid ""
+" -s, --sched-smt [VAL] Sets the kernel's thread sibling scheduler "
+"policy.\n"
+msgstr " -s, --sched-smt [VAL] 设置内核的线程同级调度程序策略。\n"
+
+#: utils/cpupower-set.c:80
+#, c-format
+msgid "--perf-bias param out of range [0-%d]\n"
+msgstr "--perf-bias 参数超出范围 [0-%d]\n"
+
+#: utils/cpupower-set.c:91
+#, c-format
+msgid "--sched-mc param out of range [0-%d]\n"
+msgstr "--sched-mc 参数超出范围 [0-%d]\n"
+
+#: utils/cpupower-set.c:102
+#, c-format
+msgid "--sched-smt param out of range [0-%d]\n"
+msgstr "--sched-smt 参数超出范围 [0-%d]\n"
+
+#: utils/cpupower-set.c:121
+#, c-format
+msgid "Error setting sched-mc %s\n"
+msgstr "设置 sched-mc %s 时出错\n"
+
+#: utils/cpupower-set.c:127
+#, c-format
+msgid "Error setting sched-smt %s\n"
+msgstr "设置 sched-smt %s 时出错\n"
+
+#: utils/cpupower-set.c:146
+#, c-format
+msgid "Error setting perf-bias value on CPU %d\n"
+msgstr "在 CPU %d 上设置性能偏差值时出错\n"
+
+#: utils/cpupower-info.c:21
+#, c-format
+msgid "Usage: cpupower info [ -b ] [ -m ] [ -s ]\n"
+msgstr "用法:cpupower info [-b][-m][-s]\n"
+
+#: utils/cpupower-info.c:23
+#, c-format
+msgid ""
+" -b, --perf-bias Gets CPU's power vs performance policy on some\n"
+" Intel models [0-15], see manpage for details\n"
+msgstr ""
+" -b, --perf-bias 获取 CPU 在某些方面的功耗与性能策略\n"
+" Intel 型号 [0-15],请参阅联机帮助页了解详细信"
+"息\n"
+
+#: utils/cpupower-info.c:25
+#, c-format
+msgid " -m, --sched-mc Gets the kernel's multi core scheduler policy.\n"
+msgstr " -m, --sched-mc 获取内核的多核调度程序策略。\n"
+
+#: utils/cpupower-info.c:26
+#, c-format
+msgid ""
+" -s, --sched-smt Gets the kernel's thread sibling scheduler policy.\n"
+msgstr " -s, --sched-smt 获取内核的线程同级调度程序策略。\n"
+
+#: utils/cpupower-info.c:28
+#, c-format
+msgid ""
+"\n"
+"Passing no option will show all info, by default only on core 0\n"
+msgstr ""
+"\n"
+"不传递任何选项将显示所有信息,默认情况下仅在核心 0 上\n"
+
+#: utils/cpupower-info.c:102
+#, c-format
+msgid "System's multi core scheduler setting: "
+msgstr "系统的多核调度器设置:"
+
+#. if sysfs file is missing it's: errno == ENOENT
+#: utils/cpupower-info.c:105 utils/cpupower-info.c:114
+#, c-format
+msgid "not supported\n"
+msgstr "不支持\n"
+
+#: utils/cpupower-info.c:111
+#, c-format
+msgid "System's thread sibling scheduler setting: "
+msgstr "系统的线程兄调度程序设置:"
+
+#: utils/cpupower-info.c:126
+#, c-format
+msgid "Intel's performance bias setting needs root privileges\n"
+msgstr "Intel的性能偏差设置需要root权限\n"
+
+#: utils/cpupower-info.c:128
+#, c-format
+msgid "System does not support Intel's performance bias setting\n"
+msgstr "系统不支持Intel的性能偏差设置\n"
+
+#: utils/cpupower-info.c:147
+#, c-format
+msgid "Could not read perf-bias value\n"
+msgstr "无法读取性能偏差值\n"
+
+#: utils/cpupower-info.c:150
+#, c-format
+msgid "perf-bias: %d\n"
+msgstr "性能偏差:%d\n"
+
+#: utils/cpuidle-info.c:28
+#, c-format
+msgid "Analyzing CPU %d:\n"
+msgstr "正在分析 CPU %d:\n"
+
+#: utils/cpuidle-info.c:32
+#, c-format
+msgid "CPU %u: No idle states\n"
+msgstr "CPU %u:无空闲状态\n"
+
+#: utils/cpuidle-info.c:36
+#, c-format
+msgid "CPU %u: Can't read idle state info\n"
+msgstr "CPU %u:无法读取空闲状态信息\n"
+
+#: utils/cpuidle-info.c:41
+#, c-format
+msgid "Could not determine max idle state %u\n"
+msgstr "无法确定最大空闲状态 %u\n"
+
+#: utils/cpuidle-info.c:46
+#, c-format
+msgid "Number of idle states: %d\n"
+msgstr "空闲状态数:%d\n"
+
+#: utils/cpuidle-info.c:48
+#, c-format
+msgid "Available idle states:"
+msgstr "可用的空闲状态:"
+
+#: utils/cpuidle-info.c:71
+#, c-format
+msgid "Flags/Description: %s\n"
+msgstr "标志/描述:%s\n"
+
+#: utils/cpuidle-info.c:74
+#, c-format
+msgid "Latency: %lu\n"
+msgstr "延迟:%lu\n"
+
+#: utils/cpuidle-info.c:76
+#, c-format
+msgid "Usage: %lu\n"
+msgstr "用法:%lu\n"
+
+#: utils/cpuidle-info.c:78
+#, c-format
+msgid "Duration: %llu\n"
+msgstr "持续时间:%llu\n"
+
+#: utils/cpuidle-info.c:90
+#, c-format
+msgid "Could not determine cpuidle driver\n"
+msgstr "无法确定 cpuidle 驱动程序\n"
+
+#: utils/cpuidle-info.c:94
+#, c-format
+msgid "CPUidle driver: %s\n"
+msgstr "CPU 空闲驱动程序:%s\n"
+
+#: utils/cpuidle-info.c:99
+#, c-format
+msgid "Could not determine cpuidle governor\n"
+msgstr "无法确定 cpuidle 调控器\n"
+
+#: utils/cpuidle-info.c:103
+#, c-format
+msgid "CPUidle governor: %s\n"
+msgstr "CPU 空闲调节器:%s\n"
+
+#: utils/cpuidle-info.c:122
+#, c-format
+msgid "CPU %u: Can't read C-state info\n"
+msgstr "CPU %u:无法读取 C 状态信息\n"
+
+#. printf("Cstates: %d\n", cstates);
+#: utils/cpuidle-info.c:127
+#, c-format
+msgid "active state: C0\n"
+msgstr "活动状态: C0\n"
+
+#: utils/cpuidle-info.c:128
+#, c-format
+msgid "max_cstate: C%u\n"
+msgstr "最大c状态: C%u\n"
+
+#: utils/cpuidle-info.c:129
+#, c-format
+msgid "maximum allowed latency: %lu usec\n"
+msgstr "允许的最大延迟:%lu usec\n"
+
+#: utils/cpuidle-info.c:130
+#, c-format
+msgid "states:\t\n"
+msgstr "状态:\t\n"
+
+#: utils/cpuidle-info.c:132
+#, c-format
+msgid " C%d: type[C%d] "
+msgstr " C%d: 类型[C%d]"
+
+#: utils/cpuidle-info.c:134
+#, c-format
+msgid "promotion[--] demotion[--] "
+msgstr "晋升[--] 降级[--]"
+
+#: utils/cpuidle-info.c:135
+#, c-format
+msgid "latency[%03lu] "
+msgstr "延迟[%03lu]"
+
+#: utils/cpuidle-info.c:137
+#, c-format
+msgid "usage[%08lu] "
+msgstr "使用情况[%08lu]"
+
+#: utils/cpuidle-info.c:139
+#, c-format
+msgid "duration[%020Lu] \n"
+msgstr "持续时间[%020Lu]\n"
+
+#: utils/cpuidle-info.c:147
+#, c-format
+msgid "Usage: cpupower idleinfo [options]\n"
+msgstr "用法:cpupower idleinfo [选项]\n"
+
+#: utils/cpuidle-info.c:149
+#, c-format
+msgid " -s, --silent Only show general C-state information\n"
+msgstr " -s, --silent 只显示一般C状态信息\n"
+
+#: utils/cpuidle-info.c:150
+#, c-format
+msgid ""
+" -o, --proc Prints out information like provided by the /proc/"
+"acpi/processor/*/power\n"
+" interface in older kernels\n"
+msgstr ""
+" -o, --proc 打印 /proc/acpi/processor/*/power 提供的信息\n"
+" 旧内核中的接口\n"
+
+#: utils/cpuidle-info.c:209
+#, c-format
+msgid "You can't specify more than one output-specific argument\n"
+msgstr "您不能指定多个特定于输出的参数\n"
+
+#~ msgid ""
+#~ " -c CPU, --cpu CPU CPU number which information shall be determined "
+#~ "about\n"
+#~ msgstr ""
+#~ " -c CPU, --cpu CPU Numéro du CPU pour lequel l'information sera "
+#~ "affichée\n"
+
+#~ msgid ""
+#~ " -c CPU, --cpu CPU number of CPU where cpufreq settings shall be "
+#~ "modified\n"
+#~ msgstr ""
+#~ " -c CPU, --cpu CPU numéro du CPU à prendre en compte pour les\n"
+#~ " changements\n"
diff --git a/tools/power/pm-graph/sleepgraph.8 b/tools/power/pm-graph/sleepgraph.8
index 643271b..491ca21 100644
--- a/tools/power/pm-graph/sleepgraph.8
+++ b/tools/power/pm-graph/sleepgraph.8
@@ -81,6 +81,9 @@
.TP
\fB-wifitrace\fR
Trace through the wifi reconnect time and include it in the timeline.
+.TP
+\fB-debugtiming\fR
+Add timestamp to each printed output line, accurate to the millisecond.
.SS "advanced"
.TP
diff --git a/tools/power/pm-graph/sleepgraph.py b/tools/power/pm-graph/sleepgraph.py
index ef87e63..918eae5 100755
--- a/tools/power/pm-graph/sleepgraph.py
+++ b/tools/power/pm-graph/sleepgraph.py
@@ -18,7 +18,7 @@
#
# Links:
# Home Page
-# https://01.org/pm-graph
+# https://www.intel.com/content/www/us/en/developer/topic-technology/open/pm-graph/overview.html
# Source repo
# git@github.com:intel/pm-graph
#
@@ -65,6 +65,7 @@
from threading import Thread
from subprocess import call, Popen, PIPE
import base64
+import traceback
debugtiming = False
mystarttime = time.time()
@@ -86,7 +87,7 @@
# store system values and test parameters
class SystemValues:
title = 'SleepGraph'
- version = '5.12'
+ version = '5.13'
ansi = False
rs = 0
display = ''
@@ -236,7 +237,11 @@
'msleep': { 'args_x86_64': {'time':'%di:s32'}, 'ub': 1 },
'schedule_timeout': { 'args_x86_64': {'timeout':'%di:s32'}, 'ub': 1 },
'udelay': { 'func':'__const_udelay', 'args_x86_64': {'loops':'%di:s32'}, 'ub': 1 },
- 'usleep_range': { 'args_x86_64': {'min':'%di:s32', 'max':'%si:s32'}, 'ub': 1 },
+ 'usleep_range': {
+ 'func':'usleep_range_state',
+ 'args_x86_64': {'min':'%di:s32', 'max':'%si:s32'},
+ 'ub': 1
+ },
'mutex_lock_slowpath': { 'func':'__mutex_lock_slowpath', 'ub': 1 },
'acpi_os_stall': {'ub': 1},
'rt_mutex_slowlock': {'ub': 1},
@@ -342,15 +347,21 @@
if self.verbose or msg.startswith('WARNING:'):
pprint(msg)
def signalHandler(self, signum, frame):
- if not self.result:
- return
signame = self.signames[signum] if signum in self.signames else 'UNKNOWN'
- msg = 'Signal %s caused a tool exit, line %d' % (signame, frame.f_lineno)
+ if signame in ['SIGUSR1', 'SIGUSR2', 'SIGSEGV']:
+ traceback.print_stack()
+ stack = traceback.format_list(traceback.extract_stack())
+ self.outputResult({'stack':stack})
+ if signame == 'SIGUSR1':
+ return
+ msg = '%s caused a tool exit, line %d' % (signame, frame.f_lineno)
+ pprint(msg)
self.outputResult({'error':msg})
+ os.kill(os.getpid(), signal.SIGKILL)
sys.exit(3)
def signalHandlerInit(self):
capture = ['BUS', 'SYS', 'XCPU', 'XFSZ', 'PWR', 'HUP', 'INT', 'QUIT',
- 'ILL', 'ABRT', 'FPE', 'SEGV', 'TERM']
+ 'ILL', 'ABRT', 'FPE', 'SEGV', 'TERM', 'USR1', 'USR2']
self.signames = dict()
for i in capture:
s = 'SIG'+i
@@ -859,6 +870,11 @@
# files needed for any trace data
files = ['buffer_size_kb', 'current_tracer', 'trace', 'trace_clock',
'trace_marker', 'trace_options', 'tracing_on']
+ # legacy check for old systems
+ if not os.path.exists(self.tpath+'trace'):
+ self.tpath = '/sys/kernel/debug/tracing/'
+ if not os.path.exists(self.epath):
+ self.epath = '/sys/kernel/debug/tracing/events/power/'
# files needed for callgraph trace data
tp = self.tpath
if(self.usecallgraph):
@@ -911,6 +927,13 @@
if num > 0:
n = '%d' % num
fp = open(self.result, 'a')
+ if 'stack' in testdata:
+ fp.write('Printing stack trace:\n')
+ for line in testdata['stack']:
+ fp.write(line)
+ fp.close()
+ self.sudoUserchown(self.result)
+ return
if 'error' in testdata:
fp.write('result%s: fail\n' % n)
fp.write('error%s: %s\n' % (n, testdata['error']))
@@ -1980,7 +2003,7 @@
length = -1.0
if(start >= 0 and end >= 0):
length = end - start
- if pid == -2 or name not in sysvals.tracefuncs.keys():
+ if pid >= -2:
i = 2
origname = name
while(name in list):
@@ -2753,7 +2776,8 @@
def createHeader(self, sv, stamp):
if(not stamp['time']):
return
- self.html += '<div class="version"><a href="https://01.org/pm-graph">%s v%s</a></div>' \
+ self.html += '<div class="version"><a href="https://www.intel.com/content/www/'+\
+ 'us/en/developer/topic-technology/open/pm-graph/overview.html">%s v%s</a></div>' \
% (sv.title, sv.version)
if sv.logmsg and sv.testlog:
self.html += '<button id="showtest" class="logbtn btnfmt">log</button>'
@@ -5238,12 +5262,16 @@
}
var info = dev[i].title.split(" ");
var pname = info[info.length-1];
- pd[pname] = parseFloat(info[info.length-3].slice(1));
- total[0] += pd[pname];
- if(pname.indexOf("suspend") >= 0)
- total[tidx] += pd[pname];
+ var length = parseFloat(info[info.length-3].slice(1));
+ if (pname in pd)
+ pd[pname] += length;
else
- total[tidx+1] += pd[pname];
+ pd[pname] = length;
+ total[0] += length;
+ if(pname.indexOf("suspend") >= 0)
+ total[tidx] += length;
+ else
+ total[tidx+1] += length;
}
}
var devname = deviceTitle(this.title, total, cpu);
@@ -5262,7 +5290,7 @@
phases[i].style.left = left+"%";
phases[i].title = phases[i].id+" "+pd[phases[i].id]+" ms";
left += w;
- var time = "<t4 style=\"font-size:"+fs+"px\">"+pd[phases[i].id]+" ms<br></t4>";
+ var time = "<t4 style=\"font-size:"+fs+"px\">"+pd[phases[i].id].toFixed(3)+" ms<br></t4>";
var pname = "<t3 style=\"font-size:"+fs2+"px\">"+phases[i].id.replace(new RegExp("_", "g"), " ")+"</t3>";
phases[i].innerHTML = time+pname;
} else {
@@ -6742,6 +6770,7 @@
' -wifi If a wifi connection is available, check that it reconnects after resume.\n'\
' -wifitrace Trace kernel execution through wifi reconnect.\n'\
' -netfix Use netfix to reset the network in the event it fails to resume.\n'\
+ ' -debugtiming Add timestamp to each printed line\n'\
' [testprep]\n'\
' -sync Sync the filesystems before starting the test\n'\
' -rs on/off Enable/disable runtime suspend for all devices, restore all after test\n'\
@@ -7047,7 +7076,6 @@
except:
doError('No result file supplied', True)
sysvals.result = val
- sysvals.signalHandlerInit()
else:
doError('Invalid argument: '+arg, True)
@@ -7057,6 +7085,7 @@
if(sysvals.usecallgraph and sysvals.useprocmon):
doError('-proc is not compatible with -f')
+ sysvals.signalHandlerInit()
if sysvals.usecallgraph and sysvals.cgskip:
sysvals.vprint('Using cgskip file: %s' % sysvals.cgskip)
sysvals.setCallgraphBlacklist(sysvals.cgskip)
diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
index 8bc626ed..c4b3fdd 100644
--- a/tools/sched_ext/scx_show_state.py
+++ b/tools/sched_ext/scx_show_state.py
@@ -35,6 +35,6 @@
print(f'switching_all : {read_int("scx_switching_all")}')
print(f'switched_all : {read_static_key("__scx_switched_all")}')
print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
-print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
+print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}')
print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
print(f'enable_seq : {read_atomic("scx_enable_seq")}')
diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
index 28b93ca..22029e6 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -4,7 +4,7 @@
ARCH ?= $(shell uname -m 2>/dev/null || echo not)
ifneq (,$(filter $(ARCH),aarch64 arm64))
-ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi
+ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi gcs
else
ARM64_SUBTARGETS :=
endif
diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index f2d6007..0029ed9 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -98,6 +98,17 @@ static void fpmr_sigill(void)
asm volatile("mrs x0, S3_3_C4_C4_2" : : : "x0");
}
+static void gcs_sigill(void)
+{
+ unsigned long *gcspr;
+
+ asm volatile(
+ "mrs %0, S3_3_C2_C5_1"
+ : "=r" (gcspr)
+ :
+ : "cc");
+}
+
static void ilrcpc_sigill(void)
{
/* LDAPUR W0, [SP, #8] */
@@ -361,8 +372,8 @@ static void sveaes_sigill(void)
static void sveb16b16_sigill(void)
{
- /* BFADD ZA.H[W0, 0], {Z0.H-Z1.H} */
- asm volatile(".inst 0xC1E41C00" : : : );
+ /* BFADD Z0.H, Z0.H, Z0.H */
+ asm volatile(".inst 0x65000000" : : : );
}
static void svepmull_sigill(void)
@@ -490,7 +501,7 @@ static const struct hwcap_data {
.name = "F8DP2",
.at_hwcap = AT_HWCAP2,
.hwcap_bit = HWCAP2_F8DP2,
- .cpuinfo = "f8dp4",
+ .cpuinfo = "f8dp2",
.sigill_fn = f8dp2_sigill,
},
{
@@ -535,6 +546,14 @@ static const struct hwcap_data {
.sigill_reliable = true,
},
{
+ .name = "GCS",
+ .at_hwcap = AT_HWCAP,
+ .hwcap_bit = HWCAP_GCS,
+ .cpuinfo = "gcs",
+ .sigill_fn = gcs_sigill,
+ .sigill_reliable = true,
+ },
+ {
.name = "JSCVT",
.at_hwcap = AT_HWCAP,
.hwcap_bit = HWCAP_JSCVT,
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
index d704511..5ec9a18 100644
--- a/tools/testing/selftests/arm64/abi/syscall-abi.c
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -81,7 +81,7 @@ static int check_gpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl, uint64_t s
*/
for (i = 9; i < ARRAY_SIZE(gpr_in); i++) {
if (gpr_in[i] != gpr_out[i]) {
- ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %llx != %llx\n",
+ ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %lx != %lx\n",
cfg->name, sve_vl, i,
gpr_in[i], gpr_out[i]);
errors++;
@@ -112,7 +112,7 @@ static int check_fpr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
if (!sve_vl && !(svcr & SVCR_SM_MASK)) {
for (i = 0; i < ARRAY_SIZE(fpr_in); i++) {
if (fpr_in[i] != fpr_out[i]) {
- ksft_print_msg("%s Q%d/%d mismatch %llx != %llx\n",
+ ksft_print_msg("%s Q%d/%d mismatch %lx != %lx\n",
cfg->name,
i / 2, i % 2,
fpr_in[i], fpr_out[i]);
@@ -294,13 +294,13 @@ static int check_svcr(struct syscall_cfg *cfg, int sve_vl, int sme_vl,
int errors = 0;
if (svcr_out & SVCR_SM_MASK) {
- ksft_print_msg("%s Still in SM, SVCR %llx\n",
+ ksft_print_msg("%s Still in SM, SVCR %lx\n",
cfg->name, svcr_out);
errors++;
}
if ((svcr_in & SVCR_ZA_MASK) != (svcr_out & SVCR_ZA_MASK)) {
- ksft_print_msg("%s PSTATE.ZA changed, SVCR %llx != %llx\n",
+ ksft_print_msg("%s PSTATE.ZA changed, SVCR %lx != %lx\n",
cfg->name, svcr_in, svcr_out);
errors++;
}
diff --git a/tools/testing/selftests/arm64/fp/assembler.h b/tools/testing/selftests/arm64/fp/assembler.h
index 9b38a0d..1fc46a5 100644
--- a/tools/testing/selftests/arm64/fp/assembler.h
+++ b/tools/testing/selftests/arm64/fp/assembler.h
@@ -65,4 +65,19 @@ endfunction
bl puts
.endm
+#define PR_SET_SHADOW_STACK_STATUS 75
+# define PR_SHADOW_STACK_ENABLE (1UL << 0)
+
+.macro enable_gcs
+ // Run with GCS
+ mov x0, PR_SET_SHADOW_STACK_STATUS
+ mov x1, PR_SHADOW_STACK_ENABLE
+ mov x2, xzr
+ mov x3, xzr
+ mov x4, xzr
+ mov x5, xzr
+ mov x8, #__NR_prctl
+ svc #0
+.endm
+
#endif /* ! ASSEMBLER_H */
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
index 7ad59d9..82c3ab70 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace-asm.S
@@ -15,10 +15,7 @@
// Load and save register values with pauses for ptrace
//
-// x0 - SVE in use
-// x1 - SME in use
-// x2 - SME2 in use
-// x3 - FA64 supported
+// x0 - HAVE_ flags indicating which features are in use
.globl load_and_save
load_and_save:
@@ -44,7 +41,7 @@
ldp q30, q31, [x7, #16 * 30]
// SME?
- cbz x1, check_sve_in
+ tbz x0, #HAVE_SME_SHIFT, check_sve_in
adrp x7, svcr_in
ldr x7, [x7, :lo12:svcr_in]
@@ -64,7 +61,7 @@
bne 1b
// ZT?
- cbz x2, check_sm_in
+ tbz x0, #HAVE_SME2_SHIFT, check_sm_in
adrp x6, zt_in
add x6, x6, :lo12:zt_in
_ldr_zt 6
@@ -72,12 +69,14 @@
// In streaming mode?
check_sm_in:
tbz x7, #SVCR_SM_SHIFT, check_sve_in
- mov x4, x3 // Load FFR if we have FA64
+
+ // Load FFR if we have FA64
+ ubfx x4, x0, #HAVE_FA64_SHIFT, #1
b load_sve
// SVE?
check_sve_in:
- cbz x0, wait_for_writes
+ tbz x0, #HAVE_SVE_SHIFT, check_fpmr_in
mov x4, #1
load_sve:
@@ -142,6 +141,13 @@
ldr p14, [x7, #14, MUL VL]
ldr p15, [x7, #15, MUL VL]
+ // This has to come after we set PSTATE.SM
+check_fpmr_in:
+ tbz x0, #HAVE_FPMR_SHIFT, wait_for_writes
+ adrp x7, fpmr_in
+ ldr x7, [x7, :lo12:fpmr_in]
+ msr REG_FPMR, x7
+
wait_for_writes:
// Wait for the parent
brk #0
@@ -165,8 +171,13 @@
stp q28, q29, [x7, #16 * 28]
stp q30, q31, [x7, #16 * 30]
- // SME?
- cbz x1, check_sve_out
+ tbz x0, #HAVE_FPMR_SHIFT, check_sme_out
+ mrs x7, REG_FPMR
+ adrp x6, fpmr_out
+ str x7, [x6, :lo12:fpmr_out]
+
+check_sme_out:
+ tbz x0, #HAVE_SME_SHIFT, check_sve_out
rdsvl 11, 1
adrp x6, sme_vl_out
@@ -187,7 +198,7 @@
bne 1b
// ZT?
- cbz x2, check_sm_out
+ tbz x0, #HAVE_SME2_SHIFT, check_sm_out
adrp x6, zt_out
add x6, x6, :lo12:zt_out
_str_zt 6
@@ -195,12 +206,14 @@
// In streaming mode?
check_sm_out:
tbz x7, #SVCR_SM_SHIFT, check_sve_out
- mov x4, x3 // FFR?
+
+ // Do we have FA64 and FFR?
+ ubfx x4, x0, #HAVE_FA64_SHIFT, #1
b read_sve
// SVE?
check_sve_out:
- cbz x0, wait_for_reads
+ tbz x0, #HAVE_SVE_SHIFT, wait_for_reads
mov x4, #1
rdvl x7, #1
@@ -271,7 +284,7 @@
brk #0
// Ensure we don't leave ourselves in streaming mode
- cbz x1, out
+ tbz x0, #HAVE_SME_SHIFT, out
msr S3_3_C4_C2_2, xzr
out:
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.c b/tools/testing/selftests/arm64/fp/fp-ptrace.c
index c7ceafe..4930e03 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.c
@@ -31,6 +31,14 @@
#include "fp-ptrace.h"
+#include <linux/bits.h>
+
+#define FPMR_LSCALE2_MASK GENMASK(37, 32)
+#define FPMR_NSCALE_MASK GENMASK(31, 24)
+#define FPMR_LSCALE_MASK GENMASK(22, 16)
+#define FPMR_OSC_MASK GENMASK(15, 15)
+#define FPMR_OSM_MASK GENMASK(14, 14)
+
/* <linux/elf.h> and <sys/auxv.h> don't like each other, so: */
#ifndef NT_ARM_SVE
#define NT_ARM_SVE 0x405
@@ -48,11 +56,22 @@
#define NT_ARM_ZT 0x40d
#endif
+#ifndef NT_ARM_FPMR
+#define NT_ARM_FPMR 0x40e
+#endif
+
#define ARCH_VQ_MAX 256
/* VL 128..2048 in powers of 2 */
#define MAX_NUM_VLS 5
+/*
+ * FPMR bits we can set without doing feature checks to see if values
+ * are valid.
+ */
+#define FPMR_SAFE_BITS (FPMR_LSCALE2_MASK | FPMR_NSCALE_MASK | \
+ FPMR_LSCALE_MASK | FPMR_OSC_MASK | FPMR_OSM_MASK)
+
#define NUM_FPR 32
__uint128_t v_in[NUM_FPR];
__uint128_t v_expected[NUM_FPR];
@@ -78,11 +97,13 @@ char zt_in[ZT_SIG_REG_BYTES];
char zt_expected[ZT_SIG_REG_BYTES];
char zt_out[ZT_SIG_REG_BYTES];
+uint64_t fpmr_in, fpmr_expected, fpmr_out;
+
uint64_t sve_vl_out;
uint64_t sme_vl_out;
uint64_t svcr_in, svcr_expected, svcr_out;
-void load_and_save(int sve, int sme, int sme2, int fa64);
+void load_and_save(int flags);
static bool got_alarm;
@@ -128,6 +149,11 @@ static bool fa64_supported(void)
return getauxval(AT_HWCAP2) & HWCAP2_SME_FA64;
}
+static bool fpmr_supported(void)
+{
+ return getauxval(AT_HWCAP2) & HWCAP2_FPMR;
+}
+
static bool compare_buffer(const char *name, void *out,
void *expected, size_t size)
{
@@ -198,7 +224,7 @@ static int vl_expected(struct test_config *config)
static void run_child(struct test_config *config)
{
- int ret;
+ int ret, flags;
/* Let the parent attach to us */
ret = ptrace(PTRACE_TRACEME, 0, 0, 0);
@@ -224,8 +250,19 @@ static void run_child(struct test_config *config)
}
/* Load values and wait for the parent */
- load_and_save(sve_supported(), sme_supported(),
- sme2_supported(), fa64_supported());
+ flags = 0;
+ if (sve_supported())
+ flags |= HAVE_SVE;
+ if (sme_supported())
+ flags |= HAVE_SME;
+ if (sme2_supported())
+ flags |= HAVE_SME2;
+ if (fa64_supported())
+ flags |= HAVE_FA64;
+ if (fpmr_supported())
+ flags |= HAVE_FPMR;
+
+ load_and_save(flags);
exit(0);
}
@@ -312,6 +349,14 @@ static void read_child_regs(pid_t child)
iov_child.iov_len = sizeof(zt_out);
read_one_child_regs(child, "ZT", &iov_parent, &iov_child);
}
+
+ if (fpmr_supported()) {
+ iov_parent.iov_base = &fpmr_out;
+ iov_parent.iov_len = sizeof(fpmr_out);
+ iov_child.iov_base = &fpmr_out;
+ iov_child.iov_len = sizeof(fpmr_out);
+ read_one_child_regs(child, "FPMR", &iov_parent, &iov_child);
+ }
}
static bool continue_breakpoint(pid_t child,
@@ -586,6 +631,26 @@ static bool check_ptrace_values_zt(pid_t child, struct test_config *config)
return compare_buffer("initial ZT", buf, zt_in, ZT_SIG_REG_BYTES);
}
+static bool check_ptrace_values_fpmr(pid_t child, struct test_config *config)
+{
+ uint64_t val;
+ struct iovec iov;
+ int ret;
+
+ if (!fpmr_supported())
+ return true;
+
+ iov.iov_base = &val;
+ iov.iov_len = sizeof(val);
+ ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_FPMR, &iov);
+ if (ret != 0) {
+ ksft_print_msg("Failed to read initial FPMR: %s (%d)\n",
+ strerror(errno), errno);
+ return false;
+ }
+
+ return compare_buffer("initial FPMR", &val, &fpmr_in, sizeof(val));
+}
static bool check_ptrace_values(pid_t child, struct test_config *config)
{
@@ -620,6 +685,9 @@ static bool check_ptrace_values(pid_t child, struct test_config *config)
if (!check_ptrace_values_zt(child, config))
pass = false;
+ if (!check_ptrace_values_fpmr(child, config))
+ pass = false;
+
return pass;
}
@@ -823,11 +891,18 @@ static void set_initial_values(struct test_config *config)
{
int vq = __sve_vq_from_vl(vl_in(config));
int sme_vq = __sve_vq_from_vl(config->sme_vl_in);
+ bool sm_change;
svcr_in = config->svcr_in;
svcr_expected = config->svcr_expected;
svcr_out = 0;
+ if (sme_supported() &&
+ (svcr_in & SVCR_SM) != (svcr_expected & SVCR_SM))
+ sm_change = true;
+ else
+ sm_change = false;
+
fill_random(&v_in, sizeof(v_in));
memcpy(v_expected, v_in, sizeof(v_in));
memset(v_out, 0, sizeof(v_out));
@@ -874,6 +949,21 @@ static void set_initial_values(struct test_config *config)
memset(zt_expected, 0, ZT_SIG_REG_BYTES);
memset(zt_out, 0, sizeof(zt_out));
}
+
+ if (fpmr_supported()) {
+ fill_random(&fpmr_in, sizeof(fpmr_in));
+ fpmr_in &= FPMR_SAFE_BITS;
+
+ /* Entering or exiting streaming mode clears FPMR */
+ if (sm_change)
+ fpmr_expected = 0;
+ else
+ fpmr_expected = fpmr_in;
+ } else {
+ fpmr_in = 0;
+ fpmr_expected = 0;
+ fpmr_out = 0;
+ }
}
static bool check_memory_values(struct test_config *config)
@@ -924,6 +1014,12 @@ static bool check_memory_values(struct test_config *config)
if (!compare_buffer("saved ZT", zt_out, zt_expected, ZT_SIG_REG_BYTES))
pass = false;
+ if (fpmr_out != fpmr_expected) {
+ ksft_print_msg("Mismatch in saved FPMR: %lx != %lx\n",
+ fpmr_out, fpmr_expected);
+ pass = false;
+ }
+
return pass;
}
@@ -1001,6 +1097,36 @@ static void fpsimd_write(pid_t child, struct test_config *test_config)
strerror(errno), errno);
}
+static bool fpmr_write_supported(struct test_config *config)
+{
+ if (!fpmr_supported())
+ return false;
+
+ if (!sve_sme_same(config))
+ return false;
+
+ return true;
+}
+
+static void fpmr_write_expected(struct test_config *config)
+{
+ fill_random(&fpmr_expected, sizeof(fpmr_expected));
+ fpmr_expected &= FPMR_SAFE_BITS;
+}
+
+static void fpmr_write(pid_t child, struct test_config *config)
+{
+ struct iovec iov;
+ int ret;
+
+ iov.iov_len = sizeof(fpmr_expected);
+ iov.iov_base = &fpmr_expected;
+ ret = ptrace(PTRACE_SETREGSET, child, NT_ARM_FPMR, &iov);
+ if (ret != 0)
+ ksft_print_msg("Failed to write FPMR: %s (%d)\n",
+ strerror(errno), errno);
+}
+
static void sve_write_expected(struct test_config *config)
{
int vl = vl_expected(config);
@@ -1069,21 +1195,19 @@ static void sve_write(pid_t child, struct test_config *config)
static bool za_write_supported(struct test_config *config)
{
- if (config->svcr_expected & SVCR_SM) {
- if (!(config->svcr_in & SVCR_SM))
+ if (config->sme_vl_in != config->sme_vl_expected) {
+ /* Changing the SME VL exits streaming mode. */
+ if (config->svcr_expected & SVCR_SM) {
return false;
-
- /* Changing the SME VL exits streaming mode */
- if (config->sme_vl_in != config->sme_vl_expected) {
+ }
+ } else {
+ /* Otherwise we can't change streaming mode */
+ if ((config->svcr_in & SVCR_SM) !=
+ (config->svcr_expected & SVCR_SM)) {
return false;
}
}
- /* Can't disable SM outside a VL change */
- if ((config->svcr_in & SVCR_SM) &&
- !(config->svcr_expected & SVCR_SM))
- return false;
-
return true;
}
@@ -1259,6 +1383,12 @@ static struct test_definition base_test_defs[] = {
.set_expected_values = fpsimd_write_expected,
.modify_values = fpsimd_write,
},
+ {
+ .name = "FPMR write",
+ .supported = fpmr_write_supported,
+ .set_expected_values = fpmr_write_expected,
+ .modify_values = fpmr_write,
+ },
};
static struct test_definition sve_test_defs[] = {
@@ -1468,6 +1598,9 @@ int main(void)
if (fa64_supported())
ksft_print_msg("FA64 supported\n");
+ if (fpmr_supported())
+ ksft_print_msg("FPMR supported\n");
+
ksft_set_plan(tests);
/* Get signal handers ready before we start any children */
diff --git a/tools/testing/selftests/arm64/fp/fp-ptrace.h b/tools/testing/selftests/arm64/fp/fp-ptrace.h
index db4f2c4..c06919a 100644
--- a/tools/testing/selftests/arm64/fp/fp-ptrace.h
+++ b/tools/testing/selftests/arm64/fp/fp-ptrace.h
@@ -10,4 +10,16 @@
#define SVCR_SM (1 << SVCR_SM_SHIFT)
#define SVCR_ZA (1 << SVCR_ZA_SHIFT)
+#define HAVE_SVE_SHIFT 0
+#define HAVE_SME_SHIFT 1
+#define HAVE_SME2_SHIFT 2
+#define HAVE_FA64_SHIFT 3
+#define HAVE_FPMR_SHIFT 4
+
+#define HAVE_SVE (1 << HAVE_SVE_SHIFT)
+#define HAVE_SME (1 << HAVE_SME_SHIFT)
+#define HAVE_SME2 (1 << HAVE_SME2_SHIFT)
+#define HAVE_FA64 (1 << HAVE_FA64_SHIFT)
+#define HAVE_FPMR (1 << HAVE_FPMR_SHIFT)
+
#endif
diff --git a/tools/testing/selftests/arm64/fp/fp-stress.c b/tools/testing/selftests/arm64/fp/fp-stress.c
index faac24b..74e2320 100644
--- a/tools/testing/selftests/arm64/fp/fp-stress.c
+++ b/tools/testing/selftests/arm64/fp/fp-stress.c
@@ -28,6 +28,9 @@
#define MAX_VLS 16
+#define SIGNAL_INTERVAL_MS 25
+#define LOG_INTERVALS (1000 / SIGNAL_INTERVAL_MS)
+
struct child_data {
char *name, *output;
pid_t pid;
@@ -79,7 +82,7 @@ static void child_start(struct child_data *child, const char *program)
*/
ret = dup2(pipefd[1], 1);
if (ret == -1) {
- fprintf(stderr, "dup2() %d\n", errno);
+ printf("dup2() %d\n", errno);
exit(EXIT_FAILURE);
}
@@ -89,7 +92,7 @@ static void child_start(struct child_data *child, const char *program)
*/
ret = dup2(startup_pipe[0], 3);
if (ret == -1) {
- fprintf(stderr, "dup2() %d\n", errno);
+ printf("dup2() %d\n", errno);
exit(EXIT_FAILURE);
}
@@ -107,16 +110,15 @@ static void child_start(struct child_data *child, const char *program)
*/
ret = read(3, &i, sizeof(i));
if (ret < 0)
- fprintf(stderr, "read(startp pipe) failed: %s (%d)\n",
- strerror(errno), errno);
+ printf("read(startp pipe) failed: %s (%d)\n",
+ strerror(errno), errno);
if (ret > 0)
- fprintf(stderr, "%d bytes of data on startup pipe\n",
- ret);
+ printf("%d bytes of data on startup pipe\n", ret);
close(3);
ret = execl(program, program, NULL);
- fprintf(stderr, "execl(%s) failed: %d (%s)\n",
- program, errno, strerror(errno));
+ printf("execl(%s) failed: %d (%s)\n",
+ program, errno, strerror(errno));
exit(EXIT_FAILURE);
} else {
@@ -221,7 +223,7 @@ static void child_output(struct child_data *child, uint32_t events,
static void child_tickle(struct child_data *child)
{
if (child->output_seen && !child->exited)
- kill(child->pid, SIGUSR2);
+ kill(child->pid, SIGUSR1);
}
static void child_stop(struct child_data *child)
@@ -449,7 +451,8 @@ static const struct option options[] = {
int main(int argc, char **argv)
{
int ret;
- int timeout = 10;
+ int timeout = 10 * (1000 / SIGNAL_INTERVAL_MS);
+ int poll_interval = 5000;
int cpus, i, j, c;
int sve_vl_count, sme_vl_count;
bool all_children_started = false;
@@ -505,7 +508,7 @@ int main(int argc, char **argv)
have_sme2 ? "present" : "absent");
if (timeout > 0)
- ksft_print_msg("Will run for %ds\n", timeout);
+ ksft_print_msg("Will run for %d\n", timeout);
else
ksft_print_msg("Will run until terminated\n");
@@ -578,14 +581,14 @@ int main(int argc, char **argv)
break;
/*
- * Timeout is counted in seconds with no output, the
- * tests print during startup then are silent when
- * running so this should ensure they all ran enough
- * to install the signal handler, this is especially
- * useful in emulation where we will both be slow and
- * likely to have a large set of VLs.
+ * Timeout is counted in poll intervals with no
+ * output, the tests print during startup then are
+ * silent when running so this should ensure they all
+ * ran enough to install the signal handler, this is
+ * especially useful in emulation where we will both
+ * be slow and likely to have a large set of VLs.
*/
- ret = epoll_wait(epoll_fd, evs, tests, 1000);
+ ret = epoll_wait(epoll_fd, evs, tests, poll_interval);
if (ret < 0) {
if (errno == EINTR)
continue;
@@ -623,10 +626,12 @@ int main(int argc, char **argv)
}
all_children_started = true;
+ poll_interval = SIGNAL_INTERVAL_MS;
}
- ksft_print_msg("Sending signals, timeout remaining: %d\n",
- timeout);
+ if ((timeout % LOG_INTERVALS) == 0)
+ ksft_print_msg("Sending signals, timeout remaining: %d\n",
+ timeout);
for (i = 0; i < num_children; i++)
child_tickle(&children[i]);
@@ -651,7 +656,5 @@ int main(int argc, char **argv)
drain_output(true);
- ksft_print_cnts();
-
- return 0;
+ ksft_finished();
}
diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S
index 8b960d0..f89d678 100644
--- a/tools/testing/selftests/arm64/fp/fpsimd-test.S
+++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S
@@ -134,8 +134,7 @@
b memcmp
endfunction
-// Any SVE register modified here can cause corruption in the main
-// thread -- but *only* the registers modified here.
+// Modify live register state, the signal return will undo our changes
function irritator_handler
// Increment the irritation signal count (x23):
ldr x0, [x2, #ucontext_regs + 8 * 23]
@@ -143,7 +142,6 @@
str x0, [x2, #ucontext_regs + 8 * 23]
// Corrupt some random V-regs
- adr x0, .text + (irritator_handler - .text) / 16 * 16
movi v0.8b, #7
movi v9.16b, #9
movi v31.8b, #31
@@ -215,6 +213,8 @@
// Main program entry point
.globl _start
function _start
+ enable_gcs
+
mov x23, #0 // signal count
mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c
index e8da3b4..8593453 100644
--- a/tools/testing/selftests/arm64/fp/kernel-test.c
+++ b/tools/testing/selftests/arm64/fp/kernel-test.c
@@ -267,6 +267,10 @@ int main(void)
strerror(errno), errno);
sa.sa_sigaction = handle_kick_signal;
+ ret = sigaction(SIGUSR1, &sa, NULL);
+ if (ret < 0)
+ printf("Failed to install SIGUSR1 handler: %s (%d)\n",
+ strerror(errno), errno);
ret = sigaction(SIGUSR2, &sa, NULL);
if (ret < 0)
printf("Failed to install SIGUSR2 handler: %s (%d)\n",
diff --git a/tools/testing/selftests/arm64/fp/sme-inst.h b/tools/testing/selftests/arm64/fp/sme-inst.h
index 9292bba..85b9184 100644
--- a/tools/testing/selftests/arm64/fp/sme-inst.h
+++ b/tools/testing/selftests/arm64/fp/sme-inst.h
@@ -5,6 +5,8 @@
#ifndef SME_INST_H
#define SME_INST_H
+#define REG_FPMR S3_3_C4_C4_2
+
/*
* RDSVL X\nx, #\imm
*/
diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index 6d61992..577b6e0 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -82,10 +82,12 @@ static void fill_buf(char *buf, size_t size)
static int do_child(void)
{
if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
- ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno));
+ ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)\n",
+ strerror(errno), errno);
if (raise(SIGSTOP))
- ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno));
+ ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n",
+ strerror(errno), errno);
return EXIT_SUCCESS;
}
@@ -340,7 +342,7 @@ static void ptrace_set_sve_get_sve_data(pid_t child,
data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
write_buf = malloc(data_size);
if (!write_buf) {
- ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n",
+ ksft_test_result_fail("Error allocating %ld byte buffer for %s VL %u\n",
data_size, type->name, vl);
return;
}
@@ -441,7 +443,7 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child,
data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
write_buf = malloc(data_size);
if (!write_buf) {
- ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n",
+ ksft_test_result_fail("Error allocating %ld byte buffer for %s VL %u\n",
data_size, type->name, vl);
return;
}
@@ -545,7 +547,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child,
read_sve = read_buf;
if (read_sve->vl != vl) {
- ksft_test_result_fail("Child VL != expected VL %d\n",
+ ksft_test_result_fail("Child VL != expected VL: %u != %u\n",
read_sve->vl, vl);
goto out;
}
@@ -555,7 +557,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child,
case SVE_PT_REGS_FPSIMD:
expected_size = SVE_PT_FPSIMD_SIZE(vq, SVE_PT_REGS_FPSIMD);
if (read_sve_size < expected_size) {
- ksft_test_result_fail("Read %d bytes, expected %d\n",
+ ksft_test_result_fail("Read %ld bytes, expected %ld\n",
read_sve_size, expected_size);
goto out;
}
@@ -571,7 +573,7 @@ static void ptrace_set_fpsimd_get_sve_data(pid_t child,
case SVE_PT_REGS_SVE:
expected_size = SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
if (read_sve_size < expected_size) {
- ksft_test_result_fail("Read %d bytes, expected %d\n",
+ ksft_test_result_fail("Read %ld bytes, expected %ld\n",
read_sve_size, expected_size);
goto out;
}
diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S
index fff60e2..80e072f 100644
--- a/tools/testing/selftests/arm64/fp/sve-test.S
+++ b/tools/testing/selftests/arm64/fp/sve-test.S
@@ -291,8 +291,7 @@
#endif
endfunction
-// Any SVE register modified here can cause corruption in the main
-// thread -- but *only* the registers modified here.
+// Modify live register state, the signal return will undo our changes
function irritator_handler
// Increment the irritation signal count (x23):
ldr x0, [x2, #ucontext_regs + 8 * 23]
@@ -300,13 +299,12 @@
str x0, [x2, #ucontext_regs + 8 * 23]
// Corrupt some random Z-regs
- adr x0, .text + (irritator_handler - .text) / 16 * 16
movi v0.8b, #1
movi v9.16b, #2
movi v31.8b, #3
-#ifndef SSVE
// And P0
- rdffr p0.b
+ ptrue p0.d
+#ifndef SSVE
// And FFR
wrffr p15.b
#endif
@@ -378,6 +376,8 @@
// Main program entry point
.globl _start
function _start
+ enable_gcs
+
mov x23, #0 // Irritation signal count
mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/za-ptrace.c b/tools/testing/selftests/arm64/fp/za-ptrace.c
index ac27d87..08c777f 100644
--- a/tools/testing/selftests/arm64/fp/za-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/za-ptrace.c
@@ -48,10 +48,12 @@ static void fill_buf(char *buf, size_t size)
static int do_child(void)
{
if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
- ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno));
+ ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)",
+ strerror(errno), errno);
if (raise(SIGSTOP))
- ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno));
+ ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n",
+ strerror(errno), errno);
return EXIT_SUCCESS;
}
@@ -201,7 +203,7 @@ static void ptrace_set_get_data(pid_t child, unsigned int vl)
data_size = ZA_PT_SIZE(vq);
write_buf = malloc(data_size);
if (!write_buf) {
- ksft_test_result_fail("Error allocating %d byte buffer for VL %u\n",
+ ksft_test_result_fail("Error allocating %ld byte buffer for VL %u\n",
data_size, vl);
return;
}
diff --git a/tools/testing/selftests/arm64/fp/za-test.S b/tools/testing/selftests/arm64/fp/za-test.S
index 095b455..9c33e13 100644
--- a/tools/testing/selftests/arm64/fp/za-test.S
+++ b/tools/testing/selftests/arm64/fp/za-test.S
@@ -148,21 +148,16 @@
b memcmp
endfunction
-// Any SME register modified here can cause corruption in the main
-// thread -- but *only* the locations modified here.
+// Modify the live SME register state, signal return will undo our changes
function irritator_handler
// Increment the irritation signal count (x23):
ldr x0, [x2, #ucontext_regs + 8 * 23]
add x0, x0, #1
str x0, [x2, #ucontext_regs + 8 * 23]
- // Corrupt some random ZA data
-#if 0
- adr x0, .text + (irritator_handler - .text) / 16 * 16
- movi v0.8b, #1
- movi v9.16b, #2
- movi v31.8b, #3
-#endif
+ // This will reset ZA to all bits 0
+ smstop
+ smstart_za
ret
endfunction
@@ -231,6 +226,8 @@
// Main program entry point
.globl _start
function _start
+ enable_gcs
+
mov x23, #0 // signal count
mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/fp/zt-ptrace.c b/tools/testing/selftests/arm64/fp/zt-ptrace.c
index 996d961..584b8d5 100644
--- a/tools/testing/selftests/arm64/fp/zt-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/zt-ptrace.c
@@ -43,10 +43,12 @@ static void fill_buf(char *buf, size_t size)
static int do_child(void)
{
if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
- ksft_exit_fail_msg("PTRACE_TRACEME", strerror(errno));
+ ksft_exit_fail_msg("ptrace(PTRACE_TRACEME) failed: %s (%d)\n",
+ strerror(errno), errno);
if (raise(SIGSTOP))
- ksft_exit_fail_msg("raise(SIGSTOP)", strerror(errno));
+ ksft_exit_fail_msg("raise(SIGSTOP) failed: %s (%d)\n",
+ strerror(errno), errno);
return EXIT_SUCCESS;
}
@@ -231,7 +233,7 @@ static void ptrace_enable_za_via_zt(pid_t child)
/* Should have register data */
if (za_out->size < ZA_PT_SIZE(vq)) {
ksft_print_msg("ZA data less than expected: %u < %u\n",
- za_out->size, ZA_PT_SIZE(vq));
+ za_out->size, (unsigned int)ZA_PT_SIZE(vq));
fail = true;
vq = 0;
}
diff --git a/tools/testing/selftests/arm64/fp/zt-test.S b/tools/testing/selftests/arm64/fp/zt-test.S
index b5c81e8..38080f3 100644
--- a/tools/testing/selftests/arm64/fp/zt-test.S
+++ b/tools/testing/selftests/arm64/fp/zt-test.S
@@ -117,21 +117,16 @@
b memcmp
endfunction
-// Any SME register modified here can cause corruption in the main
-// thread -- but *only* the locations modified here.
+// Modify the live SME register state, signal return will undo our changes
function irritator_handler
// Increment the irritation signal count (x23):
ldr x0, [x2, #ucontext_regs + 8 * 23]
add x0, x0, #1
str x0, [x2, #ucontext_regs + 8 * 23]
- // Corrupt some random ZT data
-#if 0
- adr x0, .text + (irritator_handler - .text) / 16 * 16
- movi v0.8b, #1
- movi v9.16b, #2
- movi v31.8b, #3
-#endif
+ // This will reset ZT to all bits 0
+ smstop
+ smstart_za
ret
endfunction
@@ -200,6 +195,8 @@
// Main program entry point
.globl _start
function _start
+ enable_gcs
+
mov x23, #0 // signal count
mov w0, #SIGINT
diff --git a/tools/testing/selftests/arm64/gcs/.gitignore b/tools/testing/selftests/arm64/gcs/.gitignore
new file mode 100644
index 0000000..bbb8e40
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/.gitignore
@@ -0,0 +1,7 @@
+basic-gcs
+libc-gcs
+gcs-locking
+gcs-stress
+gcs-stress-thread
+gcspushm
+gcsstr
diff --git a/tools/testing/selftests/arm64/gcs/Makefile b/tools/testing/selftests/arm64/gcs/Makefile
new file mode 100644
index 0000000..d2f3497
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2023 ARM Limited
+#
+# In order to avoid interaction with the toolchain and dynamic linker the
+# portions of these tests that interact with the GCS are implemented using
+# nolibc.
+#
+
+TEST_GEN_PROGS := basic-gcs libc-gcs gcs-locking gcs-stress gcspushm gcsstr
+TEST_GEN_PROGS_EXTENDED := gcs-stress-thread
+
+LDLIBS+=-lpthread
+
+include ../../lib.mk
+
+$(OUTPUT)/basic-gcs: basic-gcs.c
+ $(CC) -g -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \
+ -static -include ../../../../include/nolibc/nolibc.h \
+ -I../../../../../usr/include \
+ -std=gnu99 -I../.. -g \
+ -ffreestanding -Wall $^ -o $@ -lgcc
+
+$(OUTPUT)/gcs-stress-thread: gcs-stress-thread.S
+ $(CC) -nostdlib $^ -o $@
+
+$(OUTPUT)/gcspushm: gcspushm.S
+ $(CC) -nostdlib $^ -o $@
+
+$(OUTPUT)/gcsstr: gcsstr.S
+ $(CC) -nostdlib $^ -o $@
diff --git a/tools/testing/selftests/arm64/gcs/asm-offsets.h b/tools/testing/selftests/arm64/gcs/asm-offsets.h
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/asm-offsets.h
diff --git a/tools/testing/selftests/arm64/gcs/basic-gcs.c b/tools/testing/selftests/arm64/gcs/basic-gcs.c
new file mode 100644
index 0000000..3fb9742
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/basic-gcs.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ */
+
+#include <limits.h>
+#include <stdbool.h>
+
+#include <linux/prctl.h>
+
+#include <sys/mman.h>
+#include <asm/mman.h>
+#include <linux/sched.h>
+
+#include "kselftest.h"
+#include "gcs-util.h"
+
+/* nolibc doesn't have sysconf(), just hard code the maximum */
+static size_t page_size = 65536;
+
+static __attribute__((noinline)) void valid_gcs_function(void)
+{
+ /* Do something the compiler can't optimise out */
+ my_syscall1(__NR_prctl, PR_SVE_GET_VL);
+}
+
+static inline int gcs_set_status(unsigned long mode)
+{
+ bool enabling = mode & PR_SHADOW_STACK_ENABLE;
+ int ret;
+ unsigned long new_mode;
+
+ /*
+ * The prctl takes 1 argument but we need to ensure that the
+ * other 3 values passed in registers to the syscall are zero
+ * since the kernel validates them.
+ */
+ ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, mode,
+ 0, 0, 0);
+
+ if (ret == 0) {
+ ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
+ &new_mode, 0, 0, 0);
+ if (ret == 0) {
+ if (new_mode != mode) {
+ ksft_print_msg("Mode set to %lx not %lx\n",
+ new_mode, mode);
+ ret = -EINVAL;
+ }
+ } else {
+ ksft_print_msg("Failed to validate mode: %d\n", ret);
+ }
+
+ if (enabling != chkfeat_gcs()) {
+ ksft_print_msg("%senabled by prctl but %senabled in CHKFEAT\n",
+ enabling ? "" : "not ",
+ chkfeat_gcs() ? "" : "not ");
+ ret = -EINVAL;
+ }
+ }
+
+ return ret;
+}
+
+/* Try to read the status */
+static bool read_status(void)
+{
+ unsigned long state;
+ int ret;
+
+ ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
+ &state, 0, 0, 0);
+ if (ret != 0) {
+ ksft_print_msg("Failed to read state: %d\n", ret);
+ return false;
+ }
+
+ return state & PR_SHADOW_STACK_ENABLE;
+}
+
+/* Just a straight enable */
+static bool base_enable(void)
+{
+ int ret;
+
+ ret = gcs_set_status(PR_SHADOW_STACK_ENABLE);
+ if (ret) {
+ ksft_print_msg("PR_SHADOW_STACK_ENABLE failed %d\n", ret);
+ return false;
+ }
+
+ return true;
+}
+
+/* Check we can read GCSPR_EL0 when GCS is enabled */
+static bool read_gcspr_el0(void)
+{
+ unsigned long *gcspr_el0;
+
+ ksft_print_msg("GET GCSPR\n");
+ gcspr_el0 = get_gcspr();
+ ksft_print_msg("GCSPR_EL0 is %p\n", gcspr_el0);
+
+ return true;
+}
+
+/* Also allow writes to stack */
+static bool enable_writeable(void)
+{
+ int ret;
+
+ ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE);
+ if (ret) {
+ ksft_print_msg("PR_SHADOW_STACK_ENABLE writeable failed: %d\n", ret);
+ return false;
+ }
+
+ ret = gcs_set_status(PR_SHADOW_STACK_ENABLE);
+ if (ret) {
+ ksft_print_msg("failed to restore plain enable %d\n", ret);
+ return false;
+ }
+
+ return true;
+}
+
+/* Also allow writes to stack */
+static bool enable_push_pop(void)
+{
+ int ret;
+
+ ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH);
+ if (ret) {
+ ksft_print_msg("PR_SHADOW_STACK_ENABLE with push failed: %d\n",
+ ret);
+ return false;
+ }
+
+ ret = gcs_set_status(PR_SHADOW_STACK_ENABLE);
+ if (ret) {
+ ksft_print_msg("failed to restore plain enable %d\n", ret);
+ return false;
+ }
+
+ return true;
+}
+
+/* Enable GCS and allow everything */
+static bool enable_all(void)
+{
+ int ret;
+
+ ret = gcs_set_status(PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH |
+ PR_SHADOW_STACK_WRITE);
+ if (ret) {
+ ksft_print_msg("PR_SHADOW_STACK_ENABLE with everything failed: %d\n",
+ ret);
+ return false;
+ }
+
+ ret = gcs_set_status(PR_SHADOW_STACK_ENABLE);
+ if (ret) {
+ ksft_print_msg("failed to restore plain enable %d\n", ret);
+ return false;
+ }
+
+ return true;
+}
+
+static bool enable_invalid(void)
+{
+ int ret = gcs_set_status(ULONG_MAX);
+ if (ret == 0) {
+ ksft_print_msg("GCS_SET_STATUS %lx succeeded\n", ULONG_MAX);
+ return false;
+ }
+
+ return true;
+}
+
+/* Map a GCS */
+static bool map_guarded_stack(void)
+{
+ int ret;
+ uint64_t *buf;
+ uint64_t expected_cap;
+ int elem;
+ bool pass = true;
+
+ buf = (void *)my_syscall3(__NR_map_shadow_stack, 0, page_size,
+ SHADOW_STACK_SET_MARKER |
+ SHADOW_STACK_SET_TOKEN);
+ if (buf == MAP_FAILED) {
+ ksft_print_msg("Failed to map %lu byte GCS: %d\n",
+ page_size, errno);
+ return false;
+ }
+ ksft_print_msg("Mapped GCS at %p-%p\n", buf,
+ (void *)((uint64_t)buf + page_size));
+
+ /* The top of the newly allocated region should be 0 */
+ elem = (page_size / sizeof(uint64_t)) - 1;
+ if (buf[elem]) {
+ ksft_print_msg("Last entry is 0x%llx not 0x0\n", buf[elem]);
+ pass = false;
+ }
+
+ /* Then a valid cap token */
+ elem--;
+ expected_cap = ((uint64_t)buf + page_size - 16);
+ expected_cap &= GCS_CAP_ADDR_MASK;
+ expected_cap |= GCS_CAP_VALID_TOKEN;
+ if (buf[elem] != expected_cap) {
+ ksft_print_msg("Cap entry is 0x%llx not 0x%llx\n",
+ buf[elem], expected_cap);
+ pass = false;
+ }
+ ksft_print_msg("cap token is 0x%llx\n", buf[elem]);
+
+ /* The rest should be zeros */
+ for (elem = 0; elem < page_size / sizeof(uint64_t) - 2; elem++) {
+ if (!buf[elem])
+ continue;
+ ksft_print_msg("GCS slot %d is 0x%llx not 0x0\n",
+ elem, buf[elem]);
+ pass = false;
+ }
+
+ ret = munmap(buf, page_size);
+ if (ret != 0) {
+ ksft_print_msg("Failed to unmap %ld byte GCS: %d\n",
+ page_size, errno);
+ pass = false;
+ }
+
+ return pass;
+}
+
+/* A fork()ed process can run */
+static bool test_fork(void)
+{
+ unsigned long child_mode;
+ int ret, status;
+ pid_t pid;
+ bool pass = true;
+
+ pid = fork();
+ if (pid == -1) {
+ ksft_print_msg("fork() failed: %d\n", errno);
+ pass = false;
+ goto out;
+ }
+ if (pid == 0) {
+ /* In child, make sure we can call a function, read
+ * the GCS pointer and status and then exit */
+ valid_gcs_function();
+ get_gcspr();
+
+ ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
+ &child_mode, 0, 0, 0);
+ if (ret == 0 && !(child_mode & PR_SHADOW_STACK_ENABLE)) {
+ ksft_print_msg("GCS not enabled in child\n");
+ ret = -EINVAL;
+ }
+
+ exit(ret);
+ }
+
+ /*
+ * In parent, check we can still do function calls then block
+ * for the child.
+ */
+ valid_gcs_function();
+
+ ksft_print_msg("Waiting for child %d\n", pid);
+
+ ret = waitpid(pid, &status, 0);
+ if (ret == -1) {
+ ksft_print_msg("Failed to wait for child: %d\n",
+ errno);
+ return false;
+ }
+
+ if (!WIFEXITED(status)) {
+ ksft_print_msg("Child exited due to signal %d\n",
+ WTERMSIG(status));
+ pass = false;
+ } else {
+ if (WEXITSTATUS(status)) {
+ ksft_print_msg("Child exited with status %d\n",
+ WEXITSTATUS(status));
+ pass = false;
+ }
+ }
+
+out:
+
+ return pass;
+}
+
+typedef bool (*gcs_test)(void);
+
+static struct {
+ char *name;
+ gcs_test test;
+ bool needs_enable;
+} tests[] = {
+ { "read_status", read_status },
+ { "base_enable", base_enable, true },
+ { "read_gcspr_el0", read_gcspr_el0 },
+ { "enable_writeable", enable_writeable, true },
+ { "enable_push_pop", enable_push_pop, true },
+ { "enable_all", enable_all, true },
+ { "enable_invalid", enable_invalid, true },
+ { "map_guarded_stack", map_guarded_stack },
+ { "fork", test_fork },
+};
+
+int main(void)
+{
+ int i, ret;
+ unsigned long gcs_mode;
+
+ ksft_print_header();
+
+ /*
+ * We don't have getauxval() with nolibc so treat a failure to
+ * read GCS state as a lack of support and skip.
+ */
+ ret = my_syscall5(__NR_prctl, PR_GET_SHADOW_STACK_STATUS,
+ &gcs_mode, 0, 0, 0);
+ if (ret != 0)
+ ksft_exit_skip("Failed to read GCS state: %d\n", ret);
+
+ if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) {
+ gcs_mode = PR_SHADOW_STACK_ENABLE;
+ ret = my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+ gcs_mode, 0, 0, 0);
+ if (ret != 0)
+ ksft_exit_fail_msg("Failed to enable GCS: %d\n", ret);
+ }
+
+ ksft_set_plan(ARRAY_SIZE(tests));
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ ksft_test_result((*tests[i].test)(), "%s\n", tests[i].name);
+ }
+
+ /* One last test: disable GCS, we can do this one time */
+ my_syscall5(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0, 0, 0, 0);
+ if (ret != 0)
+ ksft_print_msg("Failed to disable GCS: %d\n", ret);
+
+ ksft_finished();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/arm64/gcs/gcs-locking.c b/tools/testing/selftests/arm64/gcs/gcs-locking.c
new file mode 100644
index 0000000..989f75a
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcs-locking.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ *
+ * Tests for GCS mode locking. These tests rely on both having GCS
+ * unconfigured on entry and on the kselftest harness running each
+ * test in a fork()ed process which will have it's own mode.
+ */
+
+#include <limits.h>
+
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+
+#include <asm/hwcap.h>
+
+#include "kselftest_harness.h"
+
+#include "gcs-util.h"
+
+#define my_syscall2(num, arg1, arg2) \
+({ \
+ register long _num __asm__ ("x8") = (num); \
+ register long _arg1 __asm__ ("x0") = (long)(arg1); \
+ register long _arg2 __asm__ ("x1") = (long)(arg2); \
+ register long _arg3 __asm__ ("x2") = 0; \
+ register long _arg4 __asm__ ("x3") = 0; \
+ register long _arg5 __asm__ ("x4") = 0; \
+ \
+ __asm__ volatile ( \
+ "svc #0\n" \
+ : "=r"(_arg1) \
+ : "r"(_arg1), "r"(_arg2), \
+ "r"(_arg3), "r"(_arg4), \
+ "r"(_arg5), "r"(_num) \
+ : "memory", "cc" \
+ ); \
+ _arg1; \
+})
+
+/* No mode bits are rejected for locking */
+TEST(lock_all_modes)
+{
+ int ret;
+
+ ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, ULONG_MAX, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+}
+
+FIXTURE(valid_modes)
+{
+};
+
+FIXTURE_VARIANT(valid_modes)
+{
+ unsigned long mode;
+};
+
+FIXTURE_VARIANT_ADD(valid_modes, enable)
+{
+ .mode = PR_SHADOW_STACK_ENABLE,
+};
+
+FIXTURE_VARIANT_ADD(valid_modes, enable_write)
+{
+ .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE,
+};
+
+FIXTURE_VARIANT_ADD(valid_modes, enable_push)
+{
+ .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH,
+};
+
+FIXTURE_VARIANT_ADD(valid_modes, enable_write_push)
+{
+ .mode = PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE |
+ PR_SHADOW_STACK_PUSH,
+};
+
+FIXTURE_SETUP(valid_modes)
+{
+}
+
+FIXTURE_TEARDOWN(valid_modes)
+{
+}
+
+/* We can set the mode at all */
+TEST_F(valid_modes, set)
+{
+ int ret;
+
+ ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+ variant->mode);
+ ASSERT_EQ(ret, 0);
+
+ _exit(0);
+}
+
+/* Enabling, locking then disabling is rejected */
+TEST_F(valid_modes, enable_lock_disable)
+{
+ unsigned long mode;
+ int ret;
+
+ ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+ variant->mode);
+ ASSERT_EQ(ret, 0);
+
+ ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(mode, variant->mode);
+
+ ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+
+ ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS, 0);
+ ASSERT_EQ(ret, -EBUSY);
+
+ _exit(0);
+}
+
+/* Locking then enabling is rejected */
+TEST_F(valid_modes, lock_enable)
+{
+ unsigned long mode;
+ int ret;
+
+ ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+
+ ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+ variant->mode);
+ ASSERT_EQ(ret, -EBUSY);
+
+ ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(mode, 0);
+
+ _exit(0);
+}
+
+/* Locking then changing other modes is fine */
+TEST_F(valid_modes, lock_enable_disable_others)
+{
+ unsigned long mode;
+ int ret;
+
+ ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+ variant->mode);
+ ASSERT_EQ(ret, 0);
+
+ ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(mode, variant->mode);
+
+ ret = prctl(PR_LOCK_SHADOW_STACK_STATUS, variant->mode, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+
+ ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+ PR_SHADOW_STACK_ALL_MODES);
+ ASSERT_EQ(ret, 0);
+
+ ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(mode, PR_SHADOW_STACK_ALL_MODES);
+
+
+ ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+ variant->mode);
+ ASSERT_EQ(ret, 0);
+
+ ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(mode, variant->mode);
+
+ _exit(0);
+}
+
+int main(int argc, char **argv)
+{
+ unsigned long mode;
+ int ret;
+
+ if (!(getauxval(AT_HWCAP) & HWCAP_GCS))
+ ksft_exit_skip("SKIP GCS not supported\n");
+
+ ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+ if (ret) {
+ ksft_print_msg("Failed to read GCS state: %d\n", ret);
+ return EXIT_FAILURE;
+ }
+
+ if (mode & PR_SHADOW_STACK_ENABLE) {
+ ksft_print_msg("GCS was enabled, test unsupported\n");
+ return KSFT_SKIP;
+ }
+
+ return test_harness_run(argc, argv);
+}
diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S
new file mode 100644
index 0000000..b88b252
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcs-stress-thread.S
@@ -0,0 +1,311 @@
+// Program that loops for ever doing lots of recursions and system calls,
+// intended to be used as part of a stress test for GCS context switching.
+//
+// Copyright 2015-2023 Arm Ltd
+
+#include <asm/unistd.h>
+
+#define sa_sz 32
+#define sa_flags 8
+#define sa_handler 0
+#define sa_mask_sz 8
+
+#define si_code 8
+
+#define SIGINT 2
+#define SIGABRT 6
+#define SIGUSR1 10
+#define SIGSEGV 11
+#define SIGUSR2 12
+#define SIGTERM 15
+#define SEGV_CPERR 10
+
+#define SA_NODEFER 1073741824
+#define SA_SIGINFO 4
+#define ucontext_regs 184
+
+#define PR_SET_SHADOW_STACK_STATUS 75
+# define PR_SHADOW_STACK_ENABLE (1UL << 0)
+
+#define GCSPR_EL0 S3_3_C2_C5_1
+
+.macro function name
+ .macro endfunction
+ .type \name, @function
+ .purgem endfunction
+ .endm
+\name:
+.endm
+
+// Print a single character x0 to stdout
+// Clobbers x0-x2,x8
+function putc
+ str x0, [sp, #-16]!
+
+ mov x0, #1 // STDOUT_FILENO
+ mov x1, sp
+ mov x2, #1
+ mov x8, #__NR_write
+ svc #0
+
+ add sp, sp, #16
+ ret
+endfunction
+.globl putc
+
+// Print a NUL-terminated string starting at address x0 to stdout
+// Clobbers x0-x3,x8
+function puts
+ mov x1, x0
+
+ mov x2, #0
+0: ldrb w3, [x0], #1
+ cbz w3, 1f
+ add x2, x2, #1
+ b 0b
+
+1: mov w0, #1 // STDOUT_FILENO
+ mov x8, #__NR_write
+ svc #0
+
+ ret
+endfunction
+.globl puts
+
+// Utility macro to print a literal string
+// Clobbers x0-x4,x8
+.macro puts string
+ .pushsection .rodata.str1.1, "aMS", @progbits, 1
+.L__puts_literal\@: .string "\string"
+ .popsection
+
+ ldr x0, =.L__puts_literal\@
+ bl puts
+.endm
+
+// Print an unsigned decimal number x0 to stdout
+// Clobbers x0-x4,x8
+function putdec
+ mov x1, sp
+ str x30, [sp, #-32]! // Result can't be > 20 digits
+
+ mov x2, #0
+ strb w2, [x1, #-1]! // Write the NUL terminator
+
+ mov x2, #10
+0: udiv x3, x0, x2 // div-mod loop to generate the digits
+ msub x0, x3, x2, x0
+ add w0, w0, #'0'
+ strb w0, [x1, #-1]!
+ mov x0, x3
+ cbnz x3, 0b
+
+ ldrb w0, [x1]
+ cbnz w0, 1f
+ mov w0, #'0' // Print "0" for 0, not ""
+ strb w0, [x1, #-1]!
+
+1: mov x0, x1
+ bl puts
+
+ ldr x30, [sp], #32
+ ret
+endfunction
+.globl putdec
+
+// Print an unsigned decimal number x0 to stdout, followed by a newline
+// Clobbers x0-x5,x8
+function putdecn
+ mov x5, x30
+
+ bl putdec
+ mov x0, #'\n'
+ bl putc
+
+ ret x5
+endfunction
+.globl putdecn
+
+// Fill x1 bytes starting at x0 with 0.
+// Clobbers x1, x2.
+function memclr
+ mov w2, #0
+endfunction
+.globl memclr
+ // fall through to memfill
+
+// Trivial memory fill: fill x1 bytes starting at address x0 with byte w2
+// Clobbers x1
+function memfill
+ cmp x1, #0
+ b.eq 1f
+
+0: strb w2, [x0], #1
+ subs x1, x1, #1
+ b.ne 0b
+
+1: ret
+endfunction
+.globl memfill
+
+// w0: signal number
+// x1: sa_action
+// w2: sa_flags
+// Clobbers x0-x6,x8
+function setsignal
+ str x30, [sp, #-((sa_sz + 15) / 16 * 16 + 16)]!
+
+ mov w4, w0
+ mov x5, x1
+ mov w6, w2
+
+ add x0, sp, #16
+ mov x1, #sa_sz
+ bl memclr
+
+ mov w0, w4
+ add x1, sp, #16
+ str w6, [x1, #sa_flags]
+ str x5, [x1, #sa_handler]
+ mov x2, #0
+ mov x3, #sa_mask_sz
+ mov x8, #__NR_rt_sigaction
+ svc #0
+
+ cbz w0, 1f
+
+ puts "sigaction failure\n"
+ b abort
+
+1: ldr x30, [sp], #((sa_sz + 15) / 16 * 16 + 16)
+ ret
+endfunction
+
+
+function tickle_handler
+ // Perhaps collect GCSPR_EL0 here in future?
+ ret
+endfunction
+
+function terminate_handler
+ mov w21, w0
+ mov x20, x2
+
+ puts "Terminated by signal "
+ mov w0, w21
+ bl putdec
+ puts ", no error\n"
+
+ mov x0, #0
+ mov x8, #__NR_exit
+ svc #0
+endfunction
+
+function segv_handler
+ // stash the siginfo_t *
+ mov x20, x1
+
+ // Disable GCS, we don't want additional faults logging things
+ mov x0, PR_SET_SHADOW_STACK_STATUS
+ mov x1, xzr
+ mov x2, xzr
+ mov x3, xzr
+ mov x4, xzr
+ mov x5, xzr
+ mov x8, #__NR_prctl
+ svc #0
+
+ puts "Got SIGSEGV code "
+
+ ldr x21, [x20, #si_code]
+ mov x0, x21
+ bl putdec
+
+ // GCS faults should have si_code SEGV_CPERR
+ cmp x21, #SEGV_CPERR
+ bne 1f
+
+ puts " (GCS violation)"
+1:
+ mov x0, '\n'
+ bl putc
+ b abort
+endfunction
+
+// Recurse x20 times
+.macro recurse id
+function recurse\id
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ cmp x20, 0
+ beq 1f
+ sub x20, x20, 1
+ bl recurse\id
+
+1:
+ ldp x29, x30, [sp], #16
+
+ // Do a syscall immediately prior to returning to try to provoke
+ // scheduling and migration at a point where coherency issues
+ // might trigger.
+ mov x8, #__NR_getpid
+ svc #0
+
+ ret
+endfunction
+.endm
+
+// Generate and use two copies so we're changing the GCS contents
+recurse 1
+recurse 2
+
+.globl _start
+function _start
+ // Run with GCS
+ mov x0, PR_SET_SHADOW_STACK_STATUS
+ mov x1, PR_SHADOW_STACK_ENABLE
+ mov x2, xzr
+ mov x3, xzr
+ mov x4, xzr
+ mov x5, xzr
+ mov x8, #__NR_prctl
+ svc #0
+ cbz x0, 1f
+ puts "Failed to enable GCS\n"
+ b abort
+1:
+
+ mov w0, #SIGTERM
+ adr x1, terminate_handler
+ mov w2, #SA_SIGINFO
+ bl setsignal
+
+ mov w0, #SIGUSR1
+ adr x1, tickle_handler
+ mov w2, #SA_SIGINFO
+ orr w2, w2, #SA_NODEFER
+ bl setsignal
+
+ mov w0, #SIGSEGV
+ adr x1, segv_handler
+ mov w2, #SA_SIGINFO
+ orr w2, w2, #SA_NODEFER
+ bl setsignal
+
+ puts "Running\n"
+
+loop:
+ // Small recursion depth so we're frequently flipping between
+ // the two recursors and changing what's on the stack
+ mov x20, #5
+ bl recurse1
+ mov x20, #5
+ bl recurse2
+ b loop
+endfunction
+
+abort:
+ mov x0, #255
+ mov x8, #__NR_exit
+ svc #0
diff --git a/tools/testing/selftests/arm64/gcs/gcs-stress.c b/tools/testing/selftests/arm64/gcs/gcs-stress.c
new file mode 100644
index 0000000..bbc7f49
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcs-stress.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022-3 ARM Limited.
+ */
+
+#define _GNU_SOURCE
+#define _POSIX_C_SOURCE 199309L
+
+#include <errno.h>
+#include <getopt.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/epoll.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/wait.h>
+#include <asm/hwcap.h>
+
+#include "../../kselftest.h"
+
+struct child_data {
+ char *name, *output;
+ pid_t pid;
+ int stdout;
+ bool output_seen;
+ bool exited;
+ int exit_status;
+ int exit_signal;
+};
+
+static int epoll_fd;
+static struct child_data *children;
+static struct epoll_event *evs;
+static int tests;
+static int num_children;
+static bool terminate;
+
+static int startup_pipe[2];
+
+static int num_processors(void)
+{
+ long nproc = sysconf(_SC_NPROCESSORS_CONF);
+ if (nproc < 0) {
+ perror("Unable to read number of processors\n");
+ exit(EXIT_FAILURE);
+ }
+
+ return nproc;
+}
+
+static void start_thread(struct child_data *child, int id)
+{
+ int ret, pipefd[2], i;
+ struct epoll_event ev;
+
+ ret = pipe(pipefd);
+ if (ret != 0)
+ ksft_exit_fail_msg("Failed to create stdout pipe: %s (%d)\n",
+ strerror(errno), errno);
+
+ child->pid = fork();
+ if (child->pid == -1)
+ ksft_exit_fail_msg("fork() failed: %s (%d)\n",
+ strerror(errno), errno);
+
+ if (!child->pid) {
+ /*
+ * In child, replace stdout with the pipe, errors to
+ * stderr from here as kselftest prints to stdout.
+ */
+ ret = dup2(pipefd[1], 1);
+ if (ret == -1) {
+ fprintf(stderr, "dup2() %d\n", errno);
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Duplicate the read side of the startup pipe to
+ * FD 3 so we can close everything else.
+ */
+ ret = dup2(startup_pipe[0], 3);
+ if (ret == -1) {
+ fprintf(stderr, "dup2() %d\n", errno);
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Very dumb mechanism to clean open FDs other than
+ * stdio. We don't want O_CLOEXEC for the pipes...
+ */
+ for (i = 4; i < 8192; i++)
+ close(i);
+
+ /*
+ * Read from the startup pipe, there should be no data
+ * and we should block until it is closed. We just
+ * carry on on error since this isn't super critical.
+ */
+ ret = read(3, &i, sizeof(i));
+ if (ret < 0)
+ fprintf(stderr, "read(startp pipe) failed: %s (%d)\n",
+ strerror(errno), errno);
+ if (ret > 0)
+ fprintf(stderr, "%d bytes of data on startup pipe\n",
+ ret);
+ close(3);
+
+ ret = execl("gcs-stress-thread", "gcs-stress-thread", NULL);
+ fprintf(stderr, "execl(gcs-stress-thread) failed: %d (%s)\n",
+ errno, strerror(errno));
+
+ exit(EXIT_FAILURE);
+ } else {
+ /*
+ * In parent, remember the child and close our copy of the
+ * write side of stdout.
+ */
+ close(pipefd[1]);
+ child->stdout = pipefd[0];
+ child->output = NULL;
+ child->exited = false;
+ child->output_seen = false;
+
+ ev.events = EPOLLIN | EPOLLHUP;
+ ev.data.ptr = child;
+
+ ret = asprintf(&child->name, "Thread-%d", id);
+ if (ret == -1)
+ ksft_exit_fail_msg("asprintf() failed\n");
+
+ ret = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, child->stdout, &ev);
+ if (ret < 0) {
+ ksft_exit_fail_msg("%s EPOLL_CTL_ADD failed: %s (%d)\n",
+ child->name, strerror(errno), errno);
+ }
+ }
+
+ ksft_print_msg("Started %s\n", child->name);
+ num_children++;
+}
+
+static bool child_output_read(struct child_data *child)
+{
+ char read_data[1024];
+ char work[1024];
+ int ret, len, cur_work, cur_read;
+
+ ret = read(child->stdout, read_data, sizeof(read_data));
+ if (ret < 0) {
+ if (errno == EINTR)
+ return true;
+
+ ksft_print_msg("%s: read() failed: %s (%d)\n",
+ child->name, strerror(errno),
+ errno);
+ return false;
+ }
+ len = ret;
+
+ child->output_seen = true;
+
+ /* Pick up any partial read */
+ if (child->output) {
+ strncpy(work, child->output, sizeof(work) - 1);
+ cur_work = strnlen(work, sizeof(work));
+ free(child->output);
+ child->output = NULL;
+ } else {
+ cur_work = 0;
+ }
+
+ cur_read = 0;
+ while (cur_read < len) {
+ work[cur_work] = read_data[cur_read++];
+
+ if (work[cur_work] == '\n') {
+ work[cur_work] = '\0';
+ ksft_print_msg("%s: %s\n", child->name, work);
+ cur_work = 0;
+ } else {
+ cur_work++;
+ }
+ }
+
+ if (cur_work) {
+ work[cur_work] = '\0';
+ ret = asprintf(&child->output, "%s", work);
+ if (ret == -1)
+ ksft_exit_fail_msg("Out of memory\n");
+ }
+
+ return false;
+}
+
+static void child_output(struct child_data *child, uint32_t events,
+ bool flush)
+{
+ bool read_more;
+
+ if (events & EPOLLIN) {
+ do {
+ read_more = child_output_read(child);
+ } while (read_more);
+ }
+
+ if (events & EPOLLHUP) {
+ close(child->stdout);
+ child->stdout = -1;
+ flush = true;
+ }
+
+ if (flush && child->output) {
+ ksft_print_msg("%s: %s<EOF>\n", child->name, child->output);
+ free(child->output);
+ child->output = NULL;
+ }
+}
+
+static void child_tickle(struct child_data *child)
+{
+ if (child->output_seen && !child->exited)
+ kill(child->pid, SIGUSR1);
+}
+
+static void child_stop(struct child_data *child)
+{
+ if (!child->exited)
+ kill(child->pid, SIGTERM);
+}
+
+static void child_cleanup(struct child_data *child)
+{
+ pid_t ret;
+ int status;
+ bool fail = false;
+
+ if (!child->exited) {
+ do {
+ ret = waitpid(child->pid, &status, 0);
+ if (ret == -1 && errno == EINTR)
+ continue;
+
+ if (ret == -1) {
+ ksft_print_msg("waitpid(%d) failed: %s (%d)\n",
+ child->pid, strerror(errno),
+ errno);
+ fail = true;
+ break;
+ }
+
+ if (WIFEXITED(status)) {
+ child->exit_status = WEXITSTATUS(status);
+ child->exited = true;
+ }
+
+ if (WIFSIGNALED(status)) {
+ child->exit_signal = WTERMSIG(status);
+ ksft_print_msg("%s: Exited due to signal %d\n",
+ child->name, child->exit_signal);
+ fail = true;
+ child->exited = true;
+ }
+ } while (!child->exited);
+ }
+
+ if (!child->output_seen) {
+ ksft_print_msg("%s no output seen\n", child->name);
+ fail = true;
+ }
+
+ if (child->exit_status != 0) {
+ ksft_print_msg("%s exited with error code %d\n",
+ child->name, child->exit_status);
+ fail = true;
+ }
+
+ ksft_test_result(!fail, "%s\n", child->name);
+}
+
+static void handle_child_signal(int sig, siginfo_t *info, void *context)
+{
+ int i;
+ bool found = false;
+
+ for (i = 0; i < num_children; i++) {
+ if (children[i].pid == info->si_pid) {
+ children[i].exited = true;
+ children[i].exit_status = info->si_status;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ ksft_print_msg("SIGCHLD for unknown PID %d with status %d\n",
+ info->si_pid, info->si_status);
+}
+
+static void handle_exit_signal(int sig, siginfo_t *info, void *context)
+{
+ int i;
+
+ /* If we're already exiting then don't signal again */
+ if (terminate)
+ return;
+
+ ksft_print_msg("Got signal, exiting...\n");
+
+ terminate = true;
+
+ /*
+ * This should be redundant, the main loop should clean up
+ * after us, but for safety stop everything we can here.
+ */
+ for (i = 0; i < num_children; i++)
+ child_stop(&children[i]);
+}
+
+/* Handle any pending output without blocking */
+static void drain_output(bool flush)
+{
+ int ret = 1;
+ int i;
+
+ while (ret > 0) {
+ ret = epoll_wait(epoll_fd, evs, tests, 0);
+ if (ret < 0) {
+ if (errno == EINTR)
+ continue;
+ ksft_print_msg("epoll_wait() failed: %s (%d)\n",
+ strerror(errno), errno);
+ }
+
+ for (i = 0; i < ret; i++)
+ child_output(evs[i].data.ptr, evs[i].events, flush);
+ }
+}
+
+static const struct option options[] = {
+ { "timeout", required_argument, NULL, 't' },
+ { }
+};
+
+int main(int argc, char **argv)
+{
+ int seen_children;
+ bool all_children_started = false;
+ int gcs_threads;
+ int timeout = 10;
+ int ret, cpus, i, c;
+ struct sigaction sa;
+
+ while ((c = getopt_long(argc, argv, "t:", options, NULL)) != -1) {
+ switch (c) {
+ case 't':
+ ret = sscanf(optarg, "%d", &timeout);
+ if (ret != 1)
+ ksft_exit_fail_msg("Failed to parse timeout %s\n",
+ optarg);
+ break;
+ default:
+ ksft_exit_fail_msg("Unknown argument\n");
+ }
+ }
+
+ cpus = num_processors();
+ tests = 0;
+
+ if (getauxval(AT_HWCAP) & HWCAP_GCS) {
+ /* One extra thread, trying to trigger migrations */
+ gcs_threads = cpus + 1;
+ tests += gcs_threads;
+ } else {
+ gcs_threads = 0;
+ }
+
+ ksft_print_header();
+ ksft_set_plan(tests);
+
+ ksft_print_msg("%d CPUs, %d GCS threads\n",
+ cpus, gcs_threads);
+
+ if (!tests)
+ ksft_exit_skip("No tests scheduled\n");
+
+ if (timeout > 0)
+ ksft_print_msg("Will run for %ds\n", timeout);
+ else
+ ksft_print_msg("Will run until terminated\n");
+
+ children = calloc(sizeof(*children), tests);
+ if (!children)
+ ksft_exit_fail_msg("Unable to allocate child data\n");
+
+ ret = epoll_create1(EPOLL_CLOEXEC);
+ if (ret < 0)
+ ksft_exit_fail_msg("epoll_create1() failed: %s (%d)\n",
+ strerror(errno), ret);
+ epoll_fd = ret;
+
+ /* Create a pipe which children will block on before execing */
+ ret = pipe(startup_pipe);
+ if (ret != 0)
+ ksft_exit_fail_msg("Failed to create startup pipe: %s (%d)\n",
+ strerror(errno), errno);
+
+ /* Get signal handers ready before we start any children */
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handle_exit_signal;
+ sa.sa_flags = SA_RESTART | SA_SIGINFO;
+ sigemptyset(&sa.sa_mask);
+ ret = sigaction(SIGINT, &sa, NULL);
+ if (ret < 0)
+ ksft_print_msg("Failed to install SIGINT handler: %s (%d)\n",
+ strerror(errno), errno);
+ ret = sigaction(SIGTERM, &sa, NULL);
+ if (ret < 0)
+ ksft_print_msg("Failed to install SIGTERM handler: %s (%d)\n",
+ strerror(errno), errno);
+ sa.sa_sigaction = handle_child_signal;
+ ret = sigaction(SIGCHLD, &sa, NULL);
+ if (ret < 0)
+ ksft_print_msg("Failed to install SIGCHLD handler: %s (%d)\n",
+ strerror(errno), errno);
+
+ evs = calloc(tests, sizeof(*evs));
+ if (!evs)
+ ksft_exit_fail_msg("Failed to allocated %d epoll events\n",
+ tests);
+
+ for (i = 0; i < gcs_threads; i++)
+ start_thread(&children[i], i);
+
+ /*
+ * All children started, close the startup pipe and let them
+ * run.
+ */
+ close(startup_pipe[0]);
+ close(startup_pipe[1]);
+
+ timeout *= 10;
+ for (;;) {
+ /* Did we get a signal asking us to exit? */
+ if (terminate)
+ break;
+
+ /*
+ * Timeout is counted in 100ms with no output, the
+ * tests print during startup then are silent when
+ * running so this should ensure they all ran enough
+ * to install the signal handler, this is especially
+ * useful in emulation where we will both be slow and
+ * likely to have a large set of VLs.
+ */
+ ret = epoll_wait(epoll_fd, evs, tests, 100);
+ if (ret < 0) {
+ if (errno == EINTR)
+ continue;
+ ksft_exit_fail_msg("epoll_wait() failed: %s (%d)\n",
+ strerror(errno), errno);
+ }
+
+ /* Output? */
+ if (ret > 0) {
+ for (i = 0; i < ret; i++) {
+ child_output(evs[i].data.ptr, evs[i].events,
+ false);
+ }
+ continue;
+ }
+
+ /* Otherwise epoll_wait() timed out */
+
+ /*
+ * If the child processes have not produced output they
+ * aren't actually running the tests yet.
+ */
+ if (!all_children_started) {
+ seen_children = 0;
+
+ for (i = 0; i < num_children; i++)
+ if (children[i].output_seen ||
+ children[i].exited)
+ seen_children++;
+
+ if (seen_children != num_children) {
+ ksft_print_msg("Waiting for %d children\n",
+ num_children - seen_children);
+ continue;
+ }
+
+ all_children_started = true;
+ }
+
+ ksft_print_msg("Sending signals, timeout remaining: %d00ms\n",
+ timeout);
+
+ for (i = 0; i < num_children; i++)
+ child_tickle(&children[i]);
+
+ /* Negative timeout means run indefinitely */
+ if (timeout < 0)
+ continue;
+ if (--timeout == 0)
+ break;
+ }
+
+ ksft_print_msg("Finishing up...\n");
+ terminate = true;
+
+ for (i = 0; i < tests; i++)
+ child_stop(&children[i]);
+
+ drain_output(false);
+
+ for (i = 0; i < tests; i++)
+ child_cleanup(&children[i]);
+
+ drain_output(true);
+
+ ksft_finished();
+}
diff --git a/tools/testing/selftests/arm64/gcs/gcs-util.h b/tools/testing/selftests/arm64/gcs/gcs-util.h
new file mode 100644
index 0000000..c99a6b3
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcs-util.h
@@ -0,0 +1,100 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 ARM Limited.
+ */
+
+#ifndef GCS_UTIL_H
+#define GCS_UTIL_H
+
+#include <stdbool.h>
+
+#ifndef __NR_map_shadow_stack
+#define __NR_map_shadow_stack 453
+#endif
+
+#ifndef __NR_prctl
+#define __NR_prctl 167
+#endif
+
+#ifndef NT_ARM_GCS
+#define NT_ARM_GCS 0x410
+
+struct user_gcs {
+ __u64 features_enabled;
+ __u64 features_locked;
+ __u64 gcspr_el0;
+};
+#endif
+
+/* Shadow Stack/Guarded Control Stack interface */
+#define PR_GET_SHADOW_STACK_STATUS 74
+#define PR_SET_SHADOW_STACK_STATUS 75
+#define PR_LOCK_SHADOW_STACK_STATUS 76
+
+# define PR_SHADOW_STACK_ENABLE (1UL << 0)
+# define PR_SHADOW_STACK_WRITE (1UL << 1)
+# define PR_SHADOW_STACK_PUSH (1UL << 2)
+
+#define PR_SHADOW_STACK_ALL_MODES \
+ PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE | PR_SHADOW_STACK_PUSH
+
+#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */
+#define SHADOW_STACK_SET_MARKER (1ULL << 1) /* Set up a top of stack merker in the shadow stack */
+
+#define GCS_CAP_ADDR_MASK (0xfffffffffffff000UL)
+#define GCS_CAP_TOKEN_MASK (0x0000000000000fffUL)
+#define GCS_CAP_VALID_TOKEN 1
+#define GCS_CAP_IN_PROGRESS_TOKEN 5
+
+#define GCS_CAP(x) (((unsigned long)(x) & GCS_CAP_ADDR_MASK) | \
+ GCS_CAP_VALID_TOKEN)
+
+static inline unsigned long *get_gcspr(void)
+{
+ unsigned long *gcspr;
+
+ asm volatile(
+ "mrs %0, S3_3_C2_C5_1"
+ : "=r" (gcspr)
+ :
+ : "cc");
+
+ return gcspr;
+}
+
+static inline void __attribute__((always_inline)) gcsss1(unsigned long *Xt)
+{
+ asm volatile (
+ "sys #3, C7, C7, #2, %0\n"
+ :
+ : "rZ" (Xt)
+ : "memory");
+}
+
+static inline unsigned long __attribute__((always_inline)) *gcsss2(void)
+{
+ unsigned long *Xt;
+
+ asm volatile(
+ "SYSL %0, #3, C7, C7, #3\n"
+ : "=r" (Xt)
+ :
+ : "memory");
+
+ return Xt;
+}
+
+static inline bool chkfeat_gcs(void)
+{
+ register long val __asm__ ("x16") = 1;
+
+ /* CHKFEAT x16 */
+ asm volatile(
+ "hint #0x28\n"
+ : "=r" (val)
+ : "r" (val));
+
+ return val != 1;
+}
+
+#endif
diff --git a/tools/testing/selftests/arm64/gcs/gcspushm.S b/tools/testing/selftests/arm64/gcs/gcspushm.S
new file mode 100644
index 0000000..bbe17c1
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcspushm.S
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Copyright 2024 Arm Limited
+//
+// Give ourselves GCS push permissions then use them
+
+#include <asm/unistd.h>
+
+/* Shadow Stack/Guarded Control Stack interface */
+#define PR_GET_SHADOW_STACK_STATUS 74
+#define PR_SET_SHADOW_STACK_STATUS 75
+#define PR_LOCK_SHADOW_STACK_STATUS 76
+
+# define PR_SHADOW_STACK_ENABLE (1UL << 0)
+# define PR_SHADOW_STACK_WRITE (1UL << 1)
+# define PR_SHADOW_STACK_PUSH (1UL << 2)
+
+#define KSFT_SKIP 4
+
+.macro function name
+ .macro endfunction
+ .type \name, @function
+ .purgem endfunction
+ .endm
+\name:
+.endm
+
+// Print a single character x0 to stdout
+// Clobbers x0-x2,x8
+function putc
+ str x0, [sp, #-16]!
+
+ mov x0, #1 // STDOUT_FILENO
+ mov x1, sp
+ mov x2, #1
+ mov x8, #__NR_write
+ svc #0
+
+ add sp, sp, #16
+ ret
+endfunction
+.globl putc
+
+// Print a NUL-terminated string starting at address x0 to stdout
+// Clobbers x0-x3,x8
+function puts
+ mov x1, x0
+
+ mov x2, #0
+0: ldrb w3, [x0], #1
+ cbz w3, 1f
+ add x2, x2, #1
+ b 0b
+
+1: mov w0, #1 // STDOUT_FILENO
+ mov x8, #__NR_write
+ svc #0
+
+ ret
+endfunction
+.globl puts
+
+// Utility macro to print a literal string
+// Clobbers x0-x4,x8
+.macro puts string
+ .pushsection .rodata.str1.1, "aMS", @progbits, 1
+.L__puts_literal\@: .string "\string"
+ .popsection
+
+ ldr x0, =.L__puts_literal\@
+ bl puts
+.endm
+
+.globl _start
+function _start
+ // Run with GCS
+ mov x0, PR_SET_SHADOW_STACK_STATUS
+ mov x1, PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_PUSH
+ mov x2, xzr
+ mov x3, xzr
+ mov x4, xzr
+ mov x5, xzr
+ mov x8, #__NR_prctl
+ svc #0
+ cbz x0, 1f
+ puts "Failed to enable GCS with push permission\n"
+ mov x0, #KSFT_SKIP
+ b 2f
+1:
+ sys #3, c7, c7, #0, x0 // GCSPUSHM
+ sysl x0, #3, c7, c7, #1 // GCSPOPM
+
+ mov x0, #0
+2:
+ mov x8, #__NR_exit
+ svc #0
diff --git a/tools/testing/selftests/arm64/gcs/gcsstr.S b/tools/testing/selftests/arm64/gcs/gcsstr.S
new file mode 100644
index 0000000..a42bba6
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/gcsstr.S
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Copyright 2024 Arm Limited
+//
+// Give ourselves GCS write permissions then use them
+
+#include <asm/unistd.h>
+
+/* Shadow Stack/Guarded Control Stack interface */
+#define PR_GET_SHADOW_STACK_STATUS 74
+#define PR_SET_SHADOW_STACK_STATUS 75
+#define PR_LOCK_SHADOW_STACK_STATUS 76
+
+# define PR_SHADOW_STACK_ENABLE (1UL << 0)
+# define PR_SHADOW_STACK_WRITE (1UL << 1)
+# define PR_SHADOW_STACK_PUSH (1UL << 2)
+
+#define GCSPR_EL0 S3_3_C2_C5_1
+
+#define KSFT_SKIP 4
+
+.macro function name
+ .macro endfunction
+ .type \name, @function
+ .purgem endfunction
+ .endm
+\name:
+.endm
+
+// Print a single character x0 to stdout
+// Clobbers x0-x2,x8
+function putc
+ str x0, [sp, #-16]!
+
+ mov x0, #1 // STDOUT_FILENO
+ mov x1, sp
+ mov x2, #1
+ mov x8, #__NR_write
+ svc #0
+
+ add sp, sp, #16
+ ret
+endfunction
+.globl putc
+
+// Print a NUL-terminated string starting at address x0 to stdout
+// Clobbers x0-x3,x8
+function puts
+ mov x1, x0
+
+ mov x2, #0
+0: ldrb w3, [x0], #1
+ cbz w3, 1f
+ add x2, x2, #1
+ b 0b
+
+1: mov w0, #1 // STDOUT_FILENO
+ mov x8, #__NR_write
+ svc #0
+
+ ret
+endfunction
+.globl puts
+
+// Utility macro to print a literal string
+// Clobbers x0-x4,x8
+.macro puts string
+ .pushsection .rodata.str1.1, "aMS", @progbits, 1
+.L__puts_literal\@: .string "\string"
+ .popsection
+
+ ldr x0, =.L__puts_literal\@
+ bl puts
+.endm
+
+.globl _start
+function _start
+ // Run with GCS
+ mov x0, PR_SET_SHADOW_STACK_STATUS
+ mov x1, PR_SHADOW_STACK_ENABLE | PR_SHADOW_STACK_WRITE
+ mov x2, xzr
+ mov x3, xzr
+ mov x4, xzr
+ mov x5, xzr
+ mov x8, #__NR_prctl
+ svc #0
+ cbz x0, 1f
+ puts "Failed to enable GCS with write permission\n"
+ mov x0, #KSFT_SKIP
+ b 2f
+1:
+ mrs x0, GCSPR_EL0
+ sub x0, x0, #8
+ .inst 0xd91f1c01 // GCSSTR x1, x0
+
+ mov x0, #0
+2:
+ mov x8, #__NR_exit
+ svc #0
diff --git a/tools/testing/selftests/arm64/gcs/libc-gcs.c b/tools/testing/selftests/arm64/gcs/libc-gcs.c
new file mode 100644
index 0000000..17b2fab
--- /dev/null
+++ b/tools/testing/selftests/arm64/gcs/libc-gcs.c
@@ -0,0 +1,728 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Limited.
+ */
+
+#define _GNU_SOURCE
+
+#include <pthread.h>
+#include <stdbool.h>
+
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/ptrace.h>
+#include <sys/uio.h>
+
+#include <asm/hwcap.h>
+#include <asm/mman.h>
+
+#include <linux/compiler.h>
+
+#include "kselftest_harness.h"
+
+#include "gcs-util.h"
+
+#define my_syscall2(num, arg1, arg2) \
+({ \
+ register long _num __asm__ ("x8") = (num); \
+ register long _arg1 __asm__ ("x0") = (long)(arg1); \
+ register long _arg2 __asm__ ("x1") = (long)(arg2); \
+ register long _arg3 __asm__ ("x2") = 0; \
+ register long _arg4 __asm__ ("x3") = 0; \
+ register long _arg5 __asm__ ("x4") = 0; \
+ \
+ __asm__ volatile ( \
+ "svc #0\n" \
+ : "=r"(_arg1) \
+ : "r"(_arg1), "r"(_arg2), \
+ "r"(_arg3), "r"(_arg4), \
+ "r"(_arg5), "r"(_num) \
+ : "memory", "cc" \
+ ); \
+ _arg1; \
+})
+
+static noinline void gcs_recurse(int depth)
+{
+ if (depth)
+ gcs_recurse(depth - 1);
+
+ /* Prevent tail call optimization so we actually recurse */
+ asm volatile("dsb sy" : : : "memory");
+}
+
+/* Smoke test that a function call and return works*/
+TEST(can_call_function)
+{
+ gcs_recurse(0);
+}
+
+static void *gcs_test_thread(void *arg)
+{
+ int ret;
+ unsigned long mode;
+
+ /*
+ * Some libcs don't seem to fill unused arguments with 0 but
+ * the kernel validates this so we supply all 5 arguments.
+ */
+ ret = prctl(PR_GET_SHADOW_STACK_STATUS, &mode, 0, 0, 0);
+ if (ret != 0) {
+ ksft_print_msg("PR_GET_SHADOW_STACK_STATUS failed: %d\n", ret);
+ return NULL;
+ }
+
+ if (!(mode & PR_SHADOW_STACK_ENABLE)) {
+ ksft_print_msg("GCS not enabled in thread, mode is %lu\n",
+ mode);
+ return NULL;
+ }
+
+ /* Just in case... */
+ gcs_recurse(0);
+
+ /* Use a non-NULL value to indicate a pass */
+ return &gcs_test_thread;
+}
+
+/* Verify that if we start a new thread it has GCS enabled */
+TEST(gcs_enabled_thread)
+{
+ pthread_t thread;
+ void *thread_ret;
+ int ret;
+
+ ret = pthread_create(&thread, NULL, gcs_test_thread, NULL);
+ ASSERT_TRUE(ret == 0);
+ if (ret != 0)
+ return;
+
+ ret = pthread_join(thread, &thread_ret);
+ ASSERT_TRUE(ret == 0);
+ if (ret != 0)
+ return;
+
+ ASSERT_TRUE(thread_ret != NULL);
+}
+
+/* Read the GCS until we find the terminator */
+TEST(gcs_find_terminator)
+{
+ unsigned long *gcs, *cur;
+
+ gcs = get_gcspr();
+ cur = gcs;
+ while (*cur)
+ cur++;
+
+ ksft_print_msg("GCS in use from %p-%p\n", gcs, cur);
+
+ /*
+ * We should have at least whatever called into this test so
+ * the two pointer should differ.
+ */
+ ASSERT_TRUE(gcs != cur);
+}
+
+/*
+ * We can access a GCS via ptrace
+ *
+ * This could usefully have a fixture but note that each test is
+ * fork()ed into a new child whcih causes issues. Might be better to
+ * lift at least some of this out into a separate, non-harness, test
+ * program.
+ */
+TEST(ptrace_read_write)
+{
+ pid_t child, pid;
+ int ret, status;
+ siginfo_t si;
+ uint64_t val, rval, gcspr;
+ struct user_gcs child_gcs;
+ struct iovec iov, local_iov, remote_iov;
+
+ child = fork();
+ if (child == -1) {
+ ksft_print_msg("fork() failed: %d (%s)\n",
+ errno, strerror(errno));
+ ASSERT_NE(child, -1);
+ }
+
+ if (child == 0) {
+ /*
+ * In child, make sure there's something on the stack and
+ * ask to be traced.
+ */
+ gcs_recurse(0);
+ if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
+ ksft_exit_fail_msg("PTRACE_TRACEME %s",
+ strerror(errno));
+
+ if (raise(SIGSTOP))
+ ksft_exit_fail_msg("raise(SIGSTOP) %s",
+ strerror(errno));
+
+ return;
+ }
+
+ ksft_print_msg("Child: %d\n", child);
+
+ /* Attach to the child */
+ while (1) {
+ int sig;
+
+ pid = wait(&status);
+ if (pid == -1) {
+ ksft_print_msg("wait() failed: %s",
+ strerror(errno));
+ goto error;
+ }
+
+ /*
+ * This should never happen but it's hard to flag in
+ * the framework.
+ */
+ if (pid != child)
+ continue;
+
+ if (WIFEXITED(status) || WIFSIGNALED(status))
+ ksft_exit_fail_msg("Child died unexpectedly\n");
+
+ if (!WIFSTOPPED(status))
+ goto error;
+
+ sig = WSTOPSIG(status);
+
+ if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si)) {
+ if (errno == ESRCH) {
+ ASSERT_NE(errno, ESRCH);
+ return;
+ }
+
+ if (errno == EINVAL) {
+ sig = 0; /* bust group-stop */
+ goto cont;
+ }
+
+ ksft_print_msg("PTRACE_GETSIGINFO: %s\n",
+ strerror(errno));
+ goto error;
+ }
+
+ if (sig == SIGSTOP && si.si_code == SI_TKILL &&
+ si.si_pid == pid)
+ break;
+
+ cont:
+ if (ptrace(PTRACE_CONT, pid, NULL, sig)) {
+ if (errno == ESRCH) {
+ ASSERT_NE(errno, ESRCH);
+ return;
+ }
+
+ ksft_print_msg("PTRACE_CONT: %s\n", strerror(errno));
+ goto error;
+ }
+ }
+
+ /* Where is the child GCS? */
+ iov.iov_base = &child_gcs;
+ iov.iov_len = sizeof(child_gcs);
+ ret = ptrace(PTRACE_GETREGSET, child, NT_ARM_GCS, &iov);
+ if (ret != 0) {
+ ksft_print_msg("Failed to read child GCS state: %s (%d)\n",
+ strerror(errno), errno);
+ goto error;
+ }
+
+ /* We should have inherited GCS over fork(), confirm */
+ if (!(child_gcs.features_enabled & PR_SHADOW_STACK_ENABLE)) {
+ ASSERT_TRUE(child_gcs.features_enabled &
+ PR_SHADOW_STACK_ENABLE);
+ goto error;
+ }
+
+ gcspr = child_gcs.gcspr_el0;
+ ksft_print_msg("Child GCSPR 0x%lx, flags %llx, locked %llx\n",
+ gcspr, child_gcs.features_enabled,
+ child_gcs.features_locked);
+
+ /* Ideally we'd cross check with the child memory map */
+
+ errno = 0;
+ val = ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL);
+ ret = errno;
+ if (ret != 0)
+ ksft_print_msg("PTRACE_PEEKDATA failed: %s (%d)\n",
+ strerror(ret), ret);
+ EXPECT_EQ(ret, 0);
+
+ /* The child should be in a function, the GCSPR shouldn't be 0 */
+ EXPECT_NE(val, 0);
+
+ /* Same thing via process_vm_readv() */
+ local_iov.iov_base = &rval;
+ local_iov.iov_len = sizeof(rval);
+ remote_iov.iov_base = (void *)gcspr;
+ remote_iov.iov_len = sizeof(rval);
+ ret = process_vm_readv(child, &local_iov, 1, &remote_iov, 1, 0);
+ if (ret == -1)
+ ksft_print_msg("process_vm_readv() failed: %s (%d)\n",
+ strerror(errno), errno);
+ EXPECT_EQ(ret, sizeof(rval));
+ EXPECT_EQ(val, rval);
+
+ /* Write data via a peek */
+ ret = ptrace(PTRACE_POKEDATA, child, (void *)gcspr, NULL);
+ if (ret == -1)
+ ksft_print_msg("PTRACE_POKEDATA failed: %s (%d)\n",
+ strerror(errno), errno);
+ EXPECT_EQ(ret, 0);
+ EXPECT_EQ(0, ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL));
+
+ /* Restore what we had before */
+ ret = ptrace(PTRACE_POKEDATA, child, (void *)gcspr, val);
+ if (ret == -1)
+ ksft_print_msg("PTRACE_POKEDATA failed: %s (%d)\n",
+ strerror(errno), errno);
+ EXPECT_EQ(ret, 0);
+ EXPECT_EQ(val, ptrace(PTRACE_PEEKDATA, child, (void *)gcspr, NULL));
+
+ /* That's all, folks */
+ kill(child, SIGKILL);
+ return;
+
+error:
+ kill(child, SIGKILL);
+ ASSERT_FALSE(true);
+}
+
+FIXTURE(map_gcs)
+{
+ unsigned long *stack;
+};
+
+FIXTURE_VARIANT(map_gcs)
+{
+ size_t stack_size;
+ unsigned long flags;
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s2k_cap_marker)
+{
+ .stack_size = 2 * 1024,
+ .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s2k_cap)
+{
+ .stack_size = 2 * 1024,
+ .flags = SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s2k_marker)
+{
+ .stack_size = 2 * 1024,
+ .flags = SHADOW_STACK_SET_MARKER,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s2k)
+{
+ .stack_size = 2 * 1024,
+ .flags = 0,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s4k_cap_marker)
+{
+ .stack_size = 4 * 1024,
+ .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s4k_cap)
+{
+ .stack_size = 4 * 1024,
+ .flags = SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s3k_marker)
+{
+ .stack_size = 4 * 1024,
+ .flags = SHADOW_STACK_SET_MARKER,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s4k)
+{
+ .stack_size = 4 * 1024,
+ .flags = 0,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s16k_cap_marker)
+{
+ .stack_size = 16 * 1024,
+ .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s16k_cap)
+{
+ .stack_size = 16 * 1024,
+ .flags = SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s16k_marker)
+{
+ .stack_size = 16 * 1024,
+ .flags = SHADOW_STACK_SET_MARKER,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s16k)
+{
+ .stack_size = 16 * 1024,
+ .flags = 0,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s64k_cap_marker)
+{
+ .stack_size = 64 * 1024,
+ .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s64k_cap)
+{
+ .stack_size = 64 * 1024,
+ .flags = SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s64k_marker)
+{
+ .stack_size = 64 * 1024,
+ .flags = SHADOW_STACK_SET_MARKER,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s64k)
+{
+ .stack_size = 64 * 1024,
+ .flags = 0,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s128k_cap_marker)
+{
+ .stack_size = 128 * 1024,
+ .flags = SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s128k_cap)
+{
+ .stack_size = 128 * 1024,
+ .flags = SHADOW_STACK_SET_TOKEN,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s128k_marker)
+{
+ .stack_size = 128 * 1024,
+ .flags = SHADOW_STACK_SET_MARKER,
+};
+
+FIXTURE_VARIANT_ADD(map_gcs, s128k)
+{
+ .stack_size = 128 * 1024,
+ .flags = 0,
+};
+
+FIXTURE_SETUP(map_gcs)
+{
+ self->stack = (void *)syscall(__NR_map_shadow_stack, 0,
+ variant->stack_size,
+ variant->flags);
+ ASSERT_FALSE(self->stack == MAP_FAILED);
+ ksft_print_msg("Allocated stack from %p-%p\n", self->stack,
+ self->stack + variant->stack_size);
+}
+
+FIXTURE_TEARDOWN(map_gcs)
+{
+ int ret;
+
+ if (self->stack != MAP_FAILED) {
+ ret = munmap(self->stack, variant->stack_size);
+ ASSERT_EQ(ret, 0);
+ }
+}
+
+/* The stack has a cap token */
+TEST_F(map_gcs, stack_capped)
+{
+ unsigned long *stack = self->stack;
+ size_t cap_index;
+
+ cap_index = (variant->stack_size / sizeof(unsigned long));
+
+ switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) {
+ case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN:
+ cap_index -= 2;
+ break;
+ case SHADOW_STACK_SET_TOKEN:
+ cap_index -= 1;
+ break;
+ case SHADOW_STACK_SET_MARKER:
+ case 0:
+ /* No cap, no test */
+ return;
+ }
+
+ ASSERT_EQ(stack[cap_index], GCS_CAP(&stack[cap_index]));
+}
+
+/* The top of the stack is 0 */
+TEST_F(map_gcs, stack_terminated)
+{
+ unsigned long *stack = self->stack;
+ size_t term_index;
+
+ if (!(variant->flags & SHADOW_STACK_SET_MARKER))
+ return;
+
+ term_index = (variant->stack_size / sizeof(unsigned long)) - 1;
+
+ ASSERT_EQ(stack[term_index], 0);
+}
+
+/* Writes should fault */
+TEST_F_SIGNAL(map_gcs, not_writeable, SIGSEGV)
+{
+ self->stack[0] = 0;
+}
+
+/* Put it all together, we can safely switch to and from the stack */
+TEST_F(map_gcs, stack_switch)
+{
+ size_t cap_index;
+ cap_index = (variant->stack_size / sizeof(unsigned long));
+ unsigned long *orig_gcspr_el0, *pivot_gcspr_el0;
+
+ /* Skip over the stack terminator and point at the cap */
+ switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) {
+ case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN:
+ cap_index -= 2;
+ break;
+ case SHADOW_STACK_SET_TOKEN:
+ cap_index -= 1;
+ break;
+ case SHADOW_STACK_SET_MARKER:
+ case 0:
+ /* No cap, no test */
+ return;
+ }
+ pivot_gcspr_el0 = &self->stack[cap_index];
+
+ /* Pivot to the new GCS */
+ ksft_print_msg("Pivoting to %p from %p, target has value 0x%lx\n",
+ pivot_gcspr_el0, get_gcspr(),
+ *pivot_gcspr_el0);
+ gcsss1(pivot_gcspr_el0);
+ orig_gcspr_el0 = gcsss2();
+ ksft_print_msg("Pivoted to %p from %p, target has value 0x%lx\n",
+ get_gcspr(), orig_gcspr_el0,
+ *pivot_gcspr_el0);
+
+ ksft_print_msg("Pivoted, GCSPR_EL0 now %p\n", get_gcspr());
+
+ /* New GCS must be in the new buffer */
+ ASSERT_TRUE((unsigned long)get_gcspr() > (unsigned long)self->stack);
+ ASSERT_TRUE((unsigned long)get_gcspr() <=
+ (unsigned long)self->stack + variant->stack_size);
+
+ /* We should be able to use all but 2 slots of the new stack */
+ ksft_print_msg("Recursing %zu levels\n", cap_index - 1);
+ gcs_recurse(cap_index - 1);
+
+ /* Pivot back to the original GCS */
+ gcsss1(orig_gcspr_el0);
+ pivot_gcspr_el0 = gcsss2();
+
+ gcs_recurse(0);
+ ksft_print_msg("Pivoted back to GCSPR_EL0 0x%p\n", get_gcspr());
+}
+
+/* We fault if we try to go beyond the end of the stack */
+TEST_F_SIGNAL(map_gcs, stack_overflow, SIGSEGV)
+{
+ size_t cap_index;
+ cap_index = (variant->stack_size / sizeof(unsigned long));
+ unsigned long *orig_gcspr_el0, *pivot_gcspr_el0;
+
+ /* Skip over the stack terminator and point at the cap */
+ switch (variant->flags & (SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN)) {
+ case SHADOW_STACK_SET_MARKER | SHADOW_STACK_SET_TOKEN:
+ cap_index -= 2;
+ break;
+ case SHADOW_STACK_SET_TOKEN:
+ cap_index -= 1;
+ break;
+ case SHADOW_STACK_SET_MARKER:
+ case 0:
+ /* No cap, no test but we need to SEGV to avoid a false fail */
+ orig_gcspr_el0 = get_gcspr();
+ *orig_gcspr_el0 = 0;
+ return;
+ }
+ pivot_gcspr_el0 = &self->stack[cap_index];
+
+ /* Pivot to the new GCS */
+ ksft_print_msg("Pivoting to %p from %p, target has value 0x%lx\n",
+ pivot_gcspr_el0, get_gcspr(),
+ *pivot_gcspr_el0);
+ gcsss1(pivot_gcspr_el0);
+ orig_gcspr_el0 = gcsss2();
+ ksft_print_msg("Pivoted to %p from %p, target has value 0x%lx\n",
+ pivot_gcspr_el0, orig_gcspr_el0,
+ *pivot_gcspr_el0);
+
+ ksft_print_msg("Pivoted, GCSPR_EL0 now %p\n", get_gcspr());
+
+ /* New GCS must be in the new buffer */
+ ASSERT_TRUE((unsigned long)get_gcspr() > (unsigned long)self->stack);
+ ASSERT_TRUE((unsigned long)get_gcspr() <=
+ (unsigned long)self->stack + variant->stack_size);
+
+ /* Now try to recurse, we should fault doing this. */
+ ksft_print_msg("Recursing %zu levels...\n", cap_index + 1);
+ gcs_recurse(cap_index + 1);
+ ksft_print_msg("...done\n");
+
+ /* Clean up properly to try to guard against spurious passes. */
+ gcsss1(orig_gcspr_el0);
+ pivot_gcspr_el0 = gcsss2();
+ ksft_print_msg("Pivoted back to GCSPR_EL0 0x%p\n", get_gcspr());
+}
+
+FIXTURE(map_invalid_gcs)
+{
+};
+
+FIXTURE_VARIANT(map_invalid_gcs)
+{
+ size_t stack_size;
+};
+
+FIXTURE_SETUP(map_invalid_gcs)
+{
+}
+
+FIXTURE_TEARDOWN(map_invalid_gcs)
+{
+}
+
+/* GCS must be larger than 16 bytes */
+FIXTURE_VARIANT_ADD(map_invalid_gcs, too_small)
+{
+ .stack_size = 8,
+};
+
+/* GCS size must be 16 byte aligned */
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_1) { .stack_size = 1024 + 1 };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_2) { .stack_size = 1024 + 2 };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_3) { .stack_size = 1024 + 3 };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_4) { .stack_size = 1024 + 4 };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_5) { .stack_size = 1024 + 5 };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_6) { .stack_size = 1024 + 6 };
+FIXTURE_VARIANT_ADD(map_invalid_gcs, unligned_7) { .stack_size = 1024 + 7 };
+
+TEST_F(map_invalid_gcs, do_map)
+{
+ void *stack;
+
+ stack = (void *)syscall(__NR_map_shadow_stack, 0,
+ variant->stack_size, 0);
+ ASSERT_TRUE(stack == MAP_FAILED);
+ if (stack != MAP_FAILED)
+ munmap(stack, variant->stack_size);
+}
+
+FIXTURE(invalid_mprotect)
+{
+ unsigned long *stack;
+ size_t stack_size;
+};
+
+FIXTURE_VARIANT(invalid_mprotect)
+{
+ unsigned long flags;
+};
+
+FIXTURE_SETUP(invalid_mprotect)
+{
+ self->stack_size = sysconf(_SC_PAGE_SIZE);
+ self->stack = (void *)syscall(__NR_map_shadow_stack, 0,
+ self->stack_size, 0);
+ ASSERT_FALSE(self->stack == MAP_FAILED);
+ ksft_print_msg("Allocated stack from %p-%p\n", self->stack,
+ self->stack + self->stack_size);
+}
+
+FIXTURE_TEARDOWN(invalid_mprotect)
+{
+ int ret;
+
+ if (self->stack != MAP_FAILED) {
+ ret = munmap(self->stack, self->stack_size);
+ ASSERT_EQ(ret, 0);
+ }
+}
+
+FIXTURE_VARIANT_ADD(invalid_mprotect, exec)
+{
+ .flags = PROT_EXEC,
+};
+
+TEST_F(invalid_mprotect, do_map)
+{
+ int ret;
+
+ ret = mprotect(self->stack, self->stack_size, variant->flags);
+ ASSERT_EQ(ret, -1);
+}
+
+TEST_F(invalid_mprotect, do_map_read)
+{
+ int ret;
+
+ ret = mprotect(self->stack, self->stack_size,
+ variant->flags | PROT_READ);
+ ASSERT_EQ(ret, -1);
+}
+
+int main(int argc, char **argv)
+{
+ unsigned long gcs_mode;
+ int ret;
+
+ if (!(getauxval(AT_HWCAP) & HWCAP_GCS))
+ ksft_exit_skip("SKIP GCS not supported\n");
+
+ /*
+ * Force shadow stacks on, our tests *should* be fine with or
+ * without libc support and with or without this having ended
+ * up tagged for GCS and enabled by the dynamic linker. We
+ * can't use the libc prctl() function since we can't return
+ * from enabling the stack.
+ */
+ ret = my_syscall2(__NR_prctl, PR_GET_SHADOW_STACK_STATUS, &gcs_mode);
+ if (ret) {
+ ksft_print_msg("Failed to read GCS state: %d\n", ret);
+ return EXIT_FAILURE;
+ }
+
+ if (!(gcs_mode & PR_SHADOW_STACK_ENABLE)) {
+ gcs_mode = PR_SHADOW_STACK_ENABLE;
+ ret = my_syscall2(__NR_prctl, PR_SET_SHADOW_STACK_STATUS,
+ gcs_mode);
+ if (ret) {
+ ksft_print_msg("Failed to configure GCS: %d\n", ret);
+ return EXIT_FAILURE;
+ }
+ }
+
+ /* Avoid returning in case libc doesn't understand GCS */
+ exit(test_harness_run(argc, argv));
+}
diff --git a/tools/testing/selftests/arm64/mte/check_buffer_fill.c b/tools/testing/selftests/arm64/mte/check_buffer_fill.c
index 1dbbbd4..2ee7f11 100644
--- a/tools/testing/selftests/arm64/mte/check_buffer_fill.c
+++ b/tools/testing/selftests/arm64/mte/check_buffer_fill.c
@@ -91,7 +91,7 @@ static int check_buffer_underflow_by_byte(int mem_type, int mode,
for (j = 0; j < sizes[i]; j++) {
if (ptr[j] != '1') {
err = true;
- ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n",
+ ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%p\n",
j, ptr);
break;
}
@@ -189,7 +189,7 @@ static int check_buffer_overflow_by_byte(int mem_type, int mode,
for (j = 0; j < sizes[i]; j++) {
if (ptr[j] != '1') {
err = true;
- ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n",
+ ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%p\n",
j, ptr);
break;
}
diff --git a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c
new file mode 100644
index 0000000..303260a
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2024 Ampere Computing LLC
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "kselftest.h"
+#include "mte_common_util.h"
+#include "mte_def.h"
+
+#define TAG_CHECK_ON 0
+#define TAG_CHECK_OFF 1
+
+static unsigned long default_huge_page_size(void)
+{
+ unsigned long hps = 0;
+ char *line = NULL;
+ size_t linelen = 0;
+ FILE *f = fopen("/proc/meminfo", "r");
+
+ if (!f)
+ return 0;
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+ hps <<= 10;
+ break;
+ }
+ }
+
+ free(line);
+ fclose(f);
+ return hps;
+}
+
+static bool is_hugetlb_allocated(void)
+{
+ unsigned long hps = 0;
+ char *line = NULL;
+ size_t linelen = 0;
+ FILE *f = fopen("/proc/meminfo", "r");
+
+ if (!f)
+ return false;
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "Hugetlb: %lu kB", &hps) == 1) {
+ hps <<= 10;
+ break;
+ }
+ }
+
+ free(line);
+ fclose(f);
+
+ if (hps > 0)
+ return true;
+
+ return false;
+}
+
+static void write_sysfs(char *str, unsigned long val)
+{
+ FILE *f;
+
+ f = fopen(str, "w");
+ if (!f) {
+ ksft_print_msg("ERR: missing %s\n", str);
+ return;
+ }
+ fprintf(f, "%lu", val);
+ fclose(f);
+}
+
+static void allocate_hugetlb()
+{
+ write_sysfs("/proc/sys/vm/nr_hugepages", 2);
+}
+
+static void free_hugetlb()
+{
+ write_sysfs("/proc/sys/vm/nr_hugepages", 0);
+}
+
+static int check_child_tag_inheritance(char *ptr, int size, int mode)
+{
+ int i, parent_tag, child_tag, fault, child_status;
+ pid_t child;
+
+ parent_tag = MT_FETCH_TAG((uintptr_t)ptr);
+ fault = 0;
+
+ child = fork();
+ if (child == -1) {
+ ksft_print_msg("FAIL: child process creation\n");
+ return KSFT_FAIL;
+ } else if (child == 0) {
+ mte_initialize_current_context(mode, (uintptr_t)ptr, size);
+ /* Do copy on write */
+ memset(ptr, '1', size);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid == true) {
+ fault = 1;
+ goto check_child_tag_inheritance_err;
+ }
+ for (i = 0; i < size; i += MT_GRANULE_SIZE) {
+ child_tag = MT_FETCH_TAG((uintptr_t)(mte_get_tag_address(ptr + i)));
+ if (parent_tag != child_tag) {
+ ksft_print_msg("FAIL: child mte tag (%d) mismatch\n", i);
+ fault = 1;
+ goto check_child_tag_inheritance_err;
+ }
+ }
+check_child_tag_inheritance_err:
+ _exit(fault);
+ }
+ /* Wait for child process to terminate */
+ wait(&child_status);
+ if (WIFEXITED(child_status))
+ fault = WEXITSTATUS(child_status);
+ else
+ fault = 1;
+ return (fault) ? KSFT_FAIL : KSFT_PASS;
+}
+
+static int check_mte_memory(char *ptr, int size, int mode, int tag_check)
+{
+ mte_initialize_current_context(mode, (uintptr_t)ptr, size);
+ memset(ptr, '1', size);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid == true)
+ return KSFT_FAIL;
+
+ return KSFT_PASS;
+}
+
+static int check_hugetlb_memory_mapping(int mem_type, int mode, int mapping, int tag_check)
+{
+ char *ptr, *map_ptr;
+ int result;
+ unsigned long map_size;
+
+ map_size = default_huge_page_size();
+
+ mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+ map_ptr = (char *)mte_allocate_memory(map_size, mem_type, mapping, false);
+ if (check_allocated_memory(map_ptr, map_size, mem_type, false) != KSFT_PASS)
+ return KSFT_FAIL;
+
+ mte_initialize_current_context(mode, (uintptr_t)map_ptr, map_size);
+ /* Only mte enabled memory will allow tag insertion */
+ ptr = mte_insert_tags((void *)map_ptr, map_size);
+ if (!ptr || cur_mte_cxt.fault_valid == true) {
+ ksft_print_msg("FAIL: Insert tags on anonymous mmap memory\n");
+ munmap((void *)map_ptr, map_size);
+ return KSFT_FAIL;
+ }
+ result = check_mte_memory(ptr, map_size, mode, tag_check);
+ mte_clear_tags((void *)ptr, map_size);
+ mte_free_memory((void *)map_ptr, map_size, mem_type, false);
+ if (result == KSFT_FAIL)
+ return KSFT_FAIL;
+
+ return KSFT_PASS;
+}
+
+static int check_clear_prot_mte_flag(int mem_type, int mode, int mapping)
+{
+ char *map_ptr;
+ int prot_flag, result;
+ unsigned long map_size;
+
+ prot_flag = PROT_READ | PROT_WRITE;
+ mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+ map_size = default_huge_page_size();
+ map_ptr = (char *)mte_allocate_memory_tag_range(map_size, mem_type, mapping,
+ 0, 0);
+ if (check_allocated_memory_range(map_ptr, map_size, mem_type,
+ 0, 0) != KSFT_PASS)
+ return KSFT_FAIL;
+ /* Try to clear PROT_MTE property and verify it by tag checking */
+ if (mprotect(map_ptr, map_size, prot_flag)) {
+ mte_free_memory_tag_range((void *)map_ptr, map_size, mem_type,
+ 0, 0);
+ ksft_print_msg("FAIL: mprotect not ignoring clear PROT_MTE property\n");
+ return KSFT_FAIL;
+ }
+ result = check_mte_memory(map_ptr, map_size, mode, TAG_CHECK_ON);
+ mte_free_memory_tag_range((void *)map_ptr, map_size, mem_type, 0, 0);
+ if (result != KSFT_PASS)
+ return KSFT_FAIL;
+
+ return KSFT_PASS;
+}
+
+static int check_child_hugetlb_memory_mapping(int mem_type, int mode, int mapping)
+{
+ char *ptr;
+ int result;
+ unsigned long map_size;
+
+ map_size = default_huge_page_size();
+
+ mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+ ptr = (char *)mte_allocate_memory_tag_range(map_size, mem_type, mapping,
+ 0, 0);
+ if (check_allocated_memory_range(ptr, map_size, mem_type,
+ 0, 0) != KSFT_PASS)
+ return KSFT_FAIL;
+ result = check_child_tag_inheritance(ptr, map_size, mode);
+ mte_free_memory_tag_range((void *)ptr, map_size, mem_type, 0, 0);
+ if (result == KSFT_FAIL)
+ return result;
+
+ return KSFT_PASS;
+}
+
+int main(int argc, char *argv[])
+{
+ int err;
+
+ err = mte_default_setup();
+ if (err)
+ return err;
+
+ /* Register signal handlers */
+ mte_register_signal(SIGBUS, mte_default_handler);
+ mte_register_signal(SIGSEGV, mte_default_handler);
+
+ allocate_hugetlb();
+
+ if (!is_hugetlb_allocated()) {
+ ksft_print_msg("ERR: Unable allocate hugetlb pages\n");
+ return KSFT_FAIL;
+ }
+
+ /* Set test plan */
+ ksft_set_plan(12);
+
+ mte_enable_pstate_tco();
+
+ evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_OFF),
+ "Check hugetlb memory with private mapping, sync error mode, mmap memory and tag check off\n");
+
+ mte_disable_pstate_tco();
+ evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_NONE_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_OFF),
+ "Check hugetlb memory with private mapping, no error mode, mmap memory and tag check off\n");
+
+ evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, sync error mode, mmap memory and tag check on\n");
+ evaluate_test(check_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, sync error mode, mmap/mprotect memory and tag check on\n");
+ evaluate_test(check_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, async error mode, mmap memory and tag check on\n");
+ evaluate_test(check_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB, TAG_CHECK_ON),
+ "Check hugetlb memory with private mapping, async error mode, mmap/mprotect memory and tag check on\n");
+
+ evaluate_test(check_clear_prot_mte_flag(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check clear PROT_MTE flags with private mapping, sync error mode and mmap memory\n");
+ evaluate_test(check_clear_prot_mte_flag(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check clear PROT_MTE flags with private mapping and sync error mode and mmap/mprotect memory\n");
+
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap memory\n");
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap memory\n");
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n");
+ evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB),
+ "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n");
+
+ mte_restore_setup();
+ free_hugetlb();
+ ksft_print_cnts();
+ return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
diff --git a/tools/testing/selftests/arm64/mte/check_prctl.c b/tools/testing/selftests/arm64/mte/check_prctl.c
index f139a33..4c89e95 100644
--- a/tools/testing/selftests/arm64/mte/check_prctl.c
+++ b/tools/testing/selftests/arm64/mte/check_prctl.c
@@ -85,7 +85,7 @@ void set_mode_test(const char *name, int hwcap2, int mask)
ksft_test_result_pass("%s\n", name);
} else {
ksft_print_msg("Got %x, expected %x\n",
- (ret & PR_MTE_TCF_MASK), mask);
+ (ret & (int)PR_MTE_TCF_MASK), mask);
ksft_test_result_fail("%s\n", name);
}
}
diff --git a/tools/testing/selftests/arm64/mte/check_tags_inclusion.c b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c
index 2b1425b..a3d1e23 100644
--- a/tools/testing/selftests/arm64/mte/check_tags_inclusion.c
+++ b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c
@@ -65,7 +65,7 @@ static int check_single_included_tags(int mem_type, int mode)
ptr = mte_insert_tags(ptr, BUFFER_SIZE);
/* Check tag value */
if (MT_FETCH_TAG((uintptr_t)ptr) == tag) {
- ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n",
+ ksft_print_msg("FAIL: wrong tag = 0x%lx with include mask=0x%x\n",
MT_FETCH_TAG((uintptr_t)ptr),
MT_INCLUDE_VALID_TAG(tag));
result = KSFT_FAIL;
@@ -97,7 +97,7 @@ static int check_multiple_included_tags(int mem_type, int mode)
ptr = mte_insert_tags(ptr, BUFFER_SIZE);
/* Check tag value */
if (MT_FETCH_TAG((uintptr_t)ptr) < tag) {
- ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n",
+ ksft_print_msg("FAIL: wrong tag = 0x%lx with include mask=0x%lx\n",
MT_FETCH_TAG((uintptr_t)ptr),
MT_INCLUDE_VALID_TAGS(excl_mask));
result = KSFT_FAIL;
diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c
index 00ffd34..a1dc2fe 100644
--- a/tools/testing/selftests/arm64/mte/mte_common_util.c
+++ b/tools/testing/selftests/arm64/mte/mte_common_util.c
@@ -38,7 +38,7 @@ void mte_default_handler(int signum, siginfo_t *si, void *uc)
if (cur_mte_cxt.trig_si_code == si->si_code)
cur_mte_cxt.fault_valid = true;
else
- ksft_print_msg("Got unexpected SEGV_MTEAERR at pc=$lx, fault addr=%lx\n",
+ ksft_print_msg("Got unexpected SEGV_MTEAERR at pc=%llx, fault addr=%lx\n",
((ucontext_t *)uc)->uc_mcontext.pc,
addr);
return;
@@ -64,7 +64,7 @@ void mte_default_handler(int signum, siginfo_t *si, void *uc)
exit(1);
}
} else if (signum == SIGBUS) {
- ksft_print_msg("INFO: SIGBUS signal at pc=%lx, fault addr=%lx, si_code=%lx\n",
+ ksft_print_msg("INFO: SIGBUS signal at pc=%llx, fault addr=%lx, si_code=%x\n",
((ucontext_t *)uc)->uc_mcontext.pc, addr, si->si_code);
if ((cur_mte_cxt.trig_range >= 0 &&
addr >= MT_CLEAR_TAG(cur_mte_cxt.trig_addr) &&
@@ -100,7 +100,7 @@ void *mte_insert_tags(void *ptr, size_t size)
int align_size;
if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) {
- ksft_print_msg("FAIL: Addr=%lx: invalid\n", ptr);
+ ksft_print_msg("FAIL: Addr=%p: invalid\n", ptr);
return NULL;
}
align_size = MT_ALIGN_UP(size);
@@ -112,7 +112,7 @@ void *mte_insert_tags(void *ptr, size_t size)
void mte_clear_tags(void *ptr, size_t size)
{
if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) {
- ksft_print_msg("FAIL: Addr=%lx: invalid\n", ptr);
+ ksft_print_msg("FAIL: Addr=%p: invalid\n", ptr);
return;
}
size = MT_ALIGN_UP(size);
@@ -150,13 +150,13 @@ static void *__mte_allocate_memory_range(size_t size, int mem_type, int mapping,
map_flag |= MAP_PRIVATE;
ptr = mmap(NULL, entire_size, prot_flag, map_flag, fd, 0);
if (ptr == MAP_FAILED) {
- ksft_print_msg("FAIL: mmap allocation\n");
+ ksft_perror("mmap()");
return NULL;
}
if (mem_type == USE_MPROTECT) {
if (mprotect(ptr, entire_size, prot_flag | PROT_MTE)) {
+ ksft_perror("mprotect(PROT_MTE)");
munmap(ptr, size);
- ksft_print_msg("FAIL: mprotect PROT_MTE property\n");
return NULL;
}
}
@@ -190,13 +190,13 @@ void *mte_allocate_file_memory(size_t size, int mem_type, int mapping, bool tags
lseek(fd, 0, SEEK_SET);
for (index = INIT_BUFFER_SIZE; index < size; index += INIT_BUFFER_SIZE) {
if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) {
- perror("initialising buffer");
+ ksft_perror("initialising buffer");
return NULL;
}
}
index -= INIT_BUFFER_SIZE;
if (write(fd, buffer, size - index) != size - index) {
- perror("initialising buffer");
+ ksft_perror("initialising buffer");
return NULL;
}
return __mte_allocate_memory_range(size, mem_type, mapping, 0, 0, tags, fd);
@@ -217,12 +217,12 @@ void *mte_allocate_file_memory_tag_range(size_t size, int mem_type, int mapping,
lseek(fd, 0, SEEK_SET);
for (index = INIT_BUFFER_SIZE; index < map_size; index += INIT_BUFFER_SIZE)
if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) {
- perror("initialising buffer");
+ ksft_perror("initialising buffer");
return NULL;
}
index -= INIT_BUFFER_SIZE;
if (write(fd, buffer, map_size - index) != map_size - index) {
- perror("initialising buffer");
+ ksft_perror("initialising buffer");
return NULL;
}
return __mte_allocate_memory_range(size, mem_type, mapping, range_before,
@@ -319,10 +319,9 @@ int mte_default_setup(void)
unsigned long en = 0;
int ret;
- if (!(hwcaps2 & HWCAP2_MTE)) {
- ksft_print_msg("SKIP: MTE features unavailable\n");
- return KSFT_SKIP;
- }
+ if (!(hwcaps2 & HWCAP2_MTE))
+ ksft_exit_skip("MTE features unavailable\n");
+
/* Get current mte mode */
ret = prctl(PR_GET_TAGGED_ADDR_CTRL, en, 0, 0, 0);
if (ret < 0) {
@@ -359,7 +358,7 @@ int create_temp_file(void)
/* Create a file in the tmpfs filesystem */
fd = mkstemp(&filename[0]);
if (fd == -1) {
- perror(filename);
+ ksft_perror(filename);
ksft_print_msg("FAIL: Unable to open temporary file\n");
return 0;
}
diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.h b/tools/testing/selftests/arm64/mte/mte_common_util.h
index 2d3e717..a0017a30 100644
--- a/tools/testing/selftests/arm64/mte/mte_common_util.h
+++ b/tools/testing/selftests/arm64/mte/mte_common_util.h
@@ -77,13 +77,13 @@ static inline void evaluate_test(int err, const char *msg)
{
switch (err) {
case KSFT_PASS:
- ksft_test_result_pass(msg);
+ ksft_test_result_pass("%s", msg);
break;
case KSFT_FAIL:
- ksft_test_result_fail(msg);
+ ksft_test_result_fail("%s", msg);
break;
case KSFT_SKIP:
- ksft_test_result_skip(msg);
+ ksft_test_result_skip("%s", msg);
break;
default:
ksft_test_result_error("Unknown return code %d from %s",
diff --git a/tools/testing/selftests/arm64/pauth/Makefile b/tools/testing/selftests/arm64/pauth/Makefile
index 72e290b..b5a1c80 100644
--- a/tools/testing/selftests/arm64/pauth/Makefile
+++ b/tools/testing/selftests/arm64/pauth/Makefile
@@ -7,8 +7,14 @@
endif
CFLAGS += -mbranch-protection=pac-ret
+
+# All supported LLVMs have PAC, test for GCC
+ifeq ($(LLVM),1)
+pauth_cc_support := 1
+else
# check if the compiler supports ARMv8.3 and branch protection with PAuth
pauth_cc_support := $(shell if ($(CC) $(CFLAGS) -march=armv8.3-a -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi)
+endif
ifeq ($(pauth_cc_support),1)
TEST_GEN_PROGS := pac
diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c
index b743daa..6d21b2f 100644
--- a/tools/testing/selftests/arm64/pauth/pac.c
+++ b/tools/testing/selftests/arm64/pauth/pac.c
@@ -13,7 +13,7 @@
#include "../../kselftest_harness.h"
#include "helper.h"
-#define PAC_COLLISION_ATTEMPTS 10
+#define PAC_COLLISION_ATTEMPTS 1000
/*
* The kernel sets TBID by default. So bits 55 and above should remain
* untouched no matter what.
@@ -182,6 +182,9 @@ int exec_sign_all(struct signatures *signed_vals, size_t val)
return -1;
}
+ close(new_stdin[1]);
+ close(new_stdout[0]);
+
return 0;
}
diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore
index b2f2bfd..b257db6 100644
--- a/tools/testing/selftests/arm64/signal/.gitignore
+++ b/tools/testing/selftests/arm64/signal/.gitignore
@@ -3,6 +3,7 @@
fake_sigreturn_*
fpmr_*
poe_*
+gcs_*
sme_*
ssve_*
sve_*
diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile
index edb3613..1381039 100644
--- a/tools/testing/selftests/arm64/signal/Makefile
+++ b/tools/testing/selftests/arm64/signal/Makefile
@@ -2,7 +2,7 @@
# Copyright (C) 2019 ARM Limited
# Additional include paths needed by kselftest.h and local headers
-CFLAGS += -D_GNU_SOURCE -std=gnu99 -I.
+CFLAGS += -std=gnu99 -I.
SRCS := $(filter-out testcases/testcases.c,$(wildcard testcases/*.c))
PROGS := $(patsubst %.c,%,$(SRCS))
diff --git a/tools/testing/selftests/arm64/signal/sve_helpers.h b/tools/testing/selftests/arm64/signal/sve_helpers.h
index 50948ce4..ca133b9 100644
--- a/tools/testing/selftests/arm64/signal/sve_helpers.h
+++ b/tools/testing/selftests/arm64/signal/sve_helpers.h
@@ -18,4 +18,17 @@ extern unsigned int nvls;
int sve_fill_vls(bool use_sme, int min_vls);
+static inline uint64_t get_svcr(void)
+{
+ uint64_t val;
+
+ asm volatile (
+ "mrs %0, S3_3_C4_C2_2\n"
+ : "=r"(val)
+ :
+ : "cc");
+
+ return val;
+}
+
#endif
diff --git a/tools/testing/selftests/arm64/signal/test_signals.c b/tools/testing/selftests/arm64/signal/test_signals.c
index 00051b4..1304c8e 100644
--- a/tools/testing/selftests/arm64/signal/test_signals.c
+++ b/tools/testing/selftests/arm64/signal/test_signals.c
@@ -7,6 +7,10 @@
* Each test provides its own tde struct tdescr descriptor to link with
* this wrapper. Framework provides common helpers.
*/
+
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+
#include <kselftest.h>
#include "test_signals.h"
@@ -16,6 +20,16 @@ struct tdescr *current = &tde;
int main(int argc, char *argv[])
{
+ /*
+ * Ensure GCS is at least enabled throughout the tests if
+ * supported, otherwise the inability to return from the
+ * function that enabled GCS makes it very inconvenient to set
+ * up test cases. The prctl() may fail if GCS was locked by
+ * libc setup code.
+ */
+ if (getauxval(AT_HWCAP) & HWCAP_GCS)
+ gcs_set_state(PR_SHADOW_STACK_ENABLE);
+
ksft_print_msg("%s :: %s\n", current->name, current->descr);
if (test_setup(current) && test_init(current)) {
test_run(current);
@@ -23,5 +37,6 @@ int main(int argc, char *argv[])
}
test_result(current);
- return current->result;
+ /* Do not return in case GCS was enabled */
+ exit(current->result);
}
diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h
index 1e6273d..ee75a2c 100644
--- a/tools/testing/selftests/arm64/signal/test_signals.h
+++ b/tools/testing/selftests/arm64/signal/test_signals.h
@@ -35,6 +35,7 @@ enum {
FSME_BIT,
FSME_FA64_BIT,
FSME2_BIT,
+ FGCS_BIT,
FMAX_END
};
@@ -43,6 +44,7 @@ enum {
#define FEAT_SME (1UL << FSME_BIT)
#define FEAT_SME_FA64 (1UL << FSME_FA64_BIT)
#define FEAT_SME2 (1UL << FSME2_BIT)
+#define FEAT_GCS (1UL << FGCS_BIT)
/*
* A descriptor used to describe and configure a test case.
@@ -69,6 +71,10 @@ struct tdescr {
* Zero when no signal is expected on success
*/
int sig_ok;
+ /*
+ * expected si_code for sig_ok, or 0 to not check
+ */
+ int sig_ok_code;
/* signum expected on unsupported CPU features. */
int sig_unsupp;
/* a timeout in second for test completion */
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index 0dc948d..5d36219 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -30,6 +30,7 @@ static char const *const feats_names[FMAX_END] = {
" SME ",
" FA64 ",
" SME2 ",
+ " GCS ",
};
#define MAX_FEATS_SZ 128
@@ -142,16 +143,25 @@ static bool handle_signal_ok(struct tdescr *td,
"current->token ZEROED...test is probably broken!\n");
abort();
}
- /*
- * Trying to narrow down the SEGV to the ones generated by Kernel itself
- * via arm64_notify_segfault(). This is a best-effort check anyway, and
- * the si_code check may need to change if this aspect of the kernel
- * ABI changes.
- */
- if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) {
- fprintf(stdout,
- "si_code != SEGV_ACCERR...test is probably broken!\n");
- abort();
+ if (td->sig_ok_code) {
+ if (si->si_code != td->sig_ok_code) {
+ fprintf(stdout, "si_code is %d not %d\n",
+ si->si_code, td->sig_ok_code);
+ abort();
+ }
+ } else {
+ /*
+ * Trying to narrow down the SEGV to the ones
+ * generated by Kernel itself via
+ * arm64_notify_segfault(). This is a best-effort
+ * check anyway, and the si_code check may need to
+ * change if this aspect of the kernel ABI changes.
+ */
+ if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) {
+ fprintf(stdout,
+ "si_code != SEGV_ACCERR...test is probably broken!\n");
+ abort();
+ }
}
td->pass = 1;
/*
@@ -329,6 +339,8 @@ int test_init(struct tdescr *td)
td->feats_supported |= FEAT_SME_FA64;
if (getauxval(AT_HWCAP2) & HWCAP2_SME2)
td->feats_supported |= FEAT_SME2;
+ if (getauxval(AT_HWCAP) & HWCAP_GCS)
+ td->feats_supported |= FEAT_GCS;
if (feats_ok(td)) {
if (td->feats_required & td->feats_supported)
fprintf(stderr,
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h
index 762c8fe..36fc12b 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.h
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h
@@ -6,6 +6,7 @@
#include <assert.h>
#include <stdio.h>
+#include <stdint.h>
#include <string.h>
#include <linux/compiler.h>
@@ -18,6 +19,44 @@ void test_cleanup(struct tdescr *td);
int test_run(struct tdescr *td);
void test_result(struct tdescr *td);
+#ifndef __NR_prctl
+#define __NR_prctl 167
+#endif
+
+/*
+ * The prctl takes 1 argument but we need to ensure that the other
+ * values passed in registers to the syscall are zero since the kernel
+ * validates them.
+ */
+#define gcs_set_state(state) \
+ ({ \
+ register long _num __asm__ ("x8") = __NR_prctl; \
+ register long _arg1 __asm__ ("x0") = PR_SET_SHADOW_STACK_STATUS; \
+ register long _arg2 __asm__ ("x1") = (long)(state); \
+ register long _arg3 __asm__ ("x2") = 0; \
+ register long _arg4 __asm__ ("x3") = 0; \
+ register long _arg5 __asm__ ("x4") = 0; \
+ \
+ __asm__ volatile ( \
+ "svc #0\n" \
+ : "=r"(_arg1) \
+ : "r"(_arg1), "r"(_arg2), \
+ "r"(_arg3), "r"(_arg4), \
+ "r"(_arg5), "r"(_num) \
+ : "memory", "cc" \
+ ); \
+ _arg1; \
+ })
+
+static inline __attribute__((always_inline)) uint64_t get_gcspr_el0(void)
+{
+ uint64_t val;
+
+ asm volatile("mrs %0, S3_3_C2_C5_1" : "=r" (val));
+
+ return val;
+}
+
static inline bool feats_ok(struct tdescr *td)
{
if (td->feats_incompatible & td->feats_supported)
diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c
new file mode 100644
index 0000000..6228448
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_exception_fault.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+/*
+ * We should get this from asm/siginfo.h but the testsuite is being
+ * clever with redefining siginfo_t.
+ */
+#ifndef SEGV_CPERR
+#define SEGV_CPERR 10
+#endif
+
+static inline void gcsss1(uint64_t Xt)
+{
+ asm volatile (
+ "sys #3, C7, C7, #2, %0\n"
+ :
+ : "rZ" (Xt)
+ : "memory");
+}
+
+static int gcs_op_fault_trigger(struct tdescr *td)
+{
+ /*
+ * The slot below our current GCS should be in a valid GCS but
+ * must not have a valid cap in it.
+ */
+ gcsss1(get_gcspr_el0() - 8);
+
+ return 0;
+}
+
+static int gcs_op_fault_signal(struct tdescr *td, siginfo_t *si,
+ ucontext_t *uc)
+{
+ ASSERT_GOOD_CONTEXT(uc);
+
+ return 1;
+}
+
+struct tdescr tde = {
+ .name = "Invalid GCS operation",
+ .descr = "An invalid GCS operation generates the expected signal",
+ .feats_required = FEAT_GCS,
+ .timeout = 3,
+ .sig_ok = SIGSEGV,
+ .sig_ok_code = SEGV_CPERR,
+ .sanity_disabled = true,
+ .trigger = gcs_op_fault_trigger,
+ .run = gcs_op_fault_signal,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c
new file mode 100644
index 0000000..b405d82
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_frame.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+#include <sys/prctl.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static union {
+ ucontext_t uc;
+ char buf[1024 * 64];
+} context;
+
+static int gcs_regs(struct tdescr *td, siginfo_t *si, ucontext_t *uc)
+{
+ size_t offset;
+ struct _aarch64_ctx *head = GET_BUF_RESV_HEAD(context);
+ struct gcs_context *gcs;
+ unsigned long expected, gcspr;
+ uint64_t *u64_val;
+ int ret;
+
+ ret = prctl(PR_GET_SHADOW_STACK_STATUS, &expected, 0, 0, 0);
+ if (ret != 0) {
+ fprintf(stderr, "Unable to query GCS status\n");
+ return 1;
+ }
+
+ /* We expect a cap to be added to the GCS in the signal frame */
+ gcspr = get_gcspr_el0();
+ gcspr -= 8;
+ fprintf(stderr, "Expecting GCSPR_EL0 %lx\n", gcspr);
+
+ if (!get_current_context(td, &context.uc, sizeof(context))) {
+ fprintf(stderr, "Failed getting context\n");
+ return 1;
+ }
+
+ /* Ensure that the signal restore token was consumed */
+ u64_val = (uint64_t *)get_gcspr_el0() + 1;
+ if (*u64_val) {
+ fprintf(stderr, "GCS value at %p is %lx not 0\n",
+ u64_val, *u64_val);
+ return 1;
+ }
+
+ fprintf(stderr, "Got context\n");
+
+ head = get_header(head, GCS_MAGIC, GET_BUF_RESV_SIZE(context),
+ &offset);
+ if (!head) {
+ fprintf(stderr, "No GCS context\n");
+ return 1;
+ }
+
+ gcs = (struct gcs_context *)head;
+
+ /* Basic size validation is done in get_current_context() */
+
+ if (gcs->features_enabled != expected) {
+ fprintf(stderr, "Features enabled %llx but expected %lx\n",
+ gcs->features_enabled, expected);
+ return 1;
+ }
+
+ if (gcs->gcspr != gcspr) {
+ fprintf(stderr, "Got GCSPR %llx but expected %lx\n",
+ gcs->gcspr, gcspr);
+ return 1;
+ }
+
+ fprintf(stderr, "GCS context validated\n");
+ td->pass = 1;
+
+ return 0;
+}
+
+struct tdescr tde = {
+ .name = "GCS basics",
+ .descr = "Validate a GCS signal context",
+ .feats_required = FEAT_GCS,
+ .timeout = 3,
+ .run = gcs_regs,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c
new file mode 100644
index 0000000..faeabb1
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_write_fault.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 ARM Limited
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static uint64_t *gcs_page;
+
+#ifndef __NR_map_shadow_stack
+#define __NR_map_shadow_stack 453
+#endif
+
+static bool alloc_gcs(struct tdescr *td)
+{
+ long page_size = sysconf(_SC_PAGE_SIZE);
+
+ gcs_page = (void *)syscall(__NR_map_shadow_stack, 0,
+ page_size, 0);
+ if (gcs_page == MAP_FAILED) {
+ fprintf(stderr, "Failed to map %ld byte GCS: %d\n",
+ page_size, errno);
+ return false;
+ }
+
+ return true;
+}
+
+static int gcs_write_fault_trigger(struct tdescr *td)
+{
+ /* Verify that the page is readable (ie, not completely unmapped) */
+ fprintf(stderr, "Read value 0x%lx\n", gcs_page[0]);
+
+ /* A regular write should trigger a fault */
+ gcs_page[0] = EINVAL;
+
+ return 0;
+}
+
+static int gcs_write_fault_signal(struct tdescr *td, siginfo_t *si,
+ ucontext_t *uc)
+{
+ ASSERT_GOOD_CONTEXT(uc);
+
+ return 1;
+}
+
+
+struct tdescr tde = {
+ .name = "GCS write fault",
+ .descr = "Normal writes to a GCS segfault",
+ .feats_required = FEAT_GCS,
+ .timeout = 3,
+ .sig_ok = SIGSEGV,
+ .sanity_disabled = true,
+ .init = alloc_gcs,
+ .trigger = gcs_write_fault_trigger,
+ .run = gcs_write_fault_signal,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
index 6dbe48c..1dbca9a 100644
--- a/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/ssve_regs.c
@@ -85,6 +85,11 @@ static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc,
fprintf(stderr, "Got expected size %u and VL %d\n",
head->size, ssve->vl);
+ if (get_svcr() != 0) {
+ fprintf(stderr, "Unexpected SVCR %lx\n", get_svcr());
+ return 1;
+ }
+
return 0;
}
diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c
index e6daa94..0c1a6b2 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.c
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c
@@ -198,6 +198,13 @@ bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
*err = "Bad size for fpmr_context";
new_flags |= FPMR_CTX;
break;
+ case GCS_MAGIC:
+ if (flags & GCS_CTX)
+ *err = "Multiple GCS_MAGIC";
+ if (head->size != sizeof(struct gcs_context))
+ *err = "Bad size for gcs_context";
+ new_flags |= GCS_CTX;
+ break;
case EXTRA_MAGIC:
if (flags & EXTRA_CTX)
*err = "Multiple EXTRA_MAGIC";
diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h
index 9872b89..98b97ef 100644
--- a/tools/testing/selftests/arm64/signal/testcases/testcases.h
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h
@@ -20,6 +20,7 @@
#define EXTRA_CTX (1 << 3)
#define ZT_CTX (1 << 4)
#define FPMR_CTX (1 << 5)
+#define GCS_CTX (1 << 6)
#define KSFT_BAD_MAGIC 0xdeadbeef
diff --git a/tools/testing/selftests/arm64/signal/testcases/za_regs.c b/tools/testing/selftests/arm64/signal/testcases/za_regs.c
index b9e13f2..badaead 100644
--- a/tools/testing/selftests/arm64/signal/testcases/za_regs.c
+++ b/tools/testing/selftests/arm64/signal/testcases/za_regs.c
@@ -91,6 +91,11 @@ static int do_one_sme_vl(struct tdescr *td, siginfo_t *si, ucontext_t *uc,
return 1;
}
+ if (get_svcr() != 0) {
+ fprintf(stderr, "Unexpected SVCR %lx\n", get_svcr());
+ return 1;
+ }
+
return 0;
}
diff --git a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c
index 156cc27..7c881bc 100644
--- a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c
+++ b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c
@@ -57,9 +57,15 @@ __description("null pointer")
__success __retval(0)
int null_pointer(void)
{
- int nr = 0;
+ struct bpf_iter_bits iter;
+ int err, nr = 0;
int *bit;
+ err = bpf_iter_bits_new(&iter, NULL, 1);
+ bpf_iter_bits_destroy(&iter);
+ if (err != -EINVAL)
+ return 1;
+
bpf_for_each(bits, bit, NULL, 1)
nr++;
return nr;
@@ -194,15 +200,33 @@ __description("bad words")
__success __retval(0)
int bad_words(void)
{
- void *bad_addr = (void *)(3UL << 30);
- int nr = 0;
+ void *bad_addr = (void *)-4095;
+ struct bpf_iter_bits iter;
+ volatile int nr;
int *bit;
+ int err;
+ err = bpf_iter_bits_new(&iter, bad_addr, 1);
+ bpf_iter_bits_destroy(&iter);
+ if (err != -EFAULT)
+ return 1;
+
+ nr = 0;
bpf_for_each(bits, bit, bad_addr, 1)
nr++;
+ if (nr != 0)
+ return 2;
+ err = bpf_iter_bits_new(&iter, bad_addr, 4);
+ bpf_iter_bits_destroy(&iter);
+ if (err != -EFAULT)
+ return 3;
+
+ nr = 0;
bpf_for_each(bits, bit, bad_addr, 4)
nr++;
+ if (nr != 0)
+ return 4;
- return nr;
+ return 0;
}
diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
index 41d0859..edc56e2 100755
--- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh
+++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
@@ -11,6 +11,8 @@
lib_dir=$(dirname "$0")
source ${lib_dir}/bond_topo_3d1c.sh
+c_maddr="33:33:00:00:00:10"
+g_maddr="33:33:00:00:02:54"
skip_prio()
{
@@ -240,6 +242,54 @@
done
}
+# Testing correct multicast groups are added to slaves for ns targets
+arp_validate_mcast()
+{
+ RET=0
+ local arp_valid=$(cmd_jq "ip -n ${s_ns} -j -d link show bond0" ".[].linkinfo.info_data.arp_validate")
+ local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave")
+
+ for i in $(seq 0 2); do
+ maddr_list=$(ip -n ${s_ns} maddr show dev eth${i})
+
+ # arp_valid == 0 or active_slave should not join any maddrs
+ if { [ "$arp_valid" == "null" ] || [ "eth${i}" == ${active_slave} ]; } && \
+ echo "$maddr_list" | grep -qE "${c_maddr}|${g_maddr}"; then
+ RET=1
+ check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i has mcast group"
+ # arp_valid != 0 and backup_slave should join both maddrs
+ elif [ "$arp_valid" != "null" ] && [ "eth${i}" != ${active_slave} ] && \
+ ( ! echo "$maddr_list" | grep -q "${c_maddr}" || \
+ ! echo "$maddr_list" | grep -q "${m_maddr}"); then
+ RET=1
+ check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i has mcast group"
+ fi
+ done
+
+ # Do failover
+ ip -n ${s_ns} link set ${active_slave} down
+ # wait for active link change
+ slowwait 2 active_slave_changed $active_slave
+ active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave")
+
+ for i in $(seq 0 2); do
+ maddr_list=$(ip -n ${s_ns} maddr show dev eth${i})
+
+ # arp_valid == 0 or active_slave should not join any maddrs
+ if { [ "$arp_valid" == "null" ] || [ "eth${i}" == ${active_slave} ]; } && \
+ echo "$maddr_list" | grep -qE "${c_maddr}|${g_maddr}"; then
+ RET=1
+ check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i has mcast group"
+ # arp_valid != 0 and backup_slave should join both maddrs
+ elif [ "$arp_valid" != "null" ] && [ "eth${i}" != ${active_slave} ] && \
+ ( ! echo "$maddr_list" | grep -q "${c_maddr}" || \
+ ! echo "$maddr_list" | grep -q "${m_maddr}"); then
+ RET=1
+ check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i has mcast group"
+ fi
+ done
+}
+
arp_validate_arp()
{
local mode=$1
@@ -261,8 +311,10 @@
fi
for val in $(seq 0 6); do
- arp_validate_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} arp_validate $val"
+ arp_validate_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6},${c_ip6} arp_validate $val"
log_test "arp_validate" "$mode ns_ip6_target arp_validate $val"
+ arp_validate_mcast
+ log_test "arp_validate" "join mcast group"
done
}
diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
index f0c0ff2..828b66a 100644
--- a/tools/testing/selftests/filesystems/.gitignore
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
dnotify_test
devpts_pts
+file_stressor
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index c647fd6a..66305fc 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := devpts_pts
+TEST_GEN_PROGS := devpts_pts file_stressor
TEST_GEN_PROGS_EXTENDED := dnotify_test
include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/file_stressor.c b/tools/testing/selftests/filesystems/file_stressor.c
new file mode 100644
index 0000000..1136f93
--- /dev/null
+++ b/tools/testing/selftests/filesystems/file_stressor.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+
+#include <fcntl.h>
+#include <limits.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+
+static inline int sys_fsopen(const char *fsname, unsigned int flags)
+{
+ return syscall(__NR_fsopen, fsname, flags);
+}
+
+static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key,
+ const char *value, int aux)
+{
+ return syscall(__NR_fsconfig, fd, cmd, key, value, aux);
+}
+
+static inline int sys_fsmount(int fd, unsigned int flags,
+ unsigned int attr_flags)
+{
+ return syscall(__NR_fsmount, fd, flags, attr_flags);
+}
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#endif
+
+static inline int sys_move_mount(int from_dfd, const char *from_pathname,
+ int to_dfd, const char *to_pathname,
+ unsigned int flags)
+{
+ return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
+ to_pathname, flags);
+}
+
+FIXTURE(file_stressor) {
+ int fd_tmpfs;
+ int nr_procs;
+ int max_fds;
+ pid_t *pids_openers;
+ pid_t *pids_getdents;
+ int *fd_proc_pid;
+};
+
+FIXTURE_SETUP(file_stressor)
+{
+ int fd_context;
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+ ASSERT_EQ(mkdir("/slab_typesafe_by_rcu", 0755), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ self->fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(self->fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ ASSERT_EQ(sys_move_mount(self->fd_tmpfs, "", -EBADF, "/slab_typesafe_by_rcu", MOVE_MOUNT_F_EMPTY_PATH), 0);
+
+ self->nr_procs = sysconf(_SC_NPROCESSORS_ONLN);
+ self->pids_openers = malloc(sizeof(pid_t) * self->nr_procs);
+ ASSERT_NE(self->pids_openers, NULL);
+ self->pids_getdents = malloc(sizeof(pid_t) * self->nr_procs);
+ ASSERT_NE(self->pids_getdents, NULL);
+ self->fd_proc_pid = malloc(sizeof(int) * self->nr_procs);
+ ASSERT_NE(self->fd_proc_pid, NULL);
+ self->max_fds = 500;
+}
+
+FIXTURE_TEARDOWN(file_stressor)
+{
+ for (int i = 0; i < self->nr_procs; i++) {
+ int wstatus;
+ pid_t pid;
+
+ pid = waitpid(self->pids_openers[i], &wstatus, 0);
+ ASSERT_EQ(pid, self->pids_openers[i]);
+ ASSERT_TRUE(!WIFEXITED(wstatus) || !WIFSIGNALED(wstatus));
+
+ pid = waitpid(self->pids_getdents[i], &wstatus, 0);
+ ASSERT_EQ(pid, self->pids_getdents[i]);
+ ASSERT_TRUE(!WIFEXITED(wstatus) || !WIFSIGNALED(wstatus));
+ }
+ free(self->pids_openers);
+ free(self->pids_getdents);
+ ASSERT_EQ(close(self->fd_tmpfs), 0);
+
+ umount2("/slab_typesafe_by_rcu", 0);
+ ASSERT_EQ(rmdir("/slab_typesafe_by_rcu"), 0);
+}
+
+TEST_F_TIMEOUT(file_stressor, slab_typesafe_by_rcu, 900 * 2)
+{
+ for (int i = 0; i < self->nr_procs; i++) {
+ pid_t pid_self;
+
+ self->pids_openers[i] = fork();
+ ASSERT_GE(self->pids_openers[i], 0);
+
+ if (self->pids_openers[i] != 0)
+ continue;
+
+ self->pids_openers[i] = getpid();
+ for (;;) {
+ for (int i = 0; i < self->max_fds; i++) {
+ char path[PATH_MAX];
+ int fd;
+
+ sprintf(path, "/slab_typesafe_by_rcu/file-%d-%d", self->pids_openers[i], i);
+ fd = open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0644);
+ if (fd < 0)
+ continue;
+ }
+
+ close_range(3, ~0U, 0);
+ }
+
+ exit(0);
+ }
+
+ for (int i = 0; i < self->nr_procs; i++) {
+ char path[PATH_MAX];
+
+ sprintf(path, "/proc/%d/fd/", self->pids_openers[i]);
+ self->fd_proc_pid[i] = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+ ASSERT_GE(self->fd_proc_pid[i], 0);
+ }
+
+ for (int i = 0; i < self->nr_procs; i++) {
+ self->pids_getdents[i] = fork();
+ ASSERT_GE(self->pids_getdents[i], 0);
+
+ if (self->pids_getdents[i] != 0)
+ continue;
+
+ self->pids_getdents[i] = getpid();
+ for (;;) {
+ char ents[1024];
+ ssize_t nr_read;
+
+ /*
+ * Concurrently read /proc/<pid>/fd/ which rougly does:
+ *
+ * f = fget_task_next(p, &fd);
+ * if (!f)
+ * break;
+ * data.mode = f->f_mode;
+ * fput(f);
+ *
+ * Which means that it'll try to get a reference to a
+ * file in another task's file descriptor table.
+ *
+ * Under heavy file load it is increasingly likely that
+ * the other task will manage to close @file and @file
+ * is being recycled due to SLAB_TYPEAFE_BY_RCU
+ * concurrently. This will trigger various warnings in
+ * the file reference counting code.
+ */
+ do {
+ nr_read = syscall(SYS_getdents64, self->fd_proc_pid[i], ents, sizeof(ents));
+ } while (nr_read >= 0);
+
+ lseek(self->fd_proc_pid[i], 0, SEEK_SET);
+ }
+
+ exit(0);
+ }
+
+ ASSERT_EQ(clock_nanosleep(CLOCK_MONOTONIC, 0, &(struct timespec){ .tv_sec = 900 /* 15 min */ }, NULL), 0);
+
+ for (int i = 0; i < self->nr_procs; i++) {
+ kill(self->pids_openers[i], SIGKILL);
+ kill(self->pids_getdents[i], SIGKILL);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/overlayfs/.gitignore b/tools/testing/selftests/filesystems/overlayfs/.gitignore
index 52ae618..e23a18c 100644
--- a/tools/testing/selftests/filesystems/overlayfs/.gitignore
+++ b/tools/testing/selftests/filesystems/overlayfs/.gitignore
@@ -1,2 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
dev_in_maps
+set_layers_via_fds
diff --git a/tools/testing/selftests/filesystems/overlayfs/Makefile b/tools/testing/selftests/filesystems/overlayfs/Makefile
index 56b2b48..e8d1adb 100644
--- a/tools/testing/selftests/filesystems/overlayfs/Makefile
+++ b/tools/testing/selftests/filesystems/overlayfs/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-TEST_GEN_PROGS := dev_in_maps
+TEST_GEN_PROGS := dev_in_maps set_layers_via_fds
CFLAGS := -Wall -Werror
diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
index 2862aae..3b79626 100644
--- a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
+++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c
@@ -17,32 +17,7 @@
#include "../../kselftest.h"
#include "log.h"
-
-static int sys_fsopen(const char *fsname, unsigned int flags)
-{
- return syscall(__NR_fsopen, fsname, flags);
-}
-
-static int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux)
-{
- return syscall(__NR_fsconfig, fd, cmd, key, value, aux);
-}
-
-static int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags)
-{
- return syscall(__NR_fsmount, fd, flags, attr_flags);
-}
-static int sys_mount(const char *src, const char *tgt, const char *fst,
- unsigned long flags, const void *data)
-{
- return syscall(__NR_mount, src, tgt, fst, flags, data);
-}
-static int sys_move_mount(int from_dfd, const char *from_pathname,
- int to_dfd, const char *to_pathname,
- unsigned int flags)
-{
- return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);
-}
+#include "wrappers.h"
static long get_file_dev_and_inode(void *addr, struct statx *stx)
{
diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
new file mode 100644
index 0000000..1d0ae78
--- /dev/null
+++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__ // Use ll64
+
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "../../kselftest_harness.h"
+#include "log.h"
+#include "wrappers.h"
+
+FIXTURE(set_layers_via_fds) {
+};
+
+FIXTURE_SETUP(set_layers_via_fds)
+{
+ ASSERT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
+}
+
+FIXTURE_TEARDOWN(set_layers_via_fds)
+{
+ umount2("/set_layers_via_fds", 0);
+ ASSERT_EQ(rmdir("/set_layers_via_fds"), 0);
+}
+
+TEST_F(set_layers_via_fds, set_layers_via_fds)
+{
+ int fd_context, fd_tmpfs, fd_overlay;
+ int layer_fds[] = { [0 ... 8] = -EBADF };
+ bool layers_found[] = { [0 ... 8] = false };
+ size_t len = 0;
+ char *line = NULL;
+ FILE *f_mountinfo;
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l3", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l4", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "d1", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "d2", 0755), 0);
+ ASSERT_EQ(mkdirat(fd_tmpfs, "d3", 0755), 0);
+
+ layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
+ ASSERT_GE(layer_fds[0], 0);
+
+ layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
+ ASSERT_GE(layer_fds[1], 0);
+
+ layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
+ ASSERT_GE(layer_fds[2], 0);
+
+ layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
+ ASSERT_GE(layer_fds[3], 0);
+
+ layer_fds[4] = openat(fd_tmpfs, "l3", O_DIRECTORY);
+ ASSERT_GE(layer_fds[4], 0);
+
+ layer_fds[5] = openat(fd_tmpfs, "l4", O_DIRECTORY);
+ ASSERT_GE(layer_fds[5], 0);
+
+ layer_fds[6] = openat(fd_tmpfs, "d1", O_DIRECTORY);
+ ASSERT_GE(layer_fds[6], 0);
+
+ layer_fds[7] = openat(fd_tmpfs, "d2", O_DIRECTORY);
+ ASSERT_GE(layer_fds[7], 0);
+
+ layer_fds[8] = openat(fd_tmpfs, "d3", O_DIRECTORY);
+ ASSERT_GE(layer_fds[8], 0);
+
+ ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+ ASSERT_EQ(close(fd_tmpfs), 0);
+
+ fd_context = sys_fsopen("overlay", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[4]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[5]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[6]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[7]), 0);
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[8]), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+ fd_overlay = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_overlay, 0);
+
+ ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
+
+ f_mountinfo = fopen("/proc/self/mountinfo", "r");
+ ASSERT_NE(f_mountinfo, NULL);
+
+ while (getline(&line, &len, f_mountinfo) != -1) {
+ char *haystack = line;
+
+ if (strstr(haystack, "workdir=/tmp/w"))
+ layers_found[0] = true;
+ if (strstr(haystack, "upperdir=/tmp/u"))
+ layers_found[1] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l1"))
+ layers_found[2] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l2"))
+ layers_found[3] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l3"))
+ layers_found[4] = true;
+ if (strstr(haystack, "lowerdir+=/tmp/l4"))
+ layers_found[5] = true;
+ if (strstr(haystack, "datadir+=/tmp/d1"))
+ layers_found[6] = true;
+ if (strstr(haystack, "datadir+=/tmp/d2"))
+ layers_found[7] = true;
+ if (strstr(haystack, "datadir+=/tmp/d3"))
+ layers_found[8] = true;
+ }
+ free(line);
+
+ for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+ ASSERT_EQ(layers_found[i], true);
+ ASSERT_EQ(close(layer_fds[i]), 0);
+ }
+
+ ASSERT_EQ(close(fd_context), 0);
+ ASSERT_EQ(close(fd_overlay), 0);
+ ASSERT_EQ(fclose(f_mountinfo), 0);
+}
+
+TEST_F(set_layers_via_fds, set_500_layers_via_fds)
+{
+ int fd_context, fd_tmpfs, fd_overlay, fd_work, fd_upper, fd_lower;
+ int layer_fds[500] = { [0 ... 499] = -EBADF };
+
+ ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+ ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);
+
+ fd_context = sys_fsopen("tmpfs", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+ fd_tmpfs = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_tmpfs, 0);
+ ASSERT_EQ(close(fd_context), 0);
+
+ for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+ char path[100];
+
+ sprintf(path, "l%d", i);
+ ASSERT_EQ(mkdirat(fd_tmpfs, path, 0755), 0);
+ layer_fds[i] = openat(fd_tmpfs, path, O_DIRECTORY);
+ ASSERT_GE(layer_fds[i], 0);
+ }
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
+ fd_work = openat(fd_tmpfs, "w", O_DIRECTORY);
+ ASSERT_GE(fd_work, 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
+ fd_upper = openat(fd_tmpfs, "u", O_DIRECTORY);
+ ASSERT_GE(fd_upper, 0);
+
+ ASSERT_EQ(mkdirat(fd_tmpfs, "l501", 0755), 0);
+ fd_lower = openat(fd_tmpfs, "l501", O_DIRECTORY);
+ ASSERT_GE(fd_lower, 0);
+
+ ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
+ ASSERT_EQ(close(fd_tmpfs), 0);
+
+ fd_context = sys_fsopen("overlay", 0);
+ ASSERT_GE(fd_context, 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, fd_work), 0);
+ ASSERT_EQ(close(fd_work), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, fd_upper), 0);
+ ASSERT_EQ(close(fd_upper), 0);
+
+ for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) {
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[i]), 0);
+ ASSERT_EQ(close(layer_fds[i]), 0);
+ }
+
+ ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower), 0);
+ ASSERT_EQ(close(fd_lower), 0);
+
+ ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
+
+ fd_overlay = sys_fsmount(fd_context, 0, 0);
+ ASSERT_GE(fd_overlay, 0);
+ ASSERT_EQ(close(fd_context), 0);
+ ASSERT_EQ(close(fd_overlay), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/overlayfs/wrappers.h b/tools/testing/selftests/filesystems/overlayfs/wrappers.h
new file mode 100644
index 0000000..071b95f
--- /dev/null
+++ b/tools/testing/selftests/filesystems/overlayfs/wrappers.h
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+//
+#ifndef __SELFTEST_OVERLAYFS_WRAPPERS_H__
+#define __SELFTEST_OVERLAYFS_WRAPPERS_H__
+
+#define _GNU_SOURCE
+
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <sys/syscall.h>
+
+static inline int sys_fsopen(const char *fsname, unsigned int flags)
+{
+ return syscall(__NR_fsopen, fsname, flags);
+}
+
+static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key,
+ const char *value, int aux)
+{
+ return syscall(__NR_fsconfig, fd, cmd, key, value, aux);
+}
+
+static inline int sys_fsmount(int fd, unsigned int flags,
+ unsigned int attr_flags)
+{
+ return syscall(__NR_fsmount, fd, flags, attr_flags);
+}
+
+static inline int sys_mount(const char *src, const char *tgt, const char *fst,
+ unsigned long flags, const void *data)
+{
+ return syscall(__NR_mount, src, tgt, fst, flags, data);
+}
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
+#endif
+
+static inline int sys_move_mount(int from_dfd, const char *from_pathname,
+ int to_dfd, const char *to_pathname,
+ unsigned int flags)
+{
+ return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd,
+ to_pathname, flags);
+}
+
+#endif
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 156fbfa..48645a2 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -241,16 +241,18 @@
-Wno-gnu-variable-sized-type-not-at-end -MD -MP -DCONFIG_64BIT \
-fno-builtin-memcmp -fno-builtin-memcpy \
-fno-builtin-memset -fno-builtin-strnlen \
- -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
- -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
- -I$(<D) -Iinclude/$(ARCH_DIR) -I ../rseq -I.. $(EXTRA_CFLAGS) \
- $(KHDR_INCLUDES)
+ -fno-stack-protector -fno-PIE -fno-strict-aliasing \
+ -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_TOOL_ARCH_INCLUDE) \
+ -I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(ARCH_DIR) \
+ -I ../rseq -I.. $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
ifeq ($(ARCH),s390)
CFLAGS += -march=z10
endif
ifeq ($(ARCH),x86)
+ifeq ($(shell echo "void foo(void) { }" | $(CC) -march=x86-64-v2 -x c - -c -o /dev/null 2>/dev/null; echo "$$?"),0)
CFLAGS += -march=x86-64-v2
endif
+endif
ifeq ($(ARCH),arm64)
tools_dir := $(top_srcdir)/tools
arm64_tools_dir := $(tools_dir)/arch/arm64/tools/
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
index ba0c8e9..ce687f8 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -134,7 +134,7 @@ static void test_create_guest_memfd_invalid(struct kvm_vm *vm)
size);
}
- for (flag = 0; flag; flag <<= 1) {
+ for (flag = BIT(0); flag; flag <<= 1) {
fd = __vm_create_guest_memfd(vm, page_size, flag);
TEST_ASSERT(fd == -1 && errno == EINVAL,
"guest_memfd() with flag '0x%lx' should fail with EINVAL",
diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
index 089b892..d7ac122 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
@@ -200,7 +200,7 @@ static inline void init_vmcs_control_fields(struct vmx_pages *vmx)
if (vmx->eptp_gpa) {
uint64_t ept_paddr;
struct eptPageTablePointer eptp = {
- .memory_type = VMX_BASIC_MEM_TYPE_WB,
+ .memory_type = X86_MEMTYPE_WB,
.page_walk_length = 3, /* + 1 */
.ad_enabled = ept_vpid_cap_supported(VMX_EPT_VPID_CAP_AD_BITS),
.address = vmx->eptp_gpa >> PAGE_SHIFT_4K,
diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index 989ffe0..e3711be 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -417,7 +417,7 @@ static bool _guest_should_exit(void)
*/
static noinline void host_perform_sync(struct sync_area *sync)
{
- alarm(2);
+ alarm(10);
atomic_store_explicit(&sync->sync_flag, true, memory_order_release);
while (atomic_load_explicit(&sync->sync_flag, memory_order_acquire))
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 02e1204..0f8c110 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -105,12 +105,12 @@
ifeq ($(CAN_BUILD_X86_64),1)
TEST_GEN_FILES += $(BINARIES_64)
endif
-else
-ifneq (,$(filter $(ARCH),arm64 powerpc))
+else ifeq ($(ARCH),arm64)
TEST_GEN_FILES += protection_keys
-endif
-
+TEST_GEN_FILES += pkey_sighandler_tests
+else ifeq ($(ARCH),powerpc)
+TEST_GEN_FILES += protection_keys
endif
ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390))
diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c
index 60001c1..432d5af 100644
--- a/tools/testing/selftests/mm/hugetlb_dio.c
+++ b/tools/testing/selftests/mm/hugetlb_dio.c
@@ -44,6 +44,13 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
if (fd < 0)
ksft_exit_fail_perror("Error opening file\n");
+ /* Get the free huge pages before allocation */
+ free_hpage_b = get_free_hugepages();
+ if (free_hpage_b == 0) {
+ close(fd);
+ ksft_exit_skip("No free hugepage, exiting!\n");
+ }
+
/* Allocate a hugetlb page */
orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0);
if (orig_buffer == MAP_FAILED) {
diff --git a/tools/testing/selftests/mm/pkey-arm64.h b/tools/testing/selftests/mm/pkey-arm64.h
index 580e1b0..d9d2100 100644
--- a/tools/testing/selftests/mm/pkey-arm64.h
+++ b/tools/testing/selftests/mm/pkey-arm64.h
@@ -31,6 +31,7 @@
#define NR_RESERVED_PKEYS 1 /* pkey-0 */
#define PKEY_ALLOW_ALL 0x77777777
+#define PKEY_REG_ALLOW_NONE 0x0
#define PKEY_BITS_PER_PKEY 4
#define PAGE_SIZE sysconf(_SC_PAGESIZE)
@@ -126,7 +127,7 @@ static inline u64 get_pkey_bits(u64 reg, int pkey)
return 0;
}
-static void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey)
+static inline void aarch64_write_signal_pkey(ucontext_t *uctxt, u64 pkey)
{
struct _aarch64_ctx *ctx = GET_UC_RESV_HEAD(uctxt);
struct poe_context *poe_ctx =
diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h
index 9ab6a3e..f7cfe16 100644
--- a/tools/testing/selftests/mm/pkey-helpers.h
+++ b/tools/testing/selftests/mm/pkey-helpers.h
@@ -112,6 +112,13 @@ void record_pkey_malloc(void *ptr, long size, int prot);
#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
#endif
+/*
+ * FIXME: Remove once the generic PKEY_UNRESTRICTED definition is merged.
+ */
+#ifndef PKEY_UNRESTRICTED
+#define PKEY_UNRESTRICTED 0x0
+#endif
+
#ifndef set_pkey_bits
static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
{
diff --git a/tools/testing/selftests/mm/pkey-x86.h b/tools/testing/selftests/mm/pkey-x86.h
index 5f28e26..ac91777 100644
--- a/tools/testing/selftests/mm/pkey-x86.h
+++ b/tools/testing/selftests/mm/pkey-x86.h
@@ -34,6 +34,8 @@
#define PAGE_SIZE 4096
#define MB (1<<20)
+#define PKEY_REG_ALLOW_NONE 0x55555555
+
static inline void __page_o_noops(void)
{
/* 8-bytes of instruction * 512 bytes = 1 page */
diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c
index a8088b64..c593a42 100644
--- a/tools/testing/selftests/mm/pkey_sighandler_tests.c
+++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c
@@ -11,6 +11,7 @@
*/
#define _GNU_SOURCE
#define __SANE_USERSPACE_TYPES__
+#include <linux/mman.h>
#include <errno.h>
#include <sys/syscall.h>
#include <string.h>
@@ -59,12 +60,58 @@ long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6)
: "=a"(ret)
: "a"(n), "b"(a1), "c"(a2), "d"(a3), "S"(a4), "D"(a5)
: "memory");
+#elif defined __aarch64__
+ register long x0 asm("x0") = a1;
+ register long x1 asm("x1") = a2;
+ register long x2 asm("x2") = a3;
+ register long x3 asm("x3") = a4;
+ register long x4 asm("x4") = a5;
+ register long x5 asm("x5") = a6;
+ register long x8 asm("x8") = n;
+ asm volatile ("svc #0"
+ : "=r"(x0)
+ : "r"(x0), "r"(x1), "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(x8)
+ : "memory");
+ ret = x0;
#else
# error syscall_raw() not implemented
#endif
return ret;
}
+static inline long clone_raw(unsigned long flags, void *stack,
+ int *parent_tid, int *child_tid)
+{
+ long a1 = flags;
+ long a2 = (long)stack;
+ long a3 = (long)parent_tid;
+#if defined(__x86_64__) || defined(__i386)
+ long a4 = (long)child_tid;
+ long a5 = 0;
+#elif defined(__aarch64__)
+ long a4 = 0;
+ long a5 = (long)child_tid;
+#else
+# error clone_raw() not implemented
+#endif
+
+ return syscall_raw(SYS_clone, a1, a2, a3, a4, a5, 0);
+}
+
+/*
+ * Returns the most restrictive pkey register value that can be used by the
+ * tests.
+ */
+static inline u64 pkey_reg_restrictive_default(void)
+{
+ /*
+ * Disallow everything except execution on pkey 0, so that each caller
+ * doesn't need to enable it explicitly (the selftest code runs with
+ * its code mapped with pkey 0).
+ */
+ return set_pkey_bits(PKEY_REG_ALLOW_NONE, 0, PKEY_DISABLE_ACCESS);
+}
+
static void sigsegv_handler(int signo, siginfo_t *info, void *ucontext)
{
pthread_mutex_lock(&mutex);
@@ -113,7 +160,7 @@ static void raise_sigusr2(void)
static void *thread_segv_with_pkey0_disabled(void *ptr)
{
/* Disable MPK 0 (and all others too) */
- __write_pkey_reg(0x55555555);
+ __write_pkey_reg(pkey_reg_restrictive_default());
/* Segfault (with SEGV_MAPERR) */
*(int *) (0x1) = 1;
@@ -123,7 +170,7 @@ static void *thread_segv_with_pkey0_disabled(void *ptr)
static void *thread_segv_pkuerr_stack(void *ptr)
{
/* Disable MPK 0 (and all others too) */
- __write_pkey_reg(0x55555555);
+ __write_pkey_reg(pkey_reg_restrictive_default());
/* After we disable MPK 0, we can't access the stack to return */
return NULL;
@@ -133,6 +180,7 @@ static void *thread_segv_maperr_ptr(void *ptr)
{
stack_t *stack = ptr;
int *bad = (int *)1;
+ u64 pkey_reg;
/*
* Setup alternate signal stack, which should be pkey_mprotect()ed by
@@ -142,7 +190,9 @@ static void *thread_segv_maperr_ptr(void *ptr)
syscall_raw(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
/* Disable MPK 0. Only MPK 1 is enabled. */
- __write_pkey_reg(0x55555551);
+ pkey_reg = pkey_reg_restrictive_default();
+ pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED);
+ __write_pkey_reg(pkey_reg);
/* Segfault */
*bad = 1;
@@ -240,6 +290,7 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
int pkey;
int parent_pid = 0;
int child_pid = 0;
+ u64 pkey_reg;
sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
@@ -257,7 +308,10 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
assert(stack != MAP_FAILED);
/* Allow access to MPK 0 and MPK 1 */
- __write_pkey_reg(0x55555550);
+ pkey_reg = pkey_reg_restrictive_default();
+ pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
+ pkey_reg = set_pkey_bits(pkey_reg, 1, PKEY_UNRESTRICTED);
+ __write_pkey_reg(pkey_reg);
/* Protect the new stack with MPK 1 */
pkey = pkey_alloc(0, 0);
@@ -272,14 +326,13 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
memset(&siginfo, 0, sizeof(siginfo));
/* Use clone to avoid newer glibcs using rseq on new threads */
- long ret = syscall_raw(SYS_clone,
- CLONE_VM | CLONE_FS | CLONE_FILES |
- CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
- CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
- CLONE_DETACHED,
- (long) ((char *)(stack) + STACK_SIZE),
- (long) &parent_pid,
- (long) &child_pid, 0, 0);
+ long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES |
+ CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+ CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
+ CLONE_DETACHED,
+ stack + STACK_SIZE,
+ &parent_pid,
+ &child_pid);
if (ret < 0) {
errno = -ret;
@@ -307,7 +360,13 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void)
static void test_pkru_preserved_after_sigusr1(void)
{
struct sigaction sa;
- unsigned long pkru = 0x45454544;
+ u64 pkey_reg;
+
+ /* Allow access to MPK 0 and an arbitrary set of keys */
+ pkey_reg = pkey_reg_restrictive_default();
+ pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
+ pkey_reg = set_pkey_bits(pkey_reg, 3, PKEY_UNRESTRICTED);
+ pkey_reg = set_pkey_bits(pkey_reg, 7, PKEY_UNRESTRICTED);
sa.sa_flags = SA_SIGINFO;
@@ -320,7 +379,7 @@ static void test_pkru_preserved_after_sigusr1(void)
memset(&siginfo, 0, sizeof(siginfo));
- __write_pkey_reg(pkru);
+ __write_pkey_reg(pkey_reg);
raise(SIGUSR1);
@@ -330,7 +389,7 @@ static void test_pkru_preserved_after_sigusr1(void)
pthread_mutex_unlock(&mutex);
/* Ensure the pkru value is the same after returning from signal. */
- ksft_test_result(pkru == __read_pkey_reg() &&
+ ksft_test_result(pkey_reg == __read_pkey_reg() &&
siginfo.si_signo == SIGUSR1,
"%s\n", __func__);
}
@@ -347,6 +406,7 @@ static noinline void *thread_sigusr2_self(void *ptr)
'S', 'I', 'G', 'U', 'S', 'R', '2',
'.', '.', '.', '\n', '\0'};
stack_t *stack = ptr;
+ u64 pkey_reg;
/*
* Setup alternate signal stack, which should be pkey_mprotect()ed by
@@ -356,7 +416,9 @@ static noinline void *thread_sigusr2_self(void *ptr)
syscall(SYS_sigaltstack, (long)stack, 0, 0, 0, 0, 0);
/* Disable MPK 0. Only MPK 2 is enabled. */
- __write_pkey_reg(0x55555545);
+ pkey_reg = pkey_reg_restrictive_default();
+ pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED);
+ __write_pkey_reg(pkey_reg);
raise_sigusr2();
@@ -384,6 +446,7 @@ static void test_pkru_sigreturn(void)
int pkey;
int parent_pid = 0;
int child_pid = 0;
+ u64 pkey_reg;
sa.sa_handler = SIG_DFL;
sa.sa_flags = 0;
@@ -418,7 +481,10 @@ static void test_pkru_sigreturn(void)
* the current thread's stack is protected by the default MPK 0. Hence
* both need to be enabled.
*/
- __write_pkey_reg(0x55555544);
+ pkey_reg = pkey_reg_restrictive_default();
+ pkey_reg = set_pkey_bits(pkey_reg, 0, PKEY_UNRESTRICTED);
+ pkey_reg = set_pkey_bits(pkey_reg, 2, PKEY_UNRESTRICTED);
+ __write_pkey_reg(pkey_reg);
/* Protect the stack with MPK 2 */
pkey = pkey_alloc(0, 0);
@@ -431,14 +497,13 @@ static void test_pkru_sigreturn(void)
sigstack.ss_size = STACK_SIZE;
/* Use clone to avoid newer glibcs using rseq on new threads */
- long ret = syscall_raw(SYS_clone,
- CLONE_VM | CLONE_FS | CLONE_FILES |
- CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
- CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
- CLONE_DETACHED,
- (long) ((char *)(stack) + STACK_SIZE),
- (long) &parent_pid,
- (long) &child_pid, 0, 0);
+ long ret = clone_raw(CLONE_VM | CLONE_FS | CLONE_FILES |
+ CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM |
+ CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID |
+ CLONE_DETACHED,
+ stack + STACK_SIZE,
+ &parent_pid,
+ &child_pid);
if (ret < 0) {
errno = -ret;
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 217d8b7..59fe07e 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -19,6 +19,7 @@
msg_oob
msg_zerocopy
ncdevmem
+netlink-dumps
nettest
psock_fanout
psock_snd
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 649f1fe..5e86f7a 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -78,6 +78,7 @@
TEST_GEN_FILES += io_uring_zerocopy_tx
TEST_PROGS += io_uring_zerocopy_tx.sh
TEST_GEN_FILES += bind_bhash
+TEST_GEN_PROGS += netlink-dumps
TEST_GEN_PROGS += sk_bind_sendto_listen
TEST_GEN_PROGS += sk_connect_zero_addr
TEST_GEN_PROGS += sk_so_peek_off
diff --git a/tools/testing/selftests/net/netlink-dumps.c b/tools/testing/selftests/net/netlink-dumps.c
new file mode 100644
index 0000000..7ee6dcd
--- /dev/null
+++ b/tools/testing/selftests/net/netlink-dumps.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <linux/genetlink.h>
+#include <linux/netlink.h>
+#include <linux/mqueue.h>
+
+#include "../kselftest_harness.h"
+
+static const struct {
+ struct nlmsghdr nlhdr;
+ struct genlmsghdr genlhdr;
+ struct nlattr ahdr;
+ __u16 val;
+ __u16 pad;
+} dump_policies = {
+ .nlhdr = {
+ .nlmsg_len = sizeof(dump_policies),
+ .nlmsg_type = GENL_ID_CTRL,
+ .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP,
+ .nlmsg_seq = 1,
+ },
+ .genlhdr = {
+ .cmd = CTRL_CMD_GETPOLICY,
+ .version = 2,
+ },
+ .ahdr = {
+ .nla_len = 6,
+ .nla_type = CTRL_ATTR_FAMILY_ID,
+ },
+ .val = GENL_ID_CTRL,
+ .pad = 0,
+};
+
+// Sanity check for the test itself, make sure the dump doesn't fit in one msg
+TEST(test_sanity)
+{
+ int netlink_sock;
+ char buf[8192];
+ ssize_t n;
+
+ netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+ ASSERT_GE(netlink_sock, 0);
+
+ n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0);
+ ASSERT_EQ(n, sizeof(dump_policies));
+
+ n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
+ ASSERT_GE(n, sizeof(struct nlmsghdr));
+
+ n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT);
+ ASSERT_GE(n, sizeof(struct nlmsghdr));
+
+ close(netlink_sock);
+}
+
+TEST(close_in_progress)
+{
+ int netlink_sock;
+ ssize_t n;
+
+ netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+ ASSERT_GE(netlink_sock, 0);
+
+ n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0);
+ ASSERT_EQ(n, sizeof(dump_policies));
+
+ close(netlink_sock);
+}
+
+TEST(close_with_ref)
+{
+ char cookie[NOTIFY_COOKIE_LEN] = {};
+ int netlink_sock, mq_fd;
+ struct sigevent sigev;
+ ssize_t n;
+
+ netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+ ASSERT_GE(netlink_sock, 0);
+
+ n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0);
+ ASSERT_EQ(n, sizeof(dump_policies));
+
+ mq_fd = syscall(__NR_mq_open, "sed", O_CREAT | O_WRONLY, 0600, 0);
+ ASSERT_GE(mq_fd, 0);
+
+ memset(&sigev, 0, sizeof(sigev));
+ sigev.sigev_notify = SIGEV_THREAD;
+ sigev.sigev_value.sival_ptr = cookie;
+ sigev.sigev_signo = netlink_sock;
+
+ syscall(__NR_mq_notify, mq_fd, &sigev);
+
+ close(netlink_sock);
+
+ // give mqueue time to fire
+ usleep(100 * 1000);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 8de98ea..e92e0b88 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -130,9 +130,9 @@
QEMU_ARGS_ppc64 = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_ppc64le = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
-QEMU_ARGS_s390 = -M s390-ccw-virtio -m 1G -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
-QEMU_ARGS = $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_BIOS) $(QEMU_ARGS_EXTRA)
+QEMU_ARGS = -m 1G $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_BIOS) $(QEMU_ARGS_EXTRA)
# OUTPUT is only set when run from the main makefile, otherwise
# it defaults to this nolibc directory.
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c
index c62564c..ce413a2 100644
--- a/tools/testing/selftests/pidfd/pidfd_open_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_open_test.c
@@ -13,6 +13,7 @@
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
+#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/wait.h>
@@ -21,6 +22,32 @@
#include "pidfd.h"
#include "../kselftest.h"
+#ifndef PIDFS_IOCTL_MAGIC
+#define PIDFS_IOCTL_MAGIC 0xFF
+#endif
+
+#ifndef PIDFD_GET_INFO
+#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info)
+#define PIDFD_INFO_CGROUPID (1UL << 0)
+
+struct pidfd_info {
+ __u64 request_mask;
+ __u64 cgroupid;
+ __u32 pid;
+ __u32 tgid;
+ __u32 ppid;
+ __u32 ruid;
+ __u32 rgid;
+ __u32 euid;
+ __u32 egid;
+ __u32 suid;
+ __u32 sgid;
+ __u32 fsuid;
+ __u32 fsgid;
+ __u32 spare0[1];
+};
+#endif
+
static int safe_int(const char *numstr, int *converted)
{
char *err = NULL;
@@ -120,10 +147,13 @@ static pid_t get_pid_from_fdinfo_file(int pidfd, const char *key, size_t keylen)
int main(int argc, char **argv)
{
+ struct pidfd_info info = {
+ .request_mask = PIDFD_INFO_CGROUPID,
+ };
int pidfd = -1, ret = 1;
pid_t pid;
- ksft_set_plan(3);
+ ksft_set_plan(4);
pidfd = sys_pidfd_open(-1, 0);
if (pidfd >= 0) {
@@ -153,6 +183,56 @@ int main(int argc, char **argv)
pid = get_pid_from_fdinfo_file(pidfd, "Pid:", sizeof("Pid:") - 1);
ksft_print_msg("pidfd %d refers to process with pid %d\n", pidfd, pid);
+ if (ioctl(pidfd, PIDFD_GET_INFO, &info) < 0) {
+ ksft_print_msg("%s - failed to get info from pidfd\n", strerror(errno));
+ goto on_error;
+ }
+ if (info.pid != pid) {
+ ksft_print_msg("pid from fdinfo file %d does not match pid from ioctl %d\n",
+ pid, info.pid);
+ goto on_error;
+ }
+ if (info.ppid != getppid()) {
+ ksft_print_msg("ppid %d does not match ppid from ioctl %d\n",
+ pid, info.pid);
+ goto on_error;
+ }
+ if (info.ruid != getuid()) {
+ ksft_print_msg("uid %d does not match uid from ioctl %d\n",
+ getuid(), info.ruid);
+ goto on_error;
+ }
+ if (info.rgid != getgid()) {
+ ksft_print_msg("gid %d does not match gid from ioctl %d\n",
+ getgid(), info.rgid);
+ goto on_error;
+ }
+ if (info.euid != geteuid()) {
+ ksft_print_msg("euid %d does not match euid from ioctl %d\n",
+ geteuid(), info.euid);
+ goto on_error;
+ }
+ if (info.egid != getegid()) {
+ ksft_print_msg("egid %d does not match egid from ioctl %d\n",
+ getegid(), info.egid);
+ goto on_error;
+ }
+ if (info.suid != geteuid()) {
+ ksft_print_msg("suid %d does not match suid from ioctl %d\n",
+ geteuid(), info.suid);
+ goto on_error;
+ }
+ if (info.sgid != getegid()) {
+ ksft_print_msg("sgid %d does not match sgid from ioctl %d\n",
+ getegid(), info.sgid);
+ goto on_error;
+ }
+ if ((info.request_mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) {
+ ksft_print_msg("cgroupid should not be 0 when PIDFD_INFO_CGROUPID is set\n");
+ goto on_error;
+ }
+ ksft_test_result_pass("get info from pidfd test: passed\n");
+
ret = 0;
on_error:
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json b/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json
index 24bd0c2..b2ca9d4 100644
--- a/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json
@@ -329,5 +329,29 @@
"teardown": [
"$TC qdisc del dev $DEV1 parent root drr"
]
+ },
+ {
+ "id": "1234",
+ "name": "Exercise IDR leaks by creating/deleting a filter many (2048) times",
+ "category": [
+ "filter",
+ "u32"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 parent root handle 10: drr",
+ "$TC filter add dev $DEV1 parent 10:0 protocol ip prio 2 u32 match ip src 0.0.0.2/32 action drop",
+ "$TC filter add dev $DEV1 parent 10:0 protocol ip prio 3 u32 match ip src 0.0.0.3/32 action drop"
+ ],
+ "cmdUnderTest": "bash -c 'for i in {1..2048} ;do echo filter delete dev $DEV1 pref 3;echo filter add dev $DEV1 parent 10:0 protocol ip prio 3 u32 match ip src 0.0.0.3/32 action drop;done | $TC -b -'",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1",
+ "matchPattern": "protocol ip pref 3 u32",
+ "matchCount": "3",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 parent root drr"
+ ]
}
]
diff --git a/tools/thermal/lib/Makefile b/tools/thermal/lib/Makefile
index 82db451..f2552f7 100644
--- a/tools/thermal/lib/Makefile
+++ b/tools/thermal/lib/Makefile
@@ -3,7 +3,7 @@
LIBTHERMAL_TOOLS_VERSION = 0
LIBTHERMAL_TOOLS_PATCHLEVEL = 0
-LIBTHERMAL_TOOLS_EXTRAVERSION = 1
+LIBTHERMAL_TOOLS_EXTRAVERSION = 2
MAKEFLAGS += --no-print-directory
diff --git a/tools/thermal/thermal-engine/thermal-engine.c b/tools/thermal/thermal-engine/thermal-engine.c
index 9b1476a..0764dc7 100644
--- a/tools/thermal/thermal-engine/thermal-engine.c
+++ b/tools/thermal/thermal-engine/thermal-engine.c
@@ -38,6 +38,14 @@ struct thermal_data {
struct thermal_handler *th;
};
+static int show_threshold(struct thermal_threshold *th, __maybe_unused void *arg)
+{
+ INFO("threshold temp=%d, direction=%d\n",
+ th->temperature, th->direction);
+
+ return 0;
+}
+
static int show_trip(struct thermal_trip *tt, __maybe_unused void *arg)
{
INFO("trip id=%d, type=%d, temp=%d, hyst=%d\n",
@@ -70,6 +78,8 @@ static int show_tz(struct thermal_zone *tz, __maybe_unused void *arg)
for_each_thermal_trip(tz->trip, show_trip, NULL);
+ for_each_thermal_threshold(tz->thresholds, show_threshold, NULL);
+
show_temp(tz, arg);
show_governor(tz, arg);
@@ -77,6 +87,30 @@ static int show_tz(struct thermal_zone *tz, __maybe_unused void *arg)
return 0;
}
+static int set_threshold(struct thermal_zone *tz, __maybe_unused void *arg)
+{
+ struct thermal_handler *th = arg;
+ int thresholds[] = { 43000, 65000, 49000, 55000, 57000 };
+ size_t i;
+
+ INFO("Setting threshold for thermal zone '%s', id=%d\n", tz->name, tz->id);
+
+ if (thermal_cmd_threshold_flush(th, tz)) {
+ ERROR("Failed to flush all previous thresholds\n");
+ return -1;
+ }
+
+ for (i = 0; i < sizeof(thresholds) / sizeof(thresholds[0]); i++)
+ if (thermal_cmd_threshold_add(th, tz, thresholds[i],
+ THERMAL_THRESHOLD_WAY_UP |
+ THERMAL_THRESHOLD_WAY_DOWN)) {
+ ERROR("Failed to set threshold\n");
+ return -1;
+ }
+
+ return 0;
+}
+
static int tz_create(const char *name, int tz_id, __maybe_unused void *arg)
{
INFO("Thermal zone '%s'/%d created\n", name, tz_id);
@@ -197,20 +231,62 @@ static int gov_change(int tz_id, const char *name, __maybe_unused void *arg)
return 0;
}
+static int threshold_add(int tz_id, int temp, int direction, __maybe_unused void *arg)
+{
+ INFO("Threshold added tz_id=%d: temp=%d, direction=%d\n", tz_id, temp, direction);
+
+ return 0;
+}
+
+static int threshold_delete(int tz_id, int temp, int direction, __maybe_unused void *arg)
+{
+ INFO("Threshold deleted tz_id=%d: temp=%d, direction=%d\n", tz_id, temp, direction);
+
+ return 0;
+}
+
+static int threshold_flush(int tz_id, __maybe_unused void *arg)
+{
+ INFO("Thresholds flushed tz_id=%d\n", tz_id);
+
+ return 0;
+}
+
+static int threshold_up(int tz_id, int temp, int prev_temp, __maybe_unused void *arg)
+{
+ INFO("Threshold crossed way up tz_id=%d: temp=%d, prev_temp=%d\n",
+ tz_id, temp, prev_temp);
+
+ return 0;
+}
+
+static int threshold_down(int tz_id, int temp, int prev_temp, __maybe_unused void *arg)
+{
+ INFO("Threshold crossed way down tz_id=%d: temp=%d, prev_temp=%d\n",
+ tz_id, temp, prev_temp);
+
+ return 0;
+}
+
static struct thermal_ops ops = {
- .events.tz_create = tz_create,
- .events.tz_delete = tz_delete,
- .events.tz_disable = tz_disable,
- .events.tz_enable = tz_enable,
- .events.trip_high = trip_high,
- .events.trip_low = trip_low,
- .events.trip_add = trip_add,
- .events.trip_delete = trip_delete,
- .events.trip_change = trip_change,
- .events.cdev_add = cdev_add,
- .events.cdev_delete = cdev_delete,
- .events.cdev_update = cdev_update,
- .events.gov_change = gov_change
+ .events.tz_create = tz_create,
+ .events.tz_delete = tz_delete,
+ .events.tz_disable = tz_disable,
+ .events.tz_enable = tz_enable,
+ .events.trip_high = trip_high,
+ .events.trip_low = trip_low,
+ .events.trip_add = trip_add,
+ .events.trip_delete = trip_delete,
+ .events.trip_change = trip_change,
+ .events.cdev_add = cdev_add,
+ .events.cdev_delete = cdev_delete,
+ .events.cdev_update = cdev_update,
+ .events.gov_change = gov_change,
+ .events.threshold_add = threshold_add,
+ .events.threshold_delete = threshold_delete,
+ .events.threshold_flush = threshold_flush,
+ .events.threshold_up = threshold_up,
+ .events.threshold_down = threshold_down,
};
static int thermal_event(__maybe_unused int fd, __maybe_unused void *arg)
@@ -280,6 +356,7 @@ enum {
THERMAL_ENGINE_DAEMON_ERROR,
THERMAL_ENGINE_LOG_ERROR,
THERMAL_ENGINE_THERMAL_ERROR,
+ THERMAL_ENGINE_THRESHOLD_ERROR,
THERMAL_ENGINE_MAINLOOP_ERROR,
};
@@ -318,6 +395,8 @@ int main(int argc, char *argv[])
return THERMAL_ENGINE_THERMAL_ERROR;
}
+ for_each_thermal_zone(td.tz, set_threshold, td.th);
+
for_each_thermal_zone(td.tz, show_tz, td.th);
if (mainloop_init()) {
diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c
index 43d3a6a..b959122 100644
--- a/tools/virtio/vringh_test.c
+++ b/tools/virtio/vringh_test.c
@@ -519,7 +519,7 @@ int main(int argc, char *argv[])
errx(1, "virtqueue_add_sgs: %i", err);
__kmalloc_fake = NULL;
- /* Host retreives it. */
+ /* Host retrieves it. */
vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 6b390b62..249ba5b 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -304,7 +304,6 @@ static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
struct kvm_kernel_irqfd *irqfd, *tmp;
- struct fd f;
struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
int ret;
__poll_t events;
@@ -327,8 +326,8 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);
- f = fdget(args->fd);
- if (!fd_file(f)) {
+ CLASS(fd, f)(args->fd);
+ if (fd_empty(f)) {
ret = -EBADF;
goto out;
}
@@ -336,7 +335,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
eventfd = eventfd_ctx_fileget(fd_file(f));
if (IS_ERR(eventfd)) {
ret = PTR_ERR(eventfd);
- goto fail;
+ goto out;
}
irqfd->eventfd = eventfd;
@@ -440,12 +439,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
#endif
srcu_read_unlock(&kvm->irq_srcu, idx);
-
- /*
- * do not drop the file until the irqfd is fully initialized, otherwise
- * we might race against the EPOLLHUP
- */
- fdput(f);
return 0;
fail:
@@ -458,8 +451,6 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
if (eventfd && !IS_ERR(eventfd))
eventfd_ctx_put(eventfd);
- fdput(f);
-
out:
kfree(irqfd);
return ret;
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index 388ae47..72aa1fd 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -190,11 +190,10 @@ static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd)
{
struct kvm_vfio *kv = dev->private;
struct kvm_vfio_file *kvf;
- struct fd f;
+ CLASS(fd, f)(fd);
int ret;
- f = fdget(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
ret = -ENOENT;
@@ -220,9 +219,6 @@ static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd)
kvm_vfio_update_coherency(dev);
mutex_unlock(&kv->lock);
-
- fdput(f);
-
return ret;
}
@@ -233,14 +229,13 @@ static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev,
struct kvm_vfio_spapr_tce param;
struct kvm_vfio *kv = dev->private;
struct kvm_vfio_file *kvf;
- struct fd f;
int ret;
if (copy_from_user(¶m, arg, sizeof(struct kvm_vfio_spapr_tce)))
return -EFAULT;
- f = fdget(param.groupfd);
- if (!fd_file(f))
+ CLASS(fd, f)(param.groupfd);
+ if (fd_empty(f))
return -EBADF;
ret = -ENOENT;
@@ -266,7 +261,6 @@ static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev,
err_fdput:
mutex_unlock(&kv->lock);
- fdput(f);
return ret;
}
#endif