blob: 5b9772cdf2651bb924e1c68d50eea83d9c1a3e96 [file] [log] [blame]
// SPDX-License-Identifier: LGPL-2.1
/*
* rseq.c
*
* Copyright (C) 2016 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; only
* version 2.1 of the License.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*/
#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <syscall.h>
#include <assert.h>
#include <signal.h>
#include <limits.h>
#include <dlfcn.h>
#include <stddef.h>
#include <sys/auxv.h>
#include <linux/auxvec.h>
#include <linux/compiler.h>
#include "../kselftest.h"
#include "rseq.h"
/*
* Define weak versions to play nice with binaries that are statically linked
* against a libc that doesn't support registering its own rseq.
*/
__weak ptrdiff_t __rseq_offset;
__weak unsigned int __rseq_size;
__weak unsigned int __rseq_flags;
/*
* Pointers through which the libc rseq symbols are read. They start out
* pointing at the weak definitions above; rseq_init() may replace them with
* the results of dlsym() lookups.
*/
static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
static const unsigned int *libc_rseq_size_p = &__rseq_size;
static const unsigned int *libc_rseq_flags_p = &__rseq_flags;
/* Offset from the thread pointer to the rseq area. */
ptrdiff_t rseq_offset;
/*
* Size of the registered rseq area. 0 if the registration was
* unsuccessful. Initialized to -1U; rseq_init() sets it to 0 when the
* rseq system call is not available.
*/
unsigned int rseq_size = -1U;
/* Flags used during rseq registration. */
unsigned int rseq_flags;
/*
* Non-zero when this file, rather than libc, is responsible for rseq
* registration. Set by rseq_init() and cleared by rseq_exit().
*/
static int rseq_ownership;
static int rseq_reg_success; /* At least one rseq registration has succeeded. */
/*
* Allocate a large area for the TLS. The same value is used as the alignment
* of __rseq_abi and as the upper bound accepted for the auxiliary vector
* values read by get_rseq_kernel_feature_size().
*/
#define RSEQ_THREAD_AREA_ALLOC_SIZE 1024
/* Original struct rseq feature size is 20 bytes. */
#define ORIG_RSEQ_FEATURE_SIZE 20
/* Original struct rseq allocation size is 32 bytes. */
#define ORIG_RSEQ_ALLOC_SIZE 32
/*
* Per-thread rseq area handed to the rseq system call when this file performs
* the registration itself. cpu_id starts out as
* RSEQ_ABI_CPU_ID_UNINITIALIZED.
*/
static
__thread struct rseq_abi __rseq_abi __attribute__((tls_model("initial-exec"), aligned(RSEQ_THREAD_AREA_ALLOC_SIZE))) = {
.cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
};
/*
 * Invoke the raw rseq system call.
 *
 * All four arguments are forwarded unchanged. The return value is the one
 * produced by syscall(), narrowed to int.
 */
static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
		    int flags, uint32_t sig)
{
	long ret = syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);

	return ret;
}
/*
 * Invoke the raw getcpu system call.
 *
 * @cpu and @node are forwarded as the first two arguments; the third
 * argument is always NULL. The return value is the one produced by
 * syscall(), narrowed to int.
 */
static int sys_getcpu(unsigned *cpu, unsigned *node)
{
	long ret = syscall(__NR_getcpu, cpu, node, NULL);

	return ret;
}
/*
 * Probe whether the rseq system call exists.
 *
 * The call is issued with a NULL area and zero length, so it is expected
 * to fail. The errno value tells the two cases apart:
 *
 *   ENOSYS: return 0 (no rseq system call)
 *   EINVAL: return 1 (rseq system call present)
 *
 * Any other outcome, including success, aborts the process.
 */
int rseq_available(void)
{
	if (sys_rseq(NULL, 0, 0, 0) != -1)
		abort();

	if (errno == ENOSYS)
		return 0;
	if (errno == EINVAL)
		return 1;

	abort();
}
/*
 * Return the length to pass to the rseq system call: rseq_size, raised to
 * ORIG_RSEQ_ALLOC_SIZE (32 bytes) when it is smaller than that.
 *
 * NOTE(review): rseq_size is used as-is, so while it still holds its -1U
 * initial value that value is returned unchanged.
 */
static
unsigned int get_rseq_min_alloc_size(void)
{
	return rseq_size < ORIG_RSEQ_ALLOC_SIZE ? ORIG_RSEQ_ALLOC_SIZE : rseq_size;
}
/*
 * Report the rseq feature size advertised by the kernel through the
 * auxiliary vector.
 *
 * getauxval(AT_RSEQ_FEATURE_SIZE) == 0: ORIG_RSEQ_FEATURE_SIZE (20) is
 * returned. Any other value is returned as-is.
 *
 * Both AT_RSEQ_ALIGN and AT_RSEQ_FEATURE_SIZE are asserted to be either 0
 * or no larger than RSEQ_THREAD_AREA_ALLOC_SIZE.
 *
 * The result is expected never to be below ORIG_RSEQ_FEATURE_SIZE.
 */
static
unsigned int get_rseq_kernel_feature_size(void)
{
	unsigned long auxv_rseq_align = getauxval(AT_RSEQ_ALIGN);

	assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE);

	unsigned long auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);

	assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE);

	return auxv_rseq_feature_size ? auxv_rseq_feature_size
				      : ORIG_RSEQ_FEATURE_SIZE;
}
/*
 * Register the calling thread's rseq area with the kernel.
 *
 * Returns 0 when libc owns the registration (nothing is done) or when the
 * system call succeeds, and -1 when the system call fails. A failure after
 * an earlier successful registration in this process is treated as
 * incoherent and aborts.
 */
int rseq_register_current_thread(void)
{
	/* Treat libc's ownership as a successful registration. */
	if (!rseq_ownership)
		return 0;

	if (sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG) == 0) {
		assert(rseq_current_cpu_raw() >= 0);
		/* Remember that registration works in this process. */
		RSEQ_WRITE_ONCE(rseq_reg_success, 1);
		return 0;
	}

	/* Incoherent success/failure within process. */
	if (RSEQ_READ_ONCE(rseq_reg_success))
		abort();

	return -1;
}
/*
 * Unregister the calling thread's rseq area.
 *
 * Returns 0 when libc owns the registration (nothing is done) or when the
 * system call succeeds, and -1 when the system call reports a failure.
 */
int rseq_unregister_current_thread(void)
{
	/* Treat libc's ownership as a successful unregistration. */
	if (!rseq_ownership)
		return 0;

	return sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(),
			RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG) ? -1 : 0;
}
/*
 * Constructor: decide whether libc or this file owns rseq registration and
 * publish rseq_offset, rseq_size and rseq_flags accordingly.
 *
 * - libc owns registration (its __rseq_size is non-zero): the three values
 *   are taken from the libc symbols, with the 20 and 32 byte sizes treated
 *   as special cases (see below). rseq_ownership stays 0.
 * - Otherwise this file takes ownership. If the rseq system call is not
 *   available, rseq_size is set to 0. If it is available, rseq_offset is
 *   computed for __rseq_abi and rseq_size is set to the feature size
 *   supported by the kernel.
 */
static __attribute__((constructor))
void rseq_init(void)
{
	/*
	 * If the libc's registered rseq size isn't already valid, it may be
	 * because the binary is dynamically linked and not necessarily due to
	 * libc not having registered a restartable sequence. Try to find the
	 * symbols if that's the case.
	 */
	if (!*libc_rseq_size_p) {
		libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
		libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
		libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
	}
	if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
			*libc_rseq_size_p != 0) {
		unsigned int libc_rseq_size;

		/* rseq registration owned by glibc */
		rseq_offset = *libc_rseq_offset_p;
		libc_rseq_size = *libc_rseq_size_p;
		rseq_flags = *libc_rseq_flags_p;
		/*
		 * Previous versions of glibc expose the value
		 * 32 even though the kernel only supported 20
		 * bytes initially. Therefore treat 32 as a
		 * special-case. glibc 2.40 exposes a 20 bytes
		 * __rseq_size without using getauxval(3) to
		 * query the supported size, while still allocating a 32
		 * bytes area. Also treat 20 as a special-case.
		 *
		 * Special-cases are handled by using the following
		 * value as active feature set size:
		 *
		 * rseq_size = min(32, get_rseq_kernel_feature_size())
		 */
		switch (libc_rseq_size) {
		case ORIG_RSEQ_FEATURE_SIZE:
			fallthrough;
		case ORIG_RSEQ_ALLOC_SIZE:
		{
			unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size();

			if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE)
				rseq_size = rseq_kernel_feature_size;
			else
				rseq_size = ORIG_RSEQ_ALLOC_SIZE;
			break;
		}
		default:
			/* Otherwise just use the __rseq_size from libc as rseq_size. */
			rseq_size = libc_rseq_size;
			break;
		}
		return;
	}
	rseq_ownership = 1;
	if (!rseq_available()) {
		rseq_size = 0;
		return;
	}
	rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer();
	rseq_flags = 0;
	/*
	 * Replace the -1U initial value with the feature size supported by
	 * the kernel. Leaving -1U in place would make
	 * get_rseq_min_alloc_size() hand 0xFFFFFFFF to the rseq system call
	 * as the area length, so every registration attempt would fail.
	 */
	rseq_size = get_rseq_kernel_feature_size();
}
/*
 * Destructor: when this file owns rseq registration, put the published
 * state back to "not initialized" (offset 0, size -1U) and drop ownership.
 * Nothing is touched when libc owns the registration.
 */
static __attribute__((destructor))
void rseq_exit(void)
{
	if (rseq_ownership) {
		rseq_ownership = 0;
		rseq_size = -1U;
		rseq_offset = 0;
	}
}
/*
 * Fallback used to obtain the current CPU number through sched_getcpu().
 *
 * Returns the non-negative CPU number. If sched_getcpu() reports an error,
 * the error is printed with perror() and the process aborts.
 */
int32_t rseq_fallback_current_cpu(void)
{
	const int32_t cpu = sched_getcpu();

	if (cpu >= 0)
		return cpu;

	perror("sched_getcpu()");
	abort();
}
/*
 * Fallback used to obtain the current node number through the getcpu
 * system call.
 *
 * Returns the node number on success. On failure the error is printed with
 * perror() and the non-zero value returned by sys_getcpu() is passed back
 * to the caller.
 */
int32_t rseq_fallback_current_node(void)
{
	uint32_t cpu_id, node_id;
	int ret = sys_getcpu(&cpu_id, &node_id);

	if (ret == 0)
		return (int32_t) node_id;

	perror("sys_getcpu()");
	return ret;
}