// SPDX-License-Identifier: BSD-2-Clause
/*
 * Copyright (c) 2020, Huawei Technologies Co., Ltd
 */

/*
 * Support for Thread-Local Storage (TLS) ABIs for ARMv7/Aarch32 and Aarch64.
 *
 * TAs are currently single-threaded, so the only benefit of implementing these
 * ABIs is to support toolchains that need them even when the target program is
 * single-threaded, such as the g++ compiler from the GCC toolchains targeting
 * a "POSIX thread" Linux runtime, which OP-TEE has been using for quite some
 * time (arm-linux-gnueabihf-* and aarch64-linux-gnu-*). This allows building
 * C++ TAs without having to build a specific toolchain with --disable-threads.
 *
 * This implementation is based on [1].
 *
 * - "TLS data structures variant 1" (section 3): the Aarch64 compiler uses the
 *   TPIDR_EL0 register to access TLS data directly. This assumes a specific
 *   layout for the TCB, and (for shared objects) the use of
 *   R_AARCH64_TLS_TPREL relocations.
 * - The "General Dynamic access model" (section 4.1): the ARMv7/Aarch32
 *   compiler inserts calls to the __tls_get_addr() function, which has to be
 *   implemented by the runtime library. The function takes a module ID and an
 *   offset parameter, which are provided thanks to R_ARM_TLS_DTPMOD32 and
 *   R_ARM_TLS_DTPOFF32 relocations.
 *
 * In addition, dl_iterate_phdr() is implemented here because it is used by the
 * g++ Aarch64 exception handling code and it uses the TCB to provide TLS
 * information to the caller.
 *
 * [1] "ELF Handling For Thread-Local Storage"
 *     https://www.akkadia.org/drepper/tls.pdf
 */

#include <arm_user_sysreg.h>	/* write_tpidr_el0(); header name assumed */
#include <assert.h>
#include <link.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <trace.h>
#include <types_ext.h>

#include "user_ta_header.h"

/*
 * DTV - Dynamic Thread Vector
 *
 * Maintains an array of pointers to TLS data for each module in the TCB. Each
 * module that has a TLS segment has an entry (and consequently, some space in
 * the tcb_head::tls buffer). The index is the "module ID".
 * dtv[0].size is the number of modules
 * dtv[1].tls points to the TLS block of the main executable (may be NULL)
 * dtv[2 .. size].tls point to the TLS blocks of the shared libraries
 */
union dtv {
	unsigned long size;
	uint8_t *tls;
};

#define DTV_SIZE(size) (sizeof(union dtv) + (size))

/* Thread Control Block */
struct tcb_head {
	/* Two words are reserved as per the "TLS variant 1" ABI */
	union dtv *dtv;
	unsigned long reserved;
	/*
	 * The rest of the structure contains the TLS blocks for each ELF
	 * module having a PT_TLS segment. Each block is a copy of the .tdata
	 * section plus some zero-initialized space for .tbss.
	 */
	uint8_t tls[];
};

/*
 * Since TAs are single-threaded, only one TCB is needed. This would need to
 * change if multi-threading is introduced.
 */
static struct tcb_head *_tcb;
static size_t _tls_size;

#define TCB_SIZE(tls_size) (sizeof(*_tcb) + (tls_size))
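/*
 * Illustration (hypothetical module set, not taken from a real TA): for a TA
 * linked against one shared library, both having a PT_TLS segment, the
 * structures built by __utee_tcb_init() below end up as:
 *
 *   dtv[0].size = 2
 *   dtv[1].tls  = _tcb->tls                      (TA: .tdata copy, then .tbss)
 *   dtv[2].tls  = _tcb->tls + <TA's p_memsz>     (library: .tdata, then .tbss)
 *
 * On Aarch64, TPIDR_EL0 points to _tcb and the static offsets produced by
 * R_AARCH64_TLS_TPREL relocations land inside the tls[] buffer. On
 * ARMv7/Aarch32, __tls_get_addr() resolves a { module, offset } pair through
 * this DTV.
 */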
/*
 * Initialize or update the TCB.
 * Called on application initialization and when additional shared objects are
 * loaded via dlopen().
 */
void __utee_tcb_init(void)
{
	struct dl_phdr_info *dlpi = NULL;
	const Elf_Phdr *phdr = NULL;
	union dtv *dtv = NULL;
	size_t total_size = 0;
	size_t size = 0;
	size_t i = 0;
	size_t j = 0;

	/* Compute the size needed for all the TLS blocks */
	for (i = 0; i < __elf_phdr_info.count; i++) {
		dlpi = __elf_phdr_info.dlpi + i;
		for (j = 0; j < dlpi->dlpi_phnum; j++) {
			phdr = dlpi->dlpi_phdr + j;
			if (phdr->p_type == PT_TLS) {
				total_size += phdr->p_memsz;
				break;
			}
		}
	}

	/* ELF modules currently cannot be unmapped */
	assert(total_size >= _tls_size);
	if (total_size == _tls_size)
		return;

	/*
	 * (Re-)allocate the TCB. Save the current DTV pointer first: on the
	 * very first call _tcb is NULL and the freshly allocated tcb_head is
	 * uninitialized, so _tcb->dtv must not be read after the realloc().
	 */
	if (_tcb)
		dtv = _tcb->dtv;
	_tcb = realloc(_tcb, TCB_SIZE(total_size));
	if (!_tcb) {
		EMSG("TCB allocation failed (%zu bytes)",
		     TCB_SIZE(total_size));
		abort();
	}

	/* (Re-)allocate the DTV. + 1 since dtv[0] holds the size */
	size = DTV_SIZE((__elf_phdr_info.count + 1) * sizeof(union dtv));
	dtv = realloc(dtv, size);
	if (!dtv) {
		EMSG("DTV allocation failed (%zu bytes)", size);
		abort();
	}
	_tcb->dtv = dtv;

	/* Copy TLS data to the TCB */
	size = 0;
	for (i = 0; i < __elf_phdr_info.count; i++) {
		dlpi = __elf_phdr_info.dlpi + i;
		for (j = 0; j < dlpi->dlpi_phnum; j++) {
			phdr = dlpi->dlpi_phdr + j;
			if (phdr->p_type != PT_TLS)
				continue;
			/*
			 * realloc() may have moved the TLS buffer, so always
			 * refresh the DTV entry, even for modules whose data
			 * was copied by a previous call (their blocks were
			 * preserved by realloc() at the same offsets).
			 */
			_tcb->dtv[i + 1].tls = _tcb->tls + size;
			if (size + phdr->p_memsz > _tls_size) {
				/* New module: copy .tdata */
				memcpy(_tcb->tls + size,
				       (void *)(dlpi->dlpi_addr +
						phdr->p_vaddr),
				       phdr->p_filesz);
				/* Initialize .tbss */
				memset(_tcb->tls + size + phdr->p_filesz, 0,
				       phdr->p_memsz - phdr->p_filesz);
			}
			size += phdr->p_memsz;
			break;
		}
	}
	_tcb->dtv[0].size = i;
	_tls_size = total_size;

#ifdef ARM64
	/*
	 * Aarch64 ABI requirement: the thread pointer shall point to the
	 * thread's TCB. ARMv7 and Aarch32 access the TCB via
	 * __tls_get_addr().
	 */
	write_tpidr_el0((vaddr_t)_tcb);
#endif
}

struct tls_index {
	unsigned long module;
	unsigned long offset;
};

void *__tls_get_addr(struct tls_index *ti);

void *__tls_get_addr(struct tls_index *ti)
{
	return _tcb->dtv[ti->module].tls + ti->offset;
}

int dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *),
		    void *data)
{
	struct dl_phdr_info *dlpi = NULL;
	size_t id = 0;
	size_t i = 0;
	int st = 0;

	/*
	 * dlpi_tls_data is thread-specific so if we were to support
	 * multi-threading, we would need one copy of struct dl_phdr_info per
	 * thread. It could be a pre-allocated area, or it could be allocated
	 * on the heap. Doing the latter here so that it would at least work
	 * if/when we add thread support. Further optimization can always come
	 * later.
	 */
	dlpi = calloc(1, sizeof(*dlpi));
	if (!dlpi) {
		EMSG("dl_phdr_info allocation failed");
		abort();
	}

	for (i = 0; i < __elf_phdr_info.count; i++) {
		memcpy(dlpi, __elf_phdr_info.dlpi + i, sizeof(*dlpi));
		dlpi->dlpi_tls_data = NULL;
		id = dlpi->dlpi_tls_modid;
		if (id)
			dlpi->dlpi_tls_data = _tcb->dtv[id].tls;
		st = callback(dlpi, sizeof(*dlpi), data);
		/* A non-zero return value terminates the walk */
		if (st)
			break;
	}

	free(dlpi);

	return st;
}
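/*
 * Usage sketch (illustrative only, not part of the TA runtime): this is
 * roughly how a caller such as the g++/libgcc unwinder consumes
 * dl_iterate_phdr(). The callback name and the pc lookup are hypothetical.
 *
 *   static int find_module(struct dl_phdr_info *info, size_t sz, void *data)
 *   {
 *           uintptr_t pc = *(uintptr_t *)data;
 *           size_t n = 0;
 *
 *           for (n = 0; n < info->dlpi_phnum; n++) {
 *                   const Elf_Phdr *ph = info->dlpi_phdr + n;
 *                   uintptr_t start = info->dlpi_addr + ph->p_vaddr;
 *
 *                   if (ph->p_type == PT_LOAD && pc >= start &&
 *                       pc < start + ph->p_memsz)
 *                           return 1;   // non-zero stops the iteration
 *           }
 *           return 0;                   // keep walking the modules
 *   }
 *
 *   found = dl_iterate_phdr(find_module, &pc);
 */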