1 // SPDX-License-Identifier: BSD-2-Clause
2 /*
3 * Copyright (c) 2020, Huawei Technologies Co., Ltd
4 */
5 /*
6 * Support for Thread-Local Storage (TLS) ABIs for ARMv7/Aarch32 and Aarch64.
7 *
8 * TAs are currently single-threaded, so the only benefit of implementing these
9 * ABIs is to support toolchains that need them even when the target program is
 * single-threaded. One example is the g++ compiler from the GCC toolchain
 * targeting a "Posix thread" Linux runtime, which OP-TEE has been using for
 * quite some time (arm-linux-gnueabihf-* and aarch64-linux-gnu-*). This allows
 * building C++ TAs without having to build a specific toolchain with
 * --disable-threads.
14 *
15 * This implementation is based on [1].
16 *
17 * - "TLS data structures variant 1" (section 3): the AArch64 compiler uses the
18 * TPIDR_EL0 to access TLS data directly. This assumes a specific layout for
19 * the TCB, and (for shared objects) the use of R_AARCH64_TLS_TPREL
20 * relocations.
21 * - The "General Dynamic access model" (section 4.1): the ARMv7/Aarch32
22 * compiler inserts calls to the __tls_get_addr() function which has to be
23 * implemented by the runtime library. The function takes a module ID and an
24 * offset parameter, which are provided thanks to R_ARM_TLS_DTPMOD32 and
25 * R_ARM_TLS_DTPOFF32 relocations.
26 *
27 * In addition, dl_iterate_phdr() is implemented here, because it is used by the
28 * g++ Aarch64 exception handling and it does use the TCB to provide TLS
29 * information to the caller.
30 *
31 * [1] "ELF Handling For Thread-Local Storage"
32 * https://www.akkadia.org/drepper/tls.pdf
33 */
34
35 #include <arm64_user_sysreg.h>
36 #include <assert.h>
37 #include <link.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <sys/queue.h>
41 #include "user_ta_header.h"
42
/*
 * DTV - Dynamic Thread Vector
 *
 * Array of pointers to the TLS data of each module, referenced from the TCB.
 * Each module that has a TLS segment has an entry (and consequently, some
 * space in the tcb_head::tls buffer). The array index is the "module ID".
 *   dtv[0].size              is the number of elements in the vector
 *   dtv[1].tls               points to TLS for the main executable (may be
 *                            NULL)
 *   dtv[2 .. (size-1)].tls   are for shared libraries
 */
union dtv {
	unsigned long size;	/* Interpretation used for dtv[0] */
	uint8_t *tls;		/* Interpretation used for dtv[1] and up */
};

/* Bytes needed for one leading DTV element followed by @size more bytes */
#define DTV_SIZE(size) ((size) + sizeof(union dtv))
58
/*
 * TCB - Thread Control Block
 *
 * Fixed two-word header followed by the TLS data itself.
 */
struct tcb_head {
	/* The "TLS variant 1" ABI reserves the first two words */
	union dtv *dtv;
	unsigned long reserved;
	/*
	 * TLS blocks, one for each ELF module that has a PT_TLS segment. A
	 * block consists of a copy of the module's .tdata section followed by
	 * zero-initialized space for its .tbss.
	 */
	uint8_t tls[];
};
71
/*
 * TAs are single threaded, hence a single TCB is enough. Introducing
 * multi-threading would require this to change.
 */
static struct tcb_head *_tcb;	/* The TCB; NULL as long as none is allocated */
static size_t _tls_size;	/* Size in bytes of the TLS data in _tcb->tls */

/* Bytes needed for a TCB that carries @tls_size bytes of TLS data */
#define TCB_SIZE(tls_size) ((tls_size) + sizeof(struct tcb_head))
80
81 /*
82 * Initialize or update the TCB.
83 * Called on application initialization and when additional shared objects are
84 * loaded via dlopen().
85 */
__utee_tcb_init(void)86 void __utee_tcb_init(void)
87 {
88 struct dl_phdr_info *dlpi = NULL;
89 const Elf_Phdr *phdr = NULL;
90 size_t total_size = 0;
91 size_t size = 0;
92 size_t i = 0;
93 size_t j = 0;
94
95 /* Compute the size needed for all the TLS blocks */
96 for (i = 0; i < __elf_phdr_info.count; i++) {
97 dlpi = __elf_phdr_info.dlpi + i;
98 for (j = 0; j < dlpi->dlpi_phnum; j++) {
99 phdr = dlpi->dlpi_phdr + j;
100 if (phdr->p_type == PT_TLS) {
101 total_size += phdr->p_memsz;
102 break;
103 }
104 }
105 }
106
107 /* ELF modules currently cannot be unmapped */
108 assert(total_size >= _tls_size);
109
110 if (total_size == _tls_size)
111 return;
112
113 /* (Re-)allocate the TCB */
114 _tcb = realloc(_tcb, TCB_SIZE(total_size));
115 if (!_tcb) {
116 EMSG("TCB allocation failed (%zu bytes)", TCB_SIZE(total_size));
117 abort();
118 }
119
120 /* (Re-)allocate the DTV. + 1 since dtv[0] holds the size */
121 size = DTV_SIZE((__elf_phdr_info.count + 1) * sizeof(union dtv));
122 _tcb->dtv = realloc(_tcb->dtv, size);
123 if (!_tcb->dtv) {
124 EMSG("DTV allocation failed (%zu bytes)", size);
125 abort();
126 }
127
128 /* Copy TLS data to the TCB */
129 size = 0;
130 for (i = 0; i < __elf_phdr_info.count; i++) {
131 dlpi = __elf_phdr_info.dlpi + i;
132 for (j = 0; j < dlpi->dlpi_phnum; j++) {
133 phdr = dlpi->dlpi_phdr + j;
134 if (phdr->p_type != PT_TLS)
135 continue;
136 if (size + phdr->p_memsz <= _tls_size) {
137 /* Already copied */
138 break;
139 }
140 _tcb->dtv[i + 1].tls = _tcb->tls + size;
141 /* Copy .tdata */
142 memcpy(_tcb->tls + size,
143 (void *)(dlpi->dlpi_addr + phdr->p_vaddr),
144 phdr->p_filesz);
145 /* Initialize .tbss */
146 memset(_tcb->tls + size + phdr->p_filesz, 0,
147 phdr->p_memsz - phdr->p_filesz);
148 size += phdr->p_memsz;
149 }
150 }
151 _tcb->dtv[0].size = i;
152
153 _tls_size = total_size;
154 #ifdef ARM64
155 /*
156 * Aarch64 ABI requirement: the thread pointer shall point to the
157 * thread's TCB. ARMv7 and Aarch32 access the TCB via _tls_get_addr().
158 */
159 write_tpidr_el0((vaddr_t)_tcb);
160 #endif
161 }
162
/* Argument of __tls_get_addr(): selects a location in one module's TLS block */
struct tls_index {
	unsigned long module;	/* Module ID, used as index into the DTV */
	unsigned long offset;	/* Byte offset from the start of the TLS block */
};
167
168 void *__tls_get_addr(struct tls_index *ti);
169
__tls_get_addr(struct tls_index * ti)170 void *__tls_get_addr(struct tls_index *ti)
171 {
172 return _tcb->dtv[ti->module].tls + ti->offset;
173 }
174
dl_iterate_phdr(int (* callback)(struct dl_phdr_info *,size_t,void *),void * data)175 int dl_iterate_phdr(int (*callback)(struct dl_phdr_info *, size_t, void *),
176 void *data)
177 {
178 struct dl_phdr_info *dlpi = NULL;
179 size_t id = 0;
180 size_t i = 0;
181 int st = 0;
182
183 /*
184 * dlpi_tls_data is thread-specific so if we were to support
185 * multi-threading, we would need one copy of struct dl_phdr_info per
186 * thread. Could be a pre-allocated area, or could be allocated on the
187 * heap. Doing the latter here so that it would at least work if/when we
188 * add thread support. Further optimization can always come later.
189 */
190 dlpi = calloc(1, sizeof(*dlpi));
191 if (!dlpi) {
192 EMSG("dl_phdr_info allocation failed");
193 abort();
194 }
195
196 for (i = 0; i < __elf_phdr_info.count; i++) {
197 memcpy(dlpi, __elf_phdr_info.dlpi + i, sizeof(*dlpi));
198 dlpi->dlpi_tls_data = NULL;
199 id = dlpi->dlpi_tls_modid;
200 if (id)
201 dlpi->dlpi_tls_data = _tcb->dtv[id].tls;
202 st = callback(dlpi, sizeof(*dlpi), data);
203 }
204
205 free(dlpi);
206 return st;
207 }
208