/Users/deen/code/yugabyte-db/src/postgres/src/backend/storage/ipc/shm_toc.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * shm_toc.c |
4 | | * shared memory segment table of contents |
5 | | * |
6 | | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * src/backend/storage/ipc/shm_toc.c |
10 | | * |
11 | | *------------------------------------------------------------------------- |
12 | | */ |
13 | | |
14 | | #include "postgres.h" |
15 | | |
16 | | #include "port/atomics.h" |
17 | | #include "storage/shm_toc.h" |
18 | | #include "storage/spin.h" |
19 | | |
/*
 * One entry in the table of contents.
 *
 * NOTE: this struct lives in shared memory that may be mapped at different
 * addresses in different backends, so it stores an offset, not a pointer.
 * Its layout must not change, or existing segments become unreadable.
 */
typedef struct shm_toc_entry
{
	uint64		key;			/* Arbitrary identifier */
	Size		offset;			/* Offset, in bytes, from TOC start */
} shm_toc_entry;
25 | | |
/*
 * Header of the table of contents, placed at the start of the managed
 * shared memory region.  The entry array grows forward from here, while
 * data chunks are allocated backward from the end of the region
 * (see shm_toc_allocate).  Layout must stay fixed: the struct is shared
 * between independently mapped backends.
 */
struct shm_toc
{
	uint64		toc_magic;		/* Magic number identifying this TOC */
	slock_t		toc_mutex;		/* Spinlock for mutual exclusion */
	Size		toc_total_bytes;	/* Bytes managed by this TOC */
	Size		toc_allocated_bytes;	/* Bytes allocated of those managed */
	uint32		toc_nentry;		/* Number of entries in TOC */
	shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
};
35 | | |
36 | | /* |
37 | | * Initialize a region of shared memory with a table of contents. |
38 | | */ |
39 | | shm_toc * |
40 | | shm_toc_create(uint64 magic, void *address, Size nbytes) |
41 | 0 | { |
42 | 0 | shm_toc *toc = (shm_toc *) address; |
43 | |
|
44 | 0 | Assert(nbytes > offsetof(shm_toc, toc_entry)); |
45 | 0 | toc->toc_magic = magic; |
46 | 0 | SpinLockInit(&toc->toc_mutex); |
47 | | |
48 | | /* |
49 | | * The alignment code in shm_toc_allocate() assumes that the starting |
50 | | * value is buffer-aligned. |
51 | | */ |
52 | 0 | toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes); |
53 | 0 | toc->toc_allocated_bytes = 0; |
54 | 0 | toc->toc_nentry = 0; |
55 | |
|
56 | 0 | return toc; |
57 | 0 | } |
58 | | |
59 | | /* |
60 | | * Attach to an existing table of contents. If the magic number found at |
61 | | * the target address doesn't match our expectations, return NULL. |
62 | | */ |
63 | | shm_toc * |
64 | | shm_toc_attach(uint64 magic, void *address) |
65 | 0 | { |
66 | 0 | shm_toc *toc = (shm_toc *) address; |
67 | |
|
68 | 0 | if (toc->toc_magic != magic) |
69 | 0 | return NULL; |
70 | | |
71 | 0 | Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes); |
72 | 0 | Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry)); |
73 | | |
74 | 0 | return toc; |
75 | 0 | } |
76 | | |
77 | | /* |
78 | | * Allocate shared memory from a segment managed by a table of contents. |
79 | | * |
80 | | * This is not a full-blown allocator; there's no way to free memory. It's |
81 | | * just a way of dividing a single physical shared memory segment into logical |
82 | | * chunks that may be used for different purposes. |
83 | | * |
84 | | * We allocate backwards from the end of the segment, so that the TOC entries |
85 | | * can grow forward from the start of the segment. |
86 | | */ |
87 | | void * |
88 | | shm_toc_allocate(shm_toc *toc, Size nbytes) |
89 | 0 | { |
90 | 0 | volatile shm_toc *vtoc = toc; |
91 | 0 | Size total_bytes; |
92 | 0 | Size allocated_bytes; |
93 | 0 | Size nentry; |
94 | 0 | Size toc_bytes; |
95 | | |
96 | | /* |
97 | | * Make sure request is well-aligned. XXX: MAXALIGN is not enough, |
98 | | * because atomic ops might need a wider alignment. We don't have a |
99 | | * proper definition for the minimum to make atomic ops safe, but |
100 | | * BUFFERALIGN ought to be enough. |
101 | | */ |
102 | 0 | nbytes = BUFFERALIGN(nbytes); |
103 | |
|
104 | 0 | SpinLockAcquire(&toc->toc_mutex); |
105 | |
|
106 | 0 | total_bytes = vtoc->toc_total_bytes; |
107 | 0 | allocated_bytes = vtoc->toc_allocated_bytes; |
108 | 0 | nentry = vtoc->toc_nentry; |
109 | 0 | toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry) |
110 | 0 | + allocated_bytes; |
111 | | |
112 | | /* Check for memory exhaustion and overflow. */ |
113 | 0 | if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes) |
114 | 0 | { |
115 | 0 | SpinLockRelease(&toc->toc_mutex); |
116 | 0 | ereport(ERROR, |
117 | 0 | (errcode(ERRCODE_OUT_OF_MEMORY), |
118 | 0 | errmsg("out of shared memory"))); |
119 | 0 | } |
120 | 0 | vtoc->toc_allocated_bytes += nbytes; |
121 | |
|
122 | 0 | SpinLockRelease(&toc->toc_mutex); |
123 | |
|
124 | 0 | return ((char *) toc) + (total_bytes - allocated_bytes - nbytes); |
125 | 0 | } |
126 | | |
127 | | /* |
128 | | * Return the number of bytes that can still be allocated. |
129 | | */ |
130 | | Size |
131 | | shm_toc_freespace(shm_toc *toc) |
132 | 0 | { |
133 | 0 | volatile shm_toc *vtoc = toc; |
134 | 0 | Size total_bytes; |
135 | 0 | Size allocated_bytes; |
136 | 0 | Size nentry; |
137 | 0 | Size toc_bytes; |
138 | |
|
139 | 0 | SpinLockAcquire(&toc->toc_mutex); |
140 | 0 | total_bytes = vtoc->toc_total_bytes; |
141 | 0 | allocated_bytes = vtoc->toc_allocated_bytes; |
142 | 0 | nentry = vtoc->toc_nentry; |
143 | 0 | SpinLockRelease(&toc->toc_mutex); |
144 | |
|
145 | 0 | toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry); |
146 | 0 | Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes); |
147 | 0 | return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes)); |
148 | 0 | } |
149 | | |
/*
 * Insert a TOC entry.
 *
 * The idea here is that the process setting up the shared memory segment will
 * register the addresses of data structures within the segment using this
 * function.  Each data structure will be identified using a 64-bit key, which
 * is assumed to be a well-known or discoverable integer.  Other processes
 * accessing the shared memory segment can pass the same key to
 * shm_toc_lookup() to discover the addresses of those data structures.
 *
 * Since the shared memory segment may be mapped at different addresses within
 * different backends, we store relative rather than absolute pointers.
 *
 * This won't scale well to a large number of keys.  Hopefully, that isn't
 * necessary; if it proves to be, we might need to provide a more sophisticated
 * data structure here.  But the real idea here is just to give someone mapping
 * a dynamic shared memory the ability to find the bare minimum number of
 * pointers that they need to bootstrap.  If you're storing a lot of stuff in
 * the TOC, you're doing it wrong.
 *
 * Throws ERROR (out of shared memory) if there is no room for another entry.
 */
void
shm_toc_insert(shm_toc *toc, uint64 key, void *address)
{
	volatile shm_toc *vtoc = toc;
	Size		total_bytes;
	Size		allocated_bytes;
	Size		nentry;
	Size		toc_bytes;
	Size		offset;

	/* Relativize pointer: address must lie within the managed segment. */
	Assert(address > (void *) toc);
	offset = ((char *) address) - (char *) toc;

	SpinLockAcquire(&toc->toc_mutex);

	total_bytes = vtoc->toc_total_bytes;
	allocated_bytes = vtoc->toc_allocated_bytes;
	nentry = vtoc->toc_nentry;
	toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
		+ allocated_bytes;

	/*
	 * Check for memory exhaustion and overflow.  The nentry limit guards
	 * against overflowing the uint32 toc_nentry counter itself.
	 */
	if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
		toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
		nentry >= PG_UINT32_MAX)
	{
		SpinLockRelease(&toc->toc_mutex);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of shared memory")));
	}

	/* Fill in the new entry before publishing it (see barrier below). */
	Assert(offset < total_bytes);
	vtoc->toc_entry[nentry].key = key;
	vtoc->toc_entry[nentry].offset = offset;

	/*
	 * By placing a write barrier after filling in the entry and before
	 * updating the number of entries, we make it safe to read the TOC
	 * unlocked: a reader that sees the incremented toc_nentry is guaranteed
	 * to also see the fully initialized entry (see shm_toc_lookup).
	 */
	pg_write_barrier();

	vtoc->toc_nentry++;

	SpinLockRelease(&toc->toc_mutex);
}
218 | | |
219 | | /* |
220 | | * Look up a TOC entry. |
221 | | * |
222 | | * If the key is not found, returns NULL if noError is true, otherwise |
223 | | * throws elog(ERROR). |
224 | | * |
225 | | * Unlike the other functions in this file, this operation acquires no lock; |
226 | | * it uses only barriers. It probably wouldn't hurt concurrency very much even |
227 | | * if it did get a lock, but since it's reasonably likely that a group of |
228 | | * worker processes could each read a series of entries from the same TOC |
229 | | * right around the same time, there seems to be some value in avoiding it. |
230 | | */ |
231 | | void * |
232 | | shm_toc_lookup(shm_toc *toc, uint64 key, bool noError) |
233 | 0 | { |
234 | 0 | uint32 nentry; |
235 | 0 | uint32 i; |
236 | | |
237 | | /* |
238 | | * Read the number of entries before we examine any entry. We assume that |
239 | | * reading a uint32 is atomic. |
240 | | */ |
241 | 0 | nentry = toc->toc_nentry; |
242 | 0 | pg_read_barrier(); |
243 | | |
244 | | /* Now search for a matching entry. */ |
245 | 0 | for (i = 0; i < nentry; ++i) |
246 | 0 | { |
247 | 0 | if (toc->toc_entry[i].key == key) |
248 | 0 | return ((char *) toc) + toc->toc_entry[i].offset; |
249 | 0 | } |
250 | | |
251 | | /* No matching entry was found. */ |
252 | 0 | if (!noError) |
253 | 0 | elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p", |
254 | 0 | key, toc); |
255 | 0 | return NULL; |
256 | 0 | } |
257 | | |
258 | | /* |
259 | | * Estimate how much shared memory will be required to store a TOC and its |
260 | | * dependent data structures. |
261 | | */ |
262 | | Size |
263 | | shm_toc_estimate(shm_toc_estimator *e) |
264 | 0 | { |
265 | 0 | Size sz; |
266 | |
|
267 | 0 | sz = offsetof(shm_toc, toc_entry); |
268 | 0 | sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry))); |
269 | 0 | sz = add_size(sz, e->space_for_chunks); |
270 | |
|
271 | 0 | return BUFFERALIGN(sz); |
272 | 0 | } |