/Users/deen/code/yugabyte-db/build/debugcov-clang-dynamic-arm64-ninja/postgres_build/src/backend/port/pg_shmem.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * sysv_shmem.c |
4 | | * Implement shared memory using SysV facilities |
5 | | * |
6 | | * These routines used to be a fairly thin layer on top of SysV shared |
7 | | * memory functionality. With the addition of anonymous-shmem logic, |
8 | | * they're a bit fatter now. We still require a SysV shmem block to |
9 | | * exist, though, because mmap'd shmem provides no way to find out how |
10 | | * many processes are attached, which we need for interlocking purposes. |
11 | | * |
12 | | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group |
13 | | * Portions Copyright (c) 1994, Regents of the University of California |
14 | | * |
15 | | * IDENTIFICATION |
16 | | * src/backend/port/sysv_shmem.c |
17 | | * |
18 | | *------------------------------------------------------------------------- |
19 | | */ |
20 | | #include "postgres.h" |
21 | | |
22 | | #include <signal.h> |
23 | | #include <unistd.h> |
24 | | #include <sys/file.h> |
25 | | #include <sys/mman.h> |
26 | | #include <sys/stat.h> |
27 | | #ifdef HAVE_SYS_IPC_H |
28 | | #include <sys/ipc.h> |
29 | | #endif |
30 | | #ifdef HAVE_SYS_SHM_H |
31 | | #include <sys/shm.h> |
32 | | #endif |
33 | | |
34 | | #include "miscadmin.h" |
35 | | #include "portability/mem.h" |
36 | | #include "storage/dsm.h" |
37 | | #include "storage/fd.h" |
38 | | #include "storage/ipc.h" |
39 | | #include "storage/pg_shmem.h" |
40 | | #include "utils/guc.h" |
41 | | #include "utils/pidfile.h" |
42 | | |
43 | | |
44 | | /* |
45 | | * As of PostgreSQL 9.3, we normally allocate only a very small amount of |
46 | | * System V shared memory, and only for the purposes of providing an |
47 | | * interlock to protect the data directory. The real shared memory block |
48 | | * is allocated using mmap(). This works around the problem that many |
49 | | * systems have very low limits on the amount of System V shared memory |
50 | | * that can be allocated. Even a limit of a few megabytes will be enough |
51 | | * to run many copies of PostgreSQL without needing to adjust system settings. |
52 | | * |
53 | | * We assume that no one will attempt to run PostgreSQL 9.3 or later on |
54 | | * systems that are ancient enough that anonymous shared memory is not |
55 | | * supported, such as pre-2.4 versions of Linux. If that turns out to be |
56 | | * false, we might need to add compile and/or run-time tests here and do this |
57 | | * only if the running kernel supports it. |
58 | | * |
59 | | * However, we must always disable this logic in the EXEC_BACKEND case, and |
60 | | * fall back to the old method of allocating the entire segment using System V |
61 | | * shared memory, because there's no way to attach an anonymous mmap'd segment |
62 | | * to a process after exec(). Since EXEC_BACKEND is intended only for |
63 | | * developer use, this shouldn't be a big problem. Because of this, we do |
64 | | * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below. |
65 | | * |
66 | | * As of PostgreSQL 12, we regained the ability to use a large System V shared |
67 | | * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set |
68 | | * to sysv (though this is not the default). |
69 | | */ |
70 | | |
71 | | |
72 | | typedef key_t IpcMemoryKey; /* SysV IPC key: the first argument passed to shmget(2) */ |
73 | | typedef int IpcMemoryId; /* segment ID: the value shmget(2) returns, later given to shmat/shmctl */ |
74 | | |
75 | | /* |
76 | | * How does a given IpcMemoryId relate to this PostgreSQL process? |
77 | | * PGSharedMemoryAttach() returns one of these values for the ID it is given. |
78 | | * One could recycle unattached segments of different data directories if we |
79 | | * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would |
80 | | * cause us to visit less of the key space, making us less likely to detect a |
81 | | * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis, |
82 | | * in that postmasters of different data directories could simultaneously |
83 | | * attempt to recycle a given key. We'll waste keys longer in some cases, but |
84 | | * avoiding the problems of the alternative justifies that loss. |
85 | | */ |
86 | | typedef enum |
87 | | { |
88 | | SHMSTATE_ANALYSIS_FAILURE, /* unexpected failure to analyze the ID; callers treat it like "in use" */ |
89 | | SHMSTATE_ATTACHED, /* pertinent to DataDir, has attached PIDs (shm_nattch != 0) */ |
90 | | SHMSTATE_ENOENT, /* no segment of that ID (e.g. shmctl IPC_STAT reported EINVAL) */ |
91 | | SHMSTATE_FOREIGN, /* exists, but not pertinent to DataDir (no access, or header mismatch) */ |
92 | | SHMSTATE_UNATTACHED /* pertinent to DataDir, no attached PIDs (shm_nattch == 0) */ |
93 | | } IpcMemoryState; |
94 | | |
95 | | |
96 | | unsigned long UsedShmemSegID = 0; /* SysV key of the segment in use; set by PGSharedMemoryCreate() */ |
97 | | void *UsedShmemSegAddr = NULL; /* attach address of that segment; also the address PGSharedMemoryAttach() hands to shmat() */ |
98 | | |
99 | | static Size AnonymousShmemSize; /* byte length of the mapping below, as passed to munmap() */ |
100 | | static void *AnonymousShmem = NULL; /* anonymous mmap'd block, or NULL when none is mapped */ |
101 | | |
102 | | static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size); |
103 | | static void IpcMemoryDetach(int status, Datum shmaddr); |
104 | | static void IpcMemoryDelete(int status, Datum shmId); |
105 | | static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId, |
106 | | PGShmemHeader **addr); |
107 | | |
108 | | |
109 | | /* |
110 | | * InternalIpcMemoryCreate(memKey, size) |
111 | | * |
112 | | * Attempt to create a new shared memory segment of "size" bytes with key |
113 | | * "memKey". Returns NULL if such a segment already exists (shmget reports |
114 | | * EEXIST, EACCES, or EIDRM). If successful, attach the segment to the |
115 | | * current process and return its attached address; on_shmem_exit callbacks |
116 | | * are registered to detach and delete the segment, and the key and ID are |
117 | | * written to the data directory lockfile. |
118 | | * If we fail with a failure code other than collision-with-existing-segment, |
119 | | * print out an error and abort. Other types of errors are not recoverable. |
120 | | */ |
121 | | static void * |
122 | | InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size) |
123 | 8.96k | { |
124 | 8.96k | IpcMemoryId shmid; |
125 | 8.96k | void *requestedAddress = NULL; /* stays NULL unless EXEC_BACKEND and PG_SHMEM_ADDR is set */ |
126 | 8.96k | void *memAddress; |
127 | | |
128 | | /* |
129 | | * Normally we just pass requestedAddress = NULL to shmat(), allowing the |
130 | | * system to choose where the segment gets mapped. But in an EXEC_BACKEND |
131 | | * build, it's possible for whatever is chosen in the postmaster to not |
132 | | * work for backends, due to variations in address space layout. As a |
133 | | * rather klugy workaround, allow the user to specify the address to use |
134 | | * via setting the environment variable PG_SHMEM_ADDR. (If this were of |
135 | | * interest for anything except debugging, we'd probably create a cleaner |
136 | | * and better-documented way to set it, such as a GUC.) |
137 | | */ |
138 | | #ifdef EXEC_BACKEND |
139 | | { |
140 | | char *pg_shmem_addr = getenv("PG_SHMEM_ADDR"); |
141 | | |
142 | | if (pg_shmem_addr) |
143 | | requestedAddress = (void *) strtoul(pg_shmem_addr, NULL, 0); |
144 | | } |
145 | | #endif |
146 | | |
147 | 8.96k | shmid = shmget(memKey, size, IPC_CREAT | IPC_EXCL | IPCProtection); |
148 | | |
149 | 8.96k | if (shmid < 0) |
150 | 5.35k | { |
151 | 5.35k | int shmget_errno = errno; /* saved now; restored into errno before the FATAL report below */ |
152 | | |
153 | | /* |
154 | | * Fail quietly if error indicates a collision with existing segment. |
155 | | * One would expect EEXIST, given that we said IPC_EXCL, but perhaps |
156 | | * we could get a permission violation instead? Also, EIDRM might |
157 | | * occur if an old seg is slated for destruction but not gone yet. |
158 | | */ |
159 | 5.35k | if (shmget_errno == EEXIST || shmget_errno == EACCES |
160 | 5.35k | #ifdef EIDRM |
161 | 0 | || shmget_errno == EIDRM |
162 | 5.35k | #endif |
163 | 5.35k | ) |
164 | 5.35k | return NULL; |
165 | | |
166 | | /* |
167 | | * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if |
168 | | * there is an existing segment but it's smaller than "size" (this is |
169 | | * a result of poorly-thought-out ordering of error tests). To |
170 | | * distinguish between collision and invalid size in such cases, we |
171 | | * make a second try with size = 0. These kernels do not test size |
172 | | * against SHMMIN in the preexisting-segment case, so we will not get |
173 | | * EINVAL a second time if there is such a segment. |
174 | | */ |
175 | 0 | if (shmget_errno == EINVAL) |
176 | 0 | { |
177 | 0 | shmid = shmget(memKey, 0, IPC_CREAT | IPC_EXCL | IPCProtection); |
178 | 

179 | 0 | if (shmid < 0) |
180 | 0 | { |
181 | | /* As above, fail quietly if we verify a collision */ |
182 | 0 | if (errno == EEXIST || errno == EACCES |
183 | 0 | #ifdef EIDRM |
184 | 0 | || errno == EIDRM |
185 | 0 | #endif |
186 | 0 | ) |
187 | 0 | return NULL; |
188 | | /* Otherwise, fall through to report the original error */ |
189 | 0 | } |
190 | 0 | else |
191 | 0 | { |
192 | | /* |
193 | | * On most platforms we cannot get here because SHMMIN is |
194 | | * greater than zero. However, if we do succeed in creating a |
195 | | * zero-size segment, free it and then fall through to report |
196 | | * the original error. |
197 | | */ |
198 | 0 | if (shmctl(shmid, IPC_RMID, NULL) < 0) |
199 | 0 | elog(LOG, "shmctl(%d, %d, 0) failed: %m", |
200 | 0 | (int) shmid, IPC_RMID); |
201 | 0 | } |
202 | 0 | } |
203 | | |
204 | | /* |
205 | | * Else complain and abort. |
206 | | * |
207 | | * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX |
208 | | * is violated. SHMALL violation might be reported as either ENOMEM |
209 | | * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which |
210 | | * it should be. SHMMNI violation is ENOSPC, per spec. Just plain |
211 | | * not-enough-RAM is ENOMEM. |
212 | | */ |
213 | 0 | errno = shmget_errno; /* make %m describe the first shmget() failure */ |
214 | 0 | ereport(FATAL, |
215 | 0 | (errmsg("could not create shared memory segment: %m"), |
216 | 0 | errdetail("Failed system call was shmget(key=%lu, size=%zu, 0%o).", |
217 | 0 | (unsigned long) memKey, size, |
218 | 0 | IPC_CREAT | IPC_EXCL | IPCProtection), |
219 | 0 | (shmget_errno == EINVAL) ? |
220 | 0 | errhint("This error usually means that PostgreSQL's request for a shared memory " |
221 | 0 | "segment exceeded your kernel's SHMMAX parameter, or possibly that " |
222 | 0 | "it is less than " |
223 | 0 | "your kernel's SHMMIN parameter.\n" |
224 | 0 | "The PostgreSQL documentation contains more information about shared " |
225 | 0 | "memory configuration.") : 0, |
226 | 0 | (shmget_errno == ENOMEM) ? |
227 | 0 | errhint("This error usually means that PostgreSQL's request for a shared " |
228 | 0 | "memory segment exceeded your kernel's SHMALL parameter. You might need " |
229 | 0 | "to reconfigure the kernel with larger SHMALL.\n" |
230 | 0 | "The PostgreSQL documentation contains more information about shared " |
231 | 0 | "memory configuration.") : 0, |
232 | 0 | (shmget_errno == ENOSPC) ? |
233 | 0 | errhint("This error does *not* mean that you have run out of disk space. " |
234 | 0 | "It occurs either if all available shared memory IDs have been taken, " |
235 | 0 | "in which case you need to raise the SHMMNI parameter in your kernel, " |
236 | 0 | "or because the system's overall limit for shared memory has been " |
237 | 0 | "reached.\n" |
238 | 0 | "The PostgreSQL documentation contains more information about shared " |
239 | 0 | "memory configuration.") : 0)); |
240 | 0 | } |
241 | | |
242 | | /* Register on-exit routine to delete the new segment */ |
243 | 3.61k | on_shmem_exit(IpcMemoryDelete, Int32GetDatum(shmid)); |
244 | | |
245 | | /* OK, should be able to attach to the segment */ |
246 | 3.61k | memAddress = shmat(shmid, requestedAddress, PG_SHMAT_FLAGS); |
247 | | |
248 | 3.61k | if (memAddress == (void *) -1) |
249 | 0 | elog(FATAL, "shmat(id=%d, addr=%p, flags=0x%x) failed: %m", |
250 | 3.61k | shmid, requestedAddress, PG_SHMAT_FLAGS); |
251 | | |
252 | | /* Register on-exit routine to detach new segment before deleting */ |
253 | 3.61k | on_shmem_exit(IpcMemoryDetach, PointerGetDatum(memAddress)); |
254 | | |
255 | | /* |
256 | | * Store shmem key and ID in data directory lockfile. Format to try to |
257 | | * keep it the same length always (trailing junk in the lockfile won't |
258 | | * hurt, but might confuse humans). |
259 | | */ |
260 | 3.61k | { |
261 | 3.61k | char line[64]; /* holds two space-separated unsigned long fields */ |
262 | | |
263 | 3.61k | sprintf(line, "%9lu %9lu", |
264 | 3.61k | (unsigned long) memKey, (unsigned long) shmid); |
265 | 3.61k | AddToDataDirLockFile(LOCK_FILE_LINE_SHMEM_KEY, line); |
266 | 3.61k | } |
267 | | |
268 | 3.61k | return memAddress; |
269 | 3.61k | } |
270 | | |
271 | | /****************************************************************************/ |
272 | | /* IpcMemoryDetach(status, shmaddr) detaches the shared memory segment */ |
273 | | /* attached at shmaddr (a pointer Datum); a shmdt failure is only logged */ |
274 | | /* (status is unused; on_shmem_exit callback, hence funny argument list) */ |
275 | | /****************************************************************************/ |
276 | | static void |
277 | | IpcMemoryDetach(int status, Datum shmaddr) |
278 | 3.61k | { |
279 | | /* Detach System V shared memory block. */ |
280 | 3.61k | if (shmdt(DatumGetPointer(shmaddr)) < 0) |
281 | 0 | elog(LOG, "shmdt(%p) failed: %m", DatumGetPointer(shmaddr)); |
282 | 3.61k | } |
283 | | |
284 | | /****************************************************************************/ |
285 | | /* IpcMemoryDelete(status, shmId) deletes the shared memory segment whose ID is shmId (an int32 Datum) via shmctl(IPC_RMID); failure is only logged */ |
286 | | /* (status is unused; called as an on_shmem_exit callback, hence funny argument list) */ |
287 | | /****************************************************************************/ |
288 | | static void |
289 | | IpcMemoryDelete(int status, Datum shmId) |
290 | 3.61k | { |
291 | 3.61k | if (shmctl(DatumGetInt32(shmId), IPC_RMID, NULL) < 0) |
292 | 0 | elog(LOG, "shmctl(%d, %d, 0) failed: %m", |
293 | 3.61k | DatumGetInt32(shmId), IPC_RMID); |
294 | 3.61k | } |
295 | | |
296 | | /* |
297 | | * PGSharedMemoryIsInUse |
298 | | * |
299 | | * Is a previously-existing shmem segment still existing and in use? |
300 | | * id2 is the segment's shmem ID; id1 is not examined here. Returns true if |
301 | | * the segment is attached or could not be analyzed, and false otherwise. |
302 | | * The point of this exercise is to detect the case where a prior postmaster |
303 | | * crashed, but it left child backends that are still running. Therefore |
304 | | * we only care about shmem segments that are associated with the intended |
305 | | * DataDir, since accidental matches of shmem segment IDs are reasonably common. |
306 | | */ |
307 | | bool |
308 | | PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2) |
309 | 0 | { |
310 | 0 | PGShmemHeader *memAddress; /* set by PGSharedMemoryAttach(); NULL if nothing was attached */ |
311 | 0 | IpcMemoryState state; |
312 | 

313 | 0 | state = PGSharedMemoryAttach((IpcMemoryId) id2, &memAddress); |
314 | 0 | if (memAddress && shmdt(memAddress) < 0) /* only the classification is needed, so detach right away */ |
315 | 0 | elog(LOG, "shmdt(%p) failed: %m", memAddress); |
316 | 0 | switch (state) |
317 | 0 | { |
318 | 0 | case SHMSTATE_ENOENT: |
319 | 0 | case SHMSTATE_FOREIGN: |
320 | 0 | case SHMSTATE_UNATTACHED: |
321 | 0 | return false; |
322 | 0 | case SHMSTATE_ANALYSIS_FAILURE: |
323 | 0 | case SHMSTATE_ATTACHED: |
324 | 0 | return true; |
325 | 0 | } |
326 | 0 | return true; /* every enum value returns above; answer "in use" if we somehow get here */ |
327 | 0 | } |
328 | | |
329 | | /* Classify segment shmId relative to DataDir; see comment at IpcMemoryState. On return *addr is the attached header, or NULL if shmat() was not reached or failed; callers shmdt() a non-NULL *addr. */ |
330 | | static IpcMemoryState |
331 | | PGSharedMemoryAttach(IpcMemoryId shmId, |
332 | | PGShmemHeader **addr) |
333 | 5.35k | { |
334 | 5.35k | struct shmid_ds shmStat; /* filled by shmctl(IPC_STAT); shm_nattch is read at the end */ |
335 | 5.35k | struct stat statbuf; /* stat() of DataDir, compared with the header's device/inode */ |
336 | 5.35k | PGShmemHeader *hdr; |
337 | | |
338 | 5.35k | *addr = NULL; |
339 | | |
340 | | /* |
341 | | * We detect whether a shared memory segment is in use by seeing whether |
342 | | * it (a) exists and (b) has any processes attached to it. |
343 | | */ |
344 | 5.35k | if (shmctl(shmId, IPC_STAT, &shmStat) < 0) |
345 | 0 | { |
346 | | /* |
347 | | * EINVAL actually has multiple possible causes documented in the |
348 | | * shmctl man page, but we assume it must mean the segment no longer |
349 | | * exists. |
350 | | */ |
351 | 0 | if (errno == EINVAL) |
352 | 0 | return SHMSTATE_ENOENT; |
353 | | |
354 | | /* |
355 | | * EACCES implies we have no read permission, which means it is not a |
356 | | * Postgres shmem segment (or at least, not one that is relevant to |
357 | | * our data directory). |
358 | | */ |
359 | 0 | if (errno == EACCES) |
360 | 0 | return SHMSTATE_FOREIGN; |
361 | | |
362 | | /* |
363 | | * Some Linux kernel versions (in fact, all of them as of July 2007) |
364 | | * sometimes return EIDRM when EINVAL is correct. The Linux kernel |
365 | | * actually does not have any internal state that would justify |
366 | | * returning EIDRM, so we can get away with assuming that EIDRM is |
367 | | * equivalent to EINVAL on that platform. |
368 | | */ |
369 | | #ifdef HAVE_LINUX_EIDRM_BUG |
370 | | if (errno == EIDRM) |
371 | | return SHMSTATE_ENOENT; |
372 | | #endif |
373 | | |
374 | | /* |
375 | | * Otherwise, we had better assume that the segment is in use. The |
376 | | * only likely case is EIDRM, which implies that the segment has been |
377 | | * IPC_RMID'd but there are still processes attached to it. |
378 | | */ |
379 | 0 | return SHMSTATE_ANALYSIS_FAILURE; |
380 | 0 | } |
381 | | |
382 | | /* |
383 | | * Try to attach to the segment and see if it matches our data directory. |
384 | | * This avoids shmid-conflict problems on machines that are running |
385 | | * several postmasters under the same userid. |
386 | | */ |
387 | 5.35k | if (stat(DataDir, &statbuf) < 0) |
388 | 0 | return SHMSTATE_ANALYSIS_FAILURE; /* can't stat; be conservative */ |
389 | | |
390 | | /* |
391 | | * Attachment fails if we have no write permission. Since that will never |
392 | | * happen with Postgres IPCProtection, such a failure shows the segment is |
393 | | * not a Postgres segment. If attachment fails for some other reason, be |
394 | | * conservative. |
395 | | */ |
396 | 5.35k | hdr = (PGShmemHeader *) shmat(shmId, UsedShmemSegAddr, PG_SHMAT_FLAGS); |
397 | 5.35k | if (hdr == (PGShmemHeader *) -1) |
398 | 0 | { |
399 | 0 | if (errno == EACCES) |
400 | 0 | return SHMSTATE_FOREIGN; |
401 | 0 | else |
402 | 0 | return SHMSTATE_ANALYSIS_FAILURE; |
403 | 5.35k | } |
404 | 5.35k | *addr = hdr; /* from here on the segment is attached, whatever state is returned */ |
405 | | |
406 | 5.35k | if (hdr->magic != PGShmemMagic || |
407 | 5.33k | hdr->device != statbuf.st_dev || |
408 | 5.33k | hdr->inode != statbuf.st_ino) |
409 | 5.35k | { |
410 | | /* |
411 | | * It's either not a Postgres segment, or not one for my data |
412 | | * directory. |
413 | | */ |
414 | 5.35k | return SHMSTATE_FOREIGN; |
415 | 5.35k | } |
416 | | |
417 | 0 | return shmStat.shm_nattch == 0 ? SHMSTATE_UNATTACHED : SHMSTATE_ATTACHED; /* count comes from the IPC_STAT taken before our own shmat() */ |
418 | 0 | } |
419 | | |
420 | | #ifdef MAP_HUGETLB |
421 | | |
422 | | /* |
423 | | * Identify the huge page size to use. |
424 | | * |
425 | | * Some Linux kernel versions have a bug causing mmap() to fail on requests |
426 | | * that are not a multiple of the hugepage size. Versions without that bug |
427 | | * instead silently round the request up to the next hugepage multiple --- |
428 | | * and then munmap() fails when we give it a size different from that. |
429 | | * So we have to round our request up to a multiple of the actual hugepage |
430 | | * size to avoid trouble. |
431 | | * |
432 | | * Doing the round-up ourselves also lets us make use of the extra memory, |
433 | | * rather than just wasting it. Currently, we just increase the available |
434 | | * space recorded in the shmem header, which will make the extra usable for |
435 | | * purposes such as additional locktable entries. Someday, for very large |
436 | | * hugepage sizes, we might want to think about more invasive strategies, |
437 | | * such as increasing shared_buffers to absorb the extra space. |
438 | | * |
439 | | * Returns the (real or assumed) page size, in bytes, into *hugepagesize, |
440 | | * and the hugepage-related mmap flags to use into *mmap_flags. |
441 | | * |
442 | | * Currently *mmap_flags is always just MAP_HUGETLB. Someday, on systems |
443 | | * that support it, we might OR in additional bits to specify a particular |
444 | | * non-default huge page size. |
445 | | */ |
446 | | static void |
447 | | GetHugePageSize(Size *hugepagesize, int *mmap_flags) |
448 | | { |
449 | | /* |
450 | | * If we fail to find out the system's default huge page size, assume it |
451 | | * is 2MB. This will work fine when the actual size is less. If it's |
452 | | * more, we might get mmap() or munmap() failures due to unaligned |
453 | | * requests; but at this writing, there are no reports of any non-Linux |
454 | | * systems being picky about that. |
455 | | */ |
456 | | *hugepagesize = 2 * 1024 * 1024; |
457 | | *mmap_flags = MAP_HUGETLB; |
458 | | |
459 | | /* |
460 | | * System-dependent code to find out the default huge page size. |
461 | | * |
462 | | * On Linux, read /proc/meminfo looking for a line like "Hugepagesize: |
463 | | * nnnn kB". Ignore any failures, falling back to the preset default. |
464 | | */ |
465 | | #ifdef __linux__ |
466 | | { |
467 | | FILE *fp = AllocateFile("/proc/meminfo", "r"); |
468 | | char buf[128]; /* one line of /proc/meminfo at a time */ |
469 | | unsigned int sz; /* number parsed from the Hugepagesize line */ |
470 | | char ch; /* first character of the unit; only 'k' is accepted */ |
471 | | |
472 | | if (fp) |
473 | | { |
474 | | while (fgets(buf, sizeof(buf), fp)) |
475 | | { |
476 | | if (sscanf(buf, "Hugepagesize: %u %c", &sz, &ch) == 2) |
477 | | { |
478 | | if (ch == 'k') |
479 | | { |
480 | | *hugepagesize = sz * (Size) 1024; /* kB to bytes, computed in Size width */ |
481 | | break; |
482 | | } |
483 | | /* We could accept other units besides kB, if needed */ |
484 | | } |
485 | | } |
486 | | FreeFile(fp); |
487 | | } |
488 | | } |
489 | | #endif /* __linux__ */ |
490 | | } |
491 | | |
492 | | #endif /* MAP_HUGETLB */ |
493 | | |
494 | | /* |
495 | | * Creates an anonymous mmap()ed shared memory segment. |
496 | | * |
497 | | * Pass the requested size in *size. This function will modify *size to the |
498 | | * actual size of the allocation, if it ends up allocating a segment that is |
499 | | * larger than requested. Returns the mapped address; mmap failure is FATAL. |
500 | | */ |
501 | | static void * |
502 | | CreateAnonymousSegment(Size *size) |
503 | 3.61k | { |
504 | 3.61k | Size allocsize = *size; /* size actually handed to mmap(); may be rounded up for huge pages */ |
505 | 3.61k | void *ptr = MAP_FAILED; /* MAP_FAILED means "no mapping obtained yet" */ |
506 | 3.61k | int mmap_errno = 0; /* errno captured right after each mmap() call */ |
507 | | |
508 | 3.61k | #ifndef MAP_HUGETLB |
509 | | /* PGSharedMemoryCreate should have dealt with this case */ |
510 | 3.61k | Assert(huge_pages != HUGE_PAGES_ON); |
511 | | #else |
512 | | if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) |
513 | | { |
514 | | /* |
515 | | * Round up the request size to a suitable large value. |
516 | | */ |
517 | | Size hugepagesize; |
518 | | int mmap_flags; |
519 | | |
520 | | GetHugePageSize(&hugepagesize, &mmap_flags); |
521 | | |
522 | | if (allocsize % hugepagesize != 0) |
523 | | allocsize += hugepagesize - (allocsize % hugepagesize); |
524 | | |
525 | | ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, |
526 | | PG_MMAP_FLAGS | mmap_flags, -1, 0); |
527 | | mmap_errno = errno; |
528 | | if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) |
529 | | elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", |
530 | | allocsize); |
531 | | } |
532 | | #endif |
533 | | |
534 | 3.61k | if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON) |
535 | 3.61k | { |
536 | | /* |
537 | | * Use the original size, not the rounded-up value, when falling back |
538 | | * to non-huge pages. |
539 | | */ |
540 | 3.61k | allocsize = *size; |
541 | 3.61k | ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, |
542 | 3.61k | PG_MMAP_FLAGS, -1, 0); |
543 | 3.61k | mmap_errno = errno; |
544 | 3.61k | } |
545 | | |
546 | 3.61k | if (ptr == MAP_FAILED) |
547 | 0 | { |
548 | 0 | errno = mmap_errno; /* make %m describe the mmap() failure */ |
549 | 0 | ereport(FATAL, |
550 | 0 | (errmsg("could not map anonymous shared memory: %m"), |
551 | 0 | (mmap_errno == ENOMEM) ? |
552 | 0 | errhint("This error usually means that PostgreSQL's request " |
553 | 0 | "for a shared memory segment exceeded available memory, " |
554 | 0 | "swap space, or huge pages. To reduce the request size " |
555 | 0 | "(currently %zu bytes), reduce PostgreSQL's shared " |
556 | 0 | "memory usage, perhaps by reducing shared_buffers or " |
557 | 0 | "max_connections.", |
558 | 0 | *size) : 0)); |
559 | 0 | } |
560 | | |
561 | 3.61k | *size = allocsize; /* report the size that was actually mapped */ |
562 | 3.61k | return ptr; |
563 | 3.61k | } |
564 | | |
565 | | /* |
566 | | * AnonymousShmemDetach --- unmap the anonymous mmap'd block, if one is mapped; neither argument is used |
567 | | * (called as an on_shmem_exit callback, hence funny argument list) |
568 | | */ |
569 | | static void |
570 | | AnonymousShmemDetach(int status, Datum arg) |
571 | 3.61k | { |
572 | | /* Release anonymous shared memory block, if any. */ |
573 | 3.61k | if (AnonymousShmem != NULL) |
574 | 3.61k | { |
575 | 3.61k | if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) |
576 | 0 | elog(LOG, "munmap(%p, %zu) failed: %m", |
577 | 3.61k | AnonymousShmem, AnonymousShmemSize); |
578 | 3.61k | AnonymousShmem = NULL; /* cleared even if munmap() failed, so a repeat call does nothing */ |
579 | 3.61k | } |
580 | 3.61k | } |
581 | | |
582 | | /* |
583 | | * PGSharedMemoryCreate |
584 | | * |
585 | | * Create a shared memory segment of the given size and initialize its |
586 | | * standard header. Also, register an on_shmem_exit callback to release |
587 | | * the storage. *shim receives the SysV segment's header; the return value |
588 | | * is the anonymous mmap'd block when one is in use, else that same header. |
589 | | * Dead Postgres segments pertinent to this DataDir are recycled if found, but |
590 | | * we do not fail upon collision with foreign shmem segments. The idea here |
591 | | * is to detect and re-use keys that may have been assigned by a crashed |
592 | | * postmaster or backend. |
593 | | * |
594 | | * The port number is passed for possible use as a key (for SysV, we use |
595 | | * it to generate the starting shmem key, 1 + port * 1000). |
596 | | */ |
597 | | PGShmemHeader * |
598 | | PGSharedMemoryCreate(Size size, int port, |
599 | | PGShmemHeader **shim) |
600 | 3.61k | { |
601 | 3.61k | IpcMemoryKey NextShmemSegID; /* candidate key; advanced until a segment can be created */ |
602 | 3.61k | void *memAddress; |
603 | 3.61k | PGShmemHeader *hdr; |
604 | 3.61k | struct stat statbuf; |
605 | 3.61k | Size sysvsize; /* size requested for the SysV segment itself */ |
606 | | |
607 | | /* Complain if hugepages demanded but we can't possibly support them */ |
608 | 3.61k | #if !defined(MAP_HUGETLB) |
609 | 3.61k | if (huge_pages == HUGE_PAGES_ON) |
610 | 3.61k | ereport(ERROR, |
611 | 3.61k | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
612 | 3.61k | errmsg("huge pages not supported on this platform"))); |
613 | 3.61k | #endif |
614 | | |
615 | | /* Room for a header? */ |
616 | 3.61k | Assert(size > MAXALIGN(sizeof(PGShmemHeader))); |
617 | | |
618 | 3.61k | if (shared_memory_type == SHMEM_TYPE_MMAP) |
619 | 3.61k | { |
620 | 3.61k | AnonymousShmem = CreateAnonymousSegment(&size); |
621 | 3.61k | AnonymousShmemSize = size; /* size may have been enlarged by CreateAnonymousSegment() */ |
622 | | |
623 | | /* Register on-exit routine to unmap the anonymous segment */ |
624 | 3.61k | on_shmem_exit(AnonymousShmemDetach, (Datum) 0); |
625 | | |
626 | | /* Now we need only allocate a minimal-sized SysV shmem block. */ |
627 | 3.61k | sysvsize = sizeof(PGShmemHeader); |
628 | 3.61k | } |
629 | 0 | else |
630 | 0 | sysvsize = size; |
631 | | |
632 | | /* Make sure PGSharedMemoryAttach doesn't fail without need */ |
633 | 3.61k | UsedShmemSegAddr = NULL; |
634 | | |
635 | | /* |
636 | | * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to |
637 | | * ensure no more than one postmaster per data directory can enter this |
638 | | * loop simultaneously. (CreateDataDirLockFile() does not ensure that, |
639 | | * but prefer fixing it over coping here.) |
640 | | */ |
641 | 3.61k | NextShmemSegID = 1 + port * 1000; |
642 | | |
643 | 3.61k | for (;;) |
644 | 8.96k | { |
645 | 8.96k | IpcMemoryId shmid; |
646 | 8.96k | PGShmemHeader *oldhdr; /* header of the colliding segment, if we managed to attach it */ |
647 | 8.96k | IpcMemoryState state; |
648 | | |
649 | | /* Try to create new segment */ |
650 | 8.96k | memAddress = InternalIpcMemoryCreate(NextShmemSegID, sysvsize); |
651 | 8.96k | if (memAddress) |
652 | 3.61k | break; /* successful create and attach */ |
653 | | |
654 | | /* Check shared memory and possibly remove and recreate */ |
655 | | |
656 | | /* |
657 | | * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN. |
658 | | * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can |
659 | | * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN. |
660 | | */ |
661 | 5.35k | shmid = shmget(NextShmemSegID, sizeof(PGShmemHeader), 0); |
662 | 5.35k | if (shmid < 0) |
663 | 0 | { |
664 | 0 | oldhdr = NULL; |
665 | 0 | state = SHMSTATE_FOREIGN; |
666 | 0 | } |
667 | 5.35k | else |
668 | 5.35k | state = PGSharedMemoryAttach(shmid, &oldhdr); |
669 | | |
670 | 5.35k | switch (state) |
671 | 5.35k | { |
672 | 0 | case SHMSTATE_ANALYSIS_FAILURE: |
673 | 0 | case SHMSTATE_ATTACHED: |
674 | 0 | ereport(FATAL, |
675 | 0 | (errcode(ERRCODE_LOCK_FILE_EXISTS), |
676 | 0 | errmsg("pre-existing shared memory block (key %lu, ID %lu) is still in use", |
677 | 0 | (unsigned long) NextShmemSegID, |
678 | 0 | (unsigned long) shmid), |
679 | 0 | errhint("Terminate any old server processes associated with data directory \"%s\".", |
680 | 0 | DataDir))); |
681 | 0 | break; |
682 | 0 | case SHMSTATE_ENOENT: |
683 | | |
684 | | /* |
685 | | * To our surprise, some other process deleted since our last |
686 | | * InternalIpcMemoryCreate(). Moments earlier, we would have |
687 | | * seen SHMSTATE_FOREIGN. Try that same ID again. |
688 | | */ |
689 | 0 | elog(LOG, |
690 | 0 | "shared memory block (key %lu, ID %lu) deleted during startup", |
691 | 0 | (unsigned long) NextShmemSegID, |
692 | 0 | (unsigned long) shmid); |
693 | 0 | break; |
694 | 5.35k | case SHMSTATE_FOREIGN: |
695 | 5.35k | NextShmemSegID++; /* key belongs to something else: move on to the next key */ |
696 | 5.35k | break; |
697 | 0 | case SHMSTATE_UNATTACHED: |
698 | | |
699 | | /* |
700 | | * The segment pertains to DataDir, and every process that had |
701 | | * used it has died or detached. Zap it, if possible, and any |
702 | | * associated dynamic shared memory segments, as well. This |
703 | | * shouldn't fail, but if it does, assume the segment belongs |
704 | | * to someone else after all, and try the next candidate. |
705 | | * Otherwise, try again to create the segment. That may fail |
706 | | * if some other process creates the same shmem key before we |
707 | | * do, in which case we'll try the next key. |
708 | | */ |
709 | 0 | if (oldhdr->dsm_control != 0) |
710 | 0 | dsm_cleanup_using_control_segment(oldhdr->dsm_control); |
711 | 0 | if (shmctl(shmid, IPC_RMID, NULL) < 0) |
712 | 0 | NextShmemSegID++; |
713 | 0 | break; |
714 | 5.35k | } |
715 | | |
716 | 5.35k | if (oldhdr && shmdt(oldhdr) < 0) /* drop the attachment PGSharedMemoryAttach() made for inspection */ |
717 | 0 | elog(LOG, "shmdt(%p) failed: %m", oldhdr); |
718 | 5.35k | } |
719 | | |
720 | | /* Initialize new segment. */ |
721 | 3.61k | hdr = (PGShmemHeader *) memAddress; |
722 | 3.61k | hdr->creatorPID = getpid(); |
723 | 3.61k | hdr->magic = PGShmemMagic; |
724 | 3.61k | hdr->dsm_control = 0; |
725 | | |
726 | | /* Fill in the data directory ID info, too */ |
727 | 3.61k | if (stat(DataDir, &statbuf) < 0) |
728 | 3.61k | ereport(FATAL, |
729 | 3.61k | (errcode_for_file_access(), |
730 | 3.61k | errmsg("could not stat data directory \"%s\": %m", |
731 | 3.61k | DataDir))); |
732 | 3.61k | hdr->device = statbuf.st_dev; |
733 | 3.61k | hdr->inode = statbuf.st_ino; |
734 | | |
735 | | /* |
736 | | * Initialize space allocation status for segment. |
737 | | */ |
738 | 3.61k | hdr->totalsize = size; |
739 | 3.61k | hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); |
740 | 3.61k | *shim = hdr; |
741 | | |
742 | | /* Save info for possible future use */ |
743 | 3.61k | UsedShmemSegAddr = memAddress; |
744 | 3.61k | UsedShmemSegID = (unsigned long) NextShmemSegID; |
745 | | |
746 | | /* |
747 | | * If AnonymousShmem is NULL here, then we're not using anonymous shared |
748 | | * memory, and should return a pointer to the System V shared memory |
749 | | * block. Otherwise, the System V shared memory block is only a shim, and |
750 | | * we must return a pointer to the real block. |
751 | | */ |
752 | 3.61k | if (AnonymousShmem == NULL) |
753 | 0 | return hdr; |
754 | 3.61k | memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader)); /* the real block starts with a copy of the shim's header */ |
755 | 3.61k | return (PGShmemHeader *) AnonymousShmem; |
756 | 3.61k | } |
757 | | |
758 | | #ifdef EXEC_BACKEND |
759 | | |
760 | | /* |
761 | | * PGSharedMemoryReAttach |
762 | | * |
763 | | * This is called during startup of a postmaster child process to re-attach to |
764 | | * an already existing shared memory segment. This is needed only in the |
765 | | * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory |
766 | | * segment attachment via fork(). |
767 | | * |
768 | | * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
769 | | * routine. The caller must have already restored them to the postmaster's |
770 | | * values. |
771 | | */ |
772 | | void |
773 | | PGSharedMemoryReAttach(void) |
774 | | { /* Re-attach to the segment identified by UsedShmemSegID and verify it lands at the expected address. */ |
775 | | IpcMemoryId shmid; /* id returned by shmget() for key UsedShmemSegID */ |
776 | | PGShmemHeader *hdr; /* set only by PGSharedMemoryAttach(); not assigned when shmget() fails */ |
777 | | IpcMemoryState state; /* outcome of the lookup/attach attempt */ |
778 | | void *origUsedShmemSegAddr = UsedShmemSegAddr; /* remember the expected address for the checks below */ |
779 | | |
780 | | Assert(UsedShmemSegAddr != NULL); |
781 | | Assert(IsUnderPostmaster); |
782 | | |
783 | | #ifdef __CYGWIN__ |
784 | | /* cygipc (currently) appears to not detach on exec. */ |
785 | | PGSharedMemoryDetach(); |
786 | | UsedShmemSegAddr = origUsedShmemSegAddr; /* PGSharedMemoryDetach() reset it to NULL; restore it */ |
787 | | #endif |
788 | | |
789 | | elog(DEBUG3, "attaching to %p", UsedShmemSegAddr); |
790 | | shmid = shmget(UsedShmemSegID, sizeof(PGShmemHeader), 0); /* flags are 0: look up an existing segment, never create one */ |
791 | | if (shmid < 0) |
792 | | state = SHMSTATE_FOREIGN; /* lookup failure is folded into the generic "not attached" error path below */ |
793 | | else |
794 | | state = PGSharedMemoryAttach(shmid, &hdr); /* on success hdr receives the attached header pointer */ |
795 | | if (state != SHMSTATE_ATTACHED) |
796 | | elog(FATAL, "could not reattach to shared memory (key=%d, addr=%p): %m", |
797 | | (int) UsedShmemSegID, UsedShmemSegAddr); /* NOTE(review): code below reads hdr, so this presumably never returns -- confirm */ |
798 | | if (hdr != origUsedShmemSegAddr) /* the segment must be attached at the same address that was handed to us */ |
799 | | elog(FATAL, "reattaching to shared memory returned unexpected address (got %p, expected %p)", |
800 | | hdr, origUsedShmemSegAddr); |
801 | | dsm_set_control_handle(hdr->dsm_control); /* hand the dsm_control value stored in the header to the DSM code */ |
802 | | |
803 | | UsedShmemSegAddr = hdr; /* probably redundant */ |
804 | | } |
805 | | |
806 | | /* |
807 | | * PGSharedMemoryNoReAttach |
808 | | * |
809 | | * This is called during startup of a postmaster child process when we choose |
810 | | * *not* to re-attach to the existing shared memory segment. We must clean up |
811 | | * to leave things in the appropriate state. This is not used in the non |
812 | | * EXEC_BACKEND case, either. |
813 | | * |
814 | | * The child process startup logic might or might not call PGSharedMemoryDetach |
815 | | * after this; make sure that it will be a no-op if called. |
816 | | * |
817 | | * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
818 | | * routine. The caller must have already restored them to the postmaster's |
819 | | * values. |
820 | | */ |
821 | | void |
822 | | PGSharedMemoryNoReAttach(void) |
823 | | { /* Forget the inherited segment address and key instead of re-attaching to the segment. */ |
824 | | Assert(UsedShmemSegAddr != NULL); |
825 | | Assert(IsUnderPostmaster); |
826 | | |
827 | | #ifdef __CYGWIN__ |
828 | | /* cygipc (currently) appears to not detach on exec. */ |
829 | | PGSharedMemoryDetach(); |
830 | | #endif |
831 | | |
832 | | /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */ |
833 | | UsedShmemSegAddr = NULL; /* also makes the shmdt() branch of a later PGSharedMemoryDetach() a no-op */ |
834 | | /* And the same for UsedShmemSegID. */ |
835 | | UsedShmemSegID = 0; |
836 | | } |
837 | | |
838 | | #endif /* EXEC_BACKEND */ |
839 | | |
840 | | /* |
841 | | * PGSharedMemoryDetach |
842 | | * |
843 | | * Detach from the shared memory segment, if still attached. This is not |
844 | | * intended to be called explicitly by the process that originally created the |
845 | | * segment (it will have on_shmem_exit callback(s) registered to do that). |
846 | | * Rather, this is for subprocesses that have inherited an attachment and want |
847 | | * to get rid of it. |
848 | | * |
849 | | * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this |
850 | | * routine, also AnonymousShmem and AnonymousShmemSize. |
851 | | */ |
852 | | void |
853 | | PGSharedMemoryDetach(void) |
854 | 901 | { /* Drop the SysV attachment and the anonymous mapping, whichever are still present. */ |
855 | 901 | if (UsedShmemSegAddr != NULL) /* skip if there is no recorded SysV attachment */ |
856 | 901 | { |
857 | 901 | if ((shmdt(UsedShmemSegAddr) < 0) |
858 | | #if defined(EXEC_BACKEND) && defined(__CYGWIN__) |
859 | | /* Work-around for cygipc exec bug: when shmdt() fails, retry with NULL and log only if that fails too */ |
860 | | && shmdt(NULL) < 0 |
861 | | #endif |
862 | 901 | ) |
863 | 0 | elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr); /* failure is only logged, not raised */ |
864 | 901 | UsedShmemSegAddr = NULL; /* cleared even if shmdt() failed, so a repeated call does nothing here */ |
865 | 901 | } |
866 | | |
867 | 901 | if (AnonymousShmem != NULL) /* skip if there is no recorded anonymous mapping */ |
868 | 901 | { |
869 | 901 | if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) |
870 | 0 | elog(LOG, "munmap(%p, %zu) failed: %m", |
871 | 901 | AnonymousShmem, AnonymousShmemSize); /* failure is only logged, not raised */ |
872 | 901 | AnonymousShmem = NULL; /* cleared even if munmap() failed; AnonymousShmemSize is left unchanged */ |
873 | 901 | } |
874 | 901 | } |