YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/postgres/src/backend/access/transam/xlog.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * xlog.c
4
 *    PostgreSQL write-ahead log manager
5
 *
6
 *
7
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8
 * Portions Copyright (c) 1994, Regents of the University of California
9
 *
10
 * src/backend/access/transam/xlog.c
11
 *
12
 *-------------------------------------------------------------------------
13
 */
14
15
#include "postgres.h"
16
17
#include <ctype.h>
18
#include <math.h>
19
#include <time.h>
20
#include <fcntl.h>
21
#include <sys/stat.h>
22
#include <sys/time.h>
23
#include <unistd.h>
24
25
#include "access/clog.h"
26
#include "access/commit_ts.h"
27
#include "access/multixact.h"
28
#include "access/rewriteheap.h"
29
#include "access/subtrans.h"
30
#include "access/timeline.h"
31
#include "access/transam.h"
32
#include "access/tuptoaster.h"
33
#include "access/twophase.h"
34
#include "access/xact.h"
35
#include "access/xlog_internal.h"
36
#include "access/xloginsert.h"
37
#include "access/xlogreader.h"
38
#include "access/xlogutils.h"
39
#include "catalog/catversion.h"
40
#include "catalog/pg_control.h"
41
#include "catalog/pg_database.h"
42
#include "commands/tablespace.h"
43
#include "miscadmin.h"
44
#include "pgstat.h"
45
#include "port/atomics.h"
46
#include "postmaster/bgwriter.h"
47
#include "postmaster/walwriter.h"
48
#include "postmaster/startup.h"
49
#include "replication/basebackup.h"
50
#include "replication/logical.h"
51
#include "replication/slot.h"
52
#include "replication/origin.h"
53
#include "replication/snapbuild.h"
54
#include "replication/walreceiver.h"
55
#include "replication/walsender.h"
56
#include "storage/bufmgr.h"
57
#include "storage/fd.h"
58
#include "storage/ipc.h"
59
#include "storage/large_object.h"
60
#include "storage/latch.h"
61
#include "storage/pmsignal.h"
62
#include "storage/predicate.h"
63
#include "storage/proc.h"
64
#include "storage/procarray.h"
65
#include "storage/reinit.h"
66
#include "storage/smgr.h"
67
#include "storage/spin.h"
68
#include "utils/backend_random.h"
69
#include "utils/builtins.h"
70
#include "utils/guc.h"
71
#include "utils/memutils.h"
72
#include "utils/pg_lsn.h"
73
#include "utils/ps_status.h"
74
#include "utils/relmapper.h"
75
#include "utils/snapmgr.h"
76
#include "utils/timestamp.h"
77
#include "pg_trace.h"
78
79
extern uint32 bootstrap_data_checksum_version;
80
81
/* File path names (all relative to $PGDATA) */
82
3.99k
#define RECOVERY_COMMAND_FILE "recovery.conf"
83
0
#define RECOVERY_COMMAND_DONE "recovery.done"
84
1.99k
#define PROMOTE_SIGNAL_FILE   "promote"
85
1.99k
#define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
86
87
88
/* User-settable parameters */
89
int     max_wal_size_mb = 1024; /* 1 GB */
90
int     min_wal_size_mb = 80; /* 80 MB */
91
int     wal_keep_segments = 0;
92
int     XLOGbuffers = -1;
93
int     XLogArchiveTimeout = 0;
94
int     XLogArchiveMode = ARCHIVE_MODE_OFF;
95
char     *XLogArchiveCommand = NULL;
96
bool    EnableHotStandby = false;
97
bool    fullPageWrites = true;
98
bool    wal_log_hints = false;
99
bool    wal_compression = false;
100
char     *wal_consistency_checking_string = NULL;
101
bool     *wal_consistency_checking = NULL;
102
bool    log_checkpoints = false;
103
int     sync_method = DEFAULT_SYNC_METHOD;
104
int     wal_level = WAL_LEVEL_MINIMAL;
105
int     CommitDelay = 0;  /* precommit delay in microseconds */
106
int     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
107
int     wal_retrieve_retry_interval = 5000;
108
109
#ifdef WAL_DEBUG
110
bool    XLOG_DEBUG = false;
111
#endif
112
113
int     wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
114
115
/*
116
 * Number of WAL insertion locks to use. A higher value allows more insertions
117
 * to happen concurrently, but adds some CPU overhead to flushing the WAL,
118
 * which needs to iterate all the locks.
119
 */
120
186k
#define NUM_XLOGINSERT_LOCKS  8
121
122
/*
123
 * Max distance from last checkpoint, before triggering a new xlog-based
124
 * checkpoint.
125
 */
126
int     CheckPointSegments;
127
128
/* Estimated distance between checkpoints, in bytes */
129
static double CheckPointDistanceEstimate = 0;
130
static double PrevCheckPointDistance = 0;
131
132
/*
133
 * GUC support
134
 */
135
const struct config_enum_entry sync_method_options[] = {
136
  {"fsync", SYNC_METHOD_FSYNC, false},
137
#ifdef HAVE_FSYNC_WRITETHROUGH
138
  {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
139
#endif
140
#ifdef HAVE_FDATASYNC
141
  {"fdatasync", SYNC_METHOD_FDATASYNC, false},
142
#endif
143
#ifdef OPEN_SYNC_FLAG
144
  {"open_sync", SYNC_METHOD_OPEN, false},
145
#endif
146
#ifdef OPEN_DATASYNC_FLAG
147
  {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
148
#endif
149
  {NULL, 0, false}
150
};
151
152
153
/*
154
 * Although only "on", "off", and "always" are documented,
155
 * we accept all the likely variants of "on" and "off".
156
 */
157
const struct config_enum_entry archive_mode_options[] = {
158
  {"always", ARCHIVE_MODE_ALWAYS, false},
159
  {"on", ARCHIVE_MODE_ON, false},
160
  {"off", ARCHIVE_MODE_OFF, false},
161
  {"true", ARCHIVE_MODE_ON, true},
162
  {"false", ARCHIVE_MODE_OFF, true},
163
  {"yes", ARCHIVE_MODE_ON, true},
164
  {"no", ARCHIVE_MODE_OFF, true},
165
  {"1", ARCHIVE_MODE_ON, true},
166
  {"0", ARCHIVE_MODE_OFF, true},
167
  {NULL, 0, false}
168
};
169
170
/*
171
 * Statistics for current checkpoint are collected in this global struct.
172
 * Because only the checkpointer or a stand-alone backend can perform
173
 * checkpoints, this will be unused in normal backends.
174
 */
175
CheckpointStatsData CheckpointStats;
176
177
/*
178
 * ThisTimeLineID will be same in all backends --- it identifies current
179
 * WAL timeline for the database system.
180
 */
181
TimeLineID  ThisTimeLineID = 0;
182
183
/*
184
 * Are we doing recovery from XLOG?
185
 *
186
 * This is only ever true in the startup process; it should be read as meaning
187
 * "this process is replaying WAL records", rather than "the system is in
188
 * recovery mode".  It should be examined primarily by functions that need
189
 * to act differently when called from a WAL redo function (e.g., to skip WAL
190
 * logging).  To check whether the system is in recovery regardless of which
191
 * process you're running in, use RecoveryInProgress() but only after shared
192
 * memory startup and lock initialization.
193
 */
194
bool    InRecovery = false;
195
196
/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
197
HotStandbyState standbyState = STANDBY_DISABLED;
198
199
static XLogRecPtr LastRec;
200
201
/* Local copy of WalRcv->receivedUpto */
202
static XLogRecPtr receivedUpto = 0;
203
static TimeLineID receiveTLI = 0;
204
205
/*
206
 * During recovery, lastFullPageWrites keeps track of full_page_writes that
207
 * the replayed WAL records indicate. It's initialized with full_page_writes
208
 * that the recovery starting checkpoint record indicates, and then updated
209
 * each time XLOG_FPW_CHANGE record is replayed.
210
 */
211
static bool lastFullPageWrites;
212
213
/*
214
 * Local copy of SharedRecoveryInProgress variable. True actually means "not
215
 * known, need to check the shared state".
216
 */
217
static bool LocalRecoveryInProgress = true;
218
219
/*
220
 * Local copy of SharedHotStandbyActive variable. False actually means "not
221
 * known, need to check the shared state".
222
 */
223
static bool LocalHotStandbyActive = false;
224
225
/*
226
 * Local state for XLogInsertAllowed():
227
 *    1: unconditionally allowed to insert XLOG
228
 *    0: unconditionally not allowed to insert XLOG
229
 *    -1: must check RecoveryInProgress(); disallow until it is false
230
 * Most processes start with -1 and transition to 1 after seeing that recovery
231
 * is not in progress.  But we can also force the value for special cases.
232
 * The coding in XLogInsertAllowed() depends on the first two of these states
233
 * being numerically the same as bool true and false.
234
 */
235
static int  LocalXLogInsertAllowed = -1;
236
237
/*
238
 * When ArchiveRecoveryRequested is set, archive recovery was requested,
239
 * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
240
 * currently recovering using offline XLOG archives. These variables are only
241
 * valid in the startup process.
242
 *
243
 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
244
 * currently performing crash recovery using only XLOG files in pg_wal, but
245
 * will switch to using offline XLOG archives as soon as we reach the end of
246
 * WAL in pg_wal.
247
*/
248
bool    ArchiveRecoveryRequested = false;
249
bool    InArchiveRecovery = false;
250
251
/* Was the last xlog file restored from archive, or local? */
252
static bool restoredFromArchive = false;
253
254
/* Buffers dedicated to consistency checks of size BLCKSZ */
255
static char *replay_image_masked = NULL;
256
static char *master_image_masked = NULL;
257
258
/* options taken from recovery.conf for archive recovery */
259
char     *recoveryRestoreCommand = NULL;
260
static char *recoveryEndCommand = NULL;
261
static char *archiveCleanupCommand = NULL;
262
static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
263
static bool recoveryTargetInclusive = true;
264
static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
265
static TransactionId recoveryTargetXid;
266
static TimestampTz recoveryTargetTime;
267
static char *recoveryTargetName;
268
static XLogRecPtr recoveryTargetLSN;
269
static int  recovery_min_apply_delay = 0;
270
static TimestampTz recoveryDelayUntilTime;
271
272
/* options taken from recovery.conf for XLOG streaming */
273
static bool StandbyModeRequested = false;
274
static char *PrimaryConnInfo = NULL;
275
static char *PrimarySlotName = NULL;
276
static char *TriggerFile = NULL;
277
278
/* are we currently in standby mode? */
279
bool    StandbyMode = false;
280
281
/* whether request for fast promotion has been made yet */
282
static bool fast_promote = false;
283
284
/*
285
 * if recoveryStopsBefore/After returns true, it saves information of the stop
286
 * point here
287
 */
288
static TransactionId recoveryStopXid;
289
static TimestampTz recoveryStopTime;
290
static XLogRecPtr recoveryStopLSN;
291
static char recoveryStopName[MAXFNAMELEN];
292
static bool recoveryStopAfter;
293
294
/*
295
 * During normal operation, the only timeline we care about is ThisTimeLineID.
296
 * During recovery, however, things are more complicated.  To simplify life
297
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
298
 * scan through the WAL history (that is, it is the line that was active when
299
 * the currently-scanned WAL record was generated).  We also need these
300
 * timeline values:
301
 *
302
 * recoveryTargetTLI: the desired timeline that we want to end in.
303
 *
304
 * recoveryTargetIsLatest: was the requested target timeline 'latest'?
305
 *
306
 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
307
 * its known parents, newest first (so recoveryTargetTLI is always the
308
 * first list member).  Only these TLIs are expected to be seen in the WAL
309
 * segments we read, and indeed only these TLIs will be considered as
310
 * candidate WAL files to open at all.
311
 *
312
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
313
 * (This is not necessarily the same as ThisTimeLineID, because we could
314
 * be scanning data that was copied from an ancestor timeline when the current
315
 * file was created.)  During a sequential scan we do not allow this value
316
 * to decrease.
317
 */
318
static TimeLineID recoveryTargetTLI;
319
static bool recoveryTargetIsLatest = false;
320
static List *expectedTLEs;
321
static TimeLineID curFileTLI;
322
323
/*
324
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
325
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
326
 * end+1 of the last record, and is reset when we end a top-level transaction,
327
 * or start a new one; so it can be used to tell if the current transaction has
328
 * created any XLOG records.
329
 *
330
 * While in parallel mode, this may not be fully up to date.  When committing,
331
 * a transaction can assume this covers all xlog records written either by the
332
 * user backend or by any parallel worker which was present at any point during
333
 * the transaction.  But when aborting, or when still in parallel mode, other
334
 * parallel backends may have written WAL records at later LSNs than the value
335
 * stored here.  The parallel leader advances its own copy, when necessary,
336
 * in WaitForParallelWorkersToFinish.
337
 */
338
XLogRecPtr  ProcLastRecPtr = InvalidXLogRecPtr;
339
XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
340
XLogRecPtr  XactLastCommitEnd = InvalidXLogRecPtr;
341
342
/*
343
 * RedoRecPtr is this backend's local copy of the REDO record pointer
344
 * (which is almost but not quite the same as a pointer to the most recent
345
 * CHECKPOINT record).  We update this from the shared-memory copy,
346
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
347
 * hold an insertion lock).  See XLogInsertRecord for details.  We are also
348
 * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
349
 * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
350
 * InitXLOGAccess.
351
 */
352
static XLogRecPtr RedoRecPtr;
353
354
/*
355
 * doPageWrites is this backend's local copy of (forcePageWrites ||
356
 * fullPageWrites).  It is used together with RedoRecPtr to decide whether
357
 * a full-page image of a page need to be taken.
358
 */
359
static bool doPageWrites;
360
361
/* Has the recovery code requested a walreceiver wakeup? */
362
static bool doRequestWalReceiverReply;
363
364
/*
365
 * RedoStartLSN points to the checkpoint's REDO location which is specified
366
 * in a backup label file, backup history file or control file. In standby
367
 * mode, XLOG streaming usually starts from the position where an invalid
368
 * record was found. But if we fail to read even the initial checkpoint
369
 * record, we use the REDO location instead of the checkpoint location as
370
 * the start position of XLOG streaming. Otherwise we would have to jump
371
 * backwards to the REDO location after reading the checkpoint record,
372
 * because the REDO record can precede the checkpoint record.
373
 */
374
static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
375
376
/*----------
377
 * Shared-memory data structures for XLOG control
378
 *
379
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
380
 * the log up to (all records before that point must be written or fsynced).
381
 * LogwrtResult indicates the byte positions we have already written/fsynced.
382
 * These structs are identical but are declared separately to indicate their
383
 * slightly different functions.
384
 *
385
 * To read XLogCtl->LogwrtResult, you must hold either info_lck or
386
 * WALWriteLock.  To update it, you need to hold both locks.  The point of
387
 * this arrangement is that the value can be examined by code that already
388
 * holds WALWriteLock without needing to grab info_lck as well.  In addition
389
 * to the shared variable, each backend has a private copy of LogwrtResult,
390
 * which is updated when convenient.
391
 *
392
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
393
 * (protected by info_lck), but we don't need to cache any copies of it.
394
 *
395
 * info_lck is only held long enough to read/update the protected variables,
396
 * so it's a plain spinlock.  The other locks are held longer (potentially
397
 * over I/O operations), so we use LWLocks for them.  These locks are:
398
 *
399
 * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
400
 * It is only held while initializing and changing the mapping.  If the
401
 * contents of the buffer being replaced haven't been written yet, the mapping
402
 * lock is released while the write is done, and reacquired afterwards.
403
 *
404
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
405
 * XLogFlush).
406
 *
407
 * ControlFileLock: must be held to read/update control file or create
408
 * new log file.
409
 *
410
 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
411
 * only one checkpointer at a time; currently, with all checkpoints done by
412
 * the checkpointer, this is just pro forma).
413
 *
414
 *----------
415
 */
416
417
typedef struct XLogwrtRqst
418
{
419
  XLogRecPtr  Write;      /* last byte + 1 to write out */
420
  XLogRecPtr  Flush;      /* last byte + 1 to flush */
421
} XLogwrtRqst;
422
423
typedef struct XLogwrtResult
424
{
425
  XLogRecPtr  Write;      /* last byte + 1 written out */
426
  XLogRecPtr  Flush;      /* last byte + 1 flushed */
427
} XLogwrtResult;
428
429
/*
430
 * Inserting to WAL is protected by a small fixed number of WAL insertion
431
 * locks. To insert to the WAL, you must hold one of the locks - it doesn't
432
 * matter which one. To lock out other concurrent insertions, you must hold
433
 * of them. Each WAL insertion lock consists of a lightweight lock, plus an
434
 * indicator of how far the insertion has progressed (insertingAt).
435
 *
436
 * The insertingAt values are read when a process wants to flush WAL from
437
 * the in-memory buffers to disk, to check that all the insertions to the
438
 * region the process is about to write out have finished. You could simply
439
 * wait for all currently in-progress insertions to finish, but the
440
 * insertingAt indicator allows you to ignore insertions to later in the WAL,
441
 * so that you only wait for the insertions that are modifying the buffers
442
 * you're about to write out.
443
 *
444
 * This isn't just an optimization. If all the WAL buffers are dirty, an
445
 * inserter that's holding a WAL insert lock might need to evict an old WAL
446
 * buffer, which requires flushing the WAL. If it's possible for an inserter
447
 * to block on another inserter unnecessarily, deadlock can arise when two
448
 * inserters holding a WAL insert lock wait for each other to finish their
449
 * insertion.
450
 *
451
 * Small WAL records that don't cross a page boundary never update the value,
452
 * the WAL record is just copied to the page and the lock is released. But
453
 * to avoid the deadlock-scenario explained above, the indicator is always
454
 * updated before sleeping while holding an insertion lock.
455
 *
456
 * lastImportantAt contains the LSN of the last important WAL record inserted
457
 * using a given lock. This value is used to detect if there has been
458
 * important WAL activity since the last time some action, like a checkpoint,
459
 * was performed - allowing to not repeat the action if not. The LSN is
460
 * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
461
 * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
462
 * records.  Tracking the WAL activity directly in WALInsertLock has the
463
 * advantage of not needing any additional locks to update the value.
464
 */
465
typedef struct
466
{
467
  LWLock    lock;
468
  XLogRecPtr  insertingAt;
469
  XLogRecPtr  lastImportantAt;
470
} WALInsertLock;
471
472
/*
473
 * All the WAL insertion locks are allocated as an array in shared memory. We
474
 * force the array stride to be a power of 2, which saves a few cycles in
475
 * indexing, but more importantly also ensures that individual slots don't
476
 * cross cache line boundaries. (Of course, we have to also ensure that the
477
 * array start address is suitably aligned.)
478
 */
479
typedef union WALInsertLockPadded
480
{
481
  WALInsertLock l;
482
  char    pad[PG_CACHE_LINE_SIZE];
483
} WALInsertLockPadded;
484
485
/*
486
 * State of an exclusive backup, necessary to control concurrent activities
487
 * across sessions when working on exclusive backups.
488
 *
489
 * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
490
 * running, to be more precise pg_start_backup() is not being executed for
491
 * an exclusive backup and there is no exclusive backup in progress.
492
 * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
493
 * exclusive backup.
494
 * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
495
 * running and an exclusive backup is in progress. pg_stop_backup() is
496
 * needed to finish it.
497
 * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
498
 * exclusive backup.
499
 */
500
typedef enum ExclusiveBackupState
501
{
502
  EXCLUSIVE_BACKUP_NONE = 0,
503
  EXCLUSIVE_BACKUP_STARTING,
504
  EXCLUSIVE_BACKUP_IN_PROGRESS,
505
  EXCLUSIVE_BACKUP_STOPPING
506
} ExclusiveBackupState;
507
508
/*
509
 * Session status of running backup, used for sanity checks in SQL-callable
510
 * functions to start and stop backups.
511
 */
512
static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
513
514
/*
515
 * Shared state data for WAL insertion.
516
 */
517
typedef struct XLogCtlInsert
518
{
519
  slock_t   insertpos_lck;  /* protects CurrBytePos and PrevBytePos */
520
521
  /*
522
   * CurrBytePos is the end of reserved WAL. The next record will be
523
   * inserted at that position. PrevBytePos is the start position of the
524
   * previously inserted (or rather, reserved) record - it is copied to the
525
   * prev-link of the next record. These are stored as "usable byte
526
   * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
527
   */
528
  uint64    CurrBytePos;
529
  uint64    PrevBytePos;
530
531
  /*
532
   * Make sure the above heavily-contended spinlock and byte positions are
533
   * on their own cache line. In particular, the RedoRecPtr and full page
534
   * write variables below should be on a different cache line. They are
535
   * read on every WAL insertion, but updated rarely, and we don't want
536
   * those reads to steal the cache line containing Curr/PrevBytePos.
537
   */
538
  char    pad[PG_CACHE_LINE_SIZE];
539
540
  /*
541
   * fullPageWrites is the master copy used by all backends to determine
542
   * whether to write full-page to WAL, instead of using process-local one.
543
   * This is required because, when full_page_writes is changed by SIGHUP,
544
   * we must WAL-log it before it actually affects WAL-logging by backends.
545
   * Checkpointer sets at startup or after SIGHUP.
546
   *
547
   * To read these fields, you must hold an insertion lock. To modify them,
548
   * you must hold ALL the locks.
549
   */
550
  XLogRecPtr  RedoRecPtr;   /* current redo point for insertions */
551
  bool    forcePageWrites;  /* forcing full-page writes for PITR? */
552
  bool    fullPageWrites;
553
554
  /*
555
   * exclusiveBackupState indicates the state of an exclusive backup (see
556
   * comments of ExclusiveBackupState for more details). nonExclusiveBackups
557
   * is a counter indicating the number of streaming base backups currently
558
   * in progress. forcePageWrites is set to true when either of these is
559
   * non-zero. lastBackupStart is the latest checkpoint redo location used
560
   * as a starting point for an online backup.
561
   */
562
  ExclusiveBackupState exclusiveBackupState;
563
  int     nonExclusiveBackups;
564
  XLogRecPtr  lastBackupStart;
565
566
  /*
567
   * WAL insertion locks.
568
   */
569
  WALInsertLockPadded *WALInsertLocks;
570
} XLogCtlInsert;
571
572
/*
573
 * Total shared-memory state for XLOG.
574
 */
575
typedef struct XLogCtlData
576
{
577
  XLogCtlInsert Insert;
578
579
  /* Protected by info_lck: */
580
  XLogwrtRqst LogwrtRqst;
581
  XLogRecPtr  RedoRecPtr;   /* a recent copy of Insert->RedoRecPtr */
582
  uint32    ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
583
  TransactionId ckptXid;
584
  XLogRecPtr  asyncXactLSN; /* LSN of newest async commit/abort */
585
  XLogRecPtr  replicationSlotMinLSN;  /* oldest LSN needed by any slot */
586
587
  XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
588
589
  /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
590
  XLogRecPtr  unloggedLSN;
591
  slock_t   ulsn_lck;
592
593
  /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
594
  pg_time_t lastSegSwitchTime;
595
  XLogRecPtr  lastSegSwitchLSN;
596
597
  /*
598
   * Protected by info_lck and WALWriteLock (you must hold either lock to
599
   * read it, but both to update)
600
   */
601
  XLogwrtResult LogwrtResult;
602
603
  /*
604
   * Latest initialized page in the cache (last byte position + 1).
605
   *
606
   * To change the identity of a buffer (and InitializedUpTo), you need to
607
   * hold WALBufMappingLock.  To change the identity of a buffer that's
608
   * still dirty, the old page needs to be written out first, and for that
609
   * you need WALWriteLock, and you need to ensure that there are no
610
   * in-progress insertions to the page by calling
611
   * WaitXLogInsertionsToFinish().
612
   */
613
  XLogRecPtr  InitializedUpTo;
614
615
  /*
616
   * These values do not change after startup, although the pointed-to pages
617
   * and xlblocks values certainly do.  xlblock values are protected by
618
   * WALBufMappingLock.
619
   */
620
  char     *pages;      /* buffers for unwritten XLOG pages */
621
  XLogRecPtr *xlblocks;   /* 1st byte ptr-s + XLOG_BLCKSZ */
622
  int     XLogCacheBlck;  /* highest allocated xlog buffer index */
623
624
  /*
625
   * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
626
   * If we created a new timeline when the system was started up,
627
   * PrevTimeLineID is the old timeline's ID that we forked off from.
628
   * Otherwise it's equal to ThisTimeLineID.
629
   */
630
  TimeLineID  ThisTimeLineID;
631
  TimeLineID  PrevTimeLineID;
632
633
  /*
634
   * archiveCleanupCommand is read from recovery.conf but needs to be in
635
   * shared memory so that the checkpointer process can access it.
636
   */
637
  char    archiveCleanupCommand[MAXPGPATH];
638
639
  /*
640
   * SharedRecoveryInProgress indicates if we're still in crash or archive
641
   * recovery.  Protected by info_lck.
642
   */
643
  bool    SharedRecoveryInProgress;
644
645
  /*
646
   * SharedHotStandbyActive indicates if we're still in crash or archive
647
   * recovery.  Protected by info_lck.
648
   */
649
  bool    SharedHotStandbyActive;
650
651
  /*
652
   * WalWriterSleeping indicates whether the WAL writer is currently in
653
   * low-power mode (and hence should be nudged if an async commit occurs).
654
   * Protected by info_lck.
655
   */
656
  bool    WalWriterSleeping;
657
658
  /*
659
   * recoveryWakeupLatch is used to wake up the startup process to continue
660
   * WAL replay, if it is waiting for WAL to arrive or failover trigger file
661
   * to appear.
662
   */
663
  Latch   recoveryWakeupLatch;
664
665
  /*
666
   * During recovery, we keep a copy of the latest checkpoint record here.
667
   * lastCheckPointRecPtr points to start of checkpoint record and
668
   * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
669
   * checkpointer when it wants to create a restartpoint.
670
   *
671
   * Protected by info_lck.
672
   */
673
  XLogRecPtr  lastCheckPointRecPtr;
674
  XLogRecPtr  lastCheckPointEndPtr;
675
  CheckPoint  lastCheckPoint;
676
677
  /*
678
   * lastReplayedEndRecPtr points to end+1 of the last record successfully
679
   * replayed. When we're currently replaying a record, ie. in a redo
680
   * function, replayEndRecPtr points to the end+1 of the record being
681
   * replayed, otherwise it's equal to lastReplayedEndRecPtr.
682
   */
683
  XLogRecPtr  lastReplayedEndRecPtr;
684
  TimeLineID  lastReplayedTLI;
685
  XLogRecPtr  replayEndRecPtr;
686
  TimeLineID  replayEndTLI;
687
  /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
688
  TimestampTz recoveryLastXTime;
689
690
  /*
691
   * timestamp of when we started replaying the current chunk of WAL data,
692
   * only relevant for replication or archive recovery
693
   */
694
  TimestampTz currentChunkStartTime;
695
  /* Are we requested to pause recovery? */
696
  bool    recoveryPause;
697
698
  /*
699
   * lastFpwDisableRecPtr points to the start of the last replayed
700
   * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
701
   */
702
  XLogRecPtr  lastFpwDisableRecPtr;
703
704
  slock_t   info_lck;   /* locks shared variables shown above */
705
} XLogCtlData;
706
707
static XLogCtlData *XLogCtl = NULL;
708
709
/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
710
static WALInsertLockPadded *WALInsertLocks = NULL;
711
712
/*
713
 * We maintain an image of pg_control in shared memory.
714
 */
715
static ControlFileData *ControlFile = NULL;
716
717
/*
718
 * Calculate the amount of space left on the page after 'endptr'. Beware
719
 * multiple evaluation!
720
 */
721
#define INSERT_FREESPACE(endptr)  \
722
5.42k
  (((endptr) % XLOG_BLCKSZ == 0) ? 
00
: (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
723
724
/* Macro to advance to next buffer index. */
725
#define NextBufIdx(idx)   \
726
1
    (((idx) == XLogCtl->XLogCacheBlck) ? 
00
: ((idx) + 1))
727
728
/*
729
 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
730
 * would hold if it was in cache, the page containing 'recptr'.
731
 */
732
#define XLogRecPtrToBufIdx(recptr)  \
733
8.58k
  (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
734
735
/*
736
 * These are the number of bytes in a WAL page usable for WAL data.
737
 */
738
4.17k
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
739
740
/* Convert min_wal_size_mb and max wal_size_mb to equivalent segment count */
741
#define ConvertToXSegs(x, segsize)  \
742
24.0k
  (x / ((segsize) / (1024 * 1024)))
743
744
/* The number of bytes in a WAL segment usable for WAL data. */
745
static int  UsableBytesInSegment;
746
747
/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {0, 0};

/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.
 */
typedef enum
{
    XLOG_FROM_ANY = 0,      /* request to read WAL from any source */
    XLOG_FROM_ARCHIVE,      /* restored using restore_command */
    XLOG_FROM_PG_WAL,       /* existing file in pg_wal */
    XLOG_FROM_STREAM        /* streamed from master */
} XLogSource;

/* human-readable names for XLogSources, for debugging output */
static const char *xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};

/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogSegNo identifies the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int  openLogFile = -1;
static XLogSegNo openLogSegNo = 0;
static uint32 openLogOff = 0;

/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
 * will be just past that page. readLen indicates how much of the current
 * page has been read into readBuf, and readSource indicates where we got
 * the currently open file from.
 */
static int  readFile = -1;
static XLogSegNo readSegNo = 0;
static uint32 readOff = 0;
static uint32 readLen = 0;
static XLogSource readSource = 0;   /* XLOG_FROM_* code */

/*
 * Keeps track of which source we're currently reading from. This is
 * different from readSource in that this is always set, even when we don't
 * currently have a WAL file open. If lastSourceFailed is set, our last
 * attempt to read from currentSource failed, and we should try another source
 * next.
 */
static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
static bool lastSourceFailed = false;

/* Arguments threaded through the xlogreader to XLogPageRead(). */
typedef struct XLogPageReadPrivate
{
    int         emode;          /* elog level to use when reads fail */
    bool        fetching_ckpt;  /* are we fetching a checkpoint record? */
    bool        randAccess;     /* random (vs. sequential) read pattern */
} XLogPageReadPrivate;

/*
 * These variables track when we last obtained some WAL data to process,
 * and where we got it from.  (XLogReceiptSource is initially the same as
 * readSource, but readSource gets reset to zero when we don't have data
 * to process right now.  It is also different from currentSource, which
 * also changes when we try to read from a source and fail, while
 * XLogReceiptSource tracks where we last successfully read some WAL.)
 */
static TimestampTz XLogReceiptTime = 0;
static XLogSource XLogReceiptSource = 0;    /* XLOG_FROM_* code */

/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr;   /* start of last record read */
static XLogRecPtr EndRecPtr;    /* end+1 of last record read */

/*
 * Local copies of equivalent fields in the control file.  When running
 * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
 * expect to replay all the WAL available, and updateMinRecoveryPoint is
 * switched to false to prevent any updates while replaying records.
 * Those values are kept consistent as long as crash recovery runs.
 */
static XLogRecPtr minRecoveryPoint;
static TimeLineID minRecoveryPointTLI;
static bool updateMinRecoveryPoint = true;

/*
 * Have we reached a consistent database state? In crash recovery, we have
 * to replay all the WAL, so reachedConsistency is never set. During archive
 * recovery, the database is consistent once minRecoveryPoint is reached.
 */
bool        reachedConsistency = false;

/* True while we are applying WAL records in the main redo loop. */
static bool InRedo = false;

/* Have we launched bgwriter during recovery? */
static bool bgwriterLaunched = false;

/* For WALInsertLockAcquire/Release functions */
static int  MyLockNo = 0;               /* index of the insertion lock we hold */
static bool holdingAllLocks = false;    /* true when all insertion locks held */

#ifdef WAL_DEBUG
static MemoryContext walDebugCxt = NULL;
#endif
854
855
/* Forward declarations for this file's static helpers. */
static void readRecoveryCommandFile(void);
static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
static bool recoveryStopsBefore(XLogReaderState *record);
static bool recoveryStopsAfter(XLogReaderState *record);
static void recoveryPausesHere(void);
static bool recoveryApplyDelay(XLogReaderState *record);
static void SetLatestXTime(TimestampTz xtime);
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
          TimeLineID prevTLI);
static void LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);

static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
static bool XLogCheckpointNeeded(XLogSegNo new_segno);
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
             bool find_free, XLogSegNo max_segno,
             bool use_lock);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
       int source, bool notfoundOk);
static int  XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
       int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
       TimeLineID *readTLI);
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
              bool fetching_ckpt, XLogRecPtr tliRecPtr);
static int  emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr RedoRecPtr, XLogRecPtr endptr);
static void RemoveXlogFile(const char *segname, XLogRecPtr RedoRecPtr, XLogRecPtr endptr);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
       int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
           XLogRecPtr RecPtr, int whichChkpti, bool report);
static bool rescanLatestTimeLine(void);
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(pg_time_t tnow);
static bool CheckForStandbyTrigger(void);

#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogReaderState *record);
#endif
static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
static void pg_start_backup_callback(int code, Datum arg);
static void pg_stop_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc,
          bool *backupEndRequired, bool *backupFromStandby);
static bool read_tablespace_map(List **tablespaces);

static void rm_redo_error_callback(void *arg);
static int  get_sync_bit(int method);

static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
          XLogRecData *rdata,
          XLogRecPtr StartPos, XLogRecPtr EndPos);
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
              XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
          XLogRecPtr *PrevPtr);
static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
static char *GetXLogBuffer(XLogRecPtr ptr);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
static void checkXLogConsistency(XLogReaderState *record);

static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
938
939
/*
 * Insert an XLOG record represented by an already-constructed chain of data
 * chunks.  This is a low-level routine; to construct the WAL record header
 * and data, use the higher-level routines in xloginsert.c.
 *
 * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 * WAL record applies to, that were not included in the record as full page
 * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
 * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
 * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
 * record is always inserted.
 *
 * 'flags' gives more in-depth control on the record being inserted. See
 * XLogSetRecordFlags() for details.
 *
 * The first XLogRecData in the chain must be for the record header, and its
 * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 * xl_crc fields in the header, the rest of the header must already be filled
 * by the caller.
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 */
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata,
         XLogRecPtr fpw_lsn,
         uint8 flags)
{
  XLogCtlInsert *Insert = &XLogCtl->Insert;
  pg_crc32c rdata_crc;
  bool    inserted;
  XLogRecord *rechdr = (XLogRecord *) rdata->data;
  /* strip the flag bits that are not part of the rmgr-specific info */
  uint8   info = rechdr->xl_info & ~XLR_INFO_MASK;
  bool    isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
                 info == XLOG_SWITCH);
  XLogRecPtr  StartPos;
  XLogRecPtr  EndPos;
  /* remember whether we were doing full-page writes before this insert */
  bool    prevDoPageWrites = doPageWrites;

  /* we assume that all of the record header is in the first chunk */
  Assert(rdata->len >= SizeOfXLogRecord);

  /* cross-check on whether we should be here or not */
  if (!XLogInsertAllowed())
    elog(ERROR, "cannot make new WAL entries during recovery");

  /*----------
   *
   * We have now done all the preparatory work we can without holding a
   * lock or modifying shared state. From here on, inserting the new WAL
   * record to the shared WAL buffer cache is a two-step process:
   *
   * 1. Reserve the right amount of space from the WAL. The current head of
   *    reserved space is kept in Insert->CurrBytePos, and is protected by
   *    insertpos_lck.
   *
   * 2. Copy the record to the reserved WAL space. This involves finding the
   *    correct WAL buffer containing the reserved space, and copying the
   *    record in place. This can be done concurrently in multiple processes.
   *
   * To keep track of which insertions are still in-progress, each concurrent
   * inserter acquires an insertion lock. In addition to just indicating that
   * an insertion is in progress, the lock tells others how far the inserter
   * has progressed. There is a small fixed number of insertion locks,
   * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
   * boundary, it updates the value stored in the lock to the how far it has
   * inserted, to allow the previous buffer to be flushed.
   *
   * Holding onto an insertion lock also protects RedoRecPtr and
   * fullPageWrites from changing until the insertion is finished.
   *
   * Step 2 can usually be done completely in parallel. If the required WAL
   * page is not initialized yet, you have to grab WALBufMappingLock to
   * initialize it, but the WAL writer tries to do that ahead of insertions
   * to avoid that from happening in the critical path.
   *
   *----------
   */
  START_CRIT_SECTION();
  if (isLogSwitch)
    WALInsertLockAcquireExclusive();
  else
    WALInsertLockAcquire();

  /*
   * Check to see if my copy of RedoRecPtr is out of date. If so, may have
   * to go back and have the caller recompute everything. This can only
   * happen just after a checkpoint, so it's better to be slow in this case
   * and fast otherwise.
   *
   * Also check to see if fullPageWrites or forcePageWrites was just turned
   * on; if we weren't already doing full-page writes then go back and
   * recompute.
   *
   * If we aren't doing full-page writes then RedoRecPtr doesn't actually
   * affect the contents of the XLOG record, so we'll update our local copy
   * but not force a recomputation.  (If doPageWrites was just turned off,
   * we could recompute the record without full pages, but we choose not to
   * bother.)
   */
  if (RedoRecPtr != Insert->RedoRecPtr)
  {
    Assert(RedoRecPtr < Insert->RedoRecPtr);
    RedoRecPtr = Insert->RedoRecPtr;
  }
  doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

  if (doPageWrites &&
    (!prevDoPageWrites ||
     (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
  {
    /*
     * Oops, some buffer now needs to be backed up that the caller didn't
     * back up.  Start over.
     */
    WALInsertLockRelease();
    END_CRIT_SECTION();
    return InvalidXLogRecPtr;
  }

  /*
   * Reserve space for the record in the WAL. This also sets the xl_prev
   * pointer.
   */
  if (isLogSwitch)
    inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
  else
  {
    ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
                  &rechdr->xl_prev);
    inserted = true;
  }

  if (inserted)
  {
    /*
     * Now that xl_prev has been filled in, calculate CRC of the record
     * header.
     */
    rdata_crc = rechdr->xl_crc;
    COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
    FIN_CRC32C(rdata_crc);
    rechdr->xl_crc = rdata_crc;

    /*
     * All the record data, including the header, is now ready to be
     * inserted. Copy the record in the space reserved.
     */
    CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
              StartPos, EndPos);

    /*
     * Unless record is flagged as not important, update LSN of last
     * important record in the current slot. When holding all locks, just
     * update the first one.
     */
    if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
    {
      int     lockno = holdingAllLocks ? 0 : MyLockNo;

      WALInsertLocks[lockno].l.lastImportantAt = StartPos;
    }
  }
  else
  {
    /*
     * This was an xlog-switch record, but the current insert location was
     * already exactly at the beginning of a segment, so there was no need
     * to do anything.
     */
  }

  /*
   * Done! Let others know that we're finished.
   */
  WALInsertLockRelease();

  MarkCurrentTransactionIdLoggedIfAny();

  END_CRIT_SECTION();

  /*
   * Update shared LogwrtRqst.Write, if we crossed page boundary.
   */
  if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
  {
    SpinLockAcquire(&XLogCtl->info_lck);
    /* advance global request to include new block(s) */
    if (XLogCtl->LogwrtRqst.Write < EndPos)
      XLogCtl->LogwrtRqst.Write = EndPos;
    /* update local result copy while I have the chance */
    LogwrtResult = XLogCtl->LogwrtResult;
    SpinLockRelease(&XLogCtl->info_lck);
  }

  /*
   * If this was an XLOG_SWITCH record, flush the record and the empty
   * padding space that fills the rest of the segment, and perform
   * end-of-segment actions (eg, notifying archiver).
   */
  if (isLogSwitch)
  {
    TRACE_POSTGRESQL_WAL_SWITCH();
    XLogFlush(EndPos);

    /*
     * Even though we reserved the rest of the segment for us, which is
     * reflected in EndPos, we return a pointer to just the end of the
     * xlog-switch record.
     */
    if (inserted)
    {
      EndPos = StartPos + SizeOfXLogRecord;
      if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
      {
        uint64    offset = XLogSegmentOffset(EndPos, wal_segment_size);

        /* account for the page header at the boundary we just crossed */
        if (offset == EndPos % XLOG_BLCKSZ)
          EndPos += SizeOfXLogLongPHD;
        else
          EndPos += SizeOfXLogShortPHD;
      }
    }
  }

#ifdef WAL_DEBUG
  if (XLOG_DEBUG)
  {
    static XLogReaderState *debug_reader = NULL;
    StringInfoData buf;
    StringInfoData recordBuf;
    char     *errormsg = NULL;
    MemoryContext oldCxt;

    oldCxt = MemoryContextSwitchTo(walDebugCxt);

    initStringInfo(&buf);
    appendStringInfo(&buf, "INSERT @ %X/%X: ",
             (uint32) (EndPos >> 32), (uint32) EndPos);

    /*
     * We have to piece together the WAL record data from the XLogRecData
     * entries, so that we can pass it to the rm_desc function as one
     * contiguous chunk.
     */
    initStringInfo(&recordBuf);
    for (; rdata != NULL; rdata = rdata->next)
      appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);

    if (!debug_reader)
      debug_reader = XLogReaderAllocate(wal_segment_size, NULL, NULL);

    if (!debug_reader)
    {
      appendStringInfoString(&buf, "error decoding record: out of memory");
    }
    else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
                   &errormsg))
    {
      appendStringInfo(&buf, "error decoding record: %s",
               errormsg ? errormsg : "no error message");
    }
    else
    {
      appendStringInfoString(&buf, " - ");
      xlog_outdesc(&buf, debug_reader);
    }
    elog(LOG, "%s", buf.data);

    pfree(buf.data);
    pfree(recordBuf.data);
    MemoryContextSwitchTo(oldCxt);
  }
#endif

  /*
   * Update our global variables
   */
  ProcLastRecPtr = StartPos;
  XactLastRecEnd = EndPos;

  return EndPos;
}
1225
1226
/*
 * Reserves the right amount of space for a record of given size from the WAL.
 * *StartPos is set to the beginning of the reserved section, *EndPos to
 * its end+1. *PrevPtr is set to the beginning of the previous record; it is
 * used to set the xl_prev of this record.
 *
 * This is the performance critical part of XLogInsert that must be serialized
 * across backends. The rest can happen mostly in parallel. Try to keep this
 * section as short as possible, insertpos_lck can be heavily contended on a
 * busy system.
 *
 * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
 * where we actually copy the record to the reserved space.
 */
static void
ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
              XLogRecPtr *PrevPtr)
{
  XLogCtlInsert *Insert = &XLogCtl->Insert;
  uint64    startbytepos;
  uint64    endbytepos;
  uint64    prevbytepos;

  size = MAXALIGN(size);

  /* All (non xlog-switch) records should contain data. */
  Assert(size > SizeOfXLogRecord);

  /*
   * The duration the spinlock needs to be held is minimized by minimizing
   * the calculations that have to be done while holding the lock. The
   * current tip of reserved WAL is kept in CurrBytePos, as a byte position
   * that only counts "usable" bytes in WAL, that is, it excludes all WAL
   * page headers. The mapping between "usable" byte positions and physical
   * positions (XLogRecPtrs) can be done outside the locked region, and
   * because the usable byte position doesn't include any headers, reserving
   * X bytes from WAL is almost as simple as "CurrBytePos += X".
   */
  SpinLockAcquire(&Insert->insertpos_lck);

  startbytepos = Insert->CurrBytePos;
  endbytepos = startbytepos + size;
  prevbytepos = Insert->PrevBytePos;
  Insert->CurrBytePos = endbytepos;
  Insert->PrevBytePos = startbytepos;

  SpinLockRelease(&Insert->insertpos_lck);

  /* Translate the byte positions to physical LSNs outside the spinlock. */
  *StartPos = XLogBytePosToRecPtr(startbytepos);
  *EndPos = XLogBytePosToEndRecPtr(endbytepos);
  *PrevPtr = XLogBytePosToRecPtr(prevbytepos);

  /*
   * Check that the conversions between "usable byte positions" and
   * XLogRecPtrs work consistently in both directions.
   */
  Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
  Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
  Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
}
1286
1287
/*
 * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
 *
 * A log-switch record is handled slightly differently. The rest of the
 * segment will be reserved for this insertion, as indicated by the returned
 * *EndPos value. However, if we are already at the beginning of the current
 * segment, *StartPos and *EndPos are set to the current location without
 * reserving any space, and the function returns false.
 */
static bool
ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
{
  XLogCtlInsert *Insert = &XLogCtl->Insert;
  uint64    startbytepos;
  uint64    endbytepos;
  uint64    prevbytepos;
  uint32    size = MAXALIGN(SizeOfXLogRecord);
  XLogRecPtr  ptr;
  uint32    segleft;

  /*
   * These calculations are a bit heavy-weight to be done while holding a
   * spinlock, but since we're holding all the WAL insertion locks, there
   * are no other inserters competing for it. GetXLogInsertRecPtr() does
   * compete for it, but that's not called very frequently.
   */
  SpinLockAcquire(&Insert->insertpos_lck);

  startbytepos = Insert->CurrBytePos;

  /* If already at a segment boundary, no switch record is needed at all. */
  ptr = XLogBytePosToEndRecPtr(startbytepos);
  if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
  {
    SpinLockRelease(&Insert->insertpos_lck);
    *EndPos = *StartPos = ptr;
    return false;
  }

  endbytepos = startbytepos + size;
  prevbytepos = Insert->PrevBytePos;

  *StartPos = XLogBytePosToRecPtr(startbytepos);
  *EndPos = XLogBytePosToEndRecPtr(endbytepos);

  segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
  if (segleft != wal_segment_size)
  {
    /* consume the rest of the segment */
    *EndPos += segleft;
    endbytepos = XLogRecPtrToBytePos(*EndPos);
  }
  Insert->CurrBytePos = endbytepos;
  Insert->PrevBytePos = startbytepos;

  SpinLockRelease(&Insert->insertpos_lck);

  *PrevPtr = XLogBytePosToRecPtr(prevbytepos);

  Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
  Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
  Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
  Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);

  return true;
}
1352
1353
/*
 * Checks whether the current buffer page and backup page stored in the
 * WAL record are consistent or not. Before comparing the two pages, a
 * masking can be applied to the pages to ignore certain areas like hint bits,
 * unused space between pd_lower and pd_upper among other things. This
 * function should be called once WAL replay has been completed for a
 * given record.
 */
static void
checkXLogConsistency(XLogReaderState *record)
{
  RmgrId    rmid = XLogRecGetRmid(record);
  RelFileNode rnode;
  ForkNumber  forknum;
  BlockNumber blkno;
  int     block_id;

  /* Records with no backup blocks have no need for consistency checks. */
  if (!XLogRecHasAnyBlockRefs(record))
    return;

  Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);

  for (block_id = 0; block_id <= record->max_block_id; block_id++)
  {
    Buffer    buf;
    Page    page;

    if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
    {
      /*
       * WAL record doesn't contain a block reference with the given id.
       * Do nothing.
       */
      continue;
    }

    Assert(XLogRecHasBlockImage(record, block_id));

    if (XLogRecBlockImageApply(record, block_id))
    {
      /*
       * WAL record has already applied the page, so bypass the
       * consistency check as that would result in comparing the full
       * page stored in the record with itself.
       */
      continue;
    }

    /*
     * Read the contents from the current buffer and store it in a
     * temporary page.
     */
    buf = XLogReadBufferExtended(rnode, forknum, blkno,
                   RBM_NORMAL_NO_LOG);
    if (!BufferIsValid(buf))
      continue;

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    page = BufferGetPage(buf);

    /*
     * Take a copy of the local page where WAL has been applied to have a
     * comparison base before masking it...
     */
    memcpy(replay_image_masked, page, BLCKSZ);

    /* No need for this page anymore now that a copy is in. */
    UnlockReleaseBuffer(buf);

    /*
     * If the block LSN is already ahead of this WAL record, we can't
     * expect contents to match.  This can happen if recovery is
     * restarted.
     */
    if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
      continue;

    /*
     * Read the contents from the backup copy, stored in WAL record and
     * store it in a temporary page. There is no need to allocate a new
     * page here, a local buffer is fine to hold its contents and a mask
     * can be directly applied on it.
     */
    if (!RestoreBlockImage(record, block_id, master_image_masked))
      elog(ERROR, "failed to restore block image");

    /*
     * If masking function is defined, mask both the master and replay
     * images
     */
    if (RmgrTable[rmid].rm_mask != NULL)
    {
      RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
      RmgrTable[rmid].rm_mask(master_image_masked, blkno);
    }

    /* Time to compare the master and replay images. */
    if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
    {
      elog(FATAL,
         "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
         rnode.spcNode, rnode.dbNode, rnode.relNode,
         forknum, blkno);
    }
  }
}
1460
1461
/*
 * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
 * area in the WAL.
 */
static void
CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
          XLogRecPtr StartPos, XLogRecPtr EndPos)
{
  char     *currpos;
  int     freespace;
  int     written;      /* bytes of the record copied so far */
  XLogRecPtr  CurrPos;
  XLogPageHeader pagehdr;

  /*
   * Get a pointer to the right place in the right WAL buffer to start
   * inserting to.
   */
  CurrPos = StartPos;
  currpos = GetXLogBuffer(CurrPos);
  freespace = INSERT_FREESPACE(CurrPos);

  /*
   * there should be enough space for at least the first field (xl_tot_len)
   * on this page.
   */
  Assert(freespace >= sizeof(uint32));

  /* Copy record data */
  written = 0;
  while (rdata != NULL)
  {
    char     *rdata_data = rdata->data;
    int     rdata_len = rdata->len;

    while (rdata_len > freespace)
    {
      /*
       * Write what fits on this page, and continue on the next page.
       */
      Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
      memcpy(currpos, rdata_data, freespace);
      rdata_data += freespace;
      rdata_len -= freespace;
      written += freespace;
      CurrPos += freespace;

      /*
       * Get pointer to beginning of next page, and set the xlp_rem_len
       * in the page header. Set XLP_FIRST_IS_CONTRECORD.
       *
       * It's safe to set the contrecord flag and xlp_rem_len without a
       * lock on the page. All the other flags were already set when the
       * page was initialized, in AdvanceXLInsertBuffer, and we're the
       * only backend that needs to set the contrecord flag.
       */
      currpos = GetXLogBuffer(CurrPos);
      pagehdr = (XLogPageHeader) currpos;
      pagehdr->xlp_rem_len = write_len - written;
      pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;

      /* skip over the page header */
      if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
      {
        /* first page of a segment carries the long header */
        CurrPos += SizeOfXLogLongPHD;
        currpos += SizeOfXLogLongPHD;
      }
      else
      {
        CurrPos += SizeOfXLogShortPHD;
        currpos += SizeOfXLogShortPHD;
      }
      freespace = INSERT_FREESPACE(CurrPos);
    }

    Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
    memcpy(currpos, rdata_data, rdata_len);
    currpos += rdata_len;
    CurrPos += rdata_len;
    freespace -= rdata_len;
    written += rdata_len;

    rdata = rdata->next;
  }
  Assert(written == write_len);

  /*
   * If this was an xlog-switch, it's not enough to write the switch record,
   * we also have to consume all the remaining space in the WAL segment.  We
   * have already reserved that space, but we need to actually fill it.
   */
  if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
  {
    /* An xlog-switch record doesn't contain any data besides the header */
    Assert(write_len == SizeOfXLogRecord);

    /* Assert that we did reserve the right amount of space */
    Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);

    /* Use up all the remaining space on the current page */
    CurrPos += freespace;

    /*
     * Cause all remaining pages in the segment to be flushed, leaving the
     * XLog position where it should be, at the start of the next segment.
     * We do this one page at a time, to make sure we don't deadlock
     * against ourselves if wal_buffers < wal_segment_size.
     */
    while (CurrPos < EndPos)
    {
      /*
       * The minimal action to flush the page would be to call
       * WALInsertLockUpdateInsertingAt(CurrPos) followed by
       * AdvanceXLInsertBuffer(...).  The page would be left initialized
       * mostly to zeros, except for the page header (always the short
       * variant, as this is never a segment's first page).
       *
       * The large vistas of zeros are good for compressibility, but the
       * headers interrupting them every XLOG_BLCKSZ (with values that
       * differ from page to page) are not.  The effect varies with
       * compression tool, but bzip2 for instance compresses about an
       * order of magnitude worse if those headers are left in place.
       *
       * Rather than complicating AdvanceXLInsertBuffer itself (which is
       * called in heavily-loaded circumstances as well as this lightly-
       * loaded one) with variant behavior, we just use GetXLogBuffer
       * (which itself calls the two methods we need) to get the pointer
       * and zero most of the page.  Then we just zero the page header.
       */
      currpos = GetXLogBuffer(CurrPos);
      MemSet(currpos, 0, SizeOfXLogShortPHD);

      CurrPos += XLOG_BLCKSZ;
    }
  }
  else
  {
    /* Align the end position, so that the next record starts aligned */
    CurrPos = MAXALIGN64(CurrPos);
  }

  if (CurrPos != EndPos)
    elog(PANIC, "space reserved for WAL record does not match what was written");
}
1605
1606
/*
 * Acquire a WAL insertion lock, for inserting to WAL.
 */
static void
WALInsertLockAcquire(void)
{
  bool    immed;

  /*
   * It doesn't matter which of the WAL insertion locks we acquire, so try
   * the one we used last time.  If the system isn't particularly busy, it's
   * a good bet that it's still available, and it's good to have some
   * affinity to a particular lock so that you don't unnecessarily bounce
   * cache lines between processes when there's no contention.
   *
   * If this is the first time through in this backend, pick a lock
   * (semi-)randomly.  This allows the locks to be used evenly if you have a
   * lot of very short connections.
   */
  static int  lockToTry = -1;

  if (lockToTry == -1)
    lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
  MyLockNo = lockToTry;

  /*
   * The insertingAt value is initially set to 0, as we don't know our
   * insert location yet.
   */
  immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
  if (!immed)
  {
    /*
     * If we couldn't get the lock immediately, try another lock next
     * time.  On a system with more insertion locks than concurrent
     * inserters, this causes all the inserters to eventually migrate to a
     * lock that no-one else is using.  On a system with more inserters
     * than locks, it still helps to distribute the inserters evenly
     * across the locks.
     */
    lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
  }
}
1649
1650
/*
1651
 * Acquire all WAL insertion locks, to prevent other backends from inserting
1652
 * to WAL.
1653
 */
1654
static void
1655
WALInsertLockAcquireExclusive(void)
1656
2.57k
{
1657
2.57k
  int     i;
1658
1659
  /*
1660
   * When holding all the locks, all but the last lock's insertingAt
1661
   * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1662
   * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1663
   */
1664
20.5k
  for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; 
i++18.0k
)
1665
18.0k
  {
1666
18.0k
    LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1667
18.0k
    LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1668
18.0k
            &WALInsertLocks[i].l.insertingAt,
1669
18.0k
            PG_UINT64_MAX);
1670
18.0k
  }
1671
  /* Variable value reset to 0 at release */
1672
2.57k
  LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1673
1674
2.57k
  holdingAllLocks = true;
1675
2.57k
}
1676
1677
/*
1678
 * Release our insertion lock (or locks, if we're holding them all).
1679
 *
1680
 * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1681
 * next time the lock is acquired.
1682
 */
1683
static void
1684
WALInsertLockRelease(void)
1685
5.64k
{
1686
5.64k
  if (holdingAllLocks)
1687
2.57k
  {
1688
2.57k
    int     i;
1689
1690
23.1k
    for (i = 0; i < NUM_XLOGINSERT_LOCKS; 
i++20.5k
)
1691
20.5k
      LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1692
20.5k
                  &WALInsertLocks[i].l.insertingAt,
1693
20.5k
                  0);
1694
1695
2.57k
    holdingAllLocks = false;
1696
2.57k
  }
1697
3.06k
  else
1698
3.06k
  {
1699
3.06k
    LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1700
3.06k
                &WALInsertLocks[MyLockNo].l.insertingAt,
1701
3.06k
                0);
1702
3.06k
  }
1703
5.64k
}
1704
1705
/*
1706
 * Update our insertingAt value, to let others know that we've finished
1707
 * inserting up to that point.
1708
 */
1709
static void
1710
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1711
1
{
1712
1
  if (holdingAllLocks)
1713
0
  {
1714
    /*
1715
     * We use the last lock to mark our actual position, see comments in
1716
     * WALInsertLockAcquireExclusive.
1717
     */
1718
0
    LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1719
0
            &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1720
0
            insertingAt);
1721
0
  }
1722
1
  else
1723
1
    LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1724
1
            &WALInsertLocks[MyLockNo].l.insertingAt,
1725
1
            insertingAt);
1726
1
}
1727
1728
/*
1729
 * Wait for any WAL insertions < upto to finish.
1730
 *
1731
 * Returns the location of the oldest insertion that is still in-progress.
1732
 * Any WAL prior to that point has been fully copied into WAL buffers, and
1733
 * can be flushed out to disk. Because this waits for any insertions older
1734
 * than 'upto' to finish, the return value is always >= 'upto'.
1735
 *
1736
 * Note: When you are about to write out WAL, you must call this function
1737
 * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1738
 * need to wait for an insertion to finish (or at least advance to next
1739
 * uninitialized page), and the inserter might need to evict an old WAL buffer
1740
 * to make room for a new one, which in turn requires WALWriteLock.
1741
 */
1742
static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)
{
	uint64		bytepos;
	XLogRecPtr	reservedUpto;
	XLogRecPtr	finishedUpto;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			i;

	/* Waiting below requires our PGPROC; auxiliary contexts without one
	 * cannot call this. */
	if (MyProc == NULL)
		elog(PANIC, "cannot wait without a PGPROC structure");

	/* Read the current insert position */
	SpinLockAcquire(&Insert->insertpos_lck);
	bytepos = Insert->CurrBytePos;
	SpinLockRelease(&Insert->insertpos_lck);
	reservedUpto = XLogBytePosToEndRecPtr(bytepos);

	/*
	 * No-one should request to flush a piece of WAL that hasn't even been
	 * reserved yet. However, it can happen if there is a block with a bogus
	 * LSN on disk, for example. XLogFlush checks for that situation and
	 * complains, but only after the flush. Here we just assume that to mean
	 * that all WAL that has been reserved needs to be finished. In this
	 * corner-case, the return value can be smaller than 'upto' argument.
	 */
	if (upto > reservedUpto)
	{
		elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
			 (uint32) (upto >> 32), (uint32) upto,
			 (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
		upto = reservedUpto;
	}

	/*
	 * Loop through all the locks, sleeping on any in-progress insert older
	 * than 'upto'.
	 *
	 * finishedUpto is our return value, indicating the point upto which all
	 * the WAL insertions have been finished. Initialize it to the head of
	 * reserved WAL, and as we iterate through the insertion locks, back it
	 * out for any insertion that's still in progress.
	 */
	finishedUpto = reservedUpto;
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		XLogRecPtr	insertingat = InvalidXLogRecPtr;

		do
		{
			/*
			 * See if this insertion is in progress. LWLockWait will wait for
			 * the lock to be released, or for the 'value' to be set by a
			 * LWLockUpdateVar call.  When a lock is initially acquired, its
			 * value is 0 (InvalidXLogRecPtr), which means that we don't know
			 * where it's inserting yet.  We will have to wait for it.  If
			 * it's a small insertion, the record will most likely fit on the
			 * same page and the inserter will release the lock without ever
			 * calling LWLockUpdateVar.  But if it has to sleep, it will
			 * advertise the insertion point with LWLockUpdateVar before
			 * sleeping.
			 */
			if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
								 &WALInsertLocks[i].l.insertingAt,
								 insertingat, &insertingat))
			{
				/* the lock was free, so no insertion in progress */
				insertingat = InvalidXLogRecPtr;
				break;
			}

			/*
			 * This insertion is still in progress. Have to wait, unless the
			 * inserter has proceeded past 'upto'.
			 */
		} while (insertingat < upto);

		/* Still-busy inserter behind 'finishedUpto'? Back our result out. */
		if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
			finishedUpto = insertingat;
	}
	return finishedUpto;
}
1824
1825
/*
1826
 * Get a pointer to the right location in the WAL buffer containing the
1827
 * given XLogRecPtr.
1828
 *
1829
 * If the page is not initialized yet, it is initialized. That might require
1830
 * evicting an old dirty buffer from the buffer cache, which means I/O.
1831
 *
1832
 * The caller must ensure that the page containing the requested location
1833
 * isn't evicted yet, and won't be evicted. The way to ensure that is to
1834
 * hold onto a WAL insertion lock with the insertingAt position set to
1835
 * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1836
 * to evict an old page from the buffer. (This means that once you call
1837
 * GetXLogBuffer() with a given 'ptr', you must not access anything before
1838
 * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1839
 * later, because older buffers might be recycled already)
1840
 */
1841
static char *
GetXLogBuffer(XLogRecPtr ptr)
{
	int			idx;
	XLogRecPtr	endptr;
	/* One-page lookaside cache: (page number, mapped address) of the page we
	 * returned last time.  Static, so per-backend. */
	static uint64 cachedPage = 0;
	static char *cachedPos = NULL;
	XLogRecPtr	expectedEndPtr;

	/*
	 * Fast path for the common case that we need to access again the same
	 * page as last time.
	 */
	if (ptr / XLOG_BLCKSZ == cachedPage)
	{
		Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
		Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
		return cachedPos + ptr % XLOG_BLCKSZ;
	}

	/*
	 * The XLog buffer cache is organized so that a page is always loaded to a
	 * particular buffer.  That way we can easily calculate the buffer a given
	 * page must be loaded into, from the XLogRecPtr alone.
	 */
	idx = XLogRecPtrToBufIdx(ptr);

	/*
	 * See what page is loaded in the buffer at the moment. It could be the
	 * page we're looking for, or something older. It can't be anything newer
	 * - that would imply the page we're looking for has already been written
	 * out to disk and evicted, and the caller is responsible for making sure
	 * that doesn't happen.
	 *
	 * However, we don't hold a lock while we read the value. If someone has
	 * just initialized the page, it's possible that we get a "torn read" of
	 * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
	 * that case we will see a bogus value. That's ok, we'll grab the mapping
	 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
	 * the page we're looking for. But it means that when we do this unlocked
	 * read, we might see a value that appears to be ahead of the page we're
	 * looking for. Don't PANIC on that, until we've verified the value while
	 * holding the lock.
	 */
	expectedEndPtr = ptr;
	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;

	endptr = XLogCtl->xlblocks[idx];
	if (expectedEndPtr != endptr)
	{
		XLogRecPtr	initializedUpto;

		/*
		 * Before calling AdvanceXLInsertBuffer(), which can block, let others
		 * know how far we're finished with inserting the record.
		 *
		 * NB: If 'ptr' points to just after the page header, advertise a
		 * position at the beginning of the page rather than 'ptr' itself. If
		 * there are no other insertions running, someone might try to flush
		 * up to our advertised location. If we advertised a position after
		 * the page header, someone might try to flush the page header, even
		 * though page might actually not be initialized yet. As the first
		 * inserter on the page, we are effectively responsible for making
		 * sure that it's initialized, before we let insertingAt to move past
		 * the page header.
		 */
		if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
			XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogShortPHD;
		else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
				 XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogLongPHD;
		else
			initializedUpto = ptr;

		WALInsertLockUpdateInsertingAt(initializedUpto);

		AdvanceXLInsertBuffer(ptr, false);
		endptr = XLogCtl->xlblocks[idx];

		/* After initializing under the mapping lock, a mismatch is fatal. */
		if (expectedEndPtr != endptr)
			elog(PANIC, "could not find WAL buffer for %X/%X",
				 (uint32) (ptr >> 32), (uint32) ptr);
	}
	else
	{
		/*
		 * Make sure the initialization of the page is visible to us, and
		 * won't arrive later to overwrite the WAL data we write on the page.
		 */
		pg_memory_barrier();
	}

	/*
	 * Found the buffer holding this page. Return a pointer to the right
	 * offset within the page.
	 */
	cachedPage = ptr / XLOG_BLCKSZ;
	cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;

	Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
	Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));

	return cachedPos + ptr % XLOG_BLCKSZ;
}
1946
1947
/*
1948
 * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1949
 * is the position starting from the beginning of WAL, excluding all WAL
1950
 * page headers.
1951
 */
1952
static XLogRecPtr
1953
XLogBytePosToRecPtr(uint64 bytepos)
1954
8.71k
{
1955
8.71k
  uint64    fullsegs;
1956
8.71k
  uint64    fullpages;
1957
8.71k
  uint64    bytesleft;
1958
8.71k
  uint32    seg_offset;
1959
8.71k
  XLogRecPtr  result;
1960
1961
8.71k
  fullsegs = bytepos / UsableBytesInSegment;
1962
8.71k
  bytesleft = bytepos % UsableBytesInSegment;
1963
1964
8.71k
  if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1965
8.67k
  {
1966
    /* fits on first page of segment */
1967
8.67k
    seg_offset = bytesleft + SizeOfXLogLongPHD;
1968
8.67k
  }
1969
38
  else
1970
38
  {
1971
    /* account for the first page on segment with long header */
1972
38
    seg_offset = XLOG_BLCKSZ;
1973
38
    bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1974
1975
38
    fullpages = bytesleft / UsableBytesInPage;
1976
38
    bytesleft = bytesleft % UsableBytesInPage;
1977
1978
38
    seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1979
38
  }
1980
1981
8.71k
  XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1982
1983
8.71k
  return result;
1984
8.71k
}
1985
1986
/*
1987
 * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1988
 * returns a pointer to the beginning of the page (ie. before page header),
1989
 * not to where the first xlog record on that page would go to. This is used
1990
 * when converting a pointer to the end of a record.
1991
 */
1992
static XLogRecPtr
1993
XLogBytePosToEndRecPtr(uint64 bytepos)
1994
5.46k
{
1995
5.46k
  uint64    fullsegs;
1996
5.46k
  uint64    fullpages;
1997
5.46k
  uint64    bytesleft;
1998
5.46k
  uint32    seg_offset;
1999
5.46k
  XLogRecPtr  result;
2000
2001
5.46k
  fullsegs = bytepos / UsableBytesInSegment;
2002
5.46k
  bytesleft = bytepos % UsableBytesInSegment;
2003
2004
5.46k
  if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2005
5.44k
  {
2006
    /* fits on first page of segment */
2007
5.44k
    if (bytesleft == 0)
2008
0
      seg_offset = 0;
2009
5.44k
    else
2010
5.44k
      seg_offset = bytesleft + SizeOfXLogLongPHD;
2011
5.44k
  }
2012
25
  else
2013
25
  {
2014
    /* account for the first page on segment with long header */
2015
25
    seg_offset = XLOG_BLCKSZ;
2016
25
    bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2017
2018
25
    fullpages = bytesleft / UsableBytesInPage;
2019
25
    bytesleft = bytesleft % UsableBytesInPage;
2020
2021
25
    if (bytesleft == 0)
2022
0
      seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2023
25
    else
2024
25
      seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2025
25
  }
2026
2027
5.46k
  XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2028
2029
5.46k
  return result;
2030
5.46k
}
2031
2032
/*
2033
 * Convert an XLogRecPtr to a "usable byte position".
2034
 */
2035
static uint64
2036
XLogRecPtrToBytePos(XLogRecPtr ptr)
2037
17.2k
{
2038
17.2k
  uint64    fullsegs;
2039
17.2k
  uint32    fullpages;
2040
17.2k
  uint32    offset;
2041
17.2k
  uint64    result;
2042
2043
17.2k
  XLByteToSeg(ptr, fullsegs, wal_segment_size);
2044
2045
17.2k
  fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
2046
17.2k
  offset = ptr % XLOG_BLCKSZ;
2047
2048
17.2k
  if (fullpages == 0)
2049
17.1k
  {
2050
17.1k
    result = fullsegs * UsableBytesInSegment;
2051
17.1k
    if (offset > 0)
2052
17.1k
    {
2053
17.1k
      Assert(offset >= SizeOfXLogLongPHD);
2054
17.1k
      result += offset - SizeOfXLogLongPHD;
2055
17.1k
    }
2056
17.1k
  }
2057
48
  else
2058
48
  {
2059
48
    result = fullsegs * UsableBytesInSegment +
2060
48
      (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2061
48
      (fullpages - 1) * UsableBytesInPage; /* full pages */
2062
48
    if (offset > 0)
2063
48
    {
2064
48
      Assert(offset >= SizeOfXLogShortPHD);
2065
48
      result += offset - SizeOfXLogShortPHD;
2066
48
    }
2067
48
  }
2068
2069
17.2k
  return result;
2070
17.2k
}
2071
2072
/*
2073
 * Initialize XLOG buffers, writing out old buffers if they still contain
2074
 * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2075
 * true, initialize as many pages as we can without having to write out
2076
 * unwritten data. Any new pages are initialized to zeros, with pages headers
2077
 * initialized properly.
2078
 */
2079
static void
AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			nextidx;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
	XLogRecPtr	NewPageEndPtr = InvalidXLogRecPtr;
	XLogRecPtr	NewPageBeginPtr;
	XLogPageHeader NewPage;
	int			npages = 0;

	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);

	/*
	 * Now that we have the lock, check if someone initialized the page
	 * already.
	 */
	while (upto >= XLogCtl->InitializedUpTo || opportunistic)
	{
		nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);

		/*
		 * Get ending-offset of the buffer page we need to replace (this may
		 * be zero if the buffer hasn't been used yet).  Fall through if it's
		 * already written out.
		 */
		OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
		if (LogwrtResult.Write < OldPageRqstPtr)
		{
			/*
			 * Nope, got work to do. If we just want to pre-initialize as much
			 * as we can without flushing, give up now.
			 */
			if (opportunistic)
				break;

			/* Before waiting, get info_lck and update LogwrtResult */
			SpinLockAcquire(&XLogCtl->info_lck);
			if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
				XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
			LogwrtResult = XLogCtl->LogwrtResult;
			SpinLockRelease(&XLogCtl->info_lck);

			/*
			 * Now that we have an up-to-date LogwrtResult value, see if we
			 * still need to write it or if someone else already did.
			 */
			if (LogwrtResult.Write < OldPageRqstPtr)
			{
				/*
				 * Must acquire write lock. Release WALBufMappingLock first,
				 * to make sure that all insertions that we need to wait for
				 * can finish (up to this same position). Otherwise we risk
				 * deadlock.
				 */
				LWLockRelease(WALBufMappingLock);

				WaitXLogInsertionsToFinish(OldPageRqstPtr);

				LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

				LogwrtResult = XLogCtl->LogwrtResult;
				if (LogwrtResult.Write >= OldPageRqstPtr)
				{
					/* OK, someone wrote it already */
					LWLockRelease(WALWriteLock);
				}
				else
				{
					/* Have to write it ourselves */
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
					WriteRqst.Write = OldPageRqstPtr;
					WriteRqst.Flush = 0;
					XLogWrite(WriteRqst, false);
					LWLockRelease(WALWriteLock);
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
				}
				/* Re-acquire WALBufMappingLock and retry */
				LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
				continue;
			}
		}

		/*
		 * Now the next buffer slot is free and we can set it up to be the
		 * next output page.
		 */
		NewPageBeginPtr = XLogCtl->InitializedUpTo;
		NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;

		Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);

		NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);

		/*
		 * Be sure to re-zero the buffer so that bytes beyond what we've
		 * written will look like zeroes and not valid XLOG records...
		 */
		MemSet((char *) NewPage, 0, XLOG_BLCKSZ);

		/*
		 * Fill the new page's header
		 */
		NewPage->xlp_magic = XLOG_PAGE_MAGIC;

		/* NewPage->xlp_info = 0; */	/* done by memset */
		NewPage->xlp_tli = ThisTimeLineID;
		NewPage->xlp_pageaddr = NewPageBeginPtr;

		/* NewPage->xlp_rem_len = 0; */ /* done by memset */

		/*
		 * If online backup is not in progress, mark the header to indicate
		 * that WAL records beginning in this page have removable backup
		 * blocks.  This allows the WAL archiver to know whether it is safe to
		 * compress archived WAL data by transforming full-block records into
		 * the non-full-block format.  It is sufficient to record this at the
		 * page level because we force a page switch (in fact a segment
		 * switch) when starting a backup, so the flag will be off before any
		 * records can be written during the backup.  At the end of a backup,
		 * the last page will be marked as all unsafe when perhaps only part
		 * is unsafe, but at worst the archiver would miss the opportunity to
		 * compress a few records.
		 */
		if (!Insert->forcePageWrites)
			NewPage->xlp_info |= XLP_BKP_REMOVABLE;

		/*
		 * If first page of an XLOG segment file, make it a long header.
		 */
		if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
		{
			XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;

			NewLongPage->xlp_sysid = ControlFile->system_identifier;
			NewLongPage->xlp_seg_size = wal_segment_size;
			NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
			NewPage->xlp_info |= XLP_LONG_HEADER;
		}

		/*
		 * Make sure the initialization of the page becomes visible to others
		 * before the xlblocks update. GetXLogBuffer() reads xlblocks without
		 * holding a lock.
		 */
		pg_write_barrier();

		*((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;

		XLogCtl->InitializedUpTo = NewPageEndPtr;

		npages++;
	}
	LWLockRelease(WALBufMappingLock);

#ifdef WAL_DEBUG
	if (XLOG_DEBUG && npages > 0)
	{
		elog(DEBUG1, "initialized %d pages, up to %X/%X",
			 npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
	}
#endif
}
2243
2244
/*
2245
 * Calculate CheckPointSegments based on max_wal_size_mb and
2246
 * checkpoint_completion_target.
2247
 */
2248
static void
2249
CalculateCheckpointSegments(void)
2250
16.0k
{
2251
16.0k
  double    target;
2252
2253
  /*-------
2254
   * Calculate the distance at which to trigger a checkpoint, to avoid
2255
   * exceeding max_wal_size_mb. This is based on two assumptions:
2256
   *
2257
   * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2258
   *    WAL for two checkpoint cycles to allow us to recover from the
2259
   *    secondary checkpoint if the first checkpoint failed, though we
2260
   *    only did this on the master anyway, not on standby. Keeping just
2261
   *    one checkpoint simplifies processing and reduces disk space in
2262
   *    many smaller databases.)
2263
   * b) during checkpoint, we consume checkpoint_completion_target *
2264
   *    number of segments consumed between checkpoints.
2265
   *-------
2266
   */
2267
16.0k
  target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2268
16.0k
    (1.0 + CheckPointCompletionTarget);
2269
2270
  /* round down */
2271
16.0k
  CheckPointSegments = (int) target;
2272
2273
16.0k
  if (CheckPointSegments < 1)
2274
0
    CheckPointSegments = 1;
2275
16.0k
}
2276
2277
/*
 * GUC assign hook for max_wal_size.  Stores the new value and recomputes
 * CheckPointSegments, which depends on max_wal_size_mb.
 */
void
assign_max_wal_size(int newval, void *extra)
{
	max_wal_size_mb = newval;
	CalculateCheckpointSegments();
}
2283
2284
/*
 * GUC assign hook for checkpoint_completion_target.  Stores the new value
 * and recomputes CheckPointSegments, which depends on it.
 */
void
assign_checkpoint_completion_target(double newval, void *extra)
{
	CheckPointCompletionTarget = newval;
	CalculateCheckpointSegments();
}
2290
2291
/*
2292
 * At a checkpoint, how many WAL segments to recycle as preallocated future
2293
 * XLOG segments? Returns the highest segment that should be preallocated.
2294
 */
2295
static XLogSegNo
2296
XLOGfileslop(XLogRecPtr RedoRecPtr)
2297
0
{
2298
0
  XLogSegNo minSegNo;
2299
0
  XLogSegNo maxSegNo;
2300
0
  double    distance;
2301
0
  XLogSegNo recycleSegNo;
2302
2303
  /*
2304
   * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2305
   * correspond to. Always recycle enough segments to meet the minimum, and
2306
   * remove enough segments to stay below the maximum.
2307
   */
2308
0
  minSegNo = RedoRecPtr / wal_segment_size +
2309
0
    ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2310
0
  maxSegNo = RedoRecPtr / wal_segment_size +
2311
0
    ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2312
2313
  /*
2314
   * Between those limits, recycle enough segments to get us through to the
2315
   * estimated end of next checkpoint.
2316
   *
2317
   * To estimate where the next checkpoint will finish, assume that the
2318
   * system runs steadily consuming CheckPointDistanceEstimate bytes between
2319
   * every checkpoint.
2320
   */
2321
0
  distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2322
  /* add 10% for good measure. */
2323
0
  distance *= 1.10;
2324
2325
0
  recycleSegNo = (XLogSegNo) ceil(((double) RedoRecPtr + distance) /
2326
0
                  wal_segment_size);
2327
2328
0
  if (recycleSegNo < minSegNo)
2329
0
    recycleSegNo = minSegNo;
2330
0
  if (recycleSegNo > maxSegNo)
2331
0
    recycleSegNo = maxSegNo;
2332
2333
0
  return recycleSegNo;
2334
0
}
2335
2336
/*
2337
 * Check whether we've consumed enough xlog space that a checkpoint is needed.
2338
 *
2339
 * new_segno indicates a log file that has just been filled up (or read
2340
 * during recovery). We measure the distance from RedoRecPtr to new_segno
2341
 * and see if that exceeds CheckPointSegments.
2342
 *
2343
 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2344
 */
2345
static bool
2346
XLogCheckpointNeeded(XLogSegNo new_segno)
2347
0
{
2348
0
  XLogSegNo old_segno;
2349
2350
0
  XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2351
2352
0
  if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2353
0
    return true;
2354
0
  return false;
2355
0
}
2356
2357
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
 * If flexible == true, we don't have to write as far as WriteRqst, but
 * may stop at any convenient boundary (such as a cache or logfile boundary).
 * This option allows us to avoid uselessly issuing multiple writes when a
 * single one would do.
 *
 * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
 * must be called before grabbing the lock, to make sure the data is ready to
 * write.
 */
static void
XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
	bool		ispartialpage;
	bool		last_iteration;
	bool		finishing_seg;
	bool		use_existent;
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;

	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

	/*
	 * Update local LogwrtResult (caller probably did this already, but...)
	 */
	LogwrtResult = XLogCtl->LogwrtResult;

	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  Begin at the buffer containing the next unwritten
	 * page, or last partially written page.
	 */
	curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);

	while (LogwrtResult.Write < WriteRqst.Write)
	{
		/*
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
		 */
		XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];

		if (LogwrtResult.Write >= EndPtr)
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
				 (uint32) (LogwrtResult.Write >> 32),
				 (uint32) LogwrtResult.Write,
				 (uint32) (EndPtr >> 32), (uint32) EndPtr);

		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = EndPtr;
		/* true if the request ends before this page's end, i.e. mid-page */
		ispartialpage = WriteRqst.Write < LogwrtResult.Write;

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
							 wal_segment_size))
		{
			/*
			 * Switch to new logfile segment.  We cannot have any pending
			 * pages here (since we dump what we have at segment end).
			 */
			Assert(npages == 0);
			if (openLogFile >= 0)
				XLogFileClose();
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
							wal_segment_size);

			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
			openLogOff = 0;
		}

		/* Make sure we have the current logfile open */
		if (openLogFile < 0)
		{
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
							wal_segment_size);
			openLogFile = XLogFileOpen(openLogSegNo);
			openLogOff = 0;
		}

		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
			startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
											wal_segment_size);
		}
		npages++;

		/*
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
		 */
		last_iteration = WriteRqst.Write <= LogwrtResult.Write;

		finishing_seg = !ispartialpage &&
			(startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;

		if (last_iteration ||
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
		{
			char	   *from;
			Size		nbytes;
			Size		nleft;
			int			written;

			/* Need to seek in the file? */
			if (openLogOff != startoffset)
			{
				if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not seek in log file %s to offset %u: %m",
									XLogFileNameP(ThisTimeLineID, openLogSegNo),
									startoffset)));
				openLogOff = startoffset;
			}

			/* OK to write the page(s) */
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;
			nleft = nbytes;
			do
			{
				/* reset errno so %m doesn't report a stale error on a
				 * zero-byte write that sets no errno */
				errno = 0;
				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
				written = write(openLogFile, from, nleft);
				pgstat_report_wait_end();
				if (written <= 0)
				{
					/* retry the write if it was merely interrupted */
					if (errno == EINTR)
						continue;
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not write to log file %s "
									"at offset %u, length %zu: %m",
									XLogFileNameP(ThisTimeLineID, openLogSegNo),
									openLogOff, nbytes)));
				}
				/* partial writes are legal; loop until all bytes are out */
				nleft -= written;
				from += written;
			} while (nleft > 0);

			/* Update state for write */
			openLogOff += nbytes;
			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately.  This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
			 * This is also the right place to notify the Archiver that the
			 * segment is ready to copy to archival storage, and to update the
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
			 */
			if (finishing_seg)
			{
				issue_xlog_fsync(openLogFile, openLogSegNo);

				/* signal that we need to wakeup walsenders later */
				WalSndWakeupRequest();

				LogwrtResult.Flush = LogwrtResult.Write;	/* end of page */

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogSegNo);

				XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
				XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;

				/*
				 * Request a checkpoint if we've consumed too much xlog since
				 * the last one.  For speed, we first check using the local
				 * copy of RedoRecPtr, which might be out of date; if it looks
				 * like a checkpoint is needed, forcibly update RedoRecPtr and
				 * recheck.
				 */
				if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
				{
					(void) GetRedoRecPtr();
					if (XLogCheckpointNeeded(openLogSegNo))
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
				}
			}
		}

		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
	}

	/* every queued page must have been dumped by now */
	Assert(npages == 0);

	/*
	 * If asked to flush, do so
	 */
	if (LogwrtResult.Flush < WriteRqst.Flush &&
		LogwrtResult.Flush < LogwrtResult.Write)

	{
		/*
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.  However, we do not need to
		 * fsync more than one file.
		 */
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
		{
			if (openLogFile >= 0 &&
				!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
								 wal_segment_size))
				XLogFileClose();
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
								wal_segment_size);
				openLogFile = XLogFileOpen(openLogSegNo);
				openLogOff = 0;
			}

			issue_xlog_fsync(openLogFile, openLogSegNo);
		}

		/* signal that we need to wakeup walsenders later */
		WalSndWakeupRequest();

		LogwrtResult.Flush = LogwrtResult.Write;
	}

	/*
	 * Update shared-memory status
	 *
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves some
	 * code in a couple of places.
	 */
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->LogwrtResult = LogwrtResult;
		if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
			XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
			XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
		SpinLockRelease(&XLogCtl->info_lck);
	}
}
2639
2640
/*
 * Record the LSN for an asynchronous transaction commit/abort
 * and nudge the WALWriter if there is work for it to do.
 * (This should not be called for synchronous commits.)
 */
void
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
{
	XLogRecPtr	WriteRqstPtr = asyncXactLSN;
	bool		sleeping;

	/*
	 * Under the info spinlock: refresh our local LogwrtResult, sample the
	 * walwriter's sleep state, and advance the shared asyncXactLSN high-water
	 * mark (never move it backwards).
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	sleeping = XLogCtl->WalWriterSleeping;
	if (XLogCtl->asyncXactLSN < asyncXactLSN)
		XLogCtl->asyncXactLSN = asyncXactLSN;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * If the WALWriter is sleeping, we should kick it to make it come out of
	 * low-power mode.  Otherwise, determine whether there's a full page of
	 * WAL available to write.
	 */
	if (!sleeping)
	{
		/* back off to last completed page boundary */
		WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;

		/* if we have already flushed that far, we're done */
		if (WriteRqstPtr <= LogwrtResult.Flush)
			return;
	}

	/*
	 * Nudge the WALWriter: it has a full page of WAL to write, or we want it
	 * to come out of low-power mode so that this async commit will reach disk
	 * within the expected amount of time.
	 */
	/* latch pointer may be NULL if no walwriter is running */
	if (ProcGlobal->walwriterLatch)
		SetLatch(ProcGlobal->walwriterLatch);
}
2681
2682
/*
 * Record the LSN up to which we can remove WAL because it's not required by
 * any replication slot.
 */
void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
	/* publish under info_lck, like the other XLogCtl shared fields */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->replicationSlotMinLSN = lsn;
	SpinLockRelease(&XLogCtl->info_lck);
}
2693
2694
2695
/*
2696
 * Return the oldest LSN we must retain to satisfy the needs of some
2697
 * replication slot.
2698
 */
2699
static XLogRecPtr
2700
XLogGetReplicationSlotMinimumLSN(void)
2701
2.35k
{
2702
2.35k
  XLogRecPtr  retval;
2703
2704
2.35k
  SpinLockAcquire(&XLogCtl->info_lck);
2705
2.35k
  retval = XLogCtl->replicationSlotMinLSN;
2706
2.35k
  SpinLockRelease(&XLogCtl->info_lck);
2707
2708
2.35k
  return retval;
2709
2.35k
}
2710
2711
/*
 * Advance minRecoveryPoint in control file.
 *
 * If we crash during recovery, we must reach this point again before the
 * database is consistent.
 *
 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
 * is only updated if it's not already greater than or equal to 'lsn'.
 */
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
	/* Quick check using our local copy of the variable */
	if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
		return;

	/*
	 * An invalid minRecoveryPoint means that we need to recover all the WAL,
	 * i.e., we're doing crash recovery.  We never modify the control file's
	 * value in that case, so we can short-circuit future checks here too. The
	 * local values of minRecoveryPoint and minRecoveryPointTLI should not be
	 * updated until crash recovery finishes.  We only do this for the startup
	 * process as it should not update its own reference of minRecoveryPoint
	 * until it has finished crash recovery to make sure that all WAL
	 * available is replayed in this case.  This also saves from extra locks
	 * taken on the control file from the startup process.
	 */
	if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
	{
		updateMinRecoveryPoint = false;
		return;
	}

	/* serialize against other control-file readers/writers */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/* update local copy */
	minRecoveryPoint = ControlFile->minRecoveryPoint;
	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

	if (XLogRecPtrIsInvalid(minRecoveryPoint))
		updateMinRecoveryPoint = false;
	else if (force || minRecoveryPoint < lsn)
	{
		XLogRecPtr	newMinRecoveryPoint;
		TimeLineID	newMinRecoveryPointTLI;

		/*
		 * To avoid having to update the control file too often, we update it
		 * all the way to the last record being replayed, even though 'lsn'
		 * would suffice for correctness.  This also allows the 'force' case
		 * to not need a valid 'lsn' value.
		 *
		 * Another important reason for doing it this way is that the passed
		 * 'lsn' value could be bogus, i.e., past the end of available WAL, if
		 * the caller got it from a corrupted heap page.  Accepting such a
		 * value as the min recovery point would prevent us from coming up at
		 * all.  Instead, we just log a warning and continue with recovery.
		 * (See also the comments about corrupt LSNs in XLogFlush.)
		 */
		SpinLockAcquire(&XLogCtl->info_lck);
		newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
		newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
		SpinLockRelease(&XLogCtl->info_lck);

		if (!force && newMinRecoveryPoint < lsn)
			elog(WARNING,
				 "xlog min recovery request %X/%X is past current point %X/%X",
				 (uint32) (lsn >> 32), (uint32) lsn,
				 (uint32) (newMinRecoveryPoint >> 32),
				 (uint32) newMinRecoveryPoint);

		/* update control file */
		if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
		{
			ControlFile->minRecoveryPoint = newMinRecoveryPoint;
			ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
			UpdateControlFile();
			/* keep local copies in sync with what we just persisted */
			minRecoveryPoint = newMinRecoveryPoint;
			minRecoveryPointTLI = newMinRecoveryPointTLI;

			ereport(DEBUG2,
					(errmsg("updated min recovery point to %X/%X on timeline %u",
							(uint32) (minRecoveryPoint >> 32),
							(uint32) minRecoveryPoint,
							newMinRecoveryPointTLI)));
		}
	}
	LWLockRelease(ControlFileLock);
}
2800
2801
/*
 * Ensure that all XLOG data through the given position is flushed to disk.
 *
 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
 * already held, and we try to avoid acquiring it if possible.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqstPtr;
	XLogwrtRqst WriteRqst;

	/*
	 * During REDO, we are reading not writing WAL.  Therefore, instead of
	 * trying to flush the WAL, we should update minRecoveryPoint instead. We
	 * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
	 * to act this way too, and because when it tries to write the
	 * end-of-recovery checkpoint, it should indeed flush.
	 */
	if (!XLogInsertAllowed())
	{
		UpdateMinRecoveryPoint(record, false);
		return;
	}

	/* Quick exit if already known flushed */
	if (record <= LogwrtResult.Flush)
		return;

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
			 (uint32) (record >> 32), (uint32) record,
			 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
			 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

	/* inside the critical section, any write failure escalates to PANIC */
	START_CRIT_SECTION();

	/*
	 * Since fsync is usually a horribly expensive operation, we try to
	 * piggyback as much data as we can on each fsync: if we see any more data
	 * entered into the xlog buffer, we'll write and fsync that too, so that
	 * the final value of LogwrtResult.Flush is as large as possible. This
	 * gives us some chance of avoiding another fsync immediately after.
	 */

	/* initialize to given target; may increase below */
	WriteRqstPtr = record;

	/*
	 * Now wait until we get the write lock, or someone else does the flush
	 * for us.
	 */
	for (;;)
	{
		XLogRecPtr	insertpos;

		/* read LogwrtResult and update local state */
		SpinLockAcquire(&XLogCtl->info_lck);
		if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
			WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);

		/* done already? */
		if (record <= LogwrtResult.Flush)
			break;

		/*
		 * Before actually performing the write, wait for all in-flight
		 * insertions to the pages we're about to write to finish.
		 */
		insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);

		/*
		 * Try to get the write lock. If we can't get it immediately, wait
		 * until it's released, and recheck if we still need to do the flush
		 * or if the backend that held the lock did it for us already. This
		 * helps to maintain a good rate of group committing when the system
		 * is bottlenecked by the speed of fsyncing.
		 */
		if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
		{
			/*
			 * The lock is now free, but we didn't acquire it yet. Before we
			 * do, loop back to check if someone else flushed the record for
			 * us already.
			 */
			continue;
		}

		/* Got the lock; recheck whether request is satisfied */
		LogwrtResult = XLogCtl->LogwrtResult;
		if (record <= LogwrtResult.Flush)
		{
			LWLockRelease(WALWriteLock);
			break;
		}

		/*
		 * Sleep before flush! By adding a delay here, we may give further
		 * backends the opportunity to join the backlog of group commit
		 * followers; this can significantly improve transaction throughput,
		 * at the risk of increasing transaction latency.
		 *
		 * We do not sleep if enableFsync is not turned on, nor if there are
		 * fewer than CommitSiblings other backends with active transactions.
		 */
		if (CommitDelay > 0 && enableFsync &&
			MinimumActiveBackends(CommitSiblings))
		{
			pg_usleep(CommitDelay);

			/*
			 * Re-check how far we can now flush the WAL. It's generally not
			 * safe to call WaitXLogInsertionsToFinish while holding
			 * WALWriteLock, because an in-progress insertion might need to
			 * also grab WALWriteLock to make progress. But we know that all
			 * the insertions up to insertpos have already finished, because
			 * that's what the earlier WaitXLogInsertionsToFinish() returned.
			 * We're only calling it again to allow insertpos to be moved
			 * further forward, not to actually wait for anyone.
			 */
			insertpos = WaitXLogInsertionsToFinish(insertpos);
		}

		/* try to write/flush later additions to XLOG as well */
		WriteRqst.Write = insertpos;
		WriteRqst.Flush = insertpos;

		XLogWrite(WriteRqst, false);

		LWLockRelease(WALWriteLock);
		/* done */
		break;
	}

	END_CRIT_SECTION();

	/* wake up walsenders now that we've released heavily contended locks */
	WalSndWakeupProcessRequests();

	/*
	 * If we still haven't flushed to the request point then we have a
	 * problem; most likely, the requested flush point is past end of XLOG.
	 * This has been seen to occur when a disk page has a corrupted LSN.
	 *
	 * Formerly we treated this as a PANIC condition, but that hurts the
	 * system's robustness rather than helping it: we do not want to take down
	 * the whole system due to corruption on one data page.  In particular, if
	 * the bad page is encountered again during recovery then we would be
	 * unable to restart the database at all!  (This scenario actually
	 * happened in the field several times with 7.1 releases.)  As of 8.4, bad
	 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
	 * the only time we can reach here during recovery is while flushing the
	 * end-of-recovery checkpoint record, and we don't expect that to have a
	 * bad LSN.
	 *
	 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
	 * since xact.c calls this routine inside a critical section.  However,
	 * calls from bufmgr.c are not within critical sections and so we will not
	 * force a restart for a bad LSN on a data page.
	 */
	if (LogwrtResult.Flush < record)
		elog(ERROR,
			 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
			 (uint32) (record >> 32), (uint32) record,
			 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
}
2971
2972
/*
 * Write & flush xlog, but without specifying exactly where to.
 *
 * We normally write only completed blocks; but if there is nothing to do on
 * that basis, we check for unwritten async commits in the current incomplete
 * block, and write through the latest one of those.  Thus, if async commits
 * are not being used, we will write complete blocks only.
 *
 * If, based on the above, there's anything to write we do so immediately. But
 * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
 * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
 * more than wal_writer_flush_after unflushed blocks.
 *
 * We can guarantee that async commits reach disk after at most three
 * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
 * to write "flexibly", meaning it can stop at the end of the buffer ring;
 * this makes a difference only with very high load or long wal_writer_delay,
 * but imposes one extra cycle for the worst case for async commits.)
 *
 * This routine is invoked periodically by the background walwriter process.
 *
 * Returns true if there was any work to do, even if we skipped flushing due
 * to wal_writer_delay/wal_writer_flush_after.
 */
bool
XLogBackgroundFlush(void)
{
	XLogwrtRqst WriteRqst;
	bool		flexible = true;
	/* static: remembers the time of the previous flush across calls */
	static TimestampTz lastflush;
	TimestampTz now;
	int			flushbytes;

	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return false;

	/* read LogwrtResult and update local state */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	WriteRqst = XLogCtl->LogwrtRqst;
	SpinLockRelease(&XLogCtl->info_lck);

	/* back off to last completed page boundary */
	WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;

	/* if we have already flushed that far, consider async commit records */
	if (WriteRqst.Write <= LogwrtResult.Flush)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		WriteRqst.Write = XLogCtl->asyncXactLSN;
		SpinLockRelease(&XLogCtl->info_lck);
		flexible = false;		/* ensure it all gets written */
	}

	/*
	 * If already known flushed, we're done. Just need to check if we are
	 * holding an open file handle to a logfile that's no longer in use,
	 * preventing the file from being deleted.
	 */
	if (WriteRqst.Write <= LogwrtResult.Flush)
	{
		if (openLogFile >= 0)
		{
			if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
								 wal_segment_size))
			{
				XLogFileClose();
			}
		}
		return false;
	}

	/*
	 * Determine how far to flush WAL, based on the wal_writer_delay and
	 * wal_writer_flush_after GUCs.
	 */
	now = GetCurrentTimestamp();
	/* number of completed-but-unflushed blocks */
	flushbytes =
		WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;

	if (WalWriterFlushAfter == 0 || lastflush == 0)
	{
		/* first call, or block based limits disabled */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
	{
		/*
		 * Flush the writes at least every WalWriteDelay ms. This is important
		 * to bound the amount of time it takes for an asynchronous commit to
		 * hit disk.
		 */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else if (flushbytes >= WalWriterFlushAfter)
	{
		/* exceeded wal_writer_flush_after blocks, flush */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else
	{
		/* no flushing, this time round */
		WriteRqst.Flush = 0;
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
			 (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
			 (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
			 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
			 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

	START_CRIT_SECTION();

	/* now wait for any in-progress insertions to finish and get write lock */
	WaitXLogInsertionsToFinish(WriteRqst.Write);
	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
	LogwrtResult = XLogCtl->LogwrtResult;
	if (WriteRqst.Write > LogwrtResult.Write ||
		WriteRqst.Flush > LogwrtResult.Flush)
	{
		XLogWrite(WriteRqst, flexible);
	}
	LWLockRelease(WALWriteLock);

	END_CRIT_SECTION();

	/* wake up walsenders now that we've released heavily contended locks */
	WalSndWakeupProcessRequests();

	/*
	 * Great, done. To take some work off the critical path, try to initialize
	 * as many of the no-longer-needed WAL buffers for future use as we can.
	 */
	AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);

	/*
	 * If we determined that we need to write data, but somebody else
	 * wrote/flushed already, it should be considered as being active, to
	 * avoid hibernating too early.
	 */
	return true;
}
3121
3122
/*
 * Test whether XLOG data has been flushed up to (at least) the given position.
 *
 * Returns true if a flush is still needed.  (It may be that someone else
 * is already in process of flushing that far, however.)
 */
bool
XLogNeedsFlush(XLogRecPtr record)
{
	/*
	 * During recovery, we don't flush WAL but update minRecoveryPoint
	 * instead. So "needs flush" is taken to mean whether minRecoveryPoint
	 * would need to be updated.
	 */
	if (RecoveryInProgress())
	{
		/*
		 * An invalid minRecoveryPoint means that we need to recover all the
		 * WAL, i.e., we're doing crash recovery.  We never modify the control
		 * file's value in that case, so we can short-circuit future checks
		 * here too.  This triggers a quick exit path for the startup process,
		 * which cannot update its local copy of minRecoveryPoint as long as
		 * it has not replayed all WAL available when doing crash recovery.
		 */
		if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
			updateMinRecoveryPoint = false;

		/* Quick exit if already known to be updated or cannot be updated */
		if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
			return false;

		/*
		 * Update local copy of minRecoveryPoint. But if the lock is busy,
		 * just return a conservative guess: "true" is always safe here
		 * because it only means the caller may do some redundant work.
		 */
		if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
			return true;
		minRecoveryPoint = ControlFile->minRecoveryPoint;
		minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
		LWLockRelease(ControlFileLock);

		/*
		 * Check minRecoveryPoint for any other process than the startup
		 * process doing crash recovery, which should not update the control
		 * file value if crash recovery is still running.
		 */
		if (XLogRecPtrIsInvalid(minRecoveryPoint))
			updateMinRecoveryPoint = false;

		/* check again, now that the local copy is refreshed */
		if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
			return false;
		else
			return true;
	}

	/* Quick exit if already known flushed (per our stale local copy) */
	if (record <= LogwrtResult.Flush)
		return false;

	/* read shared LogwrtResult under the info spinlock and update local state */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	/* check again with the freshly read flush pointer */
	if (record <= LogwrtResult.Flush)
		return false;

	return true;
}
3193
3194
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
 * logsegno: identify segment to be created/opened.
 *
 * *use_existent: if true, OK to use a pre-existing file (else, any
 * pre-existing file will be deleted).  On return, true if a pre-existing
 * file was used.
 *
 * use_lock: if true, acquire ControlFileLock while moving file into
 * place.  This should be true except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * Returns FD of opened file.
 *
 * Note: errors here are ERROR not PANIC because we might or might not be
 * inside a critical section (eg, during checkpoint there is no reason to
 * take down the system on failure).  They will promote to PANIC if we are
 * in a critical section.
 */
int
XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	PGAlignedXLogBlock zbuffer;
	XLogSegNo	installed_segno;
	XLogSegNo	max_segno;
	int			fd;
	int			nbytes;

	XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);

	/*
	 * Try to use existent file (checkpoint maker may have created it already)
	 */
	if (*use_existent)
	{
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
		if (fd < 0)
		{
			/* ENOENT just means we must create the file; fall through */
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\": %m", path)));
		}
		else
			return fd;
	}

	/*
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
	 * another process is doing the same thing.  If so, we will end up
	 * pre-creating an extra log segment.  That seems OK, and better than
	 * holding the lock throughout this lengthy process.
	 */
	elog(DEBUG2, "creating and filling new WAL file");

	/* Temp name is per-PID, so concurrent creators don't collide */
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

	unlink(tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	/*
	 * Zero-fill the file.  We have to do this the hard way to ensure that all
	 * the file space has really been allocated --- on platforms that allow
	 * "holes" in files, just seeking to the end doesn't allocate intermediate
	 * space.  This way, we know that we have all the space and (after the
	 * fsync below) that all the indirect blocks are down on disk.  Therefore,
	 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
	 * log file.
	 */
	memset(zbuffer.data, 0, XLOG_BLCKSZ);
	for (nbytes = 0; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
	{
		/* reset errno each iteration so a short write is diagnosed correctly */
		errno = 0;
		pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
		if ((int) write(fd, zbuffer.data, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
		{
			int			save_errno = errno;

			/*
			 * If we fail to make the file, delete it to release disk space
			 */
			unlink(tmppath);

			close(fd);

			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
		}
		pgstat_report_wait_end();
	}

	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
	if (pg_fsync(fd) != 0)
	{
		int			save_errno = errno;

		close(fd);
		errno = save_errno;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));
	}
	pgstat_report_wait_end();

	if (close(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));

	/*
	 * Now move the segment into place with its final name.
	 *
	 * If caller didn't want to use a pre-existing file, get rid of any
	 * pre-existing file.  Otherwise, cope with possibility that someone else
	 * has created the file while we were filling ours: if so, use ours to
	 * pre-create a future log segment.
	 */
	installed_segno = logsegno;

	/*
	 * XXX: What should we use as max_segno? We used to use XLOGfileslop when
	 * that was a constant, but that was always a bit dubious: normally, at a
	 * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
	 * here, it was the offset from the insert location. We can't do the
	 * normal XLOGfileslop calculation here because we don't have access to
	 * the prior checkpoint's redo location. So somewhat arbitrarily, just use
	 * CheckPointSegments.
	 */
	max_segno = logsegno + CheckPointSegments;
	if (!InstallXLogFileSegment(&installed_segno, tmppath,
								*use_existent, max_segno,
								use_lock))
	{
		/*
		 * No need for any more future segments, or InstallXLogFileSegment()
		 * failed to rename the file into place. If the rename failed, opening
		 * the file below will fail.
		 */
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));

	elog(DEBUG2, "done creating and filling new WAL file");

	return fd;
}
3362
3363
/*
 * Create a new XLOG file segment by copying a pre-existing one.
 *
 * destsegno: identify segment to be created.
 *
 * srcTLI, srcsegno: identify segment to be copied (could be from
 *		a different timeline)
 *
 * upto: how much of the source file to copy (the rest is filled with
 *		zeros)
 *
 * Currently this is only used during recovery, and so there are no locking
 * considerations.  But we should be just as tense as XLogFileInit to avoid
 * emplacing a bogus file.
 */
static void
XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
			 int upto)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	PGAlignedXLogBlock buffer;
	int			srcfd;
	int			fd;
	int			nbytes;

	/*
	 * Open the source file
	 */
	XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
	srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
	if (srcfd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));

	/*
	 * Copy into a temp file name.
	 */
	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

	unlink(tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	/*
	 * Do the data copying.  We always write full buffer-sized chunks, so the
	 * destination ends up exactly wal_segment_size bytes long.
	 */
	for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
	{
		int			nread;

		/* nread may go negative once we're past 'upto'; then we write zeros */
		nread = upto - nbytes;

		/*
		 * The part that is not read from the source file is filled with
		 * zeros.
		 */
		if (nread < sizeof(buffer))
			memset(buffer.data, 0, sizeof(buffer));

		if (nread > 0)
		{
			if (nread > sizeof(buffer))
				nread = sizeof(buffer);
			errno = 0;
			pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
			if (read(srcfd, buffer.data, nread) != nread)
			{
				if (errno != 0)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not read file \"%s\": %m",
									path)));
				else
					ereport(ERROR,
							(errmsg("not enough data in file \"%s\"",
									path)));
			}
			pgstat_report_wait_end();
		}
		errno = 0;
		pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
		if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
		{
			int			save_errno = errno;

			/*
			 * If we fail to make the file, delete it to release disk space
			 */
			unlink(tmppath);
			/* if write didn't set errno, assume problem is no disk space */
			errno = save_errno ? save_errno : ENOSPC;

			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write to file \"%s\": %m", tmppath)));
		}
		pgstat_report_wait_end();
	}

	pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
	if (pg_fsync(fd) != 0)
		ereport(data_sync_elevel(ERROR),
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));
	pgstat_report_wait_end();

	if (CloseTransientFile(fd))
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));

	CloseTransientFile(srcfd);

	/*
	 * Now move the segment into place with its final name.  find_free=false:
	 * install exactly at destsegno, clobbering any file already there.
	 */
	if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
		elog(ERROR, "InstallXLogFileSegment should not have failed");
}
3489
3490
/*
 * Install a new XLOG segment file as a current or future log segment.
 *
 * This is used both to install a newly-created segment (which has a temp
 * filename while it's being created) and to recycle an old segment.
 *
 * *segno: identify segment to install as (or first possible target).
 * When find_free is true, this is modified on return to indicate the
 * actual installation location or last segment searched.
 *
 * tmppath: initial name of file to install.  It will be renamed into place.
 *
 * find_free: if true, install the new segment at the first empty segno
 * number at or after the passed numbers.  If false, install the new segment
 * exactly where specified, deleting any existing segment file there.
 *
 * max_segno: maximum segment number to install the new file as.  Fail if no
 * free slot is found between *segno and max_segno. (Ignored when find_free
 * is false.)
 *
 * use_lock: if true, acquire ControlFileLock while moving file into
 * place.  This should be true except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * Returns true if the file was installed successfully.  false indicates that
 * max_segno limit was exceeded, or an error occurred while renaming the
 * file into place.
 */
static bool
InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
					   bool find_free, XLogSegNo max_segno,
					   bool use_lock)
{
	char		path[MAXPGPATH];
	struct stat stat_buf;

	XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);

	/*
	 * We want to be sure that only one process does this at a time.
	 */
	if (use_lock)
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	if (!find_free)
	{
		/* Force installation: get rid of any pre-existing segment file */
		durable_unlink(path, DEBUG1);
	}
	else
	{
		/* Find a free slot to put it in: advance until stat() says ENOENT */
		while (stat(path, &stat_buf) == 0)
		{
			if ((*segno) >= max_segno)
			{
				/* Failed to find a free slot within specified range */
				if (use_lock)
					LWLockRelease(ControlFileLock);
				return false;
			}
			(*segno)++;
			XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
		}
	}

	/*
	 * Perform the rename using link if available, paranoidly trying to avoid
	 * overwriting an existing file (there shouldn't be one).
	 */
	if (durable_link_or_rename(tmppath, path, LOG) != 0)
	{
		if (use_lock)
			LWLockRelease(ControlFileLock);
		/* durable_link_or_rename already emitted log message */
		return false;
	}

	if (use_lock)
		LWLockRelease(ControlFileLock);

	return true;
}
3573
3574
/*
3575
 * Open a pre-existing logfile segment for writing.
3576
 */
3577
int
3578
XLogFileOpen(XLogSegNo segno)
3579
0
{
3580
0
  char    path[MAXPGPATH];
3581
0
  int     fd;
3582
3583
0
  XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
3584
3585
0
  fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3586
0
  if (fd < 0)
3587
0
    ereport(PANIC,
3588
0
        (errcode_for_file_access(),
3589
0
         errmsg("could not open write-ahead log file \"%s\": %m", path)));
3590
3591
0
  return fd;
3592
0
}
3593
3594
/*
 * Open a logfile segment for reading (during recovery).
 *
 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
 * Otherwise, it's assumed to be already available in pg_wal.
 *
 * NOTE(review): the 'emode' parameter is not referenced in this body; open
 * failures are reported at PANIC regardless.
 */
static int
XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
			 int source, bool notfoundOk)
{
	char		xlogfname[MAXFNAMELEN];
	char		activitymsg[MAXFNAMELEN + 16];
	char		path[MAXPGPATH];
	int			fd;

	XLogFileName(xlogfname, tli, segno, wal_segment_size);

	switch (source)
	{
		case XLOG_FROM_ARCHIVE:
			/* Report recovery progress in PS display */
			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
					 xlogfname);
			set_ps_display(activitymsg, false);

			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
													  "RECOVERYXLOG",
													  wal_segment_size,
													  InRedo);
			if (!restoredFromArchive)
				return -1;
			break;

		case XLOG_FROM_PG_WAL:
		case XLOG_FROM_STREAM:
			XLogFilePath(path, tli, segno, wal_segment_size);
			restoredFromArchive = false;
			break;

		default:
			elog(ERROR, "invalid XLogFileRead source %d", source);
	}

	/*
	 * If the segment was fetched from archival storage, replace the existing
	 * xlog segment (if any) with the archival version.
	 */
	if (source == XLOG_FROM_ARCHIVE)
	{
		KeepFileRestoredFromArchive(path, xlogfname);

		/*
		 * Set path to point at the new file in pg_wal.
		 */
		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
	}

	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
	if (fd >= 0)
	{
		/* Success! */
		curFileTLI = tli;

		/* Report recovery progress in PS display */
		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
				 xlogfname);
		set_ps_display(activitymsg, false);

		/* Track source of data in assorted state variables */
		readSource = source;
		XLogReceiptSource = source;
		/* In FROM_STREAM case, caller tracks receipt time, not me */
		if (source != XLOG_FROM_STREAM)
			XLogReceiptTime = GetCurrentTimestamp();

		return fd;
	}
	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
	return -1;
}
3677
3678
/*
 * Open a logfile segment for reading (during recovery).
 *
 * This version searches for the segment with any TLI listed in expectedTLEs.
 * Returns an open fd, or -1 (after ereport at 'emode') if no timeline has
 * the segment available from the requested source(s).
 */
static int
XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
{
	char		path[MAXPGPATH];
	ListCell   *cell;
	int			fd;
	List	   *tles;

	/*
	 * Loop looking for a suitable timeline ID: we might need to read any of
	 * the timelines listed in expectedTLEs.
	 *
	 * We expect curFileTLI on entry to be the TLI of the preceding file in
	 * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
	 * to go backwards; this prevents us from picking up the wrong file when a
	 * parent timeline extends to higher segment numbers than the child we
	 * want to read.
	 *
	 * If we haven't read the timeline history file yet, read it now, so that
	 * we know which TLIs to scan.  We don't save the list in expectedTLEs,
	 * however, unless we actually find a valid segment.  That way if there is
	 * neither a timeline history file nor a WAL segment in the archive, and
	 * streaming replication is set up, we'll read the timeline history file
	 * streamed from the master when we start streaming, instead of recovering
	 * with a dummy history generated here.
	 */
	if (expectedTLEs)
		tles = expectedTLEs;
	else
		tles = readTimeLineHistory(recoveryTargetTLI);

	foreach(cell, tles)
	{
		TimeLineID	tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;

		if (tli < curFileTLI)
			break;				/* don't bother looking at too-old TLIs */

		/* Try the archive first if the caller allows it */
		if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
		{
			fd = XLogFileRead(segno, emode, tli,
							  XLOG_FROM_ARCHIVE, true);
			if (fd != -1)
			{
				elog(DEBUG1, "got WAL segment from archive");
				if (!expectedTLEs)
					expectedTLEs = tles;
				return fd;
			}
		}

		/* Then fall back to (or start with) pg_wal */
		if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
		{
			fd = XLogFileRead(segno, emode, tli,
							  XLOG_FROM_PG_WAL, true);
			if (fd != -1)
			{
				if (!expectedTLEs)
					expectedTLEs = tles;
				return fd;
			}
		}
	}

	/* Couldn't find it.  For simplicity, complain about front timeline */
	XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
	errno = ENOENT;
	ereport(emode,
			(errcode_for_file_access(),
			 errmsg("could not open file \"%s\": %m", path)));
	return -1;
}
3755
3756
/*
 * Close the current logfile segment for writing.
 *
 * Closes the fd in 'openLogFile' and resets it to -1.  A failed close()
 * is a PANIC since the segment may contain unflushed WAL.
 */
static void
XLogFileClose(void)
{
	Assert(openLogFile >= 0);

	/*
	 * WAL segment files will not be re-read in normal operation, so we advise
	 * the OS to release any cached pages.  But do not do so if WAL archiving
	 * or streaming is active, because archiver and walsender process could
	 * use the cache to read the WAL segment.
	 */
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	if (!XLogIsNeeded())
		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif

	if (close(openLogFile))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close log file %s: %m",
						XLogFileNameP(ThisTimeLineID, openLogSegNo))));
	openLogFile = -1;
}
3782
3783
/*
3784
 * Preallocate log files beyond the specified log endpoint.
3785
 *
3786
 * XXX this is currently extremely conservative, since it forces only one
3787
 * future log segment to exist, and even that only if we are 75% done with
3788
 * the current one.  This is only appropriate for very low-WAL-volume systems.
3789
 * High-volume systems will be OK once they've built up a sufficient set of
3790
 * recycled log segments, but the startup transient is likely to include
3791
 * a lot of segment creations by foreground processes, which is not so good.
3792
 */
3793
static void
3794
PreallocXlogFiles(XLogRecPtr endptr)
3795
4.33k
{
3796
4.33k
  XLogSegNo _logSegNo;
3797
4.33k
  int     lf;
3798
4.33k
  bool    use_existent;
3799
4.33k
  uint64    offset;
3800
3801
4.33k
  XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3802
4.33k
  offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3803
4.33k
  if (offset >= (uint32) (0.75 * wal_segment_size))
3804
0
  {
3805
0
    _logSegNo++;
3806
0
    use_existent = true;
3807
0
    lf = XLogFileInit(_logSegNo, &use_existent, true);
3808
0
    close(lf);
3809
0
    if (!use_existent)
3810
0
      CheckpointStats.ckpt_segs_added++;
3811
0
  }
3812
4.33k
}
3813
3814
/*
3815
 * Throws an error if the given log segment has already been removed or
3816
 * recycled. The caller should only pass a segment that it knows to have
3817
 * existed while the server has been running, as this function always
3818
 * succeeds if no WAL segments have been removed since startup.
3819
 * 'tli' is only used in the error message.
3820
 *
3821
 * Note: this function guarantees to keep errno unchanged on return.
3822
 * This supports callers that use this to possibly deliver a better
3823
 * error message about a missing file, while still being able to throw
3824
 * a normal file-access error afterwards, if this does return.
3825
 */
3826
void
3827
CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3828
0
{
3829
0
  int     save_errno = errno;
3830
0
  XLogSegNo lastRemovedSegNo;
3831
3832
0
  SpinLockAcquire(&XLogCtl->info_lck);
3833
0
  lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3834
0
  SpinLockRelease(&XLogCtl->info_lck);
3835
3836
0
  if (segno <= lastRemovedSegNo)
3837
0
  {
3838
0
    char    filename[MAXFNAMELEN];
3839
3840
0
    XLogFileName(filename, tli, segno, wal_segment_size);
3841
0
    errno = save_errno;
3842
0
    ereport(ERROR,
3843
0
        (errcode_for_file_access(),
3844
0
         errmsg("requested WAL segment %s has already been removed",
3845
0
            filename)));
3846
0
  }
3847
0
  errno = save_errno;
3848
0
}
3849
3850
/*
3851
 * Return the last WAL segment removed, or 0 if no segment has been removed
3852
 * since startup.
3853
 *
3854
 * NB: the result can be out of date arbitrarily fast, the caller has to deal
3855
 * with that.
3856
 */
3857
XLogSegNo
3858
XLogGetLastRemovedSegno(void)
3859
0
{
3860
0
  XLogSegNo lastRemovedSegNo;
3861
3862
0
  SpinLockAcquire(&XLogCtl->info_lck);
3863
0
  lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3864
0
  SpinLockRelease(&XLogCtl->info_lck);
3865
3866
0
  return lastRemovedSegNo;
3867
0
}
3868
3869
/*
3870
 * Update the last removed segno pointer in shared memory, to reflect
3871
 * that the given XLOG file has been removed.
3872
 */
3873
static void
3874
UpdateLastRemovedPtr(char *filename)
3875
0
{
3876
0
  uint32    tli;
3877
0
  XLogSegNo segno;
3878
3879
0
  XLogFromFileName(filename, &tli, &segno, wal_segment_size);
3880
3881
0
  SpinLockAcquire(&XLogCtl->info_lck);
3882
0
  if (segno > XLogCtl->lastRemovedSegNo)
3883
0
    XLogCtl->lastRemovedSegNo = segno;
3884
0
  SpinLockRelease(&XLogCtl->info_lck);
3885
0
}
3886
3887
/*
 * Recycle or remove all log files older or equal to passed segno.
 *
 * endptr is current (or recent) end of xlog, and RedoRecPtr is the
 * redo pointer of the last checkpoint. These are used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
 */
static void
RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		lastoff[MAXFNAMELEN];

	/*
	 * Construct a filename of the last segment to be kept. The timeline ID
	 * doesn't matter, we ignore that in the comparison. (During recovery,
	 * ThisTimeLineID isn't set, so we can't use that.)
	 */
	XLogFileName(lastoff, 0, segno, wal_segment_size);

	elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
		 lastoff);

	xldir = AllocateDir(XLOGDIR);

	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
	{
		/* Ignore files that are not XLOG segments (or .partial segments) */
		if (!IsXLogFileName(xlde->d_name) &&
			!IsPartialXLogFileName(xlde->d_name))
			continue;

		/*
		 * We ignore the timeline part of the XLOG segment identifiers in
		 * deciding whether a segment is still needed.  This ensures that we
		 * won't prematurely remove a segment from a parent timeline. We could
		 * probably be a little more proactive about removing segments of
		 * non-parent timelines, but that would be a whole lot more
		 * complicated.
		 *
		 * We use the alphanumeric sorting property of the filenames to decide
		 * which ones are earlier than the lastoff segment.  (The +8 skips
		 * the 8-hex-digit timeline prefix of the file name.)
		 */
		if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
		{
			if (XLogArchiveCheckDone(xlde->d_name))
			{
				/* Update the last removed location in shared memory first */
				UpdateLastRemovedPtr(xlde->d_name);

				RemoveXlogFile(xlde->d_name, RedoRecPtr, endptr);
			}
		}
	}

	FreeDir(xldir);
}
3945
3946
/*
 * Remove WAL files that are not part of the given timeline's history.
 *
 * This is called during recovery, whenever we switch to follow a new
 * timeline, and at the end of recovery when we create a new timeline. We
 * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
 * might be leftover pre-allocated or recycled WAL segments on the old timeline
 * that we haven't used yet, and contain garbage. If we just leave them in
 * pg_wal, they will eventually be archived, and we can't let that happen.
 * Files that belong to our timeline history are valid, because we have
 * successfully replayed them, but from others we can't be sure.
 *
 * 'switchpoint' is the current point in WAL where we switch to new timeline,
 * and 'newTLI' is the new timeline we switch to.
 */
static void
RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		switchseg[MAXFNAMELEN];
	XLogSegNo	endLogSegNo;

	XLByteToPrevSeg(switchpoint, endLogSegNo, wal_segment_size);

	/*
	 * Construct a filename of the last segment to be kept.
	 */
	XLogFileName(switchseg, newTLI, endLogSegNo, wal_segment_size);

	elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
		 switchseg);

	xldir = AllocateDir(XLOGDIR);

	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
	{
		/* Ignore files that are not XLOG segments */
		if (!IsXLogFileName(xlde->d_name))
			continue;

		/*
		 * Remove files that are on a timeline older than the new one we're
		 * switching to, but with a segment number >= the first segment on the
		 * new timeline.  (strncmp on the first 8 chars compares timeline IDs;
		 * strcmp on the rest compares segment numbers.)
		 */
		if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
			strcmp(xlde->d_name + 8, switchseg + 8) > 0)
		{
			/*
			 * If the file has already been marked as .ready, however, don't
			 * remove it yet. It should be OK to remove it - files that are
			 * not part of our timeline history are not required for recovery
			 * - but seems safer to let them be archived and removed later.
			 */
			if (!XLogArchiveIsReady(xlde->d_name))
				RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
		}
	}

	FreeDir(xldir);
}
4008
4009
/*
 * Recycle or remove a log file that's no longer needed.
 *
 * 'segname' is the bare file name (no directory) of the segment in pg_wal.
 * endptr is current (or recent) end of xlog, and RedoRecPtr is the
 * redo pointer of the last checkpoint. These are used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
 * If RedoRecPtr is not known, pass invalid, and the function will recycle,
 * somewhat arbitrarily, 10 future segments.
 *
 * In either outcome, the segment's archive-status files are cleaned up
 * at the end via XLogArchiveCleanup().
 */
static void
RemoveXlogFile(const char *segname, XLogRecPtr RedoRecPtr, XLogRecPtr endptr)
{
	char		path[MAXPGPATH];
#ifdef WIN32
	char		newpath[MAXPGPATH];
#endif
	struct stat statbuf;
	XLogSegNo	endlogSegNo;
	XLogSegNo	recycleSegNo;

	/*
	 * Initialize info about where to try to recycle to.
	 */
	XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
	if (RedoRecPtr == InvalidXLogRecPtr)
		recycleSegNo = endlogSegNo + 10;
	else
		recycleSegNo = XLOGfileslop(RedoRecPtr);

	snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);

	/*
	 * Before deleting the file, see if it can be recycled as a future log
	 * segment. Only recycle normal files, pg_standby for example can create
	 * symbolic links pointing to a separate archive directory.
	 */
	if (endlogSegNo <= recycleSegNo &&
		lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
		InstallXLogFileSegment(&endlogSegNo, path,
							   true, recycleSegNo, true))
	{
		ereport(DEBUG2,
				(errmsg("recycled write-ahead log file \"%s\"",
						segname)));
		CheckpointStats.ckpt_segs_recycled++;
		/* Needn't recheck that slot on future iterations */
		endlogSegNo++;
	}
	else
	{
		/* No need for any more future segments... */
		int			rc;

		ereport(DEBUG2,
				(errmsg("removing write-ahead log file \"%s\"",
						segname)));

#ifdef WIN32

		/*
		 * On Windows, if another process (e.g another backend) holds the file
		 * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
		 * will still show up in directory listing until the last handle is
		 * closed. To avoid confusing the lingering deleted file for a live
		 * WAL file that needs to be archived, rename it before deleting it.
		 *
		 * If another process holds the file open without FILE_SHARE_DELETE
		 * flag, rename will fail. We'll try again at the next checkpoint.
		 */
		snprintf(newpath, MAXPGPATH, "%s.deleted", path);
		if (rename(path, newpath) != 0)
		{
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not rename old write-ahead log file \"%s\": %m",
							path)));
			return;
		}
		rc = durable_unlink(newpath, LOG);
#else
		rc = durable_unlink(path, LOG);
#endif
		if (rc != 0)
		{
			/* Message already logged by durable_unlink() */
			return;
		}
		CheckpointStats.ckpt_segs_removed++;
	}

	/* Drop any leftover .ready/.done archive-status entries for the segment. */
	XLogArchiveCleanup(segname);
}
4101
4102
/*
4103
 * Verify whether pg_wal and pg_wal/archive_status exist.
4104
 * If the latter does not exist, recreate it.
4105
 *
4106
 * It is not the goal of this function to verify the contents of these
4107
 * directories, but to help in cases where someone has performed a cluster
4108
 * copy for PITR purposes but omitted pg_wal from the copy.
4109
 *
4110
 * We could also recreate pg_wal if it doesn't exist, but a deliberate
4111
 * policy decision was made not to.  It is fairly common for pg_wal to be
4112
 * a symlink, and if that was the DBA's intent then automatically making a
4113
 * plain directory would result in degraded performance with no notice.
4114
 */
4115
static void
4116
ValidateXLOGDirectoryStructure(void)
4117
3.99k
{
4118
3.99k
  char    path[MAXPGPATH];
4119
3.99k
  struct stat stat_buf;
4120
4121
  /* Check for pg_wal; if it doesn't exist, error out */
4122
3.99k
  if (stat(XLOGDIR, &stat_buf) != 0 ||
4123
3.99k
    !S_ISDIR(stat_buf.st_mode))
4124
3.99k
    ereport(FATAL,
4125
3.99k
        (errmsg("required WAL directory \"%s\" does not exist",
4126
3.99k
            XLOGDIR)));
4127
4128
  /* Check for archive_status */
4129
3.99k
  snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4130
3.99k
  if (stat(path, &stat_buf) == 0)
4131
3.99k
  {
4132
    /* Check for weird cases where it exists but isn't a directory */
4133
3.99k
    if (!S_ISDIR(stat_buf.st_mode))
4134
3.99k
      ereport(FATAL,
4135
3.99k
          (errmsg("required WAL directory \"%s\" does not exist",
4136
3.99k
              path)));
4137
3.99k
  }
4138
0
  else
4139
0
  {
4140
0
    ereport(LOG,
4141
0
        (errmsg("creating missing WAL directory \"%s\"", path)));
4142
0
    if (MakePGDirectory(path) < 0)
4143
0
      ereport(FATAL,
4144
0
          (errmsg("could not create missing directory \"%s\": %m",
4145
0
              path)));
4146
0
  }
4147
3.99k
}
4148
4149
/*
4150
 * Remove previous backup history files.  This also retries creation of
4151
 * .ready files for any backup history files for which XLogArchiveNotify
4152
 * failed earlier.
4153
 */
4154
static void
4155
CleanupBackupHistory(void)
4156
0
{
4157
0
  DIR      *xldir;
4158
0
  struct dirent *xlde;
4159
0
  char    path[MAXPGPATH + sizeof(XLOGDIR)];
4160
4161
0
  xldir = AllocateDir(XLOGDIR);
4162
4163
0
  while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4164
0
  {
4165
0
    if (IsBackupHistoryFileName(xlde->d_name))
4166
0
    {
4167
0
      if (XLogArchiveCheckDone(xlde->d_name))
4168
0
      {
4169
0
        elog(DEBUG2, "removing WAL backup history file \"%s\"",
4170
0
           xlde->d_name);
4171
0
        snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4172
0
        unlink(path);
4173
0
        XLogArchiveCleanup(xlde->d_name);
4174
0
      }
4175
0
    }
4176
0
  }
4177
4178
0
  FreeDir(xldir);
4179
0
}
4180
4181
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is valid, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
 * (emode must be either PANIC, LOG). In standby mode, retries until a valid
 * record is available.
 *
 * The record is copied into readRecordBuf, so that on successful return,
 * the returned record pointer always points there.
 *
 * Side effects: updates the ReadRecPtr/EndRecPtr globals after every read
 * attempt, and may flip the process from crash recovery into archive
 * recovery (updating pg_control under ControlFileLock) when WAL in pg_wal
 * is exhausted but archive recovery was requested.
 */
static XLogRecord *
ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
		   bool fetching_ckpt)
{
	XLogRecord *record;
	XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;

	/* Pass through parameters to XLogPageRead */
	private->fetching_ckpt = fetching_ckpt;
	private->emode = emode;
	private->randAccess = (RecPtr != InvalidXLogRecPtr);

	/* This is the first attempt to read this page. */
	lastSourceFailed = false;

	for (;;)
	{
		char	   *errormsg;

		record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
		ReadRecPtr = xlogreader->ReadRecPtr;
		EndRecPtr = xlogreader->EndRecPtr;
		if (record == NULL)
		{
			/* Release the open segment file; a retry may use another source. */
			if (readFile >= 0)
			{
				close(readFile);
				readFile = -1;
			}

			/*
			 * We only end up here without a message when XLogPageRead()
			 * failed - in that case we already logged something. In
			 * StandbyMode that only happens if we have been triggered, so we
			 * shouldn't loop anymore in that case.
			 */
			if (errormsg)
				ereport(emode_for_corrupt_record(emode,
												 RecPtr ? RecPtr : EndRecPtr),
						(errmsg_internal("%s", errormsg) /* already translated */ ));
		}

		/*
		 * Check page TLI is one of the expected values.
		 */
		else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
		{
			char		fname[MAXFNAMELEN];
			XLogSegNo	segno;
			int32		offset;

			XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
			offset = XLogSegmentOffset(xlogreader->latestPagePtr,
									   wal_segment_size);
			XLogFileName(fname, xlogreader->readPageTLI, segno,
						 wal_segment_size);
			ereport(emode_for_corrupt_record(emode,
											 RecPtr ? RecPtr : EndRecPtr),
					(errmsg("unexpected timeline ID %u in log segment %s, offset %u",
							xlogreader->latestPageTLI,
							fname,
							offset)));
			/* Treat an unexpected TLI the same as a failed read. */
			record = NULL;
		}

		if (record)
		{
			/* Great, got a record */
			return record;
		}
		else
		{
			/* No valid record available from this source */
			lastSourceFailed = true;

			/*
			 * If archive recovery was requested, but we were still doing
			 * crash recovery, switch to archive recovery and retry using the
			 * offline archive. We have now replayed all the valid WAL in
			 * pg_wal, so we are presumably now consistent.
			 *
			 * We require that there's at least some valid WAL present in
			 * pg_wal, however (!fetching_ckpt).  We could recover using the
			 * WAL from the archive, even if pg_wal is completely empty, but
			 * we'd have no idea how far we'd have to replay to reach
			 * consistency.  So err on the safe side and give up.
			 */
			if (!InArchiveRecovery && ArchiveRecoveryRequested &&
				!fetching_ckpt)
			{
				ereport(DEBUG1,
						(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
				InArchiveRecovery = true;
				if (StandbyModeRequested)
					StandbyMode = true;

				/* initialize minRecoveryPoint to this record */
				LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
				ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
				if (ControlFile->minRecoveryPoint < EndRecPtr)
				{
					ControlFile->minRecoveryPoint = EndRecPtr;
					ControlFile->minRecoveryPointTLI = ThisTimeLineID;
				}
				/* update local copy */
				minRecoveryPoint = ControlFile->minRecoveryPoint;
				minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

				/*
				 * The startup process can update its local copy of
				 * minRecoveryPoint from this point.
				 */
				updateMinRecoveryPoint = true;

				UpdateControlFile();
				LWLockRelease(ControlFileLock);

				CheckRecoveryConsistency();

				/*
				 * Before we retry, reset lastSourceFailed and currentSource
				 * so that we will check the archive next.
				 */
				lastSourceFailed = false;
				currentSource = 0;

				continue;
			}

			/* In standby mode, loop back to retry. Otherwise, give up. */
			if (StandbyMode && !CheckForStandbyTrigger())
				continue;
			else
				return NULL;
		}
	}
}
4331
4332
/*
 * Scan for new timelines that might have appeared in the archive since we
 * started recovery.
 *
 * If there are any, the function changes recovery target TLI to the latest
 * one and returns 'true'.  A candidate timeline is rejected (returning
 * 'false' after a LOG message) if the current timeline is not part of its
 * history, or if it forked off before the current replay position.
 */
static bool
rescanLatestTimeLine(void)
{
	List	   *newExpectedTLEs;
	bool		found;
	ListCell   *cell;
	TimeLineID	newtarget;
	TimeLineID	oldtarget = recoveryTargetTLI;
	TimeLineHistoryEntry *currentTle = NULL;

	newtarget = findNewestTimeLine(recoveryTargetTLI);
	if (newtarget == recoveryTargetTLI)
	{
		/* No new timelines found */
		return false;
	}

	/*
	 * Determine the list of expected TLIs for the new TLI
	 */

	newExpectedTLEs = readTimeLineHistory(newtarget);

	/*
	 * If the current timeline is not part of the history of the new timeline,
	 * we cannot proceed to it.
	 */
	found = false;
	foreach(cell, newExpectedTLEs)
	{
		currentTle = (TimeLineHistoryEntry *) lfirst(cell);

		if (currentTle->tli == recoveryTargetTLI)
		{
			found = true;
			break;
		}
	}
	if (!found)
	{
		ereport(LOG,
				(errmsg("new timeline %u is not a child of database system timeline %u",
						newtarget,
						ThisTimeLineID)));
		return false;
	}

	/*
	 * The current timeline was found in the history file, but check that the
	 * next timeline was forked off from it *after* the current recovery
	 * location.
	 */
	if (currentTle->end < EndRecPtr)
	{
		ereport(LOG,
				(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
						newtarget,
						ThisTimeLineID,
						(uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
		return false;
	}

	/* The new timeline history seems valid. Switch target */
	recoveryTargetTLI = newtarget;
	list_free_deep(expectedTLEs);
	expectedTLEs = newExpectedTLEs;

	/*
	 * As in StartupXLOG(), try to ensure we have all the history files
	 * between the old target and new target in pg_wal.
	 */
	restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);

	ereport(LOG,
			(errmsg("new target timeline is %u",
					recoveryTargetTLI)));

	return true;
}
4418
4419
/*
4420
 * I/O routines for pg_control
4421
 *
4422
 * *ControlFile is a buffer in shared memory that holds an image of the
4423
 * contents of pg_control.  WriteControlFile() initializes pg_control
4424
 * given a preloaded buffer, ReadControlFile() loads the buffer from
4425
 * the pg_control file (during postmaster or standalone-backend startup),
4426
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4427
 *
4428
 * For simplicity, WriteControlFile() initializes the fields of pg_control
4429
 * that are related to checking backend/database compatibility, and
4430
 * ReadControlFile() verifies they are correct.  We could split out the
4431
 * I/O and compatibility-check functions, but there seems no need currently.
4432
 */
4433
static void
4434
WriteControlFile(void)
4435
2.00k
{
4436
2.00k
  int     fd;
4437
2.00k
  char    buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */
4438
4439
  /*
4440
   * Ensure that the size of the pg_control data structure is sane.  See the
4441
   * comments for these symbols in pg_control.h.
4442
   */
4443
2.00k
  StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
4444
2.00k
           "pg_control is too large for atomic disk writes");
4445
2.00k
  StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
4446
2.00k
           "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
4447
4448
  /*
4449
   * Initialize version and compatibility-check fields
4450
   */
4451
2.00k
  ControlFile->pg_control_version = PG_CONTROL_VERSION;
4452
2.00k
  ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4453
4454
2.00k
  ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4455
2.00k
  ControlFile->floatFormat = FLOATFORMAT_VALUE;
4456
4457
2.00k
  ControlFile->blcksz = BLCKSZ;
4458
2.00k
  ControlFile->relseg_size = RELSEG_SIZE;
4459
2.00k
  ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4460
2.00k
  ControlFile->xlog_seg_size = wal_segment_size;
4461
4462
2.00k
  ControlFile->nameDataLen = NAMEDATALEN;
4463
2.00k
  ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4464
4465
2.00k
  ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4466
2.00k
  ControlFile->loblksize = LOBLKSIZE;
4467
4468
2.00k
  ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4469
2.00k
  ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4470
4471
  /* Contents are protected with a CRC */
4472
2.00k
  INIT_CRC32C(ControlFile->crc);
4473
2.00k
  COMP_CRC32C(ControlFile->crc,
4474
2.00k
        (char *) ControlFile,
4475
2.00k
        offsetof(ControlFileData, crc));
4476
2.00k
  FIN_CRC32C(ControlFile->crc);
4477
4478
  /*
4479
   * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
4480
   * the excess over sizeof(ControlFileData).  This reduces the odds of
4481
   * premature-EOF errors when reading pg_control.  We'll still fail when we
4482
   * check the contents of the file, but hopefully with a more specific
4483
   * error than "couldn't read pg_control".
4484
   */
4485
2.00k
  memset(buffer, 0, PG_CONTROL_FILE_SIZE);
4486
2.00k
  memcpy(buffer, ControlFile, sizeof(ControlFileData));
4487
4488
2.00k
  fd = BasicOpenFile(XLOG_CONTROL_FILE,
4489
2.00k
             O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
4490
2.00k
  if (fd < 0)
4491
2.00k
    ereport(PANIC,
4492
2.00k
        (errcode_for_file_access(),
4493
2.00k
         errmsg("could not create control file \"%s\": %m",
4494
2.00k
            XLOG_CONTROL_FILE)));
4495
4496
2.00k
  errno = 0;
4497
2.00k
  pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
4498
2.00k
  if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
4499
0
  {
4500
    /* if write didn't set errno, assume problem is no disk space */
4501
0
    if (errno == 0)
4502
0
      errno = ENOSPC;
4503
0
    ereport(PANIC,
4504
0
        (errcode_for_file_access(),
4505
0
         errmsg("could not write to control file: %m")));
4506
0
  }
4507
2.00k
  pgstat_report_wait_end();
4508
4509
2.00k
  pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
4510
2.00k
  if (pg_fsync(fd) != 0)
4511
2.00k
    ereport(PANIC,
4512
2.00k
        (errcode_for_file_access(),
4513
2.00k
         errmsg("could not fsync control file: %m")));
4514
2.00k
  pgstat_report_wait_end();
4515
4516
2.00k
  if (close(fd))
4517
2.00k
    ereport(PANIC,
4518
2.00k
        (errcode_for_file_access(),
4519
2.00k
         errmsg("could not close control file: %m")));
4520
2.00k
}
4521
4522
/*
 * Read pg_control into the shared-memory ControlFile buffer and verify it.
 *
 * Performs, in order: a format-version check (before the CRC, since a wrong
 * version would make the CRC meaningless), the CRC check, and then the full
 * set of backend/database compatibility checks initialized by
 * WriteControlFile().  Any mismatch is FATAL/PANIC: we must abort before the
 * incompatible backend can do damage.
 *
 * Side effects: sets wal_segment_size (and its derived variables and GUC),
 * and publishes the data_checksums GUC from the control file.
 */
static void
ReadControlFile(void)
{
	pg_crc32c	crc;
	int			fd;
	static char wal_segsz_str[20];
	int			r;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
	r = read(fd, ControlFile, sizeof(ControlFileData));
	if (r != sizeof(ControlFileData))
	{
		/* Distinguish a read error from a short read for the message. */
		if (r < 0)
			ereport(PANIC,
					(errcode_for_file_access(),
					 errmsg("could not read from control file: %m")));
		else
			ereport(PANIC,
					(errmsg("could not read from control file: read %d bytes, expected %d", r, (int) sizeof(ControlFileData))));
	}
	pgstat_report_wait_end();

	close(fd);

	/*
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
	 */

	/*
	 * A version whose low 16 bits are zero but high 16 bits are not looks
	 * like a valid version read with swapped byte order, so report that
	 * possibility specifically.
	 */
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
						   " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
						   ControlFile->pg_control_version, ControlFile->pg_control_version,
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
						   " but the server was compiled with PG_CONTROL_VERSION %d.",
						   ControlFile->pg_control_version, PG_CONTROL_VERSION),
				 errhint("It looks like you need to initdb.")));

	/* Now check the CRC. */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(crc);

	if (!EQ_CRC32C(crc, ControlFile->crc))
		ereport(FATAL,
				(errmsg("incorrect checksum in control file")));

	/*
	 * Do compatibility checking immediately.  If the database isn't
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
	 */
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
						   " but the server was compiled with CATALOG_VERSION_NO %d.",
						   ControlFile->catalog_version_no, CATALOG_VERSION_NO),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with MAXALIGN %d,"
						   " but the server was compiled with MAXALIGN %d.",
						   ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->blcksz != BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with BLCKSZ %d,"
						   " but the server was compiled with BLCKSZ %d.",
						   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->relseg_size != RELSEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
						   " but the server was compiled with RELSEG_SIZE %d.",
						   ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
						   " but the server was compiled with XLOG_BLCKSZ %d.",
						   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->nameDataLen != NAMEDATALEN)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
						   " but the server was compiled with NAMEDATALEN %d.",
						   ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
						   " but the server was compiled with INDEX_MAX_KEYS %d.",
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
						   " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
						   ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->loblksize != LOBLKSIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with LOBLKSIZE %d,"
						   " but the server was compiled with LOBLKSIZE %d.",
						   ControlFile->loblksize, (int) LOBLKSIZE),
				 errhint("It looks like you need to recompile or initdb.")));

#ifdef USE_FLOAT4_BYVAL
	if (ControlFile->float4ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
						   " but the server was compiled with USE_FLOAT4_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float4ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
						   " but the server was compiled without USE_FLOAT4_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
						   " but the server was compiled with USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
						   " but the server was compiled without USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

	wal_segment_size = ControlFile->xlog_seg_size;

	if (!IsValidWalSegSize(wal_segment_size))
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
									  "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
									  wal_segment_size,
									  wal_segment_size)));

	/* Publish the cluster's segment size as a read-only GUC. */
	snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
	SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
					PGC_S_OVERRIDE);

	/* check and update variables dependent on wal_segment_size */
	if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));

	if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));

	/* Usable payload bytes per segment, net of page and segment headers. */
	UsableBytesInSegment =
		(wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
		(SizeOfXLogLongPHD - SizeOfXLogShortPHD);

	CalculateCheckpointSegments();

	/* Make the initdb settings visible as GUC variables, too */
	SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
					PGC_INTERNAL, PGC_S_OVERRIDE);
}
4730
4731
/*
 * Rewrite pg_control on disk from the in-memory ControlFile struct,
 * recomputing the CRC first.
 *
 * Any failure is promoted to PANIC: a stale or torn pg_control would leave
 * the cluster unrecoverable, so we never continue past an error here.
 */
void
UpdateControlFile(void)
{
	int			fd;

	/* CRC covers everything preceding the crc field itself. */
	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(ControlFile->crc);

	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
	if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write to control file: %m")));
	}
	pgstat_report_wait_end();

	/* fsync so the update is durable before we report success */
	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
	if (pg_fsync(fd) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync control file: %m")));
	pgstat_report_wait_end();

	if (close(fd))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close control file: %m")));
}
4775
4776
/*
4777
 * Returns the unique system identifier from control file.
4778
 */
4779
uint64
4780
GetSystemIdentifier(void)
4781
0
{
4782
0
  Assert(ControlFile != NULL);
4783
0
  return ControlFile->system_identifier;
4784
0
}
4785
4786
/*
4787
 * Returns the random nonce from control file.
4788
 */
4789
char *
4790
GetMockAuthenticationNonce(void)
4791
0
{
4792
0
  Assert(ControlFile != NULL);
4793
0
  return ControlFile->mock_authentication_nonce;
4794
0
}
4795
4796
/*
4797
 * Are checksums enabled for data pages?
4798
 */
4799
bool
4800
DataChecksumsEnabled(void)
4801
56.2k
{
4802
56.2k
  Assert(ControlFile != NULL);
4803
56.2k
  return (ControlFile->data_checksum_version > 0);
4804
56.2k
}
4805
4806
/*
 * Returns a fake LSN for unlogged relations.
 *
 * Each call generates an LSN that is greater than any previous value
 * returned. The current counter value is saved and restored across clean
 * shutdowns, but like unlogged relations, does not survive a crash. This can
 * be used in lieu of real LSN values returned by XLogInsert, if you need an
 * LSN-like increasing sequence of numbers without writing any WAL.
 */
XLogRecPtr
GetFakeLSNForUnloggedRel(void)
{
	XLogRecPtr	nextUnloggedLSN;

	/*
	 * Increment the unloggedLSN counter; the spinlock makes the
	 * read-and-increment atomic across backends.
	 */
	SpinLockAcquire(&XLogCtl->ulsn_lck);
	nextUnloggedLSN = XLogCtl->unloggedLSN++;
	SpinLockRelease(&XLogCtl->ulsn_lck);

	return nextUnloggedLSN;
}
4827
4828
/*
4829
 * Auto-tune the number of XLOG buffers.
4830
 *
4831
 * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4832
 * a maximum of one XLOG segment (there is little reason to think that more
4833
 * is helpful, at least so long as we force an fsync when switching log files)
4834
 * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4835
 * 9.1, when auto-tuning was added).
4836
 *
4837
 * This should not be called until NBuffers has received its final value.
4838
 */
4839
static int
4840
XLOGChooseNumBuffers(void)
4841
8.01k
{
4842
8.01k
  int     xbuffers;
4843
4844
8.01k
  xbuffers = NBuffers / 32;
4845
8.01k
  if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
4846
0
    xbuffers = (wal_segment_size / XLOG_BLCKSZ);
4847
8.01k
  if (xbuffers < 8)
4848
0
    xbuffers = 8;
4849
8.01k
  return xbuffers;
4850
8.01k
}
4851
4852
/*
4853
 * GUC check_hook for wal_buffers
4854
 */
4855
bool
4856
check_wal_buffers(int *newval, void **extra, GucSource source)
4857
16.0k
{
4858
  /*
4859
   * -1 indicates a request for auto-tune.
4860
   */
4861
16.0k
  if (*newval == -1)
4862
8.04k
  {
4863
    /*
4864
     * If we haven't yet changed the boot_val default of -1, just let it
4865
     * be.  We'll fix it when XLOGShmemSize is called.
4866
     */
4867
8.04k
    if (XLOGbuffers == -1)
4868
8.04k
      return true;
4869
4870
    /* Otherwise, substitute the auto-tune value */
4871
0
    *newval = XLOGChooseNumBuffers();
4872
0
  }
4873
4874
  /*
4875
   * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4876
   * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4877
   * the case, we just silently treat such values as a request for the
4878
   * minimum.  (We could throw an error instead, but that doesn't seem very
4879
   * helpful.)
4880
   */
4881
8.01k
  if (*newval < 4)
4882
0
    *newval = 4;
4883
4884
8.01k
  return true;
4885
16.0k
}
4886
4887
/*
 * Read the control file, set respective GUCs.
 *
 * This is to be called during startup, including a crash recovery cycle,
 * unless in bootstrap mode, where no control file yet exists.  As there's no
 * usable shared memory yet (its sizing can depend on the contents of the
 * control file!), first store the contents in local memory. XLOGShmemInit()
 * will then copy it to shared memory later.
 *
 * reset just controls whether previous contents are to be expected (in the
 * reset case, there's a dangling pointer into old shared memory), or not.
 */
void
LocalProcessControlFile(bool reset)
{
	Assert(reset || ControlFile == NULL);
	/* local (palloc'd) copy; XLOGShmemInit() moves it into shared memory */
	ControlFile = palloc(sizeof(ControlFileData));
	ReadControlFile();
}
4906
4907
/*
 * Initialization of shared memory for XLOG
 */
Size
XLOGShmemSize(void)
{
	Size		size;

	/*
	 * If the value of wal_buffers is -1, use the preferred auto-tune value.
	 * This isn't an amazingly clean place to do this, but we must wait till
	 * NBuffers has received its final value, and must do it before using the
	 * value of XLOGbuffers to do anything important.
	 */
	if (XLOGbuffers == -1)
	{
		char		buf[32];

		snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
		SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
	}
	Assert(XLOGbuffers > 0);

	/* XLogCtl */
	size = sizeof(XLogCtlData);

	/*
	 * WAL insertion locks, plus alignment.  The extra (+1) element leaves
	 * room for XLOGShmemInit() to round the lock array start up to a
	 * sizeof(WALInsertLockPadded) boundary.
	 */
	size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
	/* xlblocks array */
	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
	/* extra alignment padding for XLOG I/O buffers */
	size = add_size(size, XLOG_BLCKSZ);
	/* and the buffers themselves */
	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));

	/*
	 * Note: we don't count ControlFileData, it comes out of the "slop factor"
	 * added by CreateSharedMemoryAndSemaphores.  This lets us use this
	 * routine again below to compute the actual allocation size.
	 */

	return size;
}
4950
4951
/*
 * Allocate and initialize the XLOG-related shared memory structures:
 * XLogCtl (plus its trailing xlblocks array, WAL insertion locks, and page
 * buffers, all carved out of one ShmemInitStruct allocation) and the shared
 * copy of ControlFile.
 */
void
XLOGShmemInit(void)
{
	bool		foundCFile,
				foundXLog;
	char	   *allocptr;
	int			i;
	ControlFileData *localControlFile;

#ifdef WAL_DEBUG

	/*
	 * Create a memory context for WAL debugging that's exempt from the normal
	 * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
	 * an allocation fails, but wal_debug is not for production use anyway.
	 */
	if (walDebugCxt == NULL)
	{
		walDebugCxt = AllocSetContextCreate(TopMemoryContext,
											"WAL Debug",
											ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(walDebugCxt, true);
	}
#endif


	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);

	/* Save any locally-palloc'd copy so we can move it into shmem below. */
	localControlFile = ControlFile;
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);

	if (foundCFile || foundXLog)
	{
		/* both should be present or neither */
		Assert(foundCFile && foundXLog);

		/* Initialize local copy of WALInsertLocks and register the tranche */
		WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
		LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
							  "wal_insert");

		if (localControlFile)
			pfree(localControlFile);
		return;
	}
	memset(XLogCtl, 0, sizeof(XLogCtlData));

	/*
	 * Already have read control file locally, unless in bootstrap mode. Move
	 * contents into shared memory.
	 */
	if (localControlFile)
	{
		memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
		pfree(localControlFile);
	}

	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
	 */
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;


	/* WAL insertion locks. Ensure they're aligned to the full padded size */
	allocptr += sizeof(WALInsertLockPadded) -
		((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
	WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
		(WALInsertLockPadded *) allocptr;
	allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;

	LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert");
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
		WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
		WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
	}

	/*
	 * Align the start of the page buffers to a full xlog block size boundary.
	 * This simplifies some calculations in XLOG insertion. It is also
	 * required for O_DIRECT.
	 */
	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
	XLogCtl->pages = allocptr;
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

	/*
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->SharedRecoveryInProgress = true;
	XLogCtl->SharedHotStandbyActive = false;
	XLogCtl->WalWriterSleeping = false;

	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
	SpinLockInit(&XLogCtl->info_lck);
	SpinLockInit(&XLogCtl->ulsn_lck);
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
}
5059
5060
/*
 * This func must be called ONCE on system install.  It creates pg_control
 * and the initial XLOG segment.
 */
void
BootStrapXLOG(void)
{
	CheckPoint	checkPoint;
	char	   *buffer;
	XLogPageHeader page;
	XLogLongPageHeader longpage;
	XLogRecord *record;
	char	   *recptr;
	bool		use_existent;
	uint64		sysidentifier;
	char		mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
	struct timeval tv;
	pg_crc32c	crc;

	/*
	 * Select a hopefully-unique system identifier code for this installation.
	 * We use the result of gettimeofday(), including the fractional seconds
	 * field, as being about as unique as we can easily get.  (Think not to
	 * use random(), since it hasn't been seeded and there's no portable way
	 * to seed it other than the system clock value...)  The upper half of the
	 * uint64 value is just the tv_sec part, while the lower half contains the
	 * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
	 * PID for a little extra uniqueness.  A person knowing this encoding can
	 * determine the initialization time of the installation, which could
	 * perhaps be useful sometimes.
	 */
	gettimeofday(&tv, NULL);
	sysidentifier = ((uint64) tv.tv_sec) << 32;
	sysidentifier |= ((uint64) tv.tv_usec) << 12;
	sysidentifier |= getpid() & 0xFFF;

	/*
	 * Generate a random nonce. This is used for authentication requests that
	 * will fail because the user does not exist. The nonce is used to create
	 * a genuine-looking password challenge for the non-existent user, in lieu
	 * of an actual stored password.
	 */
	if (!pg_backend_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
		ereport(PANIC,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("could not generate secret authorization token")));

	/* First timeline ID is always 1 */
	ThisTimeLineID = 1;

	/* page buffer must be aligned suitably for O_DIRECT */
	buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
	page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
	memset(page, 0, XLOG_BLCKSZ);

	/*
	 * Set up information for the initial checkpoint record
	 *
	 * The initial checkpoint record is written to the beginning of the WAL
	 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
	 * used, so that we can use 0/0 to mean "before any valid WAL segment".
	 */
	checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
	checkPoint.ThisTimeLineID = ThisTimeLineID;
	checkPoint.PrevTimeLineID = ThisTimeLineID;
	checkPoint.fullPageWrites = fullPageWrites;
	checkPoint.nextXidEpoch = 0;
	checkPoint.nextXid = FirstNormalTransactionId;
	checkPoint.nextOid = FirstBootstrapObjectId;
	checkPoint.nextMulti = FirstMultiXactId;
	checkPoint.nextMultiOffset = 0;
	checkPoint.oldestXid = FirstNormalTransactionId;
	checkPoint.oldestXidDB = TemplateDbOid;
	checkPoint.oldestMulti = FirstMultiXactId;
	checkPoint.oldestMultiDB = TemplateDbOid;
	checkPoint.oldestCommitTsXid = InvalidTransactionId;
	checkPoint.newestCommitTsXid = InvalidTransactionId;
	checkPoint.time = (pg_time_t) time(NULL);
	checkPoint.oldestActiveXid = InvalidTransactionId;

	/* Seed the shared counters/limits from the initial checkpoint values */
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
	AdvanceOldestClogXid(checkPoint.oldestXid);
	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
	SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);

	/* Set up the XLOG page header */
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = XLP_LONG_HEADER;
	page->xlp_tli = ThisTimeLineID;
	page->xlp_pageaddr = wal_segment_size;
	longpage = (XLogLongPageHeader) page;
	longpage->xlp_sysid = sysidentifier;
	longpage->xlp_seg_size = wal_segment_size;
	longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;

	/* Insert the initial checkpoint record */
	recptr = ((char *) page + SizeOfXLogLongPHD);
	record = (XLogRecord *) recptr;
	record->xl_prev = 0;
	record->xl_xid = InvalidTransactionId;
	record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
	record->xl_rmid = RM_XLOG_ID;
	recptr += SizeOfXLogRecord;
	/* fill the XLogRecordDataHeaderShort struct */
	*(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
	*(recptr++) = sizeof(checkPoint);
	memcpy(recptr, &checkPoint, sizeof(checkPoint));
	recptr += sizeof(checkPoint);
	Assert(recptr - (char *) record == record->xl_tot_len);

	/* CRC: record data first, then the header up to (excluding) xl_crc */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
	COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
	FIN_CRC32C(crc);
	record->xl_crc = crc;

	/* Create first XLOG segment file */
	use_existent = false;
	openLogFile = XLogFileInit(1, &use_existent, false);

	/* Write the first page with the initial record */
	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
	if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write bootstrap write-ahead log file: %m")));
	}
	pgstat_report_wait_end();

	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
	if (pg_fsync(openLogFile) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync bootstrap write-ahead log file: %m")));
	pgstat_report_wait_end();

	if (close(openLogFile))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close bootstrap write-ahead log file: %m")));

	openLogFile = -1;

	/* Now create pg_control */

	memset(ControlFile, 0, sizeof(ControlFileData));
	/* Initialize pg_control status fields */
	ControlFile->system_identifier = sysidentifier;
	memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
	ControlFile->state = DB_SHUTDOWNED;
	ControlFile->time = checkPoint.time;
	ControlFile->checkPoint = checkPoint.redo;
	ControlFile->checkPointCopy = checkPoint;
	ControlFile->unloggedLSN = 1;

	/* Set important parameter values for use when replaying WAL */
	ControlFile->MaxConnections = MaxConnections;
	ControlFile->max_worker_processes = max_worker_processes;
	ControlFile->max_prepared_xacts = max_prepared_xacts;
	ControlFile->max_locks_per_xact = max_locks_per_xact;
	ControlFile->wal_level = wal_level;
	ControlFile->wal_log_hints = wal_log_hints;
	ControlFile->track_commit_timestamp = track_commit_timestamp;
	ControlFile->data_checksum_version = bootstrap_data_checksum_version;

	/* some additional ControlFile fields are set in WriteControlFile() */

	WriteControlFile();

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
	BootStrapCommitTs();
	BootStrapSUBTRANS();
	BootStrapMultiXact();

	pfree(buffer);

	/*
	 * Force control file to be read - in contrast to normal processing we'd
	 * otherwise never run the checks and GUC related initializations therein.
	 */
	ReadControlFile();
}
5253
5254
/*
 * Format a pg_time_t as "YYYY-MM-DD HH:MM:SS TZ" in log_timezone.
 *
 * NB: returns a pointer to a static buffer, so the result is overwritten by
 * the next call and the function is not reentrant/thread-safe.
 */
static char *
str_time(pg_time_t tnow)
{
	static char buf[128];

	pg_strftime(buf, sizeof(buf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&tnow, log_timezone));

	return buf;
}
5265
5266
/*
5267
 * See if there is a recovery command file (recovery.conf), and if so
5268
 * read in parameters for archive recovery and XLOG streaming.
5269
 *
5270
 * The file is parsed using the main configuration parser.
5271
 */
5272
static void
5273
readRecoveryCommandFile(void)
5274
3.99k
{
5275
3.99k
  FILE     *fd;
5276
3.99k
  TimeLineID  rtli = 0;
5277
3.99k
  bool    rtliGiven = false;
5278
3.99k
  ConfigVariable *item,
5279
3.99k
         *head = NULL,
5280
3.99k
         *tail = NULL;
5281
3.99k
  bool    recoveryTargetActionSet = false;
5282
5283
5284
3.99k
  fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5285
3.99k
  if (fd == NULL)
5286
3.99k
  {
5287
3.99k
    if (errno == ENOENT)
5288
3.99k
      return;        /* not there, so no archive recovery */
5289
0
    ereport(FATAL,
5290
0
        (errcode_for_file_access(),
5291
0
         errmsg("could not open recovery command file \"%s\": %m",
5292
0
            RECOVERY_COMMAND_FILE)));
5293
0
  }
5294
5295
  /*
5296
   * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5297
   * no need to check the return value.
5298
   */
5299
0
  (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5300
5301
0
  FreeFile(fd);
5302
5303
0
  for (item = head; item; item = item->next)
5304
0
  {
5305
0
    if (strcmp(item->name, "restore_command") == 0)
5306
0
    {
5307
0
      recoveryRestoreCommand = pstrdup(item->value);
5308
0
      ereport(DEBUG2,
5309
0
          (errmsg_internal("restore_command = '%s'",
5310
0
                   recoveryRestoreCommand)));
5311
0
    }
5312
0
    else if (strcmp(item->name, "recovery_end_command") == 0)
5313
0
    {
5314
0
      recoveryEndCommand = pstrdup(item->value);
5315
0
      ereport(DEBUG2,
5316
0
          (errmsg_internal("recovery_end_command = '%s'",
5317
0
                   recoveryEndCommand)));
5318
0
    }
5319
0
    else if (strcmp(item->name, "archive_cleanup_command") == 0)
5320
0
    {
5321
0
      archiveCleanupCommand = pstrdup(item->value);
5322
0
      ereport(DEBUG2,
5323
0
          (errmsg_internal("archive_cleanup_command = '%s'",
5324
0
                   archiveCleanupCommand)));
5325
0
    }
5326
0
    else if (strcmp(item->name, "recovery_target_action") == 0)
5327
0
    {
5328
0
      if (strcmp(item->value, "pause") == 0)
5329
0
        recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
5330
0
      else if (strcmp(item->value, "promote") == 0)
5331
0
        recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
5332
0
      else if (strcmp(item->value, "shutdown") == 0)
5333
0
        recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5334
0
      else
5335
0
        ereport(ERROR,
5336
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5337
0
             errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5338
0
                "recovery_target_action",
5339
0
                item->value),
5340
0
             errhint("Valid values are \"pause\", \"promote\", and \"shutdown\".")));
5341
5342
0
      ereport(DEBUG2,
5343
0
          (errmsg_internal("recovery_target_action = '%s'",
5344
0
                   item->value)));
5345
5346
0
      recoveryTargetActionSet = true;
5347
0
    }
5348
0
    else if (strcmp(item->name, "recovery_target_timeline") == 0)
5349
0
    {
5350
0
      rtliGiven = true;
5351
0
      if (strcmp(item->value, "latest") == 0)
5352
0
        rtli = 0;
5353
0
      else
5354
0
      {
5355
0
        errno = 0;
5356
0
        rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5357
0
        if (errno == EINVAL || errno == ERANGE)
5358
0
          ereport(FATAL,
5359
0
              (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5360
0
               errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5361
0
                  item->value)));
5362
0
      }
5363
0
      if (rtli)
5364
0
        ereport(DEBUG2,
5365
0
            (errmsg_internal("recovery_target_timeline = %u", rtli)));
5366
0
      else
5367
0
        ereport(DEBUG2,
5368
0
            (errmsg_internal("recovery_target_timeline = latest")));
5369
0
    }
5370
0
    else if (strcmp(item->name, "recovery_target_xid") == 0)
5371
0
    {
5372
0
      errno = 0;
5373
0
      recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
5374
0
      if (errno == EINVAL || errno == ERANGE)
5375
0
        ereport(FATAL,
5376
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5377
0
             errmsg("recovery_target_xid is not a valid number: \"%s\"",
5378
0
                item->value)));
5379
0
      ereport(DEBUG2,
5380
0
          (errmsg_internal("recovery_target_xid = %u",
5381
0
                   recoveryTargetXid)));
5382
0
      recoveryTarget = RECOVERY_TARGET_XID;
5383
0
    }
5384
0
    else if (strcmp(item->name, "recovery_target_time") == 0)
5385
0
    {
5386
0
      recoveryTarget = RECOVERY_TARGET_TIME;
5387
5388
0
      if (strcmp(item->value, "epoch") == 0 ||
5389
0
        strcmp(item->value, "infinity") == 0 ||
5390
0
        strcmp(item->value, "-infinity") == 0 ||
5391
0
        strcmp(item->value, "now") == 0 ||
5392
0
        strcmp(item->value, "today") == 0 ||
5393
0
        strcmp(item->value, "tomorrow") == 0 ||
5394
0
        strcmp(item->value, "yesterday") == 0)
5395
0
        ereport(FATAL,
5396
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5397
0
             errmsg("recovery_target_time is not a valid timestamp: \"%s\"",
5398
0
                item->value)));
5399
5400
      /*
5401
       * Convert the time string given by the user to TimestampTz form.
5402
       */
5403
0
      recoveryTargetTime =
5404
0
        DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5405
0
                            CStringGetDatum(item->value),
5406
0
                            ObjectIdGetDatum(InvalidOid),
5407
0
                            Int32GetDatum(-1)));
5408
0
      ereport(DEBUG2,
5409
0
          (errmsg_internal("recovery_target_time = '%s'",
5410
0
                   timestamptz_to_str(recoveryTargetTime))));
5411
0
    }
5412
0
    else if (strcmp(item->name, "recovery_target_name") == 0)
5413
0
    {
5414
0
      recoveryTarget = RECOVERY_TARGET_NAME;
5415
5416
0
      recoveryTargetName = pstrdup(item->value);
5417
0
      if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5418
0
        ereport(FATAL,
5419
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5420
0
             errmsg("recovery_target_name is too long (maximum %d characters)",
5421
0
                MAXFNAMELEN - 1)));
5422
5423
0
      ereport(DEBUG2,
5424
0
          (errmsg_internal("recovery_target_name = '%s'",
5425
0
                   recoveryTargetName)));
5426
0
    }
5427
0
    else if (strcmp(item->name, "recovery_target_lsn") == 0)
5428
0
    {
5429
0
      recoveryTarget = RECOVERY_TARGET_LSN;
5430
5431
      /*
5432
       * Convert the LSN string given by the user to XLogRecPtr form.
5433
       */
5434
0
      recoveryTargetLSN =
5435
0
        DatumGetLSN(DirectFunctionCall3(pg_lsn_in,
5436
0
                        CStringGetDatum(item->value),
5437
0
                        ObjectIdGetDatum(InvalidOid),
5438
0
                        Int32GetDatum(-1)));
5439
0
      ereport(DEBUG2,
5440
0
          (errmsg_internal("recovery_target_lsn = '%X/%X'",
5441
0
                   (uint32) (recoveryTargetLSN >> 32),
5442
0
                   (uint32) recoveryTargetLSN)));
5443
0
    }
5444
0
    else if (strcmp(item->name, "recovery_target") == 0)
5445
0
    {
5446
0
      if (strcmp(item->value, "immediate") == 0)
5447
0
        recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5448
0
      else
5449
0
        ereport(ERROR,
5450
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5451
0
             errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5452
0
                "recovery_target",
5453
0
                item->value),
5454
0
             errhint("The only allowed value is \"immediate\".")));
5455
0
      ereport(DEBUG2,
5456
0
          (errmsg_internal("recovery_target = '%s'",
5457
0
                   item->value)));
5458
0
    }
5459
0
    else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5460
0
    {
5461
      /*
5462
       * does nothing if a recovery_target is not also set
5463
       */
5464
0
      if (!parse_bool(item->value, &recoveryTargetInclusive))
5465
0
        ereport(ERROR,
5466
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5467
0
             errmsg("parameter \"%s\" requires a Boolean value",
5468
0
                "recovery_target_inclusive")));
5469
0
      ereport(DEBUG2,
5470
0
          (errmsg_internal("recovery_target_inclusive = %s",
5471
0
                   item->value)));
5472
0
    }
5473
0
    else if (strcmp(item->name, "standby_mode") == 0)
5474
0
    {
5475
0
      if (!parse_bool(item->value, &StandbyModeRequested))
5476
0
        ereport(ERROR,
5477
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5478
0
             errmsg("parameter \"%s\" requires a Boolean value",
5479
0
                "standby_mode")));
5480
0
      ereport(DEBUG2,
5481
0
          (errmsg_internal("standby_mode = '%s'", item->value)));
5482
0
    }
5483
0
    else if (strcmp(item->name, "primary_conninfo") == 0)
5484
0
    {
5485
0
      PrimaryConnInfo = pstrdup(item->value);
5486
0
      ereport(DEBUG2,
5487
0
          (errmsg_internal("primary_conninfo = '%s'",
5488
0
                   PrimaryConnInfo)));
5489
0
    }
5490
0
    else if (strcmp(item->name, "primary_slot_name") == 0)
5491
0
    {
5492
0
      ReplicationSlotValidateName(item->value, ERROR);
5493
0
      PrimarySlotName = pstrdup(item->value);
5494
0
      ereport(DEBUG2,
5495
0
          (errmsg_internal("primary_slot_name = '%s'",
5496
0
                   PrimarySlotName)));
5497
0
    }
5498
0
    else if (strcmp(item->name, "trigger_file") == 0)
5499
0
    {
5500
0
      TriggerFile = pstrdup(item->value);
5501
0
      ereport(DEBUG2,
5502
0
          (errmsg_internal("trigger_file = '%s'",
5503
0
                   TriggerFile)));
5504
0
    }
5505
0
    else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
5506
0
    {
5507
0
      const char *hintmsg;
5508
5509
0
      if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
5510
0
               &hintmsg))
5511
0
        ereport(ERROR,
5512
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5513
0
             errmsg("parameter \"%s\" requires a temporal value",
5514
0
                "recovery_min_apply_delay"),
5515
0
             hintmsg ? errhint("%s", _(hintmsg)) : 0));
5516
0
      ereport(DEBUG2,
5517
0
          (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
5518
0
    }
5519
0
    else
5520
0
      ereport(FATAL,
5521
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5522
0
           errmsg("unrecognized recovery parameter \"%s\"",
5523
0
              item->name)));
5524
0
  }
5525
5526
  /*
5527
   * Check for compulsory parameters
5528
   */
5529
0
  if (StandbyModeRequested)
5530
0
  {
5531
0
    if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5532
0
      ereport(WARNING,
5533
0
          (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5534
0
              RECOVERY_COMMAND_FILE),
5535
0
           errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
5536
0
  }
5537
0
  else
5538
0
  {
5539
0
    if (recoveryRestoreCommand == NULL)
5540
0
      ereport(FATAL,
5541
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5542
0
           errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5543
0
              RECOVERY_COMMAND_FILE)));
5544
0
  }
5545
5546
  /*
5547
   * Override any inconsistent requests. Not that this is a change of
5548
   * behaviour in 9.5; prior to this we simply ignored a request to pause if
5549
   * hot_standby = off, which was surprising behaviour.
5550
   */
5551
0
  if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5552
0
    recoveryTargetActionSet &&
5553
0
    !EnableHotStandby)
5554
0
    recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5555
5556
  /*
5557
   * We don't support standby_mode in standalone backends; that requires
5558
   * other processes such as the WAL receiver to be alive.
5559
   */
5560
0
  if (StandbyModeRequested && !IsUnderPostmaster)
5561
0
    ereport(FATAL,
5562
0
        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5563
0
         errmsg("standby mode is not supported by single-user servers")));
5564
5565
  /* Enable fetching from archive recovery area */
5566
0
  ArchiveRecoveryRequested = true;
5567
5568
  /*
5569
   * If user specified recovery_target_timeline, validate it or compute the
5570
   * "latest" value.  We can't do this until after we've gotten the restore
5571
   * command and set InArchiveRecovery, because we need to fetch timeline
5572
   * history files from the archive.
5573
   */
5574
0
  if (rtliGiven)
5575
0
  {
5576
0
    if (rtli)
5577
0
    {
5578
      /* Timeline 1 does not have a history file, all else should */
5579
0
      if (rtli != 1 && !existsTimeLineHistory(rtli))
5580
0
        ereport(FATAL,
5581
0
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5582
0
             errmsg("recovery target timeline %u does not exist",
5583
0
                rtli)));
5584
0
      recoveryTargetTLI = rtli;
5585
0
      recoveryTargetIsLatest = false;
5586
0
    }
5587
0
    else
5588
0
    {
5589
      /* We start the "latest" search from pg_control's timeline */
5590
0
      recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5591
0
      recoveryTargetIsLatest = true;
5592
0
    }
5593
0
  }
5594
5595
0
  FreeConfigVariables(head);
5596
0
}
5597
5598
/*
 * Exit archive-recovery state
 *
 * Called exactly once, when recovery crosses from archive recovery into
 * normal operation at WAL position 'endOfLog' on old timeline 'endTLI'.
 * Prepares the first WAL segment of the new timeline, removes recovery
 * temp files, and renames recovery.conf out of the way so a subsequent
 * crash cannot accidentally re-enter archive recovery.
 */
static void
exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
{
	char		recoveryPath[MAXPGPATH];
	char		xlogfname[MAXFNAMELEN];
	XLogSegNo	endLogSegNo;
	XLogSegNo	startLogSegNo;

	/* we always switch to a new timeline after archive recovery */
	Assert(endTLI != ThisTimeLineID);

	/*
	 * We are no longer in archive recovery state.
	 */
	InArchiveRecovery = false;

	/*
	 * Update min recovery point one last time.
	 */
	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

	/*
	 * If the ending log segment is still open, close it (to avoid problems on
	 * Windows with trying to rename or delete an open file).
	 */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}

	/*
	 * Calculate the last segment on the old timeline, and the first segment
	 * on the new timeline. If the switch happens in the middle of a segment,
	 * they are the same, but if the switch happens exactly at a segment
	 * boundary, startLogSegNo will be endLogSegNo + 1.
	 */
	XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
	XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);

	/*
	 * Initialize the starting WAL segment for the new timeline. If the switch
	 * happens in the middle of a segment, copy data from the last WAL segment
	 * of the old timeline up to the switch point, to the starting WAL segment
	 * on the new timeline.
	 */
	if (endLogSegNo == startLogSegNo)
	{
		/*
		 * Make a copy of the file on the new timeline.
		 *
		 * Writing WAL isn't allowed yet, so there are no locking
		 * considerations. But we should be just as tense as XLogFileInit to
		 * avoid emplacing a bogus file.
		 */
		XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
					 XLogSegmentOffset(endOfLog, wal_segment_size));
	}
	else
	{
		/*
		 * The switch happened at a segment boundary, so just create the next
		 * segment on the new timeline.
		 */
		bool		use_existent = true;
		int			fd;

		fd = XLogFileInit(startLogSegNo, &use_existent, true);

		if (close(fd))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not close log file %s: %m",
							XLogFileNameP(ThisTimeLineID, startLogSegNo))));
	}

	/*
	 * Let's just make real sure there are not .ready or .done flags posted
	 * for the new segment.
	 */
	XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
	XLogArchiveCleanup(xlogfname);

	/*
	 * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
	 * of it.
	 */
	snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
	unlink(recoveryPath);		/* ignore any error */

	/* Get rid of any remaining recovered timeline-history file, too */
	snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
	unlink(recoveryPath);		/* ignore any error */

	/*
	 * Rename the config file out of the way, so that we don't accidentally
	 * re-enter archive recovery mode in a subsequent crash.
	 */
	unlink(RECOVERY_COMMAND_DONE);
	durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);

	ereport(LOG,
			(errmsg("archive recovery complete")));
}
5705
5706
/*
5707
 * Extract timestamp from WAL record.
5708
 *
5709
 * If the record contains a timestamp, returns true, and saves the timestamp
5710
 * in *recordXtime. If the record type has no timestamp, returns false.
5711
 * Currently, only transaction commit/abort records and restore points contain
5712
 * timestamps.
5713
 */
5714
static bool
5715
getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5716
0
{
5717
0
  uint8   info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5718
0
  uint8   xact_info = info & XLOG_XACT_OPMASK;
5719
0
  uint8   rmid = XLogRecGetRmid(record);
5720
5721
0
  if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5722
0
  {
5723
0
    *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5724
0
    return true;
5725
0
  }
5726
0
  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5727
0
                 xact_info == XLOG_XACT_COMMIT_PREPARED))
5728
0
  {
5729
0
    *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5730
0
    return true;
5731
0
  }
5732
0
  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5733
0
                 xact_info == XLOG_XACT_ABORT_PREPARED))
5734
0
  {
5735
0
    *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5736
0
    return true;
5737
0
  }
5738
0
  return false;
5739
0
}
5740
5741
/*
5742
 * For point-in-time recovery, this function decides whether we want to
5743
 * stop applying the XLOG before the current record.
5744
 *
5745
 * Returns true if we are stopping, false otherwise. If stopping, some
5746
 * information is saved in recoveryStopXid et al for use in annotating the
5747
 * new timeline's history file.
5748
 */
5749
static bool
5750
recoveryStopsBefore(XLogReaderState *record)
5751
2
{
5752
2
  bool    stopsHere = false;
5753
2
  uint8   xact_info;
5754
2
  bool    isCommit;
5755
2
  TimestampTz recordXtime = 0;
5756
2
  TransactionId recordXid;
5757
5758
  /* Check if we should stop as soon as reaching consistency */
5759
2
  if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && 
reachedConsistency0
)
5760
0
  {
5761
0
    ereport(LOG,
5762
0
        (errmsg("recovery stopping after reaching consistency")));
5763
5764
0
    recoveryStopAfter = false;
5765
0
    recoveryStopXid = InvalidTransactionId;
5766
0
    recoveryStopLSN = InvalidXLogRecPtr;
5767
0
    recoveryStopTime = 0;
5768
0
    recoveryStopName[0] = '\0';
5769
0
    return true;
5770
0
  }
5771
5772
  /* Check if target LSN has been reached */
5773
2
  if (recoveryTarget == RECOVERY_TARGET_LSN &&
5774
2
    
!recoveryTargetInclusive0
&&
5775
2
    
record->ReadRecPtr >= recoveryTargetLSN0
)
5776
0
  {
5777
0
    recoveryStopAfter = false;
5778
0
    recoveryStopXid = InvalidTransactionId;
5779
0
    recoveryStopLSN = record->ReadRecPtr;
5780
0
    recoveryStopTime = 0;
5781
0
    recoveryStopName[0] = '\0';
5782
0
    ereport(LOG,
5783
0
        (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
5784
0
            (uint32) (recoveryStopLSN >> 32),
5785
0
            (uint32) recoveryStopLSN)));
5786
0
    return true;
5787
0
  }
5788
5789
  /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5790
2
  if (XLogRecGetRmid(record) != RM_XACT_ID)
5791
2
    return false;
5792
5793
0
  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5794
5795
0
  if (xact_info == XLOG_XACT_COMMIT)
5796
0
  {
5797
0
    isCommit = true;
5798
0
    recordXid = XLogRecGetXid(record);
5799
0
  }
5800
0
  else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5801
0
  {
5802
0
    xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5803
0
    xl_xact_parsed_commit parsed;
5804
5805
0
    isCommit = true;
5806
0
    ParseCommitRecord(XLogRecGetInfo(record),
5807
0
              xlrec,
5808
0
              &parsed);
5809
0
    recordXid = parsed.twophase_xid;
5810
0
  }
5811
0
  else if (xact_info == XLOG_XACT_ABORT)
5812
0
  {
5813
0
    isCommit = false;
5814
0
    recordXid = XLogRecGetXid(record);
5815
0
  }
5816
0
  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5817
0
  {
5818
0
    xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5819
0
    xl_xact_parsed_abort parsed;
5820
5821
0
    isCommit = true;
5822
0
    ParseAbortRecord(XLogRecGetInfo(record),
5823
0
             xlrec,
5824
0
             &parsed);
5825
0
    recordXid = parsed.twophase_xid;
5826
0
  }
5827
0
  else
5828
0
    return false;
5829
5830
0
  if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5831
0
  {
5832
    /*
5833
     * There can be only one transaction end record with this exact
5834
     * transactionid
5835
     *
5836
     * when testing for an xid, we MUST test for equality only, since
5837
     * transactions are numbered in the order they start, not the order
5838
     * they complete. A higher numbered xid will complete before you about
5839
     * 50% of the time...
5840
     */
5841
0
    stopsHere = (recordXid == recoveryTargetXid);
5842
0
  }
5843
5844
0
  if (recoveryTarget == RECOVERY_TARGET_TIME &&
5845
0
    getRecordTimestamp(record, &recordXtime))
5846
0
  {
5847
    /*
5848
     * There can be many transactions that share the same commit time, so
5849
     * we stop after the last one, if we are inclusive, or stop at the
5850
     * first one if we are exclusive
5851
     */
5852
0
    if (recoveryTargetInclusive)
5853
0
      stopsHere = (recordXtime > recoveryTargetTime);
5854
0
    else
5855
0
      stopsHere = (recordXtime >= recoveryTargetTime);
5856
0
  }
5857
5858
0
  if (stopsHere)
5859
0
  {
5860
0
    recoveryStopAfter = false;
5861
0
    recoveryStopXid = recordXid;
5862
0
    recoveryStopTime = recordXtime;
5863
0
    recoveryStopLSN = InvalidXLogRecPtr;
5864
0
    recoveryStopName[0] = '\0';
5865
5866
0
    if (isCommit)
5867
0
    {
5868
0
      ereport(LOG,
5869
0
          (errmsg("recovery stopping before commit of transaction %u, time %s",
5870
0
              recoveryStopXid,
5871
0
              timestamptz_to_str(recoveryStopTime))));
5872
0
    }
5873
0
    else
5874
0
    {
5875
0
      ereport(LOG,
5876
0
          (errmsg("recovery stopping before abort of transaction %u, time %s",
5877
0
              recoveryStopXid,
5878
0
              timestamptz_to_str(recoveryStopTime))));
5879
0
    }
5880
0
  }
5881
5882
0
  return stopsHere;
5883
0
}
5884
5885
/*
5886
 * Same as recoveryStopsBefore, but called after applying the record.
5887
 *
5888
 * We also track the timestamp of the latest applied COMMIT/ABORT
5889
 * record in XLogCtl->recoveryLastXTime.
5890
 */
5891
static bool
5892
recoveryStopsAfter(XLogReaderState *record)
5893
2
{
5894
2
  uint8   info;
5895
2
  uint8   xact_info;
5896
2
  uint8   rmid;
5897
2
  TimestampTz recordXtime;
5898
5899
2
  info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5900
2
  rmid = XLogRecGetRmid(record);
5901
5902
  /*
5903
   * There can be many restore points that share the same name; we stop at
5904
   * the first one.
5905
   */
5906
2
  if (recoveryTarget == RECOVERY_TARGET_NAME &&
5907
2
    
rmid == RM_XLOG_ID0
&&
info == 0
XLOG_RESTORE_POINT0
)
5908
0
  {
5909
0
    xl_restore_point *recordRestorePointData;
5910
5911
0
    recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5912
5913
0
    if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5914
0
    {
5915
0
      recoveryStopAfter = true;
5916
0
      recoveryStopXid = InvalidTransactionId;
5917
0
      recoveryStopLSN = InvalidXLogRecPtr;
5918
0
      (void) getRecordTimestamp(record, &recoveryStopTime);
5919
0
      strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5920
5921
0
      ereport(LOG,
5922
0
          (errmsg("recovery stopping at restore point \"%s\", time %s",
5923
0
              recoveryStopName,
5924
0
              timestamptz_to_str(recoveryStopTime))));
5925
0
      return true;
5926
0
    }
5927
0
  }
5928
5929
  /* Check if the target LSN has been reached */
5930
2
  if (recoveryTarget == RECOVERY_TARGET_LSN &&
5931
2
    
recoveryTargetInclusive0
&&
5932
2
    
record->ReadRecPtr >= recoveryTargetLSN0
)
5933
0
  {
5934
0
    recoveryStopAfter = true;
5935
0
    recoveryStopXid = InvalidTransactionId;
5936
0
    recoveryStopLSN = record->ReadRecPtr;
5937
0
    recoveryStopTime = 0;
5938
0
    recoveryStopName[0] = '\0';
5939
0
    ereport(LOG,
5940
0
        (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
5941
0
            (uint32) (recoveryStopLSN >> 32),
5942
0
            (uint32) recoveryStopLSN)));
5943
0
    return true;
5944
0
  }
5945
5946
2
  if (rmid != RM_XACT_ID)
5947
2
    return false;
5948
5949
0
  xact_info = info & XLOG_XACT_OPMASK;
5950
5951
0
  if (xact_info == XLOG_XACT_COMMIT ||
5952
0
    xact_info == XLOG_XACT_COMMIT_PREPARED ||
5953
0
    xact_info == XLOG_XACT_ABORT ||
5954
0
    xact_info == XLOG_XACT_ABORT_PREPARED)
5955
0
  {
5956
0
    TransactionId recordXid;
5957
5958
    /* Update the last applied transaction timestamp */
5959
0
    if (getRecordTimestamp(record, &recordXtime))
5960
0
      SetLatestXTime(recordXtime);
5961
5962
    /* Extract the XID of the committed/aborted transaction */
5963
0
    if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5964
0
    {
5965
0
      xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5966
0
      xl_xact_parsed_commit parsed;
5967
5968
0
      ParseCommitRecord(XLogRecGetInfo(record),
5969
0
                xlrec,
5970
0
                &parsed);
5971
0
      recordXid = parsed.twophase_xid;
5972
0
    }
5973
0
    else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5974
0
    {
5975
0
      xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5976
0
      xl_xact_parsed_abort parsed;
5977
5978
0
      ParseAbortRecord(XLogRecGetInfo(record),
5979
0
               xlrec,
5980
0
               &parsed);
5981
0
      recordXid = parsed.twophase_xid;
5982
0
    }
5983
0
    else
5984
0
      recordXid = XLogRecGetXid(record);
5985
5986
    /*
5987
     * There can be only one transaction end record with this exact
5988
     * transactionid
5989
     *
5990
     * when testing for an xid, we MUST test for equality only, since
5991
     * transactions are numbered in the order they start, not the order
5992
     * they complete. A higher numbered xid will complete before you about
5993
     * 50% of the time...
5994
     */
5995
0
    if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5996
0
      recordXid == recoveryTargetXid)
5997
0
    {
5998
0
      recoveryStopAfter = true;
5999
0
      recoveryStopXid = recordXid;
6000
0
      recoveryStopTime = recordXtime;
6001
0
      recoveryStopLSN = InvalidXLogRecPtr;
6002
0
      recoveryStopName[0] = '\0';
6003
6004
0
      if (xact_info == XLOG_XACT_COMMIT ||
6005
0
        xact_info == XLOG_XACT_COMMIT_PREPARED)
6006
0
      {
6007
0
        ereport(LOG,
6008
0
            (errmsg("recovery stopping after commit of transaction %u, time %s",
6009
0
                recoveryStopXid,
6010
0
                timestamptz_to_str(recoveryStopTime))));
6011
0
      }
6012
0
      else if (xact_info == XLOG_XACT_ABORT ||
6013
0
           xact_info == XLOG_XACT_ABORT_PREPARED)
6014
0
      {
6015
0
        ereport(LOG,
6016
0
            (errmsg("recovery stopping after abort of transaction %u, time %s",
6017
0
                recoveryStopXid,
6018
0
                timestamptz_to_str(recoveryStopTime))));
6019
0
      }
6020
0
      return true;
6021
0
    }
6022
0
  }
6023
6024
  /* Check if we should stop as soon as reaching consistency */
6025
0
  if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
6026
0
  {
6027
0
    ereport(LOG,
6028
0
        (errmsg("recovery stopping after reaching consistency")));
6029
6030
0
    recoveryStopAfter = true;
6031
0
    recoveryStopXid = InvalidTransactionId;
6032
0
    recoveryStopTime = 0;
6033
0
    recoveryStopLSN = InvalidXLogRecPtr;
6034
0
    recoveryStopName[0] = '\0';
6035
0
    return true;
6036
0
  }
6037
6038
0
  return false;
6039
0
}
6040
6041
/*
6042
 * Wait until shared recoveryPause flag is cleared.
6043
 *
6044
 * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
6045
 * Probably not worth the trouble though.  This state shouldn't be one that
6046
 * anyone cares about server power consumption in.
6047
 */
6048
static void
6049
recoveryPausesHere(void)
6050
0
{
6051
  /* Don't pause unless users can connect! */
6052
0
  if (!LocalHotStandbyActive)
6053
0
    return;
6054
6055
0
  ereport(LOG,
6056
0
      (errmsg("recovery has paused"),
6057
0
       errhint("Execute pg_wal_replay_resume() to continue.")));
6058
6059
0
  while (RecoveryIsPaused())
6060
0
  {
6061
0
    pg_usleep(1000000L);  /* 1000 ms */
6062
0
    HandleStartupProcInterrupts();
6063
0
  }
6064
0
}
6065
6066
bool
6067
RecoveryIsPaused(void)
6068
0
{
6069
0
  bool    recoveryPause;
6070
6071
0
  SpinLockAcquire(&XLogCtl->info_lck);
6072
0
  recoveryPause = XLogCtl->recoveryPause;
6073
0
  SpinLockRelease(&XLogCtl->info_lck);
6074
6075
0
  return recoveryPause;
6076
0
}
6077
6078
/*
 * Set or clear the shared recovery-pause flag, under XLogCtl->info_lck so
 * that the startup process and backends see a consistent value.
 */
void
SetRecoveryPause(bool recoveryPause)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->recoveryPause = recoveryPause;
	SpinLockRelease(&XLogCtl->info_lck);
}
6085
6086
/*
 * When recovery_min_apply_delay is set, we wait long enough to make sure
 * certain record types are applied at least that interval behind the master.
 *
 * Returns true if we waited.
 *
 * Note that the delay is calculated between the WAL record log time and
 * the current time on standby. We would prefer to keep track of when this
 * standby received each WAL record, which would allow a more consistent
 * approach and one not affected by time synchronisation issues, but that
 * is significantly more effort and complexity for little actual gain in
 * usability.
 */
static bool
recoveryApplyDelay(XLogReaderState *record)
{
	uint8		xact_info;
	TimestampTz xtime;
	long		secs;
	int			microsecs;

	/* nothing to do if no delay configured */
	if (recovery_min_apply_delay <= 0)
		return false;

	/* no delay is applied on a database not yet consistent */
	if (!reachedConsistency)
		return false;

	/*
	 * Is it a COMMIT record?
	 *
	 * We deliberately choose not to delay aborts since they have no effect on
	 * MVCC. We already allow replay of records that don't have a timestamp,
	 * so there is already opportunity for issues caused by early conflicts on
	 * standbys.
	 */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	if (xact_info != XLOG_XACT_COMMIT &&
		xact_info != XLOG_XACT_COMMIT_PREPARED)
		return false;

	/* Records without a timestamp cannot be delayed; apply immediately. */
	if (!getRecordTimestamp(record, &xtime))
		return false;

	recoveryDelayUntilTime =
		TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

	/*
	 * Exit without arming the latch if it's already past time to apply this
	 * record
	 */
	TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
						&secs, &microsecs);
	if (secs <= 0 && microsecs <= 0)
		return false;

	while (true)
	{
		ResetLatch(&XLogCtl->recoveryWakeupLatch);

		/* might change the trigger file's location */
		HandleStartupProcInterrupts();

		/* Promotion cancels any remaining delay. */
		if (CheckForStandbyTrigger())
			break;

		/*
		 * Wait for difference between GetCurrentTimestamp() and
		 * recoveryDelayUntilTime
		 */
		TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
							&secs, &microsecs);

		/* NB: We're ignoring waits below min_apply_delay's resolution. */
		if (secs <= 0 && microsecs / 1000 <= 0)
			break;

		elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
			 secs, microsecs / 1000);

		/* Sleep until the deadline, a wakeup from the WAL receiver, or death. */
		WaitLatch(&XLogCtl->recoveryWakeupLatch,
				  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
				  secs * 1000L + microsecs / 1000,
				  WAIT_EVENT_RECOVERY_APPLY_DELAY);
	}
	return true;
}
6178
6179
/*
 * Save timestamp of latest processed commit/abort record.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by processes other than the startup process.  Note in particular
 * that CreateRestartPoint is executed in the checkpointer.
 */
static void
SetLatestXTime(TimestampTz xtime)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->recoveryLastXTime = xtime;
	SpinLockRelease(&XLogCtl->info_lck);
}
6193
6194
/*
6195
 * Fetch timestamp of latest processed commit/abort record.
6196
 */
6197
TimestampTz
6198
GetLatestXTime(void)
6199
1
{
6200
1
  TimestampTz xtime;
6201
6202
1
  SpinLockAcquire(&XLogCtl->info_lck);
6203
1
  xtime = XLogCtl->recoveryLastXTime;
6204
1
  SpinLockRelease(&XLogCtl->info_lck);
6205
6206
1
  return xtime;
6207
1
}
6208
6209
/*
 * Save timestamp of the next chunk of WAL records to apply.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by all backends.
 */
static void
SetCurrentChunkStartTime(TimestampTz xtime)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->currentChunkStartTime = xtime;
	SpinLockRelease(&XLogCtl->info_lck);
}
6222
6223
/*
6224
 * Fetch timestamp of latest processed commit/abort record.
6225
 * Startup process maintains an accurate local copy in XLogReceiptTime
6226
 */
6227
TimestampTz
6228
GetCurrentChunkReplayStartTime(void)
6229
0
{
6230
0
  TimestampTz xtime;
6231
6232
0
  SpinLockAcquire(&XLogCtl->info_lck);
6233
0
  xtime = XLogCtl->currentChunkStartTime;
6234
0
  SpinLockRelease(&XLogCtl->info_lck);
6235
6236
0
  return xtime;
6237
0
}
6238
6239
/*
 * Returns time of receipt of current chunk of XLOG data, as well as
 * whether it was received from streaming replication or from archives.
 *
 * Out parameters:
 *	*rtime		- receipt time of the current WAL chunk
 *	*fromStream	- true if it arrived via streaming replication
 */
void
GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
{
	/*
	 * This must be executed in the startup process, since we don't export the
	 * relevant state to shared memory.
	 */
	Assert(InRecovery);

	*rtime = XLogReceiptTime;
	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
}
6255
6256
/*
 * Note that text field supplied is a parameter name and does not require
 * translation
 *
 * Raises ERROR if the local setting 'currValue' is below 'minValue', the
 * value recorded by the master in pg_control; hot standby cannot run with
 * a smaller setting than the master used.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						param_name, \
						currValue, \
						minValue))); \
} while(0)
6272
6273
/*
6274
 * Check to see if required parameters are set high enough on this server
6275
 * for various aspects of recovery operation.
6276
 *
6277
 * Note that all the parameters which this function tests need to be
6278
 * listed in Administrator's Overview section in high-availability.sgml.
6279
 * If you change them, don't forget to update the list.
6280
 */
6281
static void
6282
CheckRequiredParameterValues(void)
6283
5
{
6284
  /*
6285
   * For archive recovery, the WAL must be generated with at least 'replica'
6286
   * wal_level.
6287
   */
6288
5
  if (ArchiveRecoveryRequested && 
ControlFile->wal_level == WAL_LEVEL_MINIMAL0
)
6289
0
  {
6290
0
    ereport(WARNING,
6291
0
        (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6292
0
         errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6293
0
  }
6294
6295
  /*
6296
   * For Hot Standby, the WAL must be generated with 'replica' mode, and we
6297
   * must have at least as many backend slots as the primary.
6298
   */
6299
5
  if (ArchiveRecoveryRequested && 
EnableHotStandby0
)
6300
0
  {
6301
0
    if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
6302
0
      ereport(ERROR,
6303
0
          (errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
6304
0
           errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));
6305
6306
    /* We ignore autovacuum_max_workers when we make this test. */
6307
0
    RecoveryRequiresIntParameter("max_connections",
6308
0
                   MaxConnections,
6309
0
                   ControlFile->MaxConnections);
6310
0
    RecoveryRequiresIntParameter("max_worker_processes",
6311
0
                   max_worker_processes,
6312
0
                   ControlFile->max_worker_processes);
6313
0
    RecoveryRequiresIntParameter("max_prepared_transactions",
6314
0
                   max_prepared_xacts,
6315
0
                   ControlFile->max_prepared_xacts);
6316
0
    RecoveryRequiresIntParameter("max_locks_per_transaction",
6317
0
                   max_locks_per_xact,
6318
0
                   ControlFile->max_locks_per_xact);
6319
0
  }
6320
5
}
6321
6322
/*
6323
 * This must be called ONCE during postmaster or standalone-backend startup
6324
 */
6325
void
6326
StartupXLOG(void)
6327
3.99k
{
6328
3.99k
  XLogCtlInsert *Insert;
6329
3.99k
  CheckPoint  checkPoint;
6330
3.99k
  bool    wasShutdown;
6331
3.99k
  bool    reachedStopPoint = false;
6332
3.99k
  bool    haveBackupLabel = false;
6333
3.99k
  bool    haveTblspcMap = false;
6334
3.99k
  XLogRecPtr  RecPtr,
6335
3.99k
        checkPointLoc,
6336
3.99k
        EndOfLog;
6337
3.99k
  TimeLineID  EndOfLogTLI;
6338
3.99k
  TimeLineID  PrevTimeLineID;
6339
3.99k
  XLogRecord *record;
6340
3.99k
  TransactionId oldestActiveXID;
6341
3.99k
  bool    backupEndRequired = false;
6342
3.99k
  bool    backupFromStandby = false;
6343
3.99k
  DBState   dbstate_at_startup;
6344
3.99k
  XLogReaderState *xlogreader;
6345
3.99k
  XLogPageReadPrivate private;
6346
3.99k
  bool    fast_promoted = false;
6347
3.99k
  struct stat st;
6348
6349
  /*
6350
   * Verify XLOG status looks valid.
6351
   */
6352
3.99k
  if (ControlFile->state < DB_SHUTDOWNED ||
6353
3.99k
    ControlFile->state > DB_IN_PRODUCTION ||
6354
3.99k
    !XRecOffIsValid(ControlFile->checkPoint))
6355
3.99k
    ereport(FATAL,
6356
3.99k
        (errmsg("control file contains invalid data")));
6357
6358
3.99k
  if (ControlFile->state == DB_SHUTDOWNED)
6359
3.99k
  {
6360
    /* This is the expected case, so don't be chatty in standalone mode */
6361
3.99k
    ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6362
3.99k
        (errmsg("database system was shut down at %s",
6363
3.99k
            str_time(ControlFile->time))));
6364
3.99k
  }
6365
5
  else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6366
5
    ereport(LOG,
6367
5
        (errmsg("database system was shut down in recovery at %s",
6368
5
            str_time(ControlFile->time))));
6369
5
  else if (ControlFile->state == DB_SHUTDOWNING)
6370
5
    ereport(LOG,
6371
5
        (errmsg("database system shutdown was interrupted; last known up at %s",
6372
5
            str_time(ControlFile->time))));
6373
5
  else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6374
5
    ereport(LOG,
6375
5
        (errmsg("database system was interrupted while in recovery at %s",
6376
5
            str_time(ControlFile->time)),
6377
5
         errhint("This probably means that some data is corrupted and"
6378
5
             " you will have to use the last backup for recovery.")));
6379
5
  else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6380
5
    ereport(LOG,
6381
5
        (errmsg("database system was interrupted while in recovery at log time %s",
6382
5
            str_time(ControlFile->checkPointCopy.time)),
6383
5
         errhint("If this has occurred more than once some data might be corrupted"
6384
5
             " and you might need to choose an earlier recovery target.")));
6385
5
  else if (ControlFile->state == DB_IN_PRODUCTION)
6386
5
    ereport(LOG,
6387
3.99k
        (errmsg("database system was interrupted; last known up at %s",
6388
3.99k
            str_time(ControlFile->time))));
6389
6390
  /* This is just to allow attaching to startup process with a debugger */
6391
#ifdef XLOG_REPLAY_DELAY
6392
  if (ControlFile->state != DB_SHUTDOWNED)
6393
    pg_usleep(60000000L);
6394
#endif
6395
6396
  /*
6397
   * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
6398
   * someone has performed a copy for PITR, these directories may have been
6399
   * excluded and need to be re-created.
6400
   */
6401
3.99k
  ValidateXLOGDirectoryStructure();
6402
6403
  /*
6404
   * If we previously crashed, there might be data which we had written,
6405
   * intending to fsync it, but which we had not actually fsync'd yet.
6406
   * Therefore, a power failure in the near future might cause earlier
6407
   * unflushed writes to be lost, even though more recent data written to
6408
   * disk from here on would be persisted.  To avoid that, fsync the entire
6409
   * data directory.
6410
   */
6411
3.99k
  if (ControlFile->state != DB_SHUTDOWNED &&
6412
3.99k
    
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY5
)
6413
5
    SyncDataDirectory();
6414
6415
  /*
6416
   * Initialize on the assumption we want to recover to the latest timeline
6417
   * that's active according to pg_control.
6418
   */
6419
3.99k
  if (ControlFile->minRecoveryPointTLI >
6420
3.99k
    ControlFile->checkPointCopy.ThisTimeLineID)
6421
0
    recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6422
3.99k
  else
6423
3.99k
    recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6424
6425
  /*
6426
   * Check for recovery control file, and if so set up state for offline
6427
   * recovery
6428
   */
6429
3.99k
  readRecoveryCommandFile();
6430
6431
  /*
6432
   * Save archive_cleanup_command in shared memory so that other processes
6433
   * can see it.
6434
   */
6435
3.99k
  strlcpy(XLogCtl->archiveCleanupCommand,
6436
3.99k
      archiveCleanupCommand ? archiveCleanupCommand : "",
6437
3.99k
      sizeof(XLogCtl->archiveCleanupCommand));
6438
6439
3.99k
  if (ArchiveRecoveryRequested)
6440
0
  {
6441
0
    if (StandbyModeRequested)
6442
0
      ereport(LOG,
6443
0
          (errmsg("entering standby mode")));
6444
0
    else if (recoveryTarget == RECOVERY_TARGET_XID)
6445
0
      ereport(LOG,
6446
0
          (errmsg("starting point-in-time recovery to XID %u",
6447
0
              recoveryTargetXid)));
6448
0
    else if (recoveryTarget == RECOVERY_TARGET_TIME)
6449
0
      ereport(LOG,
6450
0
          (errmsg("starting point-in-time recovery to %s",
6451
0
              timestamptz_to_str(recoveryTargetTime))));
6452
0
    else if (recoveryTarget == RECOVERY_TARGET_NAME)
6453
0
      ereport(LOG,
6454
0
          (errmsg("starting point-in-time recovery to \"%s\"",
6455
0
              recoveryTargetName)));
6456
0
    else if (recoveryTarget == RECOVERY_TARGET_LSN)
6457
0
      ereport(LOG,
6458
0
          (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
6459
0
              (uint32) (recoveryTargetLSN >> 32),
6460
0
              (uint32) recoveryTargetLSN)));
6461
0
    else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6462
0
      ereport(LOG,
6463
0
          (errmsg("starting point-in-time recovery to earliest consistent point")));
6464
0
    else
6465
0
      ereport(LOG,
6466
0
          (errmsg("starting archive recovery")));
6467
0
  }
6468
6469
  /*
6470
   * Take ownership of the wakeup latch if we're going to sleep during
6471
   * recovery.
6472
   */
6473
3.99k
  if (StandbyModeRequested)
6474
0
    OwnLatch(&XLogCtl->recoveryWakeupLatch);
6475
6476
  /* Set up XLOG reader facility */
6477
3.99k
  MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6478
3.99k
  xlogreader = XLogReaderAllocate(wal_segment_size, &XLogPageRead, &private);
6479
3.99k
  if (!xlogreader)
6480
3.99k
    ereport(ERROR,
6481
3.99k
        (errcode(ERRCODE_OUT_OF_MEMORY),
6482
3.99k
         errmsg("out of memory"),
6483
3.99k
         errdetail("Failed while allocating a WAL reading processor.")));
6484
3.99k
  xlogreader->system_identifier = ControlFile->system_identifier;
6485
6486
  /*
6487
   * Allocate two page buffers dedicated to WAL consistency checks.  We do
6488
   * it this way, rather than just making static arrays, for two reasons:
6489
   * (1) no need to waste the storage in most instantiations of the backend;
6490
   * (2) a static char array isn't guaranteed to have any particular
6491
   * alignment, whereas palloc() will provide MAXALIGN'd storage.
6492
   */
6493
3.99k
  replay_image_masked = (char *) palloc(BLCKSZ);
6494
3.99k
  master_image_masked = (char *) palloc(BLCKSZ);
6495
6496
3.99k
  if (read_backup_label(&checkPointLoc, &backupEndRequired,
6497
3.99k
              &backupFromStandby))
6498
0
  {
6499
0
    List     *tablespaces = NIL;
6500
6501
    /*
6502
     * Archive recovery was requested, and thanks to the backup label
6503
     * file, we know how far we need to replay to reach consistency. Enter
6504
     * archive recovery directly.
6505
     */
6506
0
    InArchiveRecovery = true;
6507
0
    if (StandbyModeRequested)
6508
0
      StandbyMode = true;
6509
6510
    /*
6511
     * When a backup_label file is present, we want to roll forward from
6512
     * the checkpoint it identifies, rather than using pg_control.
6513
     */
6514
0
    record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6515
0
    if (record != NULL)
6516
0
    {
6517
0
      memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6518
0
      wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6519
0
      ereport(DEBUG1,
6520
0
          (errmsg("checkpoint record is at %X/%X",
6521
0
              (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6522
0
      InRecovery = true;  /* force recovery even if SHUTDOWNED */
6523
6524
      /*
6525
       * Make sure that REDO location exists. This may not be the case
6526
       * if there was a crash during an online backup, which left a
6527
       * backup_label around that references a WAL segment that's
6528
       * already been archived.
6529
       */
6530
0
      if (checkPoint.redo < checkPointLoc)
6531
0
      {
6532
0
        if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6533
0
          ereport(FATAL,
6534
0
              (errmsg("could not find redo location referenced by checkpoint record"),
6535
0
               errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6536
0
      }
6537
0
    }
6538
0
    else
6539
0
    {
6540
0
      ereport(FATAL,
6541
0
          (errmsg("could not locate required checkpoint record"),
6542
0
           errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6543
0
      wasShutdown = false;  /* keep compiler quiet */
6544
0
    }
6545
6546
    /* read the tablespace_map file if present and create symlinks. */
6547
0
    if (read_tablespace_map(&tablespaces))
6548
0
    {
6549
0
      ListCell   *lc;
6550
6551
0
      foreach(lc, tablespaces)
6552
0
      {
6553
0
        tablespaceinfo *ti = lfirst(lc);
6554
0
        char     *linkloc;
6555
6556
0
        linkloc = psprintf("pg_tblspc/%s", ti->oid);
6557
6558
        /*
6559
         * Remove the existing symlink if any and Create the symlink
6560
         * under PGDATA.
6561
         */
6562
0
        remove_tablespace_symlink(linkloc);
6563
6564
0
        if (symlink(ti->path, linkloc) < 0)
6565
0
          ereport(ERROR,
6566
0
              (errcode_for_file_access(),
6567
0
               errmsg("could not create symbolic link \"%s\": %m",
6568
0
                  linkloc)));
6569
6570
0
        pfree(ti->oid);
6571
0
        pfree(ti->path);
6572
0
        pfree(ti);
6573
0
      }
6574
6575
      /* set flag to delete it later */
6576
0
      haveTblspcMap = true;
6577
0
    }
6578
6579
    /* set flag to delete it later */
6580
0
    haveBackupLabel = true;
6581
0
  }
6582
3.99k
  else
6583
3.99k
  {
6584
    /*
6585
     * If tablespace_map file is present without backup_label file, there
6586
     * is no use of such file.  There is no harm in retaining it, but it
6587
     * is better to get rid of the map file so that we don't have any
6588
     * redundant file in data directory and it will avoid any sort of
6589
     * confusion.  It seems prudent though to just rename the file out of
6590
     * the way rather than delete it completely, also we ignore any error
6591
     * that occurs in rename operation as even if map file is present
6592
     * without backup_label file, it is harmless.
6593
     */
6594
3.99k
    if (stat(TABLESPACE_MAP, &st) == 0)
6595
0
    {
6596
0
      unlink(TABLESPACE_MAP_OLD);
6597
0
      if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6598
0
        ereport(LOG,
6599
0
            (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6600
0
                TABLESPACE_MAP, BACKUP_LABEL_FILE),
6601
0
             errdetail("File \"%s\" was renamed to \"%s\".",
6602
0
                   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6603
0
      else
6604
0
        ereport(LOG,
6605
0
            (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6606
0
                TABLESPACE_MAP, BACKUP_LABEL_FILE),
6607
0
             errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6608
0
                   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6609
0
    }
6610
6611
    /*
6612
     * It's possible that archive recovery was requested, but we don't
6613
     * know how far we need to replay the WAL before we reach consistency.
6614
     * This can happen for example if a base backup is taken from a
6615
     * running server using an atomic filesystem snapshot, without calling
6616
     * pg_start/stop_backup. Or if you just kill a running master server
6617
     * and put it into archive recovery by creating a recovery.conf file.
6618
     *
6619
     * Our strategy in that case is to perform crash recovery first,
6620
     * replaying all the WAL present in pg_wal, and only enter archive
6621
     * recovery after that.
6622
     *
6623
     * But usually we already know how far we need to replay the WAL (up
6624
     * to minRecoveryPoint, up to backupEndPoint, or until we see an
6625
     * end-of-backup record), and we can enter archive recovery directly.
6626
     */
6627
3.99k
    if (ArchiveRecoveryRequested &&
6628
3.99k
      
(0
ControlFile->minRecoveryPoint != 0
InvalidXLogRecPtr0
||
6629
0
       ControlFile->backupEndRequired ||
6630
0
       ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6631
0
       ControlFile->state == DB_SHUTDOWNED))
6632
0
    {
6633
0
      InArchiveRecovery = true;
6634
0
      if (StandbyModeRequested)
6635
0
        StandbyMode = true;
6636
0
    }
6637
6638
    /* Get the last valid checkpoint record. */
6639
3.99k
    checkPointLoc = ControlFile->checkPoint;
6640
3.99k
    RedoStartLSN = ControlFile->checkPointCopy.redo;
6641
3.99k
    record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6642
3.99k
    if (record != NULL)
6643
3.99k
    {
6644
3.99k
      ereport(DEBUG1,
6645
3.99k
          (errmsg("checkpoint record is at %X/%X",
6646
3.99k
              (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6647
3.99k
    }
6648
0
    else
6649
0
    {
6650
      /*
6651
       * We used to attempt to go back to a secondary checkpoint record
6652
       * here, but only when not in standby_mode. We now just fail if we
6653
       * can't read the last checkpoint because this allows us to
6654
       * simplify processing around checkpoints.
6655
       */
6656
0
      ereport(PANIC,
6657
0
          (errmsg("could not locate a valid checkpoint record")));
6658
0
    }
6659
3.99k
    memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6660
3.99k
    wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6661
3.99k
  }
6662
6663
  /*
6664
   * Clear out any old relcache cache files.  This is *necessary* if we do
6665
   * any WAL replay, since that would probably result in the cache files
6666
   * being out of sync with database reality.  In theory we could leave them
6667
   * in place if the database had been cleanly shut down, but it seems
6668
   * safest to just remove them always and let them be rebuilt during the
6669
   * first backend startup.  These files needs to be removed from all
6670
   * directories including pg_tblspc, however the symlinks are created only
6671
   * after reading tablespace_map file in case of archive recovery from
6672
   * backup, so needs to clear old relcache files here after creating
6673
   * symlinks.
6674
   */
6675
3.99k
  RelationCacheInitFileRemove();
6676
6677
  /*
6678
   * If the location of the checkpoint record is not on the expected
6679
   * timeline in the history of the requested timeline, we cannot proceed:
6680
   * the backup is not part of the history of the requested timeline.
6681
   */
6682
3.99k
  Assert(expectedTLEs);   /* was initialized by reading checkpoint
6683
                 * record */
6684
3.99k
  if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6685
3.99k
    checkPoint.ThisTimeLineID)
6686
0
  {
6687
0
    XLogRecPtr  switchpoint;
6688
6689
    /*
6690
     * tliSwitchPoint will throw an error if the checkpoint's timeline is
6691
     * not in expectedTLEs at all.
6692
     */
6693
0
    switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6694
0
    ereport(FATAL,
6695
0
        (errmsg("requested timeline %u is not a child of this server's history",
6696
0
            recoveryTargetTLI),
6697
0
         errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6698
0
               (uint32) (ControlFile->checkPoint >> 32),
6699
0
               (uint32) ControlFile->checkPoint,
6700
0
               ControlFile->checkPointCopy.ThisTimeLineID,
6701
0
               (uint32) (switchpoint >> 32),
6702
0
               (uint32) switchpoint)));
6703
0
  }
6704
6705
  /*
6706
   * The min recovery point should be part of the requested timeline's
6707
   * history, too.
6708
   */
6709
3.99k
  if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6710
3.99k
    tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6711
0
    ControlFile->minRecoveryPointTLI)
6712
3.99k
    ereport(FATAL,
6713
3.99k
        (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6714
3.99k
            recoveryTargetTLI,
6715
3.99k
            (uint32) (ControlFile->minRecoveryPoint >> 32),
6716
3.99k
            (uint32) ControlFile->minRecoveryPoint,
6717
3.99k
            ControlFile->minRecoveryPointTLI)));
6718
6719
3.99k
  LastRec = RecPtr = checkPointLoc;
6720
6721
3.99k
  ereport(DEBUG1,
6722
3.99k
      (errmsg_internal("redo record is at %X/%X; shutdown %s",
6723
3.99k
               (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6724
3.99k
               wasShutdown ? "true" : "false")));
6725
3.99k
  ereport(DEBUG1,
6726
3.99k
      (errmsg_internal("next transaction ID: %u:%u; next OID: %u",
6727
3.99k
               checkPoint.nextXidEpoch, checkPoint.nextXid,
6728
3.99k
               checkPoint.nextOid)));
6729
3.99k
  ereport(DEBUG1,
6730
3.99k
      (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6731
3.99k
               checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6732
3.99k
  ereport(DEBUG1,
6733
3.99k
      (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6734
3.99k
               checkPoint.oldestXid, checkPoint.oldestXidDB)));
6735
3.99k
  ereport(DEBUG1,
6736
3.99k
      (errmsg_internal("oldest MultiXactId: %u, in database %u",
6737
3.99k
               checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6738
3.99k
  ereport(DEBUG1,
6739
3.99k
      (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6740
3.99k
               checkPoint.oldestCommitTsXid,
6741
3.99k
               checkPoint.newestCommitTsXid)));
6742
3.99k
  if (!TransactionIdIsNormal(checkPoint.nextXid))
6743
3.99k
    ereport(PANIC,
6744
3.99k
        (errmsg("invalid next transaction ID")));
6745
6746
  /* initialize shared memory variables from the checkpoint record */
6747
3.99k
  ShmemVariableCache->nextXid = checkPoint.nextXid;
6748
3.99k
  ShmemVariableCache->nextOid = checkPoint.nextOid;
6749
3.99k
  ShmemVariableCache->oidCount = 0;
6750
3.99k
  MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6751
3.99k
  AdvanceOldestClogXid(checkPoint.oldestXid);
6752
3.99k
  SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6753
3.99k
  SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6754
3.99k
  SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6755
3.99k
           checkPoint.newestCommitTsXid);
6756
3.99k
  XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6757
3.99k
  XLogCtl->ckptXid = checkPoint.nextXid;
6758
6759
  /*
6760
   * Initialize replication slots, before there's a chance to remove
6761
   * required resources.
6762
   */
6763
3.99k
  StartupReplicationSlots();
6764
6765
  /*
6766
   * Startup logical state, needs to be setup now so we have proper data
6767
   * during crash recovery.
6768
   */
6769
3.99k
  StartupReorderBuffer();
6770
6771
  /*
6772
   * Startup MultiXact. We need to do this early to be able to replay
6773
   * truncations.
6774
   */
6775
3.99k
  StartupMultiXact();
6776
6777
  /*
6778
   * Ditto for commit timestamps.  Activate the facility if the setting is
6779
   * enabled in the control file, as there should be no tracking of commit
6780
   * timestamps done when the setting was disabled.  This facility can be
6781
   * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
6782
   */
6783
3.99k
  if (ControlFile->track_commit_timestamp)
6784
0
    StartupCommitTs();
6785
6786
  /*
6787
   * Recover knowledge about replay progress of known replication partners.
6788
   */
6789
3.99k
  StartupReplicationOrigin();
6790
6791
  /*
6792
   * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6793
   * control file. On recovery, all unlogged relations are blown away, so
6794
   * the unlogged LSN counter can be reset too.
6795
   */
6796
3.99k
  if (ControlFile->state == DB_SHUTDOWNED)
6797
3.99k
    XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6798
5
  else
6799
5
    XLogCtl->unloggedLSN = 1;
6800
6801
  /*
6802
   * We must replay WAL entries using the same TimeLineID they were created
6803
   * under, so temporarily adopt the TLI indicated by the checkpoint (see
6804
   * also xlog_redo()).
6805
   */
6806
3.99k
  ThisTimeLineID = checkPoint.ThisTimeLineID;
6807
6808
  /*
6809
   * Copy any missing timeline history files between 'now' and the recovery
6810
   * target timeline from archive to pg_wal. While we don't need those files
6811
   * ourselves - the history file of the recovery target timeline covers all
6812
   * the previous timelines in the history too - a cascading standby server
6813
   * might be interested in them. Or, if you archive the WAL from this
6814
   * server to a different archive than the master, it'd be good for all the
6815
   * history files to get archived there after failover, so that you can use
6816
   * one of the old timelines as a PITR target. Timeline history files are
6817
   * small, so it's better to copy them unnecessarily than not copy them and
6818
   * regret later.
6819
   */
6820
3.99k
  restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6821
6822
  /*
6823
   * Before running in recovery, scan pg_twophase and fill in its status to
6824
   * be able to work on entries generated by redo.  Doing a scan before
6825
   * taking any recovery action has the merit to discard any 2PC files that
6826
   * are newer than the first record to replay, saving from any conflicts at
6827
   * replay.  This avoids as well any subsequent scans when doing recovery
6828
   * of the on-disk two-phase data.
6829
   */
6830
3.99k
  restoreTwoPhaseData();
6831
6832
3.99k
  lastFullPageWrites = checkPoint.fullPageWrites;
6833
6834
3.99k
  RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6835
3.99k
  doPageWrites = lastFullPageWrites;
6836
6837
3.99k
  if (RecPtr < checkPoint.redo)
6838
3.99k
    ereport(PANIC,
6839
3.99k
        (errmsg("invalid redo in checkpoint record")));
6840
6841
  /*
6842
   * Check whether we need to force recovery from WAL.  If it appears to
6843
   * have been a clean shutdown and we did not have a recovery.conf file,
6844
   * then assume no recovery needed.
6845
   */
6846
3.99k
  if (checkPoint.redo < RecPtr)
6847
1
  {
6848
1
    if (wasShutdown)
6849
1
      ereport(PANIC,
6850
1
          (errmsg("invalid redo record in shutdown checkpoint")));
6851
1
    InRecovery = true;
6852
1
  }
6853
3.99k
  else if (ControlFile->state != DB_SHUTDOWNED)
6854
4
    InRecovery = true;
6855
3.99k
  else if (ArchiveRecoveryRequested)
6856
0
  {
6857
    /* force recovery due to presence of recovery.conf */
6858
0
    InRecovery = true;
6859
0
  }
6860
6861
  /* REDO */
6862
3.99k
  if (InRecovery)
6863
5
  {
6864
5
    int     rmid;
6865
6866
    /*
6867
     * Update pg_control to show that we are recovering and to show the
6868
     * selected checkpoint as the place we are starting from. We also mark
6869
     * pg_control with any minimum recovery stop point obtained from a
6870
     * backup history file.
6871
     */
6872
5
    dbstate_at_startup = ControlFile->state;
6873
5
    if (InArchiveRecovery)
6874
0
      ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6875
5
    else
6876
5
    {
6877
5
      ereport(LOG,
6878
5
          (errmsg("database system was not properly shut down; "
6879
5
              "automatic recovery in progress")));
6880
5
      if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6881
5
        ereport(LOG,
6882
5
            (errmsg("crash recovery starts in timeline %u "
6883
5
                "and has target timeline %u",
6884
5
                ControlFile->checkPointCopy.ThisTimeLineID,
6885
5
                recoveryTargetTLI)));
6886
5
      ControlFile->state = DB_IN_CRASH_RECOVERY;
6887
5
    }
6888
5
    ControlFile->checkPoint = checkPointLoc;
6889
5
    ControlFile->checkPointCopy = checkPoint;
6890
5
    if (InArchiveRecovery)
6891
0
    {
6892
      /* initialize minRecoveryPoint if not set yet */
6893
0
      if (ControlFile->minRecoveryPoint < checkPoint.redo)
6894
0
      {
6895
0
        ControlFile->minRecoveryPoint = checkPoint.redo;
6896
0
        ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6897
0
      }
6898
0
    }
6899
6900
    /*
6901
     * Set backupStartPoint if we're starting recovery from a base backup.
6902
     *
6903
     * Also set backupEndPoint and use minRecoveryPoint as the backup end
6904
     * location if we're starting recovery from a base backup which was
6905
     * taken from a standby. In this case, the database system status in
6906
     * pg_control must indicate that the database was already in recovery.
6907
     * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6908
     * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6909
     * before reaching this point; e.g. because restore_command or
6910
     * primary_conninfo were faulty.
6911
     *
6912
     * Any other state indicates that the backup somehow became corrupted
6913
     * and we can't sensibly continue with recovery.
6914
     */
6915
5
    if (haveBackupLabel)
6916
0
    {
6917
0
      ControlFile->backupStartPoint = checkPoint.redo;
6918
0
      ControlFile->backupEndRequired = backupEndRequired;
6919
6920
0
      if (backupFromStandby)
6921
0
      {
6922
0
        if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6923
0
          dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6924
0
          ereport(FATAL,
6925
0
              (errmsg("backup_label contains data inconsistent with control file"),
6926
0
               errhint("This means that the backup is corrupted and you will "
6927
0
                   "have to use another backup for recovery.")));
6928
0
        ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6929
0
      }
6930
0
    }
6931
5
    ControlFile->time = (pg_time_t) time(NULL);
6932
    /* No need to hold ControlFileLock yet, we aren't up far enough */
6933
5
    UpdateControlFile();
6934
6935
    /*
6936
     * Initialize our local copy of minRecoveryPoint.  When doing crash
6937
     * recovery we want to replay up to the end of WAL.  Particularly, in
6938
     * the case of a promoted standby minRecoveryPoint value in the
6939
     * control file is only updated after the first checkpoint.  However,
6940
     * if the instance crashes before the first post-recovery checkpoint
6941
     * is completed then recovery will use a stale location causing the
6942
     * startup process to think that there are still invalid page
6943
     * references when checking for data consistency.
6944
     */
6945
5
    if (InArchiveRecovery)
6946
0
    {
6947
0
      minRecoveryPoint = ControlFile->minRecoveryPoint;
6948
0
      minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6949
0
    }
6950
5
    else
6951
5
    {
6952
5
      minRecoveryPoint = InvalidXLogRecPtr;
6953
5
      minRecoveryPointTLI = 0;
6954
5
    }
6955
6956
    /*
6957
     * Reset pgstat data, because it may be invalid after recovery.
6958
     */
6959
5
    pgstat_reset_all();
6960
6961
    /*
6962
     * If there was a backup label file, it's done its job and the info
6963
     * has now been propagated into pg_control.  We must get rid of the
6964
     * label file so that if we crash during recovery, we'll pick up at
6965
     * the latest recovery restartpoint instead of going all the way back
6966
     * to the backup start point.  It seems prudent though to just rename
6967
     * the file out of the way rather than delete it completely.
6968
     */
6969
5
    if (haveBackupLabel)
6970
0
    {
6971
0
      unlink(BACKUP_LABEL_OLD);
6972
0
      durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
6973
0
    }
6974
6975
    /*
6976
     * If there was a tablespace_map file, it's done its job and the
6977
     * symlinks have been created.  We must get rid of the map file so
6978
     * that if we crash during recovery, we don't create symlinks again.
6979
     * It seems prudent though to just rename the file out of the way
6980
     * rather than delete it completely.
6981
     */
6982
5
    if (haveTblspcMap)
6983
0
    {
6984
0
      unlink(TABLESPACE_MAP_OLD);
6985
0
      durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
6986
0
    }
6987
6988
    /* Check that the GUCs used to generate the WAL allow recovery */
6989
5
    CheckRequiredParameterValues();
6990
6991
    /*
6992
     * We're in recovery, so unlogged relations may be trashed and must be
6993
     * reset.  This should be done BEFORE allowing Hot Standby
6994
     * connections, so that read-only backends don't try to read whatever
6995
     * garbage is left over from before.
6996
     */
6997
5
    ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6998
6999
    /*
7000
     * Likewise, delete any saved transaction snapshot files that got left
7001
     * behind by crashed backends.
7002
     */
7003
5
    DeleteAllExportedSnapshotFiles();
7004
7005
    /*
7006
     * Initialize for Hot Standby, if enabled. We won't let backends in
7007
     * yet, not until we've reached the min recovery point specified in
7008
     * control file and we've established a recovery snapshot from a
7009
     * running-xacts WAL record.
7010
     */
7011
5
    if (ArchiveRecoveryRequested && 
EnableHotStandby0
)
7012
0
    {
7013
0
      TransactionId *xids;
7014
0
      int     nxids;
7015
7016
0
      ereport(DEBUG1,
7017
0
          (errmsg("initializing for hot standby")));
7018
7019
0
      InitRecoveryTransactionEnvironment();
7020
7021
0
      if (wasShutdown)
7022
0
        oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7023
0
      else
7024
0
        oldestActiveXID = checkPoint.oldestActiveXid;
7025
0
      Assert(TransactionIdIsValid(oldestActiveXID));
7026
7027
      /* Tell procarray about the range of xids it has to deal with */
7028
0
      ProcArrayInitRecovery(ShmemVariableCache->nextXid);
7029
7030
      /*
7031
       * Startup commit log and subtrans only.  MultiXact and commit
7032
       * timestamp have already been started up and other SLRUs are not
7033
       * maintained during recovery and need not be started yet.
7034
       */
7035
0
      StartupCLOG();
7036
0
      StartupSUBTRANS(oldestActiveXID);
7037
7038
      /*
7039
       * If we're beginning at a shutdown checkpoint, we know that
7040
       * nothing was running on the master at this point. So fake-up an
7041
       * empty running-xacts record and use that here and now. Recover
7042
       * additional standby state for prepared transactions.
7043
       */
7044
0
      if (wasShutdown)
7045
0
      {
7046
0
        RunningTransactionsData running;
7047
0
        TransactionId latestCompletedXid;
7048
7049
        /*
7050
         * Construct a RunningTransactions snapshot representing a
7051
         * shut down server, with only prepared transactions still
7052
         * alive. We're never overflowed at this point because all
7053
         * subxids are listed with their parent prepared transactions.
7054
         */
7055
0
        running.xcnt = nxids;
7056
0
        running.subxcnt = 0;
7057
0
        running.subxid_overflow = false;
7058
0
        running.nextXid = checkPoint.nextXid;
7059
0
        running.oldestRunningXid = oldestActiveXID;
7060
0
        latestCompletedXid = checkPoint.nextXid;
7061
0
        TransactionIdRetreat(latestCompletedXid);
7062
0
        Assert(TransactionIdIsNormal(latestCompletedXid));
7063
0
        running.latestCompletedXid = latestCompletedXid;
7064
0
        running.xids = xids;
7065
7066
0
        ProcArrayApplyRecoveryInfo(&running);
7067
7068
0
        StandbyRecoverPreparedTransactions();
7069
0
      }
7070
0
    }
7071
7072
    /* Initialize resource managers */
7073
115
    
for (rmid = 0; 5
rmid <= RM_MAX_ID;
rmid++110
)
7074
110
    {
7075
110
      if (RmgrTable[rmid].rm_startup != NULL)
7076
15
        RmgrTable[rmid].rm_startup();
7077
110
    }
7078
7079
    /*
7080
     * Initialize shared variables for tracking progress of WAL replay, as
7081
     * if we had just replayed the record before the REDO location (or the
7082
     * checkpoint record itself, if it's a shutdown checkpoint).
7083
     */
7084
5
    SpinLockAcquire(&XLogCtl->info_lck);
7085
5
    if (checkPoint.redo < RecPtr)
7086
1
      XLogCtl->replayEndRecPtr = checkPoint.redo;
7087
4
    else
7088
4
      XLogCtl->replayEndRecPtr = EndRecPtr;
7089
5
    XLogCtl->replayEndTLI = ThisTimeLineID;
7090
5
    XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
7091
5
    XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
7092
5
    XLogCtl->recoveryLastXTime = 0;
7093
5
    XLogCtl->currentChunkStartTime = 0;
7094
5
    XLogCtl->recoveryPause = false;
7095
5
    SpinLockRelease(&XLogCtl->info_lck);
7096
7097
    /* Also ensure XLogReceiptTime has a sane value */
7098
5
    XLogReceiptTime = GetCurrentTimestamp();
7099
7100
    /*
7101
     * Let postmaster know we've started redo now, so that it can launch
7102
     * checkpointer to perform restartpoints.  We don't bother during
7103
     * crash recovery as restartpoints can only be performed during
7104
     * archive recovery.  And we'd like to keep crash recovery simple, to
7105
     * avoid introducing bugs that could affect you when recovering after
7106
     * crash.
7107
     *
7108
     * After this point, we can no longer assume that we're the only
7109
     * process in addition to postmaster!  Also, fsync requests are
7110
     * subsequently to be handled by the checkpointer, not locally.
7111
     */
7112
5
    if (ArchiveRecoveryRequested && 
IsUnderPostmaster0
)
7113
0
    {
7114
0
      PublishStartupProcessInformation();
7115
0
      SetForwardFsyncRequests();
7116
0
      SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
7117
0
      bgwriterLaunched = true;
7118
0
    }
7119
7120
    /*
7121
     * Allow read-only connections immediately if we're consistent
7122
     * already.
7123
     */
7124
5
    CheckRecoveryConsistency();
7125
7126
    /*
7127
     * Find the first record that logically follows the checkpoint --- it
7128
     * might physically precede it, though.
7129
     */
7130
5
    if (checkPoint.redo < RecPtr)
7131
1
    {
7132
      /* back up to find the record */
7133
1
      record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
7134
1
    }
7135
4
    else
7136
4
    {
7137
      /* just have to read next record after CheckPoint */
7138
4
      record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7139
4
    }
7140
7141
5
    if (record != NULL)
7142
1
    {
7143
1
      ErrorContextCallback errcallback;
7144
1
      TimestampTz xtime;
7145
7146
1
      InRedo = true;
7147
7148
1
      ereport(LOG,
7149
1
          (errmsg("redo starts at %X/%X",
7150
1
              (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7151
7152
      /*
7153
       * main redo apply loop
7154
       */
7155
1
      do
7156
2
      {
7157
2
        bool    switchedTLI = false;
7158
7159
#ifdef WAL_DEBUG
7160
        if (XLOG_DEBUG ||
7161
          (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
7162
          (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7163
        {
7164
          StringInfoData buf;
7165
7166
          initStringInfo(&buf);
7167
          appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7168
                   (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
7169
                   (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
7170
          xlog_outrec(&buf, xlogreader);
7171
          appendStringInfoString(&buf, " - ");
7172
          xlog_outdesc(&buf, xlogreader);
7173
          elog(LOG, "%s", buf.data);
7174
          pfree(buf.data);
7175
        }
7176
#endif
7177
7178
        /* Handle interrupt signals of startup process */
7179
2
        HandleStartupProcInterrupts();
7180
7181
        /*
7182
         * Pause WAL replay, if requested by a hot-standby session via
7183
         * SetRecoveryPause().
7184
         *
7185
         * Note that we intentionally don't take the info_lck spinlock
7186
         * here.  We might therefore read a slightly stale value of
7187
         * the recoveryPause flag, but it can't be very stale (no
7188
         * worse than the last spinlock we did acquire).  Since a
7189
         * pause request is a pretty asynchronous thing anyway,
7190
         * possibly responding to it one WAL record later than we
7191
         * otherwise would is a minor issue, so it doesn't seem worth
7192
         * adding another spinlock cycle to prevent that.
7193
         */
7194
2
        if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7195
0
          recoveryPausesHere();
7196
7197
        /*
7198
         * Have we reached our recovery target?
7199
         */
7200
2
        if (recoveryStopsBefore(xlogreader))
7201
0
        {
7202
0
          reachedStopPoint = true;  /* see below */
7203
0
          break;
7204
0
        }
7205
7206
        /*
7207
         * If we've been asked to lag the master, wait on latch until
7208
         * enough time has passed.
7209
         */
7210
2
        if (recoveryApplyDelay(xlogreader))
7211
0
        {
7212
          /*
7213
           * We test for paused recovery again here. If user sets
7214
           * delayed apply, it may be because they expect to pause
7215
           * recovery in case of problems, so we must test again
7216
           * here otherwise pausing during the delay-wait wouldn't
7217
           * work.
7218
           */
7219
0
          if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7220
0
            recoveryPausesHere();
7221
0
        }
7222
7223
        /* Setup error traceback support for ereport() */
7224
2
        errcallback.callback = rm_redo_error_callback;
7225
2
        errcallback.arg = (void *) xlogreader;
7226
2
        errcallback.previous = error_context_stack;
7227
2
        error_context_stack = &errcallback;
7228
7229
        /*
7230
         * ShmemVariableCache->nextXid must be beyond record's xid.
7231
         *
7232
         * We don't expect anyone else to modify nextXid, hence we
7233
         * don't need to hold a lock while examining it.  We still
7234
         * acquire the lock to modify it, though.
7235
         */
7236
2
        if (TransactionIdFollowsOrEquals(record->xl_xid,
7237
2
                         ShmemVariableCache->nextXid))
7238
0
        {
7239
0
          LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7240
0
          ShmemVariableCache->nextXid = record->xl_xid;
7241
0
          TransactionIdAdvance(ShmemVariableCache->nextXid);
7242
0
          LWLockRelease(XidGenLock);
7243
0
        }
7244
7245
        /*
7246
         * Before replaying this record, check if this record causes
7247
         * the current timeline to change. The record is already
7248
         * considered to be part of the new timeline, so we update
7249
         * ThisTimeLineID before replaying it. That's important so
7250
         * that replayEndTLI, which is recorded as the minimum
7251
         * recovery point's TLI if recovery stops after this record,
7252
         * is set correctly.
7253
         */
7254
2
        if (record->xl_rmid == RM_XLOG_ID)
7255
1
        {
7256
1
          TimeLineID  newTLI = ThisTimeLineID;
7257
1
          TimeLineID  prevTLI = ThisTimeLineID;
7258
1
          uint8   info = record->xl_info & ~XLR_INFO_MASK;
7259
7260
1
          if (info == XLOG_CHECKPOINT_SHUTDOWN)
7261
0
          {
7262
0
            CheckPoint  checkPoint;
7263
7264
0
            memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7265
0
            newTLI = checkPoint.ThisTimeLineID;
7266
0
            prevTLI = checkPoint.PrevTimeLineID;
7267
0
          }
7268
1
          else if (info == XLOG_END_OF_RECOVERY)
7269
0
          {
7270
0
            xl_end_of_recovery xlrec;
7271
7272
0
            memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7273
0
            newTLI = xlrec.ThisTimeLineID;
7274
0
            prevTLI = xlrec.PrevTimeLineID;
7275
0
          }
7276
7277
1
          if (newTLI != ThisTimeLineID)
7278
0
          {
7279
            /* Check that it's OK to switch to this TLI */
7280
0
            checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7281
7282
            /* Following WAL records should be run with new TLI */
7283
0
            ThisTimeLineID = newTLI;
7284
0
            switchedTLI = true;
7285
0
          }
7286
1
        }
7287
7288
        /*
7289
         * Update shared replayEndRecPtr before replaying this record,
7290
         * so that XLogFlush will update minRecoveryPoint correctly.
7291
         */
7292
2
        SpinLockAcquire(&XLogCtl->info_lck);
7293
2
        XLogCtl->replayEndRecPtr = EndRecPtr;
7294
2
        XLogCtl->replayEndTLI = ThisTimeLineID;
7295
2
        SpinLockRelease(&XLogCtl->info_lck);
7296
7297
        /*
7298
         * If we are attempting to enter Hot Standby mode, process
7299
         * XIDs we see
7300
         */
7301
2
        if (standbyState >= STANDBY_INITIALIZED &&
7302
2
          
TransactionIdIsValid0
(record->xl_xid))
7303
0
          RecordKnownAssignedTransactionIds(record->xl_xid);
7304
7305
        /* Now apply the WAL record itself */
7306
2
        RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7307
7308
        /*
7309
         * After redo, check whether the backup pages associated with
7310
         * the WAL record are consistent with the existing pages. This
7311
         * check is done only if consistency check is enabled for this
7312
         * record.
7313
         */
7314
2
        if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7315
0
          checkXLogConsistency(xlogreader);
7316
7317
        /* Pop the error context stack */
7318
2
        error_context_stack = errcallback.previous;
7319
7320
        /*
7321
         * Update lastReplayedEndRecPtr after this record has been
7322
         * successfully replayed.
7323
         */
7324
2
        SpinLockAcquire(&XLogCtl->info_lck);
7325
2
        XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7326
2
        XLogCtl->lastReplayedTLI = ThisTimeLineID;
7327
2
        SpinLockRelease(&XLogCtl->info_lck);
7328
7329
        /*
7330
         * If rm_redo called XLogRequestWalReceiverReply, then we wake
7331
         * up the receiver so that it notices the updated
7332
         * lastReplayedEndRecPtr and sends a reply to the master.
7333
         */
7334
2
        if (doRequestWalReceiverReply)
7335
0
        {
7336
0
          doRequestWalReceiverReply = false;
7337
0
          WalRcvForceReply();
7338
0
        }
7339
7340
        /* Remember this record as the last-applied one */
7341
2
        LastRec = ReadRecPtr;
7342
7343
        /* Allow read-only connections if we're consistent now */
7344
2
        CheckRecoveryConsistency();
7345
7346
        /* Is this a timeline switch? */
7347
2
        if (switchedTLI)
7348
0
        {
7349
          /*
7350
           * Before we continue on the new timeline, clean up any
7351
           * (possibly bogus) future WAL segments on the old
7352
           * timeline.
7353
           */
7354
0
          RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7355
7356
          /*
7357
           * Wake up any walsenders to notice that we are on a new
7358
           * timeline.
7359
           */
7360
0
          if (switchedTLI && AllowCascadeReplication())
7361
0
            WalSndWakeup();
7362
0
        }
7363
7364
        /* Exit loop if we reached inclusive recovery target */
7365
2
        if (recoveryStopsAfter(xlogreader))
7366
0
        {
7367
0
          reachedStopPoint = true;
7368
0
          break;
7369
0
        }
7370
7371
        /* Else, try to fetch the next WAL record */
7372
2
        record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7373
2
      } while (record != NULL);
7374
7375
      /*
7376
       * end of main redo apply loop
7377
       */
7378
7379
1
      if (reachedStopPoint)
7380
0
      {
7381
0
        if (!reachedConsistency)
7382
0
          ereport(FATAL,
7383
0
              (errmsg("requested recovery stop point is before consistent recovery point")));
7384
7385
        /*
7386
         * This is the last point where we can restart recovery with a
7387
         * new recovery target, if we shutdown and begin again. After
7388
         * this, Resource Managers may choose to do permanent
7389
         * corrective actions at end of recovery.
7390
         */
7391
0
        switch (recoveryTargetAction)
7392
0
        {
7393
0
          case RECOVERY_TARGET_ACTION_SHUTDOWN:
7394
7395
            /*
7396
             * exit with special return code to request shutdown
7397
             * of postmaster.  Log messages issued from
7398
             * postmaster.
7399
             */
7400
0
            proc_exit(3);
7401
7402
0
          case RECOVERY_TARGET_ACTION_PAUSE:
7403
0
            SetRecoveryPause(true);
7404
0
            recoveryPausesHere();
7405
7406
            /* drop into promote */
7407
0
            switch_fallthrough();
7408
7409
0
          case RECOVERY_TARGET_ACTION_PROMOTE:
7410
0
            break;
7411
0
        }
7412
0
      }
7413
7414
      /* Allow resource managers to do any required cleanup. */
7415
23
      
for (rmid = 0; 1
rmid <= RM_MAX_ID;
rmid++22
)
7416
22
      {
7417
22
        if (RmgrTable[rmid].rm_cleanup != NULL)
7418
3
          RmgrTable[rmid].rm_cleanup();
7419
22
      }
7420
7421
1
      ereport(LOG,
7422
1
          (errmsg("redo done at %X/%X",
7423
1
              (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7424
1
      xtime = GetLatestXTime();
7425
1
      if (xtime)
7426
1
        ereport(LOG,
7427
1
            (errmsg("last completed transaction was at log time %s",
7428
1
                timestamptz_to_str(xtime))));
7429
7430
1
      InRedo = false;
7431
1
    }
7432
4
    else
7433
4
    {
7434
      /* there are no WAL records following the checkpoint */
7435
4
      ereport(LOG,
7436
4
          (errmsg("redo is not required")));
7437
4
    }
7438
5
  }
7439
7440
  /*
7441
   * Kill WAL receiver, if it's still running, before we continue to write
7442
   * the startup checkpoint record. It will trump over the checkpoint and
7443
   * subsequent records if it's still alive when we start writing WAL.
7444
   */
7445
3.99k
  ShutdownWalRcv();
7446
7447
  /*
7448
   * Reset unlogged relations to the contents of their INIT fork. This is
7449
   * done AFTER recovery is complete so as to include any unlogged relations
7450
   * created during recovery, but BEFORE recovery is marked as having
7451
   * completed successfully. Otherwise we'd not retry if any of the post
7452
   * end-of-recovery steps fail.
7453
   */
7454
3.99k
  if (InRecovery)
7455
5
    ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7456
7457
  /*
7458
   * We don't need the latch anymore. It's not strictly necessary to disown
7459
   * it, but let's do it for the sake of tidiness.
7460
   */
7461
3.99k
  if (StandbyModeRequested)
7462
0
    DisownLatch(&XLogCtl->recoveryWakeupLatch);
7463
7464
  /*
7465
   * We are now done reading the xlog from stream. Turn off streaming
7466
   * recovery to force fetching the files (which would be required at end of
7467
   * recovery, e.g., timeline history file) from archive or pg_wal.
7468
   */
7469
3.99k
  StandbyMode = false;
7470
7471
  /*
7472
   * Re-fetch the last valid or last applied record, so we can identify the
7473
   * exact endpoint of what we consider the valid portion of WAL.
7474
   */
7475
3.99k
  record = ReadRecord(xlogreader, LastRec, PANIC, false);
7476
3.99k
  EndOfLog = EndRecPtr;
7477
7478
  /*
7479
   * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7480
   * the end-of-log. It could be different from the timeline that EndOfLog
7481
   * nominally belongs to, if there was a timeline switch in that segment,
7482
   * and we were reading the old WAL from a segment belonging to a higher
7483
   * timeline.
7484
   */
7485
3.99k
  EndOfLogTLI = xlogreader->readPageTLI;
7486
7487
  /*
7488
   * Complain if we did not roll forward far enough to render the backup
7489
   * dump consistent.  Note: it is indeed okay to look at the local variable
7490
   * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7491
   * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7492
   * advanced beyond the WAL we processed.
7493
   */
7494
3.99k
  if (InRecovery &&
7495
3.99k
    
(5
EndOfLog < minRecoveryPoint5
||
7496
5
     !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7497
0
  {
7498
    /*
7499
     * Ran off end of WAL before reaching end-of-backup WAL record, or
7500
     * minRecoveryPoint. That's usually a bad sign, indicating that you
7501
     * tried to recover from an online backup but never called
7502
     * pg_stop_backup(), or you didn't archive all the WAL up to that
7503
     * point. However, this also happens in crash recovery, if the system
7504
     * crashes while an online backup is in progress. We must not treat
7505
     * that as an error, or the database will refuse to start up.
7506
     */
7507
0
    if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7508
0
    {
7509
0
      if (ControlFile->backupEndRequired)
7510
0
        ereport(FATAL,
7511
0
            (errmsg("WAL ends before end of online backup"),
7512
0
             errhint("All WAL generated while online backup was taken must be available at recovery.")));
7513
0
      else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7514
0
        ereport(FATAL,
7515
0
            (errmsg("WAL ends before end of online backup"),
7516
0
             errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7517
0
      else
7518
0
        ereport(FATAL,
7519
0
            (errmsg("WAL ends before consistent recovery point")));
7520
0
    }
7521
0
  }
7522
7523
  /*
7524
   * Pre-scan prepared transactions to find out the range of XIDs present.
7525
   * This information is not quite needed yet, but it is positioned here so
7526
   * as potential problems are detected before any on-disk change is done.
7527
   */
7528
3.99k
  oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7529
7530
  /*
7531
   * Consider whether we need to assign a new timeline ID.
7532
   *
7533
   * If we are doing an archive recovery, we always assign a new ID.  This
7534
   * handles a couple of issues.  If we stopped short of the end of WAL
7535
   * during recovery, then we are clearly generating a new timeline and must
7536
   * assign it a unique new ID.  Even if we ran to the end, modifying the
7537
   * current last segment is problematic because it may result in trying to
7538
   * overwrite an already-archived copy of that segment, and we encourage
7539
   * DBAs to make their archive_commands reject that.  We can dodge the
7540
   * problem by making the new active segment have a new timeline ID.
7541
   *
7542
   * In a normal crash recovery, we can just extend the timeline we were in.
7543
   */
7544
3.99k
  PrevTimeLineID = ThisTimeLineID;
7545
3.99k
  if (ArchiveRecoveryRequested)
7546
0
  {
7547
0
    char    reason[200];
7548
7549
0
    Assert(InArchiveRecovery);
7550
7551
0
    ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7552
0
    ereport(LOG,
7553
0
        (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7554
7555
    /*
7556
     * Create a comment for the history file to explain why and where
7557
     * timeline changed.
7558
     */
7559
0
    if (recoveryTarget == RECOVERY_TARGET_XID)
7560
0
      snprintf(reason, sizeof(reason),
7561
0
           "%s transaction %u",
7562
0
           recoveryStopAfter ? "after" : "before",
7563
0
           recoveryStopXid);
7564
0
    else if (recoveryTarget == RECOVERY_TARGET_TIME)
7565
0
      snprintf(reason, sizeof(reason),
7566
0
           "%s %s\n",
7567
0
           recoveryStopAfter ? "after" : "before",
7568
0
           timestamptz_to_str(recoveryStopTime));
7569
0
    else if (recoveryTarget == RECOVERY_TARGET_LSN)
7570
0
      snprintf(reason, sizeof(reason),
7571
0
           "%s LSN %X/%X\n",
7572
0
           recoveryStopAfter ? "after" : "before",
7573
0
           (uint32) (recoveryStopLSN >> 32),
7574
0
           (uint32) recoveryStopLSN);
7575
0
    else if (recoveryTarget == RECOVERY_TARGET_NAME)
7576
0
      snprintf(reason, sizeof(reason),
7577
0
           "at restore point \"%s\"",
7578
0
           recoveryStopName);
7579
0
    else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7580
0
      snprintf(reason, sizeof(reason), "reached consistency");
7581
0
    else
7582
0
      snprintf(reason, sizeof(reason), "no recovery target specified");
7583
7584
    /*
7585
     * We are now done reading the old WAL.  Turn off archive fetching if
7586
     * it was active, and make a writable copy of the last WAL segment.
7587
     * (Note that we also have a copy of the last block of the old WAL in
7588
     * readBuf; we will use that below.)
7589
     */
7590
0
    exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7591
7592
    /*
7593
     * Write the timeline history file, and have it archived. After this
7594
     * point (or rather, as soon as the file is archived), the timeline
7595
     * will appear as "taken" in the WAL archive and to any standby
7596
     * servers.  If we crash before actually switching to the new
7597
     * timeline, standby servers will nevertheless think that we switched
7598
     * to the new timeline, and will try to connect to the new timeline.
7599
     * To minimize the window for that, try to do as little as possible
7600
     * between here and writing the end-of-recovery record.
7601
     */
7602
0
    writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7603
0
               EndRecPtr, reason);
7604
0
  }
7605
7606
  /* Save the selected TimeLineID in shared memory, too */
7607
3.99k
  XLogCtl->ThisTimeLineID = ThisTimeLineID;
7608
3.99k
  XLogCtl->PrevTimeLineID = PrevTimeLineID;
7609
7610
  /*
7611
   * Prepare to write WAL starting at EndOfLog location, and init xlog
7612
   * buffer cache using the block containing the last record from the
7613
   * previous incarnation.
7614
   */
7615
3.99k
  Insert = &XLogCtl->Insert;
7616
3.99k
  Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7617
3.99k
  Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7618
7619
  /*
7620
   * Tricky point here: readBuf contains the *last* block that the LastRec
7621
   * record spans, not the one it starts in.  The last block is indeed the
7622
   * one we want to use.
7623
   */
7624
3.99k
  if (EndOfLog % XLOG_BLCKSZ != 0)
7625
3.99k
  {
7626
3.99k
    char     *page;
7627
3.99k
    int     len;
7628
3.99k
    int     firstIdx;
7629
3.99k
    XLogRecPtr  pageBeginPtr;
7630
7631
3.99k
    pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7632
3.99k
    Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
7633
7634
3.99k
    firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7635
7636
    /* Copy the valid part of the last block, and zero the rest */
7637
3.99k
    page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7638
3.99k
    len = EndOfLog % XLOG_BLCKSZ;
7639
3.99k
    memcpy(page, xlogreader->readBuf, len);
7640
3.99k
    memset(page + len, 0, XLOG_BLCKSZ - len);
7641
7642
3.99k
    XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7643
3.99k
    XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7644
3.99k
  }
7645
0
  else
7646
0
  {
7647
    /*
7648
     * There is no partial block to copy. Just set InitializedUpTo, and
7649
     * let the first attempt to insert a log record to initialize the next
7650
     * buffer.
7651
     */
7652
0
    XLogCtl->InitializedUpTo = EndOfLog;
7653
0
  }
7654
7655
3.99k
  LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7656
7657
3.99k
  XLogCtl->LogwrtResult = LogwrtResult;
7658
7659
3.99k
  XLogCtl->LogwrtRqst.Write = EndOfLog;
7660
3.99k
  XLogCtl->LogwrtRqst.Flush = EndOfLog;
7661
7662
  /*
7663
   * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7664
   * record before resource manager writes cleanup WAL records or checkpoint
7665
   * record is written.
7666
   */
7667
3.99k
  Insert->fullPageWrites = lastFullPageWrites;
7668
3.99k
  LocalSetXLogInsertAllowed();
7669
3.99k
  UpdateFullPageWrites();
7670
3.99k
  LocalXLogInsertAllowed = -1;
7671
7672
3.99k
  if (InRecovery)
7673
5
  {
7674
    /*
7675
     * Perform a checkpoint to update all our recovery activity to disk.
7676
     *
7677
     * Note that we write a shutdown checkpoint rather than an on-line
7678
     * one. This is not particularly critical, but since we may be
7679
     * assigning a new TLI, using a shutdown checkpoint allows us to have
7680
     * the rule that TLI only changes in shutdown checkpoints, which
7681
     * allows some extra error checking in xlog_redo.
7682
     *
7683
     * In fast promotion, only create a lightweight end-of-recovery record
7684
     * instead of a full checkpoint. A checkpoint is requested later,
7685
     * after we're fully out of recovery mode and already accepting
7686
     * queries.
7687
     */
7688
5
    if (bgwriterLaunched)
7689
0
    {
7690
0
      if (fast_promote)
7691
0
      {
7692
0
        checkPointLoc = ControlFile->checkPoint;
7693
7694
        /*
7695
         * Confirm the last checkpoint is available for us to recover
7696
         * from if we fail.
7697
         */
7698
0
        record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7699
0
        if (record != NULL)
7700
0
        {
7701
0
          fast_promoted = true;
7702
7703
          /*
7704
           * Insert a special WAL record to mark the end of
7705
           * recovery, since we aren't doing a checkpoint. That
7706
           * means that the checkpointer process may likely be in
7707
           * the middle of a time-smoothed restartpoint and could
7708
           * continue to be for minutes after this. That sounds
7709
           * strange, but the effect is roughly the same and it
7710
           * would be stranger to try to come out of the
7711
           * restartpoint and then checkpoint. We request a
7712
           * checkpoint later anyway, just for safety.
7713
           */
7714
0
          CreateEndOfRecoveryRecord();
7715
0
        }
7716
0
      }
7717
7718
0
      if (!fast_promoted)
7719
0
        RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7720
0
                  CHECKPOINT_IMMEDIATE |
7721
0
                  CHECKPOINT_WAIT);
7722
0
    }
7723
5
    else
7724
5
      CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7725
7726
    /*
7727
     * And finally, execute the recovery_end_command, if any.
7728
     */
7729
5
    if (recoveryEndCommand)
7730
0
      ExecuteRecoveryCommand(recoveryEndCommand,
7731
0
                   "recovery_end_command",
7732
0
                   true);
7733
5
  }
7734
7735
3.99k
  if (ArchiveRecoveryRequested)
7736
0
  {
7737
    /*
7738
     * We switched to a new timeline. Clean up segments on the old
7739
     * timeline.
7740
     *
7741
     * If there are any higher-numbered segments on the old timeline,
7742
     * remove them. They might contain valid WAL, but they might also be
7743
     * pre-allocated files containing garbage. In any case, they are not
7744
     * part of the new timeline's history so we don't need them.
7745
     */
7746
0
    RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7747
7748
    /*
7749
     * If the switch happened in the middle of a segment, what to do with
7750
     * the last, partial segment on the old timeline? If we don't archive
7751
     * it, and the server that created the WAL never archives it either
7752
     * (e.g. because it was hit by a meteor), it will never make it to the
7753
     * archive. That's OK from our point of view, because the new segment
7754
     * that we created with the new TLI contains all the WAL from the old
7755
     * timeline up to the switch point. But if you later try to do PITR to
7756
     * the "missing" WAL on the old timeline, recovery won't find it in
7757
     * the archive. It's physically present in the new file with new TLI,
7758
     * but recovery won't look there when it's recovering to the older
7759
     * timeline. On the other hand, if we archive the partial segment, and
7760
     * the original server on that timeline is still running and archives
7761
     * the completed version of the same segment later, it will fail. (We
7762
     * used to do that in 9.4 and below, and it caused such problems).
7763
     *
7764
     * As a compromise, we rename the last segment with the .partial
7765
     * suffix, and archive it. Archive recovery will never try to read
7766
     * .partial segments, so they will normally go unused. But in the odd
7767
     * PITR case, the administrator can copy them manually to the pg_wal
7768
     * directory (removing the suffix). They can be useful in debugging,
7769
     * too.
7770
     *
7771
     * If a .done or .ready file already exists for the old timeline,
7772
     * however, we had already determined that the segment is complete, so
7773
     * we can let it be archived normally. (In particular, if it was
7774
     * restored from the archive to begin with, it's expected to have a
7775
     * .done file).
7776
     */
7777
0
    if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
7778
0
      XLogArchivingActive())
7779
0
    {
7780
0
      char    origfname[MAXFNAMELEN];
7781
0
      XLogSegNo endLogSegNo;
7782
7783
0
      XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
7784
0
      XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
7785
7786
0
      if (!XLogArchiveIsReadyOrDone(origfname))
7787
0
      {
7788
0
        char    origpath[MAXPGPATH];
7789
0
        char    partialfname[MAXFNAMELEN];
7790
0
        char    partialpath[MAXPGPATH];
7791
7792
0
        XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
7793
0
        snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7794
0
        snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7795
7796
        /*
7797
         * Make sure there's no .done or .ready file for the .partial
7798
         * file.
7799
         */
7800
0
        XLogArchiveCleanup(partialfname);
7801
7802
0
        durable_rename(origpath, partialpath, ERROR);
7803
0
        XLogArchiveNotify(partialfname);
7804
0
      }
7805
0
    }
7806
0
  }
7807
7808
  /*
7809
   * Preallocate additional log files, if wanted.
7810
   */
7811
0
  PreallocXlogFiles(EndOfLog);
7812
7813
  /*
7814
   * Okay, we're officially UP.
7815
   */
7816
3.99k
  InRecovery = false;
7817
7818
  /* start the archive_timeout timer and LSN running */
7819
3.99k
  XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7820
3.99k
  XLogCtl->lastSegSwitchLSN = EndOfLog;
7821
7822
  /* also initialize latestCompletedXid, to nextXid - 1 */
7823
3.99k
  LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7824
3.99k
  ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7825
3.99k
  TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7826
3.99k
  LWLockRelease(ProcArrayLock);
7827
7828
  /*
7829
   * Start up the commit log and subtrans, if not already done for hot
7830
   * standby.  (commit timestamps are started below, if necessary.)
7831
   */
7832
3.99k
  if (standbyState == STANDBY_DISABLED)
7833
3.99k
  {
7834
3.99k
    StartupCLOG();
7835
3.99k
    StartupSUBTRANS(oldestActiveXID);
7836
3.99k
  }
7837
7838
  /*
7839
   * Perform end of recovery actions for any SLRUs that need it.
7840
   */
7841
3.99k
  TrimCLOG();
7842
3.99k
  TrimMultiXact();
7843
7844
  /* Reload shared-memory state for prepared transactions */
7845
3.99k
  RecoverPreparedTransactions();
7846
7847
  /*
7848
   * Shutdown the recovery environment. This must occur after
7849
   * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
7850
   */
7851
3.99k
  if (standbyState != STANDBY_DISABLED)
7852
0
    ShutdownRecoveryTransactionEnvironment();
7853
7854
  /* Shut down xlogreader */
7855
3.99k
  if (readFile >= 0)
7856
3.99k
  {
7857
3.99k
    close(readFile);
7858
3.99k
    readFile = -1;
7859
3.99k
  }
7860
3.99k
  XLogReaderFree(xlogreader);
7861
7862
  /*
7863
   * If any of the critical GUCs have changed, log them before we allow
7864
   * backends to write WAL.
7865
   */
7866
3.99k
  LocalSetXLogInsertAllowed();
7867
3.99k
  XLogReportParameters();
7868
7869
  /*
7870
   * Local WAL inserts enabled, so it's time to finish initialization of
7871
   * commit timestamp.
7872
   */
7873
3.99k
  CompleteCommitTsInitialization();
7874
7875
  /*
7876
   * All done with end-of-recovery actions.
7877
   *
7878
   * Now allow backends to write WAL and update the control file status in
7879
   * consequence.  The boolean flag allowing backends to write WAL is
7880
   * updated while holding ControlFileLock to prevent other backends to look
7881
   * at an inconsistent state of the control file in shared memory.  There
7882
   * is still a small window during which backends can write WAL and the
7883
   * control file is still referring to a system not in DB_IN_PRODUCTION
7884
   * state while looking at the on-disk control file.
7885
   *
7886
   * Also, although the boolean flag to allow WAL is probably atomic in
7887
   * itself, we use the info_lck here to ensure that there are no race
7888
   * conditions concerning visibility of other recent updates to shared
7889
   * memory.
7890
   */
7891
3.99k
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7892
3.99k
  ControlFile->state = DB_IN_PRODUCTION;
7893
3.99k
  ControlFile->time = (pg_time_t) time(NULL);
7894
7895
3.99k
  SpinLockAcquire(&XLogCtl->info_lck);
7896
3.99k
  XLogCtl->SharedRecoveryInProgress = false;
7897
3.99k
  SpinLockRelease(&XLogCtl->info_lck);
7898
7899
3.99k
  UpdateControlFile();
7900
3.99k
  LWLockRelease(ControlFileLock);
7901
7902
  /*
7903
   * If there were cascading standby servers connected to us, nudge any wal
7904
   * sender processes to notice that we've been promoted.
7905
   */
7906
3.99k
  WalSndWakeup();
7907
7908
  /*
7909
   * If this was a fast promotion, request an (online) checkpoint now. This
7910
   * isn't required for consistency, but the last restartpoint might be far
7911
   * back, and in case of a crash, recovering from it might take a longer
7912
   * than is appropriate now that we're not in standby mode anymore.
7913
   */
7914
3.99k
  if (fast_promoted)
7915
0
    RequestCheckpoint(CHECKPOINT_FORCE);
7916
3.99k
}
7917
7918
/*
7919
 * Checks if recovery has reached a consistent state. When consistency is
7920
 * reached and we have a valid starting standby snapshot, tell postmaster
7921
 * that it can start accepting read-only connections.
7922
 */
7923
static void
7924
CheckRecoveryConsistency(void)
7925
7
{
7926
7
  XLogRecPtr  lastReplayedEndRecPtr;
7927
7928
  /*
7929
   * During crash recovery, we don't reach a consistent state until we've
7930
   * replayed all the WAL.
7931
   */
7932
7
  if (XLogRecPtrIsInvalid(minRecoveryPoint))
7933
7
    return;
7934
7935
0
  Assert(InArchiveRecovery);
7936
7937
  /*
7938
   * assume that we are called in the startup process, and hence don't need
7939
   * a lock to read lastReplayedEndRecPtr
7940
   */
7941
0
  lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
7942
7943
  /*
7944
   * Have we reached the point where our base backup was completed?
7945
   */
7946
0
  if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
7947
0
    ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
7948
0
  {
7949
    /*
7950
     * We have reached the end of base backup, as indicated by pg_control.
7951
     * The data on disk is now consistent. Reset backupStartPoint and
7952
     * backupEndPoint, and update minRecoveryPoint to make sure we don't
7953
     * allow starting up at an earlier point even if recovery is stopped
7954
     * and restarted soon after this.
7955
     */
7956
0
    elog(DEBUG1, "end of backup reached");
7957
7958
0
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7959
7960
0
    if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
7961
0
      ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
7962
7963
0
    ControlFile->backupStartPoint = InvalidXLogRecPtr;
7964
0
    ControlFile->backupEndPoint = InvalidXLogRecPtr;
7965
0
    ControlFile->backupEndRequired = false;
7966
0
    UpdateControlFile();
7967
7968
0
    LWLockRelease(ControlFileLock);
7969
0
  }
7970
7971
  /*
7972
   * Have we passed our safe starting point? Note that minRecoveryPoint is
7973
   * known to be incorrectly set if ControlFile->backupEndRequired, until
7974
   * the XLOG_BACKUP_END arrives to advise us of the correct
7975
   * minRecoveryPoint. All we know prior to that is that we're not
7976
   * consistent yet.
7977
   */
7978
0
  if (!reachedConsistency && !ControlFile->backupEndRequired &&
7979
0
    minRecoveryPoint <= lastReplayedEndRecPtr &&
7980
0
    XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7981
0
  {
7982
    /*
7983
     * Check to see if the XLOG sequence contained any unresolved
7984
     * references to uninitialized pages.
7985
     */
7986
0
    XLogCheckInvalidPages();
7987
7988
0
    reachedConsistency = true;
7989
0
    ereport(LOG,
7990
0
        (errmsg("consistent recovery state reached at %X/%X",
7991
0
            (uint32) (lastReplayedEndRecPtr >> 32),
7992
0
            (uint32) lastReplayedEndRecPtr)));
7993
0
  }
7994
7995
  /*
7996
   * Have we got a valid starting snapshot that will allow queries to be
7997
   * run? If so, we can tell postmaster that the database is consistent now,
7998
   * enabling connections.
7999
   */
8000
0
  if (standbyState == STANDBY_SNAPSHOT_READY &&
8001
0
    !LocalHotStandbyActive &&
8002
0
    reachedConsistency &&
8003
0
    IsUnderPostmaster)
8004
0
  {
8005
0
    SpinLockAcquire(&XLogCtl->info_lck);
8006
0
    XLogCtl->SharedHotStandbyActive = true;
8007
0
    SpinLockRelease(&XLogCtl->info_lck);
8008
8009
0
    LocalHotStandbyActive = true;
8010
8011
0
    SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
8012
0
  }
8013
0
}
8014
8015
/*
8016
 * Is the system still in recovery?
8017
 *
8018
 * Unlike testing InRecovery, this works in any process that's connected to
8019
 * shared memory.
8020
 *
8021
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
8022
 * variables the first time we see that recovery is finished.
8023
 */
8024
bool
8025
RecoveryInProgress(void)
8026
1.36M
{
8027
  /*
8028
   * We check shared state each time only until we leave recovery mode. We
8029
   * can't re-enter recovery, so there's no need to keep checking after the
8030
   * shared variable has once been seen false.
8031
   */
8032
1.36M
  if (!LocalRecoveryInProgress)
8033
1.35M
    return false;
8034
8.30k
  else
8035
8.30k
  {
8036
    /*
8037
     * use volatile pointer to make sure we make a fresh read of the
8038
     * shared variable.
8039
     */
8040
8.30k
    volatile XLogCtlData *xlogctl = XLogCtl;
8041
8042
8.30k
    LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
8043
8044
    /*
8045
     * Initialize TimeLineID and RedoRecPtr when we discover that recovery
8046
     * is finished. InitPostgres() relies upon this behaviour to ensure
8047
     * that InitXLOGAccess() is called at backend startup.  (If you change
8048
     * this, see also LocalSetXLogInsertAllowed.)
8049
     */
8050
8.30k
    if (!LocalRecoveryInProgress)
8051
8.20k
    {
8052
      /*
8053
       * If we just exited recovery, make sure we read TimeLineID and
8054
       * RedoRecPtr after SharedRecoveryInProgress (for machines with
8055
       * weak memory ordering).
8056
       */
8057
8.20k
      pg_memory_barrier();
8058
8.20k
      InitXLOGAccess();
8059
8.20k
    }
8060
8061
    /*
8062
     * Note: We don't need a memory barrier when we're still in recovery.
8063
     * We might exit recovery immediately after return, so the caller
8064
     * can't rely on 'true' meaning that we're still in recovery anyway.
8065
     */
8066
8067
8.30k
    return LocalRecoveryInProgress;
8068
8.30k
  }
8069
1.36M
}
8070
8071
/*
8072
 * Is HotStandby active yet? This is only important in special backends
8073
 * since normal backends won't ever be able to connect until this returns
8074
 * true. Postmaster knows this by way of signal, not via shared memory.
8075
 *
8076
 * Unlike testing standbyState, this works in any process that's connected to
8077
 * shared memory.  (And note that standbyState alone doesn't tell the truth
8078
 * anyway.)
8079
 */
8080
bool
8081
HotStandbyActive(void)
8082
0
{
8083
  /*
8084
   * We check shared state each time only until Hot Standby is active. We
8085
   * can't de-activate Hot Standby, so there's no need to keep checking
8086
   * after the shared variable has once been seen true.
8087
   */
8088
0
  if (LocalHotStandbyActive)
8089
0
    return true;
8090
0
  else
8091
0
  {
8092
    /* spinlock is essential on machines with weak memory ordering! */
8093
0
    SpinLockAcquire(&XLogCtl->info_lck);
8094
0
    LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
8095
0
    SpinLockRelease(&XLogCtl->info_lck);
8096
8097
0
    return LocalHotStandbyActive;
8098
0
  }
8099
0
}
8100
8101
/*
8102
 * Like HotStandbyActive(), but to be used only in WAL replay code,
8103
 * where we don't need to ask any other process what the state is.
8104
 */
8105
bool
8106
HotStandbyActiveInReplay(void)
8107
0
{
8108
0
  Assert(AmStartupProcess() || !IsPostmasterEnvironment);
8109
0
  return LocalHotStandbyActive;
8110
0
}
8111
8112
/*
8113
 * Is this process allowed to insert new WAL records?
8114
 *
8115
 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
8116
 * But we also have provisions for forcing the result "true" or "false"
8117
 * within specific processes regardless of the global state.
8118
 */
8119
bool
8120
XLogInsertAllowed(void)
8121
8.53k
{
8122
  /*
8123
   * If value is "unconditionally true" or "unconditionally false", just
8124
   * return it.  This provides the normal fast path once recovery is known
8125
   * done.
8126
   */
8127
8.53k
  if (LocalXLogInsertAllowed >= 0)
8128
8.36k
    return (bool) LocalXLogInsertAllowed;
8129
8130
  /*
8131
   * Else, must check to see if we're still in recovery.
8132
   */
8133
172
  if (RecoveryInProgress())
8134
0
    return false;
8135
8136
  /*
8137
   * On exit from recovery, reset to "unconditionally true", since there is
8138
   * no need to keep checking.
8139
   */
8140
172
  LocalXLogInsertAllowed = 1;
8141
172
  return true;
8142
172
}
8143
8144
/*
8145
 * Make XLogInsertAllowed() return true in the current process only.
8146
 *
8147
 * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
8148
 * and even call LocalSetXLogInsertAllowed() again after that.
8149
 */
8150
static void
8151
LocalSetXLogInsertAllowed(void)
8152
8.00k
{
8153
8.00k
  Assert(LocalXLogInsertAllowed == -1);
8154
8.00k
  LocalXLogInsertAllowed = 1;
8155
8156
  /* Initialize as RecoveryInProgress() would do when switching state */
8157
8.00k
  InitXLOGAccess();
8158
8.00k
}
8159
8160
/*
8161
 * Subroutine to try to fetch and validate a prior checkpoint record.
8162
 *
8163
 * whichChkpt identifies the checkpoint (merely for reporting purposes).
8164
 * 1 for "primary", 0 for "other" (backup_label)
8165
 */
8166
static XLogRecord *
8167
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
8168
           int whichChkpt, bool report)
8169
3.99k
{
8170
3.99k
  XLogRecord *record;
8171
3.99k
  uint8   info;
8172
8173
3.99k
  if (!XRecOffIsValid(RecPtr))
8174
0
  {
8175
0
    if (!report)
8176
0
      return NULL;
8177
8178
0
    switch (whichChkpt)
8179
0
    {
8180
0
      case 1:
8181
0
        ereport(LOG,
8182
0
            (errmsg("invalid primary checkpoint link in control file")));
8183
0
        break;
8184
0
      default:
8185
0
        ereport(LOG,
8186
0
            (errmsg("invalid checkpoint link in backup_label file")));
8187
0
        break;
8188
0
    }
8189
0
    return NULL;
8190
0
  }
8191
8192
3.99k
  record = ReadRecord(xlogreader, RecPtr, LOG, true);
8193
8194
3.99k
  if (record == NULL)
8195
0
  {
8196
0
    if (!report)
8197
0
      return NULL;
8198
8199
0
    switch (whichChkpt)
8200
0
    {
8201
0
      case 1:
8202
0
        ereport(LOG,
8203
0
            (errmsg("invalid primary checkpoint record")));
8204
0
        break;
8205
0
      default:
8206
0
        ereport(LOG,
8207
0
            (errmsg("invalid checkpoint record")));
8208
0
        break;
8209
0
    }
8210
0
    return NULL;
8211
0
  }
8212
3.99k
  if (record->xl_rmid != RM_XLOG_ID)
8213
0
  {
8214
0
    switch (whichChkpt)
8215
0
    {
8216
0
      case 1:
8217
0
        ereport(LOG,
8218
0
            (errmsg("invalid resource manager ID in primary checkpoint record")));
8219
0
        break;
8220
0
      default:
8221
0
        ereport(LOG,
8222
0
            (errmsg("invalid resource manager ID in checkpoint record")));
8223
0
        break;
8224
0
    }
8225
0
    return NULL;
8226
0
  }
8227
3.99k
  info = record->xl_info & ~XLR_INFO_MASK;
8228
3.99k
  if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8229
3.99k
    
info != 1
XLOG_CHECKPOINT_ONLINE1
)
8230
0
  {
8231
0
    switch (whichChkpt)
8232
0
    {
8233
0
      case 1:
8234
0
        ereport(LOG,
8235
0
            (errmsg("invalid xl_info in primary checkpoint record")));
8236
0
        break;
8237
0
      default:
8238
0
        ereport(LOG,
8239
0
            (errmsg("invalid xl_info in checkpoint record")));
8240
0
        break;
8241
0
    }
8242
0
    return NULL;
8243
0
  }
8244
3.99k
  if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8245
0
  {
8246
0
    switch (whichChkpt)
8247
0
    {
8248
0
      case 1:
8249
0
        ereport(LOG,
8250
0
            (errmsg("invalid length of primary checkpoint record")));
8251
0
        break;
8252
0
      default:
8253
0
        ereport(LOG,
8254
0
            (errmsg("invalid length of checkpoint record")));
8255
0
        break;
8256
0
    }
8257
0
    return NULL;
8258
0
  }
8259
3.99k
  return record;
8260
3.99k
}
8261
8262
/*
8263
 * This must be called in a backend process before creating WAL records
8264
 * (except in a standalone backend, which does StartupXLOG instead).  We need
8265
 * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
8266
 *
8267
 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
8268
 * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
8269
 * unnecessary however, since the postmaster itself never touches XLOG anyway.
8270
 */
8271
void
8272
InitXLOGAccess(void)
8273
16.2k
{
8274
16.2k
  XLogCtlInsert *Insert = &XLogCtl->Insert;
8275
8276
  /* ThisTimeLineID doesn't change so we need no lock to copy it */
8277
16.2k
  ThisTimeLineID = XLogCtl->ThisTimeLineID;
8278
16.2k
  Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
8279
8280
  /* set wal_segment_size */
8281
16.2k
  wal_segment_size = ControlFile->xlog_seg_size;
8282
8283
  /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
8284
16.2k
  (void) GetRedoRecPtr();
8285
  /* Also update our copy of doPageWrites. */
8286
16.2k
  doPageWrites = (Insert->fullPageWrites || 
Insert->forcePageWrites0
);
8287
8288
  /* Also initialize the working areas for constructing WAL records */
8289
16.2k
  InitXLogInsert();
8290
16.2k
}
8291
8292
/*
8293
 * Return the current Redo pointer from shared memory.
8294
 *
8295
 * As a side-effect, the local RedoRecPtr copy is updated.
8296
 */
8297
XLogRecPtr
8298
GetRedoRecPtr(void)
8299
20.9k
{
8300
20.9k
  XLogRecPtr  ptr;
8301
8302
  /*
8303
   * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8304
   * grabbed a WAL insertion lock to read the master copy, someone might
8305
   * update it just after we've released the lock.
8306
   */
8307
20.9k
  SpinLockAcquire(&XLogCtl->info_lck);
8308
20.9k
  ptr = XLogCtl->RedoRecPtr;
8309
20.9k
  SpinLockRelease(&XLogCtl->info_lck);
8310
8311
20.9k
  if (RedoRecPtr < ptr)
8312
6.19k
    RedoRecPtr = ptr;
8313
8314
20.9k
  return RedoRecPtr;
8315
20.9k
}
8316
8317
/*
8318
 * Return information needed to decide whether a modified block needs a
8319
 * full-page image to be included in the WAL record.
8320
 *
8321
 * The returned values are cached copies from backend-private memory, and
8322
 * possibly out-of-date.  XLogInsertRecord will re-check them against
8323
 * up-to-date values, while holding the WAL insert lock.
8324
 */
8325
void
8326
GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
8327
3.06k
{
8328
3.06k
  *RedoRecPtr_p = RedoRecPtr;
8329
3.06k
  *doPageWrites_p = doPageWrites;
8330
3.06k
}
8331
8332
/*
8333
 * GetInsertRecPtr -- Returns the current insert position.
8334
 *
8335
 * NOTE: The value *actually* returned is the position of the last full
8336
 * xlog page. It lags behind the real insert position by at most 1 page.
8337
 * For that, we don't need to scan through WAL insertion locks, and an
8338
 * approximation is enough for the current usage of this function.
8339
 */
8340
XLogRecPtr
8341
GetInsertRecPtr(void)
8342
546
{
8343
546
  XLogRecPtr  recptr;
8344
8345
546
  SpinLockAcquire(&XLogCtl->info_lck);
8346
546
  recptr = XLogCtl->LogwrtRqst.Write;
8347
546
  SpinLockRelease(&XLogCtl->info_lck);
8348
8349
546
  return recptr;
8350
546
}
8351
8352
/*
8353
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
8354
 * position known to be fsync'd to disk.
8355
 */
8356
XLogRecPtr
8357
GetFlushRecPtr(void)
8358
0
{
8359
0
  SpinLockAcquire(&XLogCtl->info_lck);
8360
0
  LogwrtResult = XLogCtl->LogwrtResult;
8361
0
  SpinLockRelease(&XLogCtl->info_lck);
8362
8363
0
  return LogwrtResult.Flush;
8364
0
}
8365
8366
/*
8367
 * GetLastImportantRecPtr -- Returns the LSN of the last important record
8368
 * inserted. All records not explicitly marked as unimportant are considered
8369
 * important.
8370
 *
8371
 * The LSN is determined by computing the maximum of
8372
 * WALInsertLocks[i].lastImportantAt.
8373
 */
8374
XLogRecPtr
8375
GetLastImportantRecPtr(void)
8376
2.57k
{
8377
2.57k
  XLogRecPtr  res = InvalidXLogRecPtr;
8378
2.57k
  int     i;
8379
8380
23.1k
  for (i = 0; i < NUM_XLOGINSERT_LOCKS; 
i++20.5k
)
8381
20.5k
  {
8382
20.5k
    XLogRecPtr  last_important;
8383
8384
    /*
8385
     * Need to take a lock to prevent torn reads of the LSN, which are
8386
     * possible on some of the supported platforms. WAL insert locks only
8387
     * support exclusive mode, so we have to use that.
8388
     */
8389
20.5k
    LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8390
20.5k
    last_important = WALInsertLocks[i].l.lastImportantAt;
8391
20.5k
    LWLockRelease(&WALInsertLocks[i].l.lock);
8392
8393
20.5k
    if (res < last_important)
8394
451
      res = last_important;
8395
20.5k
  }
8396
8397
2.57k
  return res;
8398
2.57k
}
8399
8400
/*
8401
 * Get the time and LSN of the last xlog segment switch
8402
 */
8403
pg_time_t
8404
GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8405
0
{
8406
0
  pg_time_t result;
8407
8408
  /* Need WALWriteLock, but shared lock is sufficient */
8409
0
  LWLockAcquire(WALWriteLock, LW_SHARED);
8410
0
  result = XLogCtl->lastSegSwitchTime;
8411
0
  *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8412
0
  LWLockRelease(WALWriteLock);
8413
8414
0
  return result;
8415
0
}
8416
8417
/*
8418
 * GetNextXidAndEpoch - get the current nextXid value and associated epoch
8419
 *
8420
 * This is exported for use by code that would like to have 64-bit XIDs.
8421
 * We don't really support such things, but all XIDs within the system
8422
 * can be presumed "close to" the result, and thus the epoch associated
8423
 * with them can be determined.
8424
 */
8425
void
8426
GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
8427
16
{
8428
16
  uint32    ckptXidEpoch;
8429
16
  TransactionId ckptXid;
8430
16
  TransactionId nextXid;
8431
8432
  /* Must read checkpoint info first, else have race condition */
8433
16
  SpinLockAcquire(&XLogCtl->info_lck);
8434
16
  ckptXidEpoch = XLogCtl->ckptXidEpoch;
8435
16
  ckptXid = XLogCtl->ckptXid;
8436
16
  SpinLockRelease(&XLogCtl->info_lck);
8437
8438
  /* Now fetch current nextXid */
8439
16
  nextXid = ReadNewTransactionId();
8440
8441
  /*
8442
   * nextXid is certainly logically later than ckptXid.  So if it's
8443
   * numerically less, it must have wrapped into the next epoch.
8444
   */
8445
16
  if (nextXid < ckptXid)
8446
0
    ckptXidEpoch++;
8447
8448
16
  *xid = nextXid;
8449
16
  *epoch = ckptXidEpoch;
8450
16
}
8451
8452
/*
8453
 * This must be called ONCE during postmaster or standalone-backend shutdown
8454
 */
8455
void
8456
ShutdownXLOG(int code, Datum arg)
8457
2.00k
{
8458
  /* Don't be chatty in standalone mode */
8459
2.00k
  ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8460
2.00k
      (errmsg("shutting down")));
8461
8462
  /*
8463
   * Signal walsenders to move to stopping state.
8464
   */
8465
2.00k
  WalSndInitStopping();
8466
8467
  /*
8468
   * Wait for WAL senders to be in stopping state.  This prevents commands
8469
   * from writing new WAL.
8470
   */
8471
2.00k
  WalSndWaitStopping();
8472
8473
2.00k
  if (RecoveryInProgress())
8474
0
    CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8475
2.00k
  else
8476
2.00k
  {
8477
    /*
8478
     * If archiving is enabled, rotate the last XLOG file so that all the
8479
     * remaining records are archived (postmaster wakes up the archiver
8480
     * process one more time at the end of shutdown). The checkpoint
8481
     * record will go to the next XLOG file and won't be archived (yet).
8482
     */
8483
2.00k
    if (XLogArchivingActive() && 
XLogArchiveCommandSet0
())
8484
0
      RequestXLogSwitch(false);
8485
8486
2.00k
    CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8487
2.00k
  }
8488
0
  ShutdownCLOG();
8489
2.00k
  ShutdownCommitTs();
8490
2.00k
  ShutdownSUBTRANS();
8491
2.00k
  ShutdownMultiXact();
8492
2.00k
}
8493
8494
/*
8495
 * Log start of a checkpoint.
8496
 */
8497
static void
8498
LogCheckpointStart(int flags, bool restartpoint)
8499
0
{
8500
0
  elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
8501
0
     restartpoint ? "restartpoint" : "checkpoint",
8502
0
     (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8503
0
     (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8504
0
     (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8505
0
     (flags & CHECKPOINT_FORCE) ? " force" : "",
8506
0
     (flags & CHECKPOINT_WAIT) ? " wait" : "",
8507
0
     (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
8508
0
     (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
8509
0
     (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
8510
0
}
8511
8512
/*
8513
 * Log end of a checkpoint.
8514
 */
8515
static void
8516
LogCheckpointEnd(bool restartpoint)
8517
2.35k
{
8518
2.35k
  long    write_secs,
8519
2.35k
        sync_secs,
8520
2.35k
        total_secs,
8521
2.35k
        longest_secs,
8522
2.35k
        average_secs;
8523
2.35k
  int     write_usecs,
8524
2.35k
        sync_usecs,
8525
2.35k
        total_usecs,
8526
2.35k
        longest_usecs,
8527
2.35k
        average_usecs;
8528
2.35k
  uint64    average_sync_time;
8529
8530
2.35k
  CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
8531
8532
2.35k
  TimestampDifference(CheckpointStats.ckpt_write_t,
8533
2.35k
            CheckpointStats.ckpt_sync_t,
8534
2.35k
            &write_secs, &write_usecs);
8535
8536
2.35k
  TimestampDifference(CheckpointStats.ckpt_sync_t,
8537
2.35k
            CheckpointStats.ckpt_sync_end_t,
8538
2.35k
            &sync_secs, &sync_usecs);
8539
8540
  /* Accumulate checkpoint timing summary data, in milliseconds. */
8541
2.35k
  BgWriterStats.m_checkpoint_write_time +=
8542
2.35k
    write_secs * 1000 + write_usecs / 1000;
8543
2.35k
  BgWriterStats.m_checkpoint_sync_time +=
8544
2.35k
    sync_secs * 1000 + sync_usecs / 1000;
8545
8546
  /*
8547
   * All of the published timing statistics are accounted for.  Only
8548
   * continue if a log message is to be written.
8549
   */
8550
2.35k
  if (!log_checkpoints)
8551
2.35k
    return;
8552
8553
0
  TimestampDifference(CheckpointStats.ckpt_start_t,
8554
0
            CheckpointStats.ckpt_end_t,
8555
0
            &total_secs, &total_usecs);
8556
8557
  /*
8558
   * Timing values returned from CheckpointStats are in microseconds.
8559
   * Convert to the second plus microsecond form that TimestampDifference
8560
   * returns for homogeneous printing.
8561
   */
8562
0
  longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
8563
0
  longest_usecs = CheckpointStats.ckpt_longest_sync -
8564
0
    (uint64) longest_secs * 1000000;
8565
8566
0
  average_sync_time = 0;
8567
0
  if (CheckpointStats.ckpt_sync_rels > 0)
8568
0
    average_sync_time = CheckpointStats.ckpt_agg_sync_time /
8569
0
      CheckpointStats.ckpt_sync_rels;
8570
0
  average_secs = (long) (average_sync_time / 1000000);
8571
0
  average_usecs = average_sync_time - (uint64) average_secs * 1000000;
8572
8573
0
  elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
8574
0
     "%d WAL file(s) added, %d removed, %d recycled; "
8575
0
     "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8576
0
     "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
8577
0
     "distance=%d kB, estimate=%d kB",
8578
0
     restartpoint ? "restartpoint" : "checkpoint",
8579
0
     CheckpointStats.ckpt_bufs_written,
8580
0
     (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8581
0
     CheckpointStats.ckpt_segs_added,
8582
0
     CheckpointStats.ckpt_segs_removed,
8583
0
     CheckpointStats.ckpt_segs_recycled,
8584
0
     write_secs, write_usecs / 1000,
8585
0
     sync_secs, sync_usecs / 1000,
8586
0
     total_secs, total_usecs / 1000,
8587
0
     CheckpointStats.ckpt_sync_rels,
8588
0
     longest_secs, longest_usecs / 1000,
8589
0
     average_secs, average_usecs / 1000,
8590
0
     (int) (PrevCheckPointDistance / 1024.0),
8591
0
     (int) (CheckPointDistanceEstimate / 1024.0));
8592
0
}
8593
8594
/*
8595
 * Update the estimate of distance between checkpoints.
8596
 *
8597
 * The estimate is used to calculate the number of WAL segments to keep
8598
 * preallocated, see XLOGFileSlop().
8599
 */
8600
static void
8601
UpdateCheckPointDistanceEstimate(uint64 nbytes)
8602
2.35k
{
8603
  /*
8604
   * To estimate the number of segments consumed between checkpoints, keep a
8605
   * moving average of the amount of WAL generated in previous checkpoint
8606
   * cycles. However, if the load is bursty, with quiet periods and busy
8607
   * periods, we want to cater for the peak load. So instead of a plain
8608
   * moving average, let the average decline slowly if the previous cycle
8609
   * used less WAL than estimated, but bump it up immediately if it used
8610
   * more.
8611
   *
8612
   * When checkpoints are triggered by max_wal_size, this should converge to
8613
   * CheckpointSegments * wal_segment_size,
8614
   *
8615
   * Note: This doesn't pay any attention to what caused the checkpoint.
8616
   * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8617
   * starting a base backup, are counted the same as those created
8618
   * automatically. The slow-decline will largely mask them out, if they are
8619
   * not frequent. If they are frequent, it seems reasonable to count them
8620
   * in as any others; if you issue a manual checkpoint every 5 minutes and
8621
   * never let a timed checkpoint happen, it makes sense to base the
8622
   * preallocation on that 5 minute interval rather than whatever
8623
   * checkpoint_timeout is set to.
8624
   */
8625
2.35k
  PrevCheckPointDistance = nbytes;
8626
2.35k
  if (CheckPointDistanceEstimate < nbytes)
8627
2.21k
    CheckPointDistanceEstimate = nbytes;
8628
139
  else
8629
139
    CheckPointDistanceEstimate =
8630
139
      (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8631
2.35k
}
8632
8633
/*
8634
 * Perform a checkpoint --- either during shutdown, or on-the-fly
8635
 *
8636
 * flags is a bitwise OR of the following:
8637
 *  CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8638
 *  CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8639
 *  CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8640
 *    ignoring checkpoint_completion_target parameter.
8641
 *  CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8642
 *    since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8643
 *    CHECKPOINT_END_OF_RECOVERY).
8644
 *  CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8645
 *
8646
 * Note: flags contains other bits, of interest here only for logging purposes.
8647
 * In particular note that this routine is synchronous and does not pay
8648
 * attention to CHECKPOINT_WAIT.
8649
 *
8650
 * If !shutdown then we are writing an online checkpoint. This is a very special
8651
 * kind of operation and WAL record because the checkpoint action occurs over
8652
 * a period of time yet logically occurs at just a single LSN. The logical
8653
 * position of the WAL record (redo ptr) is the same or earlier than the
8654
 * physical position. When we replay WAL we locate the checkpoint via its
8655
 * physical position then read the redo ptr and actually start replay at the
8656
 * earlier logical position. Note that we don't write *anything* to WAL at
8657
 * the logical position, so that location could be any other kind of WAL record.
8658
 * All of this mechanism allows us to continue working while we checkpoint.
8659
 * As a result, timing of actions is critical here and be careful to note that
8660
 * this function will likely take minutes to execute on a busy system.
8661
 */
8662
void
8663
CreateCheckPoint(int flags)
8664
2.57k
{
8665
2.57k
  bool    shutdown;
8666
2.57k
  CheckPoint  checkPoint;
8667
2.57k
  XLogRecPtr  recptr;
8668
2.57k
  XLogSegNo _logSegNo;
8669
2.57k
  XLogCtlInsert *Insert = &XLogCtl->Insert;
8670
2.57k
  uint32    freespace;
8671
2.57k
  XLogRecPtr  PriorRedoPtr;
8672
2.57k
  XLogRecPtr  curInsert;
8673
2.57k
  XLogRecPtr  last_important_lsn;
8674
2.57k
  VirtualTransactionId *vxids;
8675
2.57k
  int     nvxids;
8676
8677
  /*
8678
   * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8679
   * issued at a different time.
8680
   */
8681
2.57k
  if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8682
2.01k
    shutdown = true;
8683
562
  else
8684
562
    shutdown = false;
8685
8686
  /* sanity check */
8687
2.57k
  if (RecoveryInProgress() && 
(flags & 5
CHECKPOINT_END_OF_RECOVERY5
) == 0)
8688
0
    elog(ERROR, "can't create a checkpoint during recovery");
8689
8690
  /*
8691
   * Initialize InitXLogInsert working areas before entering the critical
8692
   * section.  Normally, this is done by the first call to
8693
   * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
8694
   * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
8695
   * done below in a critical section, and InitXLogInsert cannot be called
8696
   * in a critical section.
8697
   */
8698
2.57k
  InitXLogInsert();
8699
8700
  /*
8701
   * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8702
   * (This is just pro forma, since in the present system structure there is
8703
   * only one process that is allowed to issue checkpoints at any given
8704
   * time.)
8705
   */
8706
2.57k
  LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8707
8708
  /*
8709
   * Prepare to accumulate statistics.
8710
   *
8711
   * Note: because it is possible for log_checkpoints to change while a
8712
   * checkpoint proceeds, we always accumulate stats, even if
8713
   * log_checkpoints is currently off.
8714
   */
8715
2.57k
  MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8716
2.57k
  CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8717
8718
  /*
8719
   * Use a critical section to force system panic if we have trouble.
8720
   */
8721
2.57k
  START_CRIT_SECTION();
8722
8723
2.57k
  if (shutdown)
8724
2.01k
  {
8725
2.01k
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8726
2.01k
    ControlFile->state = DB_SHUTDOWNING;
8727
2.01k
    ControlFile->time = (pg_time_t) time(NULL);
8728
2.01k
    UpdateControlFile();
8729
2.01k
    LWLockRelease(ControlFileLock);
8730
2.01k
  }
8731
8732
  /*
8733
   * Let smgr prepare for checkpoint; this has to happen before we determine
8734
   * the REDO pointer.  Note that smgr must not do anything that'd have to
8735
   * be undone if we decide no checkpoint is needed.
8736
   */
8737
2.57k
  smgrpreckpt();
8738
8739
  /* Begin filling in the checkpoint WAL record */
8740
2.57k
  MemSet(&checkPoint, 0, sizeof(checkPoint));
8741
2.57k
  checkPoint.time = (pg_time_t) time(NULL);
8742
8743
  /*
8744
   * For Hot Standby, derive the oldestActiveXid before we fix the redo
8745
   * pointer. This allows us to begin accumulating changes to assemble our
8746
   * starting snapshot of locks and transactions.
8747
   */
8748
2.57k
  if (!shutdown && 
XLogStandbyInfoActive562
())
8749
562
    checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8750
2.01k
  else
8751
2.01k
    checkPoint.oldestActiveXid = InvalidTransactionId;
8752
8753
  /*
8754
   * Get location of last important record before acquiring insert locks (as
8755
   * GetLastImportantRecPtr() also locks WAL locks).
8756
   */
8757
2.57k
  last_important_lsn = GetLastImportantRecPtr();
8758
8759
  /*
8760
   * We must block concurrent insertions while examining insert state to
8761
   * determine the checkpoint REDO pointer.
8762
   */
8763
2.57k
  WALInsertLockAcquireExclusive();
8764
2.57k
  curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8765
8766
  /*
8767
   * If this isn't a shutdown or forced checkpoint, and if there has been no
8768
   * WAL activity requiring a checkpoint, skip it.  The idea here is to
8769
   * avoid inserting duplicate checkpoints when the system is idle.
8770
   */
8771
2.57k
  if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8772
2.57k
          CHECKPOINT_FORCE)) == 0)
8773
252
  {
8774
252
    if (last_important_lsn == ControlFile->checkPoint)
8775
222
    {
8776
222
      WALInsertLockRelease();
8777
222
      LWLockRelease(CheckpointLock);
8778
222
      END_CRIT_SECTION();
8779
222
      ereport(DEBUG1,
8780
222
          (errmsg("checkpoint skipped because system is idle")));
8781
222
      return;
8782
222
    }
8783
252
  }
8784
8785
  /*
8786
   * An end-of-recovery checkpoint is created before anyone is allowed to
8787
   * write WAL. To allow us to write the checkpoint record, temporarily
8788
   * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
8789
   * initialized, which we need here and in AdvanceXLInsertBuffer.)
8790
   */
8791
2.35k
  if (flags & CHECKPOINT_END_OF_RECOVERY)
8792
5
    LocalSetXLogInsertAllowed();
8793
8794
2.35k
  checkPoint.ThisTimeLineID = ThisTimeLineID;
8795
2.35k
  if (flags & CHECKPOINT_END_OF_RECOVERY)
8796
5
    checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8797
2.34k
  else
8798
2.34k
    checkPoint.PrevTimeLineID = ThisTimeLineID;
8799
8800
2.35k
  checkPoint.fullPageWrites = Insert->fullPageWrites;
8801
8802
  /*
8803
   * Compute new REDO record ptr = location of next XLOG record.
8804
   *
8805
   * NB: this is NOT necessarily where the checkpoint record itself will be,
8806
   * since other backends may insert more XLOG records while we're off doing
8807
   * the buffer flush work.  Those XLOG records are logically after the
8808
   * checkpoint, even though physically before it.  Got that?
8809
   */
8810
2.35k
  freespace = INSERT_FREESPACE(curInsert);
8811
2.35k
  if (freespace == 0)
8812
0
  {
8813
0
    if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
8814
0
      curInsert += SizeOfXLogLongPHD;
8815
0
    else
8816
0
      curInsert += SizeOfXLogShortPHD;
8817
0
  }
8818
2.35k
  checkPoint.redo = curInsert;
8819
8820
  /*
8821
   * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8822
   * must be done while holding all the insertion locks.
8823
   *
8824
   * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8825
   * pointing past where it really needs to point.  This is okay; the only
8826
   * consequence is that XLogInsert might back up whole buffers that it
8827
   * didn't really need to.  We can't postpone advancing RedoRecPtr because
8828
   * XLogInserts that happen while we are dumping buffers must assume that
8829
   * their buffer changes are not included in the checkpoint.
8830
   */
8831
2.35k
  RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
8832
8833
  /*
8834
   * Now we can release the WAL insertion locks, allowing other xacts to
8835
   * proceed while we are flushing disk buffers.
8836
   */
8837
2.35k
  WALInsertLockRelease();
8838
8839
  /* Update the info_lck-protected copy of RedoRecPtr as well */
8840
2.35k
  SpinLockAcquire(&XLogCtl->info_lck);
8841
2.35k
  XLogCtl->RedoRecPtr = checkPoint.redo;
8842
2.35k
  SpinLockRelease(&XLogCtl->info_lck);
8843
8844
  /*
8845
   * If enabled, log checkpoint start.  We postpone this until now so as not
8846
   * to log anything if we decided to skip the checkpoint.
8847
   */
8848
2.35k
  if (log_checkpoints)
8849
0
    LogCheckpointStart(flags, false);
8850
8851
2.35k
  TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8852
8853
  /*
8854
   * Get the other info we need for the checkpoint record.
8855
   *
8856
   * We don't need to save oldestClogXid in the checkpoint, it only matters
8857
   * for the short period in which clog is being truncated, and if we crash
8858
   * during that we'll redo the clog truncation and fix up oldestClogXid
8859
   * there.
8860
   */
8861
2.35k
  LWLockAcquire(XidGenLock, LW_SHARED);
8862
2.35k
  checkPoint.nextXid = ShmemVariableCache->nextXid;
8863
2.35k
  checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8864
2.35k
  checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8865
2.35k
  LWLockRelease(XidGenLock);
8866
8867
2.35k
  LWLockAcquire(CommitTsLock, LW_SHARED);
8868
2.35k
  checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
8869
2.35k
  checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
8870
2.35k
  LWLockRelease(CommitTsLock);
8871
8872
  /* Increase XID epoch if we've wrapped around since last checkpoint */
8873
2.35k
  checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8874
2.35k
  if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8875
0
    checkPoint.nextXidEpoch++;
8876
8877
2.35k
  LWLockAcquire(OidGenLock, LW_SHARED);
8878
2.35k
  checkPoint.nextOid = ShmemVariableCache->nextOid;
8879
2.35k
  if (!shutdown)
8880
340
    checkPoint.nextOid += ShmemVariableCache->oidCount;
8881
2.35k
  LWLockRelease(OidGenLock);
8882
8883
2.35k
  MultiXactGetCheckptMulti(shutdown,
8884
2.35k
               &checkPoint.nextMulti,
8885
2.35k
               &checkPoint.nextMultiOffset,
8886
2.35k
               &checkPoint.oldestMulti,
8887
2.35k
               &checkPoint.oldestMultiDB);
8888
8889
  /*
8890
   * Having constructed the checkpoint record, ensure all shmem disk buffers
8891
   * and commit-log buffers are flushed to disk.
8892
   *
8893
   * This I/O could fail for various reasons.  If so, we will fail to
8894
   * complete the checkpoint, but there is no reason to force a system
8895
   * panic. Accordingly, exit critical section while doing it.
8896
   */
8897
2.35k
  END_CRIT_SECTION();
8898
8899
  /*
8900
   * In some cases there are groups of actions that must all occur on one
8901
   * side or the other of a checkpoint record. Before flushing the
8902
   * checkpoint record we must explicitly wait for any backend currently
8903
   * performing those groups of actions.
8904
   *
8905
   * One example is end of transaction, so we must wait for any transactions
8906
   * that are currently in commit critical sections.  If an xact inserted
8907
   * its commit record into XLOG just before the REDO point, then a crash
8908
   * restart from the REDO point would not replay that record, which means
8909
   * that our flushing had better include the xact's update of pg_xact.  So
8910
   * we wait till he's out of his commit critical section before proceeding.
8911
   * See notes in RecordTransactionCommit().
8912
   *
8913
   * Because we've already released the insertion locks, this test is a bit
8914
   * fuzzy: it is possible that we will wait for xacts we didn't really need
8915
   * to wait for.  But the delay should be short and it seems better to make
8916
   * checkpoint take a bit longer than to hold off insertions longer than
8917
   * necessary. (In fact, the whole reason we have this issue is that xact.c
8918
   * does commit record XLOG insertion and clog update as two separate steps
8919
   * protected by different locks, but again that seems best on grounds of
8920
   * minimizing lock contention.)
8921
   *
8922
   * A transaction that has not yet set delayChkpt when we look cannot be at
8923
   * risk, since he's not inserted his commit record yet; and one that's
8924
   * already cleared it is not at risk either, since he's done fixing clog
8925
   * and we will correctly flush the update below.  So we cannot miss any
8926
   * xacts we need to wait for.
8927
   */
8928
2.35k
  vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
8929
2.35k
  if (nvxids > 0)
8930
0
  {
8931
0
    do
8932
0
    {
8933
0
      pg_usleep(10000L);  /* wait for 10 msec */
8934
0
    } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
8935
0
  }
8936
2.35k
  pfree(vxids);
8937
8938
2.35k
  CheckPointGuts(checkPoint.redo, flags);
8939
8940
  /*
8941
   * Take a snapshot of running transactions and write this to WAL. This
8942
   * allows us to reconstruct the state of running transactions during
8943
   * archive recovery, if required. Skip, if this info disabled.
8944
   *
8945
   * If we are shutting down, or Startup process is completing crash
8946
   * recovery we don't need to write running xact data.
8947
   */
8948
2.35k
  if (!shutdown && 
XLogStandbyInfoActive340
())
8949
340
    LogStandbySnapshot();
8950
8951
2.35k
  START_CRIT_SECTION();
8952
8953
  /*
8954
   * Now insert the checkpoint record into XLOG.
8955
   */
8956
2.35k
  XLogBeginInsert();
8957
2.35k
  XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
8958
2.35k
  recptr = XLogInsert(RM_XLOG_ID,
8959
2.35k
            shutdown ? 
XLOG_CHECKPOINT_SHUTDOWN2.01k
:
8960
2.35k
            
XLOG_CHECKPOINT_ONLINE340
);
8961
8962
2.35k
  XLogFlush(recptr);
8963
8964
  /*
8965
   * We mustn't write any new WAL after a shutdown checkpoint, or it will be
8966
   * overwritten at next startup.  No-one should even try, this just allows
8967
   * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
8968
   * to just temporarily disable writing until the system has exited
8969
   * recovery.
8970
   */
8971
2.35k
  if (shutdown)
8972
2.01k
  {
8973
2.01k
    if (flags & CHECKPOINT_END_OF_RECOVERY)
8974
5
      LocalXLogInsertAllowed = -1;  /* return to "check" state */
8975
2.00k
    else
8976
2.00k
      LocalXLogInsertAllowed = 0; /* never again write WAL */
8977
2.01k
  }
8978
8979
  /*
8980
   * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
8981
   * = end of actual checkpoint record.
8982
   */
8983
2.35k
  if (shutdown && 
checkPoint.redo != ProcLastRecPtr2.01k
)
8984
2.35k
    ereport(PANIC,
8985
2.35k
        (errmsg("concurrent write-ahead log activity while database system is shutting down")));
8986
8987
  /*
8988
   * Remember the prior checkpoint's redo ptr for
8989
   * UpdateCheckPointDistanceEstimate()
8990
   */
8991
2.35k
  PriorRedoPtr = ControlFile->checkPointCopy.redo;
8992
8993
  /*
8994
   * Update the control file.
8995
   */
8996
2.35k
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8997
2.35k
  if (shutdown)
8998
2.01k
    ControlFile->state = DB_SHUTDOWNED;
8999
2.35k
  ControlFile->checkPoint = ProcLastRecPtr;
9000
2.35k
  ControlFile->checkPointCopy = checkPoint;
9001
2.35k
  ControlFile->time = (pg_time_t) time(NULL);
9002
  /* crash recovery should always recover to the end of WAL */
9003
2.35k
  ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
9004
2.35k
  ControlFile->minRecoveryPointTLI = 0;
9005
9006
  /*
9007
   * Persist unloggedLSN value. It's reset on crash recovery, so this goes
9008
   * unused on non-shutdown checkpoints, but seems useful to store it always
9009
   * for debugging purposes.
9010
   */
9011
2.35k
  SpinLockAcquire(&XLogCtl->ulsn_lck);
9012
2.35k
  ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
9013
2.35k
  SpinLockRelease(&XLogCtl->ulsn_lck);
9014
9015
2.35k
  UpdateControlFile();
9016
2.35k
  LWLockRelease(ControlFileLock);
9017
9018
  /* Update shared-memory copy of checkpoint XID/epoch */
9019
2.35k
  SpinLockAcquire(&XLogCtl->info_lck);
9020
2.35k
  XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9021
2.35k
  XLogCtl->ckptXid = checkPoint.nextXid;
9022
2.35k
  SpinLockRelease(&XLogCtl->info_lck);
9023
9024
  /*
9025
   * We are now done with critical updates; no need for system panic if we
9026
   * have trouble while fooling with old log segments.
9027
   */
9028
2.35k
  END_CRIT_SECTION();
9029
9030
  /*
9031
   * Let smgr do post-checkpoint cleanup (eg, deleting old files).
9032
   */
9033
2.35k
  smgrpostckpt();
9034
9035
  /*
9036
   * Update the average distance between checkpoints if the prior checkpoint
9037
   * exists.
9038
   */
9039
2.35k
  if (PriorRedoPtr != InvalidXLogRecPtr)
9040
2.35k
    UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9041
9042
  /*
9043
   * Delete old log files, those no longer needed for last checkpoint to
9044
   * prevent the disk holding the xlog from growing full.
9045
   */
9046
2.35k
  XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9047
2.35k
  KeepLogSeg(recptr, &_logSegNo);
9048
2.35k
  _logSegNo--;
9049
2.35k
  RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
9050
9051
  /*
9052
   * Make more log segments if needed.  (Do this after recycling old log
9053
   * segments, since that may supply some of the needed files.)
9054
   */
9055
2.35k
  if (!shutdown)
9056
340
    PreallocXlogFiles(recptr);
9057
9058
  /*
9059
   * Truncate pg_subtrans if possible.  We can throw away all data before
9060
   * the oldest XMIN of any running transaction.  No future transaction will
9061
   * attempt to reference any pg_subtrans entry older than that (see Asserts
9062
   * in subtrans.c).  During recovery, though, we mustn't do this because
9063
   * StartupSUBTRANS hasn't been called yet.
9064
   */
9065
2.35k
  if (!RecoveryInProgress())
9066
2.34k
    TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9067
9068
  /* Real work is done, but log and update stats before releasing lock. */
9069
2.35k
  LogCheckpointEnd(false);
9070
9071
2.35k
  TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
9072
2.35k
                   NBuffers,
9073
2.35k
                   CheckpointStats.ckpt_segs_added,
9074
2.35k
                   CheckpointStats.ckpt_segs_removed,
9075
2.35k
                   CheckpointStats.ckpt_segs_recycled);
9076
9077
2.35k
  LWLockRelease(CheckpointLock);
9078
2.35k
}
9079
9080
/*
9081
 * Mark the end of recovery in WAL though without running a full checkpoint.
9082
 * We can expect that a restartpoint is likely to be in progress as we
9083
 * do this, though we are unwilling to wait for it to complete. So be
9084
 * careful to avoid taking the CheckpointLock anywhere here.
9085
 *
9086
 * CreateRestartPoint() allows for the case where recovery may end before
9087
 * the restartpoint completes so there is no concern of concurrent behaviour.
9088
 */
9089
static void
9090
CreateEndOfRecoveryRecord(void)
9091
0
{
9092
0
  xl_end_of_recovery xlrec;
9093
0
  XLogRecPtr  recptr;
9094
9095
  /* sanity check */
9096
0
  if (!RecoveryInProgress())
9097
0
    elog(ERROR, "can only be used to end recovery");
9098
9099
0
  xlrec.end_time = GetCurrentTimestamp();
9100
9101
0
  WALInsertLockAcquireExclusive();
9102
0
  xlrec.ThisTimeLineID = ThisTimeLineID;
9103
0
  xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
9104
0
  WALInsertLockRelease();
9105
9106
0
  LocalSetXLogInsertAllowed();
9107
9108
0
  START_CRIT_SECTION();
9109
9110
0
  XLogBeginInsert();
9111
0
  XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
9112
0
  recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
9113
9114
0
  XLogFlush(recptr);
9115
9116
  /*
9117
   * Update the control file so that crash recovery can follow the timeline
9118
   * changes to this point.
9119
   */
9120
0
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9121
0
  ControlFile->time = (pg_time_t) time(NULL);
9122
0
  ControlFile->minRecoveryPoint = recptr;
9123
0
  ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9124
0
  UpdateControlFile();
9125
0
  LWLockRelease(ControlFileLock);
9126
9127
0
  END_CRIT_SECTION();
9128
9129
0
  LocalXLogInsertAllowed = -1;  /* return to "check" state */
9130
0
}
9131
9132
/*
9133
 * Flush all data in shared memory to disk, and fsync
9134
 *
9135
 * This is the common code shared between regular checkpoints and
9136
 * recovery restartpoints.
9137
 */
9138
static void
9139
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
9140
2.35k
{
9141
2.35k
  CheckPointCLOG();
9142
2.35k
  CheckPointCommitTs();
9143
2.35k
  CheckPointSUBTRANS();
9144
2.35k
  CheckPointMultiXact();
9145
2.35k
  CheckPointPredicate();
9146
2.35k
  CheckPointRelationMap();
9147
2.35k
  CheckPointReplicationSlots();
9148
2.35k
  CheckPointSnapBuild();
9149
2.35k
  CheckPointLogicalRewriteHeap();
9150
2.35k
  CheckPointBuffers(flags); /* performs all required fsyncs */
9151
2.35k
  CheckPointReplicationOrigin();
9152
  /* We deliberately delay 2PC checkpointing as long as possible */
9153
2.35k
  CheckPointTwoPhase(checkPointRedo);
9154
2.35k
}
9155
9156
/*
9157
 * Save a checkpoint for recovery restart if appropriate
9158
 *
9159
 * This function is called each time a checkpoint record is read from XLOG.
9160
 * It must determine whether the checkpoint represents a safe restartpoint or
9161
 * not.  If so, the checkpoint record is stashed in shared memory so that
9162
 * CreateRestartPoint can consult it.  (Note that the latter function is
9163
 * executed by the checkpointer, while this one will be executed by the
9164
 * startup process.)
9165
 */
9166
static void
9167
RecoveryRestartPoint(const CheckPoint *checkPoint)
9168
1
{
9169
  /*
9170
   * Also refrain from creating a restartpoint if we have seen any
9171
   * references to non-existent pages. Restarting recovery from the
9172
   * restartpoint would not see the references, so we would lose the
9173
   * cross-check that the pages belonged to a relation that was dropped
9174
   * later.
9175
   */
9176
1
  if (XLogHaveInvalidPages())
9177
0
  {
9178
0
    elog(trace_recovery(DEBUG2),
9179
0
       "could not record restart point at %X/%X because there "
9180
0
       "are unresolved references to invalid pages",
9181
0
       (uint32) (checkPoint->redo >> 32),
9182
0
       (uint32) checkPoint->redo);
9183
0
    return;
9184
0
  }
9185
9186
  /*
9187
   * Copy the checkpoint record to shared memory, so that checkpointer can
9188
   * work out the next time it wants to perform a restartpoint.
9189
   */
9190
1
  SpinLockAcquire(&XLogCtl->info_lck);
9191
1
  XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
9192
1
  XLogCtl->lastCheckPointEndPtr = EndRecPtr;
9193
1
  XLogCtl->lastCheckPoint = *checkPoint;
9194
1
  SpinLockRelease(&XLogCtl->info_lck);
9195
1
}
9196
9197
/*
9198
 * Establish a restartpoint if possible.
9199
 *
9200
 * This is similar to CreateCheckPoint, but is used during WAL recovery
9201
 * to establish a point from which recovery can roll forward without
9202
 * replaying the entire recovery log.
9203
 *
9204
 * Returns true if a new restartpoint was established. We can only establish
9205
 * a restartpoint if we have replayed a safe checkpoint record since last
9206
 * restartpoint.
9207
 */
9208
bool
9209
CreateRestartPoint(int flags)
9210
0
{
9211
0
  XLogRecPtr  lastCheckPointRecPtr;
9212
0
  XLogRecPtr  lastCheckPointEndPtr;
9213
0
  CheckPoint  lastCheckPoint;
9214
0
  XLogRecPtr  PriorRedoPtr;
9215
0
  XLogRecPtr  receivePtr;
9216
0
  XLogRecPtr  replayPtr;
9217
0
  TimeLineID  replayTLI;
9218
0
  XLogRecPtr  endptr;
9219
0
  XLogSegNo _logSegNo;
9220
0
  TimestampTz xtime;
9221
9222
  /*
9223
   * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
9224
   * happens at a time.
9225
   */
9226
0
  LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
9227
9228
  /* Get a local copy of the last safe checkpoint record. */
9229
0
  SpinLockAcquire(&XLogCtl->info_lck);
9230
0
  lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
9231
0
  lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
9232
0
  lastCheckPoint = XLogCtl->lastCheckPoint;
9233
0
  SpinLockRelease(&XLogCtl->info_lck);
9234
9235
  /*
9236
   * Check that we're still in recovery mode. It's ok if we exit recovery
9237
   * mode after this check, the restart point is valid anyway.
9238
   */
9239
0
  if (!RecoveryInProgress())
9240
0
  {
9241
0
    ereport(DEBUG2,
9242
0
        (errmsg("skipping restartpoint, recovery has already ended")));
9243
0
    LWLockRelease(CheckpointLock);
9244
0
    return false;
9245
0
  }
9246
9247
  /*
9248
   * If the last checkpoint record we've replayed is already our last
9249
   * restartpoint, we can't perform a new restart point. We still update
9250
   * minRecoveryPoint in that case, so that if this is a shutdown restart
9251
   * point, we won't start up earlier than before. That's not strictly
9252
   * necessary, but when hot standby is enabled, it would be rather weird if
9253
   * the database opened up for read-only connections at a point-in-time
9254
   * before the last shutdown. Such time travel is still possible in case of
9255
   * immediate shutdown, though.
9256
   *
9257
   * We don't explicitly advance minRecoveryPoint when we do create a
9258
   * restartpoint. It's assumed that flushing the buffers will do that as a
9259
   * side-effect.
9260
   */
9261
0
  if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
9262
0
    lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
9263
0
  {
9264
0
    ereport(DEBUG2,
9265
0
        (errmsg("skipping restartpoint, already performed at %X/%X",
9266
0
            (uint32) (lastCheckPoint.redo >> 32),
9267
0
            (uint32) lastCheckPoint.redo)));
9268
9269
0
    UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
9270
0
    if (flags & CHECKPOINT_IS_SHUTDOWN)
9271
0
    {
9272
0
      LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9273
0
      ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9274
0
      ControlFile->time = (pg_time_t) time(NULL);
9275
0
      UpdateControlFile();
9276
0
      LWLockRelease(ControlFileLock);
9277
0
    }
9278
0
    LWLockRelease(CheckpointLock);
9279
0
    return false;
9280
0
  }
9281
9282
  /*
9283
   * Update the shared RedoRecPtr so that the startup process can calculate
9284
   * the number of segments replayed since last restartpoint, and request a
9285
   * restartpoint if it exceeds CheckPointSegments.
9286
   *
9287
   * Like in CreateCheckPoint(), hold off insertions to update it, although
9288
   * during recovery this is just pro forma, because no WAL insertions are
9289
   * happening.
9290
   */
9291
0
  WALInsertLockAcquireExclusive();
9292
0
  RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
9293
0
  WALInsertLockRelease();
9294
9295
  /* Also update the info_lck-protected copy */
9296
0
  SpinLockAcquire(&XLogCtl->info_lck);
9297
0
  XLogCtl->RedoRecPtr = lastCheckPoint.redo;
9298
0
  SpinLockRelease(&XLogCtl->info_lck);
9299
9300
  /*
9301
   * Prepare to accumulate statistics.
9302
   *
9303
   * Note: because it is possible for log_checkpoints to change while a
9304
   * checkpoint proceeds, we always accumulate stats, even if
9305
   * log_checkpoints is currently off.
9306
   */
9307
0
  MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
9308
0
  CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
9309
9310
0
  if (log_checkpoints)
9311
0
    LogCheckpointStart(flags, true);
9312
9313
0
  CheckPointGuts(lastCheckPoint.redo, flags);
9314
9315
  /*
9316
   * Remember the prior checkpoint's redo ptr for
9317
   * UpdateCheckPointDistanceEstimate()
9318
   */
9319
0
  PriorRedoPtr = ControlFile->checkPointCopy.redo;
9320
9321
  /*
9322
   * Update pg_control, using current time.  Check that it still shows
9323
   * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
9324
   * this is a quick hack to make sure nothing really bad happens if somehow
9325
   * we get here after the end-of-recovery checkpoint.
9326
   */
9327
0
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9328
0
  if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
9329
0
    ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
9330
0
  {
9331
0
    ControlFile->checkPoint = lastCheckPointRecPtr;
9332
0
    ControlFile->checkPointCopy = lastCheckPoint;
9333
0
    ControlFile->time = (pg_time_t) time(NULL);
9334
9335
    /*
9336
     * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
9337
     * this will have happened already while writing out dirty buffers,
9338
     * but not necessarily - e.g. because no buffers were dirtied.  We do
9339
     * this because a non-exclusive base backup uses minRecoveryPoint to
9340
     * determine which WAL files must be included in the backup, and the
9341
     * file (or files) containing the checkpoint record must be included,
9342
     * at a minimum. Note that for an ordinary restart of recovery there's
9343
     * no value in having the minimum recovery point any earlier than this
9344
     * anyway, because redo will begin just after the checkpoint record.
9345
     */
9346
0
    if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
9347
0
    {
9348
0
      ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
9349
0
      ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
9350
9351
      /* update local copy */
9352
0
      minRecoveryPoint = ControlFile->minRecoveryPoint;
9353
0
      minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9354
0
    }
9355
0
    if (flags & CHECKPOINT_IS_SHUTDOWN)
9356
0
      ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9357
0
    UpdateControlFile();
9358
0
  }
9359
0
  LWLockRelease(ControlFileLock);
9360
9361
  /*
9362
   * Update the average distance between checkpoints/restartpoints if the
9363
   * prior checkpoint exists.
9364
   */
9365
0
  if (PriorRedoPtr != InvalidXLogRecPtr)
9366
0
    UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9367
9368
  /*
9369
   * Delete old log files, those no longer needed for last restartpoint to
9370
   * prevent the disk holding the xlog from growing full.
9371
   */
9372
0
  XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9373
9374
  /*
9375
   * Retreat _logSegNo using the current end of xlog replayed or received,
9376
   * whichever is later.
9377
   */
9378
0
  receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
9379
0
  replayPtr = GetXLogReplayRecPtr(&replayTLI);
9380
0
  endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
9381
0
  KeepLogSeg(endptr, &_logSegNo);
9382
0
  _logSegNo--;
9383
9384
  /*
9385
   * Try to recycle segments on a useful timeline. If we've been promoted
9386
   * since the beginning of this restartpoint, use the new timeline chosen
9387
   * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
9388
   * case). If we're still in recovery, use the timeline we're currently
9389
   * replaying.
9390
   *
9391
   * There is no guarantee that the WAL segments will be useful on the
9392
   * current timeline; if recovery proceeds to a new timeline right after
9393
   * this, the pre-allocated WAL segments on this timeline will not be used,
9394
   * and will go wasted until recycled on the next restartpoint. We'll live
9395
   * with that.
9396
   */
9397
0
  if (RecoveryInProgress())
9398
0
    ThisTimeLineID = replayTLI;
9399
9400
0
  RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);
9401
9402
  /*
9403
   * Make more log segments if needed.  (Do this after recycling old log
9404
   * segments, since that may supply some of the needed files.)
9405
   */
9406
0
  PreallocXlogFiles(endptr);
9407
9408
  /*
9409
   * ThisTimeLineID is normally not set when we're still in recovery.
9410
   * However, recycling/preallocating segments above needed ThisTimeLineID
9411
   * to determine which timeline to install the segments on. Reset it now,
9412
   * to restore the normal state of affairs for debugging purposes.
9413
   */
9414
0
  if (RecoveryInProgress())
9415
0
    ThisTimeLineID = 0;
9416
9417
  /*
9418
   * Truncate pg_subtrans if possible.  We can throw away all data before
9419
   * the oldest XMIN of any running transaction.  No future transaction will
9420
   * attempt to reference any pg_subtrans entry older than that (see Asserts
9421
   * in subtrans.c).  When hot standby is disabled, though, we mustn't do
9422
   * this because StartupSUBTRANS hasn't been called yet.
9423
   */
9424
0
  if (EnableHotStandby)
9425
0
    TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9426
9427
  /* Real work is done, but log and update before releasing lock. */
9428
0
  LogCheckpointEnd(true);
9429
9430
0
  xtime = GetLatestXTime();
9431
0
  ereport((log_checkpoints ? LOG : DEBUG2),
9432
0
      (errmsg("recovery restart point at %X/%X",
9433
0
          (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
9434
0
       xtime ? errdetail("Last completed transaction was at log time %s.",
9435
0
                 timestamptz_to_str(xtime)) : 0));
9436
9437
0
  LWLockRelease(CheckpointLock);
9438
9439
  /*
9440
   * Finally, execute archive_cleanup_command, if any.
9441
   */
9442
0
  if (XLogCtl->archiveCleanupCommand[0])
9443
0
    ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
9444
0
                 "archive_cleanup_command",
9445
0
                 false);
9446
9447
0
  return true;
9448
0
}
9449
9450
/*
9451
 * Retreat *logSegNo to the last segment that we need to retain because of
9452
 * either wal_keep_segments or replication slots.
9453
 *
9454
 * This is calculated by subtracting wal_keep_segments from the given xlog
9455
 * location, recptr and by making sure that that result is below the
9456
 * requirement of replication slots.
9457
 */
9458
static void
9459
KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9460
2.35k
{
9461
2.35k
  XLogSegNo segno;
9462
2.35k
  XLogRecPtr  keep;
9463
9464
2.35k
  XLByteToSeg(recptr, segno, wal_segment_size);
9465
2.35k
  keep = XLogGetReplicationSlotMinimumLSN();
9466
9467
  /* compute limit for wal_keep_segments first */
9468
2.35k
  if (wal_keep_segments > 0)
9469
0
  {
9470
    /* avoid underflow, don't go below 1 */
9471
0
    if (segno <= wal_keep_segments)
9472
0
      segno = 1;
9473
0
    else
9474
0
      segno = segno - wal_keep_segments;
9475
0
  }
9476
9477
  /* then check whether slots limit removal further */
9478
2.35k
  if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
9479
0
  {
9480
0
    XLogSegNo slotSegNo;
9481
9482
0
    XLByteToSeg(keep, slotSegNo, wal_segment_size);
9483
9484
0
    if (slotSegNo <= 0)
9485
0
      segno = 1;
9486
0
    else if (slotSegNo < segno)
9487
0
      segno = slotSegNo;
9488
0
  }
9489
9490
  /* don't delete WAL segments newer than the calculated segment */
9491
2.35k
  if (segno < *logSegNo)
9492
0
    *logSegNo = segno;
9493
2.35k
}
9494
9495
/*
9496
 * Write a NEXTOID log record
9497
 */
9498
void
9499
XLogPutNextOid(Oid nextOid)
9500
0
{
9501
0
  XLogBeginInsert();
9502
0
  XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9503
0
  (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9504
9505
  /*
9506
   * We need not flush the NEXTOID record immediately, because any of the
9507
   * just-allocated OIDs could only reach disk as part of a tuple insert or
9508
   * update that would have its own XLOG record that must follow the NEXTOID
9509
   * record.  Therefore, the standard buffer LSN interlock applied to those
9510
   * records will ensure no such OID reaches disk before the NEXTOID record
9511
   * does.
9512
   *
9513
   * Note, however, that the above statement only covers state "within" the
9514
   * database.  When we use a generated OID as a file or directory name, we
9515
   * are in a sense violating the basic WAL rule, because that filesystem
9516
   * change may reach disk before the NEXTOID WAL record does.  The impact
9517
   * of this is that if a database crash occurs immediately afterward, we
9518
   * might after restart re-generate the same OID and find that it conflicts
9519
   * with the leftover file or directory.  But since for safety's sake we
9520
   * always loop until finding a nonconflicting filename, this poses no real
9521
   * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9522
   */
9523
0
}
9524
9525
/*
9526
 * Write an XLOG SWITCH record.
9527
 *
9528
 * Here we just blindly issue an XLogInsert request for the record.
9529
 * All the magic happens inside XLogInsert.
9530
 *
9531
 * The return value is either the end+1 address of the switch record,
9532
 * or the end+1 address of the prior segment if we did not need to
9533
 * write a switch record because we are already at segment start.
9534
 */
9535
XLogRecPtr
9536
RequestXLogSwitch(bool mark_unimportant)
9537
0
{
9538
0
  XLogRecPtr  RecPtr;
9539
9540
  /* XLOG SWITCH has no data */
9541
0
  XLogBeginInsert();
9542
9543
0
  if (mark_unimportant)
9544
0
    XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
9545
0
  RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9546
9547
0
  return RecPtr;
9548
0
}
9549
9550
/*
9551
 * Write a RESTORE POINT record
9552
 */
9553
XLogRecPtr
9554
XLogRestorePoint(const char *rpName)
9555
0
{
9556
0
  XLogRecPtr  RecPtr;
9557
0
  xl_restore_point xlrec;
9558
9559
0
  xlrec.rp_time = GetCurrentTimestamp();
9560
0
  strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9561
9562
0
  XLogBeginInsert();
9563
0
  XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
9564
9565
0
  RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9566
9567
0
  ereport(LOG,
9568
0
      (errmsg("restore point \"%s\" created at %X/%X",
9569
0
          rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9570
9571
0
  return RecPtr;
9572
0
}
9573
9574
/*
9575
 * Check if any of the GUC parameters that are critical for hot standby
9576
 * have changed, and update the value in pg_control file if necessary.
9577
 */
9578
static void
XLogReportParameters(void)
{
  /*
   * Compare every hot-standby-critical GUC against the copy stored in
   * pg_control; do nothing at all if none of them changed.
   */
  if (wal_level != ControlFile->wal_level ||
    wal_log_hints != ControlFile->wal_log_hints ||
    MaxConnections != ControlFile->MaxConnections ||
    max_worker_processes != ControlFile->max_worker_processes ||
    max_prepared_xacts != ControlFile->max_prepared_xacts ||
    max_locks_per_xact != ControlFile->max_locks_per_xact ||
    track_commit_timestamp != ControlFile->track_commit_timestamp)
  {
    /*
     * The change in number of backend slots doesn't need to be WAL-logged
     * if archiving is not enabled, as you can't start archive recovery
     * with wal_level=minimal anyway. We don't really care about the
     * values in pg_control either if wal_level=minimal, but seems better
     * to keep them up-to-date to avoid confusion.
     */
    if (wal_level != ControlFile->wal_level || XLogIsNeeded())
    {
      xl_parameter_change xlrec;
      XLogRecPtr  recptr;

      /* Snapshot the new parameter values into the WAL record. */
      xlrec.MaxConnections = MaxConnections;
      xlrec.max_worker_processes = max_worker_processes;
      xlrec.max_prepared_xacts = max_prepared_xacts;
      xlrec.max_locks_per_xact = max_locks_per_xact;
      xlrec.wal_level = wal_level;
      xlrec.wal_log_hints = wal_log_hints;
      xlrec.track_commit_timestamp = track_commit_timestamp;

      XLogBeginInsert();
      XLogRegisterData((char *) &xlrec, sizeof(xlrec));

      /*
       * Flush the record before updating pg_control below, so that a
       * standby replaying WAL sees the change no later than we record
       * it locally.
       */
      recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
      XLogFlush(recptr);
    }

    /* Now update the in-memory control file copy and persist it. */
    ControlFile->MaxConnections = MaxConnections;
    ControlFile->max_worker_processes = max_worker_processes;
    ControlFile->max_prepared_xacts = max_prepared_xacts;
    ControlFile->max_locks_per_xact = max_locks_per_xact;
    ControlFile->wal_level = wal_level;
    ControlFile->wal_log_hints = wal_log_hints;
    ControlFile->track_commit_timestamp = track_commit_timestamp;
    UpdateControlFile();
  }
}
9626
9627
/*
9628
 * Update full_page_writes in shared memory, and write an
9629
 * XLOG_FPW_CHANGE record if necessary.
9630
 *
9631
 * Note: this function assumes there is no other process running
9632
 * concurrently that could update it.
9633
 */
9634
void
UpdateFullPageWrites(void)
{
  XLogCtlInsert *Insert = &XLogCtl->Insert;
  bool    recoveryInProgress;

  /*
   * Do nothing if full_page_writes has not been changed.
   *
   * It's safe to check the shared full_page_writes without the lock,
   * because we assume that there is no concurrently running process which
   * can update it.
   */
  if (fullPageWrites == Insert->fullPageWrites)
    return;

  /*
   * Perform this outside critical section so that the WAL insert
   * initialization done by RecoveryInProgress() doesn't trigger an
   * assertion failure.
   */
  recoveryInProgress = RecoveryInProgress();

  START_CRIT_SECTION();

  /*
   * It's always safe to take full page images, even when not strictly
   * required, but not the other round. So if we're setting full_page_writes
   * to true, first set it true and then write the WAL record. If we're
   * setting it to false, first write the WAL record and then set the global
   * flag.
   */
  if (fullPageWrites)
  {
    /* Enabling: flip the shared flag BEFORE logging the change. */
    WALInsertLockAcquireExclusive();
    Insert->fullPageWrites = true;
    WALInsertLockRelease();
  }

  /*
   * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
   * full_page_writes during archive recovery, if required.
   */
  if (XLogStandbyInfoActive() && !recoveryInProgress)
  {
    XLogBeginInsert();
    XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));

    XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
  }

  if (!fullPageWrites)
  {
    /* Disabling: flip the shared flag only AFTER logging the change. */
    WALInsertLockAcquireExclusive();
    Insert->fullPageWrites = false;
    WALInsertLockRelease();
  }
  END_CRIT_SECTION();
}
9693
9694
/*
9695
 * Check that it's OK to switch to new timeline during recovery.
9696
 *
9697
 * 'lsn' is the address of the shutdown checkpoint record we're about to
9698
 * replay. (Currently, timeline can only change at a shutdown checkpoint).
9699
 */
9700
static void
checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
{
  /* Check that the record agrees on what the current (old) timeline is */
  if (prevTLI != ThisTimeLineID)
    ereport(PANIC,
        (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
            prevTLI, ThisTimeLineID)));

  /*
   * The new timeline better be in the list of timelines we expect to see,
   * according to the timeline history. It should also not decrease.
   */
  if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
    ereport(PANIC,
        (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
            newTLI, ThisTimeLineID)));

  /*
   * If we have not yet reached min recovery point, and we're about to
   * switch to a timeline greater than the timeline of the min recovery
   * point: trouble. After switching to the new timeline, we could not
   * possibly visit the min recovery point on the correct timeline anymore.
   * This can happen if there is a newer timeline in the archive that
   * branched before the timeline the min recovery point is on, and you
   * attempt to do PITR to the new timeline.
   */
  if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
    lsn < minRecoveryPoint &&
    newTLI > minRecoveryPointTLI)
    ereport(PANIC,
        (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
            newTLI,
            (uint32) (minRecoveryPoint >> 32),
            (uint32) minRecoveryPoint,
            minRecoveryPointTLI)));

  /* Looks good */
}
9739
9740
/*
9741
 * XLOG resource manager's routines
9742
 *
9743
 * Definitions of info values are in include/catalog/pg_control.h, though
9744
 * not all record types are related to control file updates.
9745
 */
9746
void
xlog_redo(XLogReaderState *record)
{
  uint8   info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
  XLogRecPtr  lsn = record->EndRecPtr;

  /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
  Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
       !XLogRecHasAnyBlockRefs(record));

  if (info == XLOG_NEXTOID)
  {
    Oid     nextOid;

    /*
     * We used to try to take the maximum of ShmemVariableCache->nextOid
     * and the recorded nextOid, but that fails if the OID counter wraps
     * around.  Since no OID allocation should be happening during replay
     * anyway, better to just believe the record exactly.  We still take
     * OidGenLock while setting the variable, just in case.
     */
    memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
    LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    ShmemVariableCache->nextOid = nextOid;
    ShmemVariableCache->oidCount = 0;
    LWLockRelease(OidGenLock);
  }
  else if (info == XLOG_CHECKPOINT_SHUTDOWN)
  {
    CheckPoint  checkPoint;

    memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    /* In a SHUTDOWN checkpoint, believe the counters exactly */
    LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    ShmemVariableCache->nextXid = checkPoint.nextXid;
    LWLockRelease(XidGenLock);
    LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    ShmemVariableCache->nextOid = checkPoint.nextOid;
    ShmemVariableCache->oidCount = 0;
    LWLockRelease(OidGenLock);
    MultiXactSetNextMXact(checkPoint.nextMulti,
                checkPoint.nextMultiOffset);

    /* NB: this may truncate multixact SLRU segments on disk. */
    MultiXactAdvanceOldest(checkPoint.oldestMulti,
                 checkPoint.oldestMultiDB);

    /*
     * No need to set oldestClogXid here as well; it'll be set when we
     * redo an xl_clog_truncate if it changed since initialization.
     */
    SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);

    /*
     * If we see a shutdown checkpoint while waiting for an end-of-backup
     * record, the backup was canceled and the end-of-backup record will
     * never arrive.
     */
    if (ArchiveRecoveryRequested &&
      !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
      XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
      ereport(PANIC,
          (errmsg("online backup was canceled, recovery cannot continue")));

    /*
     * If we see a shutdown checkpoint, we know that nothing was running
     * on the master at this point. So fake-up an empty running-xacts
     * record and use that here and now. Recover additional standby state
     * for prepared transactions.
     */
    if (standbyState >= STANDBY_INITIALIZED)
    {
      TransactionId *xids;
      int     nxids;
      TransactionId oldestActiveXID;
      TransactionId latestCompletedXid;
      RunningTransactionsData running;

      oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);

      /*
       * Construct a RunningTransactions snapshot representing a shut
       * down server, with only prepared transactions still alive. We're
       * never overflowed at this point because all subxids are listed
       * with their parent prepared transactions.
       */
      running.xcnt = nxids;
      running.subxcnt = 0;
      running.subxid_overflow = false;
      running.nextXid = checkPoint.nextXid;
      running.oldestRunningXid = oldestActiveXID;
      latestCompletedXid = checkPoint.nextXid;
      TransactionIdRetreat(latestCompletedXid);
      Assert(TransactionIdIsNormal(latestCompletedXid));
      running.latestCompletedXid = latestCompletedXid;
      running.xids = xids;

      ProcArrayApplyRecoveryInfo(&running);

      StandbyRecoverPreparedTransactions();
    }

    /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
    ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

    /* Update shared-memory copy of checkpoint XID/epoch */
    SpinLockAcquire(&XLogCtl->info_lck);
    XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
    XLogCtl->ckptXid = checkPoint.nextXid;
    SpinLockRelease(&XLogCtl->info_lck);

    /*
     * We should've already switched to the new TLI before replaying this
     * record.
     */
    if (checkPoint.ThisTimeLineID != ThisTimeLineID)
      ereport(PANIC,
          (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
              checkPoint.ThisTimeLineID, ThisTimeLineID)));

    RecoveryRestartPoint(&checkPoint);
  }
  else if (info == XLOG_CHECKPOINT_ONLINE)
  {
    CheckPoint  checkPoint;

    memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    /* In an ONLINE checkpoint, treat the XID counter as a minimum */
    LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
                  checkPoint.nextXid))
      ShmemVariableCache->nextXid = checkPoint.nextXid;
    LWLockRelease(XidGenLock);

    /*
     * We ignore the nextOid counter in an ONLINE checkpoint, preferring
     * to track OID assignment through XLOG_NEXTOID records.  The nextOid
     * counter is from the start of the checkpoint and might well be stale
     * compared to later XLOG_NEXTOID records.  We could try to take the
     * maximum of the nextOid counter and our latest value, but since
     * there's no particular guarantee about the speed with which the OID
     * counter wraps around, that's a risky thing to do.  In any case,
     * users of the nextOid counter are required to avoid assignment of
     * duplicates, so that a somewhat out-of-date value should be safe.
     */

    /* Handle multixact */
    MultiXactAdvanceNextMXact(checkPoint.nextMulti,
                  checkPoint.nextMultiOffset);

    /*
     * NB: This may perform multixact truncation when replaying WAL
     * generated by an older primary.
     */
    MultiXactAdvanceOldest(checkPoint.oldestMulti,
                 checkPoint.oldestMultiDB);
    if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
                  checkPoint.oldestXid))
      SetTransactionIdLimit(checkPoint.oldestXid,
                  checkPoint.oldestXidDB);
    /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
    ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

    /* Update shared-memory copy of checkpoint XID/epoch */
    SpinLockAcquire(&XLogCtl->info_lck);
    XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
    XLogCtl->ckptXid = checkPoint.nextXid;
    SpinLockRelease(&XLogCtl->info_lck);

    /* TLI should not change in an on-line checkpoint */
    if (checkPoint.ThisTimeLineID != ThisTimeLineID)
      ereport(PANIC,
          (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
              checkPoint.ThisTimeLineID, ThisTimeLineID)));

    RecoveryRestartPoint(&checkPoint);
  }
  else if (info == XLOG_END_OF_RECOVERY)
  {
    xl_end_of_recovery xlrec;

    memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));

    /*
     * For Hot Standby, we could treat this like a Shutdown Checkpoint,
     * but this case is rarer and harder to test, so the benefit doesn't
     * outweigh the potential extra cost of maintenance.
     */

    /*
     * We should've already switched to the new TLI before replaying this
     * record.
     */
    if (xlrec.ThisTimeLineID != ThisTimeLineID)
      ereport(PANIC,
          (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
              xlrec.ThisTimeLineID, ThisTimeLineID)));
  }
  else if (info == XLOG_NOOP)
  {
    /* nothing to do here */
  }
  else if (info == XLOG_SWITCH)
  {
    /* nothing to do here */
  }
  else if (info == XLOG_RESTORE_POINT)
  {
    /* nothing to do here */
  }
  else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
  {
    Buffer    buffer;

    /*
     * Full-page image (FPI) records contain nothing else but a backup
     * block. The block reference must include a full-page image -
     * otherwise there would be no point in this record.
     *
     * No recovery conflicts are generated by these generic records - if a
     * resource manager needs to generate conflicts, it has to define a
     * separate WAL record type and redo routine.
     *
     * XLOG_FPI_FOR_HINT records are generated when a page needs to be
     * WAL- logged because of a hint bit update. They are only generated
     * when checksums are enabled. There is no difference in handling
     * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
     * code just to distinguish them for statistics purposes.
     */
    if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED)
      elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
    UnlockReleaseBuffer(buffer);
  }
  else if (info == XLOG_BACKUP_END)
  {
    XLogRecPtr  startpoint;

    memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

    /* Only act if this matches the backup we are waiting to finish. */
    if (ControlFile->backupStartPoint == startpoint)
    {
      /*
       * We have reached the end of base backup, the point where
       * pg_stop_backup() was done. The data on disk is now consistent.
       * Reset backupStartPoint, and update minRecoveryPoint to make
       * sure we don't allow starting up at an earlier point even if
       * recovery is stopped and restarted soon after this.
       */
      elog(DEBUG1, "end of backup reached");

      LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

      if (ControlFile->minRecoveryPoint < lsn)
      {
        ControlFile->minRecoveryPoint = lsn;
        ControlFile->minRecoveryPointTLI = ThisTimeLineID;
      }
      ControlFile->backupStartPoint = InvalidXLogRecPtr;
      ControlFile->backupEndRequired = false;
      UpdateControlFile();

      LWLockRelease(ControlFileLock);
    }
  }
  else if (info == XLOG_PARAMETER_CHANGE)
  {
    xl_parameter_change xlrec;

    /* Update our copy of the parameters in pg_control */
    memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));

    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    ControlFile->MaxConnections = xlrec.MaxConnections;
    ControlFile->max_worker_processes = xlrec.max_worker_processes;
    ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
    ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
    ControlFile->wal_level = xlrec.wal_level;
    ControlFile->wal_log_hints = xlrec.wal_log_hints;

    /*
     * Update minRecoveryPoint to ensure that if recovery is aborted, we
     * recover back up to this point before allowing hot standby again.
     * This is important if the max_* settings are decreased, to ensure
     * you don't run queries against the WAL preceding the change. The
     * local copies cannot be updated as long as crash recovery is
     * happening and we expect all the WAL to be replayed.
     */
    if (InArchiveRecovery)
    {
      minRecoveryPoint = ControlFile->minRecoveryPoint;
      minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    }
    if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
    {
      ControlFile->minRecoveryPoint = lsn;
      ControlFile->minRecoveryPointTLI = ThisTimeLineID;
    }

    CommitTsParameterChange(xlrec.track_commit_timestamp,
                ControlFile->track_commit_timestamp);
    ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;

    UpdateControlFile();
    LWLockRelease(ControlFileLock);

    /* Check to see if any changes to max_connections give problems */
    CheckRequiredParameterValues();
  }
  else if (info == XLOG_FPW_CHANGE)
  {
    bool    fpw;

    memcpy(&fpw, XLogRecGetData(record), sizeof(bool));

    /*
     * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
     * do_pg_start_backup() and do_pg_stop_backup() can check whether
     * full_page_writes has been disabled during online backup.
     */
    if (!fpw)
    {
      SpinLockAcquire(&XLogCtl->info_lck);
      if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
        XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
      SpinLockRelease(&XLogCtl->info_lck);
    }

    /* Keep track of full_page_writes */
    lastFullPageWrites = fpw;
  }
}
10078
10079
#ifdef WAL_DEBUG
10080
10081
static void
10082
xlog_outrec(StringInfo buf, XLogReaderState *record)
10083
{
10084
  int     block_id;
10085
10086
  appendStringInfo(buf, "prev %X/%X; xid %u",
10087
           (uint32) (XLogRecGetPrev(record) >> 32),
10088
           (uint32) XLogRecGetPrev(record),
10089
           XLogRecGetXid(record));
10090
10091
  appendStringInfo(buf, "; len %u",
10092
           XLogRecGetDataLen(record));
10093
10094
  /* decode block references */
10095
  for (block_id = 0; block_id <= record->max_block_id; block_id++)
10096
  {
10097
    RelFileNode rnode;
10098
    ForkNumber  forknum;
10099
    BlockNumber blk;
10100
10101
    if (!XLogRecHasBlockRef(record, block_id))
10102
      continue;
10103
10104
    XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
10105
    if (forknum != MAIN_FORKNUM)
10106
      appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
10107
               block_id,
10108
               rnode.spcNode, rnode.dbNode, rnode.relNode,
10109
               forknum,
10110
               blk);
10111
    else
10112
      appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
10113
               block_id,
10114
               rnode.spcNode, rnode.dbNode, rnode.relNode,
10115
               blk);
10116
    if (XLogRecHasBlockImage(record, block_id))
10117
      appendStringInfoString(buf, " FPW");
10118
  }
10119
}
10120
#endif              /* WAL_DEBUG */
10121
10122
/*
10123
 * Returns a string describing an XLogRecord, consisting of its identity
10124
 * optionally followed by a colon, a space, and a further description.
10125
 */
10126
static void
10127
xlog_outdesc(StringInfo buf, XLogReaderState *record)
10128
0
{
10129
0
  RmgrId    rmid = XLogRecGetRmid(record);
10130
0
  uint8   info = XLogRecGetInfo(record);
10131
0
  const char *id;
10132
10133
0
  appendStringInfoString(buf, RmgrTable[rmid].rm_name);
10134
0
  appendStringInfoChar(buf, '/');
10135
10136
0
  id = RmgrTable[rmid].rm_identify(info);
10137
0
  if (id == NULL)
10138
0
    appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
10139
0
  else
10140
0
    appendStringInfo(buf, "%s: ", id);
10141
10142
0
  RmgrTable[rmid].rm_desc(buf, record);
10143
0
}
10144
10145
10146
/*
10147
 * Return the (possible) sync flag used for opening a file, depending on the
10148
 * value of the GUC wal_sync_method.
10149
 */
10150
static int
get_sync_bit(int method)
{
  int     o_direct_flag = 0;

  /* If fsync is disabled, never open in sync mode */
  if (!enableFsync)
    return 0;

  /*
   * Optimize writes by bypassing kernel cache with O_DIRECT when using
   * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
   * disabled, otherwise the archive command or walsender process will read
   * the WAL soon after writing it, which is guaranteed to cause a physical
   * read if we bypassed the kernel cache. We also skip the
   * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
   * reason.
   *
   * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
   * written by walreceiver is normally read by the startup process soon
   * after its written. Also, walreceiver performs unaligned writes, which
   * don't work with O_DIRECT, so it is required for correctness too.
   */
  if (!XLogIsNeeded() && !AmWalReceiverProcess())
    o_direct_flag = PG_O_DIRECT;

  switch (method)
  {
      /*
       * enum values for all sync options are defined even if they are
       * not supported on the current platform.  But if not, they are
       * not included in the enum option array, and therefore will never
       * be seen here.
       */
    case SYNC_METHOD_FSYNC:
    case SYNC_METHOD_FSYNC_WRITETHROUGH:
    case SYNC_METHOD_FDATASYNC:
      /* These sync after the write, so the open itself needs no flag. */
      return 0;
#ifdef OPEN_SYNC_FLAG
    case SYNC_METHOD_OPEN:
      return OPEN_SYNC_FLAG | o_direct_flag;
#endif
#ifdef OPEN_DATASYNC_FLAG
    case SYNC_METHOD_OPEN_DSYNC:
      return OPEN_DATASYNC_FLAG | o_direct_flag;
#endif
    default:
      /* can't happen (unless we are out of sync with option array) */
      elog(ERROR, "unrecognized wal_sync_method: %d", method);
      return 0;     /* silence warning */
  }
}
10202
10203
/*
10204
 * GUC support
10205
 */
10206
void
10207
assign_xlog_sync_method(int new_sync_method, void *extra)
10208
8.04k
{
10209
8.04k
  if (sync_method != new_sync_method)
10210
0
  {
10211
    /*
10212
     * To ensure that no blocks escape unsynced, force an fsync on the
10213
     * currently open log segment (if any).  Also, if the open flag is
10214
     * changing, close the log file so it will be reopened (with new flag
10215
     * bit) at next use.
10216
     */
10217
0
    if (openLogFile >= 0)
10218
0
    {
10219
0
      pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
10220
0
      if (pg_fsync(openLogFile) != 0)
10221
0
        ereport(PANIC,
10222
0
            (errcode_for_file_access(),
10223
0
             errmsg("could not fsync log segment %s: %m",
10224
0
                XLogFileNameP(ThisTimeLineID, openLogSegNo))));
10225
0
      pgstat_report_wait_end();
10226
0
      if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10227
0
        XLogFileClose();
10228
0
    }
10229
0
  }
10230
8.04k
}
10231
10232
10233
/*
10234
 * Issue appropriate kind of fsync (if any) for an XLOG output file.
10235
 *
10236
 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10237
 * 'log' and 'seg' are for error reporting purposes.
10238
 */
10239
void
issue_xlog_fsync(int fd, XLogSegNo segno)
{
  /* Dispatch on the configured wal_sync_method GUC. */
  switch (sync_method)
  {
    case SYNC_METHOD_FSYNC:
      if (pg_fsync_no_writethrough(fd) != 0)
        ereport(PANIC,
            (errcode_for_file_access(),
             errmsg("could not fsync log file %s: %m",
                XLogFileNameP(ThisTimeLineID, segno))));
      break;
#ifdef HAVE_FSYNC_WRITETHROUGH
    case SYNC_METHOD_FSYNC_WRITETHROUGH:
      if (pg_fsync_writethrough(fd) != 0)
        ereport(PANIC,
            (errcode_for_file_access(),
             errmsg("could not fsync write-through log file %s: %m",
                XLogFileNameP(ThisTimeLineID, segno))));
      break;
#endif
#ifdef HAVE_FDATASYNC
    case SYNC_METHOD_FDATASYNC:
      if (pg_fdatasync(fd) != 0)
        ereport(PANIC,
            (errcode_for_file_access(),
             errmsg("could not fdatasync log file %s: %m",
                XLogFileNameP(ThisTimeLineID, segno))));
      break;
#endif
    case SYNC_METHOD_OPEN:
    case SYNC_METHOD_OPEN_DSYNC:
      /* write synced it already */
      break;
    default:
      elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
      break;
  }
}
10278
10279
/*
10280
 * Return the filename of given log segment, as a palloc'd string.
10281
 */
10282
char *
10283
XLogFileNameP(TimeLineID tli, XLogSegNo segno)
10284
0
{
10285
0
  char     *result = palloc(MAXFNAMELEN);
10286
10287
0
  XLogFileName(result, tli, segno, wal_segment_size);
10288
0
  return result;
10289
0
}
10290
10291
/*
 * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
 * function. It creates the necessary starting checkpoint and constructs the
 * backup label file.
 *
 * There are two kind of backups: exclusive and non-exclusive. An exclusive
 * backup is started with pg_start_backup(), and there can be only one active
 * at a time. The backup and tablespace map files of an exclusive backup are
 * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
 * removed by pg_stop_backup().
 *
 * A non-exclusive backup is used for the streaming base backups (see
 * src/backend/replication/basebackup.c). The difference to exclusive backups
 * is that the backup label and tablespace map files are not written to disk.
 * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
 * and the caller is responsible for including them in the backup archive as
 * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
 * active at the same time, and they don't conflict with an exclusive backup
 * either.
 *
 * tblspcmapfile is required mainly for tar format in windows as native windows
 * utilities are not able to create symlinks while extracting files from tar.
 * However for consistency, the same is used for all platforms.
 *
 * needtblspcmapfile is true for the cases (exclusive backup and for
 * non-exclusive backup only when tar format is used for taking backup)
 * when backup needs to generate tablespace_map file, it is used to
 * embed escape character before newline character in tablespace path.
 *
 * Returns the minimum WAL location that must be present to restore from this
 * backup, and the corresponding timeline ID in *starttli_p.
 *
 * Every successfully started non-exclusive backup must be stopped by calling
 * do_pg_stop_backup() or do_pg_abort_backup().
 *
 * It is the responsibility of the caller of this function to verify the
 * permissions of the calling user!
 */
XLogRecPtr
do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
				   StringInfo labelfile, List **tablespaces,
				   StringInfo tblspcmapfile, bool infotbssize,
				   bool needtblspcmapfile)
{
	bool		exclusive = (labelfile == NULL);
	bool		backup_started_in_recovery = false;
	XLogRecPtr	checkpointloc;	/* location of the starting checkpoint record */
	XLogRecPtr	startpoint;		/* REDO pointer: minimum WAL needed to restore */
	TimeLineID	starttli;
	pg_time_t	stamp_time;
	char		strfbuf[128];
	char		xlogfilename[MAXFNAMELEN];
	XLogSegNo	_logSegNo;
	struct stat stat_buf;
	FILE	   *fp;

	backup_started_in_recovery = RecoveryInProgress();

	/*
	 * Currently only non-exclusive backup can be taken during recovery.
	 */
	if (backup_started_in_recovery && exclusive)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

	/*
	 * During recovery, we don't need to check WAL level. Because, if WAL
	 * level is not sufficient, it's impossible to get here during recovery.
	 */
	if (!backup_started_in_recovery && !XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));

	if (strlen(backupidstr) > MAXPGPATH)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("backup label too long (max %d bytes)",
						MAXPGPATH)));

	/*
	 * Mark backup active in shared memory.  We must do full-page WAL writes
	 * during an on-line backup even if not doing so at other times, because
	 * it's quite possible for the backup dump to obtain a "torn" (partially
	 * written) copy of a database page if it reads the page concurrently with
	 * our write to the same page.  This can be fixed as long as the first
	 * write to the page in the WAL sequence is a full-page write. Hence, we
	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
	 * are no dirty pages in shared memory that might get dumped while the
	 * backup is in progress without having a corresponding WAL record.  (Once
	 * the backup is complete, we need not force full-page writes anymore,
	 * since we expect that any pages not modified during the backup interval
	 * must have been correctly captured by the backup.)
	 *
	 * Note that forcePageWrites has no effect during an online backup from
	 * the standby.
	 *
	 * We must hold all the insertion locks to change the value of
	 * forcePageWrites, to ensure adequate interlocking against
	 * XLogInsertRecord().
	 */
	WALInsertLockAcquireExclusive();
	if (exclusive)
	{
		/*
		 * At first, mark that we're now starting an exclusive backup, to
		 * ensure that there are no other sessions currently running
		 * pg_start_backup() or pg_stop_backup().
		 */
		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
		{
			WALInsertLockRelease();
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is already in progress"),
					 errhint("Run pg_stop_backup() and try again.")));
		}
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
	}
	else
		XLogCtl->Insert.nonExclusiveBackups++;
	XLogCtl->Insert.forcePageWrites = true;
	WALInsertLockRelease();

	/* Ensure we release forcePageWrites if fail below */
	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
	{
		bool		gotUniqueStartpoint = false;
		DIR		   *tblspcdir;
		struct dirent *de;
		tablespaceinfo *ti;
		int			datadirpathlen;

		/*
		 * Force an XLOG file switch before the checkpoint, to ensure that the
		 * WAL segment the checkpoint is written to doesn't contain pages with
		 * old timeline IDs.  That would otherwise happen if you called
		 * pg_start_backup() right after restoring from a PITR archive: the
		 * first WAL segment containing the startup checkpoint has pages in
		 * the beginning with the old timeline ID.  That can cause trouble at
		 * recovery: we won't have a history file covering the old timeline if
		 * pg_wal directory was not included in the base backup and the WAL
		 * archive was cleared too before starting the backup.
		 *
		 * This also ensures that we have emitted a WAL page header that has
		 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
		 * Therefore, if a WAL archiver (such as pglesslog) is trying to
		 * compress out removable backup blocks, it won't remove any that
		 * occur after this point.
		 *
		 * During recovery, we skip forcing XLOG file switch, which means that
		 * the backup taken during recovery is not available for the special
		 * recovery case described above.
		 */
		if (!backup_started_in_recovery)
			RequestXLogSwitch(false);

		do
		{
			bool		checkpointfpw;

			/*
			 * Force a CHECKPOINT.  Aside from being necessary to prevent torn
			 * page problems, this guarantees that two successive backup runs
			 * will have different checkpoint positions and hence different
			 * history file names, even if nothing happened in between.
			 *
			 * During recovery, establish a restartpoint if possible. We use
			 * the last restartpoint as the backup starting checkpoint. This
			 * means that two successive backup runs can have same checkpoint
			 * positions.
			 *
			 * Since the fact that we are executing do_pg_start_backup()
			 * during recovery means that checkpointer is running, we can use
			 * RequestCheckpoint() to establish a restartpoint.
			 *
			 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
			 * passing fast = true).  Otherwise this can take awhile.
			 */
			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
							  (fast ? CHECKPOINT_IMMEDIATE : 0));

			/*
			 * Now we need to fetch the checkpoint record location, and also
			 * its REDO pointer.  The oldest point in WAL that would be needed
			 * to restore starting from the checkpoint is precisely the REDO
			 * pointer.
			 */
			LWLockAcquire(ControlFileLock, LW_SHARED);
			checkpointloc = ControlFile->checkPoint;
			startpoint = ControlFile->checkPointCopy.redo;
			starttli = ControlFile->checkPointCopy.ThisTimeLineID;
			checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
			LWLockRelease(ControlFileLock);

			if (backup_started_in_recovery)
			{
				XLogRecPtr	recptr;

				/*
				 * Check to see if all WAL replayed during online backup
				 * (i.e., since last restartpoint used as backup starting
				 * checkpoint) contain full-page writes.
				 */
				SpinLockAcquire(&XLogCtl->info_lck);
				recptr = XLogCtl->lastFpwDisableRecPtr;
				SpinLockRelease(&XLogCtl->info_lck);

				if (!checkpointfpw || startpoint <= recptr)
					ereport(ERROR,
							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							 errmsg("WAL generated with full_page_writes=off was replayed "
									"since last restartpoint"),
							 errhint("This means that the backup being taken on the standby "
									 "is corrupt and should not be used. "
									 "Enable full_page_writes and run CHECKPOINT on the master, "
									 "and then try an online backup again.")));

				/*
				 * During recovery, since we don't use the end-of-backup WAL
				 * record and don't write the backup history file, the
				 * starting WAL location doesn't need to be unique. This means
				 * that two base backups started at the same time might use
				 * the same checkpoint as starting locations.
				 */
				gotUniqueStartpoint = true;
			}

			/*
			 * If two base backups are started at the same time (in WAL sender
			 * processes), we need to make sure that they use different
			 * checkpoints as starting locations, because we use the starting
			 * WAL location as a unique identifier for the base backup in the
			 * end-of-backup WAL record and when we write the backup history
			 * file. Perhaps it would be better generate a separate unique ID
			 * for each backup instead of forcing another checkpoint, but
			 * taking a checkpoint right after another is not that expensive
			 * either because only few buffers have been dirtied yet.
			 */
			WALInsertLockAcquireExclusive();
			if (XLogCtl->Insert.lastBackupStart < startpoint)
			{
				XLogCtl->Insert.lastBackupStart = startpoint;
				gotUniqueStartpoint = true;
			}
			WALInsertLockRelease();
		} while (!gotUniqueStartpoint);

		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
		XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);

		/*
		 * Construct tablespace_map file
		 */
		if (exclusive)
			tblspcmapfile = makeStringInfo();

		datadirpathlen = strlen(DataDir);

		/* Collect information about all tablespaces */
		tblspcdir = AllocateDir("pg_tblspc");
		while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
		{
			char		fullpath[MAXPGPATH + 10];
			char		linkpath[MAXPGPATH];
			char	   *relpath = NULL;
			int			rllen;
			StringInfoData buflinkpath;
			char	   *s = linkpath;

			/* Skip special stuff */
			if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
				continue;

			snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);

#if defined(HAVE_READLINK) || defined(WIN32)
			rllen = readlink(fullpath, linkpath, sizeof(linkpath));
			if (rllen < 0)
			{
				ereport(WARNING,
						(errmsg("could not read symbolic link \"%s\": %m",
								fullpath)));
				continue;
			}
			else if (rllen >= sizeof(linkpath))
			{
				ereport(WARNING,
						(errmsg("symbolic link \"%s\" target is too long",
								fullpath)));
				continue;
			}
			/* readlink() does not NUL-terminate; do it ourselves */
			linkpath[rllen] = '\0';

			/*
			 * Add the escape character '\\' before newline in a string to
			 * ensure that we can distinguish between the newline in the
			 * tablespace path and end of line while reading tablespace_map
			 * file during archive recovery.
			 */
			initStringInfo(&buflinkpath);

			while (*s)
			{
				if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
					appendStringInfoChar(&buflinkpath, '\\');
				appendStringInfoChar(&buflinkpath, *s++);
			}

			/*
			 * Relpath holds the relative path of the tablespace directory
			 * when it's located within PGDATA, or NULL if it's located
			 * elsewhere.
			 */
			if (rllen > datadirpathlen &&
				strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
				IS_DIR_SEP(linkpath[datadirpathlen]))
				relpath = linkpath + datadirpathlen + 1;

			ti = palloc(sizeof(tablespaceinfo));
			ti->oid = pstrdup(de->d_name);
			ti->path = pstrdup(buflinkpath.data);
			ti->rpath = relpath ? pstrdup(relpath) : NULL;
			ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;

			if (tablespaces)
				*tablespaces = lappend(*tablespaces, ti);

			appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);

			pfree(buflinkpath.data);
#else

			/*
			 * If the platform does not have symbolic links, it should not be
			 * possible to have tablespaces - clearly somebody else created
			 * them. Warn about it and ignore.
			 */
			ereport(WARNING,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("tablespaces are not supported on this platform")));
#endif
		}
		FreeDir(tblspcdir);

		/*
		 * Construct backup label file
		 */
		if (exclusive)
			labelfile = makeStringInfo();

		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));
		appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
						 (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
		appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
						 (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
		appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
						 exclusive ? "pg_start_backup" : "streamed");
		appendStringInfo(labelfile, "BACKUP FROM: %s\n",
						 backup_started_in_recovery ? "standby" : "master");
		appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
		appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
		appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);

		/*
		 * Okay, write the file, or return its contents to caller.
		 */
		if (exclusive)
		{
			/*
			 * Check for existing backup label --- implies a backup is already
			 * running.  (XXX given that we checked exclusiveBackupState
			 * above, maybe it would be OK to just unlink any such label
			 * file?)
			 */
			if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
			{
				if (errno != ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat file \"%s\": %m",
									BACKUP_LABEL_FILE)));
			}
			else
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("a backup is already in progress"),
						 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
								 BACKUP_LABEL_FILE)));

			fp = AllocateFile(BACKUP_LABEL_FILE, "w");

			if (!fp)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not create file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
				fflush(fp) != 0 ||
				pg_fsync(fileno(fp)) != 0 ||
				ferror(fp) ||
				FreeFile(fp))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not write file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			/* Allocated locally for exclusive backups, so free separately */
			pfree(labelfile->data);
			pfree(labelfile);

			/* Write backup tablespace_map file. */
			if (tblspcmapfile->len > 0)
			{
				if (stat(TABLESPACE_MAP, &stat_buf) != 0)
				{
					if (errno != ENOENT)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not stat file \"%s\": %m",
										TABLESPACE_MAP)));
				}
				else
					ereport(ERROR,
							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							 errmsg("a backup is already in progress"),
							 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
									 TABLESPACE_MAP)));

				fp = AllocateFile(TABLESPACE_MAP, "w");

				if (!fp)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not create file \"%s\": %m",
									TABLESPACE_MAP)));
				if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
					fflush(fp) != 0 ||
					pg_fsync(fileno(fp)) != 0 ||
					ferror(fp) ||
					FreeFile(fp))
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not write file \"%s\": %m",
									TABLESPACE_MAP)));
			}

			/* Allocated locally for exclusive backups, so free separately */
			pfree(tblspcmapfile->data);
			pfree(tblspcmapfile);
		}
	}
	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));

	/*
	 * Mark that start phase has correctly finished for an exclusive backup.
	 * Session-level locks are updated as well to reflect that state.
	 *
	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
	 * counters and session-level lock. Otherwise they can be updated
	 * inconsistently, and which might cause do_pg_abort_backup() to fail.
	 */
	if (exclusive)
	{
		WALInsertLockAcquireExclusive();
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;

		/* Set session-level lock */
		sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
		WALInsertLockRelease();
	}
	else
		sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;

	/*
	 * We're done.  As a convenience, return the starting WAL location.
	 */
	if (starttli_p)
		*starttli_p = starttli;
	return startpoint;
}
10778
10779
/* Error cleanup callback for pg_start_backup */
10780
static void
10781
pg_start_backup_callback(int code, Datum arg)
10782
0
{
10783
0
  bool    exclusive = DatumGetBool(arg);
10784
10785
  /* Update backup counters and forcePageWrites on failure */
10786
0
  WALInsertLockAcquireExclusive();
10787
0
  if (exclusive)
10788
0
  {
10789
0
    Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
10790
0
    XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10791
0
  }
10792
0
  else
10793
0
  {
10794
0
    Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10795
0
    XLogCtl->Insert.nonExclusiveBackups--;
10796
0
  }
10797
10798
0
  if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10799
0
    XLogCtl->Insert.nonExclusiveBackups == 0)
10800
0
  {
10801
0
    XLogCtl->Insert.forcePageWrites = false;
10802
0
  }
10803
0
  WALInsertLockRelease();
10804
0
}
10805
10806
/*
10807
 * Error cleanup callback for pg_stop_backup
10808
 */
10809
static void
10810
pg_stop_backup_callback(int code, Datum arg)
10811
0
{
10812
0
  bool    exclusive = DatumGetBool(arg);
10813
10814
  /* Update backup status on failure */
10815
0
  WALInsertLockAcquireExclusive();
10816
0
  if (exclusive)
10817
0
  {
10818
0
    Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
10819
0
    XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10820
0
  }
10821
0
  WALInsertLockRelease();
10822
0
}
10823
10824
/*
 * Utility routine to fetch the session-level status of a backup running.
 *
 * Returns the backend-local sessionBackupState flag, which is set to
 * SESSION_BACKUP_EXCLUSIVE or SESSION_BACKUP_NON_EXCLUSIVE by
 * do_pg_start_backup() and reset to SESSION_BACKUP_NONE by
 * do_pg_stop_backup().
 */
SessionBackupState
get_backup_status(void)
{
	return sessionBackupState;
}
10832
10833
/*
10834
 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
10835
 * function.
10836
 *
10837
 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
10838
 * the non-exclusive backup specified by 'labelfile'.
10839
 *
10840
 * Returns the last WAL location that must be present to restore from this
10841
 * backup, and the corresponding timeline ID in *stoptli_p.
10842
 *
10843
 * It is the responsibility of the caller of this function to verify the
10844
 * permissions of the calling user!
10845
 */
10846
XLogRecPtr
10847
do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
10848
0
{
10849
0
  bool    exclusive = (labelfile == NULL);
10850
0
  bool    backup_started_in_recovery = false;
10851
0
  XLogRecPtr  startpoint;
10852
0
  XLogRecPtr  stoppoint;
10853
0
  TimeLineID  stoptli;
10854
0
  pg_time_t stamp_time;
10855
0
  char    strfbuf[128];
10856
0
  char    histfilepath[MAXPGPATH];
10857
0
  char    startxlogfilename[MAXFNAMELEN];
10858
0
  char    stopxlogfilename[MAXFNAMELEN];
10859
0
  char    lastxlogfilename[MAXFNAMELEN];
10860
0
  char    histfilename[MAXFNAMELEN];
10861
0
  char    backupfrom[20];
10862
0
  XLogSegNo _logSegNo;
10863
0
  FILE     *lfp;
10864
0
  FILE     *fp;
10865
0
  char    ch;
10866
0
  int     seconds_before_warning;
10867
0
  int     waits = 0;
10868
0
  bool    reported_waiting = false;
10869
0
  char     *remaining;
10870
0
  char     *ptr;
10871
0
  uint32    hi,
10872
0
        lo;
10873
10874
0
  backup_started_in_recovery = RecoveryInProgress();
10875
10876
  /*
10877
   * Currently only non-exclusive backup can be taken during recovery.
10878
   */
10879
0
  if (backup_started_in_recovery && exclusive)
10880
0
    ereport(ERROR,
10881
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10882
0
         errmsg("recovery is in progress"),
10883
0
         errhint("WAL control functions cannot be executed during recovery.")));
10884
10885
  /*
10886
   * During recovery, we don't need to check WAL level. Because, if WAL
10887
   * level is not sufficient, it's impossible to get here during recovery.
10888
   */
10889
0
  if (!backup_started_in_recovery && !XLogIsNeeded())
10890
0
    ereport(ERROR,
10891
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10892
0
         errmsg("WAL level not sufficient for making an online backup"),
10893
0
         errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10894
10895
0
  if (exclusive)
10896
0
  {
10897
    /*
10898
     * At first, mark that we're now stopping an exclusive backup, to
10899
     * ensure that there are no other sessions currently running
10900
     * pg_start_backup() or pg_stop_backup().
10901
     */
10902
0
    WALInsertLockAcquireExclusive();
10903
0
    if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
10904
0
    {
10905
0
      WALInsertLockRelease();
10906
0
      ereport(ERROR,
10907
0
          (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10908
0
           errmsg("exclusive backup not in progress")));
10909
0
    }
10910
0
    XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
10911
0
    WALInsertLockRelease();
10912
10913
    /*
10914
     * Remove backup_label. In case of failure, the state for an exclusive
10915
     * backup is switched back to in-progress.
10916
     */
10917
0
    PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10918
0
    {
10919
      /*
10920
       * Read the existing label file into memory.
10921
       */
10922
0
      struct stat statbuf;
10923
0
      int     r;
10924
10925
0
      if (stat(BACKUP_LABEL_FILE, &statbuf))
10926
0
      {
10927
        /* should not happen per the upper checks */
10928
0
        if (errno != ENOENT)
10929
0
          ereport(ERROR,
10930
0
              (errcode_for_file_access(),
10931
0
               errmsg("could not stat file \"%s\": %m",
10932
0
                  BACKUP_LABEL_FILE)));
10933
0
        ereport(ERROR,
10934
0
            (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10935
0
             errmsg("a backup is not in progress")));
10936
0
      }
10937
10938
0
      lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10939
0
      if (!lfp)
10940
0
      {
10941
0
        ereport(ERROR,
10942
0
            (errcode_for_file_access(),
10943
0
             errmsg("could not read file \"%s\": %m",
10944
0
                BACKUP_LABEL_FILE)));
10945
0
      }
10946
0
      labelfile = palloc(statbuf.st_size + 1);
10947
0
      r = fread(labelfile, statbuf.st_size, 1, lfp);
10948
0
      labelfile[statbuf.st_size] = '\0';
10949
10950
      /*
10951
       * Close and remove the backup label file
10952
       */
10953
0
      if (r != 1 || ferror(lfp) || FreeFile(lfp))
10954
0
        ereport(ERROR,
10955
0
            (errcode_for_file_access(),
10956
0
             errmsg("could not read file \"%s\": %m",
10957
0
                BACKUP_LABEL_FILE)));
10958
0
      durable_unlink(BACKUP_LABEL_FILE, ERROR);
10959
10960
      /*
10961
       * Remove tablespace_map file if present, it is created only if
10962
       * there are tablespaces.
10963
       */
10964
0
      durable_unlink(TABLESPACE_MAP, DEBUG1);
10965
0
    }
10966
0
    PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10967
0
  }
10968
10969
  /*
10970
   * OK to update backup counters, forcePageWrites and session-level lock.
10971
   *
10972
   * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
10973
   * Otherwise they can be updated inconsistently, and which might cause
10974
   * do_pg_abort_backup() to fail.
10975
   */
10976
0
  WALInsertLockAcquireExclusive();
10977
0
  if (exclusive)
10978
0
  {
10979
0
    XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10980
0
  }
10981
0
  else
10982
0
  {
10983
    /*
10984
     * The user-visible pg_start/stop_backup() functions that operate on
10985
     * exclusive backups can be called at any time, but for non-exclusive
10986
     * backups, it is expected that each do_pg_start_backup() call is
10987
     * matched by exactly one do_pg_stop_backup() call.
10988
     */
10989
0
    Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10990
0
    XLogCtl->Insert.nonExclusiveBackups--;
10991
0
  }
10992
10993
0
  if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10994
0
    XLogCtl->Insert.nonExclusiveBackups == 0)
10995
0
  {
10996
0
    XLogCtl->Insert.forcePageWrites = false;
10997
0
  }
10998
10999
  /*
11000
   * Clean up session-level lock.
11001
   *
11002
   * You might think that WALInsertLockRelease() can be called before
11003
   * cleaning up session-level lock because session-level lock doesn't need
11004
   * to be protected with WAL insertion lock. But since
11005
   * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
11006
   * cleaned up before it.
11007
   */
11008
0
  sessionBackupState = SESSION_BACKUP_NONE;
11009
11010
0
  WALInsertLockRelease();
11011
11012
  /*
11013
   * Read and parse the START WAL LOCATION line (this code is pretty crude,
11014
   * but we are not expecting any variability in the file format).
11015
   */
11016
0
  if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
11017
0
         &hi, &lo, startxlogfilename,
11018
0
         &ch) != 4 || ch != '\n')
11019
0
    ereport(ERROR,
11020
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11021
0
         errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11022
0
  startpoint = ((uint64) hi) << 32 | lo;
11023
0
  remaining = strchr(labelfile, '\n') + 1;  /* %n is not portable enough */
11024
11025
  /*
11026
   * Parse the BACKUP FROM line. If we are taking an online backup from the
11027
   * standby, we confirm that the standby has not been promoted during the
11028
   * backup.
11029
   */
11030
0
  ptr = strstr(remaining, "BACKUP FROM:");
11031
0
  if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
11032
0
    ereport(ERROR,
11033
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11034
0
         errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11035
0
  if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
11036
0
    ereport(ERROR,
11037
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11038
0
         errmsg("the standby was promoted during online backup"),
11039
0
         errhint("This means that the backup being taken is corrupt "
11040
0
             "and should not be used. "
11041
0
             "Try taking another online backup.")));
11042
11043
  /*
11044
   * During recovery, we don't write an end-of-backup record. We assume that
11045
   * pg_control was backed up last and its minimum recovery point can be
11046
   * available as the backup end location. Since we don't have an
11047
   * end-of-backup record, we use the pg_control value to check whether
11048
   * we've reached the end of backup when starting recovery from this
11049
   * backup. We have no way of checking if pg_control wasn't backed up last
11050
   * however.
11051
   *
11052
   * We don't force a switch to new WAL file but it is still possible to
11053
   * wait for all the required files to be archived if waitforarchive is
11054
   * true. This is okay if we use the backup to start a standby and fetch
11055
   * the missing WAL using streaming replication. But in the case of an
11056
   * archive recovery, a user should set waitforarchive to true and wait for
11057
   * them to be archived to ensure that all the required files are
11058
   * available.
11059
   *
11060
   * We return the current minimum recovery point as the backup end
11061
   * location. Note that it can be greater than the exact backup end
11062
   * location if the minimum recovery point is updated after the backup of
11063
   * pg_control. This is harmless for current uses.
11064
   *
11065
   * XXX currently a backup history file is for informational and debug
11066
   * purposes only. It's not essential for an online backup. Furthermore,
11067
   * even if it's created, it will not be archived during recovery because
11068
   * an archiver is not invoked. So it doesn't seem worthwhile to write a
11069
   * backup history file during recovery.
11070
   */
11071
0
  if (backup_started_in_recovery)
11072
0
  {
11073
0
    XLogRecPtr  recptr;
11074
11075
    /*
11076
     * Check to see if all WAL replayed during online backup contain
11077
     * full-page writes.
11078
     */
11079
0
    SpinLockAcquire(&XLogCtl->info_lck);
11080
0
    recptr = XLogCtl->lastFpwDisableRecPtr;
11081
0
    SpinLockRelease(&XLogCtl->info_lck);
11082
11083
0
    if (startpoint <= recptr)
11084
0
      ereport(ERROR,
11085
0
          (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11086
0
           errmsg("WAL generated with full_page_writes=off was replayed "
11087
0
              "during online backup"),
11088
0
           errhint("This means that the backup being taken on the standby "
11089
0
               "is corrupt and should not be used. "
11090
0
               "Enable full_page_writes and run CHECKPOINT on the master, "
11091
0
               "and then try an online backup again.")));
11092
11093
11094
0
    LWLockAcquire(ControlFileLock, LW_SHARED);
11095
0
    stoppoint = ControlFile->minRecoveryPoint;
11096
0
    stoptli = ControlFile->minRecoveryPointTLI;
11097
0
    LWLockRelease(ControlFileLock);
11098
0
  }
11099
0
  else
11100
0
  {
11101
    /*
11102
     * Write the backup-end xlog record
11103
     */
11104
0
    XLogBeginInsert();
11105
0
    XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
11106
0
    stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
11107
0
    stoptli = ThisTimeLineID;
11108
11109
    /*
11110
     * Force a switch to a new xlog segment file, so that the backup is
11111
     * valid as soon as archiver moves out the current segment file.
11112
     */
11113
0
    RequestXLogSwitch(false);
11114
11115
0
    XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11116
0
    XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
11117
11118
    /* Use the log timezone here, not the session timezone */
11119
0
    stamp_time = (pg_time_t) time(NULL);
11120
0
    pg_strftime(strfbuf, sizeof(strfbuf),
11121
0
          "%Y-%m-%d %H:%M:%S %Z",
11122
0
          pg_localtime(&stamp_time, log_timezone));
11123
11124
    /*
11125
     * Write the backup history file
11126
     */
11127
0
    XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11128
0
    BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
11129
0
                startpoint, wal_segment_size);
11130
0
    fp = AllocateFile(histfilepath, "w");
11131
0
    if (!fp)
11132
0
      ereport(ERROR,
11133
0
          (errcode_for_file_access(),
11134
0
           errmsg("could not create file \"%s\": %m",
11135
0
              histfilepath)));
11136
0
    fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
11137
0
        (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
11138
0
    fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
11139
0
        (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
11140
11141
    /*
11142
     * Transfer remaining lines including label and start timeline to
11143
     * history file.
11144
     */
11145
0
    fprintf(fp, "%s", remaining);
11146
0
    fprintf(fp, "STOP TIME: %s\n", strfbuf);
11147
0
    fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
11148
0
    if (fflush(fp) || ferror(fp) || FreeFile(fp))
11149
0
      ereport(ERROR,
11150
0
          (errcode_for_file_access(),
11151
0
           errmsg("could not write file \"%s\": %m",
11152
0
              histfilepath)));
11153
11154
    /*
11155
     * Clean out any no-longer-needed history files.  As a side effect,
11156
     * this will post a .ready file for the newly created history file,
11157
     * notifying the archiver that history file may be archived
11158
     * immediately.
11159
     */
11160
0
    CleanupBackupHistory();
11161
0
  }
11162
11163
  /*
11164
   * If archiving is enabled, wait for all the required WAL files to be
11165
   * archived before returning. If archiving isn't enabled, the required WAL
11166
   * needs to be transported via streaming replication (hopefully with
11167
   * wal_keep_segments set high enough), or some more exotic mechanism like
11168
   * polling and copying files from pg_wal with script. We have no knowledge
11169
   * of those mechanisms, so it's up to the user to ensure that he gets all
11170
   * the required WAL.
11171
   *
11172
   * We wait until both the last WAL file filled during backup and the
11173
   * history file have been archived, and assume that the alphabetic sorting
11174
   * property of the WAL files ensures any earlier WAL files are safely
11175
   * archived as well.
11176
   *
11177
   * We wait forever, since archive_command is supposed to work and we
11178
   * assume the admin wanted his backup to work completely. If you don't
11179
   * wish to wait, then either waitforarchive should be passed in as false,
11180
   * or you can set statement_timeout.  Also, some notices are issued to
11181
   * clue in anyone who might be doing this interactively.
11182
   */
11183
11184
0
  if (waitforarchive &&
11185
0
    ((!backup_started_in_recovery && XLogArchivingActive()) ||
11186
0
     (backup_started_in_recovery && XLogArchivingAlways())))
11187
0
  {
11188
0
    XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11189
0
    XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
11190
11191
0
    XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11192
0
    BackupHistoryFileName(histfilename, stoptli, _logSegNo,
11193
0
                startpoint, wal_segment_size);
11194
11195
0
    seconds_before_warning = 60;
11196
0
    waits = 0;
11197
11198
0
    while (XLogArchiveIsBusy(lastxlogfilename) ||
11199
0
         XLogArchiveIsBusy(histfilename))
11200
0
    {
11201
0
      CHECK_FOR_INTERRUPTS();
11202
11203
0
      if (!reported_waiting && waits > 5)
11204
0
      {
11205
0
        ereport(NOTICE,
11206
0
            (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
11207
0
        reported_waiting = true;
11208
0
      }
11209
11210
0
      pg_usleep(1000000L);
11211
11212
0
      if (++waits >= seconds_before_warning)
11213
0
      {
11214
0
        seconds_before_warning *= 2;  /* This wraps in >10 years... */
11215
0
        ereport(WARNING,
11216
0
            (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
11217
0
                waits),
11218
0
             errhint("Check that your archive_command is executing properly.  "
11219
0
                 "pg_stop_backup can be canceled safely, "
11220
0
                 "but the database backup will not be usable without all the WAL segments.")));
11221
0
      }
11222
0
    }
11223
11224
0
    ereport(NOTICE,
11225
0
        (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
11226
0
  }
11227
0
  else if (waitforarchive)
11228
0
    ereport(NOTICE,
11229
0
        (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
11230
11231
  /*
11232
   * We're done.  As a convenience, return the ending WAL location.
11233
   */
11234
0
  if (stoptli_p)
11235
0
    *stoptli_p = stoptli;
11236
0
  return stoppoint;
11237
0
}
11238
11239
11240
/*
11241
 * do_pg_abort_backup: abort a running backup
11242
 *
11243
 * This does just the most basic steps of do_pg_stop_backup(), by taking the
11244
 * system out of backup mode, thus making it a lot more safe to call from
11245
 * an error handler.
11246
 *
11247
 * NB: This is only for aborting a non-exclusive backup that doesn't write
11248
 * backup_label. A backup started with pg_start_backup() needs to be finished
11249
 * with pg_stop_backup().
11250
 */
11251
void
11252
do_pg_abort_backup(void)
11253
0
{
11254
  /*
11255
   * Quick exit if session is not keeping around a non-exclusive backup
11256
   * already started.
11257
   */
11258
0
  if (sessionBackupState == SESSION_BACKUP_NONE)
11259
0
    return;
11260
11261
0
  WALInsertLockAcquireExclusive();
11262
0
  Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11263
0
  Assert(sessionBackupState == SESSION_BACKUP_NON_EXCLUSIVE);
11264
0
  XLogCtl->Insert.nonExclusiveBackups--;
11265
11266
0
  if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11267
0
    XLogCtl->Insert.nonExclusiveBackups == 0)
11268
0
  {
11269
0
    XLogCtl->Insert.forcePageWrites = false;
11270
0
  }
11271
0
  WALInsertLockRelease();
11272
0
}
11273
11274
/*
11275
 * Get latest redo apply position.
11276
 *
11277
 * Exported to allow WALReceiver to read the pointer directly.
11278
 */
11279
XLogRecPtr
11280
GetXLogReplayRecPtr(TimeLineID *replayTLI)
11281
0
{
11282
0
  XLogRecPtr  recptr;
11283
0
  TimeLineID  tli;
11284
11285
0
  SpinLockAcquire(&XLogCtl->info_lck);
11286
0
  recptr = XLogCtl->lastReplayedEndRecPtr;
11287
0
  tli = XLogCtl->lastReplayedTLI;
11288
0
  SpinLockRelease(&XLogCtl->info_lck);
11289
11290
0
  if (replayTLI)
11291
0
    *replayTLI = tli;
11292
0
  return recptr;
11293
0
}
11294
11295
/*
11296
 * Get latest WAL insert pointer
11297
 */
11298
XLogRecPtr
11299
GetXLogInsertRecPtr(void)
11300
0
{
11301
0
  XLogCtlInsert *Insert = &XLogCtl->Insert;
11302
0
  uint64    current_bytepos;
11303
11304
0
  SpinLockAcquire(&Insert->insertpos_lck);
11305
0
  current_bytepos = Insert->CurrBytePos;
11306
0
  SpinLockRelease(&Insert->insertpos_lck);
11307
11308
0
  return XLogBytePosToRecPtr(current_bytepos);
11309
0
}
11310
11311
/*
11312
 * Get latest WAL write pointer
11313
 */
11314
XLogRecPtr
GetXLogWriteRecPtr(void)
{
	/*
	 * Refresh this backend's cached copy of the shared write/flush result
	 * (the file-global LogwrtResult) under the info spinlock, then report
	 * the write pointer from that cache.  Note the assignment deliberately
	 * updates the global cache as a side effect, not just a local.
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	return LogwrtResult.Write;
}
11323
11324
/*
11325
 * Returns the redo pointer of the last checkpoint or restartpoint. This is
11326
 * the oldest point in WAL that we still need, if we have to restart recovery.
11327
 */
11328
void
GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
{
	/*
	 * Read the last checkpoint/restartpoint's redo pointer and timeline
	 * from the control file copy in shared memory.  A shared lock is
	 * sufficient since we only read; both outputs are filled together so
	 * the caller gets a consistent pair.
	 */
	LWLockAcquire(ControlFileLock, LW_SHARED);
	*oldrecptr = ControlFile->checkPointCopy.redo;
	*oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
	LWLockRelease(ControlFileLock);
}
11336
11337
/*
11338
 * read_backup_label: check to see if a backup_label file is present
11339
 *
11340
 * If we see a backup_label during recovery, we assume that we are recovering
11341
 * from a backup dump file, and we therefore roll forward from the checkpoint
11342
 * identified by the label file, NOT what pg_control says.  This avoids the
11343
 * problem that pg_control might have been archived one or more checkpoints
11344
 * later than the start of the dump, and so if we rely on it as the start
11345
 * point, we will fail to restore a consistent database state.
11346
 *
11347
 * Returns true if a backup_label was found (and fills the checkpoint
11348
 * location and its REDO location into *checkPointLoc and RedoStartLSN,
11349
 * respectively); returns false if not. If this backup_label came from a
11350
 * streamed backup, *backupEndRequired is set to true. If this backup_label
11351
 * was created during recovery, *backupFromStandby is set to true.
11352
 */
11353
static bool
read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
				  bool *backupFromStandby)
{
	char		startxlogfilename[MAXFNAMELEN];
	TimeLineID	tli_from_walseg,	/* TLI parsed from the WAL segment name */
				tli_from_file;		/* TLI from the START TIMELINE line */
	FILE	   *lfp;
	char		ch;
	char		backuptype[20];
	char		backupfrom[20];
	char		backuplabel[MAXPGPATH];
	char		backuptime[128];
	uint32		hi,
				lo;

	/* Defaults for the optional output flags. */
	*backupEndRequired = false;
	*backupFromStandby = false;

	/*
	 * See if label file is present
	 */
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
	if (!lfp)
	{
		/* ENOENT just means no backup_label; anything else is fatal. */
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
	 * is pretty crude, but we are not expecting any variability in the file
	 * format).
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
			   &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	/* Reassemble the 64-bit LSN from its two 32-bit halves. */
	RedoStartLSN = ((uint64) hi) << 32 | lo;
	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
			   &hi, &lo, &ch) != 3 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	*checkPointLoc = ((uint64) hi) << 32 | lo;

	/*
	 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
	 * from an older backup anyway, but since the information on it is not
	 * strictly required, don't error out if it's missing for some reason.
	 */
	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
	{
		/* "streamed" backups require an explicit end-of-backup record. */
		if (strcmp(backuptype, "streamed") == 0)
			*backupEndRequired = true;
	}

	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
	{
		if (strcmp(backupfrom, "standby") == 0)
			*backupFromStandby = true;
	}

	/*
	 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
	 * but checking for their presence is useful for debugging and the next
	 * sanity checks. Cope also with the fact that the result buffers have a
	 * pre-allocated size, hence if the backup_label file has been generated
	 * with strings longer than the maximum assumed here an incorrect parsing
	 * happens. That's fine as only minor consistency checks are done
	 * afterwards.
	 */
	if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
		ereport(DEBUG1,
				(errmsg("backup time %s in file \"%s\"",
						backuptime, BACKUP_LABEL_FILE)));

	if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
		ereport(DEBUG1,
				(errmsg("backup label %s in file \"%s\"",
						backuplabel, BACKUP_LABEL_FILE)));

	/*
	 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
	 * it as a sanity check if present.
	 */
	if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
	{
		/* The two timeline sources must agree, else the file is corrupt. */
		if (tli_from_walseg != tli_from_file)
			ereport(FATAL,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
					 errdetail("Timeline ID parsed is %u, but expected %u",
							   tli_from_file, tli_from_walseg)));

		ereport(DEBUG1,
				(errmsg("backup timeline %u in file \"%s\"",
						tli_from_file, BACKUP_LABEL_FILE)));
	}

	/* Report any read error or close failure before trusting the parse. */
	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						BACKUP_LABEL_FILE)));

	return true;
}
11466
11467
/*
11468
 * read_tablespace_map: check to see if a tablespace_map file is present
11469
 *
11470
 * If we see a tablespace_map file during recovery, we assume that we are
11471
 * recovering from a backup dump file, and we therefore need to create symlinks
11472
 * as per the information present in tablespace_map file.
11473
 *
11474
 * Returns true if a tablespace_map file was found (and fills the link
11475
 * information for all the tablespace links present in file); returns false
11476
 * if not.
11477
 */
11478
static bool
11479
read_tablespace_map(List **tablespaces)
11480
0
{
11481
0
  tablespaceinfo *ti;
11482
0
  FILE     *lfp;
11483
0
  char    tbsoid[MAXPGPATH];
11484
0
  char     *tbslinkpath;
11485
0
  char    str[MAXPGPATH];
11486
0
  int     ch,
11487
0
        prev_ch = -1,
11488
0
        i = 0,
11489
0
        n;
11490
11491
  /*
11492
   * See if tablespace_map file is present
11493
   */
11494
0
  lfp = AllocateFile(TABLESPACE_MAP, "r");
11495
0
  if (!lfp)
11496
0
  {
11497
0
    if (errno != ENOENT)
11498
0
      ereport(FATAL,
11499
0
          (errcode_for_file_access(),
11500
0
           errmsg("could not read file \"%s\": %m",
11501
0
              TABLESPACE_MAP)));
11502
0
    return false;     /* it's not there, all is fine */
11503
0
  }
11504
11505
  /*
11506
   * Read and parse the link name and path lines from tablespace_map file
11507
   * (this code is pretty crude, but we are not expecting any variability in
11508
   * the file format).  While taking backup we embed escape character '\\'
11509
   * before newline in tablespace path, so that during reading of
11510
   * tablespace_map file, we could distinguish newline in tablespace path
11511
   * and end of line.  Now while reading tablespace_map file, remove the
11512
   * escape character that has been added in tablespace path during backup.
11513
   */
11514
0
  while ((ch = fgetc(lfp)) != EOF)
11515
0
  {
11516
0
    if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
11517
0
    {
11518
0
      str[i] = '\0';
11519
0
      if (sscanf(str, "%s %n", tbsoid, &n) != 1)
11520
0
        ereport(FATAL,
11521
0
            (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11522
0
             errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
11523
0
      tbslinkpath = str + n;
11524
0
      i = 0;
11525
11526
0
      ti = palloc(sizeof(tablespaceinfo));
11527
0
      ti->oid = pstrdup(tbsoid);
11528
0
      ti->path = pstrdup(tbslinkpath);
11529
11530
0
      *tablespaces = lappend(*tablespaces, ti);
11531
0
      continue;
11532
0
    }
11533
0
    else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
11534
0
      str[i - 1] = ch;
11535
0
    else
11536
0
      str[i++] = ch;
11537
0
    prev_ch = ch;
11538
0
  }
11539
11540
0
  if (ferror(lfp) || FreeFile(lfp))
11541
0
    ereport(FATAL,
11542
0
        (errcode_for_file_access(),
11543
0
         errmsg("could not read file \"%s\": %m",
11544
0
            TABLESPACE_MAP)));
11545
11546
0
  return true;
11547
0
}
11548
11549
/*
11550
 * Error context callback for errors occurring during rm_redo().
11551
 */
11552
static void
11553
rm_redo_error_callback(void *arg)
11554
0
{
11555
0
  XLogReaderState *record = (XLogReaderState *) arg;
11556
0
  StringInfoData buf;
11557
11558
0
  initStringInfo(&buf);
11559
0
  xlog_outdesc(&buf, record);
11560
11561
  /* translator: %s is a WAL record description */
11562
0
  errcontext("WAL redo at %X/%X for %s",
11563
0
         (uint32) (record->ReadRecPtr >> 32),
11564
0
         (uint32) record->ReadRecPtr,
11565
0
         buf.data);
11566
11567
0
  pfree(buf.data);
11568
0
}
11569
11570
/*
11571
 * BackupInProgress: check if online backup mode is active
11572
 *
11573
 * This is done by checking for existence of the "backup_label" file.
11574
 */
11575
bool
11576
BackupInProgress(void)
11577
3
{
11578
3
  struct stat stat_buf;
11579
11580
3
  return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
11581
3
}
11582
11583
/*
11584
 * CancelBackup: rename the "backup_label" and "tablespace_map"
11585
 *         files to cancel backup mode
11586
 *
11587
 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
11588
 * Similarly, if the "tablespace_map" file exists, it will be renamed to
11589
 * "tablespace_map.old".
11590
 *
11591
 * Note that this will render an online backup in progress
11592
 * useless. To correctly finish an online backup, pg_stop_backup must be
11593
 * called.
11594
 */
11595
void
CancelBackup(void)
{
	struct stat stat_buf;

	/* if the backup_label file is not there, return */
	if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
		return;

	/* remove leftover file from previously canceled backup if it exists */
	unlink(BACKUP_LABEL_OLD);

	/*
	 * Durably rename backup_label out of the way first; if this fails we
	 * leave everything alone, since backup mode is still effectively active.
	 */
	if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode was not canceled"),
				 errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		return;
	}

	/* if the tablespace_map file is not there, return */
	if (stat(TABLESPACE_MAP, &stat_buf) < 0)
	{
		/* backup_label alone was renamed; that already cancels backup mode */
		ereport(LOG,
				(errmsg("online backup mode canceled"),
				 errdetail("File \"%s\" was renamed to \"%s\".",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		return;
	}

	/* remove leftover file from previously canceled backup if it exists */
	unlink(TABLESPACE_MAP_OLD);

	/*
	 * Rename tablespace_map as well.  Backup mode is canceled either way at
	 * this point (backup_label is gone); the only difference is whether we
	 * log success or warn that tablespace_map was left behind.
	 */
	if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
	{
		ereport(LOG,
				(errmsg("online backup mode canceled"),
				 errdetail("Files \"%s\" and \"%s\" were renamed to "
						   "\"%s\" and \"%s\", respectively.",
						   BACKUP_LABEL_FILE, TABLESPACE_MAP,
						   BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
	}
	else
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode canceled"),
				 errdetail("File \"%s\" was renamed to \"%s\", but "
						   "file \"%s\" could not be renamed to \"%s\": %m.",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
						   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
	}
}
11650
11651
/*
11652
 * Read the XLOG page containing RecPtr into readBuf (if not read already).
11653
 * Returns number of bytes read, if the page is read successfully, or -1
11654
 * in case of errors.  When errors occur, they are ereport'ed, but only
11655
 * if they have not been previously reported.
11656
 *
11657
 * This is responsible for restoring files from archive as needed, as well
11658
 * as for waiting for the requested WAL record to arrive in standby mode.
11659
 *
11660
 * 'emode' specifies the log level used for reporting "file not found" or
11661
 * "end of WAL" situations in archive recovery, or in standby mode when a
11662
 * trigger file is found. If set to WARNING or below, XLogPageRead() returns
11663
 * false in those situations, on higher log levels the ereport() won't
11664
 * return.
11665
 *
11666
 * In standby mode, if after a successful return of XLogPageRead() the
11667
 * caller finds the record it's interested in to be broken, it should
11668
 * ereport the error with the level determined by
11669
 * emode_for_corrupt_record(), and then set lastSourceFailed
11670
 * and call XLogPageRead() again with the same arguments. This lets
11671
 * XLogPageRead() to try fetching the record from another source, or to
11672
 * sleep and retry.
11673
 */
11674
static int
XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
			 XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
{
	XLogPageReadPrivate *private =
	(XLogPageReadPrivate *) xlogreader->private_data;
	int			emode = private->emode;
	uint32		targetPageOff;	/* byte offset of the page within its segment */
	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;

	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 &&
		!XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
	{
		/*
		 * Request a restartpoint if we've replayed too much xlog since the
		 * last one.
		 */
		if (bgwriterLaunched)
		{
			if (XLogCheckpointNeeded(readSegNo))
			{
				/* refresh the cached redo pointer, then re-check */
				(void) GetRedoRecPtr();
				if (XLogCheckpointNeeded(readSegNo))
					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
			}
		}

		close(readFile);
		readFile = -1;
		readSource = 0;
	}

	XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM &&
		 receivedUpto < targetPagePtr + reqLen))
	{
		if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
										 private->randAccess,
										 private->fetching_ckpt,
										 targetRecPtr))
		{
			/* end of WAL / trigger: reset read state and report failure */
			if (readFile >= 0)
				close(readFile);
			readFile = -1;
			readLen = 0;
			readSource = 0;

			return -1;
		}
	}

	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from master, calculate how
	 * much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
			readLen = XLOG_BLCKSZ;
		else
			readLen = XLogSegmentOffset(receivedUpto, wal_segment_size) -
				targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	/* Read the requested page */
	readOff = targetPageOff;
	if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
	{
		char		fname[MAXFNAMELEN];
		int			save_errno = errno;	/* XLogFileName may clobber errno */

		XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
		errno = save_errno;
		ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
				(errcode_for_file_access(),
				 errmsg("could not seek in log segment %s to offset %u: %m",
						fname, readOff)));
		goto next_record_is_invalid;
	}

	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
	if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		char		fname[MAXFNAMELEN];
		int			save_errno = errno;	/* preserve errno across cleanup */

		pgstat_report_wait_end();
		XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
		errno = save_errno;
		ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
				(errcode_for_file_access(),
				 errmsg("could not read from log segment %s, offset %u: %m",
						fname, readOff)));
		goto next_record_is_invalid;
	}
	pgstat_report_wait_end();

	Assert(targetSegNo == readSegNo);
	Assert(targetPageOff == readOff);
	Assert(reqLen <= readLen);

	*readTLI = curFileTLI;

	/*
	 * Check the page header immediately, so that we can retry immediately if
	 * it's not valid. This may seem unnecessary, because XLogReadRecord()
	 * validates the page header anyway, and would propagate the failure up to
	 * ReadRecord(), which would retry. However, there's a corner case with
	 * continuation records, if a record is split across two pages such that
	 * we would need to read the two pages from different sources. For
	 * example, imagine a scenario where a streaming replica is started up,
	 * and replay reaches a record that's split across two WAL segments. The
	 * first page is only available locally, in pg_wal, because it's already
	 * been recycled in the master. The second page, however, is not present
	 * in pg_wal, and we should stream it from the master. There is a recycled
	 * WAL segment present in pg_wal, with garbage contents, however. We would
	 * read the first page from the local WAL segment, but when reading the
	 * second page, we would read the bogus, recycled, WAL segment. If we
	 * didn't catch that case here, we would never recover, because
	 * ReadRecord() would retry reading the whole record from the beginning.
	 *
	 * Of course, this only catches errors in the page header, which is what
	 * happens in the case of a recycled WAL segment. Other kinds of errors or
	 * corruption still has the same problem. But this at least fixes the
	 * common case, which can happen as part of normal operation.
	 *
	 * Validating the page header is cheap enough that doing it twice
	 * shouldn't be a big deal from a performance point of view.
	 */
	if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
	{
		/* reset any error XLogReaderValidatePageHeader() might have set */
		xlogreader->errormsg_buf[0] = '\0';
		goto next_record_is_invalid;
	}

	return readLen;

next_record_is_invalid:
	/* Mark the current source failed so the state machine tries the next. */
	lastSourceFailed = true;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return -1;
}
11848
11849
/*
11850
 * Open the WAL segment containing WAL location 'RecPtr'.
11851
 *
11852
 * The segment can be fetched via restore_command, or via walreceiver having
11853
 * streamed the record, or it can already be present in pg_wal. Checking
11854
 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
11855
 * too, in case someone copies a new segment directly to pg_wal. That is not
11856
 * documented or recommended, though.
11857
 *
11858
 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
11859
 * prepare to read WAL starting from RedoStartLSN after this.
11860
 *
11861
 * 'RecPtr' might not point to the beginning of the record we're interested
11862
 * in, it might also point to the page or segment header. In that case,
11863
 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
11864
 * used to decide which timeline to stream the requested WAL from.
11865
 *
11866
 * If the record is not immediately available, the function returns false
11867
 * if we're not in standby mode. In standby mode, waits for it to become
11868
 * available.
11869
 *
11870
 * When the requested record becomes available, the function opens the file
11871
 * containing it (if not open already), and returns true. When end of standby
11872
 * mode is triggered by the user, and there is no more WAL available, returns
11873
 * false.
11874
 */
11875
static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
							bool fetching_ckpt, XLogRecPtr tliRecPtr)
{
	/* Time of the last failed read attempt; used to pace retries. */
	static TimestampTz last_fail_time = 0;
	TimestampTz now;
	/* Ensure we ping the primary at most once per call while waiting. */
	bool		streaming_reply_sent = false;

	/*-------
	 * Standby mode is implemented by a state machine:
	 *
	 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
	 *	  pg_wal (XLOG_FROM_PG_WAL)
	 * 2. Check trigger file
	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
	 * 4. Rescan timelines
	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
	 *
	 * Failure to read from the current source advances the state machine to
	 * the next state.
	 *
	 * 'currentSource' indicates the current state. There are no currentSource
	 * values for "check trigger", "rescan timelines", and "sleep" states,
	 * those actions are taken when reading from the previous source fails, as
	 * part of advancing to the next state.
	 *-------
	 */
	if (!InArchiveRecovery)
		currentSource = XLOG_FROM_PG_WAL;
	else if (currentSource == 0)
		currentSource = XLOG_FROM_ARCHIVE;

	for (;;)
	{
		int			oldSource = currentSource;

		/*
		 * First check if we failed to read from the current source, and
		 * advance the state machine if so. The failure to read might've
		 * happened outside this function, e.g when a CRC check fails on a
		 * record, or within this loop.
		 */
		if (lastSourceFailed)
		{
			switch (currentSource)
			{
				case XLOG_FROM_ARCHIVE:
				case XLOG_FROM_PG_WAL:

					/*
					 * Check to see if the trigger file exists. Note that we
					 * do this only after failure, so when you create the
					 * trigger file, we still finish replaying as much as we
					 * can from archive and pg_wal before failover.
					 */
					if (StandbyMode && CheckForStandbyTrigger())
					{
						ShutdownWalRcv();
						return false;
					}

					/*
					 * Not in standby mode, and we've now tried the archive
					 * and pg_wal.
					 */
					if (!StandbyMode)
						return false;

					/*
					 * If primary_conninfo is set, launch walreceiver to try
					 * to stream the missing WAL.
					 *
					 * If fetching_ckpt is true, RecPtr points to the initial
					 * checkpoint location. In that case, we use RedoStartLSN
					 * as the streaming start position instead of RecPtr, so
					 * that when we later jump backwards to start redo at
					 * RedoStartLSN, we will have the logs streamed already.
					 */
					if (PrimaryConnInfo)
					{
						XLogRecPtr	ptr;
						TimeLineID	tli;

						if (fetching_ckpt)
						{
							ptr = RedoStartLSN;
							tli = ControlFile->checkPointCopy.ThisTimeLineID;
						}
						else
						{
							ptr = RecPtr;

							/*
							 * Use the record begin position to determine the
							 * TLI, rather than the position we're reading.
							 */
							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);

							/*
							 * Timelines must not move backwards relative to
							 * the last recovered WAL file.
							 */
							if (curFileTLI > 0 && tli < curFileTLI)
								elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
									 (uint32) (tliRecPtr >> 32),
									 (uint32) tliRecPtr,
									 tli, curFileTLI);
						}
						curFileTLI = tli;
						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
											 PrimarySlotName);

						/*
						 * Reset cached receive pointer so the STREAM state
						 * below re-queries the walreceiver's write position.
						 */
						receivedUpto = 0;
					}

					/*
					 * Move to XLOG_FROM_STREAM state in either case. We'll
					 * get immediate failure if we didn't launch walreceiver,
					 * and move on to the next state.
					 */
					currentSource = XLOG_FROM_STREAM;
					break;

				case XLOG_FROM_STREAM:

					/*
					 * Failure while streaming. Most likely, we got here
					 * because streaming replication was terminated, or
					 * promotion was triggered. But we also get here if we
					 * find an invalid record in the WAL streamed from master,
					 * in which case something is seriously wrong. There's
					 * little chance that the problem will just go away, but
					 * PANIC is not good for availability either, especially
					 * in hot standby mode. So, we treat that the same as
					 * disconnection, and retry from archive/pg_wal again. The
					 * WAL in the archive should be identical to what was
					 * streamed, so it's unlikely that it helps, but one can
					 * hope...
					 */

					/*
					 * Before we leave XLOG_FROM_STREAM state, make sure that
					 * walreceiver is not active, so that it won't overwrite
					 * WAL that we restore from archive.
					 */
					if (WalRcvStreaming())
						ShutdownWalRcv();

					/*
					 * Before we sleep, re-scan for possible new timelines if
					 * we were requested to recover to the latest timeline.
					 */
					if (recoveryTargetIsLatest)
					{
						if (rescanLatestTimeLine())
						{
							currentSource = XLOG_FROM_ARCHIVE;
							break;
						}
					}

					/*
					 * XLOG_FROM_STREAM is the last state in our state
					 * machine, so we've exhausted all the options for
					 * obtaining the requested WAL. We're going to loop back
					 * and retry from the archive, but if it hasn't been long
					 * since last attempt, sleep wal_retrieve_retry_interval
					 * milliseconds to avoid busy-waiting.
					 */
					now = GetCurrentTimestamp();
					if (!TimestampDifferenceExceeds(last_fail_time, now,
													wal_retrieve_retry_interval))
					{
						long		secs,
									wait_time;
						int			usecs;

						TimestampDifference(last_fail_time, now, &secs, &usecs);
						/* remaining portion of the retry interval, in ms */
						wait_time = wal_retrieve_retry_interval -
							(secs * 1000 + usecs / 1000);

						WaitLatch(&XLogCtl->recoveryWakeupLatch,
								  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
								  wait_time, WAIT_EVENT_RECOVERY_WAL_STREAM);
						ResetLatch(&XLogCtl->recoveryWakeupLatch);
						now = GetCurrentTimestamp();
					}
					last_fail_time = now;
					currentSource = XLOG_FROM_ARCHIVE;
					break;

				default:
					elog(ERROR, "unexpected WAL source %d", currentSource);
			}
		}
		else if (currentSource == XLOG_FROM_PG_WAL)
		{
			/*
			 * We just successfully read a file in pg_wal. We prefer files in
			 * the archive over ones in pg_wal, so try the next file again
			 * from the archive first.
			 */
			if (InArchiveRecovery)
				currentSource = XLOG_FROM_ARCHIVE;
		}

		if (currentSource != oldSource)
			elog(DEBUG2, "switched WAL source from %s to %s after %s",
				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
				 lastSourceFailed ? "failure" : "success");

		/*
		 * We've now handled possible failure. Try to read from the chosen
		 * source.
		 */
		lastSourceFailed = false;

		switch (currentSource)
		{
			case XLOG_FROM_ARCHIVE:
			case XLOG_FROM_PG_WAL:
				/* Close any old file we might have open. */
				if (readFile >= 0)
				{
					close(readFile);
					readFile = -1;
				}
				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				/*
				 * Try to restore the file from archive, or read an existing
				 * file from pg_wal.
				 */
				readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
											  currentSource);
				if (readFile >= 0)
					return true;	/* success! */

				/*
				 * Nope, not found in archive or pg_wal.
				 */
				lastSourceFailed = true;
				break;

			case XLOG_FROM_STREAM:
				{
					bool		havedata;

					/*
					 * Check if WAL receiver is still active.
					 */
					if (!WalRcvStreaming())
					{
						lastSourceFailed = true;
						break;
					}

					/*
					 * Walreceiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk.  In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle. When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * allotted to conflicting queries will decrease.
					 */
					if (RecPtr < receivedUpto)
						havedata = true;
					else
					{
						XLogRecPtr	latestChunkStart;

						receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
						if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
						{
							havedata = true;
							if (latestChunkStart <= RecPtr)
							{
								XLogReceiptTime = GetCurrentTimestamp();
								SetCurrentChunkStartTime(XLogReceiptTime);
							}
						}
						else
							havedata = false;
					}
					if (havedata)
					{
						/*
						 * Great, streamed far enough.  Open the file if it's
						 * not open already.  Also read the timeline history
						 * file if we haven't initialized timeline history
						 * yet; it should be streamed over and present in
						 * pg_wal by now.  Use XLOG_FROM_STREAM so that source
						 * info is set correctly and XLogReceiptTime isn't
						 * changed.
						 */
						if (readFile < 0)
						{
							if (!expectedTLEs)
								expectedTLEs = readTimeLineHistory(receiveTLI);
							readFile = XLogFileRead(readSegNo, PANIC,
													receiveTLI,
													XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
							return true;
						}
						break;
					}

					/*
					 * Data not here yet. Check for trigger, then wait for
					 * walreceiver to wake us up when new WAL arrives.
					 */
					if (CheckForStandbyTrigger())
					{
						/*
						 * Note that we don't "return false" immediately here.
						 * After being triggered, we still want to replay all
						 * the WAL that was already streamed. It's in pg_wal
						 * now, so we just treat this as a failure, and the
						 * state machine will move on to replay the streamed
						 * WAL from pg_wal, and then recheck the trigger and
						 * exit replay.
						 */
						lastSourceFailed = true;
						break;
					}

					/*
					 * Since we have replayed everything we have received so
					 * far and are about to start waiting for more WAL, let's
					 * tell the upstream server our replay location now so
					 * that pg_stat_replication doesn't show stale
					 * information.
					 */
					if (!streaming_reply_sent)
					{
						WalRcvForceReply();
						streaming_reply_sent = true;
					}

					/*
					 * Wait for more WAL to arrive. Time out after 5 seconds
					 * to react to a trigger file promptly.
					 */
					WaitLatch(&XLogCtl->recoveryWakeupLatch,
							  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
							  5000L, WAIT_EVENT_RECOVERY_WAL_ALL);
					ResetLatch(&XLogCtl->recoveryWakeupLatch);
					break;
				}

			default:
				elog(ERROR, "unexpected WAL source %d", currentSource);
		}

		/*
		 * This possibly-long loop needs to handle interrupts of startup
		 * process.
		 */
		HandleStartupProcInterrupts();
	}

	return false;				/* not reached */
}
12247
12248
/*
12249
 * Determine what log level should be used to report a corrupt WAL record
12250
 * in the current WAL page, previously read by XLogPageRead().
12251
 *
12252
 * 'emode' is the error mode that would be used to report a file-not-found
12253
 * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
12254
 * we're retrying the exact same record that we've tried previously, only
12255
 * complain the first time to keep the noise down.  However, we only do so when
12256
 * reading from pg_wal, because we don't expect any invalid records in archive
12257
 * or in records streamed from master. Files in the archive should be complete,
12258
 * and we should never hit the end of WAL because we stop and wait for more WAL
12259
 * to arrive before replaying it.
12260
 *
12261
 * NOTE: This function remembers the RecPtr value it was last called with,
12262
 * to suppress repeated messages about the same record. Only call this when
12263
 * you are about to ereport(), or you might cause a later message to be
12264
 * erroneously suppressed.
12265
 */
12266
static int
12267
emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12268
5
{
12269
5
  static XLogRecPtr lastComplaint = 0;
12270
12271
5
  if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
12272
5
  {
12273
5
    if (RecPtr == lastComplaint)
12274
0
      emode = DEBUG1;
12275
5
    else
12276
5
      lastComplaint = RecPtr;
12277
5
  }
12278
5
  return emode;
12279
5
}
12280
12281
/*
12282
 * Check to see whether the user-specified trigger file exists and whether a
12283
 * promote request has arrived.  If either condition holds, return true.
12284
 */
12285
static bool
CheckForStandbyTrigger(void)
{
	struct stat stat_buf;
	/* Once a promotion has been observed, keep reporting true. */
	static bool triggered = false;

	if (triggered)
		return true;

	/* First check for a promotion requested via signal (pg_ctl promote). */
	if (IsPromoteTriggered())
	{
		/*
		 * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
		 * signal handler. It now leaves the file in place and lets the
		 * Startup process do the unlink. This allows Startup to know whether
		 * it should create a full checkpoint before starting up (fallback
		 * mode). Fast promotion takes precedence.
		 */
		if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
		{
			unlink(PROMOTE_SIGNAL_FILE);
			unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
			fast_promote = true;
		}
		else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
		{
			unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
			fast_promote = false;
		}

		ereport(LOG, (errmsg("received promote request")));

		ResetPromoteTriggered();
		triggered = true;
		return true;
	}

	/* No trigger_file configured: nothing more to check. */
	if (TriggerFile == NULL)
		return false;

	/* Then check for the user-specified trigger file. */
	if (stat(TriggerFile, &stat_buf) == 0)
	{
		ereport(LOG,
				(errmsg("trigger file found: %s", TriggerFile)));
		unlink(TriggerFile);
		triggered = true;
		/* trigger-file promotion always takes the fast path */
		fast_promote = true;
		return true;
	}
	else if (errno != ENOENT)
		/* stat() failed for a reason other than "file absent": report it. */
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not stat trigger file \"%s\": %m",
						TriggerFile)));

	return false;
}
12342
12343
/*
12344
 * Remove the files signaling a standby promotion request.
12345
 */
12346
void
12347
RemovePromoteSignalFiles(void)
12348
1.99k
{
12349
1.99k
  unlink(PROMOTE_SIGNAL_FILE);
12350
1.99k
  unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12351
1.99k
}
12352
12353
/*
12354
 * Check to see if a promote request has arrived. Should be
12355
 * called by postmaster after receiving SIGUSR1.
12356
 */
12357
bool
12358
CheckPromoteSignal(void)
12359
0
{
12360
0
  struct stat stat_buf;
12361
12362
0
  if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
12363
0
    stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12364
0
    return true;
12365
12366
0
  return false;
12367
0
}
12368
12369
/*
12370
 * Wake up startup process to replay newly arrived WAL, or to notice that
12371
 * failover has been requested.
12372
 */
12373
void
WakeupRecovery(void)
{
	/* Setting the latch wakes the startup process out of its WaitLatch(). */
	SetLatch(&XLogCtl->recoveryWakeupLatch);
}
12378
12379
/*
12380
 * Update the WalWriterSleeping flag.
12381
 */
12382
void
SetWalWriterSleeping(bool sleeping)
{
	/* info_lck guards WalWriterSleeping against concurrent access. */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->WalWriterSleeping = sleeping;
	SpinLockRelease(&XLogCtl->info_lck);
}
12389
12390
/*
12391
 * Schedule a walreceiver wakeup in the main recovery loop.
12392
 */
12393
void
XLogRequestWalReceiverReply(void)
{
	/* Just set the flag; the recovery loop acts on it later. */
	doRequestWalReceiverReply = true;
}