YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/postgres/src/backend/postmaster/autovacuum.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * autovacuum.c
4
 *
5
 * PostgreSQL Integrated Autovacuum Daemon
6
 *
7
 * The autovacuum system is structured in two different kinds of processes: the
8
 * autovacuum launcher and the autovacuum worker.  The launcher is an
9
 * always-running process, started by the postmaster when the autovacuum GUC
10
 * parameter is set.  The launcher schedules autovacuum workers to be started
11
 * when appropriate.  The workers are the processes which execute the actual
12
 * vacuuming; they connect to a database as determined in the launcher, and
13
 * once connected they examine the catalogs to select the tables to vacuum.
14
 *
15
 * The autovacuum launcher cannot start the worker processes by itself,
16
 * because doing so would cause robustness issues (namely, failure to shut
17
 * them down on exceptional conditions, and also, since the launcher is
18
 * connected to shared memory and is thus subject to corruption there, it is
19
 * not as robust as the postmaster).  So it leaves that task to the postmaster.
20
 *
21
 * There is an autovacuum shared memory area, where the launcher stores
22
 * information about the database it wants vacuumed.  When it wants a new
23
 * worker to start, it sets a flag in shared memory and sends a signal to the
24
 * postmaster.  Then postmaster knows nothing more than it must start a worker;
25
 * so it forks a new child, which turns into a worker.  This new process
26
 * connects to shared memory, and there it can inspect the information that the
27
 * launcher has set up.
28
 *
29
 * If the fork() call fails in the postmaster, it sets a flag in the shared
30
 * memory area, and sends a signal to the launcher.  The launcher, upon
31
 * noticing the flag, can try starting the worker again by resending the
32
 * signal.  Note that the failure can only be transient (fork failure due to
33
 * high load, memory pressure, too many processes, etc); more permanent
34
 * problems, like failure to connect to a database, are detected later in the
35
 * worker and dealt with just by having the worker exit normally.  The launcher
36
 * will launch a new worker again later, per schedule.
37
 *
38
 * When the worker is done vacuuming it sends SIGUSR2 to the launcher.  The
39
 * launcher then wakes up and is able to launch another worker, if the schedule
40
 * is so tight that a new worker is needed immediately.  At this time the
41
 * launcher can also balance the settings for the various remaining workers'
42
 * cost-based vacuum delay feature.
43
 *
44
 * Note that there can be more than one worker in a database concurrently.
45
 * They will store the table they are currently vacuuming in shared memory, so
46
 * that other workers avoid being blocked waiting for the vacuum lock for that
47
 * table.  They will also reload the pgstats data just before vacuuming each
48
 * table, to avoid vacuuming a table that was just finished being vacuumed by
49
 * another worker and thus is no longer noted in shared memory.  However,
50
 * there is a window (caused by pgstat delay) during which a worker may choose
51
 * a table that was already vacuumed; this is a bug in the current design.
52
 *
53
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
54
 * Portions Copyright (c) 1994, Regents of the University of California
55
 *
56
 *
57
 * IDENTIFICATION
58
 *    src/backend/postmaster/autovacuum.c
59
 *
60
 *-------------------------------------------------------------------------
61
 */
62
#include "postgres.h"
63
64
#include <signal.h>
65
#include <sys/time.h>
66
#include <unistd.h>
67
68
#include "access/heapam.h"
69
#include "access/htup_details.h"
70
#include "access/multixact.h"
71
#include "access/reloptions.h"
72
#include "access/transam.h"
73
#include "access/xact.h"
74
#include "catalog/dependency.h"
75
#include "catalog/namespace.h"
76
#include "catalog/pg_database.h"
77
#include "commands/dbcommands.h"
78
#include "commands/vacuum.h"
79
#include "lib/ilist.h"
80
#include "libpq/pqsignal.h"
81
#include "miscadmin.h"
82
#include "nodes/makefuncs.h"
83
#include "pgstat.h"
84
#include "postmaster/autovacuum.h"
85
#include "postmaster/fork_process.h"
86
#include "postmaster/postmaster.h"
87
#include "storage/bufmgr.h"
88
#include "storage/ipc.h"
89
#include "storage/latch.h"
90
#include "storage/lmgr.h"
91
#include "storage/pmsignal.h"
92
#include "storage/proc.h"
93
#include "storage/procsignal.h"
94
#include "storage/sinvaladt.h"
95
#include "storage/smgr.h"
96
#include "tcop/tcopprot.h"
97
#include "utils/fmgroids.h"
98
#include "utils/fmgrprotos.h"
99
#include "utils/lsyscache.h"
100
#include "utils/memutils.h"
101
#include "utils/ps_status.h"
102
#include "utils/rel.h"
103
#include "utils/snapmgr.h"
104
#include "utils/syscache.h"
105
#include "utils/timeout.h"
106
#include "utils/timestamp.h"
107
#include "utils/tqual.h"
108
109
110
/*
111
 * GUC parameters
112
 */
113
bool    autovacuum_start_daemon = false;	/* initialized to false here */
114
int     autovacuum_max_workers;	/* the shared WorkerInfo array is sized by this */
115
int     autovacuum_work_mem = -1;	/* NOTE(review): -1 presumably means "use the general setting" -- confirm */
116
int     autovacuum_naptime;	/* in seconds: the launcher stores it directly in nap->tv_sec */
117
int     autovacuum_vac_thresh;	/* presumably the vacuum threshold -- use not visible in this chunk */
118
double    autovacuum_vac_scale;	/* presumably the vacuum scale factor -- use not visible in this chunk */
119
int     autovacuum_anl_thresh;	/* presumably the analyze threshold -- use not visible in this chunk */
120
double    autovacuum_anl_scale;	/* presumably the analyze scale factor -- use not visible in this chunk */
121
int     autovacuum_freeze_max_age;
122
int     autovacuum_multixact_freeze_max_age;
123
124
/* cost-based delay settings; see the wi_cost_* members of WorkerInfoData */
int     autovacuum_vac_cost_delay;
125
int     autovacuum_vac_cost_limit;
126
127
int     Log_autovacuum_min_duration = -1;	/* NOTE(review): -1 presumably disables logging -- confirm */
128
129
/* how long to keep pgstat data in the launcher, in milliseconds */
130
0
#define STATS_READ_DELAY 1000
131
132
/* the minimum allowed time between two awakenings of the launcher */
133
0
#define MIN_AUTOVAC_SLEEPTIME 100.0 /* milliseconds */
134
0
/*
 * Counterpart upper bound.  Beware that the two limits use different units
 * (milliseconds above, seconds here), so they cannot be compared directly.
 */
#define MAX_AUTOVAC_SLEEPTIME 300  /* seconds */
135
136
/* Flags to tell if we are in an autovacuum process */
137
static bool am_autovacuum_launcher = false;
138
static bool am_autovacuum_worker = false;
139
140
/* Flags set by signal handlers */
141
static volatile sig_atomic_t got_SIGHUP = false;
142
static volatile sig_atomic_t got_SIGUSR2 = false;
143
static volatile sig_atomic_t got_SIGTERM = false;
144
145
/* Comparison points for determining whether freeze_max_age is exceeded */
146
static TransactionId recentXid;
147
static MultiXactId recentMulti;
148
149
/* Default freeze ages to use for autovacuum (varies by database) */
150
static int  default_freeze_min_age;
151
static int  default_freeze_table_age;
152
static int  default_multixact_freeze_min_age;
153
static int  default_multixact_freeze_table_age;
154
155
/* Memory context for long-lived data */
156
static MemoryContext AutovacMemCxt;
157
158
/* struct to keep track of databases in launcher */
159
typedef struct avl_dbase
160
{
161
  Oid     adl_datid;    /* hash key -- must be first */
162
  TimestampTz adl_next_worker;	/* time at which a worker is next due for
								 * this database; the launcher compares it
								 * against the current time */
163
  int     adl_score;	/* NOTE(review): use not visible in this chunk;
						 * presumably a sort key -- confirm */
164
  dlist_node  adl_node;	/* link into the launcher's DatabaseList */
165
} avl_dbase;
166
167
/* struct to keep track of databases in worker */
168
/*
 * NOTE(review): none of these members are used in the visible part of the
 * file; the per-field notes below are inferred from names and types only and
 * should be confirmed against the worker code.
 */
typedef struct avw_dbase
169
{
170
  Oid     adw_datid;	/* presumably the database's OID */
171
  char     *adw_name;	/* presumably the database's name */
172
  TransactionId adw_frozenxid;	/* presumably the database's frozen XID */
173
  MultiXactId adw_minmulti;	/* presumably the database's minimum multixact */
174
  PgStat_StatDBEntry *adw_entry;	/* presumably the matching pgstat entry */
175
} avw_dbase;
176
177
/* struct to keep track of tables to vacuum and/or analyze, in 1st pass */
178
typedef struct av_relation
179
{
180
  Oid     ar_toastrelid;  /* hash key - must be first */
181
  Oid     ar_relid;
182
  bool    ar_hasrelopts;	/* NOTE(review): presumably tells whether
							 * ar_reloptions holds valid data -- confirm */
183
  AutoVacOpts ar_reloptions;  /* copy of AutoVacOpts from the main table's
184
                 * reloptions.  The copy is stored by value, so
                 * it can never be NULL; absence has to be
                 * signalled some other way (see ar_hasrelopts) */
185
} av_relation;
186
187
/* struct to keep track of tables to vacuum and/or analyze, after rechecking */
188
/*
 * NOTE(review): the members are not used in the visible part of the file;
 * notes marked "presumably" are inferred from names and should be confirmed.
 */
typedef struct autovac_table
189
{
190
  Oid     at_relid;
191
  int     at_vacoptions;  /* bitmask of VacuumOption */
192
  VacuumParams at_params;
193
  int     at_vacuum_cost_delay;
194
  int     at_vacuum_cost_limit;
195
  bool    at_dobalance;	/* presumably mirrors wi_dobalance in WorkerInfoData */
196
  bool    at_sharedrel;	/* presumably mirrors wi_sharedrel in WorkerInfoData */
197
  char     *at_relname;	/* presumably relation, namespace and database
						 * names (this and the next two members) */
198
  char     *at_nspname;
199
  char     *at_datname;
200
} autovac_table;
201
202
/*-------------
203
 * This struct holds information about a single worker's whereabouts.  We keep
204
 * an array of these in shared memory, sized according to
205
 * autovacuum_max_workers.
206
 *
207
 * wi_links   entry into free list or running list
208
 * wi_dboid   OID of the database this worker is supposed to work on
209
 * wi_tableoid  OID of the table currently being vacuumed, if any
210
 * wi_sharedrel flag indicating whether table is marked relisshared
211
 * wi_proc    pointer to PGPROC of the running worker, NULL if not started
212
 * wi_launchtime Time at which this worker was launched
213
 * wi_cost_*  Vacuum cost-based delay parameters current in this worker
214
 *
215
 * All fields are protected by AutovacuumLock, except for wi_tableoid and
216
 * wi_sharedrel which are protected by AutovacuumScheduleLock (note these
217
 * two fields are read-only for everyone except that worker itself).
218
 *-------------
219
 */
220
typedef struct WorkerInfoData
221
{
222
  dlist_node  wi_links;	/* the launcher pushes this onto av_freeWorkers
						 * when it reclaims a slot */
223
  Oid     wi_dboid;
224
  Oid     wi_tableoid;
225
  PGPROC     *wi_proc;
226
  TimestampTz wi_launchtime;	/* compared against the current time by the
								 * launcher to detect a worker that is taking
								 * too long to start */
227
  bool    wi_dobalance;	/* NOTE(review): not described in the header
						 * comment; presumably whether this worker takes
						 * part in cost balancing -- confirm */
228
  bool    wi_sharedrel;
229
  int     wi_cost_delay;
230
  int     wi_cost_limit;
231
  int     wi_cost_limit_base;	/* NOTE(review): presumably the limit before
								 * balancing is applied -- confirm */
232
} WorkerInfoData;
233
234
/* Handle type used throughout this file: a pointer to a WorkerInfoData */
typedef struct WorkerInfoData *WorkerInfo;
235
236
/*
237
 * Possible signals received by the launcher from remote processes.  These are
238
 * stored atomically in shared memory so that other processes can set them
239
 * without locking.
240
 */
241
/*
 * Each value is an index into AutoVacuumShmem->av_signal[].  The launcher
 * tests and clears the corresponding entry when it handles SIGUSR2.
 */
typedef enum
242
{
243
  AutoVacForkFailed,      /* failed trying to start a worker */
244
  AutoVacRebalance,     /* rebalance the cost limits */
245
  AutoVacNumSignals     /* must be last */	/* also used as the av_signal[] array length */
246
}     AutoVacuumSignal;
247
248
/*
249
 * Autovacuum workitem array, stored in AutoVacuumShmem->av_workItems.  This
250
 * list is mostly protected by AutovacuumLock, except that if an item is
251
 * marked 'active' other processes must not modify the work-identifying
252
 * members.
253
 */
254
typedef struct AutoVacuumWorkItem
255
{
256
  AutoVacuumWorkItemType avw_type;
257
  bool    avw_used;   /* below data is valid */
258
  bool    avw_active;   /* being processed */
259
  Oid     avw_database;	/* presumably the database the item applies to */
260
  Oid     avw_relation;	/* presumably the relation the item applies to */
261
  BlockNumber avw_blockNumber;
262
} AutoVacuumWorkItem;
263
264
0
/* Fixed length of the av_workItems array in the shared memory struct */
#define NUM_WORKITEMS 256
265
266
/*-------------
267
 * The main autovacuum shmem struct.  On shared memory we store this main
268
 * struct and the array of WorkerInfo structs.  This struct keeps:
269
 *
270
 * av_signal    set by other processes to indicate various conditions
271
 * av_launcherpid the PID of the autovacuum launcher
272
 * av_freeWorkers the WorkerInfo freelist
273
 * av_runningWorkers the WorkerInfo non-free queue
274
 * av_startingWorker pointer to WorkerInfo currently being started (cleared by
275
 *          the worker itself as soon as it's up and running)
276
 * av_workItems   work item array
277
 *
278
 * This struct is protected by AutovacuumLock, except for av_signal and parts
279
 * of the worker list (see above).
280
 *-------------
281
 */
282
typedef struct
283
{
284
  sig_atomic_t av_signal[AutoVacNumSignals];	/* indexed by AutoVacuumSignal */
285
  pid_t   av_launcherpid;	/* set to MyProcPid by the launcher once it is
							 * running, reset to 0 when it shuts down */
286
  dlist_head  av_freeWorkers;	/* non-empty means a worker slot is available */
287
  dlist_head  av_runningWorkers;
288
  WorkerInfo  av_startingWorker;	/* NULL when no worker is starting up */
289
  AutoVacuumWorkItem av_workItems[NUM_WORKITEMS];
290
} AutoVacuumShmemStruct;
291
292
static AutoVacuumShmemStruct *AutoVacuumShmem;
293
294
/*
295
 * the database list (of avl_dbase elements) in the launcher, and the context
296
 * that contains it
297
 */
298
static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList);
299
static MemoryContext DatabaseListCxt = NULL;
300
301
/* Pointer to my own WorkerInfo, valid on each worker */
302
static WorkerInfo MyWorkerInfo = NULL;
303
304
/* PID of launcher, valid only in worker while shutting down */
305
int     AutovacuumLauncherPid = 0;
306
307
#ifdef EXEC_BACKEND
308
static pid_t avlauncher_forkexec(void);
309
static pid_t avworker_forkexec(void);
310
#endif
311
NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]) pg_attribute_noreturn();
312
NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]) pg_attribute_noreturn();
313
314
static Oid  do_start_worker(void);
315
static void launcher_determine_sleep(bool canlaunch, bool recursing,
316
             struct timeval *nap);
317
static void launch_worker(TimestampTz now);
318
static List *get_database_list(void);
319
static void rebuild_database_list(Oid newdb);
320
static int  db_comparator(const void *a, const void *b);
321
static void autovac_balance_cost(void);
322
323
static void do_autovacuum(void);
324
static void FreeWorkerInfo(int code, Datum arg);
325
326
static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map,
327
            TupleDesc pg_class_desc,
328
            int effective_multixact_freeze_max_age);
329
static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
330
              Form_pg_class classForm,
331
              PgStat_StatTabEntry *tabentry,
332
              int effective_multixact_freeze_max_age,
333
              bool *dovacuum, bool *doanalyze, bool *wraparound);
334
335
static void autovacuum_do_vac_analyze(autovac_table *tab,
336
              BufferAccessStrategy bstrategy);
337
static AutoVacOpts *extract_autovac_opts(HeapTuple tup,
338
           TupleDesc pg_class_desc);
339
static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
340
              PgStat_StatDBEntry *shared,
341
              PgStat_StatDBEntry *dbentry);
342
static void perform_work_item(AutoVacuumWorkItem *workitem);
343
static void autovac_report_activity(autovac_table *tab);
344
static void autovac_report_workitem(AutoVacuumWorkItem *workitem,
345
            const char *nspname, const char *relname);
346
static void av_sighup_handler(SIGNAL_ARGS);
347
static void avl_sigusr2_handler(SIGNAL_ARGS);
348
static void avl_sigterm_handler(SIGNAL_ARGS);
349
static void autovac_refresh_stats(void);
350
351
352
353
/********************************************************************
354
 *            AUTOVACUUM LAUNCHER CODE
355
 ********************************************************************/
356
357
#ifdef EXEC_BACKEND
358
/*
359
 * forkexec routine for the autovacuum launcher process.
360
 *
361
 * Format up the arglist, then fork and exec.
362
 */
363
static pid_t
364
avlauncher_forkexec(void)
365
{
366
  /* argument vector handed to postmaster_forkexec; 10 slots is ample */
  char     *av[10];
367
  int     ac = 0;
368
369
  av[ac++] = "postgres";
370
  av[ac++] = "--forkavlauncher";
371
  av[ac++] = NULL;      /* filled in by postmaster_forkexec */
372
  /* terminating NULL; deliberately not counted in ac */
  av[ac] = NULL;
373
374
  /* guard against overrunning av[] if more arguments are ever added */
  Assert(ac < lengthof(av));
375
376
  /* the result of postmaster_forkexec is passed straight back to the caller */
  return postmaster_forkexec(ac, av);
377
}
378
379
/*
380
 * We need this set from the outside, before InitProcess is called
381
 */
382
void
383
AutovacuumLauncherIAm(void)
384
{
385
  /*
   * Only sets the file-local flag.  AutoVacLauncherMain sets the same flag
   * itself; this entry point exists for the EXEC_BACKEND build, where the
   * flag is needed before that function runs.
   */
  am_autovacuum_launcher = true;
386
}
387
#endif
388
389
/*
390
 * Main entry point for autovacuum launcher process, to be called from the
391
 * postmaster.
392
 */
393
int
394
StartAutoVacLauncher(void)
395
0
{
396
0
  /*
   * Returns the launcher's PID (cast to int) in the calling process, or 0
   * if the process could not be created; the failure is reported at LOG
   * level rather than raised as an error.
   */
  pid_t   AutoVacPID;
397
398
  /*
   * Only the way the child is created differs between the two builds; the
   * switch body is shared.
   */
#ifdef EXEC_BACKEND
399
  switch ((AutoVacPID = avlauncher_forkexec()))
400
#else
401
0
  switch ((AutoVacPID = fork_process()))
402
0
#endif
403
0
  {
404
0
    case -1:
405
0
      ereport(LOG,
406
0
          (errmsg("could not fork autovacuum launcher process: %m")));
407
0
      return 0;
408
409
0
    /* the child branch is compiled only in the non-EXEC_BACKEND build */
#ifndef EXEC_BACKEND
410
0
    case 0:
411
      /* in postmaster child ... */
412
0
      InitPostmasterChild();
413
414
      /* Close the postmaster's sockets */
415
0
      ClosePostmasterPorts(false);
416
417
0
      /*
       * AutoVacLauncherMain is declared pg_attribute_noreturn(), so the
       * break below is not expected to execute.
       */
      AutoVacLauncherMain(0, NULL);
418
0
      break;
419
0
#endif
420
0
    default:
421
0
      return (int) AutoVacPID;
422
0
  }
423
424
  /* shouldn't get here */
425
0
  return 0;
426
0
}
427
428
/*
429
 * Main loop for the autovacuum launcher process.
430
 */
431
NON_EXEC_STATIC void
432
AutoVacLauncherMain(int argc, char *argv[])
433
0
{
434
0
  /*
   * Overview: install signal handlers, initialize as a backend without a
   * database, set up the error-recovery point, override a few GUCs, then
   * loop sleeping and launching workers until SIGTERM arrives or autovacuum
   * is disabled by a config reload.  This function never returns; every
   * exit path ends in proc_exit().  argc and argv are not used.
   */
  sigjmp_buf  local_sigjmp_buf;
435
436
0
  am_autovacuum_launcher = true;
437
438
  /* Identify myself via ps */
439
0
  init_ps_display(pgstat_get_backend_desc(B_AUTOVAC_LAUNCHER), "", "", "");
440
441
0
  ereport(DEBUG1,
442
0
      (errmsg("autovacuum launcher started")));
443
444
0
  /* PostAuthDelay is scaled by 1000000, i.e. seconds to microseconds */
  if (PostAuthDelay)
445
0
    pg_usleep(PostAuthDelay * 1000000L);
446
447
0
  SetProcessingMode(InitProcessing);
448
449
  /*
450
   * Set up signal handlers.  We operate on databases much like a regular
451
   * backend, so we use the same signal handling.  See equivalent code in
452
   * tcop/postgres.c.
453
   */
454
0
  pqsignal(SIGHUP, av_sighup_handler);
455
0
  pqsignal(SIGINT, StatementCancelHandler);
456
0
  pqsignal(SIGTERM, avl_sigterm_handler);
457
458
0
  pqsignal(SIGQUIT, quickdie);
459
0
  InitializeTimeouts();   /* establishes SIGALRM handler */
460
461
0
  pqsignal(SIGPIPE, SIG_IGN);
462
0
  pqsignal(SIGUSR1, procsignal_sigusr1_handler);
463
0
  pqsignal(SIGUSR2, avl_sigusr2_handler);
464
0
  pqsignal(SIGFPE, FloatExceptionHandler);
465
0
  pqsignal(SIGCHLD, SIG_DFL);
466
467
  /* Early initialization */
468
0
  BaseInit();
469
470
  /*
471
   * Create a per-backend PGPROC struct in shared memory, except in the
472
   * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
473
   * this before we can use LWLocks (and in the EXEC_BACKEND case we already
474
   * had to do some stuff with LWLocks).
475
   */
476
0
#ifndef EXEC_BACKEND
477
0
  InitProcess();
478
0
#endif
479
480
0
  /* no database name or OID is passed: the launcher connects to no database */
  InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false);
481
482
0
  SetProcessingMode(NormalProcessing);
483
484
  /*
485
   * Create a memory context that we will do all our work in.  We do this so
486
   * that we can reset the context during error recovery and thereby avoid
487
   * possible memory leaks.
488
   */
489
0
  AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
490
0
                      "Autovacuum Launcher",
491
0
                      ALLOCSET_DEFAULT_SIZES);
492
0
  MemoryContextSwitchTo(AutovacMemCxt);
493
494
  /*
495
   * If an exception is encountered, processing resumes here.
496
   *
497
   * This code is a stripped down version of PostgresMain error recovery.
498
   */
499
0
  if (sigsetjmp(local_sigjmp_buf, 1) != 0)
500
0
  {
501
    /* since not using PG_TRY, must reset error stack by hand */
502
0
    error_context_stack = NULL;
503
504
    /* Prevents interrupts while cleaning up */
505
0
    HOLD_INTERRUPTS();
506
507
    /* Forget any pending QueryCancel or timeout request */
508
0
    disable_all_timeouts(false);
509
0
    QueryCancelPending = false; /* second to avoid race condition */
510
511
    /* Report the error to the server log */
512
0
    EmitErrorReport();
513
514
    /* Abort the current transaction in order to recover */
515
0
    AbortCurrentTransaction();
516
517
    /*
518
     * Release any other resources, for the case where we were not in a
519
     * transaction.
520
     */
521
0
    LWLockReleaseAll();
522
0
    pgstat_report_wait_end();
523
0
    AbortBufferIO();
524
0
    UnlockBuffers();
525
0
    if (CurrentResourceOwner)
526
0
    {
527
0
      ResourceOwnerRelease(CurrentResourceOwner,
528
0
                 RESOURCE_RELEASE_BEFORE_LOCKS,
529
0
                 false, true);
530
      /* we needn't bother with the other ResourceOwnerRelease phases */
531
0
    }
532
0
    AtEOXact_Buffers(false);
533
0
    AtEOXact_SMgr();
534
0
    AtEOXact_Files(false);
535
0
    AtEOXact_HashTables(false);
536
537
    /*
538
     * Now return to normal top-level context and clear ErrorContext for
539
     * next time.
540
     */
541
0
    MemoryContextSwitchTo(AutovacMemCxt);
542
0
    FlushErrorState();
543
544
    /* Flush any leaked data in the top-level context */
545
0
    MemoryContextResetAndDeleteChildren(AutovacMemCxt);
546
547
    /* don't leave dangling pointers to freed memory */
548
0
    DatabaseListCxt = NULL;
549
0
    dlist_init(&DatabaseList);
550
551
    /*
552
     * Make sure pgstat also considers our stat data as gone.  Note: we
553
     * mustn't use autovac_refresh_stats here.
554
     */
555
0
    pgstat_clear_snapshot();
556
557
    /* Now we can allow interrupts again */
558
0
    RESUME_INTERRUPTS();
559
560
    /* if in shutdown mode, no need for anything further; just go away */
561
0
    if (got_SIGTERM)
562
0
      goto shutdown;
563
564
    /*
565
     * Sleep at least 1 second after any error.  We don't want to be
566
     * filling the error logs as fast as we can.
567
     */
568
0
    pg_usleep(1000000L);
569
0
  }
570
571
  /* We can now handle ereport(ERROR) */
572
0
  PG_exception_stack = &local_sigjmp_buf;
573
574
  /* must unblock signals before calling rebuild_database_list */
575
0
  PG_SETMASK(&UnBlockSig);
576
577
  /*
578
   * Set always-secure search path.  Launcher doesn't connect to a database,
579
   * so this has no effect.
580
   */
581
0
  SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
582
583
  /*
584
   * Force zero_damaged_pages OFF in the autovac process, even if it is set
585
   * in postgresql.conf.  We don't really want such a dangerous option being
586
   * applied non-interactively.
587
   */
588
0
  SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
589
590
  /*
591
   * Force settable timeouts off to avoid letting these settings prevent
592
   * regular maintenance from being executed.
593
   */
594
0
  SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
595
0
  SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
596
0
  SetConfigOption("idle_in_transaction_session_timeout", "0",
597
0
          PGC_SUSET, PGC_S_OVERRIDE);
598
599
  /*
600
   * Force default_transaction_isolation to READ COMMITTED.  We don't want
601
   * to pay the overhead of serializable mode, nor add any risk of causing
602
   * deadlocks or delaying other transactions.
603
   */
604
0
  SetConfigOption("default_transaction_isolation", "read committed",
605
0
          PGC_SUSET, PGC_S_OVERRIDE);
606
607
  /*
608
   * In emergency mode, just start a worker (unless shutdown was requested)
609
   * and go away.
610
   */
611
0
  if (!AutoVacuumingActive())
612
0
  {
613
0
    if (!got_SIGTERM)
614
0
      do_start_worker();
615
0
    proc_exit(0);     /* done */
616
0
  }
617
618
0
  AutoVacuumShmem->av_launcherpid = MyProcPid;
619
620
  /*
621
   * Create the initial database list.  The invariant we want this list to
622
   * keep is that it's ordered by decreasing next_time.  As soon as an entry
623
   * is updated to a higher time, it will be moved to the front (which is
624
   * correct because the only operation is to add autovacuum_naptime to the
625
   * entry, and time always increases).
626
   */
627
0
  rebuild_database_list(InvalidOid);
628
629
  /* loop until shutdown request */
630
0
  while (!got_SIGTERM)
631
0
  {
632
0
    struct timeval nap;
633
0
    TimestampTz current_time = 0;
634
0
    bool    can_launch;
635
0
    int     rc;
636
637
    /*
638
     * This loop is a bit different from the normal use of WaitLatch,
639
     * because we'd like to sleep before the first launch of a child
640
     * process.  So it's WaitLatch, then ResetLatch, then check for
641
     * wakening conditions.
642
     */
643
644
0
    launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers),
645
0
                 false, &nap);
646
647
    /*
648
     * Wait until naptime expires or we get some type of signal (all the
649
     * signal handlers will wake us by calling SetLatch).
650
     */
651
0
    /* the timeout argument is nap converted to milliseconds */
    rc = WaitLatch(MyLatch,
652
0
             WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
653
0
             (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
654
0
             WAIT_EVENT_AUTOVACUUM_MAIN);
655
656
0
    ResetLatch(MyLatch);
657
658
    /* Process sinval catchup interrupts that happened while sleeping */
659
0
    ProcessCatchupInterrupt();
660
661
    /*
662
     * Emergency bailout if postmaster has died.  This is to avoid the
663
     * necessity for manual cleanup of all postmaster children.
664
     */
665
0
    if (rc & WL_POSTMASTER_DEATH)
666
0
      proc_exit(1);
667
668
    /* the normal shutdown case */
669
0
    if (got_SIGTERM)
670
0
      break;
671
672
0
    if (got_SIGHUP)
673
0
    {
674
0
      got_SIGHUP = false;
675
0
      ProcessConfigFile(PGC_SIGHUP);
676
677
      /* shutdown requested in config file? */
678
0
      if (!AutoVacuumingActive())
679
0
        break;
680
681
      /* rebalance in case the default cost parameters changed */
682
0
      LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
683
0
      autovac_balance_cost();
684
0
      LWLockRelease(AutovacuumLock);
685
686
      /* rebuild the list in case the naptime changed */
687
0
      rebuild_database_list(InvalidOid);
688
0
    }
689
690
    /*
691
     * a worker finished, or postmaster signalled failure to start a
692
     * worker
693
     */
694
0
    if (got_SIGUSR2)
695
0
    {
696
0
      got_SIGUSR2 = false;
697
698
      /* rebalance cost limits, if needed */
699
0
      /* the flag is tested without the lock, then cleared under it */
      if (AutoVacuumShmem->av_signal[AutoVacRebalance])
700
0
      {
701
0
        LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
702
0
        AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
703
0
        autovac_balance_cost();
704
0
        LWLockRelease(AutovacuumLock);
705
0
      }
706
707
0
      if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
708
0
      {
709
        /*
710
         * If the postmaster failed to start a new worker, we sleep
711
         * for a little while and resend the signal.  The new worker's
712
         * state is still in memory, so this is sufficient.  After
713
         * that, we restart the main loop.
714
         *
715
         * XXX should we put a limit to the number of times we retry?
716
         * I don't think it makes much sense, because a future start
717
         * of a worker will continue to fail in the same way.
718
         */
719
0
        AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
720
0
        pg_usleep(1000000L);  /* 1s */
721
0
        SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
722
0
        continue;
723
0
      }
724
0
    }
725
726
    /*
727
     * There are some conditions that we need to check before trying to
728
     * start a worker.  First, we need to make sure that there is a worker
729
     * slot available.  Second, we need to make sure that no other worker
730
     * failed while starting up.
731
     */
732
733
0
    current_time = GetCurrentTimestamp();
734
0
    LWLockAcquire(AutovacuumLock, LW_SHARED);
735
736
0
    can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers);
737
738
0
    if (AutoVacuumShmem->av_startingWorker != NULL)
739
0
    {
740
0
      int     waittime;
741
0
      WorkerInfo  worker = AutoVacuumShmem->av_startingWorker;
742
743
      /*
744
       * We can't launch another worker when another one is still
745
       * starting up (or failed while doing so), so just sleep for a bit
746
       * more; that worker will wake us up again as soon as it's ready.
747
       * We will only wait autovacuum_naptime seconds (up to a maximum
748
       * of 60 seconds) for this to happen however.  Note that failure
749
       * to connect to a particular database is not a problem here,
750
       * because the worker removes itself from the startingWorker
751
       * pointer before trying to connect.  Problems detected by the
752
       * postmaster (like fork() failure) are also reported and handled
753
       * differently.  The only problems that may cause this code to
754
       * fire are errors in the earlier sections of AutoVacWorkerMain,
755
       * before the worker removes the WorkerInfo from the
756
       * startingWorker pointer.
757
       */
758
0
      /* waittime is in milliseconds */
      waittime = Min(autovacuum_naptime, 60) * 1000;
759
0
      if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
760
0
                       waittime))
761
0
      {
762
0
        /*
         * Swap the shared lock for an exclusive one.  The lock is not
         * held in between, hence the recheck of av_startingWorker below.
         */
        LWLockRelease(AutovacuumLock);
763
0
        LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
764
765
        /*
766
         * No other process can put a worker in starting mode, so if
767
         * startingWorker is still set (non-NULL) after exchanging our
768
         * lock, we assume it's the same one we saw above (so we don't
769
         * recheck the launch time).
770
         */
771
0
        if (AutoVacuumShmem->av_startingWorker != NULL)
772
0
        {
773
0
          worker = AutoVacuumShmem->av_startingWorker;
774
0
          worker->wi_dboid = InvalidOid;
775
0
          worker->wi_tableoid = InvalidOid;
776
0
          worker->wi_sharedrel = false;
777
0
          worker->wi_proc = NULL;
778
0
          worker->wi_launchtime = 0;
779
0
          dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
780
0
                  &worker->wi_links);
781
0
          AutoVacuumShmem->av_startingWorker = NULL;
782
0
          elog(WARNING, "worker took too long to start; canceled");
783
0
          /*
           * NOTE(review): can_launch was computed before this slot
           * went back on the free list and is not recomputed here.
           * If the free list was empty beforehand, the launcher
           * still goes back to sleep on this iteration.
           */
        }
784
0
      }
785
0
      else
786
0
        can_launch = false;
787
0
    }
788
0
    LWLockRelease(AutovacuumLock);  /* either shared or exclusive */
789
790
    /* if we can't do anything, just go back to sleep */
791
0
    if (!can_launch)
792
0
      continue;
793
794
    /* We're OK to start a new worker */
795
796
0
    if (dlist_is_empty(&DatabaseList))
797
0
    {
798
      /*
799
       * Special case when the list is empty: start a worker right away.
800
       * This covers the initial case, when no database is in pgstats
801
       * (thus the list is empty).  Note that the constraints in
802
       * launcher_determine_sleep keep us from starting workers too
803
       * quickly (at most once every autovacuum_naptime when the list is
804
       * empty).
805
       */
806
0
      launch_worker(current_time);
807
0
    }
808
0
    else
809
0
    {
810
      /*
811
       * because rebuild_database_list constructs a list with most
812
       * distant adl_next_worker first, we obtain our database from the
813
       * tail of the list.
814
       */
815
0
      avl_dbase  *avdb;
816
817
0
      avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
818
819
      /*
820
       * launch a worker if next_worker is right now or it is in the
821
       * past
822
       */
823
0
      if (TimestampDifferenceExceeds(avdb->adl_next_worker,
824
0
                       current_time, 0))
825
0
        launch_worker(current_time);
826
0
    }
827
0
  }
828
829
  /* Normal exit from the autovac launcher is here */
830
0
  /* also reached by goto from error recovery when SIGTERM is pending */
shutdown:
831
0
  ereport(DEBUG1,
832
0
      (errmsg("autovacuum launcher shutting down")));
833
0
  AutoVacuumShmem->av_launcherpid = 0;
834
835
0
  proc_exit(0);       /* done */
836
0
}
837
838
/*
839
 * Determine the time to sleep, based on the database list.
840
 *
841
 * The "canlaunch" parameter indicates whether we can start a worker right now;
842
 * it is false when, for example, all the workers are busy.  If it is false, we
843
 * sleep for a long time; the sleep will be interrupted when a worker exits.
844
 */
845
static void
846
launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval *nap)
847
0
{
848
  /*
849
   * We sleep until the next scheduled vacuum.  We trust that when the
850
   * database list was built, care was taken so that no entries have times
851
   * in the past; if the first entry has too close a next_worker value, or a
852
   * time in the past, we will sleep a small nominal time.
853
   */
854
0
  if (!canlaunch)
855
0
  {
856
0
    nap->tv_sec = autovacuum_naptime;
857
0
    nap->tv_usec = 0;
858
0
  }
859
0
  else if (!dlist_is_empty(&DatabaseList))
860
0
  {
861
0
    TimestampTz current_time = GetCurrentTimestamp();
862
0
    TimestampTz next_wakeup;
863
0
    avl_dbase  *avdb;
864
0
    long    secs;
865
0
    int     usecs;
866
867
0
    avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
868
869
0
    next_wakeup = avdb->adl_next_worker;
870
0
    TimestampDifference(current_time, next_wakeup, &secs, &usecs);
871
872
0
    nap->tv_sec = secs;
873
0
    nap->tv_usec = usecs;
874
0
  }
875
0
  else
876
0
  {
877
    /* list is empty, sleep for whole autovacuum_naptime seconds  */
878
0
    nap->tv_sec = autovacuum_naptime;
879
0
    nap->tv_usec = 0;
880
0
  }
881
882
  /*
883
   * If the result is exactly zero, it means a database had an entry with
884
   * time in the past.  Rebuild the list so that the databases are evenly
885
   * distributed again, and recalculate the time to sleep.  This can happen
886
   * if there are more tables needing vacuum than workers, and they all take
887
   * longer to vacuum than autovacuum_naptime.
888
   *
889
   * We only recurse once.  rebuild_database_list should always return times
890
   * in the future, but it seems best not to trust too much on that.
891
   */
892
0
  if (nap->tv_sec == 0 && nap->tv_usec == 0 && !recursing)
893
0
  {
894
0
    rebuild_database_list(InvalidOid);
895
0
    launcher_determine_sleep(canlaunch, true, nap);
896
0
    return;
897
0
  }
898
899
  /* The smallest time we'll allow the launcher to sleep. */
900
0
  if (nap->tv_sec <= 0 && nap->tv_usec <= MIN_AUTOVAC_SLEEPTIME * 1000)
901
0
  {
902
0
    nap->tv_sec = 0;
903
0
    nap->tv_usec = MIN_AUTOVAC_SLEEPTIME * 1000;
904
0
  }
905
906
  /*
907
   * If the sleep time is too large, clamp it to an arbitrary maximum (plus
908
   * any fractional seconds, for simplicity).  This avoids an essentially
909
   * infinite sleep in strange cases like the system clock going backwards a
910
   * few years.
911
   */
912
0
  if (nap->tv_sec > MAX_AUTOVAC_SLEEPTIME)
913
0
    nap->tv_sec = MAX_AUTOVAC_SLEEPTIME;
914
0
}
915
916
/*
917
 * Build an updated DatabaseList.  It must only contain databases that appear
918
 * in pgstats, and must be sorted by next_worker from highest to lowest,
919
 * distributed regularly across the next autovacuum_naptime interval.
920
 *
921
 * Receives the Oid of the database that made this list be generated (we call
922
 * this the "new" database, because when the database was already present on
923
 * the list, we expect that this function is not called at all).  The
924
 * preexisting list, if any, will be used to preserve the order of the
925
 * databases in the autovacuum_naptime period.  The new database is put at the
926
 * end of the interval.  The actual values are not saved, which should not be
927
 * much of a problem.
928
 */
929
static void
930
rebuild_database_list(Oid newdb)
931
0
{
932
0
  List     *dblist;
933
0
  ListCell   *cell;
934
0
  MemoryContext newcxt;
935
0
  MemoryContext oldcxt;
936
0
  MemoryContext tmpcxt;
937
0
  HASHCTL   hctl;
938
0
  int     score;
939
0
  int     nelems;
940
0
  HTAB     *dbhash;
941
0
  dlist_iter  iter;
942
943
  /* use fresh stats */
944
0
  autovac_refresh_stats();
945
946
0
  newcxt = AllocSetContextCreate(AutovacMemCxt,
947
0
                   "AV dblist",
948
0
                   ALLOCSET_DEFAULT_SIZES);
949
0
  tmpcxt = AllocSetContextCreate(newcxt,
950
0
                   "tmp AV dblist",
951
0
                   ALLOCSET_DEFAULT_SIZES);
952
0
  oldcxt = MemoryContextSwitchTo(tmpcxt);
953
954
  /*
955
   * Implementing this is not as simple as it sounds, because we need to put
956
   * the new database at the end of the list; next the databases that were
957
   * already on the list, and finally (at the tail of the list) all the
958
   * other databases that are not on the existing list.
959
   *
960
   * To do this, we build an empty hash table of scored databases.  We will
961
   * start with the lowest score (zero) for the new database, then
962
   * increasing scores for the databases in the existing list, in order, and
963
   * lastly increasing scores for all databases gotten via
964
   * get_database_list() that are not already on the hash.
965
   *
966
   * Then we will put all the hash elements into an array, sort the array by
967
   * score, and finally put the array elements into the new doubly linked
968
   * list.
969
   */
970
0
  hctl.keysize = sizeof(Oid);
971
0
  hctl.entrysize = sizeof(avl_dbase);
972
0
  hctl.hcxt = tmpcxt;
973
0
  dbhash = hash_create("db hash", 20, &hctl,  /* magic number here FIXME */
974
0
             HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
975
976
  /* start by inserting the new database */
977
0
  score = 0;
978
0
  if (OidIsValid(newdb))
979
0
  {
980
0
    avl_dbase  *db;
981
0
    PgStat_StatDBEntry *entry;
982
983
    /* only consider this database if it has a pgstat entry */
984
0
    entry = pgstat_fetch_stat_dbentry(newdb);
985
0
    if (entry != NULL)
986
0
    {
987
      /* we assume it isn't found because the hash was just created */
988
0
      db = hash_search(dbhash, &newdb, HASH_ENTER, NULL);
989
990
      /* hash_search already filled in the key */
991
0
      db->adl_score = score++;
992
      /* next_worker is filled in later */
993
0
    }
994
0
  }
995
996
  /* Now insert the databases from the existing list */
997
0
  dlist_foreach(iter, &DatabaseList)
998
0
  {
999
0
    avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
1000
0
    avl_dbase  *db;
1001
0
    bool    found;
1002
0
    PgStat_StatDBEntry *entry;
1003
1004
    /*
1005
     * skip databases with no stat entries -- in particular, this gets rid
1006
     * of dropped databases
1007
     */
1008
0
    entry = pgstat_fetch_stat_dbentry(avdb->adl_datid);
1009
0
    if (entry == NULL)
1010
0
      continue;
1011
1012
0
    db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found);
1013
1014
0
    if (!found)
1015
0
    {
1016
      /* hash_search already filled in the key */
1017
0
      db->adl_score = score++;
1018
      /* next_worker is filled in later */
1019
0
    }
1020
0
  }
1021
1022
  /* finally, insert all qualifying databases not previously inserted */
1023
0
  dblist = get_database_list();
1024
0
  foreach(cell, dblist)
1025
0
  {
1026
0
    avw_dbase  *avdb = lfirst(cell);
1027
0
    avl_dbase  *db;
1028
0
    bool    found;
1029
0
    PgStat_StatDBEntry *entry;
1030
1031
    /* only consider databases with a pgstat entry */
1032
0
    entry = pgstat_fetch_stat_dbentry(avdb->adw_datid);
1033
0
    if (entry == NULL)
1034
0
      continue;
1035
1036
0
    db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found);
1037
    /* only update the score if the database was not already on the hash */
1038
0
    if (!found)
1039
0
    {
1040
      /* hash_search already filled in the key */
1041
0
      db->adl_score = score++;
1042
      /* next_worker is filled in later */
1043
0
    }
1044
0
  }
1045
0
  nelems = score;
1046
1047
  /* from here on, the allocated memory belongs to the new list */
1048
0
  MemoryContextSwitchTo(newcxt);
1049
0
  dlist_init(&DatabaseList);
1050
1051
0
  if (nelems > 0)
1052
0
  {
1053
0
    TimestampTz current_time;
1054
0
    int     millis_increment;
1055
0
    avl_dbase  *dbary;
1056
0
    avl_dbase  *db;
1057
0
    HASH_SEQ_STATUS seq;
1058
0
    int     i;
1059
1060
    /* put all the hash elements into an array */
1061
0
    dbary = palloc(nelems * sizeof(avl_dbase));
1062
1063
0
    i = 0;
1064
0
    hash_seq_init(&seq, dbhash);
1065
0
    while ((db = hash_seq_search(&seq)) != NULL)
1066
0
      memcpy(&(dbary[i++]), db, sizeof(avl_dbase));
1067
1068
    /* sort the array */
1069
0
    qsort(dbary, nelems, sizeof(avl_dbase), db_comparator);
1070
1071
    /*
1072
     * Determine the time interval between databases in the schedule. If
1073
     * we see that the configured naptime would take us to sleep times
1074
     * lower than our min sleep time (which launcher_determine_sleep is
1075
     * coded not to allow), silently use a larger naptime (but don't touch
1076
     * the GUC variable).
1077
     */
1078
0
    millis_increment = 1000.0 * autovacuum_naptime / nelems;
1079
0
    if (millis_increment <= MIN_AUTOVAC_SLEEPTIME)
1080
0
      millis_increment = MIN_AUTOVAC_SLEEPTIME * 1.1;
1081
1082
0
    current_time = GetCurrentTimestamp();
1083
1084
    /*
1085
     * move the elements from the array into the dllist, setting the
1086
     * next_worker while walking the array
1087
     */
1088
0
    for (i = 0; i < nelems; i++)
1089
0
    {
1090
0
      avl_dbase  *db = &(dbary[i]);
1091
1092
0
      current_time = TimestampTzPlusMilliseconds(current_time,
1093
0
                             millis_increment);
1094
0
      db->adl_next_worker = current_time;
1095
1096
      /* later elements should go closer to the head of the list */
1097
0
      dlist_push_head(&DatabaseList, &db->adl_node);
1098
0
    }
1099
0
  }
1100
1101
  /* all done, clean up memory */
1102
0
  if (DatabaseListCxt != NULL)
1103
0
    MemoryContextDelete(DatabaseListCxt);
1104
0
  MemoryContextDelete(tmpcxt);
1105
0
  DatabaseListCxt = newcxt;
1106
0
  MemoryContextSwitchTo(oldcxt);
1107
0
}
1108
1109
/* qsort comparator for avl_dbase, using adl_score */
1110
static int
1111
db_comparator(const void *a, const void *b)
1112
0
{
1113
0
  if (((const avl_dbase *) a)->adl_score == ((const avl_dbase *) b)->adl_score)
1114
0
    return 0;
1115
0
  else
1116
0
    return (((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1;
1117
0
}
1118
1119
/*
1120
 * do_start_worker
1121
 *
1122
 * Bare-bones procedure for starting an autovacuum worker from the launcher.
1123
 * It determines what database to work on, sets up shared memory stuff and
1124
 * signals postmaster to start the worker.  It fails gracefully if invoked when
1125
 * autovacuum_workers are already active.
1126
 *
1127
 * Return value is the OID of the database that the worker is going to process,
1128
 * or InvalidOid if no worker was actually started.
1129
 */
1130
static Oid
1131
do_start_worker(void)
1132
0
{
1133
0
  List     *dblist;
1134
0
  ListCell   *cell;
1135
0
  TransactionId xidForceLimit;
1136
0
  MultiXactId multiForceLimit;
1137
0
  bool    for_xid_wrap;
1138
0
  bool    for_multi_wrap;
1139
0
  avw_dbase  *avdb;
1140
0
  TimestampTz current_time;
1141
0
  bool    skipit = false;
1142
0
  Oid     retval = InvalidOid;
1143
0
  MemoryContext tmpcxt,
1144
0
        oldcxt;
1145
1146
  /* return quickly when there are no free workers */
1147
0
  LWLockAcquire(AutovacuumLock, LW_SHARED);
1148
0
  if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers))
1149
0
  {
1150
0
    LWLockRelease(AutovacuumLock);
1151
0
    return InvalidOid;
1152
0
  }
1153
0
  LWLockRelease(AutovacuumLock);
1154
1155
  /*
1156
   * Create and switch to a temporary context to avoid leaking the memory
1157
   * allocated for the database list.
1158
   */
1159
0
  tmpcxt = AllocSetContextCreate(GetCurrentMemoryContext(),
1160
0
                   "Start worker tmp cxt",
1161
0
                   ALLOCSET_DEFAULT_SIZES);
1162
0
  oldcxt = MemoryContextSwitchTo(tmpcxt);
1163
1164
  /* use fresh stats */
1165
0
  autovac_refresh_stats();
1166
1167
  /* Get a list of databases */
1168
0
  dblist = get_database_list();
1169
1170
  /*
1171
   * Determine the oldest datfrozenxid/relfrozenxid that we will allow to
1172
   * pass without forcing a vacuum.  (This limit can be tightened for
1173
   * particular tables, but not loosened.)
1174
   */
1175
0
  recentXid = ReadNewTransactionId();
1176
0
  xidForceLimit = recentXid - autovacuum_freeze_max_age;
1177
  /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
1178
  /* this can cause the limit to go backwards by 3, but that's OK */
1179
0
  if (xidForceLimit < FirstNormalTransactionId)
1180
0
    xidForceLimit -= FirstNormalTransactionId;
1181
1182
  /* Also determine the oldest datminmxid we will consider. */
1183
0
  recentMulti = ReadNextMultiXactId();
1184
0
  multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold();
1185
0
  if (multiForceLimit < FirstMultiXactId)
1186
0
    multiForceLimit -= FirstMultiXactId;
1187
1188
  /*
1189
   * Choose a database to connect to.  We pick the database that was least
1190
   * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
1191
   * wraparound-related data loss.  If any db at risk of Xid wraparound is
1192
   * found, we pick the one with oldest datfrozenxid, independently of
1193
   * autovacuum times; similarly we pick the one with the oldest datminmxid
1194
   * if any is in MultiXactId wraparound.  Note that those in Xid wraparound
1195
   * danger are given more priority than those in multi wraparound danger.
1196
   *
1197
   * Note that a database with no stats entry is not considered, except for
1198
   * Xid wraparound purposes.  The theory is that if no one has ever
1199
   * connected to it since the stats were last initialized, it doesn't need
1200
   * vacuuming.
1201
   *
1202
   * XXX This could be improved if we had more info about whether it needs
1203
   * vacuuming before connecting to it.  Perhaps look through the pgstats
1204
   * data for the database's tables?  One idea is to keep track of the
1205
   * number of new and dead tuples per database in pgstats.  However it
1206
   * isn't clear how to construct a metric that measures that and not cause
1207
   * starvation for less busy databases.
1208
   */
1209
0
  avdb = NULL;
1210
0
  for_xid_wrap = false;
1211
0
  for_multi_wrap = false;
1212
0
  current_time = GetCurrentTimestamp();
1213
0
  foreach(cell, dblist)
1214
0
  {
1215
0
    avw_dbase  *tmp = lfirst(cell);
1216
0
    dlist_iter  iter;
1217
1218
    /* Check to see if this one is at risk of wraparound */
1219
0
    if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
1220
0
    {
1221
0
      if (avdb == NULL ||
1222
0
        TransactionIdPrecedes(tmp->adw_frozenxid,
1223
0
                    avdb->adw_frozenxid))
1224
0
        avdb = tmp;
1225
0
      for_xid_wrap = true;
1226
0
      continue;
1227
0
    }
1228
0
    else if (for_xid_wrap)
1229
0
      continue;     /* ignore not-at-risk DBs */
1230
0
    else if (MultiXactIdPrecedes(tmp->adw_minmulti, multiForceLimit))
1231
0
    {
1232
0
      if (avdb == NULL ||
1233
0
        MultiXactIdPrecedes(tmp->adw_minmulti, avdb->adw_minmulti))
1234
0
        avdb = tmp;
1235
0
      for_multi_wrap = true;
1236
0
      continue;
1237
0
    }
1238
0
    else if (for_multi_wrap)
1239
0
      continue;     /* ignore not-at-risk DBs */
1240
1241
    /* Find pgstat entry if any */
1242
0
    tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid);
1243
1244
    /*
1245
     * Skip a database with no pgstat entry; it means it hasn't seen any
1246
     * activity.
1247
     */
1248
0
    if (!tmp->adw_entry)
1249
0
      continue;
1250
1251
    /*
1252
     * Also, skip a database that appears on the database list as having
1253
     * been processed recently (less than autovacuum_naptime seconds ago).
1254
     * We do this so that we don't select a database which we just
1255
     * selected, but that pgstat hasn't gotten around to updating the last
1256
     * autovacuum time yet.
1257
     */
1258
0
    skipit = false;
1259
1260
0
    dlist_reverse_foreach(iter, &DatabaseList)
1261
0
    {
1262
0
      avl_dbase  *dbp = dlist_container(avl_dbase, adl_node, iter.cur);
1263
1264
0
      if (dbp->adl_datid == tmp->adw_datid)
1265
0
      {
1266
        /*
1267
         * Skip this database if its next_worker value falls between
1268
         * the current time and the current time plus naptime.
1269
         */
1270
0
        if (!TimestampDifferenceExceeds(dbp->adl_next_worker,
1271
0
                        current_time, 0) &&
1272
0
          !TimestampDifferenceExceeds(current_time,
1273
0
                        dbp->adl_next_worker,
1274
0
                        autovacuum_naptime * 1000))
1275
0
          skipit = true;
1276
1277
0
        break;
1278
0
      }
1279
0
    }
1280
0
    if (skipit)
1281
0
      continue;
1282
1283
    /*
1284
     * Remember the db with oldest autovac time.  (If we are here, both
1285
     * tmp->entry and db->entry must be non-null.)
1286
     */
1287
0
    if (avdb == NULL ||
1288
0
      tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time)
1289
0
      avdb = tmp;
1290
0
  }
1291
1292
  /* Found a database -- process it */
1293
0
  if (avdb != NULL)
1294
0
  {
1295
0
    WorkerInfo  worker;
1296
0
    dlist_node *wptr;
1297
1298
0
    LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1299
1300
    /*
1301
     * Get a worker entry from the freelist.  We checked above, so there
1302
     * really should be a free slot.
1303
     */
1304
0
    wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers);
1305
1306
0
    worker = dlist_container(WorkerInfoData, wi_links, wptr);
1307
0
    worker->wi_dboid = avdb->adw_datid;
1308
0
    worker->wi_proc = NULL;
1309
0
    worker->wi_launchtime = GetCurrentTimestamp();
1310
1311
0
    AutoVacuumShmem->av_startingWorker = worker;
1312
1313
0
    LWLockRelease(AutovacuumLock);
1314
1315
0
    SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
1316
1317
0
    retval = avdb->adw_datid;
1318
0
  }
1319
0
  else if (skipit)
1320
0
  {
1321
    /*
1322
     * If we skipped all databases on the list, rebuild it, because it
1323
     * probably contains a dropped database.
1324
     */
1325
0
    rebuild_database_list(InvalidOid);
1326
0
  }
1327
1328
0
  MemoryContextSwitchTo(oldcxt);
1329
0
  MemoryContextDelete(tmpcxt);
1330
1331
0
  return retval;
1332
0
}
1333
1334
/*
1335
 * launch_worker
1336
 *
1337
 * Wrapper for starting a worker from the launcher.  Besides actually starting
1338
 * it, update the database list to reflect the next time that another one will
1339
 * need to be started on the selected database.  The actual database choice is
1340
 * left to do_start_worker.
1341
 *
1342
 * This routine is also expected to insert an entry into the database list if
1343
 * the selected database was previously absent from the list.
1344
 */
1345
static void
1346
launch_worker(TimestampTz now)
1347
0
{
1348
0
  Oid     dbid;
1349
0
  dlist_iter  iter;
1350
1351
0
  dbid = do_start_worker();
1352
0
  if (OidIsValid(dbid))
1353
0
  {
1354
0
    bool    found = false;
1355
1356
    /*
1357
     * Walk the database list and update the corresponding entry.  If the
1358
     * database is not on the list, we'll recreate the list.
1359
     */
1360
0
    dlist_foreach(iter, &DatabaseList)
1361
0
    {
1362
0
      avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
1363
1364
0
      if (avdb->adl_datid == dbid)
1365
0
      {
1366
0
        found = true;
1367
1368
        /*
1369
         * add autovacuum_naptime seconds to the current time, and use
1370
         * that as the new "next_worker" field for this database.
1371
         */
1372
0
        avdb->adl_next_worker =
1373
0
          TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000);
1374
1375
0
        dlist_move_head(&DatabaseList, iter.cur);
1376
0
        break;
1377
0
      }
1378
0
    }
1379
1380
    /*
1381
     * If the database was not present in the database list, we rebuild
1382
     * the list.  It's possible that the database does not get into the
1383
     * list anyway, for example if it's a database that doesn't have a
1384
     * pgstat entry, but this is not a problem because we don't want to
1385
     * schedule workers regularly into those in any case.
1386
     */
1387
0
    if (!found)
1388
0
      rebuild_database_list(dbid);
1389
0
  }
1390
0
}
1391
1392
/*
1393
 * Called from postmaster to signal a failure to fork a process to become
1394
 * worker.  The postmaster should kill(SIGUSR2) the launcher shortly
1395
 * after calling this function.
1396
 */
1397
void
1398
AutoVacWorkerFailed(void)
1399
0
{
1400
0
  AutoVacuumShmem->av_signal[AutoVacForkFailed] = true;
1401
0
}
1402
1403
/* SIGHUP: set flag to re-read config file at next convenient time */
1404
static void
1405
av_sighup_handler(SIGNAL_ARGS)
1406
0
{
1407
0
  int     save_errno = errno;
1408
1409
0
  got_SIGHUP = true;
1410
0
  SetLatch(MyLatch);
1411
1412
0
  errno = save_errno;
1413
0
}
1414
1415
/* SIGUSR2: a worker is up and running, or just finished, or failed to fork */
1416
static void
1417
avl_sigusr2_handler(SIGNAL_ARGS)
1418
0
{
1419
0
  int     save_errno = errno;
1420
1421
0
  got_SIGUSR2 = true;
1422
0
  SetLatch(MyLatch);
1423
1424
0
  errno = save_errno;
1425
0
}
1426
1427
/* SIGTERM: time to die */
1428
static void
1429
avl_sigterm_handler(SIGNAL_ARGS)
1430
0
{
1431
0
  int     save_errno = errno;
1432
1433
0
  got_SIGTERM = true;
1434
0
  SetLatch(MyLatch);
1435
1436
0
  errno = save_errno;
1437
0
}
1438
1439
1440
/********************************************************************
1441
 *            AUTOVACUUM WORKER CODE
1442
 ********************************************************************/
1443
1444
#ifdef EXEC_BACKEND
1445
/*
 * avworker_forkexec
 *
 * EXEC_BACKEND-only way of starting an autovacuum worker: assemble the
 * argument vector and hand it to postmaster_forkexec.
 *
 * Returns whatever postmaster_forkexec returns.
 */
static pid_t
avworker_forkexec(void)
{
  char     *args[10];
  int     nargs = 0;

  args[nargs++] = "postgres";
  args[nargs++] = "--forkavworker";
  args[nargs++] = NULL;   /* filled in by postmaster_forkexec */
  args[nargs] = NULL;     /* terminator; not counted in nargs */

  Assert(nargs < lengthof(args));

  return postmaster_forkexec(nargs, args);
}
1465
1466
/*
1467
 * We need this set from the outside, before InitProcess is called
1468
 */
1469
void
1470
AutovacuumWorkerIAm(void)
1471
{
1472
  am_autovacuum_worker = true;
1473
}
1474
#endif
1475
1476
/*
1477
 * Main entry point for autovacuum worker process.
1478
 *
1479
 * This code is heavily based on pgarch.c, q.v.
1480
 */
1481
int
1482
StartAutoVacWorker(void)
1483
0
{
1484
0
  pid_t   worker_pid;
1485
1486
#ifdef EXEC_BACKEND
1487
  switch ((worker_pid = avworker_forkexec()))
1488
#else
1489
0
  switch ((worker_pid = fork_process()))
1490
0
#endif
1491
0
  {
1492
0
    case -1:
1493
0
      ereport(LOG,
1494
0
          (errmsg("could not fork autovacuum worker process: %m")));
1495
0
      return 0;
1496
1497
0
#ifndef EXEC_BACKEND
1498
0
    case 0:
1499
      /* in postmaster child ... */
1500
0
      InitPostmasterChild();
1501
1502
      /* Close the postmaster's sockets */
1503
0
      ClosePostmasterPorts(false);
1504
1505
0
      AutoVacWorkerMain(0, NULL);
1506
0
      break;
1507
0
#endif
1508
0
    default:
1509
0
      return (int) worker_pid;
1510
0
  }
1511
1512
  /* shouldn't get here */
1513
0
  return 0;
1514
0
}
1515
1516
/*
1517
 * AutoVacWorkerMain
1518
 */
1519
NON_EXEC_STATIC void
1520
AutoVacWorkerMain(int argc, char *argv[])
1521
0
{
1522
0
  sigjmp_buf  local_sigjmp_buf;
1523
0
  Oid     dbid;
1524
1525
0
  am_autovacuum_worker = true;
1526
1527
  /* Identify myself via ps */
1528
0
  init_ps_display(pgstat_get_backend_desc(B_AUTOVAC_WORKER), "", "", "");
1529
1530
0
  SetProcessingMode(InitProcessing);
1531
1532
  /*
1533
   * Set up signal handlers.  We operate on databases much like a regular
1534
   * backend, so we use the same signal handling.  See equivalent code in
1535
   * tcop/postgres.c.
1536
   */
1537
0
  pqsignal(SIGHUP, av_sighup_handler);
1538
1539
  /*
1540
   * SIGINT is used to signal canceling the current table's vacuum; SIGTERM
1541
   * means abort and exit cleanly, and SIGQUIT means abandon ship.
1542
   */
1543
0
  pqsignal(SIGINT, StatementCancelHandler);
1544
0
  pqsignal(SIGTERM, die);
1545
0
  pqsignal(SIGQUIT, quickdie);
1546
0
  InitializeTimeouts();   /* establishes SIGALRM handler */
1547
1548
0
  pqsignal(SIGPIPE, SIG_IGN);
1549
0
  pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1550
0
  pqsignal(SIGUSR2, SIG_IGN);
1551
0
  pqsignal(SIGFPE, FloatExceptionHandler);
1552
0
  pqsignal(SIGCHLD, SIG_DFL);
1553
1554
  /* Early initialization */
1555
0
  BaseInit();
1556
1557
  /*
1558
   * Create a per-backend PGPROC struct in shared memory, except in the
1559
   * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
1560
   * this before we can use LWLocks (and in the EXEC_BACKEND case we already
1561
   * had to do some stuff with LWLocks).
1562
   */
1563
0
#ifndef EXEC_BACKEND
1564
0
  InitProcess();
1565
0
#endif
1566
1567
  /*
1568
   * If an exception is encountered, processing resumes here.
1569
   *
1570
   * See notes in postgres.c about the design of this coding.
1571
   */
1572
0
  if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1573
0
  {
1574
    /* Prevents interrupts while cleaning up */
1575
0
    HOLD_INTERRUPTS();
1576
1577
    /* Report the error to the server log */
1578
0
    EmitErrorReport();
1579
1580
    /*
1581
     * We can now go away.  Note that because we called InitProcess, a
1582
     * callback was registered to do ProcKill, which will clean up
1583
     * necessary state.
1584
     */
1585
0
    proc_exit(0);
1586
0
  }
1587
1588
  /* We can now handle ereport(ERROR) */
1589
0
  PG_exception_stack = &local_sigjmp_buf;
1590
1591
0
  PG_SETMASK(&UnBlockSig);
1592
1593
  /*
1594
   * Set always-secure search path, so malicious users can't redirect user
1595
   * code (e.g. pg_index.indexprs).  (That code runs in a
1596
   * SECURITY_RESTRICTED_OPERATION sandbox, so malicious users could not
1597
   * take control of the entire autovacuum worker in any case.)
1598
   */
1599
0
  SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
1600
1601
  /*
1602
   * Force zero_damaged_pages OFF in the autovac process, even if it is set
1603
   * in postgresql.conf.  We don't really want such a dangerous option being
1604
   * applied non-interactively.
1605
   */
1606
0
  SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
1607
1608
  /*
1609
   * Force settable timeouts off to avoid letting these settings prevent
1610
   * regular maintenance from being executed.
1611
   */
1612
0
  SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1613
0
  SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1614
0
  SetConfigOption("idle_in_transaction_session_timeout", "0",
1615
0
          PGC_SUSET, PGC_S_OVERRIDE);
1616
1617
  /*
1618
   * Force default_transaction_isolation to READ COMMITTED.  We don't want
1619
   * to pay the overhead of serializable mode, nor add any risk of causing
1620
   * deadlocks or delaying other transactions.
1621
   */
1622
0
  SetConfigOption("default_transaction_isolation", "read committed",
1623
0
          PGC_SUSET, PGC_S_OVERRIDE);
1624
1625
  /*
1626
   * Force synchronous replication off to allow regular maintenance even if
1627
   * we are waiting for standbys to connect. This is important to ensure we
1628
   * aren't blocked from performing anti-wraparound tasks.
1629
   */
1630
0
  if (synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH)
1631
0
    SetConfigOption("synchronous_commit", "local",
1632
0
            PGC_SUSET, PGC_S_OVERRIDE);
1633
1634
  /*
1635
   * Get the info about the database we're going to work on.
1636
   */
1637
0
  LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1638
1639
  /*
1640
   * beware of startingWorker being INVALID; this should normally not
1641
   * happen, but if a worker fails after forking and before this, the
1642
   * launcher might have decided to remove it from the queue and start
1643
   * again.
1644
   */
1645
0
  if (AutoVacuumShmem->av_startingWorker != NULL)
1646
0
  {
1647
0
    MyWorkerInfo = AutoVacuumShmem->av_startingWorker;
1648
0
    dbid = MyWorkerInfo->wi_dboid;
1649
0
    MyWorkerInfo->wi_proc = MyProc;
1650
1651
    /* insert into the running list */
1652
0
    dlist_push_head(&AutoVacuumShmem->av_runningWorkers,
1653
0
            &MyWorkerInfo->wi_links);
1654
1655
    /*
1656
     * remove from the "starting" pointer, so that the launcher can start
1657
     * a new worker if required
1658
     */
1659
0
    AutoVacuumShmem->av_startingWorker = NULL;
1660
0
    LWLockRelease(AutovacuumLock);
1661
1662
0
    on_shmem_exit(FreeWorkerInfo, 0);
1663
1664
    /* wake up the launcher */
1665
0
    if (AutoVacuumShmem->av_launcherpid != 0)
1666
0
      kill(AutoVacuumShmem->av_launcherpid, SIGUSR2);
1667
0
  }
1668
0
  else
1669
0
  {
1670
    /* no worker entry for me, go away */
1671
0
    elog(WARNING, "autovacuum worker started without a worker entry");
1672
0
    dbid = InvalidOid;
1673
0
    LWLockRelease(AutovacuumLock);
1674
0
  }
1675
1676
0
  if (OidIsValid(dbid))
1677
0
  {
1678
0
    char    dbname[NAMEDATALEN];
1679
1680
    /*
1681
     * Report autovac startup to the stats collector.  We deliberately do
1682
     * this before InitPostgres, so that the last_autovac_time will get
1683
     * updated even if the connection attempt fails.  This is to prevent
1684
     * autovac from getting "stuck" repeatedly selecting an unopenable
1685
     * database, rather than making any progress on stuff it can connect
1686
     * to.
1687
     */
1688
0
    pgstat_report_autovac(dbid);
1689
1690
    /*
1691
     * Connect to the selected database
1692
     *
1693
     * Note: if we have selected a just-deleted database (due to using
1694
     * stale stats info), we'll fail and exit here.
1695
     */
1696
0
    InitPostgres(NULL, dbid, NULL, InvalidOid, dbname, false);
1697
0
    SetProcessingMode(NormalProcessing);
1698
0
    set_ps_display(dbname, false);
1699
0
    ereport(DEBUG1,
1700
0
        (errmsg("autovacuum: processing database \"%s\"", dbname)));
1701
1702
0
    if (PostAuthDelay)
1703
0
      pg_usleep(PostAuthDelay * 1000000L);
1704
1705
    /* And do an appropriate amount of work */
1706
0
    recentXid = ReadNewTransactionId();
1707
0
    recentMulti = ReadNextMultiXactId();
1708
0
    do_autovacuum();
1709
0
  }
1710
1711
  /*
1712
   * The launcher will be notified of my death in ProcKill, *if* we managed
1713
   * to get a worker slot at all
1714
   */
1715
1716
  /* All done, go away */
1717
0
  proc_exit(0);
1718
0
}
1719
1720
/*
1721
 * Return a WorkerInfo to the free list
1722
 */
1723
static void
1724
FreeWorkerInfo(int code, Datum arg)
1725
0
{
1726
0
  if (MyWorkerInfo != NULL)
1727
0
  {
1728
0
    LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1729
1730
    /*
1731
     * Wake the launcher up so that he can launch a new worker immediately
1732
     * if required.  We only save the launcher's PID in local memory here;
1733
     * the actual signal will be sent when the PGPROC is recycled.  Note
1734
     * that we always do this, so that the launcher can rebalance the cost
1735
     * limit setting of the remaining workers.
1736
     *
1737
     * We somewhat ignore the risk that the launcher changes its PID
1738
     * between us reading it and the actual kill; we expect ProcKill to be
1739
     * called shortly after us, and we assume that PIDs are not reused too
1740
     * quickly after a process exits.
1741
     */
1742
0
    AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid;
1743
1744
0
    dlist_delete(&MyWorkerInfo->wi_links);
1745
0
    MyWorkerInfo->wi_dboid = InvalidOid;
1746
0
    MyWorkerInfo->wi_tableoid = InvalidOid;
1747
0
    MyWorkerInfo->wi_sharedrel = false;
1748
0
    MyWorkerInfo->wi_proc = NULL;
1749
0
    MyWorkerInfo->wi_launchtime = 0;
1750
0
    MyWorkerInfo->wi_dobalance = false;
1751
0
    MyWorkerInfo->wi_cost_delay = 0;
1752
0
    MyWorkerInfo->wi_cost_limit = 0;
1753
0
    MyWorkerInfo->wi_cost_limit_base = 0;
1754
0
    dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
1755
0
            &MyWorkerInfo->wi_links);
1756
    /* not mine anymore */
1757
0
    MyWorkerInfo = NULL;
1758
1759
    /*
1760
     * now that we're inactive, cause a rebalancing of the surviving
1761
     * workers
1762
     */
1763
0
    AutoVacuumShmem->av_signal[AutoVacRebalance] = true;
1764
0
    LWLockRelease(AutovacuumLock);
1765
0
  }
1766
0
}
1767
1768
/*
1769
 * Update the cost-based delay parameters, so that multiple workers consume
1770
 * each a fraction of the total available I/O.
1771
 */
1772
void
1773
AutoVacuumUpdateDelay(void)
1774
0
{
1775
0
  if (MyWorkerInfo)
1776
0
  {
1777
0
    VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
1778
0
    VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
1779
0
  }
1780
0
}
1781
1782
/*
1783
 * autovac_balance_cost
1784
 *    Recalculate the cost limit setting for each active worker.
1785
 *
1786
 * Caller must hold the AutovacuumLock in exclusive mode.
1787
 */
1788
static void
1789
autovac_balance_cost(void)
1790
0
{
1791
  /*
1792
   * The idea here is that we ration out I/O equally.  The amount of I/O
1793
   * that a worker can consume is determined by cost_limit/cost_delay, so we
1794
   * try to equalize those ratios rather than the raw limit settings.
1795
   *
1796
   * note: in cost_limit, zero also means use value from elsewhere, because
1797
   * zero is not a valid value.
1798
   */
1799
0
  int     vac_cost_limit = (autovacuum_vac_cost_limit > 0 ?
1800
0
                  autovacuum_vac_cost_limit : VacuumCostLimit);
1801
0
  int     vac_cost_delay = (autovacuum_vac_cost_delay >= 0 ?
1802
0
                  autovacuum_vac_cost_delay : VacuumCostDelay);
1803
0
  double    cost_total;
1804
0
  double    cost_avail;
1805
0
  dlist_iter  iter;
1806
1807
  /* not set? nothing to do */
1808
0
  if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
1809
0
    return;
1810
1811
  /* calculate the total base cost limit of participating active workers */
1812
0
  cost_total = 0.0;
1813
0
  dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1814
0
  {
1815
0
    WorkerInfo  worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1816
1817
0
    if (worker->wi_proc != NULL &&
1818
0
      worker->wi_dobalance &&
1819
0
      worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1820
0
      cost_total +=
1821
0
        (double) worker->wi_cost_limit_base / worker->wi_cost_delay;
1822
0
  }
1823
1824
  /* there are no cost limits -- nothing to do */
1825
0
  if (cost_total <= 0)
1826
0
    return;
1827
1828
  /*
1829
   * Adjust cost limit of each active worker to balance the total of cost
1830
   * limit to autovacuum_vacuum_cost_limit.
1831
   */
1832
0
  cost_avail = (double) vac_cost_limit / vac_cost_delay;
1833
0
  dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1834
0
  {
1835
0
    WorkerInfo  worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1836
1837
0
    if (worker->wi_proc != NULL &&
1838
0
      worker->wi_dobalance &&
1839
0
      worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
1840
0
    {
1841
0
      int     limit = (int)
1842
0
      (cost_avail * worker->wi_cost_limit_base / cost_total);
1843
1844
      /*
1845
       * We put a lower bound of 1 on the cost_limit, to avoid division-
1846
       * by-zero in the vacuum code.  Also, in case of roundoff trouble
1847
       * in these calculations, let's be sure we don't ever set
1848
       * cost_limit to more than the base value.
1849
       */
1850
0
      worker->wi_cost_limit = Max(Min(limit,
1851
0
                      worker->wi_cost_limit_base),
1852
0
                    1);
1853
0
    }
1854
1855
0
    if (worker->wi_proc != NULL)
1856
0
      elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, dobalance=%s cost_limit=%d, cost_limit_base=%d, cost_delay=%d)",
1857
0
         worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
1858
0
         worker->wi_dobalance ? "yes" : "no",
1859
0
         worker->wi_cost_limit, worker->wi_cost_limit_base,
1860
0
         worker->wi_cost_delay);
1861
0
  }
1862
0
}
1863
1864
/*
1865
 * get_database_list
1866
 *    Return a list of all databases found in pg_database.
1867
 *
1868
 * The list and associated data is allocated in the caller's memory context,
1869
 * which is in charge of ensuring that it's properly cleaned up afterwards.
1870
 *
1871
 * Note: this is the only function in which the autovacuum launcher uses a
1872
 * transaction.  Although we aren't attached to any particular database and
1873
 * therefore can't access most catalogs, we do have enough infrastructure
1874
 * to do a seqscan on pg_database.
1875
 */
1876
static List *
1877
get_database_list(void)
1878
0
{
1879
0
  List     *dblist = NIL;
1880
0
  Relation  rel;
1881
0
  HeapScanDesc scan;
1882
0
  HeapTuple tup;
1883
0
  MemoryContext resultcxt;
1884
1885
  /* This is the context that we will allocate our output data in */
1886
0
  resultcxt = GetCurrentMemoryContext();
1887
1888
  /*
1889
   * Start a transaction so we can access pg_database, and get a snapshot.
1890
   * We don't have a use for the snapshot itself, but we're interested in
1891
   * the secondary effect that it sets RecentGlobalXmin.  (This is critical
1892
   * for anything that reads heap pages, because HOT may decide to prune
1893
   * them even if the process doesn't attempt to modify any tuples.)
1894
   */
1895
0
  StartTransactionCommand();
1896
0
  (void) GetTransactionSnapshot();
1897
1898
0
  rel = heap_open(DatabaseRelationId, AccessShareLock);
1899
0
  scan = heap_beginscan_catalog(rel, 0, NULL);
1900
1901
0
  while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
1902
0
  {
1903
0
    Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup);
1904
0
    avw_dbase  *avdb;
1905
0
    MemoryContext oldcxt;
1906
1907
    /*
1908
     * Allocate our results in the caller's context, not the
1909
     * transaction's. We do this inside the loop, and restore the original
1910
     * context at the end, so that leaky things like heap_getnext() are
1911
     * not called in a potentially long-lived context.
1912
     */
1913
0
    oldcxt = MemoryContextSwitchTo(resultcxt);
1914
1915
0
    avdb = (avw_dbase *) palloc(sizeof(avw_dbase));
1916
1917
0
    avdb->adw_datid = HeapTupleGetOid(tup);
1918
0
    avdb->adw_name = pstrdup(NameStr(pgdatabase->datname));
1919
0
    avdb->adw_frozenxid = pgdatabase->datfrozenxid;
1920
0
    avdb->adw_minmulti = pgdatabase->datminmxid;
1921
    /* this gets set later: */
1922
0
    avdb->adw_entry = NULL;
1923
1924
0
    dblist = lappend(dblist, avdb);
1925
0
    MemoryContextSwitchTo(oldcxt);
1926
0
  }
1927
1928
0
  heap_endscan(scan);
1929
0
  heap_close(rel, AccessShareLock);
1930
1931
0
  CommitTransactionCommand();
1932
1933
0
  return dblist;
1934
0
}
1935
1936
/*
1937
 * Process a database table-by-table
1938
 *
1939
 * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in
1940
 * order not to ignore shutdown commands for too long.
1941
 */
1942
static void
1943
do_autovacuum(void)
1944
0
{
1945
0
  Relation  classRel;
1946
0
  HeapTuple tuple;
1947
0
  HeapScanDesc relScan;
1948
0
  Form_pg_database dbForm;
1949
0
  List     *table_oids = NIL;
1950
0
  List     *orphan_oids = NIL;
1951
0
  HASHCTL   ctl;
1952
0
  HTAB     *table_toast_map;
1953
0
  ListCell   *volatile cell;
1954
0
  PgStat_StatDBEntry *shared;
1955
0
  PgStat_StatDBEntry *dbentry;
1956
0
  BufferAccessStrategy bstrategy;
1957
0
  ScanKeyData key;
1958
0
  TupleDesc pg_class_desc;
1959
0
  int     effective_multixact_freeze_max_age;
1960
0
  bool    did_vacuum = false;
1961
0
  bool    found_concurrent_worker = false;
1962
0
  int     i;
1963
1964
  /*
1965
   * StartTransactionCommand and CommitTransactionCommand will automatically
1966
   * switch to other contexts.  We need this one to keep the list of
1967
   * relations to vacuum/analyze across transactions.
1968
   */
1969
0
  AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
1970
0
                      "AV worker",
1971
0
                      ALLOCSET_DEFAULT_SIZES);
1972
0
  MemoryContextSwitchTo(AutovacMemCxt);
1973
1974
  /*
1975
   * may be NULL if we couldn't find an entry (only happens if we are
1976
   * forcing a vacuum for anti-wrap purposes).
1977
   */
1978
0
  dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
1979
1980
  /* Start a transaction so our commands have one to play into. */
1981
0
  StartTransactionCommand();
1982
1983
  /*
1984
   * Clean up any dead statistics collector entries for this DB. We always
1985
   * want to do this exactly once per DB-processing cycle, even if we find
1986
   * nothing worth vacuuming in the database.
1987
   */
1988
0
  pgstat_vacuum_stat();
1989
1990
  /*
1991
   * Compute the multixact age for which freezing is urgent.  This is
1992
   * normally autovacuum_multixact_freeze_max_age, but may be less if we are
1993
   * short of multixact member space.
1994
   */
1995
0
  effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold();
1996
1997
  /*
1998
   * Find the pg_database entry and select the default freeze ages. We use
1999
   * zero in template and nonconnectable databases, else the system-wide
2000
   * default.
2001
   */
2002
0
  tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
2003
0
  if (!HeapTupleIsValid(tuple))
2004
0
    elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
2005
0
  dbForm = (Form_pg_database) GETSTRUCT(tuple);
2006
2007
0
  if (dbForm->datistemplate || !dbForm->datallowconn)
2008
0
  {
2009
0
    default_freeze_min_age = 0;
2010
0
    default_freeze_table_age = 0;
2011
0
    default_multixact_freeze_min_age = 0;
2012
0
    default_multixact_freeze_table_age = 0;
2013
0
  }
2014
0
  else
2015
0
  {
2016
0
    default_freeze_min_age = vacuum_freeze_min_age;
2017
0
    default_freeze_table_age = vacuum_freeze_table_age;
2018
0
    default_multixact_freeze_min_age = vacuum_multixact_freeze_min_age;
2019
0
    default_multixact_freeze_table_age = vacuum_multixact_freeze_table_age;
2020
0
  }
2021
2022
0
  ReleaseSysCache(tuple);
2023
2024
  /* StartTransactionCommand changed elsewhere */
2025
0
  MemoryContextSwitchTo(AutovacMemCxt);
2026
2027
  /* The database hash where pgstat keeps shared relations */
2028
0
  shared = pgstat_fetch_stat_dbentry(InvalidOid);
2029
2030
0
  classRel = heap_open(RelationRelationId, AccessShareLock);
2031
2032
  /* create a copy so we can use it after closing pg_class */
2033
0
  pg_class_desc = CreateTupleDescCopy(RelationGetDescr(classRel));
2034
2035
  /* create hash table for toast <-> main relid mapping */
2036
0
  MemSet(&ctl, 0, sizeof(ctl));
2037
0
  ctl.keysize = sizeof(Oid);
2038
0
  ctl.entrysize = sizeof(av_relation);
2039
2040
0
  table_toast_map = hash_create("TOAST to main relid map",
2041
0
                  100,
2042
0
                  &ctl,
2043
0
                  HASH_ELEM | HASH_BLOBS);
2044
2045
  /*
2046
   * Scan pg_class to determine which tables to vacuum.
2047
   *
2048
   * We do this in two passes: on the first one we collect the list of plain
2049
   * relations and materialized views, and on the second one we collect
2050
   * TOAST tables. The reason for doing the second pass is that during it we
2051
   * want to use the main relation's pg_class.reloptions entry if the TOAST
2052
   * table does not have any, and we cannot obtain it unless we know
2053
   * beforehand what's the main table OID.
2054
   *
2055
   * We need to check TOAST tables separately because in cases with short,
2056
   * wide tables there might be proportionally much more activity in the
2057
   * TOAST table than in its parent.
2058
   */
2059
0
  relScan = heap_beginscan_catalog(classRel, 0, NULL);
2060
2061
  /*
2062
   * On the first pass, we collect main tables to vacuum, and also the main
2063
   * table relid to TOAST relid mapping.
2064
   */
2065
0
  while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2066
0
  {
2067
0
    Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2068
0
    PgStat_StatTabEntry *tabentry;
2069
0
    AutoVacOpts *relopts;
2070
0
    Oid     relid;
2071
0
    bool    dovacuum;
2072
0
    bool    doanalyze;
2073
0
    bool    wraparound;
2074
2075
0
    if (classForm->relkind != RELKIND_RELATION &&
2076
0
      classForm->relkind != RELKIND_MATVIEW)
2077
0
      continue;
2078
2079
0
    relid = HeapTupleGetOid(tuple);
2080
2081
    /*
2082
     * Check if it is a temp table (presumably, of some other backend's).
2083
     * We cannot safely process other backends' temp tables.
2084
     */
2085
0
    if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2086
0
    {
2087
      /*
2088
       * We just ignore it if the owning backend is still active and
2089
       * using the temporary schema.
2090
       */
2091
0
      if (!isTempNamespaceInUse(classForm->relnamespace))
2092
0
      {
2093
        /*
2094
         * The table seems to be orphaned -- although it might be that
2095
         * the owning backend has already deleted it and exited; our
2096
         * pg_class scan snapshot is not necessarily up-to-date
2097
         * anymore, so we could be looking at a committed-dead entry.
2098
         * Remember it so we can try to delete it later.
2099
         */
2100
0
        orphan_oids = lappend_oid(orphan_oids, relid);
2101
0
      }
2102
0
      continue;
2103
0
    }
2104
2105
    /* Fetch reloptions and the pgstat entry for this table */
2106
0
    relopts = extract_autovac_opts(tuple, pg_class_desc);
2107
0
    tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2108
0
                       shared, dbentry);
2109
2110
    /* Check if it needs vacuum or analyze */
2111
0
    relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2112
0
                  effective_multixact_freeze_max_age,
2113
0
                  &dovacuum, &doanalyze, &wraparound);
2114
2115
    /* Relations that need work are added to table_oids */
2116
0
    if (dovacuum || doanalyze)
2117
0
      table_oids = lappend_oid(table_oids, relid);
2118
2119
    /*
2120
     * Remember TOAST associations for the second pass.  Note: we must do
2121
     * this whether or not the table is going to be vacuumed, because we
2122
     * don't automatically vacuum toast tables along the parent table.
2123
     */
2124
0
    if (OidIsValid(classForm->reltoastrelid))
2125
0
    {
2126
0
      av_relation *hentry;
2127
0
      bool    found;
2128
2129
0
      hentry = hash_search(table_toast_map,
2130
0
                 &classForm->reltoastrelid,
2131
0
                 HASH_ENTER, &found);
2132
2133
0
      if (!found)
2134
0
      {
2135
        /* hash_search already filled in the key */
2136
0
        hentry->ar_relid = relid;
2137
0
        hentry->ar_hasrelopts = false;
2138
0
        if (relopts != NULL)
2139
0
        {
2140
0
          hentry->ar_hasrelopts = true;
2141
0
          memcpy(&hentry->ar_reloptions, relopts,
2142
0
               sizeof(AutoVacOpts));
2143
0
        }
2144
0
      }
2145
0
    }
2146
0
  }
2147
2148
0
  heap_endscan(relScan);
2149
2150
  /* second pass: check TOAST tables */
2151
0
  ScanKeyInit(&key,
2152
0
        Anum_pg_class_relkind,
2153
0
        BTEqualStrategyNumber, F_CHAREQ,
2154
0
        CharGetDatum(RELKIND_TOASTVALUE));
2155
2156
0
  relScan = heap_beginscan_catalog(classRel, 1, &key);
2157
0
  while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2158
0
  {
2159
0
    Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2160
0
    PgStat_StatTabEntry *tabentry;
2161
0
    Oid     relid;
2162
0
    AutoVacOpts *relopts = NULL;
2163
0
    bool    dovacuum;
2164
0
    bool    doanalyze;
2165
0
    bool    wraparound;
2166
2167
    /*
2168
     * We cannot safely process other backends' temp tables, so skip 'em.
2169
     */
2170
0
    if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2171
0
      continue;
2172
2173
0
    relid = HeapTupleGetOid(tuple);
2174
2175
    /*
2176
     * fetch reloptions -- if this toast table does not have them, try the
2177
     * main rel
2178
     */
2179
0
    relopts = extract_autovac_opts(tuple, pg_class_desc);
2180
0
    if (relopts == NULL)
2181
0
    {
2182
0
      av_relation *hentry;
2183
0
      bool    found;
2184
2185
0
      hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2186
0
      if (found && hentry->ar_hasrelopts)
2187
0
        relopts = &hentry->ar_reloptions;
2188
0
    }
2189
2190
    /* Fetch the pgstat entry for this table */
2191
0
    tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2192
0
                       shared, dbentry);
2193
2194
0
    relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2195
0
                  effective_multixact_freeze_max_age,
2196
0
                  &dovacuum, &doanalyze, &wraparound);
2197
2198
    /* ignore analyze for toast tables */
2199
0
    if (dovacuum)
2200
0
      table_oids = lappend_oid(table_oids, relid);
2201
0
  }
2202
2203
0
  heap_endscan(relScan);
2204
0
  heap_close(classRel, AccessShareLock);
2205
2206
  /*
2207
   * Recheck orphan temporary tables, and if they still seem orphaned, drop
2208
   * them.  We'll eat a transaction per dropped table, which might seem
2209
   * excessive, but we should only need to do anything as a result of a
2210
   * previous backend crash, so this should not happen often enough to
2211
   * justify "optimizing".  Using separate transactions ensures that we
2212
   * don't bloat the lock table if there are many temp tables to be dropped,
2213
   * and it ensures that we don't lose work if a deletion attempt fails.
2214
   */
2215
0
  foreach(cell, orphan_oids)
2216
0
  {
2217
0
    Oid     relid = lfirst_oid(cell);
2218
0
    Form_pg_class classForm;
2219
0
    ObjectAddress object;
2220
2221
    /*
2222
     * Check for user-requested abort.
2223
     */
2224
0
    CHECK_FOR_INTERRUPTS();
2225
2226
    /*
2227
     * Try to lock the table.  If we can't get the lock immediately,
2228
     * somebody else is using (or dropping) the table, so it's not our
2229
     * concern anymore.  Having the lock prevents race conditions below.
2230
     */
2231
0
    if (!ConditionalLockRelationOid(relid, AccessExclusiveLock))
2232
0
      continue;
2233
2234
    /*
2235
     * Re-fetch the pg_class tuple and re-check whether it still seems to
2236
     * be an orphaned temp table.  If it's not there or no longer the same
2237
     * relation, ignore it.
2238
     */
2239
0
    tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2240
0
    if (!HeapTupleIsValid(tuple))
2241
0
    {
2242
      /* be sure to drop useless lock so we don't bloat lock table */
2243
0
      UnlockRelationOid(relid, AccessExclusiveLock);
2244
0
      continue;
2245
0
    }
2246
0
    classForm = (Form_pg_class) GETSTRUCT(tuple);
2247
2248
    /*
2249
     * Make all the same tests made in the loop above.  In event of OID
2250
     * counter wraparound, the pg_class entry we have now might be
2251
     * completely unrelated to the one we saw before.
2252
     */
2253
0
    if (!((classForm->relkind == RELKIND_RELATION ||
2254
0
         classForm->relkind == RELKIND_MATVIEW) &&
2255
0
        classForm->relpersistence == RELPERSISTENCE_TEMP))
2256
0
    {
2257
0
      UnlockRelationOid(relid, AccessExclusiveLock);
2258
0
      continue;
2259
0
    }
2260
2261
0
    if (isTempNamespaceInUse(classForm->relnamespace))
2262
0
    {
2263
0
      UnlockRelationOid(relid, AccessExclusiveLock);
2264
0
      continue;
2265
0
    }
2266
2267
    /* OK, let's delete it */
2268
0
    ereport(LOG,
2269
0
        (errmsg("autovacuum: dropping orphan temp table \"%s.%s.%s\"",
2270
0
            get_database_name(MyDatabaseId),
2271
0
            get_namespace_name(classForm->relnamespace),
2272
0
            NameStr(classForm->relname))));
2273
2274
0
    object.classId = RelationRelationId;
2275
0
    object.objectId = relid;
2276
0
    object.objectSubId = 0;
2277
0
    performDeletion(&object, DROP_CASCADE,
2278
0
            PERFORM_DELETION_INTERNAL |
2279
0
            PERFORM_DELETION_QUIETLY |
2280
0
            PERFORM_DELETION_SKIP_EXTENSIONS);
2281
2282
    /*
2283
     * To commit the deletion, end current transaction and start a new
2284
     * one.  Note this also releases the lock we took.
2285
     */
2286
0
    CommitTransactionCommand();
2287
0
    StartTransactionCommand();
2288
2289
    /* StartTransactionCommand changed current memory context */
2290
0
    MemoryContextSwitchTo(AutovacMemCxt);
2291
0
  }
2292
2293
  /*
2294
   * Create a buffer access strategy object for VACUUM to use.  We want to
2295
   * use the same one across all the vacuum operations we perform, since the
2296
   * point is for VACUUM not to blow out the shared cache.
2297
   */
2298
0
  bstrategy = GetAccessStrategy(BAS_VACUUM);
2299
2300
  /*
2301
   * create a memory context to act as fake PortalContext, so that the
2302
   * contexts created in the vacuum code are cleaned up for each table.
2303
   */
2304
0
  PortalContext = AllocSetContextCreate(AutovacMemCxt,
2305
0
                      "Autovacuum Portal",
2306
0
                      ALLOCSET_DEFAULT_SIZES);
2307
2308
  /*
2309
   * Perform operations on collected tables.
2310
   */
2311
0
  foreach(cell, table_oids)
2312
0
  {
2313
0
    Oid     relid = lfirst_oid(cell);
2314
0
    HeapTuple classTup;
2315
0
    autovac_table *tab;
2316
0
    bool    isshared;
2317
0
    bool    skipit;
2318
0
    int     stdVacuumCostDelay;
2319
0
    int     stdVacuumCostLimit;
2320
0
    dlist_iter  iter;
2321
2322
0
    CHECK_FOR_INTERRUPTS();
2323
2324
    /*
2325
     * Check for config changes before processing each collected table.
2326
     */
2327
0
    if (got_SIGHUP)
2328
0
    {
2329
0
      got_SIGHUP = false;
2330
0
      ProcessConfigFile(PGC_SIGHUP);
2331
2332
      /*
2333
       * You might be tempted to bail out if we see autovacuum is now
2334
       * disabled.  Must resist that temptation -- this might be a
2335
       * for-wraparound emergency worker, in which case that would be
2336
       * entirely inappropriate.
2337
       */
2338
0
    }
2339
2340
    /*
2341
     * Find out whether the table is shared or not.  (It's slightly
2342
     * annoying to fetch the syscache entry just for this, but in typical
2343
     * cases it adds little cost because table_recheck_autovac would
2344
     * refetch the entry anyway.  We could buy that back by copying the
2345
     * tuple here and passing it to table_recheck_autovac, but that
2346
     * increases the odds of that function working with stale data.)
2347
     */
2348
0
    classTup = SearchSysCache1(RELOID, ObjectIdGetDatum(relid));
2349
0
    if (!HeapTupleIsValid(classTup))
2350
0
      continue;     /* somebody deleted the rel, forget it */
2351
0
    isshared = ((Form_pg_class) GETSTRUCT(classTup))->relisshared;
2352
0
    ReleaseSysCache(classTup);
2353
2354
    /*
2355
     * Hold schedule lock from here until we've claimed the table.  We
2356
     * also need the AutovacuumLock to walk the worker array, but that one
2357
     * can just be a shared lock.
2358
     */
2359
0
    LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2360
0
    LWLockAcquire(AutovacuumLock, LW_SHARED);
2361
2362
    /*
2363
     * Check whether the table is being vacuumed concurrently by another
2364
     * worker.
2365
     */
2366
0
    skipit = false;
2367
0
    dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
2368
0
    {
2369
0
      WorkerInfo  worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
2370
2371
      /* ignore myself */
2372
0
      if (worker == MyWorkerInfo)
2373
0
        continue;
2374
2375
      /* ignore workers in other databases (unless table is shared) */
2376
0
      if (!worker->wi_sharedrel && worker->wi_dboid != MyDatabaseId)
2377
0
        continue;
2378
2379
0
      if (worker->wi_tableoid == relid)
2380
0
      {
2381
0
        skipit = true;
2382
0
        found_concurrent_worker = true;
2383
0
        break;
2384
0
      }
2385
0
    }
2386
0
    LWLockRelease(AutovacuumLock);
2387
0
    if (skipit)
2388
0
    {
2389
0
      LWLockRelease(AutovacuumScheduleLock);
2390
0
      continue;
2391
0
    }
2392
2393
    /*
2394
     * Store the table's OID in shared memory before releasing the
2395
     * schedule lock, so that other workers don't try to vacuum it
2396
     * concurrently.  (We claim it here so as not to hold
2397
     * AutovacuumScheduleLock while rechecking the stats.)
2398
     */
2399
0
    MyWorkerInfo->wi_tableoid = relid;
2400
0
    MyWorkerInfo->wi_sharedrel = isshared;
2401
0
    LWLockRelease(AutovacuumScheduleLock);
2402
2403
    /*
2404
     * Check whether pgstat data still says we need to vacuum this table.
2405
     * It could have changed if something else processed the table while
2406
     * we weren't looking.
2407
     *
2408
     * Note: we have a special case in pgstat code to ensure that the
2409
     * stats we read are as up-to-date as possible, to avoid the problem
2410
     * that somebody just finished vacuuming this table.  The window to
2411
     * the race condition is not closed but it is very small.
2412
     */
2413
0
    MemoryContextSwitchTo(AutovacMemCxt);
2414
0
    tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc,
2415
0
                  effective_multixact_freeze_max_age);
2416
0
    if (tab == NULL)
2417
0
    {
2418
      /* someone else vacuumed the table, or it went away */
2419
0
      LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2420
0
      MyWorkerInfo->wi_tableoid = InvalidOid;
2421
0
      MyWorkerInfo->wi_sharedrel = false;
2422
0
      LWLockRelease(AutovacuumScheduleLock);
2423
0
      continue;
2424
0
    }
2425
2426
    /*
2427
     * Remember the prevailing values of the vacuum cost GUCs.  We have to
2428
     * restore these at the bottom of the loop, else we'll compute wrong
2429
     * values in the next iteration of autovac_balance_cost().
2430
     */
2431
0
    stdVacuumCostDelay = VacuumCostDelay;
2432
0
    stdVacuumCostLimit = VacuumCostLimit;
2433
2434
    /* Must hold AutovacuumLock while mucking with cost balance info */
2435
0
    LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2436
2437
    /* advertise my cost delay parameters for the balancing algorithm */
2438
0
    MyWorkerInfo->wi_dobalance = tab->at_dobalance;
2439
0
    MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
2440
0
    MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
2441
0
    MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
2442
2443
    /* do a balance */
2444
0
    autovac_balance_cost();
2445
2446
    /* set the active cost parameters from the result of that */
2447
0
    AutoVacuumUpdateDelay();
2448
2449
    /* done */
2450
0
    LWLockRelease(AutovacuumLock);
2451
2452
    /* clean up memory before each iteration */
2453
0
    MemoryContextResetAndDeleteChildren(PortalContext);
2454
2455
    /*
2456
     * Save the relation name for a possible error message, to avoid a
2457
     * catalog lookup in case of an error.  If any of these return NULL,
2458
     * then the relation has been dropped since last we checked; skip it.
2459
     * Note: they must live in a long-lived memory context because we call
2460
     * vacuum and analyze in different transactions.
2461
     */
2462
2463
0
    tab->at_relname = get_rel_name(tab->at_relid);
2464
0
    tab->at_nspname = get_namespace_name(get_rel_namespace(tab->at_relid));
2465
0
    tab->at_datname = get_database_name(MyDatabaseId);
2466
0
    if (!tab->at_relname || !tab->at_nspname || !tab->at_datname)
2467
0
      goto deleted;
2468
2469
    /*
2470
     * We will abort vacuuming the current table if something errors out,
2471
     * and continue with the next one in schedule; in particular, this
2472
     * happens if we are interrupted with SIGINT.
2473
     */
2474
0
    PG_TRY();
2475
0
    {
2476
      /* Use PortalContext for any per-table allocations */
2477
0
      MemoryContextSwitchTo(PortalContext);
2478
2479
      /* have at it */
2480
0
      autovacuum_do_vac_analyze(tab, bstrategy);
2481
2482
      /*
2483
       * Clear a possible query-cancel signal, to avoid a late reaction
2484
       * to an automatically-sent signal because of vacuuming the
2485
       * current table (we're done with it, so it would make no sense to
2486
       * cancel at this point.)
2487
       */
2488
0
      QueryCancelPending = false;
2489
0
    }
2490
0
    PG_CATCH();
2491
0
    {
2492
      /*
2493
       * Abort the transaction, start a new one, and proceed with the
2494
       * next table in our list.
2495
       */
2496
0
      HOLD_INTERRUPTS();
2497
0
      if (tab->at_vacoptions & VACOPT_VACUUM)
2498
0
        errcontext("automatic vacuum of table \"%s.%s.%s\"",
2499
0
               tab->at_datname, tab->at_nspname, tab->at_relname);
2500
0
      else
2501
0
        errcontext("automatic analyze of table \"%s.%s.%s\"",
2502
0
               tab->at_datname, tab->at_nspname, tab->at_relname);
2503
0
      EmitErrorReport();
2504
2505
      /* this resets the PGXACT flags too */
2506
0
      AbortOutOfAnyTransaction();
2507
0
      FlushErrorState();
2508
0
      MemoryContextResetAndDeleteChildren(PortalContext);
2509
2510
      /* restart our transaction for the following operations */
2511
0
      StartTransactionCommand();
2512
0
      RESUME_INTERRUPTS();
2513
0
    }
2514
0
    PG_END_TRY();
2515
2516
    /* Make sure we're back in AutovacMemCxt */
2517
0
    MemoryContextSwitchTo(AutovacMemCxt);
2518
2519
0
    did_vacuum = true;
2520
2521
    /* the PGXACT flags are reset at the next end of transaction */
2522
2523
    /* be tidy */
2524
0
deleted:
2525
0
    if (tab->at_datname != NULL)
2526
0
      pfree(tab->at_datname);
2527
0
    if (tab->at_nspname != NULL)
2528
0
      pfree(tab->at_nspname);
2529
0
    if (tab->at_relname != NULL)
2530
0
      pfree(tab->at_relname);
2531
0
    pfree(tab);
2532
2533
    /*
2534
     * Remove my info from shared memory.  We could, but intentionally
2535
     * don't, clear wi_cost_limit and friends --- this is on the
2536
     * assumption that we probably have more to do with similar cost
2537
     * settings, so we don't want to give up our share of I/O for a very
2538
     * short interval and thereby thrash the global balance.
2539
     */
2540
0
    LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2541
0
    MyWorkerInfo->wi_tableoid = InvalidOid;
2542
0
    MyWorkerInfo->wi_sharedrel = false;
2543
0
    LWLockRelease(AutovacuumScheduleLock);
2544
2545
    /* restore vacuum cost GUCs for the next iteration */
2546
0
    VacuumCostDelay = stdVacuumCostDelay;
2547
0
    VacuumCostLimit = stdVacuumCostLimit;
2548
0
  }
2549
2550
  /*
2551
   * Perform additional work items, as requested by backends.
2552
   */
2553
0
  LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2554
0
  for (i = 0; i < NUM_WORKITEMS; i++)
2555
0
  {
2556
0
    AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i];
2557
2558
0
    if (!workitem->avw_used)
2559
0
      continue;
2560
0
    if (workitem->avw_active)
2561
0
      continue;
2562
0
    if (workitem->avw_database != MyDatabaseId)
2563
0
      continue;
2564
2565
    /* claim this one, and release lock while performing it */
2566
0
    workitem->avw_active = true;
2567
0
    LWLockRelease(AutovacuumLock);
2568
2569
0
    perform_work_item(workitem);
2570
2571
    /*
2572
     * Check for config changes before acquiring lock for further jobs.
2573
     */
2574
0
    CHECK_FOR_INTERRUPTS();
2575
0
    if (got_SIGHUP)
2576
0
    {
2577
0
      got_SIGHUP = false;
2578
0
      ProcessConfigFile(PGC_SIGHUP);
2579
0
    }
2580
2581
0
    LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2582
2583
    /* and mark it done */
2584
0
    workitem->avw_active = false;
2585
0
    workitem->avw_used = false;
2586
0
  }
2587
0
  LWLockRelease(AutovacuumLock);
2588
2589
  /*
2590
   * We leak table_toast_map here (among other things), but since we're
2591
   * going away soon, it's not a problem.
2592
   */
2593
2594
  /*
2595
   * Update pg_database.datfrozenxid, and truncate pg_xact if possible. We
2596
   * only need to do this once, not after each table.
2597
   *
2598
   * Even if we didn't vacuum anything, it may still be important to do
2599
   * this, because one indirect effect of vac_update_datfrozenxid() is to
2600
   * update ShmemVariableCache->xidVacLimit.  That might need to be done
2601
   * even if we haven't vacuumed anything, because relations with older
2602
   * relfrozenxid values or other databases with older datfrozenxid values
2603
   * might have been dropped, allowing xidVacLimit to advance.
2604
   *
2605
   * However, it's also important not to do this blindly in all cases,
2606
   * because when autovacuum=off this will restart the autovacuum launcher.
2607
   * If we're not careful, an infinite loop can result, where workers find
2608
   * no work to do and restart the launcher, which starts another worker in
2609
   * the same database that finds no work to do.  To prevent that, we skip
2610
   * this if (1) we found no work to do and (2) we skipped at least one
2611
   * table due to concurrent autovacuum activity.  In that case, the other
2612
   * worker has already done it, or will do so when it finishes.
2613
   */
2614
0
  if (did_vacuum || !found_concurrent_worker)
2615
0
    vac_update_datfrozenxid();
2616
2617
  /* Finally close out the last transaction. */
2618
0
  CommitTransactionCommand();
2619
0
}
2620
2621
/*
2622
 * Execute a previously registered work item.
2623
 */
2624
static void
2625
perform_work_item(AutoVacuumWorkItem *workitem)
2626
0
{
2627
0
  char     *cur_datname = NULL;
2628
0
  char     *cur_nspname = NULL;
2629
0
  char     *cur_relname = NULL;
2630
2631
  /*
2632
   * Note we do not store table info in MyWorkerInfo, since this is not
2633
   * vacuuming proper.
2634
   */
2635
2636
  /*
2637
   * Save the relation name for a possible error message, to avoid a catalog
2638
   * lookup in case of an error.  If any of these return NULL, then the
2639
   * relation has been dropped since last we checked; skip it.
2640
   */
2641
0
  Assert(GetCurrentMemoryContext() == AutovacMemCxt);
2642
2643
0
  cur_relname = get_rel_name(workitem->avw_relation);
2644
0
  cur_nspname = get_namespace_name(get_rel_namespace(workitem->avw_relation));
2645
0
  cur_datname = get_database_name(MyDatabaseId);
2646
0
  if (!cur_relname || !cur_nspname || !cur_datname)
2647
0
    goto deleted2;
2648
2649
0
  autovac_report_workitem(workitem, cur_nspname, cur_datname);
2650
2651
  /* clean up memory before each work item */
2652
0
  MemoryContextResetAndDeleteChildren(PortalContext);
2653
2654
  /*
2655
   * We will abort the current work item if something errors out, and
2656
   * continue with the next one; in particular, this happens if we are
2657
   * interrupted with SIGINT.  Note that this means that the work item list
2658
   * can be lossy.
2659
   */
2660
0
  PG_TRY();
2661
0
  {
2662
    /* Use PortalContext for any per-work-item allocations */
2663
0
    MemoryContextSwitchTo(PortalContext);
2664
2665
    /* have at it */
2666
0
    switch (workitem->avw_type)
2667
0
    {
2668
0
      case AVW_BRINSummarizeRange:
2669
0
        DirectFunctionCall2(brin_summarize_range,
2670
0
                  ObjectIdGetDatum(workitem->avw_relation),
2671
0
                  Int64GetDatum((int64) workitem->avw_blockNumber));
2672
0
        break;
2673
0
      default:
2674
0
        elog(WARNING, "unrecognized work item found: type %d",
2675
0
           workitem->avw_type);
2676
0
        break;
2677
0
    }
2678
2679
    /*
2680
     * Clear a possible query-cancel signal, to avoid a late reaction to
2681
     * an automatically-sent signal because of vacuuming the current table
2682
     * (we're done with it, so it would make no sense to cancel at this
2683
     * point.)
2684
     */
2685
0
    QueryCancelPending = false;
2686
0
  }
2687
0
  PG_CATCH();
2688
0
  {
2689
    /*
2690
     * Abort the transaction, start a new one, and proceed with the next
2691
     * table in our list.
2692
     */
2693
0
    HOLD_INTERRUPTS();
2694
0
    errcontext("processing work entry for relation \"%s.%s.%s\"",
2695
0
           cur_datname, cur_nspname, cur_relname);
2696
0
    EmitErrorReport();
2697
2698
    /* this resets the PGXACT flags too */
2699
0
    AbortOutOfAnyTransaction();
2700
0
    FlushErrorState();
2701
0
    MemoryContextResetAndDeleteChildren(PortalContext);
2702
2703
    /* restart our transaction for the following operations */
2704
0
    StartTransactionCommand();
2705
0
    RESUME_INTERRUPTS();
2706
0
  }
2707
0
  PG_END_TRY();
2708
2709
  /* Make sure we're back in AutovacMemCxt */
2710
0
  MemoryContextSwitchTo(AutovacMemCxt);
2711
2712
  /* We intentionally do not set did_vacuum here */
2713
2714
  /* be tidy */
2715
0
deleted2:
2716
0
  if (cur_datname)
2717
0
    pfree(cur_datname);
2718
0
  if (cur_nspname)
2719
0
    pfree(cur_nspname);
2720
0
  if (cur_relname)
2721
0
    pfree(cur_relname);
2722
0
}
2723
2724
/*
2725
 * extract_autovac_opts
2726
 *
2727
 * Given a relation's pg_class tuple, return the AutoVacOpts portion of
2728
 * reloptions, if set; otherwise, return NULL.
2729
 */
2730
static AutoVacOpts *
2731
extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc)
2732
0
{
2733
0
  bytea    *relopts;
2734
0
  AutoVacOpts *av;
2735
2736
0
  Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION ||
2737
0
       ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW ||
2738
0
       ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE);
2739
2740
0
  relopts = extractRelOptions(tup, pg_class_desc, NULL);
2741
0
  if (relopts == NULL)
2742
0
    return NULL;
2743
2744
0
  av = palloc(sizeof(AutoVacOpts));
2745
0
  memcpy(av, &(((StdRdOptions *) relopts)->autovacuum), sizeof(AutoVacOpts));
2746
0
  pfree(relopts);
2747
2748
0
  return av;
2749
0
}
2750
2751
/*
2752
 * get_pgstat_tabentry_relid
2753
 *
2754
 * Fetch the pgstat entry of a table, either local to a database or shared.
2755
 */
2756
static PgStat_StatTabEntry *
2757
get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
2758
              PgStat_StatDBEntry *dbentry)
2759
0
{
2760
0
  PgStat_StatTabEntry *tabentry = NULL;
2761
2762
0
  if (isshared)
2763
0
  {
2764
0
    if (PointerIsValid(shared))
2765
0
      tabentry = hash_search(shared->tables, &relid,
2766
0
                   HASH_FIND, NULL);
2767
0
  }
2768
0
  else if (PointerIsValid(dbentry))
2769
0
    tabentry = hash_search(dbentry->tables, &relid,
2770
0
                 HASH_FIND, NULL);
2771
2772
0
  return tabentry;
2773
0
}
2774
2775
/*
2776
 * table_recheck_autovac
2777
 *
2778
 * Recheck whether a table still needs vacuum or analyze.  Return value is a
2779
 * valid autovac_table pointer if it does, NULL otherwise.
2780
 *
2781
 * Note that the returned autovac_table does not have the name fields set.
2782
 */
2783
static autovac_table *
2784
table_recheck_autovac(Oid relid, HTAB *table_toast_map,
2785
            TupleDesc pg_class_desc,
2786
            int effective_multixact_freeze_max_age)
2787
0
{
2788
0
  Form_pg_class classForm;
2789
0
  HeapTuple classTup;
2790
0
  bool    dovacuum;
2791
0
  bool    doanalyze;
2792
0
  autovac_table *tab = NULL;
2793
0
  PgStat_StatTabEntry *tabentry;
2794
0
  PgStat_StatDBEntry *shared;
2795
0
  PgStat_StatDBEntry *dbentry;
2796
0
  bool    wraparound;
2797
0
  AutoVacOpts *avopts;
2798
2799
  /* use fresh stats */
2800
0
  autovac_refresh_stats();
2801
2802
0
  shared = pgstat_fetch_stat_dbentry(InvalidOid);
2803
0
  dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
2804
2805
  /* fetch the relation's relcache entry */
2806
0
  classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2807
0
  if (!HeapTupleIsValid(classTup))
2808
0
    return NULL;
2809
0
  classForm = (Form_pg_class) GETSTRUCT(classTup);
2810
2811
  /*
2812
   * Get the applicable reloptions.  If it is a TOAST table, try to get the
2813
   * main table reloptions if the toast table itself doesn't have.
2814
   */
2815
0
  avopts = extract_autovac_opts(classTup, pg_class_desc);
2816
0
  if (classForm->relkind == RELKIND_TOASTVALUE &&
2817
0
    avopts == NULL && table_toast_map != NULL)
2818
0
  {
2819
0
    av_relation *hentry;
2820
0
    bool    found;
2821
2822
0
    hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2823
0
    if (found && hentry->ar_hasrelopts)
2824
0
      avopts = &hentry->ar_reloptions;
2825
0
  }
2826
2827
  /* fetch the pgstat table entry */
2828
0
  tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2829
0
                     shared, dbentry);
2830
2831
0
  relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
2832
0
                effective_multixact_freeze_max_age,
2833
0
                &dovacuum, &doanalyze, &wraparound);
2834
2835
  /* ignore ANALYZE for toast tables */
2836
0
  if (classForm->relkind == RELKIND_TOASTVALUE)
2837
0
    doanalyze = false;
2838
2839
  /* OK, it needs something done */
2840
0
  if (doanalyze || dovacuum)
2841
0
  {
2842
0
    int     freeze_min_age;
2843
0
    int     freeze_table_age;
2844
0
    int     multixact_freeze_min_age;
2845
0
    int     multixact_freeze_table_age;
2846
0
    int     vac_cost_limit;
2847
0
    int     vac_cost_delay;
2848
0
    int     log_min_duration;
2849
2850
    /*
2851
     * Calculate the vacuum cost parameters and the freeze ages.  If there
2852
     * are options set in pg_class.reloptions, use them; in the case of a
2853
     * toast table, try the main table too.  Otherwise use the GUC
2854
     * defaults, autovacuum's own first and plain vacuum second.
2855
     */
2856
2857
    /* -1 in autovac setting means use plain vacuum_cost_delay */
2858
0
    vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= 0)
2859
0
      ? avopts->vacuum_cost_delay
2860
0
      : (autovacuum_vac_cost_delay >= 0)
2861
0
      ? autovacuum_vac_cost_delay
2862
0
      : VacuumCostDelay;
2863
2864
    /* 0 or -1 in autovac setting means use plain vacuum_cost_limit */
2865
0
    vac_cost_limit = (avopts && avopts->vacuum_cost_limit > 0)
2866
0
      ? avopts->vacuum_cost_limit
2867
0
      : (autovacuum_vac_cost_limit > 0)
2868
0
      ? autovacuum_vac_cost_limit
2869
0
      : VacuumCostLimit;
2870
2871
    /* -1 in autovac setting means use log_autovacuum_min_duration */
2872
0
    log_min_duration = (avopts && avopts->log_min_duration >= 0)
2873
0
      ? avopts->log_min_duration
2874
0
      : Log_autovacuum_min_duration;
2875
2876
    /* these do not have autovacuum-specific settings */
2877
0
    freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
2878
0
      ? avopts->freeze_min_age
2879
0
      : default_freeze_min_age;
2880
2881
0
    freeze_table_age = (avopts && avopts->freeze_table_age >= 0)
2882
0
      ? avopts->freeze_table_age
2883
0
      : default_freeze_table_age;
2884
2885
0
    multixact_freeze_min_age = (avopts &&
2886
0
                  avopts->multixact_freeze_min_age >= 0)
2887
0
      ? avopts->multixact_freeze_min_age
2888
0
      : default_multixact_freeze_min_age;
2889
2890
0
    multixact_freeze_table_age = (avopts &&
2891
0
                    avopts->multixact_freeze_table_age >= 0)
2892
0
      ? avopts->multixact_freeze_table_age
2893
0
      : default_multixact_freeze_table_age;
2894
2895
0
    tab = palloc(sizeof(autovac_table));
2896
0
    tab->at_relid = relid;
2897
0
    tab->at_sharedrel = classForm->relisshared;
2898
0
    tab->at_vacoptions = VACOPT_SKIPTOAST |
2899
0
      (dovacuum ? VACOPT_VACUUM : 0) |
2900
0
      (doanalyze ? VACOPT_ANALYZE : 0) |
2901
0
      (!wraparound ? VACOPT_NOWAIT : 0);
2902
0
    tab->at_params.freeze_min_age = freeze_min_age;
2903
0
    tab->at_params.freeze_table_age = freeze_table_age;
2904
0
    tab->at_params.multixact_freeze_min_age = multixact_freeze_min_age;
2905
0
    tab->at_params.multixact_freeze_table_age = multixact_freeze_table_age;
2906
0
    tab->at_params.is_wraparound = wraparound;
2907
0
    tab->at_params.log_min_duration = log_min_duration;
2908
0
    tab->at_vacuum_cost_limit = vac_cost_limit;
2909
0
    tab->at_vacuum_cost_delay = vac_cost_delay;
2910
0
    tab->at_relname = NULL;
2911
0
    tab->at_nspname = NULL;
2912
0
    tab->at_datname = NULL;
2913
2914
    /*
2915
     * If any of the cost delay parameters has been set individually for
2916
     * this table, disable the balancing algorithm.
2917
     */
2918
0
    tab->at_dobalance =
2919
0
      !(avopts && (avopts->vacuum_cost_limit > 0 ||
2920
0
             avopts->vacuum_cost_delay > 0));
2921
0
  }
2922
2923
0
  heap_freetuple(classTup);
2924
2925
0
  return tab;
2926
0
}
2927
2928
/*
2929
 * relation_needs_vacanalyze
2930
 *
2931
 * Check whether a relation needs to be vacuumed or analyzed; return each into
2932
 * "dovacuum" and "doanalyze", respectively.  Also return whether the vacuum is
2933
 * being forced because of Xid or multixact wraparound.
2934
 *
2935
 * relopts is a pointer to the AutoVacOpts options (either for itself in the
2936
 * case of a plain table, or for either itself or its parent table in the case
2937
 * of a TOAST table), NULL if none; tabentry is the pgstats entry, which can be
2938
 * NULL.
2939
 *
2940
 * A table needs to be vacuumed if the number of dead tuples exceeds a
2941
 * threshold.  This threshold is calculated as
2942
 *
2943
 * threshold = vac_base_thresh + vac_scale_factor * reltuples
2944
 *
2945
 * For analyze, the analysis done is that the number of tuples inserted,
2946
 * deleted and updated since the last analyze exceeds a threshold calculated
2947
 * in the same fashion as above.  Note that the collector actually stores
2948
 * the number of tuples (both live and dead) that there were as of the last
2949
 * analyze.  This is asymmetric to the VACUUM case.
2950
 *
2951
 * We also force vacuum if the table's relfrozenxid is more than freeze_max_age
2952
 * transactions back, and if its relminmxid is more than
2953
 * multixact_freeze_max_age multixacts back.
2954
 *
2955
 * A table whose autovacuum_enabled option is false is
2956
 * automatically skipped (unless we have to vacuum it due to freeze_max_age).
2957
 * Thus autovacuum can be disabled for specific tables. Also, when the stats
2958
 * collector does not have data about a table, it will be skipped.
2959
 *
2960
 * A table whose vac_base_thresh value is < 0 takes the base value from the
2961
 * autovacuum_vacuum_threshold GUC variable.  Similarly, a vac_scale_factor
2962
 * value < 0 is substituted with the value of
2963
 * autovacuum_vacuum_scale_factor GUC variable.  Ditto for analyze.
2964
 */
2965
static void
2966
relation_needs_vacanalyze(Oid relid,
2967
              AutoVacOpts *relopts,
2968
              Form_pg_class classForm,
2969
              PgStat_StatTabEntry *tabentry,
2970
              int effective_multixact_freeze_max_age,
2971
 /* output params below */
2972
              bool *dovacuum,
2973
              bool *doanalyze,
2974
              bool *wraparound)
2975
0
{
2976
0
  bool    force_vacuum;
2977
0
  bool    av_enabled;
2978
0
  float4    reltuples;    /* pg_class.reltuples */
2979
2980
  /* constants from reloptions or GUC variables */
2981
0
  int     vac_base_thresh,
2982
0
        anl_base_thresh;
2983
0
  float4    vac_scale_factor,
2984
0
        anl_scale_factor;
2985
2986
  /* thresholds calculated from above constants */
2987
0
  float4    vacthresh,
2988
0
        anlthresh;
2989
2990
  /* number of vacuum (resp. analyze) tuples at this time */
2991
0
  float4    vactuples,
2992
0
        anltuples;
2993
2994
  /* freeze parameters */
2995
0
  int     freeze_max_age;
2996
0
  int     multixact_freeze_max_age;
2997
0
  TransactionId xidForceLimit;
2998
0
  MultiXactId multiForceLimit;
2999
3000
0
  AssertArg(classForm != NULL);
3001
0
  AssertArg(OidIsValid(relid));
3002
3003
  /*
3004
   * Determine vacuum/analyze equation parameters.  We have two possible
3005
   * sources: the passed reloptions (which could be a main table or a toast
3006
   * table), or the autovacuum GUC variables.
3007
   */
3008
3009
  /* -1 in autovac setting means use plain vacuum_cost_delay */
3010
0
  vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= 0)
3011
0
    ? relopts->vacuum_scale_factor
3012
0
    : autovacuum_vac_scale;
3013
3014
0
  vac_base_thresh = (relopts && relopts->vacuum_threshold >= 0)
3015
0
    ? relopts->vacuum_threshold
3016
0
    : autovacuum_vac_thresh;
3017
3018
0
  anl_scale_factor = (relopts && relopts->analyze_scale_factor >= 0)
3019
0
    ? relopts->analyze_scale_factor
3020
0
    : autovacuum_anl_scale;
3021
3022
0
  anl_base_thresh = (relopts && relopts->analyze_threshold >= 0)
3023
0
    ? relopts->analyze_threshold
3024
0
    : autovacuum_anl_thresh;
3025
3026
0
  freeze_max_age = (relopts && relopts->freeze_max_age >= 0)
3027
0
    ? Min(relopts->freeze_max_age, autovacuum_freeze_max_age)
3028
0
    : autovacuum_freeze_max_age;
3029
3030
0
  multixact_freeze_max_age = (relopts && relopts->multixact_freeze_max_age >= 0)
3031
0
    ? Min(relopts->multixact_freeze_max_age, effective_multixact_freeze_max_age)
3032
0
    : effective_multixact_freeze_max_age;
3033
3034
0
  av_enabled = (relopts ? relopts->enabled : true);
3035
3036
  /* Force vacuum if table is at risk of wraparound */
3037
0
  xidForceLimit = recentXid - freeze_max_age;
3038
0
  if (xidForceLimit < FirstNormalTransactionId)
3039
0
    xidForceLimit -= FirstNormalTransactionId;
3040
0
  force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) &&
3041
0
          TransactionIdPrecedes(classForm->relfrozenxid,
3042
0
                      xidForceLimit));
3043
0
  if (!force_vacuum)
3044
0
  {
3045
0
    multiForceLimit = recentMulti - multixact_freeze_max_age;
3046
0
    if (multiForceLimit < FirstMultiXactId)
3047
0
      multiForceLimit -= FirstMultiXactId;
3048
0
    force_vacuum = MultiXactIdPrecedes(classForm->relminmxid,
3049
0
                       multiForceLimit);
3050
0
  }
3051
0
  *wraparound = force_vacuum;
3052
3053
  /* User disabled it in pg_class.reloptions?  (But ignore if at risk) */
3054
0
  if (!av_enabled && !force_vacuum)
3055
0
  {
3056
0
    *doanalyze = false;
3057
0
    *dovacuum = false;
3058
0
    return;
3059
0
  }
3060
3061
  /*
3062
   * If we found the table in the stats hash, and autovacuum is currently
3063
   * enabled, make a threshold-based decision whether to vacuum and/or
3064
   * analyze.  If autovacuum is currently disabled, we must be here for
3065
   * anti-wraparound vacuuming only, so don't vacuum (or analyze) anything
3066
   * that's not being forced.
3067
   */
3068
0
  if (PointerIsValid(tabentry) && AutoVacuumingActive())
3069
0
  {
3070
0
    reltuples = classForm->reltuples;
3071
0
    vactuples = tabentry->n_dead_tuples;
3072
0
    anltuples = tabentry->changes_since_analyze;
3073
3074
0
    vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples;
3075
0
    anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
3076
3077
    /*
3078
     * Note that we don't need to take special consideration for stat
3079
     * reset, because if that happens, the last vacuum and analyze counts
3080
     * will be reset too.
3081
     */
3082
0
    elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)",
3083
0
       NameStr(classForm->relname),
3084
0
       vactuples, vacthresh, anltuples, anlthresh);
3085
3086
    /* Determine if this table needs vacuum or analyze. */
3087
0
    *dovacuum = force_vacuum || (vactuples > vacthresh);
3088
0
    *doanalyze = (anltuples > anlthresh);
3089
0
  }
3090
0
  else
3091
0
  {
3092
    /*
3093
     * Skip a table not found in stat hash, unless we have to force vacuum
3094
     * for anti-wrap purposes.  If it's not acted upon, there's no need to
3095
     * vacuum it.
3096
     */
3097
0
    *dovacuum = force_vacuum;
3098
0
    *doanalyze = false;
3099
0
  }
3100
3101
  /* ANALYZE refuses to work with pg_statistic */
3102
0
  if (relid == StatisticRelationId)
3103
0
    *doanalyze = false;
3104
0
}
3105
3106
/*
3107
 * autovacuum_do_vac_analyze
3108
 *    Vacuum and/or analyze the specified table
3109
 */
3110
static void
3111
autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy)
3112
0
{
3113
0
  RangeVar   *rangevar;
3114
0
  VacuumRelation *rel;
3115
0
  List     *rel_list;
3116
3117
  /* Let pgstat know what we're doing */
3118
0
  autovac_report_activity(tab);
3119
3120
  /* Set up one VacuumRelation target, identified by OID, for vacuum() */
3121
0
  rangevar = makeRangeVar(tab->at_nspname, tab->at_relname, -1);
3122
0
  rel = makeVacuumRelation(rangevar, tab->at_relid, NIL);
3123
0
  rel_list = list_make1(rel);
3124
3125
0
  vacuum(tab->at_vacoptions, rel_list, &tab->at_params, bstrategy, true);
3126
0
}
3127
3128
/*
3129
 * autovac_report_activity
3130
 *    Report to pgstat what autovacuum is doing
3131
 *
3132
 * We send a SQL string corresponding to what the user would see if the
3133
 * equivalent command was to be issued manually.
3134
 *
3135
 * Note we assume that we are going to report the next command as soon as we're
3136
 * done with the current one, and exit right after the last one, so we don't
3137
 * bother to report "<IDLE>" or some such.
3138
 */
3139
static void
3140
autovac_report_activity(autovac_table *tab)
3141
0
{
3142
0
#define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 56)
3143
0
  char    activity[MAX_AUTOVAC_ACTIV_LEN];
3144
0
  int     len;
3145
3146
  /* Report the command and possible options */
3147
0
  if (tab->at_vacoptions & VACOPT_VACUUM)
3148
0
    snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3149
0
         "autovacuum: VACUUM%s",
3150
0
         tab->at_vacoptions & VACOPT_ANALYZE ? " ANALYZE" : "");
3151
0
  else
3152
0
    snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3153
0
         "autovacuum: ANALYZE");
3154
3155
  /*
3156
   * Report the qualified name of the relation.
3157
   */
3158
0
  len = strlen(activity);
3159
3160
0
  snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
3161
0
       " %s.%s%s", tab->at_nspname, tab->at_relname,
3162
0
       tab->at_params.is_wraparound ? " (to prevent wraparound)" : "");
3163
3164
  /* Set statement_timestamp() to current time for pg_stat_activity */
3165
0
  SetCurrentStatementStartTimestamp();
3166
3167
0
  pgstat_report_activity(STATE_RUNNING, activity);
3168
0
}
3169
3170
/*
3171
 * autovac_report_workitem
3172
 *    Report to pgstat that autovacuum is processing a work item
3173
 */
3174
static void
3175
autovac_report_workitem(AutoVacuumWorkItem *workitem,
3176
            const char *nspname, const char *relname)
3177
0
{
3178
0
  char    activity[MAX_AUTOVAC_ACTIV_LEN + 12 + 2];
3179
0
  char    blk[12 + 2];
3180
0
  int     len;
3181
3182
0
  switch (workitem->avw_type)
3183
0
  {
3184
0
    case AVW_BRINSummarizeRange:
3185
0
      snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3186
0
           "autovacuum: BRIN summarize");
3187
0
      break;
3188
0
  }
3189
3190
  /*
3191
   * Report the qualified name of the relation, and the block number if any
3192
   */
3193
0
  len = strlen(activity);
3194
3195
0
  if (BlockNumberIsValid(workitem->avw_blockNumber))
3196
0
    snprintf(blk, sizeof(blk), " %u", workitem->avw_blockNumber);
3197
0
  else
3198
0
    blk[0] = '\0';
3199
3200
0
  snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
3201
0
       " %s.%s%s", nspname, relname, blk);
3202
3203
  /* Set statement_timestamp() to current time for pg_stat_activity */
3204
0
  SetCurrentStatementStartTimestamp();
3205
3206
0
  pgstat_report_activity(STATE_RUNNING, activity);
3207
0
}
3208
3209
/*
3210
 * AutoVacuumingActive
3211
 *    Check GUC vars and report whether the autovacuum process should be
3212
 *    running.
3213
 */
3214
bool
3215
AutoVacuumingActive(void)
3216
5.07k
{
3217
5.07k
  if (!autovacuum_start_daemon || !pgstat_track_counts)
3218
0
    return false;
3219
5.07k
  return true;
3220
5.07k
}
3221
3222
/*
3223
 * Request one work item to the next autovacuum run processing our database.
3224
 * Return false if the request can't be recorded.
3225
 */
3226
bool
3227
AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId,
3228
            BlockNumber blkno)
3229
0
{
3230
0
  int     i;
3231
0
  bool    result = false;
3232
3233
0
  LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
3234
3235
  /*
3236
   * Locate an unused work item and fill it with the given data.
3237
   */
3238
0
  for (i = 0; i < NUM_WORKITEMS; i++)
3239
0
  {
3240
0
    AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i];
3241
3242
0
    if (workitem->avw_used)
3243
0
      continue;
3244
3245
0
    workitem->avw_used = true;
3246
0
    workitem->avw_active = false;
3247
0
    workitem->avw_type = type;
3248
0
    workitem->avw_database = MyDatabaseId;
3249
0
    workitem->avw_relation = relationId;
3250
0
    workitem->avw_blockNumber = blkno;
3251
0
    result = true;
3252
3253
    /* done */
3254
0
    break;
3255
0
  }
3256
3257
0
  LWLockRelease(AutovacuumLock);
3258
3259
0
  return result;
3260
0
}
3261
3262
/*
3263
 * autovac_init
3264
 *    This is called at postmaster initialization.
3265
 *
3266
 * All we do here is annoy the user if he got it wrong.
3267
 */
3268
void
3269
autovac_init(void)
3270
902
{
3271
902
  if (autovacuum_start_daemon && !pgstat_track_counts)
3272
902
    ereport(WARNING,
3273
902
        (errmsg("autovacuum not started because of misconfiguration"),
3274
902
         errhint("Enable the \"track_counts\" option.")));
3275
902
}
3276
3277
/*
3278
 * IsAutoVacuum functions
3279
 *    Return whether this is either a launcher autovacuum process or a worker
3280
 *    process.
3281
 */
3282
bool
3283
IsAutoVacuumLauncherProcess(void)
3284
12.7k
{
3285
12.7k
  return am_autovacuum_launcher;
3286
12.7k
}
3287
3288
bool
3289
IsAutoVacuumWorkerProcess(void)
3290
12.1k
{
3291
12.1k
  return am_autovacuum_worker;
3292
12.1k
}
3293
3294
3295
/*
3296
 * AutoVacuumShmemSize
3297
 *    Compute space needed for autovacuum-related shared memory
3298
 */
3299
Size
3300
AutoVacuumShmemSize(void)
3301
7.22k
{
3302
7.22k
  Size    size;
3303
3304
  /*
3305
   * Need the fixed struct and the array of WorkerInfoData.
3306
   */
3307
7.22k
  size = sizeof(AutoVacuumShmemStruct);
3308
7.22k
  size = MAXALIGN(size);
3309
7.22k
  size = add_size(size, mul_size(autovacuum_max_workers,
3310
7.22k
                   sizeof(WorkerInfoData)));
3311
7.22k
  return size;
3312
7.22k
}
3313
3314
/*
3315
 * AutoVacuumShmemInit
3316
 *    Allocate and initialize autovacuum-related shared memory
3317
 */
3318
void
3319
AutoVacuumShmemInit(void)
3320
3.61k
{
3321
3.61k
  bool    found;
3322
3323
3.61k
  AutoVacuumShmem = (AutoVacuumShmemStruct *)
3324
3.61k
    ShmemInitStruct("AutoVacuum Data",
3325
3.61k
            AutoVacuumShmemSize(),
3326
3.61k
            &found);
3327
3328
3.61k
  if (!IsUnderPostmaster)
3329
3.61k
  {
3330
3.61k
    WorkerInfo  worker;
3331
3.61k
    int     i;
3332
3333
3.61k
    Assert(!found);
3334
3335
3.61k
    AutoVacuumShmem->av_launcherpid = 0;
3336
3.61k
    dlist_init(&AutoVacuumShmem->av_freeWorkers);
3337
3.61k
    dlist_init(&AutoVacuumShmem->av_runningWorkers);
3338
3.61k
    AutoVacuumShmem->av_startingWorker = NULL;
3339
3.61k
    memset(AutoVacuumShmem->av_workItems, 0,
3340
3.61k
         sizeof(AutoVacuumWorkItem) * NUM_WORKITEMS);
3341
3342
3.61k
    worker = (WorkerInfo) ((char *) AutoVacuumShmem +
3343
3.61k
                 MAXALIGN(sizeof(AutoVacuumShmemStruct)));
3344
3345
    /* initialize the WorkerInfo free list */
3346
14.4k
    for (i = 0; i < autovacuum_max_workers; i++)
3347
10.8k
      dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
3348
10.8k
              &worker[i].wi_links);
3349
3.61k
  }
3350
3.61k
  else
3351
3.61k
    Assert(found);
3352
3.61k
}
3353
3354
/*
3355
 * autovac_refresh_stats
3356
 *    Refresh pgstats data for an autovacuum process
3357
 *
3358
 * Cause the next pgstats read operation to obtain fresh data, but throttle
3359
 * such refreshing in the autovacuum launcher.  This is mostly to avoid
3360
 * rereading the pgstats files too many times in quick succession when there
3361
 * are many databases.
3362
 *
3363
 * Note: we avoid throttling in the autovac worker, as it would be
3364
 * counterproductive in the recheck logic.
3365
 */
3366
static void
3367
autovac_refresh_stats(void)
3368
0
{
3369
0
  if (IsAutoVacuumLauncherProcess())
3370
0
  {
3371
0
    static TimestampTz last_read = 0;
3372
0
    TimestampTz current_time;
3373
3374
0
    current_time = GetCurrentTimestamp();
3375
3376
0
    if (!TimestampDifferenceExceeds(last_read, current_time,
3377
0
                    STATS_READ_DELAY))
3378
0
      return;
3379
3380
0
    last_read = current_time;
3381
0
  }
3382
3383
0
  pgstat_clear_snapshot();
3384
0
}