YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/src/postgres/src/backend/utils/misc/sampling.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * sampling.c
4
 *    Relation block sampling routines.
5
 *
6
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7
 * Portions Copyright (c) 1994, Regents of the University of California
8
 *
9
 *
10
 * IDENTIFICATION
11
 *    src/backend/utils/misc/sampling.c
12
 *
13
 *-------------------------------------------------------------------------
14
 */
15
16
#include "postgres.h"
17
18
#include <math.h>
19
20
#include "utils/sampling.h"
21
22
23
/*
24
 * BlockSampler_Init -- prepare for random sampling of blocknumbers
25
 *
26
 * BlockSampler provides algorithm for block level sampling of a relation
27
 * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28
 * It selects a random sample of samplesize blocks out of
29
 * the nblocks blocks in the table. If the table has less than
30
 * samplesize blocks, all blocks are selected.
31
 *
32
 * Since we know the total number of blocks in advance, we can use the
33
 * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
34
 * algorithm.
35
 */
36
void
37
BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
38
          long randseed)
39
42
{
40
42
  bs->N = nblocks;      /* measured table size */
41
42
  /*
43
   * If we decide to reduce samplesize for tables that have less or not much
44
   * more than samplesize blocks, here is the place to do it.
45
   */
46
42
  bs->n = samplesize;
47
42
  bs->t = 0;          /* blocks scanned so far */
48
42
  bs->m = 0;          /* blocks selected so far */
49
50
42
  sampler_random_init_state(randseed, bs->randstate);
51
42
}
52
53
bool
54
BlockSampler_HasMore(BlockSampler bs)
55
236
{
56
236
  return (bs->t < bs->N) && 
(bs->m < bs->n)194
;
57
236
}
58
59
BlockNumber
60
BlockSampler_Next(BlockSampler bs)
61
97
{
62
97
  BlockNumber K = bs->N - bs->t;  /* remaining blocks */
63
97
  int     k = bs->n - bs->m;  /* blocks still to sample */
64
97
  double    p;        /* probability to skip block */
65
97
  double    V;        /* random */
66
67
97
  Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
68
69
97
  if ((BlockNumber) k >= K)
70
97
  {
71
    /* need all the rest */
72
97
    bs->m++;
73
97
    return bs->t++;
74
97
  }
75
76
  /*----------
77
   * It is not obvious that this code matches Knuth's Algorithm S.
78
   * Knuth says to skip the current block with probability 1 - k/K.
79
   * If we are to skip, we should advance t (hence decrease K), and
80
   * repeat the same probabilistic test for the next block.  The naive
81
   * implementation thus requires a sampler_random_fract() call for each
82
   * block number.  But we can reduce this to one sampler_random_fract()
83
   * call per selected block, by noting that each time the while-test
84
   * succeeds, we can reinterpret V as a uniform random number in the range
85
   * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
86
   * the appropriate fraction of its former value, and our next loop
87
   * makes the appropriate probabilistic test.
88
   *
89
   * We have initially K > k > 0.  If the loop reduces K to equal k,
90
   * the next while-test must fail since p will become exactly zero
91
   * (we assume there will not be roundoff error in the division).
92
   * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
93
   * to be doubly sure about roundoff error.)  Therefore K cannot become
94
   * less than k, which means that we cannot fail to select enough blocks.
95
   *----------
96
   */
97
0
  V = sampler_random_fract(bs->randstate);
98
0
  p = 1.0 - (double) k / (double) K;
99
0
  while (V < p)
100
0
  {
101
    /* skip */
102
0
    bs->t++;
103
0
    K--;          /* keep K == N - t */
104
105
    /* adjust p to be new cutoff point in reduced range */
106
0
    p *= 1.0 - (double) k / (double) K;
107
0
  }
108
109
  /* select */
110
0
  bs->m++;
111
0
  return bs->t++;
112
97
}
113
114
/*
115
 * These two routines embody Algorithm Z from "Random sampling with a
116
 * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
117
 * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
118
 * of the count S of records to skip before processing another record.
119
 * It is computed primarily based on t, the number of records already read.
120
 * The only extra state needed between calls is W, a random state variable.
121
 *
122
 * reservoir_init_selection_state computes the initial W value.
123
 *
124
 * Given that we've already read t records (t >= n), reservoir_get_next_S
125
 * determines the number of records to skip before the next record is
126
 * processed.
127
 */
128
void
129
reservoir_init_selection_state(ReservoirState rs, int n)
130
233
{
131
  /*
132
   * Reservoir sampling is not used anywhere where it would need to return
133
   * repeatable results so we can initialize it randomly.
134
   */
135
233
  sampler_random_init_state(random(), rs->randstate);
136
137
  /* Initial value of W (for use when Algorithm Z is first applied) */
138
233
  rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
139
233
}
140
141
double
142
reservoir_get_next_S(ReservoirState rs, double t, int n)
143
0
{
144
0
  double    S;
145
146
  /* The magic constant here is T from Vitter's paper */
147
0
  if (t <= (22.0 * n))
148
0
  {
149
    /* Process records using Algorithm X until t is large enough */
150
0
    double    V,
151
0
          quot;
152
153
0
    V = sampler_random_fract(rs->randstate);  /* Generate V */
154
0
    S = 0;
155
0
    t += 1;
156
    /* Note: "num" in Vitter's code is always equal to t - n */
157
0
    quot = (t - (double) n) / t;
158
    /* Find min S satisfying (4.1) */
159
0
    while (quot > V)
160
0
    {
161
0
      S += 1;
162
0
      t += 1;
163
0
      quot *= (t - (double) n) / t;
164
0
    }
165
0
  }
166
0
  else
167
0
  {
168
    /* Now apply Algorithm Z */
169
0
    double    W = rs->W;
170
0
    double    term = t - (double) n + 1;
171
172
0
    for (;;)
173
0
    {
174
0
      double    numer,
175
0
            numer_lim,
176
0
            denom;
177
0
      double    U,
178
0
            X,
179
0
            lhs,
180
0
            rhs,
181
0
            y,
182
0
            tmp;
183
184
      /* Generate U and X */
185
0
      U = sampler_random_fract(rs->randstate);
186
0
      X = t * (W - 1.0);
187
0
      S = floor(X);   /* S is tentatively set to floor(X) */
188
      /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
189
0
      tmp = (t + 1) / term;
190
0
      lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
191
0
      rhs = (((t + X) / (term + S)) * term) / t;
192
0
      if (lhs <= rhs)
193
0
      {
194
0
        W = rhs / lhs;
195
0
        break;
196
0
      }
197
      /* Test if U <= f(S)/cg(X) */
198
0
      y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
199
0
      if ((double) n < S)
200
0
      {
201
0
        denom = t;
202
0
        numer_lim = term + S;
203
0
      }
204
0
      else
205
0
      {
206
0
        denom = t - (double) n + S;
207
0
        numer_lim = t + 1;
208
0
      }
209
0
      for (numer = t + S; numer >= numer_lim; numer -= 1)
210
0
      {
211
0
        y *= numer / denom;
212
0
        denom -= 1;
213
0
      }
214
0
      W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
215
0
      if (exp(log(y) / n) <= (t + X) / t)
216
0
        break;
217
0
    }
218
0
    rs->W = W;
219
0
  }
220
0
  return S;
221
0
}
222
223
224
/*----------
225
 * Random number generator used by sampling
226
 *----------
227
 */
228
void
229
sampler_random_init_state(long seed, SamplerRandomState randstate)
230
275
{
231
275
  randstate[0] = 0x330e;    /* same as pg_erand48, but could be anything */
232
275
  randstate[1] = (unsigned short) seed;
233
275
  randstate[2] = (unsigned short) (seed >> 16);
234
275
}
235
236
/* Select a random value R uniformly distributed in (0 - 1) */
237
double
238
sampler_random_fract(SamplerRandomState randstate)
239
233
{
240
233
  double    res;
241
242
  /* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
243
233
  do
244
233
  {
245
233
    res = pg_erand48(randstate);
246
233
  } while (res == 0.0);
247
233
  return res;
248
233
}
249
250
251
/*
252
 * Backwards-compatible API for block sampling
253
 *
254
 * This code is now deprecated, but since it's still in use by many FDWs,
255
 * we should keep it for awhile at least.  The functionality is the same as
256
 * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
257
 * except that a common random state is used across all callers.
258
 */
259
static ReservoirStateData oldrs;
260
261
double
262
anl_random_fract(void)
263
0
{
264
  /* initialize if first time through */
265
0
  if (oldrs.randstate[0] == 0)
266
0
    sampler_random_init_state(random(), oldrs.randstate);
267
268
  /* and compute a random fraction */
269
0
  return sampler_random_fract(oldrs.randstate);
270
0
}
271
272
double
273
anl_init_selection_state(int n)
274
0
{
275
  /* initialize if first time through */
276
0
  if (oldrs.randstate[0] == 0)
277
0
    sampler_random_init_state(random(), oldrs.randstate);
278
279
  /* Initial value of W (for use when Algorithm Z is first applied) */
280
0
  return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
281
0
}
282
283
double
284
anl_get_next_S(double t, int n, double *stateptr)
285
0
{
286
0
  double    result;
287
288
0
  oldrs.W = *stateptr;
289
0
  result = reservoir_get_next_S(&oldrs, t, n);
290
0
  *stateptr = oldrs.W;
291
0
  return result;
292
0
}