/Users/deen/code/yugabyte-db/src/postgres/src/backend/utils/misc/sampling.c

Source (jump to first uncovered line)
/*-------------------------------------------------------------------------
 *
 * sampling.c
 *    Relation block sampling routines.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/utils/misc/sampling.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <math.h>

#include "utils/sampling.h"


/*
 * BlockSampler_Init -- prepare for random sampling of blocknumbers
 *
 * BlockSampler provides algorithm for block level sampling of a relation
 * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
 * It selects a random sample of samplesize blocks out of
 * the nblocks blocks in the table. If the table has less than
 * samplesize blocks, all blocks are selected.
 *
 * Since we know the total number of blocks in advance, we can use the
 * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
 * algorithm.
 */
void
BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
          long randseed)
{
  bs->N = nblocks;      /* measured table size */

  /*
   * If we decide to reduce samplesize for tables that have less or not much
   * more than samplesize blocks, here is the place to do it.
   */
  bs->n = samplesize;
  bs->t = 0;          /* blocks scanned so far */
  bs->m = 0;          /* blocks selected so far */

  sampler_random_init_state(randseed, bs->randstate);
}

bool
BlockSampler_HasMore(BlockSampler bs)
{
  return (bs->t < bs->N) && (bs->m < bs->n)194;
}

BlockNumber
BlockSampler_Next(BlockSampler bs)
{
  BlockNumber K = bs->N - bs->t;  /* remaining blocks */
  int     k = bs->n - bs->m;  /* blocks still to sample */
  double    p;        /* probability to skip block */
  double    V;        /* random */

  Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */

  if ((BlockNumber) k >= K)
  {
    /* need all the rest */
    bs->m++;
    return bs->t++;
  }

  /*----------
   * It is not obvious that this code matches Knuth's Algorithm S.
   * Knuth says to skip the current block with probability 1 - k/K.
   * If we are to skip, we should advance t (hence decrease K), and
   * repeat the same probabilistic test for the next block.  The naive
   * implementation thus requires a sampler_random_fract() call for each
   * block number.  But we can reduce this to one sampler_random_fract()
   * call per selected block, by noting that each time the while-test
   * succeeds, we can reinterpret V as a uniform random number in the range
   * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
   * the appropriate fraction of its former value, and our next loop
   * makes the appropriate probabilistic test.
   *
   * We have initially K > k > 0.  If the loop reduces K to equal k,
   * the next while-test must fail since p will become exactly zero
   * (we assume there will not be roundoff error in the division).
   * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
   * to be doubly sure about roundoff error.)  Therefore K cannot become
   * less than k, which means that we cannot fail to select enough blocks.
   *----------
   */
  V = sampler_random_fract(bs->randstate);
  p = 1.0 - (double) k / (double) K;
  while (V < p)
  {
    /* skip */
    bs->t++;
    K--;          /* keep K == N - t */

    /* adjust p to be new cutoff point in reduced range */
    p *= 1.0 - (double) k / (double) K;
  }

  /* select */
  bs->m++;
  return bs->t++;
}

/*
 * These two routines embody Algorithm Z from "Random sampling with a
 * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
 * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
 * of the count S of records to skip before processing another record.
 * It is computed primarily based on t, the number of records already read.
 * The only extra state needed between calls is W, a random state variable.
 *
 * reservoir_init_selection_state computes the initial W value.
 *
 * Given that we've already read t records (t >= n), reservoir_get_next_S
 * determines the number of records to skip before the next record is
 * processed.
 */
void
reservoir_init_selection_state(ReservoirState rs, int n)
{
  /*
   * Reservoir sampling is not used anywhere where it would need to return
   * repeatable results so we can initialize it randomly.
   */
  sampler_random_init_state(random(), rs->randstate);

  /* Initial value of W (for use when Algorithm Z is first applied) */
  rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
}

double
reservoir_get_next_S(ReservoirState rs, double t, int n)
{
  double    S;

  /* The magic constant here is T from Vitter's paper */
  if (t <= (22.0 * n))
  {
    /* Process records using Algorithm X until t is large enough */
    double    V,
          quot;

    V = sampler_random_fract(rs->randstate);  /* Generate V */
    S = 0;
    t += 1;
    /* Note: "num" in Vitter's code is always equal to t - n */
    quot = (t - (double) n) / t;
    /* Find min S satisfying (4.1) */
    while (quot > V)
    {
      S += 1;
      t += 1;
      quot *= (t - (double) n) / t;
    }
  }
  else
  {
    /* Now apply Algorithm Z */
    double    W = rs->W;
    double    term = t - (double) n + 1;

    for (;;)
    {
      double    numer,
            numer_lim,
            denom;
      double    U,
            X,
            lhs,
            rhs,
            y,
            tmp;

      /* Generate U and X */
      U = sampler_random_fract(rs->randstate);
      X = t * (W - 1.0);
      S = floor(X);   /* S is tentatively set to floor(X) */
      /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
      tmp = (t + 1) / term;
      lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
      rhs = (((t + X) / (term + S)) * term) / t;
      if (lhs <= rhs)
      {
        W = rhs / lhs;
        break;
      }
      /* Test if U <= f(S)/cg(X) */
      y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
      if ((double) n < S)
      {
        denom = t;
        numer_lim = term + S;
      }
      else
      {
        denom = t - (double) n + S;
        numer_lim = t + 1;
      }
      for (numer = t + S; numer >= numer_lim; numer -= 1)
      {
        y *= numer / denom;
        denom -= 1;
      }
      W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
      if (exp(log(y) / n) <= (t + X) / t)
        break;
    }
    rs->W = W;
  }
  return S;
}


/*----------
 * Random number generator used by sampling
 *----------
 */
void
sampler_random_init_state(long seed, SamplerRandomState randstate)
{
  randstate[0] = 0x330e;    /* same as pg_erand48, but could be anything */
  randstate[1] = (unsigned short) seed;
  randstate[2] = (unsigned short) (seed >> 16);
}

/* Select a random value R uniformly distributed in (0 - 1) */
double
sampler_random_fract(SamplerRandomState randstate)
{
  double    res;

  /* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
  do
  {
    res = pg_erand48(randstate);
  } while (res == 0.0);
  return res;
}


/*
 * Backwards-compatible API for block sampling
 *
 * This code is now deprecated, but since it's still in use by many FDWs,
 * we should keep it for awhile at least.  The functionality is the same as
 * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
 * except that a common random state is used across all callers.
 */
static ReservoirStateData oldrs;

double
anl_random_fract(void)
{
  /* initialize if first time through */
  if (oldrs.randstate[0] == 0)
    sampler_random_init_state(random(), oldrs.randstate);

  /* and compute a random fraction */
  return sampler_random_fract(oldrs.randstate);
}

double
anl_init_selection_state(int n)
{
  /* initialize if first time through */
  if (oldrs.randstate[0] == 0)
    sampler_random_init_state(random(), oldrs.randstate);

  /* Initial value of W (for use when Algorithm Z is first applied) */
  return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
}

double
anl_get_next_S(double t, int n, double *stateptr)
{
  double    result;

  oldrs.W = *stateptr;
  result = reservoir_get_next_S(&oldrs, t, n);
  *stateptr = oldrs.W;
  return result;
}

YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

Line	Count	Source (jump to first uncovered line)
1		/*-------------------------------------------------------------------------
2		*
3		* sampling.c
4		* Relation block sampling routines.
5		*
6		* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7		* Portions Copyright (c) 1994, Regents of the University of California
8		*
9		*
10		* IDENTIFICATION
11		* src/backend/utils/misc/sampling.c
12		*
13		*-------------------------------------------------------------------------
14		*/
15
16		#include "postgres.h"
17
18		#include <math.h>
19
20		#include "utils/sampling.h"
21
22
23		/*
24		* BlockSampler_Init -- prepare for random sampling of blocknumbers
25		*
26		* BlockSampler provides algorithm for block level sampling of a relation
27		* as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28		* It selects a random sample of samplesize blocks out of
29		* the nblocks blocks in the table. If the table has less than
30		* samplesize blocks, all blocks are selected.
31		*
32		* Since we know the total number of blocks in advance, we can use the
33		* straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
34		* algorithm.
35		*/
36		void
37		BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
38		long randseed)
39	42	{
40	42	bs->N = nblocks; /* measured table size */
41
42		/*
43		* If we decide to reduce samplesize for tables that have less or not much
44		* more than samplesize blocks, here is the place to do it.
45		*/
46	42	bs->n = samplesize;
47	42	bs->t = 0; /* blocks scanned so far */
48	42	bs->m = 0; /* blocks selected so far */
49
50	42	sampler_random_init_state(randseed, bs->randstate);
51	42	}
52
53		bool
54		BlockSampler_HasMore(BlockSampler bs)
55	236	{
56	236	return (bs->t < bs->N) && (bs->m < bs->n)194 ;
57	236	}
58
59		BlockNumber
60		BlockSampler_Next(BlockSampler bs)
61	97	{
62	97	BlockNumber K = bs->N - bs->t; /* remaining blocks */
63	97	int k = bs->n - bs->m; /* blocks still to sample */
64	97	double p; /* probability to skip block */
65	97	double V; /* random */
66
67	97	Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
68
69	97	if ((BlockNumber) k >= K)
70	97	{
71		/* need all the rest */
72	97	bs->m++;
73	97	return bs->t++;
74	97	}
75
76		/*----------
77		* It is not obvious that this code matches Knuth's Algorithm S.
78		* Knuth says to skip the current block with probability 1 - k/K.
79		* If we are to skip, we should advance t (hence decrease K), and
80		* repeat the same probabilistic test for the next block. The naive
81		* implementation thus requires a sampler_random_fract() call for each
82		* block number. But we can reduce this to one sampler_random_fract()
83		* call per selected block, by noting that each time the while-test
84		* succeeds, we can reinterpret V as a uniform random number in the range
85		* 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
86		* the appropriate fraction of its former value, and our next loop
87		* makes the appropriate probabilistic test.
88		*
89		* We have initially K > k > 0. If the loop reduces K to equal k,
90		* the next while-test must fail since p will become exactly zero
91		* (we assume there will not be roundoff error in the division).
92		* (Note: Knuth suggests a "<=" loop condition, but we use "<" just
93		* to be doubly sure about roundoff error.) Therefore K cannot become
94		* less than k, which means that we cannot fail to select enough blocks.
95		*----------
96		*/
97	0	V = sampler_random_fract(bs->randstate);
98	0	p = 1.0 - (double) k / (double) K;
99	0	while (V < p)
100	0	{
101		/* skip */
102	0	bs->t++;
103	0	K--; /* keep K == N - t */
104
105		/* adjust p to be new cutoff point in reduced range */
106	0	p *= 1.0 - (double) k / (double) K;
107	0	}
108
109		/* select */
110	0	bs->m++;
111	0	return bs->t++;
112	97	}
113
114		/*
115		* These two routines embody Algorithm Z from "Random sampling with a
116		* reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
117		* (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
118		* of the count S of records to skip before processing another record.
119		* It is computed primarily based on t, the number of records already read.
120		* The only extra state needed between calls is W, a random state variable.
121		*
122		* reservoir_init_selection_state computes the initial W value.
123		*
124		* Given that we've already read t records (t >= n), reservoir_get_next_S
125		* determines the number of records to skip before the next record is
126		* processed.
127		*/
128		void
129		reservoir_init_selection_state(ReservoirState rs, int n)
130	233	{
131		/*
132		* Reservoir sampling is not used anywhere where it would need to return
133		* repeatable results so we can initialize it randomly.
134		*/
135	233	sampler_random_init_state(random(), rs->randstate);
136
137		/* Initial value of W (for use when Algorithm Z is first applied) */
138	233	rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n);
139	233	}
140
141		double
142		reservoir_get_next_S(ReservoirState rs, double t, int n)
143	0	{
144	0	double S;
145
146		/* The magic constant here is T from Vitter's paper */
147	0	if (t <= (22.0 * n))
148	0	{
149		/* Process records using Algorithm X until t is large enough */
150	0	double V,
151	0	quot;
152
153	0	V = sampler_random_fract(rs->randstate); /* Generate V */
154	0	S = 0;
155	0	t += 1;
156		/* Note: "num" in Vitter's code is always equal to t - n */
157	0	quot = (t - (double) n) / t;
158		/* Find min S satisfying (4.1) */
159	0	while (quot > V)
160	0	{
161	0	S += 1;
162	0	t += 1;
163	0	quot *= (t - (double) n) / t;
164	0	}
165	0	}
166	0	else
167	0	{
168		/* Now apply Algorithm Z */
169	0	double W = rs->W;
170	0	double term = t - (double) n + 1;
171
172	0	for (;;)
173	0	{
174	0	double numer,
175	0	numer_lim,
176	0	denom;
177	0	double U,
178	0	X,
179	0	lhs,
180	0	rhs,
181	0	y,
182	0	tmp;
183
184		/* Generate U and X */
185	0	U = sampler_random_fract(rs->randstate);
186	0	X = t * (W - 1.0);
187	0	S = floor(X); /* S is tentatively set to floor(X) */
188		/* Test if U <= h(S)/cg(X) in the manner of (6.3) */
189	0	tmp = (t + 1) / term;
190	0	lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
191	0	rhs = (((t + X) / (term + S)) * term) / t;
192	0	if (lhs <= rhs)
193	0	{
194	0	W = rhs / lhs;
195	0	break;
196	0	}
197		/* Test if U <= f(S)/cg(X) */
198	0	y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
199	0	if ((double) n < S)
200	0	{
201	0	denom = t;
202	0	numer_lim = term + S;
203	0	}
204	0	else
205	0	{
206	0	denom = t - (double) n + S;
207	0	numer_lim = t + 1;
208	0	}
209	0	for (numer = t + S; numer >= numer_lim; numer -= 1)
210	0	{
211	0	y *= numer / denom;
212	0	denom -= 1;
213	0	}
214	0	W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */
215	0	if (exp(log(y) / n) <= (t + X) / t)
216	0	break;
217	0	}
218	0	rs->W = W;
219	0	}
220	0	return S;
221	0	}
222
223
224		/*----------
225		* Random number generator used by sampling
226		*----------
227		*/
228		void
229		sampler_random_init_state(long seed, SamplerRandomState randstate)
230	275	{
231	275	randstate[0] = 0x330e; /* same as pg_erand48, but could be anything */
232	275	randstate[1] = (unsigned short) seed;
233	275	randstate[2] = (unsigned short) (seed >> 16);
234	275	}
235
236		/* Select a random value R uniformly distributed in (0 - 1) */
237		double
238		sampler_random_fract(SamplerRandomState randstate)
239	233	{
240	233	double res;
241
242		/* pg_erand48 returns a value in [0.0 - 1.0), so we must reject 0 */
243	233	do
244	233	{
245	233	res = pg_erand48(randstate);
246	233	} while (res == 0.0);
247	233	return res;
248	233	}
249
250
251		/*
252		* Backwards-compatible API for block sampling
253		*
254		* This code is now deprecated, but since it's still in use by many FDWs,
255		* we should keep it for awhile at least. The functionality is the same as
256		* sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
257		* except that a common random state is used across all callers.
258		*/
259		static ReservoirStateData oldrs;
260
261		double
262		anl_random_fract(void)
263	0	{
264		/* initialize if first time through */
265	0	if (oldrs.randstate[0] == 0)
266	0	sampler_random_init_state(random(), oldrs.randstate);
267
268		/* and compute a random fraction */
269	0	return sampler_random_fract(oldrs.randstate);
270	0	}
271
272		double
273		anl_init_selection_state(int n)
274	0	{
275		/* initialize if first time through */
276	0	if (oldrs.randstate[0] == 0)
277	0	sampler_random_init_state(random(), oldrs.randstate);
278
279		/* Initial value of W (for use when Algorithm Z is first applied) */
280	0	return exp(-log(sampler_random_fract(oldrs.randstate)) / n);
281	0	}
282
283		double
284		anl_get_next_S(double t, int n, double *stateptr)
285	0	{
286	0	double result;
287
288	0	oldrs.W = *stateptr;
289	0	result = reservoir_get_next_S(&oldrs, t, n);
290	0	*stateptr = oldrs.W;
291	0	return result;
292	0	}