/Users/deen/code/yugabyte-db/src/postgres/src/backend/statistics/mvdistinct.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * mvdistinct.c |
4 | | * POSTGRES multivariate ndistinct coefficients |
5 | | * |
6 | | * Estimating number of groups in a combination of columns (e.g. for GROUP BY) |
7 | | * is tricky, and the estimation error is often significant. |
8 | | |
9 | | * The multivariate ndistinct coefficients address this by storing ndistinct |
10 | | * estimates for combinations of the user-specified columns. So for example |
11 | | * given a statistics object on three columns (a,b,c), this module estimates |
12 | | * and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column |
13 | | * estimates are already available in pg_statistic. |
14 | | * |
15 | | * |
16 | | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group |
17 | | * Portions Copyright (c) 1994, Regents of the University of California |
18 | | * |
19 | | * IDENTIFICATION |
20 | | * src/backend/statistics/mvdistinct.c |
21 | | * |
22 | | *------------------------------------------------------------------------- |
23 | | */ |
24 | | #include "postgres.h" |
25 | | |
26 | | #include <math.h> |
27 | | |
28 | | #include "access/htup_details.h" |
29 | | #include "catalog/pg_statistic_ext.h" |
30 | | #include "utils/fmgrprotos.h" |
31 | | #include "utils/lsyscache.h" |
32 | | #include "lib/stringinfo.h" |
33 | | #include "utils/syscache.h" |
34 | | #include "utils/typcache.h" |
35 | | #include "statistics/extended_stats_internal.h" |
36 | | #include "statistics/statistics.h" |
37 | | |
38 | | |
39 | | static double ndistinct_for_combination(double totalrows, int numrows, |
40 | | HeapTuple *rows, VacAttrStats **stats, |
41 | | int k, int *combination); |
42 | | static double estimate_ndistinct(double totalrows, int numrows, int d, int f1); |
43 | | static int n_choose_k(int n, int k); |
44 | | static int num_combinations(int n); |
45 | | |
46 | | /* Combination generator API */ |
47 | | |
48 | | /* internal state for generator of k-combinations of n elements */ |
49 | | typedef struct CombinationGenerator |
50 | | { |
51 | | int k; /* size of the combination */ |
52 | | int n; /* total number of elements */ |
53 | | int current; /* index of the next combination to return */ |
54 | | int ncombinations; /* number of combinations (size of array) */ |
55 | | int *combinations; /* array of pre-built combinations */ |
56 | | } CombinationGenerator; |
57 | | |
58 | | static CombinationGenerator *generator_init(int n, int k); |
59 | | static void generator_free(CombinationGenerator *state); |
60 | | static int *generator_next(CombinationGenerator *state); |
61 | | static void generate_combinations(CombinationGenerator *state); |
62 | | |
63 | | |
64 | | /* |
65 | | * statext_ndistinct_build |
66 | | * Compute ndistinct coefficient for the combination of attributes. |
67 | | * |
68 | | * This computes the ndistinct estimate using the same estimator used |
69 | | * in analyze.c and then computes the coefficient. |
70 | | */ |
71 | | MVNDistinct * |
72 | | statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows, |
73 | | Bitmapset *attrs, VacAttrStats **stats) |
74 | 0 | { |
75 | 0 | MVNDistinct *result; |
76 | 0 | int k; |
77 | 0 | int itemcnt; |
78 | 0 | int numattrs = bms_num_members(attrs); |
79 | 0 | int numcombs = num_combinations(numattrs); |
80 | |
|
81 | 0 | result = palloc(offsetof(MVNDistinct, items) + |
82 | 0 | numcombs * sizeof(MVNDistinctItem)); |
83 | 0 | result->magic = STATS_NDISTINCT_MAGIC; |
84 | 0 | result->type = STATS_NDISTINCT_TYPE_BASIC; |
85 | 0 | result->nitems = numcombs; |
86 | |
|
87 | 0 | itemcnt = 0; |
88 | 0 | for (k = 2; k <= numattrs; k++) |
89 | 0 | { |
90 | 0 | int *combination; |
91 | 0 | CombinationGenerator *generator; |
92 | | |
93 | | /* generate combinations of K out of N elements */ |
94 | 0 | generator = generator_init(numattrs, k); |
95 | |
|
96 | 0 | while ((combination = generator_next(generator))) |
97 | 0 | { |
98 | 0 | MVNDistinctItem *item = &result->items[itemcnt]; |
99 | 0 | int j; |
100 | |
|
101 | 0 | item->attrs = NULL; |
102 | 0 | for (j = 0; j < k; j++) |
103 | 0 | item->attrs = bms_add_member(item->attrs, |
104 | 0 | stats[combination[j]]->attr->attnum); |
105 | 0 | item->ndistinct = |
106 | 0 | ndistinct_for_combination(totalrows, numrows, rows, |
107 | 0 | stats, k, combination); |
108 | |
|
109 | 0 | itemcnt++; |
110 | 0 | Assert(itemcnt <= result->nitems); |
111 | 0 | } |
112 | | |
113 | 0 | generator_free(generator); |
114 | 0 | } |
115 | | |
116 | | /* must consume exactly the whole output array */ |
117 | 0 | Assert(itemcnt == result->nitems); |
118 | | |
119 | 0 | return result; |
120 | 0 | } |
121 | | |
122 | | /* |
123 | | * statext_ndistinct_load |
124 | | * Load the ndistinct value for the indicated pg_statistic_ext tuple |
125 | | */ |
126 | | MVNDistinct * |
127 | | statext_ndistinct_load(Oid mvoid) |
128 | 0 | { |
129 | 0 | MVNDistinct *result; |
130 | 0 | bool isnull; |
131 | 0 | Datum ndist; |
132 | 0 | HeapTuple htup; |
133 | |
|
134 | 0 | htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(mvoid)); |
135 | 0 | if (!HeapTupleIsValid(htup)) |
136 | 0 | elog(ERROR, "cache lookup failed for statistics object %u", mvoid); |
137 | | |
138 | 0 | ndist = SysCacheGetAttr(STATEXTOID, htup, |
139 | 0 | Anum_pg_statistic_ext_stxndistinct, &isnull); |
140 | 0 | if (isnull) |
141 | 0 | elog(ERROR, |
142 | 0 | "requested statistic kind \"%c\" is not yet built for statistics object %u", |
143 | 0 | STATS_EXT_NDISTINCT, mvoid); |
144 | | |
145 | 0 | result = statext_ndistinct_deserialize(DatumGetByteaPP(ndist)); |
146 | |
|
147 | 0 | ReleaseSysCache(htup); |
148 | |
|
149 | 0 | return result; |
150 | 0 | } |
151 | | |
152 | | /* |
153 | | * statext_ndistinct_serialize |
154 | | * serialize ndistinct to the on-disk bytea format |
155 | | */ |
156 | | bytea * |
157 | | statext_ndistinct_serialize(MVNDistinct *ndistinct) |
158 | 0 | { |
159 | 0 | int i; |
160 | 0 | bytea *output; |
161 | 0 | char *tmp; |
162 | 0 | Size len; |
163 | |
|
164 | 0 | Assert(ndistinct->magic == STATS_NDISTINCT_MAGIC); |
165 | 0 | Assert(ndistinct->type == STATS_NDISTINCT_TYPE_BASIC); |
166 | | |
167 | | /* |
168 | | * Base size is size of scalar fields in the struct, plus one base struct |
169 | | * for each item, including number of items for each. |
170 | | */ |
171 | 0 | len = VARHDRSZ + SizeOfMVNDistinct + |
172 | 0 | ndistinct->nitems * (offsetof(MVNDistinctItem, attrs) + sizeof(int)); |
173 | | |
174 | | /* and also include space for the actual attribute numbers */ |
175 | 0 | for (i = 0; i < ndistinct->nitems; i++) |
176 | 0 | { |
177 | 0 | int nmembers; |
178 | |
|
179 | 0 | nmembers = bms_num_members(ndistinct->items[i].attrs); |
180 | 0 | Assert(nmembers >= 2); |
181 | 0 | len += sizeof(AttrNumber) * nmembers; |
182 | 0 | } |
183 | | |
184 | 0 | output = (bytea *) palloc(len); |
185 | 0 | SET_VARSIZE(output, len); |
186 | |
|
187 | 0 | tmp = VARDATA(output); |
188 | | |
189 | | /* Store the base struct values (magic, type, nitems) */ |
190 | 0 | memcpy(tmp, &ndistinct->magic, sizeof(uint32)); |
191 | 0 | tmp += sizeof(uint32); |
192 | 0 | memcpy(tmp, &ndistinct->type, sizeof(uint32)); |
193 | 0 | tmp += sizeof(uint32); |
194 | 0 | memcpy(tmp, &ndistinct->nitems, sizeof(uint32)); |
195 | 0 | tmp += sizeof(uint32); |
196 | | |
197 | | /* |
198 | | * store number of attributes and attribute numbers for each ndistinct |
199 | | * entry |
200 | | */ |
201 | 0 | for (i = 0; i < ndistinct->nitems; i++) |
202 | 0 | { |
203 | 0 | MVNDistinctItem item = ndistinct->items[i]; |
204 | 0 | int nmembers = bms_num_members(item.attrs); |
205 | 0 | int x; |
206 | |
|
207 | 0 | memcpy(tmp, &item.ndistinct, sizeof(double)); |
208 | 0 | tmp += sizeof(double); |
209 | 0 | memcpy(tmp, &nmembers, sizeof(int)); |
210 | 0 | tmp += sizeof(int); |
211 | |
|
212 | 0 | x = -1; |
213 | 0 | while ((x = bms_next_member(item.attrs, x)) >= 0) |
214 | 0 | { |
215 | 0 | AttrNumber value = (AttrNumber) x; |
216 | |
|
217 | 0 | memcpy(tmp, &value, sizeof(AttrNumber)); |
218 | 0 | tmp += sizeof(AttrNumber); |
219 | 0 | } |
220 | |
|
221 | 0 | Assert(tmp <= ((char *) output + len)); |
222 | 0 | } |
223 | | |
224 | 0 | return output; |
225 | 0 | } |
226 | | |
227 | | /* |
228 | | * statext_ndistinct_deserialize |
229 | | * Read an on-disk bytea format MVNDistinct to in-memory format |
230 | | */ |
231 | | MVNDistinct * |
232 | | statext_ndistinct_deserialize(bytea *data) |
233 | 0 | { |
234 | 0 | int i; |
235 | 0 | Size minimum_size; |
236 | 0 | MVNDistinct ndist; |
237 | 0 | MVNDistinct *ndistinct; |
238 | 0 | char *tmp; |
239 | |
|
240 | 0 | if (data == NULL) |
241 | 0 | return NULL; |
242 | | |
243 | | /* we expect at least the basic fields of MVNDistinct struct */ |
244 | 0 | if (VARSIZE_ANY_EXHDR(data) < SizeOfMVNDistinct) |
245 | 0 | elog(ERROR, "invalid MVNDistinct size %zd (expected at least %zd)", |
246 | 0 | VARSIZE_ANY_EXHDR(data), SizeOfMVNDistinct); |
247 | | |
248 | | /* initialize pointer to the data part (skip the varlena header) */ |
249 | 0 | tmp = VARDATA_ANY(data); |
250 | | |
251 | | /* read the header fields and perform basic sanity checks */ |
252 | 0 | memcpy(&ndist.magic, tmp, sizeof(uint32)); |
253 | 0 | tmp += sizeof(uint32); |
254 | 0 | memcpy(&ndist.type, tmp, sizeof(uint32)); |
255 | 0 | tmp += sizeof(uint32); |
256 | 0 | memcpy(&ndist.nitems, tmp, sizeof(uint32)); |
257 | 0 | tmp += sizeof(uint32); |
258 | |
|
259 | 0 | if (ndist.magic != STATS_NDISTINCT_MAGIC) |
260 | 0 | ereport(ERROR, |
261 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
262 | 0 | errmsg("invalid ndistinct magic %08x (expected %08x)", |
263 | 0 | ndist.magic, STATS_NDISTINCT_MAGIC))); |
264 | 0 | if (ndist.type != STATS_NDISTINCT_TYPE_BASIC) |
265 | 0 | ereport(ERROR, |
266 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
267 | 0 | errmsg("invalid ndistinct type %d (expected %d)", |
268 | 0 | ndist.type, STATS_NDISTINCT_TYPE_BASIC))); |
269 | 0 | if (ndist.nitems == 0) |
270 | 0 | ereport(ERROR, |
271 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
272 | 0 | errmsg("invalid zero-length item array in MVNDistinct"))); |
273 | | |
274 | | /* what minimum bytea size do we expect for those parameters */ |
275 | 0 | minimum_size = (SizeOfMVNDistinct + |
276 | 0 | ndist.nitems * (SizeOfMVNDistinctItem + |
277 | 0 | sizeof(AttrNumber) * 2)); |
278 | 0 | if (VARSIZE_ANY_EXHDR(data) < minimum_size) |
279 | 0 | ereport(ERROR, |
280 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
281 | 0 | errmsg("invalid MVNDistinct size %zd (expected at least %zd)", |
282 | 0 | VARSIZE_ANY_EXHDR(data), minimum_size))); |
283 | | |
284 | | /* |
285 | | * Allocate space for the ndistinct items (no space for each item's |
286 | | * attnos: those live in bitmapsets allocated separately) |
287 | | */ |
288 | 0 | ndistinct = palloc0(MAXALIGN(SizeOfMVNDistinct) + |
289 | 0 | (ndist.nitems * sizeof(MVNDistinctItem))); |
290 | 0 | ndistinct->magic = ndist.magic; |
291 | 0 | ndistinct->type = ndist.type; |
292 | 0 | ndistinct->nitems = ndist.nitems; |
293 | |
|
294 | 0 | for (i = 0; i < ndistinct->nitems; i++) |
295 | 0 | { |
296 | 0 | MVNDistinctItem *item = &ndistinct->items[i]; |
297 | 0 | int nelems; |
298 | |
|
299 | 0 | item->attrs = NULL; |
300 | | |
301 | | /* ndistinct value */ |
302 | 0 | memcpy(&item->ndistinct, tmp, sizeof(double)); |
303 | 0 | tmp += sizeof(double); |
304 | | |
305 | | /* number of attributes */ |
306 | 0 | memcpy(&nelems, tmp, sizeof(int)); |
307 | 0 | tmp += sizeof(int); |
308 | 0 | Assert((nelems >= 2) && (nelems <= STATS_MAX_DIMENSIONS)); |
309 | | |
310 | 0 | while (nelems-- > 0) |
311 | 0 | { |
312 | 0 | AttrNumber attno; |
313 | |
|
314 | 0 | memcpy(&attno, tmp, sizeof(AttrNumber)); |
315 | 0 | tmp += sizeof(AttrNumber); |
316 | 0 | item->attrs = bms_add_member(item->attrs, attno); |
317 | 0 | } |
318 | | |
319 | | /* still within the bytea */ |
320 | 0 | Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); |
321 | 0 | } |
322 | | |
323 | | /* we should have consumed the whole bytea exactly */ |
324 | 0 | Assert(tmp == ((char *) data + VARSIZE_ANY(data))); |
325 | | |
326 | 0 | return ndistinct; |
327 | 0 | } |
328 | | |
329 | | /* |
330 | | * pg_ndistinct_in |
331 | | * input routine for type pg_ndistinct |
332 | | * |
333 | | * pg_ndistinct is real enough to be a table column, but it has no |
334 | | * operations of its own, and disallows input (jus like pg_node_tree). |
335 | | */ |
336 | | Datum |
337 | | pg_ndistinct_in(PG_FUNCTION_ARGS) |
338 | 0 | { |
339 | 0 | ereport(ERROR, |
340 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
341 | 0 | errmsg("cannot accept a value of type %s", "pg_ndistinct"))); |
342 | | |
343 | 0 | PG_RETURN_VOID(); /* keep compiler quiet */ |
344 | 0 | } |
345 | | |
346 | | /* |
347 | | * pg_ndistinct |
348 | | * output routine for type pg_ndistinct |
349 | | * |
350 | | * Produces a human-readable representation of the value. |
351 | | */ |
352 | | Datum |
353 | | pg_ndistinct_out(PG_FUNCTION_ARGS) |
354 | 0 | { |
355 | 0 | bytea *data = PG_GETARG_BYTEA_PP(0); |
356 | 0 | MVNDistinct *ndist = statext_ndistinct_deserialize(data); |
357 | 0 | int i; |
358 | 0 | StringInfoData str; |
359 | |
|
360 | 0 | initStringInfo(&str); |
361 | 0 | appendStringInfoChar(&str, '{'); |
362 | |
|
363 | 0 | for (i = 0; i < ndist->nitems; i++) |
364 | 0 | { |
365 | 0 | MVNDistinctItem item = ndist->items[i]; |
366 | 0 | int x = -1; |
367 | 0 | bool first = true; |
368 | |
|
369 | 0 | if (i > 0) |
370 | 0 | appendStringInfoString(&str, ", "); |
371 | |
|
372 | 0 | while ((x = bms_next_member(item.attrs, x)) >= 0) |
373 | 0 | { |
374 | 0 | appendStringInfo(&str, "%s%d", first ? "\"" : ", ", x); |
375 | 0 | first = false; |
376 | 0 | } |
377 | 0 | appendStringInfo(&str, "\": %d", (int) item.ndistinct); |
378 | 0 | } |
379 | |
|
380 | 0 | appendStringInfoChar(&str, '}'); |
381 | |
|
382 | 0 | PG_RETURN_CSTRING(str.data); |
383 | 0 | } |
384 | | |
385 | | /* |
386 | | * pg_ndistinct_recv |
387 | | * binary input routine for type pg_ndistinct |
388 | | */ |
389 | | Datum |
390 | | pg_ndistinct_recv(PG_FUNCTION_ARGS) |
391 | 0 | { |
392 | 0 | ereport(ERROR, |
393 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
394 | 0 | errmsg("cannot accept a value of type %s", "pg_ndistinct"))); |
395 | | |
396 | 0 | PG_RETURN_VOID(); /* keep compiler quiet */ |
397 | 0 | } |
398 | | |
399 | | /* |
400 | | * pg_ndistinct_send |
401 | | * binary output routine for type pg_ndistinct |
402 | | * |
403 | | * n-distinct is serialized into a bytea value, so let's send that. |
404 | | */ |
405 | | Datum |
406 | | pg_ndistinct_send(PG_FUNCTION_ARGS) |
407 | 0 | { |
408 | 0 | return byteasend(fcinfo); |
409 | 0 | } |
410 | | |
411 | | /* |
412 | | * ndistinct_for_combination |
413 | | * Estimates number of distinct values in a combination of columns. |
414 | | * |
415 | | * This uses the same ndistinct estimator as compute_scalar_stats() in |
416 | | * ANALYZE, i.e., |
417 | | * n*d / (n - f1 + f1*n/N) |
418 | | * |
419 | | * except that instead of values in a single column we are dealing with |
420 | | * combination of multiple columns. |
421 | | */ |
422 | | static double |
423 | | ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows, |
424 | | VacAttrStats **stats, int k, int *combination) |
425 | 0 | { |
426 | 0 | int i, |
427 | 0 | j; |
428 | 0 | int f1, |
429 | 0 | cnt, |
430 | 0 | d; |
431 | 0 | bool *isnull; |
432 | 0 | Datum *values; |
433 | 0 | SortItem *items; |
434 | 0 | MultiSortSupport mss; |
435 | |
|
436 | 0 | mss = multi_sort_init(k); |
437 | | |
438 | | /* |
439 | | * In order to determine the number of distinct elements, create separate |
440 | | * values[]/isnull[] arrays with all the data we have, then sort them |
441 | | * using the specified column combination as dimensions. We could try to |
442 | | * sort in place, but it'd probably be more complex and bug-prone. |
443 | | */ |
444 | 0 | items = (SortItem *) palloc(numrows * sizeof(SortItem)); |
445 | 0 | values = (Datum *) palloc0(sizeof(Datum) * numrows * k); |
446 | 0 | isnull = (bool *) palloc0(sizeof(bool) * numrows * k); |
447 | |
|
448 | 0 | for (i = 0; i < numrows; i++) |
449 | 0 | { |
450 | 0 | items[i].values = &values[i * k]; |
451 | 0 | items[i].isnull = &isnull[i * k]; |
452 | 0 | } |
453 | | |
454 | | /* |
455 | | * For each dimension, set up sort-support and fill in the values from the |
456 | | * sample data. |
457 | | */ |
458 | 0 | for (i = 0; i < k; i++) |
459 | 0 | { |
460 | 0 | VacAttrStats *colstat = stats[combination[i]]; |
461 | 0 | TypeCacheEntry *type; |
462 | |
|
463 | 0 | type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); |
464 | 0 | if (type->lt_opr == InvalidOid) /* shouldn't happen */ |
465 | 0 | elog(ERROR, "cache lookup failed for ordering operator for type %u", |
466 | 0 | colstat->attrtypid); |
467 | | |
468 | | /* prepare the sort function for this dimension */ |
469 | 0 | multi_sort_add_dimension(mss, i, type->lt_opr); |
470 | | |
471 | | /* accumulate all the data for this dimension into the arrays */ |
472 | 0 | for (j = 0; j < numrows; j++) |
473 | 0 | { |
474 | 0 | items[j].values[i] = |
475 | 0 | heap_getattr(rows[j], |
476 | 0 | colstat->attr->attnum, |
477 | 0 | colstat->tupDesc, |
478 | 0 | &items[j].isnull[i]); |
479 | 0 | } |
480 | 0 | } |
481 | | |
482 | | /* We can sort the array now ... */ |
483 | 0 | qsort_arg((void *) items, numrows, sizeof(SortItem), |
484 | 0 | multi_sort_compare, mss); |
485 | | |
486 | | /* ... and count the number of distinct combinations */ |
487 | |
|
488 | 0 | f1 = 0; |
489 | 0 | cnt = 1; |
490 | 0 | d = 1; |
491 | 0 | for (i = 1; i < numrows; i++) |
492 | 0 | { |
493 | 0 | if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0) |
494 | 0 | { |
495 | 0 | if (cnt == 1) |
496 | 0 | f1 += 1; |
497 | |
|
498 | 0 | d++; |
499 | 0 | cnt = 0; |
500 | 0 | } |
501 | |
|
502 | 0 | cnt += 1; |
503 | 0 | } |
504 | |
|
505 | 0 | if (cnt == 1) |
506 | 0 | f1 += 1; |
507 | |
|
508 | 0 | return estimate_ndistinct(totalrows, numrows, d, f1); |
509 | 0 | } |
510 | | |
511 | | /* The Duj1 estimator (already used in analyze.c). */ |
512 | | static double |
513 | | estimate_ndistinct(double totalrows, int numrows, int d, int f1) |
514 | 0 | { |
515 | 0 | double numer, |
516 | 0 | denom, |
517 | 0 | ndistinct; |
518 | |
|
519 | 0 | numer = (double) numrows * (double) d; |
520 | |
|
521 | 0 | denom = (double) (numrows - f1) + |
522 | 0 | (double) f1 * (double) numrows / totalrows; |
523 | |
|
524 | 0 | ndistinct = numer / denom; |
525 | | |
526 | | /* Clamp to sane range in case of roundoff error */ |
527 | 0 | if (ndistinct < (double) d) |
528 | 0 | ndistinct = (double) d; |
529 | |
|
530 | 0 | if (ndistinct > totalrows) |
531 | 0 | ndistinct = totalrows; |
532 | |
|
533 | 0 | return floor(ndistinct + 0.5); |
534 | 0 | } |
535 | | |
536 | | /* |
537 | | * n_choose_k |
538 | | * computes binomial coefficients using an algorithm that is both |
539 | | * efficient and prevents overflows |
540 | | */ |
541 | | static int |
542 | | n_choose_k(int n, int k) |
543 | 0 | { |
544 | 0 | int d, |
545 | 0 | r; |
546 | |
|
547 | 0 | Assert((k > 0) && (n >= k)); |
548 | | |
549 | | /* use symmetry of the binomial coefficients */ |
550 | 0 | k = Min(k, n - k); |
551 | |
|
552 | 0 | r = 1; |
553 | 0 | for (d = 1; d <= k; ++d) |
554 | 0 | { |
555 | 0 | r *= n--; |
556 | 0 | r /= d; |
557 | 0 | } |
558 | |
|
559 | 0 | return r; |
560 | 0 | } |
561 | | |
562 | | /* |
563 | | * num_combinations |
564 | | * number of combinations, excluding single-value combinations |
565 | | */ |
566 | | static int |
567 | | num_combinations(int n) |
568 | 0 | { |
569 | 0 | int k; |
570 | 0 | int ncombs = 1; |
571 | |
|
572 | 0 | for (k = 1; k <= n; k++) |
573 | 0 | ncombs *= 2; |
574 | |
|
575 | 0 | ncombs -= (n + 1); |
576 | |
|
577 | 0 | return ncombs; |
578 | 0 | } |
579 | | |
580 | | /* |
581 | | * generator_init |
582 | | * initialize the generator of combinations |
583 | | * |
584 | | * The generator produces combinations of K elements in the interval (0..N). |
585 | | * We prebuild all the combinations in this method, which is simpler than |
586 | | * generating them on the fly. |
587 | | */ |
588 | | static CombinationGenerator * |
589 | | generator_init(int n, int k) |
590 | 0 | { |
591 | 0 | CombinationGenerator *state; |
592 | |
|
593 | 0 | Assert((n >= k) && (k > 0)); |
594 | | |
595 | | /* allocate the generator state as a single chunk of memory */ |
596 | 0 | state = (CombinationGenerator *) palloc(sizeof(CombinationGenerator)); |
597 | |
|
598 | 0 | state->ncombinations = n_choose_k(n, k); |
599 | | |
600 | | /* pre-allocate space for all combinations */ |
601 | 0 | state->combinations = (int *) palloc(sizeof(int) * k * state->ncombinations); |
602 | |
|
603 | 0 | state->current = 0; |
604 | 0 | state->k = k; |
605 | 0 | state->n = n; |
606 | | |
607 | | /* now actually pre-generate all the combinations of K elements */ |
608 | 0 | generate_combinations(state); |
609 | | |
610 | | /* make sure we got the expected number of combinations */ |
611 | 0 | Assert(state->current == state->ncombinations); |
612 | | |
613 | | /* reset the number, so we start with the first one */ |
614 | 0 | state->current = 0; |
615 | |
|
616 | 0 | return state; |
617 | 0 | } |
618 | | |
619 | | /* |
620 | | * generator_next |
621 | | * returns the next combination from the prebuilt list |
622 | | * |
623 | | * Returns a combination of K array indexes (0 .. N), as specified to |
624 | | * generator_init), or NULL when there are no more combination. |
625 | | */ |
626 | | static int * |
627 | | generator_next(CombinationGenerator *state) |
628 | 0 | { |
629 | 0 | if (state->current == state->ncombinations) |
630 | 0 | return NULL; |
631 | | |
632 | 0 | return &state->combinations[state->k * state->current++]; |
633 | 0 | } |
634 | | |
635 | | /* |
636 | | * generator_free |
637 | | * free the internal state of the generator |
638 | | * |
639 | | * Releases the generator internal state (pre-built combinations). |
640 | | */ |
641 | | static void |
642 | | generator_free(CombinationGenerator *state) |
643 | 0 | { |
644 | 0 | pfree(state->combinations); |
645 | 0 | pfree(state); |
646 | 0 | } |
647 | | |
648 | | /* |
649 | | * generate_combinations_recurse |
650 | | * given a prefix, generate all possible combinations |
651 | | * |
652 | | * Given a prefix (first few elements of the combination), generate following |
653 | | * elements recursively. We generate the combinations in lexicographic order, |
654 | | * which eliminates permutations of the same combination. |
655 | | */ |
656 | | static void |
657 | | generate_combinations_recurse(CombinationGenerator *state, |
658 | | int index, int start, int *current) |
659 | 0 | { |
660 | | /* If we haven't filled all the elements, simply recurse. */ |
661 | 0 | if (index < state->k) |
662 | 0 | { |
663 | 0 | int i; |
664 | | |
665 | | /* |
666 | | * The values have to be in ascending order, so make sure we start |
667 | | * with the value passed by parameter. |
668 | | */ |
669 | |
|
670 | 0 | for (i = start; i < state->n; i++) |
671 | 0 | { |
672 | 0 | current[index] = i; |
673 | 0 | generate_combinations_recurse(state, (index + 1), (i + 1), current); |
674 | 0 | } |
675 | |
|
676 | 0 | return; |
677 | 0 | } |
678 | 0 | else |
679 | 0 | { |
680 | | /* we got a valid combination, add it to the array */ |
681 | 0 | memcpy(&state->combinations[(state->k * state->current)], |
682 | 0 | current, state->k * sizeof(int)); |
683 | 0 | state->current++; |
684 | 0 | } |
685 | 0 | } |
686 | | |
687 | | /* |
688 | | * generate_combinations |
689 | | * generate all k-combinations of N elements |
690 | | */ |
691 | | static void |
692 | | generate_combinations(CombinationGenerator *state) |
693 | 0 | { |
694 | 0 | int *current = (int *) palloc0(sizeof(int) * state->k); |
695 | |
|
696 | 0 | generate_combinations_recurse(state, 0, 0, current); |
697 | |
|
698 | 0 | pfree(current); |
699 | 0 | } |