/Users/deen/code/yugabyte-db/src/postgres/src/backend/statistics/dependencies.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * dependencies.c |
4 | | * POSTGRES functional dependencies |
5 | | * |
6 | | * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * IDENTIFICATION |
10 | | * src/backend/statistics/dependencies.c |
11 | | * |
12 | | *------------------------------------------------------------------------- |
13 | | */ |
14 | | #include "postgres.h" |
15 | | |
16 | | #include "access/htup_details.h" |
17 | | #include "access/sysattr.h" |
18 | | #include "catalog/pg_operator.h" |
19 | | #include "catalog/pg_statistic_ext.h" |
20 | | #include "lib/stringinfo.h" |
21 | | #include "optimizer/clauses.h" |
22 | | #include "optimizer/cost.h" |
23 | | #include "optimizer/var.h" |
24 | | #include "nodes/nodes.h" |
25 | | #include "nodes/relation.h" |
26 | | #include "statistics/extended_stats_internal.h" |
27 | | #include "statistics/statistics.h" |
28 | | #include "utils/bytea.h" |
29 | | #include "utils/fmgroids.h" |
30 | | #include "utils/fmgrprotos.h" |
31 | | #include "utils/lsyscache.h" |
32 | | #include "utils/syscache.h" |
33 | | #include "utils/typcache.h" |
34 | | |
35 | | /* |
36 | | * Internal state for DependencyGenerator of dependencies. Dependencies are similar to |
37 | | * k-permutations of n elements, except that the order does not matter for the |
38 | | * first (k-1) elements. That is, (a,b=>c) and (b,a=>c) are equivalent. |
39 | | */ |
40 | | typedef struct DependencyGeneratorData |
41 | | { |
42 | | int k; /* size of the dependency */ |
43 | | int n; /* number of possible attributes */ |
44 | | int current; /* next dependency to return (index) */ |
45 | | AttrNumber ndependencies; /* number of dependencies generated */ |
46 | | AttrNumber *dependencies; /* array of pre-generated dependencies */ |
47 | | } DependencyGeneratorData; |
48 | | |
49 | | typedef DependencyGeneratorData *DependencyGenerator; |
50 | | |
51 | | static void generate_dependencies_recurse(DependencyGenerator state, |
52 | | int index, AttrNumber start, AttrNumber *current); |
53 | | static void generate_dependencies(DependencyGenerator state); |
54 | | static DependencyGenerator DependencyGenerator_init(int n, int k); |
55 | | static void DependencyGenerator_free(DependencyGenerator state); |
56 | | static AttrNumber *DependencyGenerator_next(DependencyGenerator state); |
57 | | static double dependency_degree(int numrows, HeapTuple *rows, int k, |
58 | | AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs); |
59 | | static bool dependency_is_fully_matched(MVDependency *dependency, |
60 | | Bitmapset *attnums); |
61 | | static bool dependency_implies_attribute(MVDependency *dependency, |
62 | | AttrNumber attnum); |
63 | | static bool dependency_is_compatible_clause(Node *clause, Index relid, |
64 | | AttrNumber *attnum); |
65 | | static MVDependency *find_strongest_dependency(StatisticExtInfo *stats, |
66 | | MVDependencies *dependencies, |
67 | | Bitmapset *attnums); |
68 | | |
69 | | static void |
70 | | generate_dependencies_recurse(DependencyGenerator state, int index, |
71 | | AttrNumber start, AttrNumber *current) |
72 | 0 | { |
73 | | /* |
74 | | * The generator handles the first (k-1) elements differently from the |
75 | | * last element. |
76 | | */ |
77 | 0 | if (index < (state->k - 1)) |
78 | 0 | { |
79 | 0 | AttrNumber i; |
80 | | |
81 | | /* |
82 | | * The first (k-1) values have to be in ascending order, which we |
83 | | * generate recursively. |
84 | | */ |
85 | |
|
86 | 0 | for (i = start; i < state->n; i++) |
87 | 0 | { |
88 | 0 | current[index] = i; |
89 | 0 | generate_dependencies_recurse(state, (index + 1), (i + 1), current); |
90 | 0 | } |
91 | 0 | } |
92 | 0 | else |
93 | 0 | { |
94 | 0 | int i; |
95 | | |
96 | | /* |
97 | | * the last element is the implied value, which does not respect the |
98 | | * ascending order. We just need to check that the value is not in the |
99 | | * first (k-1) elements. |
100 | | */ |
101 | |
|
102 | 0 | for (i = 0; i < state->n; i++) |
103 | 0 | { |
104 | 0 | int j; |
105 | 0 | bool match = false; |
106 | |
|
107 | 0 | current[index] = i; |
108 | |
|
109 | 0 | for (j = 0; j < index; j++) |
110 | 0 | { |
111 | 0 | if (current[j] == i) |
112 | 0 | { |
113 | 0 | match = true; |
114 | 0 | break; |
115 | 0 | } |
116 | 0 | } |
117 | | |
118 | | /* |
119 | | * If the value is not found in the first part of the dependency, |
120 | | * we're done. |
121 | | */ |
122 | 0 | if (!match) |
123 | 0 | { |
124 | 0 | state->dependencies = (AttrNumber *) repalloc(state->dependencies, |
125 | 0 | state->k * (state->ndependencies + 1) * sizeof(AttrNumber)); |
126 | 0 | memcpy(&state->dependencies[(state->k * state->ndependencies)], |
127 | 0 | current, state->k * sizeof(AttrNumber)); |
128 | 0 | state->ndependencies++; |
129 | 0 | } |
130 | 0 | } |
131 | 0 | } |
132 | 0 | } |
133 | | |
134 | | /* generate all dependencies (k-permutations of n elements) */ |
135 | | static void |
136 | | generate_dependencies(DependencyGenerator state) |
137 | 0 | { |
138 | 0 | AttrNumber *current = (AttrNumber *) palloc0(sizeof(AttrNumber) * state->k); |
139 | |
|
140 | 0 | generate_dependencies_recurse(state, 0, 0, current); |
141 | |
|
142 | 0 | pfree(current); |
143 | 0 | } |
144 | | |
145 | | /* |
146 | | * initialize the DependencyGenerator of variations, and prebuild the variations |
147 | | * |
148 | | * This pre-builds all the variations. We could also generate them in |
149 | | * DependencyGenerator_next(), but this seems simpler. |
150 | | */ |
151 | | static DependencyGenerator |
152 | | DependencyGenerator_init(int n, int k) |
153 | 0 | { |
154 | 0 | DependencyGenerator state; |
155 | |
|
156 | 0 | Assert((n >= k) && (k > 0)); |
157 | | |
158 | | /* allocate the DependencyGenerator state */ |
159 | 0 | state = (DependencyGenerator) palloc0(sizeof(DependencyGeneratorData)); |
160 | 0 | state->dependencies = (AttrNumber *) palloc(k * sizeof(AttrNumber)); |
161 | |
|
162 | 0 | state->ndependencies = 0; |
163 | 0 | state->current = 0; |
164 | 0 | state->k = k; |
165 | 0 | state->n = n; |
166 | | |
167 | | /* now actually pre-generate all the variations */ |
168 | 0 | generate_dependencies(state); |
169 | |
|
170 | 0 | return state; |
171 | 0 | } |
172 | | |
173 | | /* free the DependencyGenerator state */ |
174 | | static void |
175 | | DependencyGenerator_free(DependencyGenerator state) |
176 | 0 | { |
177 | 0 | pfree(state->dependencies); |
178 | 0 | pfree(state); |
179 | |
|
180 | 0 | } |
181 | | |
182 | | /* generate next combination */ |
183 | | static AttrNumber * |
184 | | DependencyGenerator_next(DependencyGenerator state) |
185 | 0 | { |
186 | 0 | if (state->current == state->ndependencies) |
187 | 0 | return NULL; |
188 | | |
189 | 0 | return &state->dependencies[state->k * state->current++]; |
190 | 0 | } |
191 | | |
192 | | |
193 | | /* |
194 | | * validates functional dependency on the data |
195 | | * |
196 | | * An actual work horse of detecting functional dependencies. Given a variation |
197 | | * of k attributes, it checks that the first (k-1) are sufficient to determine |
198 | | * the last one. |
199 | | */ |
200 | | static double |
201 | | dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, |
202 | | VacAttrStats **stats, Bitmapset *attrs) |
203 | 0 | { |
204 | 0 | int i, |
205 | 0 | j; |
206 | 0 | int nvalues = numrows * k; |
207 | 0 | MultiSortSupport mss; |
208 | 0 | SortItem *items; |
209 | 0 | Datum *values; |
210 | 0 | bool *isnull; |
211 | 0 | int *attnums; |
212 | | |
213 | | /* counters valid within a group */ |
214 | 0 | int group_size = 0; |
215 | 0 | int n_violations = 0; |
216 | | |
217 | | /* total number of rows supporting (consistent with) the dependency */ |
218 | 0 | int n_supporting_rows = 0; |
219 | | |
220 | | /* Make sure we have at least two input attributes. */ |
221 | 0 | Assert(k >= 2); |
222 | | |
223 | | /* sort info for all attributes columns */ |
224 | 0 | mss = multi_sort_init(k); |
225 | | |
226 | | /* data for the sort */ |
227 | 0 | items = (SortItem *) palloc(numrows * sizeof(SortItem)); |
228 | 0 | values = (Datum *) palloc(sizeof(Datum) * nvalues); |
229 | 0 | isnull = (bool *) palloc(sizeof(bool) * nvalues); |
230 | | |
231 | | /* fix the pointers to values/isnull */ |
232 | 0 | for (i = 0; i < numrows; i++) |
233 | 0 | { |
234 | 0 | items[i].values = &values[i * k]; |
235 | 0 | items[i].isnull = &isnull[i * k]; |
236 | 0 | } |
237 | | |
238 | | /* |
239 | | * Transform the bms into an array, to make accessing i-th member easier. |
240 | | */ |
241 | 0 | attnums = (int *) palloc(sizeof(int) * bms_num_members(attrs)); |
242 | 0 | i = 0; |
243 | 0 | j = -1; |
244 | 0 | while ((j = bms_next_member(attrs, j)) >= 0) |
245 | 0 | attnums[i++] = j; |
246 | | |
247 | | /* |
248 | | * Verify the dependency (a,b,...)->z, using a rather simple algorithm: |
249 | | * |
250 | | * (a) sort the data lexicographically |
251 | | * |
252 | | * (b) split the data into groups by first (k-1) columns |
253 | | * |
254 | | * (c) for each group count different values in the last column |
255 | | */ |
256 | | |
257 | | /* prepare the sort function for the first dimension, and SortItem array */ |
258 | 0 | for (i = 0; i < k; i++) |
259 | 0 | { |
260 | 0 | VacAttrStats *colstat = stats[dependency[i]]; |
261 | 0 | TypeCacheEntry *type; |
262 | |
|
263 | 0 | type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR); |
264 | 0 | if (type->lt_opr == InvalidOid) /* shouldn't happen */ |
265 | 0 | elog(ERROR, "cache lookup failed for ordering operator for type %u", |
266 | 0 | colstat->attrtypid); |
267 | | |
268 | | /* prepare the sort function for this dimension */ |
269 | 0 | multi_sort_add_dimension(mss, i, type->lt_opr); |
270 | | |
271 | | /* accumulate all the data for both columns into an array and sort it */ |
272 | 0 | for (j = 0; j < numrows; j++) |
273 | 0 | { |
274 | 0 | items[j].values[i] = |
275 | 0 | heap_getattr(rows[j], attnums[dependency[i]], |
276 | 0 | stats[i]->tupDesc, &items[j].isnull[i]); |
277 | 0 | } |
278 | 0 | } |
279 | | |
280 | | /* sort the items so that we can detect the groups */ |
281 | 0 | qsort_arg((void *) items, numrows, sizeof(SortItem), |
282 | 0 | multi_sort_compare, mss); |
283 | | |
284 | | /* |
285 | | * Walk through the sorted array, split it into rows according to the |
286 | | * first (k-1) columns. If there's a single value in the last column, we |
287 | | * count the group as 'supporting' the functional dependency. Otherwise we |
288 | | * count it as contradicting. |
289 | | */ |
290 | | |
291 | | /* start with the first row forming a group */ |
292 | 0 | group_size = 1; |
293 | | |
294 | | /* loop 1 beyond the end of the array so that we count the final group */ |
295 | 0 | for (i = 1; i <= numrows; i++) |
296 | 0 | { |
297 | | /* |
298 | | * Check if the group ended, which may be either because we processed |
299 | | * all the items (i==numrows), or because the i-th item is not equal |
300 | | * to the preceding one. |
301 | | */ |
302 | 0 | if (i == numrows || |
303 | 0 | multi_sort_compare_dims(0, k - 2, &items[i - 1], &items[i], mss) != 0) |
304 | 0 | { |
305 | | /* |
306 | | * If no violations were found in the group then track the rows of |
307 | | * the group as supporting the functional dependency. |
308 | | */ |
309 | 0 | if (n_violations == 0) |
310 | 0 | n_supporting_rows += group_size; |
311 | | |
312 | | /* Reset counters for the new group */ |
313 | 0 | n_violations = 0; |
314 | 0 | group_size = 1; |
315 | 0 | continue; |
316 | 0 | } |
317 | | /* first columns match, but the last one does not (so contradicting) */ |
318 | 0 | else if (multi_sort_compare_dim(k - 1, &items[i - 1], &items[i], mss) != 0) |
319 | 0 | n_violations++; |
320 | |
|
321 | 0 | group_size++; |
322 | 0 | } |
323 | |
|
324 | 0 | pfree(items); |
325 | 0 | pfree(values); |
326 | 0 | pfree(isnull); |
327 | 0 | pfree(mss); |
328 | | |
329 | | /* Compute the 'degree of validity' as (supporting/total). */ |
330 | 0 | return (n_supporting_rows * 1.0 / numrows); |
331 | 0 | } |
332 | | |
333 | | /* |
334 | | * detects functional dependencies between groups of columns |
335 | | * |
336 | | * Generates all possible subsets of columns (variations) and computes |
337 | | * the degree of validity for each one. For example when creating statistics |
338 | | * on three columns (a,b,c) there are 9 possible dependencies |
339 | | * |
340 | | * two columns three columns |
341 | | * ----------- ------------- |
342 | | * (a) -> b (a,b) -> c |
343 | | * (a) -> c (a,c) -> b |
344 | | * (b) -> a (b,c) -> a |
345 | | * (b) -> c |
346 | | * (c) -> a |
347 | | * (c) -> b |
348 | | */ |
349 | | MVDependencies * |
350 | | statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, |
351 | | VacAttrStats **stats) |
352 | 0 | { |
353 | 0 | int i, |
354 | 0 | j, |
355 | 0 | k; |
356 | 0 | int numattrs; |
357 | 0 | int *attnums; |
358 | | |
359 | | /* result */ |
360 | 0 | MVDependencies *dependencies = NULL; |
361 | |
|
362 | 0 | numattrs = bms_num_members(attrs); |
363 | | |
364 | | /* |
365 | | * Transform the bms into an array, to make accessing i-th member easier. |
366 | | */ |
367 | 0 | attnums = palloc(sizeof(int) * bms_num_members(attrs)); |
368 | 0 | i = 0; |
369 | 0 | j = -1; |
370 | 0 | while ((j = bms_next_member(attrs, j)) >= 0) |
371 | 0 | attnums[i++] = j; |
372 | |
|
373 | 0 | Assert(numattrs >= 2); |
374 | | |
375 | | /* |
376 | | * We'll try build functional dependencies starting from the smallest ones |
377 | | * covering just 2 columns, to the largest ones, covering all columns |
378 | | * included in the statistics object. We start from the smallest ones |
379 | | * because we want to be able to skip already implied ones. |
380 | | */ |
381 | 0 | for (k = 2; k <= numattrs; k++) |
382 | 0 | { |
383 | 0 | AttrNumber *dependency; /* array with k elements */ |
384 | | |
385 | | /* prepare a DependencyGenerator of variation */ |
386 | 0 | DependencyGenerator DependencyGenerator = DependencyGenerator_init(numattrs, k); |
387 | | |
388 | | /* generate all possible variations of k values (out of n) */ |
389 | 0 | while ((dependency = DependencyGenerator_next(DependencyGenerator))) |
390 | 0 | { |
391 | 0 | double degree; |
392 | 0 | MVDependency *d; |
393 | | |
394 | | /* compute how valid the dependency seems */ |
395 | 0 | degree = dependency_degree(numrows, rows, k, dependency, stats, attrs); |
396 | | |
397 | | /* |
398 | | * if the dependency seems entirely invalid, don't store it |
399 | | */ |
400 | 0 | if (degree == 0.0) |
401 | 0 | continue; |
402 | | |
403 | 0 | d = (MVDependency *) palloc0(offsetof(MVDependency, attributes) |
404 | 0 | + k * sizeof(AttrNumber)); |
405 | | |
406 | | /* copy the dependency (and keep the indexes into stxkeys) */ |
407 | 0 | d->degree = degree; |
408 | 0 | d->nattributes = k; |
409 | 0 | for (i = 0; i < k; i++) |
410 | 0 | d->attributes[i] = attnums[dependency[i]]; |
411 | | |
412 | | /* initialize the list of dependencies */ |
413 | 0 | if (dependencies == NULL) |
414 | 0 | { |
415 | 0 | dependencies |
416 | 0 | = (MVDependencies *) palloc0(sizeof(MVDependencies)); |
417 | |
|
418 | 0 | dependencies->magic = STATS_DEPS_MAGIC; |
419 | 0 | dependencies->type = STATS_DEPS_TYPE_BASIC; |
420 | 0 | dependencies->ndeps = 0; |
421 | 0 | } |
422 | |
|
423 | 0 | dependencies->ndeps++; |
424 | 0 | dependencies = (MVDependencies *) repalloc(dependencies, |
425 | 0 | offsetof(MVDependencies, deps) |
426 | 0 | + dependencies->ndeps * sizeof(MVDependency)); |
427 | |
|
428 | 0 | dependencies->deps[dependencies->ndeps - 1] = d; |
429 | 0 | } |
430 | | |
431 | | /* |
432 | | * we're done with variations of k elements, so free the |
433 | | * DependencyGenerator |
434 | | */ |
435 | 0 | DependencyGenerator_free(DependencyGenerator); |
436 | 0 | } |
437 | |
|
438 | 0 | return dependencies; |
439 | 0 | } |
440 | | |
441 | | |
442 | | /* |
443 | | * Serialize list of dependencies into a bytea value. |
444 | | */ |
445 | | bytea * |
446 | | statext_dependencies_serialize(MVDependencies *dependencies) |
447 | 0 | { |
448 | 0 | int i; |
449 | 0 | bytea *output; |
450 | 0 | char *tmp; |
451 | 0 | Size len; |
452 | | |
453 | | /* we need to store ndeps, with a number of attributes for each one */ |
454 | 0 | len = VARHDRSZ + SizeOfDependencies |
455 | 0 | + dependencies->ndeps * SizeOfDependency; |
456 | | |
457 | | /* and also include space for the actual attribute numbers and degrees */ |
458 | 0 | for (i = 0; i < dependencies->ndeps; i++) |
459 | 0 | len += (sizeof(AttrNumber) * dependencies->deps[i]->nattributes); |
460 | |
|
461 | 0 | output = (bytea *) palloc0(len); |
462 | 0 | SET_VARSIZE(output, len); |
463 | |
|
464 | 0 | tmp = VARDATA(output); |
465 | | |
466 | | /* Store the base struct values (magic, type, ndeps) */ |
467 | 0 | memcpy(tmp, &dependencies->magic, sizeof(uint32)); |
468 | 0 | tmp += sizeof(uint32); |
469 | 0 | memcpy(tmp, &dependencies->type, sizeof(uint32)); |
470 | 0 | tmp += sizeof(uint32); |
471 | 0 | memcpy(tmp, &dependencies->ndeps, sizeof(uint32)); |
472 | 0 | tmp += sizeof(uint32); |
473 | | |
474 | | /* store number of attributes and attribute numbers for each dependency */ |
475 | 0 | for (i = 0; i < dependencies->ndeps; i++) |
476 | 0 | { |
477 | 0 | MVDependency *d = dependencies->deps[i]; |
478 | |
|
479 | 0 | memcpy(tmp, d, SizeOfDependency); |
480 | 0 | tmp += SizeOfDependency; |
481 | |
|
482 | 0 | memcpy(tmp, d->attributes, sizeof(AttrNumber) * d->nattributes); |
483 | 0 | tmp += sizeof(AttrNumber) * d->nattributes; |
484 | |
|
485 | 0 | Assert(tmp <= ((char *) output + len)); |
486 | 0 | } |
487 | |
|
488 | 0 | return output; |
489 | 0 | } |
490 | | |
491 | | /* |
492 | | * Reads serialized dependencies into MVDependencies structure. |
493 | | */ |
494 | | MVDependencies * |
495 | | statext_dependencies_deserialize(bytea *data) |
496 | 0 | { |
497 | 0 | int i; |
498 | 0 | Size min_expected_size; |
499 | 0 | MVDependencies *dependencies; |
500 | 0 | char *tmp; |
501 | |
|
502 | 0 | if (data == NULL) |
503 | 0 | return NULL; |
504 | | |
505 | 0 | if (VARSIZE_ANY_EXHDR(data) < SizeOfDependencies) |
506 | 0 | elog(ERROR, "invalid MVDependencies size %zd (expected at least %zd)", |
507 | 0 | VARSIZE_ANY_EXHDR(data), SizeOfDependencies); |
508 | | |
509 | | /* read the MVDependencies header */ |
510 | 0 | dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies)); |
511 | | |
512 | | /* initialize pointer to the data part (skip the varlena header) */ |
513 | 0 | tmp = VARDATA_ANY(data); |
514 | | |
515 | | /* read the header fields and perform basic sanity checks */ |
516 | 0 | memcpy(&dependencies->magic, tmp, sizeof(uint32)); |
517 | 0 | tmp += sizeof(uint32); |
518 | 0 | memcpy(&dependencies->type, tmp, sizeof(uint32)); |
519 | 0 | tmp += sizeof(uint32); |
520 | 0 | memcpy(&dependencies->ndeps, tmp, sizeof(uint32)); |
521 | 0 | tmp += sizeof(uint32); |
522 | |
|
523 | 0 | if (dependencies->magic != STATS_DEPS_MAGIC) |
524 | 0 | elog(ERROR, "invalid dependency magic %d (expected %d)", |
525 | 0 | dependencies->magic, STATS_DEPS_MAGIC); |
526 | |
|
527 | 0 | if (dependencies->type != STATS_DEPS_TYPE_BASIC) |
528 | 0 | elog(ERROR, "invalid dependency type %d (expected %d)", |
529 | 0 | dependencies->type, STATS_DEPS_TYPE_BASIC); |
530 | |
|
531 | 0 | if (dependencies->ndeps == 0) |
532 | 0 | ereport(ERROR, |
533 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
534 | 0 | errmsg("invalid zero-length item array in MVDependencies"))); |
535 | | |
536 | | /* what minimum bytea size do we expect for those parameters */ |
537 | 0 | min_expected_size = SizeOfDependencies + |
538 | 0 | dependencies->ndeps * (SizeOfDependency + |
539 | 0 | sizeof(AttrNumber) * 2); |
540 | |
|
541 | 0 | if (VARSIZE_ANY_EXHDR(data) < min_expected_size) |
542 | 0 | elog(ERROR, "invalid dependencies size %zd (expected at least %zd)", |
543 | 0 | VARSIZE_ANY_EXHDR(data), min_expected_size); |
544 | | |
545 | | /* allocate space for the MCV items */ |
546 | 0 | dependencies = repalloc(dependencies, offsetof(MVDependencies, deps) |
547 | 0 | + (dependencies->ndeps * sizeof(MVDependency *))); |
548 | |
|
549 | 0 | for (i = 0; i < dependencies->ndeps; i++) |
550 | 0 | { |
551 | 0 | double degree; |
552 | 0 | AttrNumber k; |
553 | 0 | MVDependency *d; |
554 | | |
555 | | /* degree of validity */ |
556 | 0 | memcpy(°ree, tmp, sizeof(double)); |
557 | 0 | tmp += sizeof(double); |
558 | | |
559 | | /* number of attributes */ |
560 | 0 | memcpy(&k, tmp, sizeof(AttrNumber)); |
561 | 0 | tmp += sizeof(AttrNumber); |
562 | | |
563 | | /* is the number of attributes valid? */ |
564 | 0 | Assert((k >= 2) && (k <= STATS_MAX_DIMENSIONS)); |
565 | | |
566 | | /* now that we know the number of attributes, allocate the dependency */ |
567 | 0 | d = (MVDependency *) palloc0(offsetof(MVDependency, attributes) |
568 | 0 | + (k * sizeof(AttrNumber))); |
569 | |
|
570 | 0 | d->degree = degree; |
571 | 0 | d->nattributes = k; |
572 | | |
573 | | /* copy attribute numbers */ |
574 | 0 | memcpy(d->attributes, tmp, sizeof(AttrNumber) * d->nattributes); |
575 | 0 | tmp += sizeof(AttrNumber) * d->nattributes; |
576 | |
|
577 | 0 | dependencies->deps[i] = d; |
578 | | |
579 | | /* still within the bytea */ |
580 | 0 | Assert(tmp <= ((char *) data + VARSIZE_ANY(data))); |
581 | 0 | } |
582 | | |
583 | | /* we should have consumed the whole bytea exactly */ |
584 | 0 | Assert(tmp == ((char *) data + VARSIZE_ANY(data))); |
585 | |
|
586 | 0 | return dependencies; |
587 | 0 | } |
588 | | |
589 | | /* |
590 | | * dependency_is_fully_matched |
591 | | * checks that a functional dependency is fully matched given clauses on |
592 | | * attributes (assuming the clauses are suitable equality clauses) |
593 | | */ |
594 | | static bool |
595 | | dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums) |
596 | 0 | { |
597 | 0 | int j; |
598 | | |
599 | | /* |
600 | | * Check that the dependency actually is fully covered by clauses. We have |
601 | | * to translate all attribute numbers, as those are referenced |
602 | | */ |
603 | 0 | for (j = 0; j < dependency->nattributes; j++) |
604 | 0 | { |
605 | 0 | int attnum = dependency->attributes[j]; |
606 | |
|
607 | 0 | if (!bms_is_member(attnum, attnums)) |
608 | 0 | return false; |
609 | 0 | } |
610 | |
|
611 | 0 | return true; |
612 | 0 | } |
613 | | |
614 | | /* |
615 | | * dependency_implies_attribute |
616 | | * check that the attnum matches is implied by the functional dependency |
617 | | */ |
618 | | static bool |
619 | | dependency_implies_attribute(MVDependency *dependency, AttrNumber attnum) |
620 | 0 | { |
621 | 0 | if (attnum == dependency->attributes[dependency->nattributes - 1]) |
622 | 0 | return true; |
623 | | |
624 | 0 | return false; |
625 | 0 | } |
626 | | |
627 | | /* |
628 | | * statext_dependencies_load |
629 | | * Load the functional dependencies for the indicated pg_statistic_ext tuple |
630 | | */ |
631 | | MVDependencies * |
632 | | statext_dependencies_load(Oid mvoid) |
633 | 0 | { |
634 | 0 | MVDependencies *result; |
635 | 0 | bool isnull; |
636 | 0 | Datum deps; |
637 | 0 | HeapTuple htup; |
638 | |
|
639 | 0 | htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(mvoid)); |
640 | 0 | if (!HeapTupleIsValid(htup)) |
641 | 0 | elog(ERROR, "cache lookup failed for statistics object %u", mvoid); |
642 | |
|
643 | 0 | deps = SysCacheGetAttr(STATEXTOID, htup, |
644 | 0 | Anum_pg_statistic_ext_stxdependencies, &isnull); |
645 | 0 | if (isnull) |
646 | 0 | elog(ERROR, |
647 | 0 | "requested statistic kind \"%c\" is not yet built for statistics object %u", |
648 | 0 | STATS_EXT_DEPENDENCIES, mvoid); |
649 | |
|
650 | 0 | result = statext_dependencies_deserialize(DatumGetByteaPP(deps)); |
651 | |
|
652 | 0 | ReleaseSysCache(htup); |
653 | |
|
654 | 0 | return result; |
655 | 0 | } |
656 | | |
657 | | /* |
658 | | * pg_dependencies_in - input routine for type pg_dependencies. |
659 | | * |
660 | | * pg_dependencies is real enough to be a table column, but it has no operations |
661 | | * of its own, and disallows input too |
662 | | */ |
663 | | Datum |
664 | | pg_dependencies_in(PG_FUNCTION_ARGS) |
665 | 0 | { |
666 | | /* |
667 | | * pg_node_list stores the data in binary form and parsing text input is |
668 | | * not needed, so disallow this. |
669 | | */ |
670 | 0 | ereport(ERROR, |
671 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
672 | 0 | errmsg("cannot accept a value of type %s", "pg_dependencies"))); |
673 | |
|
674 | 0 | PG_RETURN_VOID(); /* keep compiler quiet */ |
675 | 0 | } |
676 | | |
677 | | /* |
678 | | * pg_dependencies - output routine for type pg_dependencies. |
679 | | */ |
680 | | Datum |
681 | | pg_dependencies_out(PG_FUNCTION_ARGS) |
682 | 0 | { |
683 | 0 | bytea *data = PG_GETARG_BYTEA_PP(0); |
684 | 0 | MVDependencies *dependencies = statext_dependencies_deserialize(data); |
685 | 0 | int i, |
686 | 0 | j; |
687 | 0 | StringInfoData str; |
688 | |
|
689 | 0 | initStringInfo(&str); |
690 | 0 | appendStringInfoChar(&str, '{'); |
691 | |
|
692 | 0 | for (i = 0; i < dependencies->ndeps; i++) |
693 | 0 | { |
694 | 0 | MVDependency *dependency = dependencies->deps[i]; |
695 | |
|
696 | 0 | if (i > 0) |
697 | 0 | appendStringInfoString(&str, ", "); |
698 | |
|
699 | 0 | appendStringInfoChar(&str, '"'); |
700 | 0 | for (j = 0; j < dependency->nattributes; j++) |
701 | 0 | { |
702 | 0 | if (j == dependency->nattributes - 1) |
703 | 0 | appendStringInfoString(&str, " => "); |
704 | 0 | else if (j > 0) |
705 | 0 | appendStringInfoString(&str, ", "); |
706 | |
|
707 | 0 | appendStringInfo(&str, "%d", dependency->attributes[j]); |
708 | 0 | } |
709 | 0 | appendStringInfo(&str, "\": %f", dependency->degree); |
710 | 0 | } |
711 | |
|
712 | 0 | appendStringInfoChar(&str, '}'); |
713 | |
|
714 | 0 | PG_RETURN_CSTRING(str.data); |
715 | 0 | } |
716 | | |
717 | | /* |
718 | | * pg_dependencies_recv - binary input routine for type pg_dependencies. |
719 | | */ |
720 | | Datum |
721 | | pg_dependencies_recv(PG_FUNCTION_ARGS) |
722 | 0 | { |
723 | 0 | ereport(ERROR, |
724 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
725 | 0 | errmsg("cannot accept a value of type %s", "pg_dependencies"))); |
726 | |
|
727 | 0 | PG_RETURN_VOID(); /* keep compiler quiet */ |
728 | 0 | } |
729 | | |
730 | | /* |
731 | | * pg_dependencies_send - binary output routine for type pg_dependencies. |
732 | | * |
733 | | * Functional dependencies are serialized in a bytea value (although the type |
734 | | * is named differently), so let's just send that. |
735 | | */ |
736 | | Datum |
737 | | pg_dependencies_send(PG_FUNCTION_ARGS) |
738 | 0 | { |
739 | 0 | return byteasend(fcinfo); |
740 | 0 | } |
741 | | |
742 | | /* |
743 | | * dependency_is_compatible_clause |
744 | | * Determines if the clause is compatible with functional dependencies |
745 | | * |
746 | | * Only clauses that have the form of equality to a pseudoconstant, or can be |
747 | | * interpreted that way, are currently accepted. Furthermore the variable |
748 | | * part of the clause must be a simple Var belonging to the specified |
749 | | * relation, whose attribute number we return in *attnum on success. |
750 | | */ |
751 | | static bool |
752 | | dependency_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) |
753 | 0 | { |
754 | 0 | RestrictInfo *rinfo = (RestrictInfo *) clause; |
755 | 0 | Var *var; |
756 | |
|
757 | 0 | if (!IsA(rinfo, RestrictInfo)) |
758 | 0 | return false; |
759 | | |
760 | | /* Pseudoconstants are not interesting (they couldn't contain a Var) */ |
761 | 0 | if (rinfo->pseudoconstant) |
762 | 0 | return false; |
763 | | |
764 | | /* Clauses referencing multiple, or no, varnos are incompatible */ |
765 | 0 | if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) |
766 | 0 | return false; |
767 | | |
768 | 0 | if (is_opclause(rinfo->clause)) |
769 | 0 | { |
770 | | /* If it's an opclause, check for Var = Const or Const = Var. */ |
771 | 0 | OpExpr *expr = (OpExpr *) rinfo->clause; |
772 | | |
773 | | /* Only expressions with two arguments are candidates. */ |
774 | 0 | if (list_length(expr->args) != 2) |
775 | 0 | return false; |
776 | | |
777 | | /* Make sure non-selected argument is a pseudoconstant. */ |
778 | 0 | if (is_pseudo_constant_clause(lsecond(expr->args))) |
779 | 0 | var = linitial(expr->args); |
780 | 0 | else if (is_pseudo_constant_clause(linitial(expr->args))) |
781 | 0 | var = lsecond(expr->args); |
782 | 0 | else |
783 | 0 | return false; |
784 | | |
785 | | /* |
786 | | * If it's not an "=" operator, just ignore the clause, as it's not |
787 | | * compatible with functional dependencies. |
788 | | * |
789 | | * This uses the function for estimating selectivity, not the operator |
790 | | * directly (a bit awkward, but well ...). |
791 | | * |
792 | | * XXX this is pretty dubious; probably it'd be better to check btree |
793 | | * or hash opclass membership, so as not to be fooled by custom |
794 | | * selectivity functions, and to be more consistent with decisions |
795 | | * elsewhere in the planner. |
796 | | */ |
797 | 0 | if (get_oprrest(expr->opno) != F_EQSEL) |
798 | 0 | return false; |
799 | | |
800 | | /* OK to proceed with checking "var" */ |
801 | 0 | } |
802 | 0 | else if (not_clause((Node *) rinfo->clause)) |
803 | 0 | { |
804 | | /* |
805 | | * "NOT x" can be interpreted as "x = false", so get the argument and |
806 | | * proceed with seeing if it's a suitable Var. |
807 | | */ |
808 | 0 | var = (Var *) get_notclausearg(rinfo->clause); |
809 | 0 | } |
810 | 0 | else |
811 | 0 | { |
812 | | /* |
813 | | * A boolean expression "x" can be interpreted as "x = true", so |
814 | | * proceed with seeing if it's a suitable Var. |
815 | | */ |
816 | 0 | var = (Var *) rinfo->clause; |
817 | 0 | } |
818 | | |
819 | | /* |
820 | | * We may ignore any RelabelType node above the operand. (There won't be |
821 | | * more than one, since eval_const_expressions has been applied already.) |
822 | | */ |
823 | 0 | if (IsA(var, RelabelType)) |
824 | 0 | var = (Var *) ((RelabelType *) var)->arg; |
825 | | |
826 | | /* We only support plain Vars for now */ |
827 | 0 | if (!IsA(var, Var)) |
828 | 0 | return false; |
829 | | |
830 | | /* Ensure Var is from the correct relation */ |
831 | 0 | if (var->varno != relid) |
832 | 0 | return false; |
833 | | |
834 | | /* We also better ensure the Var is from the current level */ |
835 | 0 | if (var->varlevelsup != 0) |
836 | 0 | return false; |
837 | | |
838 | | /* Also ignore system attributes (we don't allow stats on those) */ |
839 | 0 | if (!AttrNumberIsForUserDefinedAttr(var->varattno)) |
840 | 0 | return false; |
841 | | |
842 | 0 | *attnum = var->varattno; |
843 | 0 | return true; |
844 | 0 | } |
845 | | |
846 | | /* |
847 | | * find_strongest_dependency |
848 | | * find the strongest dependency on the attributes |
849 | | * |
850 | | * When applying functional dependencies, we start with the strongest |
851 | | * dependencies. That is, we select the dependency that: |
852 | | * |
853 | | * (a) has all attributes covered by equality clauses |
854 | | * |
855 | | * (b) has the most attributes |
856 | | * |
857 | | * (c) has the highest degree of validity |
858 | | * |
859 | | * This guarantees that we eliminate the most redundant conditions first |
860 | | * (see the comment in dependencies_clauselist_selectivity). |
861 | | */ |
862 | | static MVDependency * |
863 | | find_strongest_dependency(StatisticExtInfo *stats, MVDependencies *dependencies, |
864 | | Bitmapset *attnums) |
865 | 0 | { |
866 | 0 | int i; |
867 | 0 | MVDependency *strongest = NULL; |
868 | | |
869 | | /* number of attnums in clauses */ |
870 | 0 | int nattnums = bms_num_members(attnums); |
871 | | |
872 | | /* |
873 | | * Iterate over the MVDependency items and find the strongest one from the |
874 | | * fully-matched dependencies. We do the cheap checks first, before |
875 | | * matching it against the attnums. |
876 | | */ |
877 | 0 | for (i = 0; i < dependencies->ndeps; i++) |
878 | 0 | { |
879 | 0 | MVDependency *dependency = dependencies->deps[i]; |
880 | | |
881 | | /* |
882 | | * Skip dependencies referencing more attributes than available |
883 | | * clauses, as those can't be fully matched. |
884 | | */ |
885 | 0 | if (dependency->nattributes > nattnums) |
886 | 0 | continue; |
887 | | |
888 | 0 | if (strongest) |
889 | 0 | { |
890 | | /* skip dependencies on fewer attributes than the strongest. */ |
891 | 0 | if (dependency->nattributes < strongest->nattributes) |
892 | 0 | continue; |
893 | | |
894 | | /* also skip weaker dependencies when attribute count matches */ |
895 | 0 | if (strongest->nattributes == dependency->nattributes && |
896 | 0 | strongest->degree > dependency->degree) |
897 | 0 | continue; |
898 | 0 | } |
899 | | |
900 | | /* |
901 | | * this dependency is stronger, but we must still check that it's |
902 | | * fully matched to these attnums. We perform this check last as it's |
903 | | * slightly more expensive than the previous checks. |
904 | | */ |
905 | 0 | if (dependency_is_fully_matched(dependency, attnums)) |
906 | 0 | strongest = dependency; /* save new best match */ |
907 | 0 | } |
908 | |
|
909 | 0 | return strongest; |
910 | 0 | } |
911 | | |
912 | | /* |
913 | | * dependencies_clauselist_selectivity |
914 | | * Return the estimated selectivity of (a subset of) the given clauses |
915 | | * using functional dependency statistics, or 1.0 if no useful functional |
916 | | * dependency statistic exists. |
917 | | * |
918 | | * 'estimatedclauses' is an output argument that gets a bit set corresponding |
919 | | * to the (zero-based) list index of each clause that is included in the |
920 | | * estimated selectivity. |
921 | | * |
922 | | * Given equality clauses on attributes (a,b) we find the strongest dependency |
923 | | * between them, i.e. either (a=>b) or (b=>a). Assuming (a=>b) is the selected |
924 | | * dependency, we then combine the per-clause selectivities using the formula |
925 | | * |
926 | | * P(a,b) = P(a) * [f + (1-f)*P(b)] |
927 | | * |
928 | | * where 'f' is the degree of the dependency. |
929 | | * |
930 | | * With clauses on more than two attributes, the dependencies are applied |
931 | | * recursively, starting with the widest/strongest dependencies. For example |
932 | | * P(a,b,c) is first split like this: |
933 | | * |
934 | | * P(a,b,c) = P(a,b) * [f + (1-f)*P(c)] |
935 | | * |
936 | | * assuming (a,b=>c) is the strongest dependency. |
937 | | */ |
938 | | Selectivity |
939 | | dependencies_clauselist_selectivity(PlannerInfo *root, |
940 | | List *clauses, |
941 | | int varRelid, |
942 | | JoinType jointype, |
943 | | SpecialJoinInfo *sjinfo, |
944 | | RelOptInfo *rel, |
945 | | Bitmapset **estimatedclauses) |
946 | 0 | { |
947 | 0 | Selectivity s1 = 1.0; |
948 | 0 | ListCell *l; |
949 | 0 | Bitmapset *clauses_attnums = NULL; |
950 | 0 | StatisticExtInfo *stat; |
951 | 0 | MVDependencies *dependencies; |
952 | 0 | AttrNumber *list_attnums; |
953 | 0 | int listidx; |
954 | | |
955 | | /* initialize output argument */ |
956 | 0 | *estimatedclauses = NULL; |
957 | | |
958 | | /* check if there's any stats that might be useful for us. */ |
959 | 0 | if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) |
960 | 0 | return 1.0; |
961 | | |
962 | 0 | list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * |
963 | 0 | list_length(clauses)); |
964 | | |
965 | | /* |
966 | | * Pre-process the clauses list to extract the attnums seen in each item. |
967 | | * We need to determine if there's any clauses which will be useful for |
968 | | * dependency selectivity estimations. Along the way we'll record all of |
969 | | * the attnums for each clause in a list which we'll reference later so we |
970 | | * don't need to repeat the same work again. We'll also keep track of all |
971 | | * attnums seen. |
972 | | */ |
973 | 0 | listidx = 0; |
974 | 0 | foreach(l, clauses) |
975 | 0 | { |
976 | 0 | Node *clause = (Node *) lfirst(l); |
977 | 0 | AttrNumber attnum; |
978 | |
|
979 | 0 | if (dependency_is_compatible_clause(clause, rel->relid, &attnum)) |
980 | 0 | { |
981 | 0 | list_attnums[listidx] = attnum; |
982 | 0 | clauses_attnums = bms_add_member(clauses_attnums, attnum); |
983 | 0 | } |
984 | 0 | else |
985 | 0 | list_attnums[listidx] = InvalidAttrNumber; |
986 | |
|
987 | 0 | listidx++; |
988 | 0 | } |
989 | | |
990 | | /* |
991 | | * If there's not at least two distinct attnums then reject the whole list |
992 | | * of clauses. We must return 1.0 so the calling function's selectivity is |
993 | | * unaffected. |
994 | | */ |
995 | 0 | if (bms_num_members(clauses_attnums) < 2) |
996 | 0 | { |
997 | 0 | pfree(list_attnums); |
998 | 0 | return 1.0; |
999 | 0 | } |
1000 | | |
1001 | | /* find the best suited statistics object for these attnums */ |
1002 | 0 | stat = choose_best_statistics(rel->statlist, clauses_attnums, |
1003 | 0 | STATS_EXT_DEPENDENCIES); |
1004 | | |
1005 | | /* if no matching stats could be found then we've nothing to do */ |
1006 | 0 | if (!stat) |
1007 | 0 | { |
1008 | 0 | pfree(list_attnums); |
1009 | 0 | return 1.0; |
1010 | 0 | } |
1011 | | |
1012 | | /* load the dependency items stored in the statistics object */ |
1013 | 0 | dependencies = statext_dependencies_load(stat->statOid); |
1014 | | |
1015 | | /* |
1016 | | * Apply the dependencies recursively, starting with the widest/strongest |
1017 | | * ones, and proceeding to the smaller/weaker ones. At the end of each |
1018 | | * round we factor in the selectivity of clauses on the implied attribute, |
1019 | | * and remove the clauses from the list. |
1020 | | */ |
1021 | 0 | while (true) |
1022 | 0 | { |
1023 | 0 | Selectivity s2 = 1.0; |
1024 | 0 | MVDependency *dependency; |
1025 | | |
1026 | | /* the widest/strongest dependency, fully matched by clauses */ |
1027 | 0 | dependency = find_strongest_dependency(stat, dependencies, |
1028 | 0 | clauses_attnums); |
1029 | | |
1030 | | /* if no suitable dependency was found, we're done */ |
1031 | 0 | if (!dependency) |
1032 | 0 | break; |
1033 | | |
1034 | | /* |
1035 | | * We found an applicable dependency, so find all the clauses on the |
1036 | | * implied attribute - with dependency (a,b => c) we look for clauses |
1037 | | * on 'c'. |
1038 | | */ |
1039 | 0 | listidx = -1; |
1040 | 0 | foreach(l, clauses) |
1041 | 0 | { |
1042 | 0 | Node *clause; |
1043 | |
|
1044 | 0 | listidx++; |
1045 | | |
1046 | | /* |
1047 | | * Skip incompatible clauses, and ones we've already estimated on. |
1048 | | */ |
1049 | 0 | if (list_attnums[listidx] == InvalidAttrNumber || |
1050 | 0 | bms_is_member(listidx, *estimatedclauses)) |
1051 | 0 | continue; |
1052 | | |
1053 | | /* |
1054 | | * Technically we could find more than one clause for a given |
1055 | | * attnum. Since these clauses must be equality clauses, we choose |
1056 | | * to only take the selectivity estimate from the final clause in |
1057 | | * the list for this attnum. If the attnum happens to be compared |
1058 | | * to a different Const in another clause then no rows will match |
1059 | | * anyway. If it happens to be compared to the same Const, then |
1060 | | * ignoring the additional clause is just the thing to do. |
1061 | | */ |
1062 | 0 | if (dependency_implies_attribute(dependency, |
1063 | 0 | list_attnums[listidx])) |
1064 | 0 | { |
1065 | 0 | clause = (Node *) lfirst(l); |
1066 | |
|
1067 | 0 | s2 = clause_selectivity(root, clause, varRelid, jointype, |
1068 | 0 | sjinfo); |
1069 | | |
1070 | | /* mark this one as done, so we don't touch it again. */ |
1071 | 0 | *estimatedclauses = bms_add_member(*estimatedclauses, listidx); |
1072 | | |
1073 | | /* |
1074 | | * Mark that we've got and used the dependency on this clause. |
1075 | | * We'll want to ignore this when looking for the next |
1076 | | * strongest dependency above. |
1077 | | */ |
1078 | 0 | clauses_attnums = bms_del_member(clauses_attnums, |
1079 | 0 | list_attnums[listidx]); |
1080 | 0 | } |
1081 | 0 | } |
1082 | | |
1083 | | /* |
1084 | | * Now factor in the selectivity for all the "implied" clauses into |
1085 | | * the final one, using this formula: |
1086 | | * |
1087 | | * P(a,b) = P(a) * (f + (1-f) * P(b)) |
1088 | | * |
1089 | | * where 'f' is the degree of validity of the dependency. |
1090 | | */ |
1091 | 0 | s1 *= (dependency->degree + (1 - dependency->degree) * s2); |
1092 | 0 | } |
1093 | |
|
1094 | 0 | pfree(dependencies); |
1095 | 0 | pfree(list_attnums); |
1096 | |
|
1097 | 0 | return s1; |
1098 | 0 | } |