YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

/Users/deen/code/yugabyte-db/build/debugcov-clang-dynamic-arm64-ninja/postgres_build/src/interfaces/libpq/unicode_norm.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 * unicode_norm.c
3
 *    Normalize a Unicode string to NFKC form
4
 *
5
 * This implements Unicode normalization, per the documentation at
6
 * http://www.unicode.org/reports/tr15/.
7
 *
8
 * Portions Copyright (c) 2017-2018, PostgreSQL Global Development Group
9
 *
10
 * IDENTIFICATION
11
 *    src/common/unicode_norm.c
12
 *
13
 *-------------------------------------------------------------------------
14
 */
15
#ifndef FRONTEND
16
#include "postgres.h"
17
#else
18
#include "postgres_fe.h"
19
#endif
20
21
#include "common/unicode_norm.h"
22
#include "common/unicode_norm_table.h"
23
24
#ifndef FRONTEND
25
#define ALLOC(size) palloc(size)
26
#define FREE(size) pfree(size)
27
#else
28
0
#define ALLOC(size) malloc(size)
29
0
#define FREE(size) free(size)
30
#endif
31
32
/* Constants for calculations with Hangul characters */
33
0
#define SBASE   0xAC00    /* U+AC00 */
34
0
#define LBASE   0x1100    /* U+1100 */
35
0
#define VBASE   0x1161    /* U+1161 */
36
0
#define TBASE   0x11A7    /* U+11A7 */
37
0
#define LCOUNT    19
38
0
#define VCOUNT    21
39
0
#define TCOUNT    28
40
0
#define NCOUNT    VCOUNT * TCOUNT
41
0
#define SCOUNT    LCOUNT * NCOUNT
42
43
/* comparison routine for bsearch() of decomposition lookup table. */
44
static int
45
conv_compare(const void *p1, const void *p2)
46
0
{
47
0
  uint32    v1,
48
0
        v2;
49
50
0
  v1 = *(const uint32 *) p1;
51
0
  v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
52
0
  return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
53
0
}
54
55
/*
56
 * Get the entry corresponding to code in the decomposition lookup table.
57
 */
58
static pg_unicode_decomposition *
59
get_code_entry(pg_wchar code)
60
0
{
61
0
  return bsearch(&(code),
62
0
           (void *) UnicodeDecompMain,
63
0
           lengthof(UnicodeDecompMain),
64
0
           sizeof(pg_unicode_decomposition),
65
0
           conv_compare);
66
0
}
67
68
/*
69
 * Given a decomposition entry looked up earlier, get the decomposed
70
 * characters.
71
 *
72
 * Note: the returned pointer can point to statically allocated buffer, and
73
 * is only valid until next call to this function!
74
 */
75
static const pg_wchar *
76
get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
77
0
{
78
0
  static pg_wchar x;
79
80
0
  if (DECOMPOSITION_IS_INLINE(entry))
81
0
  {
82
0
    Assert(DECOMPOSITION_SIZE(entry) == 1);
83
0
    x = (pg_wchar) entry->dec_index;
84
0
    *dec_size = 1;
85
0
    return &x;
86
0
  }
87
0
  else
88
0
  {
89
0
    *dec_size = DECOMPOSITION_SIZE(entry);
90
0
    return &UnicodeDecomp_codepoints[entry->dec_index];
91
0
  }
92
0
}
93
94
/*
95
 * Calculate how many characters a given character will decompose to.
96
 *
97
 * This needs to recurse, if the character decomposes into characters that
98
 * are, in turn, decomposable.
99
 */
100
static int
101
get_decomposed_size(pg_wchar code)
102
0
{
103
0
  pg_unicode_decomposition *entry;
104
0
  int     size = 0;
105
0
  int     i;
106
0
  const uint32 *decomp;
107
0
  int     dec_size;
108
109
  /*
110
   * Fast path for Hangul characters not stored in tables to save memory as
111
   * decomposition is algorithmic. See
112
   * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
113
   * the matter.
114
   */
115
0
  if (code >= SBASE && code < SBASE + SCOUNT)
116
0
  {
117
0
    uint32    tindex,
118
0
          sindex;
119
120
0
    sindex = code - SBASE;
121
0
    tindex = sindex % TCOUNT;
122
123
0
    if (tindex != 0)
124
0
      return 3;
125
0
    return 2;
126
0
  }
127
128
0
  entry = get_code_entry(code);
129
130
  /*
131
   * Just count current code if no other decompositions.  A NULL entry is
132
   * equivalent to a character with class 0 and no decompositions.
133
   */
134
0
  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
135
0
    return 1;
136
137
  /*
138
   * If this entry has other decomposition codes look at them as well. First
139
   * get its decomposition in the list of tables available.
140
   */
141
0
  decomp = get_code_decomposition(entry, &dec_size);
142
0
  for (i = 0; i < dec_size; i++)
143
0
  {
144
0
    uint32    lcode = decomp[i];
145
146
0
    size += get_decomposed_size(lcode);
147
0
  }
148
149
0
  return size;
150
0
}
151
152
/*
153
 * Recompose a set of characters. For hangul characters, the calculation
154
 * is algorithmic. For others, an inverse lookup at the decomposition
155
 * table is necessary. Returns true if a recomposition can be done, and
156
 * false otherwise.
157
 */
158
static bool
159
recompose_code(uint32 start, uint32 code, uint32 *result)
160
0
{
161
  /*
162
   * Handle Hangul characters algorithmically, per the Unicode spec.
163
   *
164
   * Check if two current characters are L and V.
165
   */
166
0
  if (start >= LBASE && start < LBASE + LCOUNT &&
167
0
    code >= VBASE && code < VBASE + VCOUNT)
168
0
  {
169
    /* make syllable of form LV */
170
0
    uint32    lindex = start - LBASE;
171
0
    uint32    vindex = code - VBASE;
172
173
0
    *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
174
0
    return true;
175
0
  }
176
  /* Check if two current characters are LV and T */
177
0
  else if (start >= SBASE && start < (SBASE + SCOUNT) &&
178
0
       ((start - SBASE) % TCOUNT) == 0 &&
179
0
       code >= TBASE && code < (TBASE + TCOUNT))
180
0
  {
181
    /* make syllable of from LVT */
182
0
    uint32    tindex = code - TBASE;
183
184
0
    *result = start + tindex;
185
0
    return true;
186
0
  }
187
0
  else
188
0
  {
189
0
    int     i;
190
191
    /*
192
     * Do an inverse lookup of the decomposition tables to see if anything
193
     * matches. The comparison just needs to be a perfect match on the
194
     * sub-table of size two, because the start character has already been
195
     * recomposed partially.
196
     */
197
0
    for (i = 0; i < lengthof(UnicodeDecompMain); i++)
198
0
    {
199
0
      const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
200
201
0
      if (DECOMPOSITION_SIZE(entry) != 2)
202
0
        continue;
203
204
0
      if (DECOMPOSITION_NO_COMPOSE(entry))
205
0
        continue;
206
207
0
      if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
208
0
        code == UnicodeDecomp_codepoints[entry->dec_index + 1])
209
0
      {
210
0
        *result = entry->codepoint;
211
0
        return true;
212
0
      }
213
0
    }
214
0
  }
215
216
0
  return false;
217
0
}
218
219
/*
220
 * Decompose the given code into the array given by caller. The
221
 * decomposition begins at the position given by caller, saving one
222
 * lookup on the decomposition table. The current position needs to be
223
 * updated here to let the caller know from where to continue filling
224
 * in the array result.
225
 */
226
static void
227
decompose_code(pg_wchar code, pg_wchar **result, int *current)
228
0
{
229
0
  pg_unicode_decomposition *entry;
230
0
  int     i;
231
0
  const uint32 *decomp;
232
0
  int     dec_size;
233
234
  /*
235
   * Fast path for Hangul characters not stored in tables to save memory as
236
   * decomposition is algorithmic. See
237
   * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
238
   * the matter.
239
   */
240
0
  if (code >= SBASE && code < SBASE + SCOUNT)
241
0
  {
242
0
    uint32    l,
243
0
          v,
244
0
          tindex,
245
0
          sindex;
246
0
    pg_wchar   *res = *result;
247
248
0
    sindex = code - SBASE;
249
0
    l = LBASE + sindex / (VCOUNT * TCOUNT);
250
0
    v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
251
0
    tindex = sindex % TCOUNT;
252
253
0
    res[*current] = l;
254
0
    (*current)++;
255
0
    res[*current] = v;
256
0
    (*current)++;
257
258
0
    if (tindex != 0)
259
0
    {
260
0
      res[*current] = TBASE + tindex;
261
0
      (*current)++;
262
0
    }
263
264
0
    return;
265
0
  }
266
267
0
  entry = get_code_entry(code);
268
269
  /*
270
   * Just fill in with the current decomposition if there are no
271
   * decomposition codes to recurse to.  A NULL entry is equivalent to a
272
   * character with class 0 and no decompositions, so just leave also in
273
   * this case.
274
   */
275
0
  if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
276
0
  {
277
0
    pg_wchar   *res = *result;
278
279
0
    res[*current] = code;
280
0
    (*current)++;
281
0
    return;
282
0
  }
283
284
  /*
285
   * If this entry has other decomposition codes look at them as well.
286
   */
287
0
  decomp = get_code_decomposition(entry, &dec_size);
288
0
  for (i = 0; i < dec_size; i++)
289
0
  {
290
0
    pg_wchar  lcode = (pg_wchar) decomp[i];
291
292
    /* Leave if no more decompositions */
293
0
    decompose_code(lcode, result, current);
294
0
  }
295
0
}
296
297
/*
298
 * unicode_normalize_kc - Normalize a Unicode string to NFKC form.
299
 *
300
 * The input is a 0-terminated array of codepoints.
301
 *
302
 * In frontend, returns a 0-terminated array of codepoints, allocated with
303
 * malloc. Or NULL if we run out of memory. In frontend, the returned
304
 * string is palloc'd instead, and OOM is reported with ereport().
305
 */
306
pg_wchar *
307
unicode_normalize_kc(const pg_wchar *input)
308
0
{
309
0
  pg_wchar   *decomp_chars;
310
0
  pg_wchar   *recomp_chars;
311
0
  int     decomp_size,
312
0
        current_size;
313
0
  int     count;
314
0
  const pg_wchar *p;
315
316
  /* variables for recomposition */
317
0
  int     last_class;
318
0
  int     starter_pos;
319
0
  int     target_pos;
320
0
  uint32    starter_ch;
321
322
  /* First, do character decomposition */
323
324
  /*
325
   * Calculate how many characters long the decomposed version will be.
326
   */
327
0
  decomp_size = 0;
328
0
  for (p = input; *p; p++)
329
0
    decomp_size += get_decomposed_size(*p);
330
331
0
  decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
332
0
  if (decomp_chars == NULL)
333
0
    return NULL;
334
335
  /*
336
   * Now fill in each entry recursively. This needs a second pass on the
337
   * decomposition table.
338
   */
339
0
  current_size = 0;
340
0
  for (p = input; *p; p++)
341
0
    decompose_code(*p, &decomp_chars, &current_size);
342
0
  decomp_chars[decomp_size] = '\0';
343
0
  Assert(decomp_size == current_size);
344
345
  /*
346
   * Now apply canonical ordering.
347
   */
348
0
  for (count = 1; count < decomp_size; count++)
349
0
  {
350
0
    pg_wchar  prev = decomp_chars[count - 1];
351
0
    pg_wchar  next = decomp_chars[count];
352
0
    pg_wchar  tmp;
353
0
    pg_unicode_decomposition *prevEntry = get_code_entry(prev);
354
0
    pg_unicode_decomposition *nextEntry = get_code_entry(next);
355
356
    /*
357
     * If no entries are found, the character used is either an Hangul
358
     * character or a character with a class of 0 and no decompositions,
359
     * so move to next result.
360
     */
361
0
    if (prevEntry == NULL || nextEntry == NULL)
362
0
      continue;
363
364
    /*
365
     * Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
366
     * a sequence of two adjacent characters in a string is an
367
     * exchangeable pair if the combining class (from the Unicode
368
     * Character Database) for the first character is greater than the
369
     * combining class for the second, and the second is not a starter.  A
370
     * character is a starter if its combining class is 0.
371
     */
372
0
    if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
373
0
      continue;
374
375
0
    if (prevEntry->comb_class <= nextEntry->comb_class)
376
0
      continue;
377
378
    /* exchange can happen */
379
0
    tmp = decomp_chars[count - 1];
380
0
    decomp_chars[count - 1] = decomp_chars[count];
381
0
    decomp_chars[count] = tmp;
382
383
    /* backtrack to check again */
384
0
    if (count > 1)
385
0
      count -= 2;
386
0
  }
387
388
  /*
389
   * The last phase of NFKC is the recomposition of the reordered Unicode
390
   * string using combining classes. The recomposed string cannot be longer
391
   * than the decomposed one, so make the allocation of the output string
392
   * based on that assumption.
393
   */
394
0
  recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
395
0
  if (!recomp_chars)
396
0
  {
397
0
    FREE(decomp_chars);
398
0
    return NULL;
399
0
  }
400
401
0
  last_class = -1;      /* this eliminates a special check */
402
0
  starter_pos = 0;
403
0
  target_pos = 1;
404
0
  starter_ch = recomp_chars[0] = decomp_chars[0];
405
406
0
  for (count = 1; count < decomp_size; count++)
407
0
  {
408
0
    pg_wchar  ch = decomp_chars[count];
409
0
    pg_unicode_decomposition *ch_entry = get_code_entry(ch);
410
0
    int     ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
411
0
    pg_wchar  composite;
412
413
0
    if (last_class < ch_class &&
414
0
      recompose_code(starter_ch, ch, &composite))
415
0
    {
416
0
      recomp_chars[starter_pos] = composite;
417
0
      starter_ch = composite;
418
0
    }
419
0
    else if (ch_class == 0)
420
0
    {
421
0
      starter_pos = target_pos;
422
0
      starter_ch = ch;
423
0
      last_class = -1;
424
0
      recomp_chars[target_pos++] = ch;
425
0
    }
426
0
    else
427
0
    {
428
0
      last_class = ch_class;
429
0
      recomp_chars[target_pos++] = ch;
430
0
    }
431
0
  }
432
0
  recomp_chars[target_pos] = (pg_wchar) '\0';
433
434
0
  FREE(decomp_chars);
435
436
0
  return recomp_chars;
437
0
}