YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/build/debugcov-clang-dynamic-arm64-ninja/postgres_build/src/interfaces/libpq/wchar.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * conversion functions between pg_wchar and multibyte streams.
3
 * Tatsuo Ishii
4
 * src/backend/utils/mb/wchar.c
5
 *
6
 */
7
/* can be used in either frontend or backend */
8
#ifdef FRONTEND
9
#include "postgres_fe.h"
10
#else
11
#include "postgres.h"
12
#endif
13
14
#include "mb/pg_wchar.h"
15
16
17
/*
18
 * conversion to pg_wchar is done by "table driven."
19
 * to add an encoding support, define mb2wchar_with_len(), mblen(), dsplen()
20
 * for the particular encoding. Note that if the encoding is only
21
 * supported in the client, you don't need to define
22
 * mb2wchar_with_len() function (SJIS is the case).
23
 *
24
 * These functions generally assume that their input is validly formed.
25
 * The "verifier" functions, further down in the file, have to be more
26
 * paranoid.  We expect that mblen() does not need to examine more than
27
 * the first byte of the character to discover the correct length.
28
 *
29
 * Note: for the display output of psql to work properly, the return values
30
 * of the dsplen functions must conform to the Unicode standard. In particular
31
 * the NUL character is zero width and control characters are generally
32
 * width -1. It is recommended that non-ASCII encodings refer their ASCII
33
 * subset to the ASCII routines to ensure consistency.
34
 */
35
36
/*
37
 * SQL/ASCII
38
 */
39
static int
40
pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
41
0
{
42
0
  int     cnt = 0;
43
44
0
  while (len > 0 && *from)
45
0
  {
46
0
    *to++ = *from++;
47
0
    len--;
48
0
    cnt++;
49
0
  }
50
0
  *to = 0;
51
0
  return cnt;
52
0
}
53
54
/* Byte length of a character in this encoding: always 1; "s" is not read. */
static int
pg_ascii_mblen(const unsigned char *s)
{
  return 1;
}
59
60
/*
 * Display width of the single-byte character at *s:
 *   0  for NUL,
 *  -1  for bytes below 0x20 and for 0x7f (DEL),
 *   1  for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
  const unsigned char ch = *s;

  if (ch == '\0')
    return 0;

  return (ch >= 0x20 && ch != 0x7f) ? 1 : -1;
}
70
71
/*
72
 * EUC
73
 */
74
static int
75
pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
76
0
{
77
0
  int     cnt = 0;
78
79
0
  while (len > 0 && *from)
80
0
  {
81
0
    if (*from == SS2 && len >= 2) /* JIS X 0201 (so called "1 byte
82
                     * KANA") */
83
0
    {
84
0
      from++;
85
0
      *to = (SS2 << 8) | *from++;
86
0
      len -= 2;
87
0
    }
88
0
    else if (*from == SS3 && len >= 3) /* JIS X 0212 KANJI */
89
0
    {
90
0
      from++;
91
0
      *to = (SS3 << 16) | (*from++ << 8);
92
0
      *to |= *from++;
93
0
      len -= 3;
94
0
    }
95
0
    else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
96
0
    {
97
0
      *to = *from++ << 8;
98
0
      *to |= *from++;
99
0
      len -= 2;
100
0
    }
101
0
    else          /* must be ASCII */
102
0
    {
103
0
      *to = *from++;
104
0
      len--;
105
0
    }
106
0
    to++;
107
0
    cnt++;
108
0
  }
109
0
  *to = 0;
110
0
  return cnt;
111
0
}
112
113
static inline int
114
pg_euc_mblen(const unsigned char *s)
115
0
{
116
0
  int     len;
117
118
0
  if (*s == SS2)
119
0
    len = 2;
120
0
  else if (*s == SS3)
121
0
    len = 3;
122
0
  else if (IS_HIGHBIT_SET(*s))
123
0
    len = 2;
124
0
  else
125
0
    len = 1;
126
0
  return len;
127
0
}
128
129
static inline int
130
pg_euc_dsplen(const unsigned char *s)
131
0
{
132
0
  int     len;
133
134
0
  if (*s == SS2)
135
0
    len = 2;
136
0
  else if (*s == SS3)
137
0
    len = 2;
138
0
  else if (IS_HIGHBIT_SET(*s))
139
0
    len = 2;
140
0
  else
141
0
    len = pg_ascii_dsplen(s);
142
0
  return len;
143
0
}
144
145
/*
146
 * EUC_JP
147
 */
148
static int
149
pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
150
0
{
151
0
  return pg_euc2wchar_with_len(from, to, len);
152
0
}
153
154
/* EUC_JP character byte length: same rule as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
  int     nbytes = pg_euc_mblen(s);

  return nbytes;
}
159
160
static int
161
pg_eucjp_dsplen(const unsigned char *s)
162
0
{
163
0
  int     len;
164
165
0
  if (*s == SS2)
166
0
    len = 1;
167
0
  else if (*s == SS3)
168
0
    len = 2;
169
0
  else if (IS_HIGHBIT_SET(*s))
170
0
    len = 2;
171
0
  else
172
0
    len = pg_ascii_dsplen(s);
173
0
  return len;
174
0
}
175
176
/*
177
 * EUC_KR
178
 */
179
static int
180
pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
181
0
{
182
0
  return pg_euc2wchar_with_len(from, to, len);
183
0
}
184
185
/* EUC_KR character byte length: same rule as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
  int     nbytes = pg_euc_mblen(s);

  return nbytes;
}
190
191
/* EUC_KR display width: same rule as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
  int     width = pg_euc_dsplen(s);

  return width;
}
196
197
/*
198
 * EUC_CN
199
 *
200
 */
201
static int
202
pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
203
0
{
204
0
  int     cnt = 0;
205
206
0
  while (len > 0 && *from)
207
0
  {
208
0
    if (*from == SS2 && len >= 3) /* code set 2 (unused?) */
209
0
    {
210
0
      from++;
211
0
      *to = (SS2 << 16) | (*from++ << 8);
212
0
      *to |= *from++;
213
0
      len -= 3;
214
0
    }
215
0
    else if (*from == SS3 && len >= 3) /* code set 3 (unused ?) */
216
0
    {
217
0
      from++;
218
0
      *to = (SS3 << 16) | (*from++ << 8);
219
0
      *to |= *from++;
220
0
      len -= 3;
221
0
    }
222
0
    else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
223
0
    {
224
0
      *to = *from++ << 8;
225
0
      *to |= *from++;
226
0
      len -= 2;
227
0
    }
228
0
    else
229
0
    {
230
0
      *to = *from++;
231
0
      len--;
232
0
    }
233
0
    to++;
234
0
    cnt++;
235
0
  }
236
0
  *to = 0;
237
0
  return cnt;
238
0
}
239
240
/* EUC_CN character byte length: 2 when the first byte has its high bit set, else 1. */
static int
pg_euccn_mblen(const unsigned char *s)
{
  return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
251
252
/* EUC_CN display width: 2 for a high-bit lead byte, else the ASCII rule. */
static int
pg_euccn_dsplen(const unsigned char *s)
{
  if (IS_HIGHBIT_SET(*s))
    return 2;

  return pg_ascii_dsplen(s);
}
263
264
/*
265
 * EUC_TW
266
 *
267
 */
268
static int
269
pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
270
0
{
271
0
  int     cnt = 0;
272
273
0
  while (len > 0 && *from)
274
0
  {
275
0
    if (*from == SS2 && len >= 4) /* code set 2 */
276
0
    {
277
0
      from++;
278
0
      *to = (((uint32) SS2) << 24) | (*from++ << 16);
279
0
      *to |= *from++ << 8;
280
0
      *to |= *from++;
281
0
      len -= 4;
282
0
    }
283
0
    else if (*from == SS3 && len >= 3) /* code set 3 (unused?) */
284
0
    {
285
0
      from++;
286
0
      *to = (SS3 << 16) | (*from++ << 8);
287
0
      *to |= *from++;
288
0
      len -= 3;
289
0
    }
290
0
    else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
291
0
    {
292
0
      *to = *from++ << 8;
293
0
      *to |= *from++;
294
0
      len -= 2;
295
0
    }
296
0
    else
297
0
    {
298
0
      *to = *from++;
299
0
      len--;
300
0
    }
301
0
    to++;
302
0
    cnt++;
303
0
  }
304
0
  *to = 0;
305
0
  return cnt;
306
0
}
307
308
static int
309
pg_euctw_mblen(const unsigned char *s)
310
0
{
311
0
  int     len;
312
313
0
  if (*s == SS2)
314
0
    len = 4;
315
0
  else if (*s == SS3)
316
0
    len = 3;
317
0
  else if (IS_HIGHBIT_SET(*s))
318
0
    len = 2;
319
0
  else
320
0
    len = 1;
321
0
  return len;
322
0
}
323
324
static int
325
pg_euctw_dsplen(const unsigned char *s)
326
0
{
327
0
  int     len;
328
329
0
  if (*s == SS2)
330
0
    len = 2;
331
0
  else if (*s == SS3)
332
0
    len = 2;
333
0
  else if (IS_HIGHBIT_SET(*s))
334
0
    len = 2;
335
0
  else
336
0
    len = pg_ascii_dsplen(s);
337
0
  return len;
338
0
}
339
340
/*
341
 * Convert pg_wchar to EUC_* encoding.
342
 * caller must allocate enough space for "to", including a trailing zero!
343
 * len: length of from.
344
 * "from" not necessarily null terminated.
345
 */
346
static int
347
pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
348
0
{
349
0
  int     cnt = 0;
350
351
0
  while (len > 0 && *from)
352
0
  {
353
0
    unsigned char c;
354
355
0
    if ((c = (*from >> 24)))
356
0
    {
357
0
      *to++ = c;
358
0
      *to++ = (*from >> 16) & 0xff;
359
0
      *to++ = (*from >> 8) & 0xff;
360
0
      *to++ = *from & 0xff;
361
0
      cnt += 4;
362
0
    }
363
0
    else if ((c = (*from >> 16)))
364
0
    {
365
0
      *to++ = c;
366
0
      *to++ = (*from >> 8) & 0xff;
367
0
      *to++ = *from & 0xff;
368
0
      cnt += 3;
369
0
    }
370
0
    else if ((c = (*from >> 8)))
371
0
    {
372
0
      *to++ = c;
373
0
      *to++ = *from & 0xff;
374
0
      cnt += 2;
375
0
    }
376
0
    else
377
0
    {
378
0
      *to++ = *from;
379
0
      cnt++;
380
0
    }
381
0
    from++;
382
0
    len--;
383
0
  }
384
0
  *to = 0;
385
0
  return cnt;
386
0
}
387
388
389
/*
390
 * JOHAB
391
 */
392
/* JOHAB character byte length: uses the generic EUC rule. */
static int
pg_johab_mblen(const unsigned char *s)
{
  int     nbytes = pg_euc_mblen(s);

  return nbytes;
}
397
398
/* JOHAB display width: uses the generic EUC rule. */
static int
pg_johab_dsplen(const unsigned char *s)
{
  int     width = pg_euc_dsplen(s);

  return width;
}
403
404
/*
405
 * convert UTF8 string to pg_wchar (UCS-4)
406
 * caller must allocate enough space for "to", including a trailing zero!
407
 * len: length of from.
408
 * "from" not necessarily null terminated.
409
 */
410
static int
411
pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
412
2.42k
{
413
2.42k
  int     cnt = 0;
414
2.42k
  uint32    c1,
415
2.42k
        c2,
416
2.42k
        c3,
417
2.42k
        c4;
418
419
363k
  while (len > 0 && *from)
420
361k
  {
421
361k
    if ((*from & 0x80) == 0)
422
361k
    {
423
361k
      *to = *from++;
424
361k
      len--;
425
361k
    }
426
0
    else if ((*from & 0xe0) == 0xc0)
427
0
    {
428
0
      if (len < 2)
429
0
        break;     /* drop trailing incomplete char */
430
0
      c1 = *from++ & 0x1f;
431
0
      c2 = *from++ & 0x3f;
432
0
      *to = (c1 << 6) | c2;
433
0
      len -= 2;
434
0
    }
435
0
    else if ((*from & 0xf0) == 0xe0)
436
0
    {
437
0
      if (len < 3)
438
0
        break;     /* drop trailing incomplete char */
439
0
      c1 = *from++ & 0x0f;
440
0
      c2 = *from++ & 0x3f;
441
0
      c3 = *from++ & 0x3f;
442
0
      *to = (c1 << 12) | (c2 << 6) | c3;
443
0
      len -= 3;
444
0
    }
445
0
    else if ((*from & 0xf8) == 0xf0)
446
0
    {
447
0
      if (len < 4)
448
0
        break;     /* drop trailing incomplete char */
449
0
      c1 = *from++ & 0x07;
450
0
      c2 = *from++ & 0x3f;
451
0
      c3 = *from++ & 0x3f;
452
0
      c4 = *from++ & 0x3f;
453
0
      *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
454
0
      len -= 4;
455
0
    }
456
0
    else
457
0
    {
458
      /* treat a bogus char as length 1; not ours to raise error */
459
0
      *to = *from++;
460
0
      len--;
461
0
    }
462
361k
    to++;
463
361k
    cnt++;
464
361k
  }
465
2.42k
  *to = 0;
466
2.42k
  return cnt;
467
2.42k
}
468
469
470
/*
471
 * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
472
 * space allocated.
473
 */
474
unsigned char *
475
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
476
1.50k
{
477
1.50k
  if (c <= 0x7F)
478
1.50k
  {
479
1.50k
    utf8string[0] = c;
480
1.50k
  }
481
0
  else if (c <= 0x7FF)
482
0
  {
483
0
    utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
484
0
    utf8string[1] = 0x80 | (c & 0x3F);
485
0
  }
486
0
  else if (c <= 0xFFFF)
487
0
  {
488
0
    utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
489
0
    utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
490
0
    utf8string[2] = 0x80 | (c & 0x3F);
491
0
  }
492
0
  else
493
0
  {
494
0
    utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
495
0
    utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
496
0
    utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
497
0
    utf8string[3] = 0x80 | (c & 0x3F);
498
0
  }
499
500
1.50k
  return utf8string;
501
1.50k
}
502
503
/*
504
 * Trivial conversion from pg_wchar to UTF-8.
505
 * caller should allocate enough space for "to"
506
 * len: length of from.
507
 * "from" not necessarily null terminated.
508
 */
509
static int
510
pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
511
618
{
512
618
  int     cnt = 0;
513
514
2.12k
  while (len > 0 && *from)
515
1.50k
  {
516
1.50k
    int     char_len;
517
518
1.50k
    unicode_to_utf8(*from, to);
519
1.50k
    char_len = pg_utf_mblen(to);
520
1.50k
    cnt += char_len;
521
1.50k
    to += char_len;
522
1.50k
    from++;
523
1.50k
    len--;
524
1.50k
  }
525
618
  *to = 0;
526
618
  return cnt;
527
618
}
528
529
/*
530
 * Return the byte length of a UTF8 character pointed to by s
531
 *
532
 * Note: in the current implementation we do not support UTF8 sequences
533
 * of more than 4 bytes; hence do NOT return a value larger than 4.
534
 * We return "1" for any leading byte that is either flat-out illegal or
535
 * indicates a length larger than we support.
536
 *
537
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
538
 * other places would need to be fixed to change this.
539
 */
540
/*
 * Byte length of the UTF-8 character starting at *s, judged from the
 * lead byte alone.  Returns 1..4; a lead byte that is not a recognized
 * 1- to 4-byte pattern yields 1.  (The 5- and 6-byte cases are compiled
 * only under NOT_USED.)
 */
int
pg_utf_mblen(const unsigned char *s)
{
  const unsigned char lead = *s;

  if ((lead & 0x80) == 0)
    return 1;
  if ((lead & 0xe0) == 0xc0)
    return 2;
  if ((lead & 0xf0) == 0xe0)
    return 3;
  if ((lead & 0xf8) == 0xf0)
    return 4;
#ifdef NOT_USED
  if ((lead & 0xfc) == 0xf8)
    return 5;
  if ((lead & 0xfe) == 0xfc)
    return 6;
#endif

  return 1;
}
563
564
/*
565
 * This is an implementation of wcwidth() and wcswidth() as defined in
566
 * "The Single UNIX Specification, Version 2, The Open Group, 1997"
567
 * <http://www.UNIX-systems.org/online.html>
568
 *
569
 * Markus Kuhn -- 2001-09-08 -- public domain
570
 *
571
 * customised for PostgreSQL
572
 *
573
 * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
574
 */
575
576
/* One inclusive code point range [first, last] of an interval table. */
struct mbinterval
{
  unsigned short first;   /* lowest code point in the range */
  unsigned short last;    /* highest code point in the range */
};
581
582
/* auxiliary function for binary search in interval table */
583
static int
584
mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
585
610k
{
586
610k
  int     min = 0;
587
610k
  int     mid;
588
589
610k
  if (ucs < table[0].first || ucs > table[max].last)
590
610k
    return 0;
591
0
  while (max >= min)
592
0
  {
593
0
    mid = (min + max) / 2;
594
0
    if (ucs > table[mid].last)
595
0
      min = mid + 1;
596
0
    else if (ucs < table[mid].first)
597
0
      max = mid - 1;
598
0
    else
599
0
      return 1;
600
0
  }
601
602
0
  return 0;
603
0
}
604
605
606
/* The following functions define the column width of an ISO 10646
607
 * character as follows:
608
 *
609
 *    - The null character (U+0000) has a column width of 0.
610
 *
611
 *    - Other C0/C1 control characters and DEL will lead to a return
612
 *    value of -1.
613
 *
614
 *    - Non-spacing and enclosing combining characters (general
615
 *    category code Mn or Me in the Unicode database) have a
616
 *    column width of 0.
617
 *
618
 *    - Other format characters (general category code Cf in the Unicode
619
 *    database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
620
 *
621
 *    - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
622
 *    have a column width of 0.
623
 *
624
 *    - Spacing characters in the East Asian Wide (W) or East Asian
625
 *    FullWidth (F) category as defined in Unicode Technical
626
 *    Report #11 have a column width of 2.
627
 *
628
 *    - All remaining characters (including all printable
629
 *    ISO 8859-1 and WGL4 characters, Unicode control characters,
630
 *    etc.) have a column width of 1.
631
 *
632
 * This implementation assumes that wchar_t characters are encoded
633
 * in ISO 10646.
634
 */
635
636
static int
637
ucs_wcwidth(pg_wchar ucs)
638
611k
{
639
  /* sorted list of non-overlapping intervals of non-spacing characters */
640
611k
  static const struct mbinterval combining[] = {
641
611k
    {0x0300, 0x034E}, {0x0360, 0x0362}, {0x0483, 0x0486},
642
611k
    {0x0488, 0x0489}, {0x0591, 0x05A1}, {0x05A3, 0x05B9},
643
611k
    {0x05BB, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2},
644
611k
    {0x05C4, 0x05C4}, {0x064B, 0x0655}, {0x0670, 0x0670},
645
611k
    {0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED},
646
611k
    {0x070F, 0x070F}, {0x0711, 0x0711}, {0x0730, 0x074A},
647
611k
    {0x07A6, 0x07B0}, {0x0901, 0x0902}, {0x093C, 0x093C},
648
611k
    {0x0941, 0x0948}, {0x094D, 0x094D}, {0x0951, 0x0954},
649
611k
    {0x0962, 0x0963}, {0x0981, 0x0981}, {0x09BC, 0x09BC},
650
611k
    {0x09C1, 0x09C4}, {0x09CD, 0x09CD}, {0x09E2, 0x09E3},
651
611k
    {0x0A02, 0x0A02}, {0x0A3C, 0x0A3C}, {0x0A41, 0x0A42},
652
611k
    {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A70, 0x0A71},
653
611k
    {0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC5},
654
611k
    {0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD}, {0x0B01, 0x0B01},
655
611k
    {0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43},
656
611k
    {0x0B4D, 0x0B4D}, {0x0B56, 0x0B56}, {0x0B82, 0x0B82},
657
611k
    {0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40},
658
611k
    {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56},
659
611k
    {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
660
611k
    {0x0D41, 0x0D43}, {0x0D4D, 0x0D4D}, {0x0DCA, 0x0DCA},
661
611k
    {0x0DD2, 0x0DD4}, {0x0DD6, 0x0DD6}, {0x0E31, 0x0E31},
662
611k
    {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1},
663
611k
    {0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC}, {0x0EC8, 0x0ECD},
664
611k
    {0x0F18, 0x0F19}, {0x0F35, 0x0F35}, {0x0F37, 0x0F37},
665
611k
    {0x0F39, 0x0F39}, {0x0F71, 0x0F7E}, {0x0F80, 0x0F84},
666
611k
    {0x0F86, 0x0F87}, {0x0F90, 0x0F97}, {0x0F99, 0x0FBC},
667
611k
    {0x0FC6, 0x0FC6}, {0x102D, 0x1030}, {0x1032, 0x1032},
668
611k
    {0x1036, 0x1037}, {0x1039, 0x1039}, {0x1058, 0x1059},
669
611k
    {0x1160, 0x11FF}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6},
670
611k
    {0x17C9, 0x17D3}, {0x180B, 0x180E}, {0x18A9, 0x18A9},
671
611k
    {0x200B, 0x200F}, {0x202A, 0x202E}, {0x206A, 0x206F},
672
611k
    {0x20D0, 0x20E3}, {0x302A, 0x302F}, {0x3099, 0x309A},
673
611k
    {0xFB1E, 0xFB1E}, {0xFE20, 0xFE23}, {0xFEFF, 0xFEFF},
674
611k
    {0xFFF9, 0xFFFB}
675
611k
  };
676
677
  /* test for 8-bit control characters */
678
611k
  if (ucs == 0)
679
0
    return 0;
680
681
611k
  if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
682
125
    return -1;
683
684
  /* binary search in table of non-spacing characters */
685
610k
  if (mbbisearch(ucs, combining,
686
610k
           sizeof(combining) / sizeof(struct mbinterval) - 1))
687
0
    return 0;
688
689
  /*
690
   * if we arrive here, ucs is not a combining or C0/C1 control character
691
   */
692
693
610k
  return 1 +
694
610k
    (ucs >= 0x1100 &&
695
0
     (ucs <= 0x115f ||   /* Hangul Jamo init. consonants */
696
0
      (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
697
0
       ucs != 0x303f) ||  /* CJK ... Yi */
698
0
      (ucs >= 0xac00 && ucs <= 0xd7a3) ||  /* Hangul Syllables */
699
0
      (ucs >= 0xf900 && ucs <= 0xfaff) ||  /* CJK Compatibility
700
                         * Ideographs */
701
0
      (ucs >= 0xfe30 && ucs <= 0xfe6f) ||  /* CJK Compatibility Forms */
702
0
      (ucs >= 0xff00 && ucs <= 0xff5f) ||  /* Fullwidth Forms */
703
0
      (ucs >= 0xffe0 && ucs <= 0xffe6) ||
704
0
      (ucs >= 0x20000 && ucs <= 0x2ffff)));
705
610k
}
706
707
/*
708
 * Convert a UTF-8 character to a Unicode code point.
709
 * This is a one-character version of pg_utf2wchar_with_len.
710
 *
711
 * No error checks here, c must point to a long-enough string.
712
 */
713
pg_wchar
714
utf8_to_unicode(const unsigned char *c)
715
611k
{
716
611k
  if ((*c & 0x80) == 0)
717
611k
    return (pg_wchar) c[0];
718
0
  else if ((*c & 0xe0) == 0xc0)
719
0
    return (pg_wchar) (((c[0] & 0x1f) << 6) |
720
0
               (c[1] & 0x3f));
721
0
  else if ((*c & 0xf0) == 0xe0)
722
0
    return (pg_wchar) (((c[0] & 0x0f) << 12) |
723
0
               ((c[1] & 0x3f) << 6) |
724
0
               (c[2] & 0x3f));
725
0
  else if ((*c & 0xf8) == 0xf0)
726
0
    return (pg_wchar) (((c[0] & 0x07) << 18) |
727
0
               ((c[1] & 0x3f) << 12) |
728
0
               ((c[2] & 0x3f) << 6) |
729
0
               (c[3] & 0x3f));
730
0
  else
731
    /* that is an invalid code on purpose */
732
0
    return 0xffffffff;
733
611k
}
734
735
static int
736
pg_utf_dsplen(const unsigned char *s)
737
611k
{
738
611k
  return ucs_wcwidth(utf8_to_unicode(s));
739
611k
}
740
741
/*
742
 * convert mule internal code to pg_wchar
743
 * caller should allocate enough space for "to"
744
 * len: length of from.
745
 * "from" not necessarily null terminated.
746
 */
747
static int
748
pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
749
0
{
750
0
  int     cnt = 0;
751
752
0
  while (len > 0 && *from)
753
0
  {
754
0
    if (IS_LC1(*from) && len >= 2)
755
0
    {
756
0
      *to = *from++ << 16;
757
0
      *to |= *from++;
758
0
      len -= 2;
759
0
    }
760
0
    else if (IS_LCPRV1(*from) && len >= 3)
761
0
    {
762
0
      from++;
763
0
      *to = *from++ << 16;
764
0
      *to |= *from++;
765
0
      len -= 3;
766
0
    }
767
0
    else if (IS_LC2(*from) && len >= 3)
768
0
    {
769
0
      *to = *from++ << 16;
770
0
      *to |= *from++ << 8;
771
0
      *to |= *from++;
772
0
      len -= 3;
773
0
    }
774
0
    else if (IS_LCPRV2(*from) && len >= 4)
775
0
    {
776
0
      from++;
777
0
      *to = *from++ << 16;
778
0
      *to |= *from++ << 8;
779
0
      *to |= *from++;
780
0
      len -= 4;
781
0
    }
782
0
    else
783
0
    {           /* assume ASCII */
784
0
      *to = (unsigned char) *from++;
785
0
      len--;
786
0
    }
787
0
    to++;
788
0
    cnt++;
789
0
  }
790
0
  *to = 0;
791
0
  return cnt;
792
0
}
793
794
/*
795
 * convert pg_wchar to mule internal code
796
 * caller should allocate enough space for "to"
797
 * len: length of from.
798
 * "from" not necessarily null terminated.
799
 */
800
static int
801
pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
802
0
{
803
0
  int     cnt = 0;
804
805
0
  while (len > 0 && *from)
806
0
  {
807
0
    unsigned char lb;
808
809
0
    lb = (*from >> 16) & 0xff;
810
0
    if (IS_LC1(lb))
811
0
    {
812
0
      *to++ = lb;
813
0
      *to++ = *from & 0xff;
814
0
      cnt += 2;
815
0
    }
816
0
    else if (IS_LC2(lb))
817
0
    {
818
0
      *to++ = lb;
819
0
      *to++ = (*from >> 8) & 0xff;
820
0
      *to++ = *from & 0xff;
821
0
      cnt += 3;
822
0
    }
823
0
    else if (IS_LCPRV1_A_RANGE(lb))
824
0
    {
825
0
      *to++ = LCPRV1_A;
826
0
      *to++ = lb;
827
0
      *to++ = *from & 0xff;
828
0
      cnt += 3;
829
0
    }
830
0
    else if (IS_LCPRV1_B_RANGE(lb))
831
0
    {
832
0
      *to++ = LCPRV1_B;
833
0
      *to++ = lb;
834
0
      *to++ = *from & 0xff;
835
0
      cnt += 3;
836
0
    }
837
0
    else if (IS_LCPRV2_A_RANGE(lb))
838
0
    {
839
0
      *to++ = LCPRV2_A;
840
0
      *to++ = lb;
841
0
      *to++ = (*from >> 8) & 0xff;
842
0
      *to++ = *from & 0xff;
843
0
      cnt += 4;
844
0
    }
845
0
    else if (IS_LCPRV2_B_RANGE(lb))
846
0
    {
847
0
      *to++ = LCPRV2_B;
848
0
      *to++ = lb;
849
0
      *to++ = (*from >> 8) & 0xff;
850
0
      *to++ = *from & 0xff;
851
0
      cnt += 4;
852
0
    }
853
0
    else
854
0
    {
855
0
      *to++ = *from & 0xff;
856
0
      cnt += 1;
857
0
    }
858
0
    from++;
859
0
    len--;
860
0
  }
861
0
  *to = 0;
862
0
  return cnt;
863
0
}
864
865
/*
 * Byte length of a MULE internal code character from its first byte:
 * LC1 -> 2, LCPRV1 -> 3, LC2 -> 3, LCPRV2 -> 4, anything else -> 1.
 */
int
pg_mule_mblen(const unsigned char *s)
{
  if (IS_LC1(*s))
    return 2;
  if (IS_LCPRV1(*s))
    return 3;
  if (IS_LC2(*s))
    return 3;
  if (IS_LCPRV2(*s))
    return 4;

  return 1;         /* assume ASCII */
}
882
883
/*
 * Display width of a MULE internal code character: 1 for LC1 and LCPRV1,
 * 2 for LC2 and LCPRV2, 1 otherwise.
 *
 * Note: it's not really appropriate to assume that all multibyte charsets
 * are double-wide on screen.  But this seems an okay approximation for
 * the MULE charsets we currently support.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
  if (IS_LC1(*s))
    return 1;
  if (IS_LCPRV1(*s))
    return 1;
  if (IS_LC2(*s))
    return 2;
  if (IS_LCPRV2(*s))
    return 2;

  return 1;         /* assume ASCII */
}
907
908
/*
909
 * ISO8859-1
910
 */
911
static int
912
pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
913
0
{
914
0
  int     cnt = 0;
915
916
0
  while (len > 0 && *from)
917
0
  {
918
0
    *to++ = *from++;
919
0
    len--;
920
0
    cnt++;
921
0
  }
922
0
  *to = 0;
923
0
  return cnt;
924
0
}
925
926
/*
927
 * Trivial conversion from pg_wchar to single byte encoding. Just ignores
928
 * high bits.
929
 * caller should allocate enough space for "to"
930
 * len: length of from.
931
 * "from" not necessarily null terminated.
932
 */
933
static int
934
pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
935
0
{
936
0
  int     cnt = 0;
937
938
0
  while (len > 0 && *from)
939
0
  {
940
0
    *to++ = *from++;
941
0
    len--;
942
0
    cnt++;
943
0
  }
944
0
  *to = 0;
945
0
  return cnt;
946
0
}
947
948
/* Byte length of a character in a single-byte encoding: always 1; "s" is not read. */
static int
pg_latin1_mblen(const unsigned char *s)
{
  return 1;
}
953
954
/* Display width for a single-byte encoding: same rule as ASCII. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
  int     width = pg_ascii_dsplen(s);

  return width;
}
959
960
/*
961
 * SJIS
962
 */
963
/*
 * Byte length of an SJIS character from its first byte: 1 for
 * 0xa1..0xdf (1 byte kana?), 2 for any other high-bit byte (kanji?),
 * 1 otherwise (should be ASCII).
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
  const unsigned char lead = *s;

  if (lead >= 0xa1 && lead <= 0xdf)
    return 1;

  return IS_HIGHBIT_SET(lead) ? 2 : 1;
}
976
977
/*
 * Display width of an SJIS character: 1 for 0xa1..0xdf (1 byte kana?),
 * 2 for any other high-bit byte (kanji?), else the ASCII rule.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
  const unsigned char lead = *s;

  if (lead >= 0xa1 && lead <= 0xdf)
    return 1;
  if (IS_HIGHBIT_SET(lead))
    return 2;

  return pg_ascii_dsplen(s);
}
990
991
/*
992
 * Big5
993
 */
994
/* Big5 character byte length: 2 when the first byte has its high bit set, else 1. */
static int
pg_big5_mblen(const unsigned char *s)
{
  return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1005
1006
/* Big5 display width: 2 for a high-bit lead byte, else the ASCII rule. */
static int
pg_big5_dsplen(const unsigned char *s)
{
  if (IS_HIGHBIT_SET(*s))
    return 2;

  return pg_ascii_dsplen(s);
}
1017
1018
/*
1019
 * GBK
1020
 */
1021
/* GBK character byte length: 2 when the first byte has its high bit set, else 1. */
static int
pg_gbk_mblen(const unsigned char *s)
{
  return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1032
1033
/* GBK display width: 2 for a high-bit lead byte, else the ASCII rule. */
static int
pg_gbk_dsplen(const unsigned char *s)
{
  if (IS_HIGHBIT_SET(*s))
    return 2;

  return pg_ascii_dsplen(s);
}
1044
1045
/*
1046
 * UHC
1047
 */
1048
/* UHC character byte length: 2 when the first byte has its high bit set, else 1. */
static int
pg_uhc_mblen(const unsigned char *s)
{
  return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
1059
1060
/* UHC display width: 2 for a high-bit lead byte, else the ASCII rule. */
static int
pg_uhc_dsplen(const unsigned char *s)
{
  if (IS_HIGHBIT_SET(*s))
    return 2;

  return pg_ascii_dsplen(s);
}
1071
1072
/*
1073
 * GB18030
1074
 *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
1075
 */
1076
/*
 * Byte length of a GB18030 character: 1 when the first byte has no high
 * bit (ASCII); otherwise 4 when the second byte is in 0x30..0x39, else 2.
 * Note that the second byte is examined whenever the first has its high
 * bit set.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
  if (!IS_HIGHBIT_SET(s[0]))
    return 1;       /* ASCII */

  return (s[1] >= 0x30 && s[1] <= 0x39) ? 4 : 2;
}
1089
1090
/* GB18030 display width: 2 for a high-bit lead byte, else the ASCII rule. */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
  if (IS_HIGHBIT_SET(*s))
    return 2;

  return pg_ascii_dsplen(s);  /* ASCII */
}
1101
1102
/*
1103
 *-------------------------------------------------------------------
1104
 * multibyte sequence validators
1105
 *
1106
 * These functions accept "s", a pointer to the first byte of a string,
1107
 * and "len", the remaining length of the string.  If there is a validly
1108
 * encoded character beginning at *s, return its length in bytes; else
1109
 * return -1.
1110
 *
1111
 * The functions can assume that len > 0 and that *s != '\0', but they must
1112
 * test for and reject zeroes in any additional bytes of a multibyte character.
1113
 *
1114
 * Note that this definition allows the function for a single-byte
1115
 * encoding to be just "return 1".
1116
 *-------------------------------------------------------------------
1117
 */
1118
1119
/* Verifier for a single-byte encoding: accepts any byte as a 1-byte character. */
static int
pg_ascii_verifier(const unsigned char *s, int len)
{
  return 1;
}
1124
1125
0
/* True when byte value c lies in 0xa1..0xfe; note that c is evaluated twice. */
#define IS_EUC_RANGE_VALID(c) ((c) >= 0xa1 && (c) <= 0xfe)
1126
1127
static int
1128
pg_eucjp_verifier(const unsigned char *s, int len)
1129
0
{
1130
0
  int     l;
1131
0
  unsigned char c1,
1132
0
        c2;
1133
1134
0
  c1 = *s++;
1135
1136
0
  switch (c1)
1137
0
  {
1138
0
    case SS2:       /* JIS X 0201 */
1139
0
      l = 2;
1140
0
      if (l > len)
1141
0
        return -1;
1142
0
      c2 = *s++;
1143
0
      if (c2 < 0xa1 || c2 > 0xdf)
1144
0
        return -1;
1145
0
      break;
1146
1147
0
    case SS3:       /* JIS X 0212 */
1148
0
      l = 3;
1149
0
      if (l > len)
1150
0
        return -1;
1151
0
      c2 = *s++;
1152
0
      if (!IS_EUC_RANGE_VALID(c2))
1153
0
        return -1;
1154
0
      c2 = *s++;
1155
0
      if (!IS_EUC_RANGE_VALID(c2))
1156
0
        return -1;
1157
0
      break;
1158
1159
0
    default:
1160
0
      if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1161
0
      {
1162
0
        l = 2;
1163
0
        if (l > len)
1164
0
          return -1;
1165
0
        if (!IS_EUC_RANGE_VALID(c1))
1166
0
          return -1;
1167
0
        c2 = *s++;
1168
0
        if (!IS_EUC_RANGE_VALID(c2))
1169
0
          return -1;
1170
0
      }
1171
0
      else
1172
        /* must be ASCII */
1173
0
      {
1174
0
        l = 1;
1175
0
      }
1176
0
      break;
1177
0
  }
1178
1179
0
  return l;
1180
0
}
1181
1182
/*
 * Validate one EUC_KR character at "s" with "len" bytes available.
 * Returns 1 for a byte without the high bit, 2 for a two-byte character
 * whose bytes are both in the EUC range, and -1 otherwise.
 */
static int
pg_euckr_verifier(const unsigned char *s, int len)
{
  const unsigned char c1 = s[0];

  if (!IS_HIGHBIT_SET(c1))
    return 1;       /* must be ASCII */

  if (len < 2)
    return -1;
  if (!IS_EUC_RANGE_VALID(c1))
    return -1;
  if (!IS_EUC_RANGE_VALID(s[1]))
    return -1;

  return 2;
}
1210
1211
/* EUC-CN byte sequences are exactly same as EUC-KR */
1212
#define pg_euccn_verifier pg_euckr_verifier
1213
1214
static int
1215
pg_euctw_verifier(const unsigned char *s, int len)
1216
0
{
1217
0
  int     l;
1218
0
  unsigned char c1,
1219
0
        c2;
1220
1221
0
  c1 = *s++;
1222
1223
0
  switch (c1)
1224
0
  {
1225
0
    case SS2:       /* CNS 11643 Plane 1-7 */
1226
0
      l = 4;
1227
0
      if (l > len)
1228
0
        return -1;
1229
0
      c2 = *s++;
1230
0
      if (c2 < 0xa1 || c2 > 0xa7)
1231
0
        return -1;
1232
0
      c2 = *s++;
1233
0
      if (!IS_EUC_RANGE_VALID(c2))
1234
0
        return -1;
1235
0
      c2 = *s++;
1236
0
      if (!IS_EUC_RANGE_VALID(c2))
1237
0
        return -1;
1238
0
      break;
1239
1240
0
    case SS3:       /* unused */
1241
0
      return -1;
1242
1243
0
    default:
1244
0
      if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
1245
0
      {
1246
0
        l = 2;
1247
0
        if (l > len)
1248
0
          return -1;
1249
        /* no further range check on c1? */
1250
0
        c2 = *s++;
1251
0
        if (!IS_EUC_RANGE_VALID(c2))
1252
0
          return -1;
1253
0
      }
1254
0
      else
1255
        /* must be ASCII */
1256
0
      {
1257
0
        l = 1;
1258
0
      }
1259
0
      break;
1260
0
  }
1261
0
  return l;
1262
0
}
1263
1264
/*
 * Verify the first JOHAB character at s, given that len bytes are available.
 *
 * The length comes from pg_johab_mblen().  If the lead byte has no high bit
 * set, that length is returned without looking at any further byte;
 * otherwise every trailing byte must lie in 0xa1..0xfe.  Returns the byte
 * length, or -1 if the character is truncated or a trailing byte is out of
 * range.
 */
static int
pg_johab_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_johab_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	if (!IS_HIGHBIT_SET(s[0]))
		return mbl;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_EUC_RANGE_VALID(s[i]))
			return -1;
	}

	return mbl;
}
1287
1288
/*
 * Verify the first MULE internal code character at s, given that len bytes
 * are available.
 *
 * The length comes from pg_mule_mblen(); every byte after the first must
 * have its high bit set.  Returns the byte length, or -1 if the character
 * is truncated or a trailing byte lacks the high bit.
 */
static int
pg_mule_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_mule_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (!IS_HIGHBIT_SET(s[i]))
			return -1;
	}

	return mbl;
}
1308
1309
/*
 * Verifier for single-byte encodings: every character is reported as one
 * byte long.  Neither s nor len is examined.
 */
static int
pg_latin1_verifier(const unsigned char *s, int len)
{
	const int	one_byte = 1;

	return one_byte;
}
1314
1315
/*
 * Verify the first Shift-JIS character at s, given that len bytes are
 * available.
 *
 * The length comes from pg_sjis_mblen().  A one-byte result is accepted
 * as-is; otherwise the first two bytes must satisfy ISSJISHEAD and
 * ISSJISTAIL respectively.  Returns the byte length, or -1 on failure.
 */
static int
pg_sjis_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_sjis_mblen(s);
	unsigned char head;
	unsigned char tail;

	if (mbl > len)
		return -1;

	/* pg_sjis_mblen already verified a one-byte character */
	if (mbl == 1)
		return mbl;

	head = s[0];
	tail = s[1];
	if (ISSJISHEAD(head) && ISSJISTAIL(tail))
		return mbl;

	return -1;
}
1337
1338
/*
 * Verify the first Big5 character at s, given that len bytes are available.
 *
 * The length comes from pg_big5_mblen(); the only further check is that no
 * trailing byte is NUL.  Returns the byte length, or -1 if the character
 * is truncated or contains an embedded NUL.
 */
static int
pg_big5_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_big5_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1357
1358
/*
 * Verify the first GBK character at s, given that len bytes are available.
 *
 * The length comes from pg_gbk_mblen(); the only further check is that no
 * trailing byte is NUL.  Returns the byte length, or -1 if the character
 * is truncated or contains an embedded NUL.
 */
static int
pg_gbk_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_gbk_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1377
1378
/*
 * Verify the first UHC character at s, given that len bytes are available.
 *
 * The length comes from pg_uhc_mblen(); the only further check is that no
 * trailing byte is NUL.  Returns the byte length, or -1 if the character
 * is truncated or contains an embedded NUL.
 */
static int
pg_uhc_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_uhc_mblen(s);
	int			i;

	if (mbl > len)
		return -1;

	for (i = 1; i < mbl; i++)
	{
		if (s[i] == '\0')
			return -1;
	}

	return mbl;
}
1397
1398
/*
 * Verify the first GB18030 character at s, given that len bytes are
 * available.
 *
 * Returns 1 for a byte without the high bit, 4 or 2 for a well-formed
 * four- or two-byte sequence, and -1 for anything else.  The second byte
 * is only read once len is known to cover it.
 */
static int
pg_gb18030_verifier(const unsigned char *s, int len)
{
	/* ASCII */
	if (!IS_HIGHBIT_SET(s[0]))
		return 1;

	if (len >= 4 && s[1] >= 0x30 && s[1] <= 0x39)
	{
		/* Should be 4-byte, validate remaining bytes */
		if (s[0] < 0x81 || s[0] > 0xfe)
			return -1;
		if (s[2] < 0x81 || s[2] > 0xfe)
			return -1;
		if (s[3] < 0x30 || s[3] > 0x39)
			return -1;
		return 4;
	}

	if (len >= 2 && s[0] >= 0x81 && s[0] <= 0xfe)
	{
		/* Should be 2-byte, validate */
		if (s[1] >= 0x40 && s[1] <= 0x7e)
			return 2;
		if (s[1] >= 0x80 && s[1] <= 0xfe)
			return 2;
		return -1;
	}

	return -1;
}
1428
1429
/*
 * Verify the first UTF-8 character at s, given that len bytes are available.
 *
 * The length comes from pg_utf_mblen(); the sequence is then checked with
 * pg_utf8_islegal(), but only after confirming that len covers it.  Returns
 * the byte length, or -1 if the character is truncated or not legal.
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	const int	mbl = pg_utf_mblen(s);

	if (mbl > len || !pg_utf8_islegal(s, mbl))
		return -1;

	return mbl;
}
1442
1443
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This directly implements the rules in RFC3629.  The tighter bounds on the
 * second byte after certain lead bytes exist so that a character cannot be
 * written with a longer-than-necessary sequence (which would be a security
 * hazard, e.g. an apparent non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Any length outside 1..4 is rejected (lengths 5 and 6 are rejected for now).
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	unsigned char lo;
	unsigned char hi;
	int			i;

	if (length < 1 || length > 4)
		return false;

	/* fourth and third bytes, in that order: plain continuation bytes */
	for (i = length - 1; i >= 2; i--)
	{
		if (source[i] < 0x80 || source[i] > 0xBF)
			return false;
	}

	/* second byte: permitted range depends on the lead byte */
	if (length >= 2)
	{
		lo = 0x80;
		hi = 0xBF;

		lead = source[0];
		if (lead == 0xE0)
			lo = 0xA0;
		else if (lead == 0xED)
			hi = 0x9F;
		else if (lead == 0xF0)
			lo = 0x90;
		else if (lead == 0xF4)
			hi = 0x8F;

		if (source[1] < lo || source[1] > hi)
			return false;
	}

	/* lead byte: 0x80..0xC1 and anything above 0xF4 are never valid */
	lead = source[0];
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}
1513
1514
#ifndef FRONTEND
1515
1516
/*
1517
 * Generic character incrementer function.
1518
 *
1519
 * Not knowing anything about the properties of the encoding in use, we just
1520
 * keep incrementing the last byte until we get a validly-encoded result,
1521
 * or we run out of values to try.  We don't bother to try incrementing
1522
 * higher-order bytes, so there's no growth in runtime for wider characters.
1523
 * (If we did try to do that, we'd need to consider the likelihood that 255
1524
 * is not a valid final byte in the encoding.)
1525
 */
1526
static bool
1527
pg_generic_charinc(unsigned char *charptr, int len)
1528
{
1529
  unsigned char *lastbyte = charptr + len - 1;
1530
  mbverifier  mbverify;
1531
1532
  /* We can just invoke the character verifier directly. */
1533
  mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
1534
1535
  while (*lastbyte < (unsigned char) 255)
1536
  {
1537
    (*lastbyte)++;
1538
    if ((*mbverify) (charptr, len) == len)
1539
      return true;
1540
  }
1541
1542
  return false;
1543
}
1544
1545
/*
 * UTF-8 character incrementer function.
 *
 * Working from the last byte towards the first, increment the first byte
 * found that is still below its ceiling and report success:
 *
 *	- fourth and third bytes may go up to 0xBF;
 *	- the second byte may go up to 0xBF, except after lead byte 0xED
 *	  (ceiling 0x9F, which skips the surrogate region) and after lead byte
 *	  0xF4 (ceiling 0x8F);
 *	- the lead byte is incremented unless it is 0x7F, 0xDF, 0xEF or 0xF4.
 *
 * If no byte can be incremented, or length is outside 1..4, return false.
 *
 * Note that lower-order bytes are not reset back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char lead;
	unsigned char ceiling;
	int			pos;

	/* reject lengths 5 and 6 for now (and anything else outside 1..4) */
	if (length < 1 || length > 4)
		return false;

	/* fourth byte, then third byte */
	for (pos = length - 1; pos >= 2; pos--)
	{
		if (charptr[pos] < 0xBF)
		{
			charptr[pos]++;
			return true;
		}
	}

	/* second byte */
	if (length >= 2)
	{
		lead = charptr[0];
		if (lead == 0xED)
			ceiling = 0x9F;
		else if (lead == 0xF4)
			ceiling = 0x8F;
		else
			ceiling = 0xBF;

		if (charptr[1] < ceiling)
		{
			charptr[1]++;
			return true;
		}
	}

	/* lead byte */
	lead = charptr[0];
	if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
		return false;

	charptr[0]++;
	return true;
}
1617
1618
/*
1619
 * EUC-JP character incrementer function.
1620
 *
1621
 * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
1622
 * representing JIS X 0201 characters with the second byte ranging between
1623
 * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
1624
 * and otherwise rewrite the whole sequence to 0xa1 0xa1.
1625
 *
1626
 * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
1627
 * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
1628
 * is incremented if possible, otherwise the second-to-last byte.
1629
 *
1630
 * If the sequence starts with a value other than the above and its MSB
1631
 * is set, it must be a two-byte sequence representing JIS X 0208 characters
1632
 * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
1633
 * incremented if possible, otherwise the second-to-last byte.
1634
 *
1635
 * Otherwise, the sequence is a single-byte ASCII character. It is
1636
 * incremented up to 0x7f.
1637
 */
1638
static bool
1639
pg_eucjp_increment(unsigned char *charptr, int length)
1640
{
1641
  unsigned char c1,
1642
        c2;
1643
  int     i;
1644
1645
  c1 = *charptr;
1646
1647
  switch (c1)
1648
  {
1649
    case SS2:       /* JIS X 0201 */
1650
      if (length != 2)
1651
        return false;
1652
1653
      c2 = charptr[1];
1654
1655
      if (c2 >= 0xdf)
1656
        charptr[0] = charptr[1] = 0xa1;
1657
      else if (c2 < 0xa1)
1658
        charptr[1] = 0xa1;
1659
      else
1660
        charptr[1]++;
1661
      break;
1662
1663
    case SS3:       /* JIS X 0212 */
1664
      if (length != 3)
1665
        return false;
1666
1667
      for (i = 2; i > 0; i--)
1668
      {
1669
        c2 = charptr[i];
1670
        if (c2 < 0xa1)
1671
        {
1672
          charptr[i] = 0xa1;
1673
          return true;
1674
        }
1675
        else if (c2 < 0xfe)
1676
        {
1677
          charptr[i]++;
1678
          return true;
1679
        }
1680
      }
1681
1682
      /* Out of 3-byte code region */
1683
      return false;
1684
1685
    default:
1686
      if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1687
      {
1688
        if (length != 2)
1689
          return false;
1690
1691
        for (i = 1; i >= 0; i--)
1692
        {
1693
          c2 = charptr[i];
1694
          if (c2 < 0xa1)
1695
          {
1696
            charptr[i] = 0xa1;
1697
            return true;
1698
          }
1699
          else if (c2 < 0xfe)
1700
          {
1701
            charptr[i]++;
1702
            return true;
1703
          }
1704
        }
1705
1706
        /* Out of 2 byte code region */
1707
        return false;
1708
      }
1709
      else
1710
      {         /* ASCII, single byte */
1711
        if (c1 > 0x7e)
1712
          return false;
1713
        (*charptr)++;
1714
      }
1715
      break;
1716
  }
1717
1718
  return true;
1719
}
1720
#endif              /* !FRONTEND */
1721
1722
1723
/*
1724
 *-------------------------------------------------------------------
1725
 * encoding info table
1726
 * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
1727
 *-------------------------------------------------------------------
1728
 */
1729
const pg_wchar_tbl pg_wchar_table[] = {
1730
  {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
1731
  {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},  /* PG_EUC_JP */
1732
  {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2},  /* PG_EUC_CN */
1733
  {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},  /* PG_EUC_KR */
1734
  {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4},  /* PG_EUC_TW */
1735
  {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},  /* PG_EUC_JIS_2004 */
1736
  {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4}, /* PG_UTF8 */
1737
  {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4}, /* PG_MULE_INTERNAL */
1738
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
1739
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
1740
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
1741
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
1742
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
1743
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
1744
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
1745
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
1746
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
1747
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
1748
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
1749
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
1750
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
1751
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
1752
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
1753
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
1754
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
1755
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
1756
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
1757
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
1758
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
1759
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
1760
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
1761
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
1762
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
1763
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
1764
  {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
1765
  {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
1766
  {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
1767
  {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},  /* PG_GBK */
1768
  {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},  /* PG_UHC */
1769
  {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4},  /* PG_GB18030 */
1770
  {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3},  /* PG_JOHAB */
1771
  {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}  /* PG_SHIFT_JIS_2004 */
1772
};
1773
1774
/*
 * Returns the byte length of a character in mule internal code; this simply
 * forwards to pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
	const int	nbytes = pg_mule_mblen(mbstr);

	return nbytes;
}
1780
1781
/*
1782
 * Returns the byte length of a multibyte character.
1783
 */
1784
int
1785
pg_encoding_mblen(int encoding, const char *mbstr)
1786
611k
{
1787
611k
  return (PG_VALID_ENCODING(encoding) ?
1788
611k
      pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
1789
0
      pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
1790
611k
}
1791
1792
/*
1793
 * Returns the display length of a multibyte character.
1794
 */
1795
int
1796
pg_encoding_dsplen(int encoding, const char *mbstr)
1797
611k
{
1798
611k
  return (PG_VALID_ENCODING(encoding) ?
1799
611k
      pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
1800
0
      pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
1801
611k
}
1802
1803
/*
1804
 * Verify the first multibyte character of the given string.
1805
 * Return its byte length if good, -1 if bad.  (See comments above for
1806
 * full details of the mbverify API.)
1807
 */
1808
int
1809
pg_encoding_verifymb(int encoding, const char *mbstr, int len)
1810
0
{
1811
0
  return (PG_VALID_ENCODING(encoding) ?
1812
0
      pg_wchar_table[encoding].mbverify((const unsigned char *) mbstr, len) :
1813
0
      pg_wchar_table[PG_SQL_ASCII].mbverify((const unsigned char *) mbstr, len));
1814
0
}
1815
1816
/*
1817
 * fetch maximum length of a given encoding
1818
 */
1819
int
1820
pg_encoding_max_length(int encoding)
1821
89
{
1822
89
  Assert(PG_VALID_ENCODING(encoding));
1823
1824
89
  return pg_wchar_table[encoding].maxmblen;
1825
89
}
1826
1827
#ifndef FRONTEND
1828
1829
/*
1830
 * fetch maximum length of the encoding for the current database
1831
 */
1832
int
1833
pg_database_encoding_max_length(void)
1834
{
1835
  return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
1836
}
1837
1838
/*
1839
 * get the character incrementer for the encoding for the current database
1840
 */
1841
mbcharacter_incrementer
1842
pg_database_encoding_character_incrementer(void)
1843
{
1844
  /*
1845
   * Eventually it might be best to add a field to pg_wchar_table[], but for
1846
   * now we just use a switch.
1847
   */
1848
  switch (GetDatabaseEncoding())
1849
  {
1850
    case PG_UTF8:
1851
      return pg_utf8_increment;
1852
1853
    case PG_EUC_JP:
1854
      return pg_eucjp_increment;
1855
1856
    default:
1857
      return pg_generic_charinc;
1858
  }
1859
}
1860
1861
/*
 * Verify that mbstr is validly encoded in the current database encoding.
 *
 * Thin wrapper around pg_verify_mbstr_len() with GetDatabaseEncoding() as
 * the encoding: returns true when that function returns a non-negative
 * count.
 */
bool
pg_verifymbstr(const char *mbstr, int len, bool noError)
{
	const int	nchars = pg_verify_mbstr_len(GetDatabaseEncoding(),
											 mbstr, len, noError);

	return nchars >= 0;
}
1871
1872
/*
 * Verify that mbstr is validly encoded in the specified encoding.
 *
 * Thin wrapper around pg_verify_mbstr_len(): returns true when that
 * function returns a non-negative count.
 */
bool
pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
{
	const int	nchars = pg_verify_mbstr_len(encoding, mbstr, len, noError);

	return nchars >= 0;
}
1881
1882
/*
1883
 * Verify mbstr to make sure that it is validly encoded in the specified
1884
 * encoding.
1885
 *
1886
 * mbstr is not necessarily zero terminated; length of mbstr is
1887
 * specified by len.
1888
 *
1889
 * If OK, return length of string in the encoding.
1890
 * If a problem is found, return -1 when noError is
1891
 * true; when noError is false, ereport() a descriptive message.
1892
 */
1893
int
1894
pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
1895
{
1896
  mbverifier  mbverify;
1897
  int     mb_len;
1898
1899
  Assert(PG_VALID_ENCODING(encoding));
1900
1901
  /*
1902
   * In single-byte encodings, we need only reject nulls (\0).
1903
   */
1904
  if (pg_encoding_max_length(encoding) <= 1)
1905
  {
1906
    const char *nullpos = memchr(mbstr, 0, len);
1907
1908
    if (nullpos == NULL)
1909
      return len;
1910
    if (noError)
1911
      return -1;
1912
    report_invalid_encoding(encoding, nullpos, 1);
1913
  }
1914
1915
  /* fetch function pointer just once */
1916
  mbverify = pg_wchar_table[encoding].mbverify;
1917
1918
  mb_len = 0;
1919
1920
  while (len > 0)
1921
  {
1922
    int     l;
1923
1924
    /* fast path for ASCII-subset characters */
1925
    if (!IS_HIGHBIT_SET(*mbstr))
1926
    {
1927
      if (*mbstr != '\0')
1928
      {
1929
        mb_len++;
1930
        mbstr++;
1931
        len--;
1932
        continue;
1933
      }
1934
      if (noError)
1935
        return -1;
1936
      report_invalid_encoding(encoding, mbstr, len);
1937
    }
1938
1939
    l = (*mbverify) ((const unsigned char *) mbstr, len);
1940
1941
    if (l < 0)
1942
    {
1943
      if (noError)
1944
        return -1;
1945
      report_invalid_encoding(encoding, mbstr, len);
1946
    }
1947
1948
    mbstr += l;
1949
    len -= l;
1950
    mb_len++;
1951
  }
1952
  return mb_len;
1953
}
1954
1955
/*
1956
 * check_encoding_conversion_args: check arguments of a conversion function
1957
 *
1958
 * "expected" arguments can be either an encoding ID or -1 to indicate that
1959
 * the caller will check whether it accepts the ID.
1960
 *
1961
 * Note: the errors here are not really user-facing, so elog instead of
1962
 * ereport seems sufficient.  Also, we trust that the "expected" encoding
1963
 * arguments are valid encoding IDs, but we don't trust the actuals.
1964
 */
1965
void
1966
check_encoding_conversion_args(int src_encoding,
1967
                 int dest_encoding,
1968
                 int len,
1969
                 int expected_src_encoding,
1970
                 int expected_dest_encoding)
1971
{
1972
  if (!PG_VALID_ENCODING(src_encoding))
1973
    elog(ERROR, "invalid source encoding ID: %d", src_encoding);
1974
  if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
1975
    elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
1976
       pg_enc2name_tbl[expected_src_encoding].name,
1977
       pg_enc2name_tbl[src_encoding].name);
1978
  if (!PG_VALID_ENCODING(dest_encoding))
1979
    elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
1980
  if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
1981
    elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
1982
       pg_enc2name_tbl[expected_dest_encoding].name,
1983
       pg_enc2name_tbl[dest_encoding].name);
1984
  if (len < 0)
1985
    elog(ERROR, "encoding conversion length must not be negative");
1986
}
1987
1988
/*
1989
 * report_invalid_encoding: complain about invalid multibyte character
1990
 *
1991
 * note: len is remaining length of string, not length of character;
1992
 * len must be greater than zero, as we always examine the first byte.
1993
 */
1994
void
1995
report_invalid_encoding(int encoding, const char *mbstr, int len)
1996
{
1997
  int     l = pg_encoding_mblen(encoding, mbstr);
1998
  char    buf[8 * 5 + 1];
1999
  char     *p = buf;
2000
  int     j,
2001
        jlimit;
2002
2003
  jlimit = Min(l, len);
2004
  jlimit = Min(jlimit, 8);  /* prevent buffer overrun */
2005
2006
  for (j = 0; j < jlimit; j++)
2007
  {
2008
    p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
2009
    if (j < jlimit - 1)
2010
      p += sprintf(p, " ");
2011
  }
2012
2013
  ereport(ERROR,
2014
      (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
2015
       errmsg("invalid byte sequence for encoding \"%s\": %s",
2016
          pg_enc2name_tbl[encoding].name,
2017
          buf)));
2018
}
2019
2020
/*
2021
 * report_untranslatable_char: complain about untranslatable character
2022
 *
2023
 * note: len is remaining length of string, not length of character;
2024
 * len must be greater than zero, as we always examine the first byte.
2025
 */
2026
void
2027
report_untranslatable_char(int src_encoding, int dest_encoding,
2028
               const char *mbstr, int len)
2029
{
2030
  int     l = pg_encoding_mblen(src_encoding, mbstr);
2031
  char    buf[8 * 5 + 1];
2032
  char     *p = buf;
2033
  int     j,
2034
        jlimit;
2035
2036
  jlimit = Min(l, len);
2037
  jlimit = Min(jlimit, 8);  /* prevent buffer overrun */
2038
2039
  for (j = 0; j < jlimit; j++)
2040
  {
2041
    p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
2042
    if (j < jlimit - 1)
2043
      p += sprintf(p, " ");
2044
  }
2045
2046
  ereport(ERROR,
2047
      (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
2048
       errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
2049
          buf,
2050
          pg_enc2name_tbl[src_encoding].name,
2051
          pg_enc2name_tbl[dest_encoding].name)));
2052
}
2053
2054
#endif              /* !FRONTEND */