/Users/deen/code/yugabyte-db/src/postgres/src/backend/utils/mb/conv.c

Source (jump to first uncovered line)
/*-------------------------------------------------------------------------
 *
 *    Utility functions for conversion procs.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/utils/mb/conv.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"
#include "mb/pg_wchar.h"


/*
 * local2local: convert between two single-byte, ASCII-superset encodings
 * by table lookup.
 *
 * l: source string, len bytes long
 * p: output area (the caller must make it large enough); the result is
 *    NUL-terminated
 * src_encoding: PG identifier of the source encoding
 * dest_encoding: PG identifier of the target encoding
 * tab: translation table for source bytes 128 (0x80) and up, indexed by
 *    (byte - HIGHBIT); an entry is the target code point, or 0 when the
 *    byte has no equivalent
 *
 * A zero byte in the input is reported via report_invalid_encoding(), and
 * a byte whose table entry is 0 via report_untranslatable_char().
 */
void
local2local(const unsigned char *l,
      unsigned char *p,
      int len,
      int src_encoding,
      int dest_encoding,
      const unsigned char *tab)
{
  for (; len > 0; l++, len--)
  {
    unsigned char src = *l;
    unsigned char mapped;

    /* a zero byte is never acceptable inside the input */
    if (src == 0)
      report_invalid_encoding(src_encoding, (const char *) l, len);

    if (!IS_HIGHBIT_SET(src))
    {
      /* 7-bit bytes are common to both encodings: copy as-is */
      *p++ = src;
      continue;
    }

    /* high-bit byte: translate through the table */
    mapped = tab[src - HIGHBIT];
    if (mapped == 0)
      report_untranslatable_char(src_encoding, dest_encoding,
                     (const char *) l, len);
    else
      *p++ = mapped;
  }
  *p = '\0';
}

/*
 * LATINn ---> MIC, for charsets whose local codes map directly to MIC
 *
 * l: source string, len bytes long
 * p: output area (the caller must make it large enough); the result is
 *    NUL-terminated
 * lc: mule character set id (leading byte) for the local encoding
 * encoding: PG identifier of the local encoding, used for error reports
 *
 * Each high-bit byte is emitted as the pair (lc, byte); 7-bit bytes are
 * copied unchanged.  A zero byte is reported via report_invalid_encoding().
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
      int lc, int encoding)
{
  for (; len > 0; l++, len--)
  {
    int     ch = *l;

    if (ch == 0)
      report_invalid_encoding(encoding, (const char *) l, len);

    /* prefix non-ASCII bytes with the charset id */
    if (IS_HIGHBIT_SET(ch))
      *p++ = lc;

    *p++ = ch;
  }
  *p = '\0';
}

/*
 * MIC ---> LATINn, for charsets whose local codes map directly to MIC
 *
 * mic: source string, len bytes long
 * p: output area (the caller must make it large enough); the result is
 *    NUL-terminated
 * lc: mule character set id (leading byte) for the local encoding
 * encoding: PG identifier of the local encoding, used for error reports
 *
 * 7-bit bytes are copied unchanged.  A multibyte character is accepted
 * only if it is exactly two bytes, starts with lc, and has the high bit
 * set in its second byte; that second byte is then the output.
 */
void
mic2latin(const unsigned char *mic, unsigned char *p, int len,
      int lc, int encoding)
{
  while (len > 0)
  {
    int     ch = *mic;
    int     mblen;

    if (ch == 0)
      report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);

    if (IS_HIGHBIT_SET(ch))
    {
      mblen = pg_mic_mblen(mic);

      /* character runs past the end of the input */
      if (len < mblen)
        report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
                    len);

      /* anything but a well-formed (lc, high-bit byte) pair is rejected */
      if (!(mblen == 2 && ch == lc && IS_HIGHBIT_SET(mic[1])))
        report_untranslatable_char(PG_MULE_INTERNAL, encoding,
                       (const char *) mic, len);

      *p++ = mic[1];
      mic += 2;
      len -= 2;
    }
    else
    {
      /* plain ASCII */
      *p++ = ch;
      mic++;
      len--;
    }
  }
  *p = '\0';
}


/*
 * ASCII ---> MIC
 *
 * l: source string, len bytes long
 * p: output area (the caller must make it large enough); the result is
 *    NUL-terminated
 *
 * SQL_ASCII normally tolerates high-bit-set bytes, but there is no way to
 * know their MIC equivalent, so they are rejected here along with zero
 * bytes.
 */
void
pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
{
  for (; len > 0; len--)
  {
    int     ch = *l;

    /* only non-zero 7-bit bytes are acceptable */
    if (IS_HIGHBIT_SET(ch) || ch == 0)
      report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);

    *p++ = ch;
    l++;
  }
  *p = '\0';
}

/*
 * MIC ---> ASCII
 *
 * mic: source string, len bytes long
 * p: output area (the caller must make it large enough); the result is
 *    NUL-terminated
 *
 * Zero bytes and high-bit-set bytes are reported via
 * report_untranslatable_char(); everything else is copied unchanged.
 */
void
pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
{
  for (; len > 0; mic++, len--)
  {
    int     ch = *mic;

    /* reject anything that is not a non-zero 7-bit byte */
    if (!(ch != 0 && !IS_HIGHBIT_SET(ch)))
      report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
                     (const char *) mic, len);

    *p++ = ch;
  }
  *p = '\0';
}

/*
 * latin2mic_with_table: table-driven conversion from a single-byte local
 * charset to the mule internal code.
 *
 * l: source string, len bytes long
 * p: output area (the caller must make it large enough); the result is
 *    NUL-terminated
 * lc: mule character set id (leading byte) for the local encoding
 * encoding: PG identifier of the local encoding
 * tab: translation table for local bytes 128 (0x80) and up, indexed by
 *    (byte - HIGHBIT); an entry is the mule code point, or 0 when the
 *    byte has no equivalent
 *
 * Each translatable high-bit byte is emitted as the pair (lc, tab entry);
 * 7-bit bytes are copied unchanged.
 */
void
latin2mic_with_table(const unsigned char *l,
           unsigned char *p,
           int len,
           int lc,
           int encoding,
           const unsigned char *tab)
{
  for (; len > 0; l++, len--)
  {
    unsigned char src = *l;
    unsigned char mapped;

    if (src == 0)
      report_invalid_encoding(encoding, (const char *) l, len);

    if (!IS_HIGHBIT_SET(src))
    {
      /* plain ASCII */
      *p++ = src;
      continue;
    }

    mapped = tab[src - HIGHBIT];
    if (mapped == 0)
      report_untranslatable_char(encoding, PG_MULE_INTERNAL,
                     (const char *) l, len);
    else
    {
      /* charset id first, then the translated code */
      *p++ = lc;
      *p++ = mapped;
    }
  }
  *p = '\0';
}

/*
 * mic2latin_with_table: table-driven conversion from the mule internal
 * code to a single-byte local charset.
 *
 * mic: source string, len bytes long
 * p: output area (the caller must make it large enough); the result is
 *    NUL-terminated
 * lc: mule character set id (leading byte) for the local encoding
 * encoding: PG identifier of the local encoding
 * tab: translation table for the second byte of a mule character, for
 *    values 128 (0x80) and up, indexed by (byte - HIGHBIT); an entry is
 *    the local code point, or 0 when there is no equivalent
 *
 * A multibyte character is accepted only if it is exactly two bytes,
 * starts with lc, has the high bit set in its second byte, and that byte
 * has a non-zero table entry.
 */
void
mic2latin_with_table(const unsigned char *mic,
           unsigned char *p,
           int len,
           int lc,
           int encoding,
           const unsigned char *tab)
{
  unsigned char lead,
        mapped;

  while (len > 0)
  {
    lead = *mic;
    if (lead == 0)
      report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);

    if (IS_HIGHBIT_SET(lead))
    {
      int     mblen = pg_mic_mblen(mic);

      /* character runs past the end of the input */
      if (len < mblen)
        report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
                    len);

      /* the table is consulted only after the structural checks pass */
      if (mblen == 2 && lead == lc && IS_HIGHBIT_SET(mic[1]) &&
        (mapped = tab[mic[1] - HIGHBIT]) != 0)
      {
        *p++ = mapped;
        mic += 2;
        len -= 2;
      }
      else
      {
        report_untranslatable_char(PG_MULE_INTERNAL, encoding,
                       (const char *) mic, len);
        break;     /* keep compiler quiet */
      }
    }
    else
    {
      /* plain ASCII */
      *p++ = lead;
      mic++;
      len--;
    }
  }
  *p = '\0';
}

/*
 * comparison routine for bsearch()
 * this routine is intended for combined UTF8 -> local code
 */
static int
compare3(const void *p1, const void *p2)
{
  uint32    s1,
        s2,
        d1,
        d2;

  s1 = *(const uint32 *) p1;
  s2 = *((const uint32 *) p1 + 1);
  d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
  d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
  return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
}

/*
 * bsearch() comparator for the local code -> combined UTF8 map.
 *
 * p1 points to the search key, a single uint32 local code; p2 points to a
 * pg_local_to_utf_combined entry, which is compared by its "code" field.
 */
static int
compare4(const void *p1, const void *p2)
{
  const uint32 key = *(const uint32 *) p1;
  const uint32 entry_code = ((const pg_local_to_utf_combined *) p2)->code;

  if (key > entry_code)
    return 1;
  if (key < entry_code)
    return -1;
  return 0;
}

/*
 * Write a 32-bit character code to a multibyte stream, most significant
 * byte first.
 *
 * Bytes of the code that are zero are not written at all, so a code of 0
 * stores nothing.  Returns the position just past the last byte written.
 */
static inline unsigned char *
store_coded_char(unsigned char *dest, uint32 code)
{
  int     shift;

  for (shift = 24; shift >= 0; shift -= 8)
  {
    unsigned char byte = (unsigned char) (code >> shift);

    /* zero bytes are skipped rather than stored */
    if (byte != 0)
      *dest++ = byte;
  }
  return dest;
}

/*
 * Look up one character in a conversion radix tree.
 *
 * 'l' is the character's length in bytes.  The bytes are passed
 * right-aligned in b1-b4: the last byte is always b4, so a 2-byte
 * character uses b3 and b4, and a 3-byte character uses b2 through b4.
 *
 * Returns the value found in the tree.  Returns 0 if any byte lies outside
 * the range the tree covers for its position, or if 'l' is not 1 to 4.
 */
static inline uint32
pg_mb_radix_conv(const pg_mb_radix_tree *rt,
         int l,
         unsigned char b1,
         unsigned char b2,
         unsigned char b3,
         unsigned char b4)
{
  switch (l)
  {
    case 4:
      /* 4-byte code: every byte must be inside its covered range */
      if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
        b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
        b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
        b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
        return 0;

      /* descend one level per byte; the last level holds the result */
      if (rt->chars32)
      {
        uint32    node = rt->b4root;

        node = rt->chars32[b1 + node - rt->b4_1_lower];
        node = rt->chars32[b2 + node - rt->b4_2_lower];
        node = rt->chars32[b3 + node - rt->b4_3_lower];
        return rt->chars32[b4 + node - rt->b4_4_lower];
      }
      else
      {
        uint16    node = rt->b4root;

        node = rt->chars16[b1 + node - rt->b4_1_lower];
        node = rt->chars16[b2 + node - rt->b4_2_lower];
        node = rt->chars16[b3 + node - rt->b4_3_lower];
        return rt->chars16[b4 + node - rt->b4_4_lower];
      }

    case 3:
      /* 3-byte code, carried in b2..b4 */
      if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
        b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
        b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
        return 0;

      if (rt->chars32)
      {
        uint32    node = rt->b3root;

        node = rt->chars32[b2 + node - rt->b3_1_lower];
        node = rt->chars32[b3 + node - rt->b3_2_lower];
        return rt->chars32[b4 + node - rt->b3_3_lower];
      }
      else
      {
        uint16    node = rt->b3root;

        node = rt->chars16[b2 + node - rt->b3_1_lower];
        node = rt->chars16[b3 + node - rt->b3_2_lower];
        return rt->chars16[b4 + node - rt->b3_3_lower];
      }

    case 2:
      /* 2-byte code, carried in b3 and b4 */
      if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
        b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
        return 0;

      if (rt->chars32)
      {
        uint32    node = rt->b2root;

        node = rt->chars32[b3 + node - rt->b2_1_lower];
        return rt->chars32[b4 + node - rt->b2_2_lower];
      }
      else
      {
        uint16    node = rt->b2root;

        node = rt->chars16[b3 + node - rt->b2_1_lower];
        return rt->chars16[b4 + node - rt->b2_2_lower];
      }

    case 1:
      /* 1-byte code, carried in b4 */
      if (b4 < rt->b1_lower || b4 > rt->b1_upper)
        return 0;

      if (rt->chars32)
        return rt->chars32[b4 + rt->b1root - rt->b1_lower];
      else
        return rt->chars16[b4 + rt->b1root - rt->b1_lower];

    default:
      break;
  }
  return 0;         /* shouldn't happen */
}

/*
 * UTF8 ---> local code
 *
 * utf: input string in UTF8 encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * iso: pointer to the output area (must be large enough!)
 *      (output string will be null-terminated)
 * map: conversion map for single characters
 * cmap: conversion map for combined characters
 *      (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *      (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *      (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the cmap (if provided) is consulted first; if no match,
 * the map is consulted next; if still no match, the conv_func (if provided)
 * is applied.  An error is raised if no match is found.
 *
 * Invalid input (a zero byte, a character truncated by the end of the
 * input, or a sequence rejected by pg_utf8_islegal) is reported via
 * report_invalid_encoding(); a character with no translation is reported
 * via report_untranslatable_char().
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
UtfToLocal(const unsigned char *utf, int len,
       unsigned char *iso,
       const pg_mb_radix_tree *map,
       const pg_utf_to_local_combined *cmap, int cmapsize,
       utf_local_conversion_func conv_func,
       int encoding)
{
  uint32    iutf;
  int     l;
  const pg_utf_to_local_combined *cp;

  if (!PG_VALID_ENCODING(encoding))
    ereport(ERROR,
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
         errmsg("invalid encoding number: %d", encoding)));

  /*
   * Every "continue" below runs the loop's "len -= l" step, so at that
   * point l must be the byte length of the character most recently
   * consumed.
   */
  for (; len > 0; len -= l)
  {
    /* bytes of the current character, right-aligned: last byte in b4 */
    unsigned char b1 = 0;
    unsigned char b2 = 0;
    unsigned char b3 = 0;
    unsigned char b4 = 0;

    /* "break" cases all represent errors */
    if (*utf == '\0')
      break;

    l = pg_utf_mblen(utf);
    if (len < l)
      break;

    if (!pg_utf8_islegal(utf, l))
      break;

    if (l == 1)
    {
      /* ASCII case is easy, assume it's one-to-one conversion */
      *iso++ = *utf++;
      continue;
    }

    /* collect coded char of length l */
    if (l == 2)
    {
      b3 = *utf++;
      b4 = *utf++;
    }
    else if (l == 3)
    {
      b2 = *utf++;
      b3 = *utf++;
      b4 = *utf++;
    }
    else if (l == 4)
    {
      b1 = *utf++;
      b2 = *utf++;
      b3 = *utf++;
      b4 = *utf++;
    }
    else
    {
      elog(ERROR, "unsupported character length %d", l);
      iutf = 0;     /* keep compiler quiet */
    }

    /*
     * Widen each byte to uint32 before shifting.  An unsigned char is
     * promoted to (signed) int, and shifting a value >= 0x80 left by 24
     * bits would overflow int, which is undefined behavior.
     */
    iutf = ((uint32) b1 << 24) | ((uint32) b2 << 16) |
      ((uint32) b3 << 8) | (uint32) b4;

    /* First, try with combined map if possible */
    if (cmap && len > l)
    {
      /* remember where we are so a failed lookup can be undone */
      const unsigned char *utf_save = utf;
      int     len_save = len;
      int     l_save = l;

      /* collect next character, same as above */
      len -= l;

      /*
       * A "break" here leaves utf at the second character and len already
       * reduced, so the invalid-encoding report after the loop refers to
       * that second character.
       */
      l = pg_utf_mblen(utf);
      if (len < l)
        break;

      if (!pg_utf8_islegal(utf, l))
        break;

      /* We assume ASCII character cannot be in combined map */
      if (l > 1)
      {
        uint32    iutf2;
        uint32    cutf[2];

        /* widen before shifting, for the same reason as above */
        if (l == 2)
        {
          iutf2 = (uint32) *utf++ << 8;
          iutf2 |= (uint32) *utf++;
        }
        else if (l == 3)
        {
          iutf2 = (uint32) *utf++ << 16;
          iutf2 |= (uint32) *utf++ << 8;
          iutf2 |= (uint32) *utf++;
        }
        else if (l == 4)
        {
          iutf2 = (uint32) *utf++ << 24;
          iutf2 |= (uint32) *utf++ << 16;
          iutf2 |= (uint32) *utf++ << 8;
          iutf2 |= (uint32) *utf++;
        }
        else
        {
          elog(ERROR, "unsupported character length %d", l);
          iutf2 = 0;  /* keep compiler quiet */
        }

        /* search key: first character, then second (see compare3) */
        cutf[0] = iutf;
        cutf[1] = iutf2;

        cp = bsearch(cutf, cmap, cmapsize,
               sizeof(pg_utf_to_local_combined), compare3);

        if (cp)
        {
          /*
           * Both characters are consumed: the first was subtracted
           * from len above, and the loop step subtracts the second.
           */
          iso = store_coded_char(iso, cp->code);
          continue;
        }
      }

      /* fail, so back up to reprocess second character next time */
      utf = utf_save;
      len = len_save;
      l = l_save;
    }

    /* Now check ordinary map */
    if (map)
    {
      uint32    converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

      if (converted)
      {
        iso = store_coded_char(iso, converted);
        continue;
      }
    }

    /* if there's a conversion function, try that */
    if (conv_func)
    {
      uint32    converted = (*conv_func) (iutf);

      if (converted)
      {
        iso = store_coded_char(iso, converted);
        continue;
      }
    }

    /*
     * failed to translate this character; utf was already advanced past
     * it, so step back l bytes to point at its start
     */
    report_untranslatable_char(PG_UTF8, encoding,
                   (const char *) (utf - l), len);
  }

  /* if we broke out of loop early, must be invalid input */
  if (len > 0)
    report_invalid_encoding(PG_UTF8, (const char *) utf, len);

  *iso = '\0';
}

/*
 * local code ---> UTF8
 *
 * iso: input string in local encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * utf: pointer to the output area (must be large enough!)
 *      (output string will be null-terminated)
 * map: conversion map for single characters
 * cmap: conversion map for combined characters
 *      (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *      (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *      (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the map is consulted first; if no match, the cmap
 * (if provided) is consulted next; if still no match, the conv_func
 * (if provided) is applied.  An error is raised if no match is found.
 * Note that the cmap is only consulted when a map was supplied.
 *
 * Invalid input (a zero byte, or a character rejected by
 * pg_encoding_verifymb) is reported via report_invalid_encoding(); a
 * character with no translation is reported via
 * report_untranslatable_char().
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
LocalToUtf(const unsigned char *iso, int len,
       unsigned char *utf,
       const pg_mb_radix_tree *map,
       const pg_local_to_utf_combined *cmap, int cmapsize,
       utf_local_conversion_func conv_func,
       int encoding)
{
  uint32    code;
  int     l;
  const pg_local_to_utf_combined *pair;

  if (!PG_VALID_ENCODING(encoding))
    ereport(ERROR,
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
         errmsg("invalid encoding number: %d", encoding)));

  /*
   * Every "continue" below runs the loop's "len -= l" step, so l must be
   * the byte length of the character just consumed.
   */
  for (; len > 0; len -= l)
  {
    /* bytes of the current character, right-aligned: last byte in b4 */
    unsigned char b1 = 0;
    unsigned char b2 = 0;
    unsigned char b3 = 0;
    unsigned char b4 = 0;

    /* "break" cases all represent errors */
    if (*iso == '\0')
      break;

    if (!IS_HIGHBIT_SET(*iso))
    {
      /* ASCII case is easy, assume it's one-to-one conversion */
      *utf++ = *iso++;
      l = 1;
      continue;
    }

    l = pg_encoding_verifymb(encoding, (const char *) iso, len);
    if (l < 0)
      break;

    /* collect coded char of length l */
    switch (l)
    {
      case 1:
        b4 = *iso++;
        break;
      case 2:
        b3 = *iso++;
        b4 = *iso++;
        break;
      case 3:
        b2 = *iso++;
        b3 = *iso++;
        b4 = *iso++;
        break;
      case 4:
        b1 = *iso++;
        b2 = *iso++;
        b3 = *iso++;
        b4 = *iso++;
        break;
      default:
        elog(ERROR, "unsupported character length %d", l);
        code = 0;   /* keep compiler quiet */
        break;
    }

    /*
     * Widen each byte to uint32 before shifting.  An unsigned char is
     * promoted to (signed) int, and shifting a value >= 0x80 left by 24
     * bits would overflow int, which is undefined behavior.
     */
    code = ((uint32) b1 << 24) | ((uint32) b2 << 16) |
      ((uint32) b3 << 8) | (uint32) b4;

    if (map)
    {
      uint32    converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

      if (converted)
      {
        utf = store_coded_char(utf, converted);
        continue;
      }

      /* If there's a combined character map, try that */
      if (cmap)
      {
        pair = bsearch(&code, cmap, cmapsize,
                 sizeof(pg_local_to_utf_combined), compare4);

        if (pair)
        {
          /* one local character becomes two UTF8 characters */
          utf = store_coded_char(utf, pair->utf1);
          utf = store_coded_char(utf, pair->utf2);
          continue;
        }
      }
    }

    /* if there's a conversion function, try that */
    if (conv_func)
    {
      uint32    converted = (*conv_func) (code);

      if (converted)
      {
        utf = store_coded_char(utf, converted);
        continue;
      }
    }

    /*
     * failed to translate this character; iso was already advanced past
     * it, so step back l bytes to point at its start
     */
    report_untranslatable_char(encoding, PG_UTF8,
                   (const char *) (iso - l), len);
  }

  /* if we broke out of loop early, must be invalid input */
  if (len > 0)
    report_invalid_encoding(encoding, (const char *) iso, len);

  *utf = '\0';
}

YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

Line	Count	Source (jump to first uncovered line)
1		/*-------------------------------------------------------------------------
2		*
3		* Utility functions for conversion procs.
4		*
5		* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
6		* Portions Copyright (c) 1994, Regents of the University of California
7		*
8		* IDENTIFICATION
9		* src/backend/utils/mb/conv.c
10		*
11		*-------------------------------------------------------------------------
12		*/
13		#include "postgres.h"
14		#include "mb/pg_wchar.h"
15
16
17		/*
18		* local2local: a generic single byte charset encoding
19		* conversion between two ASCII-superset encodings.
20		*
21		* l points to the source string of length len
22		* p is the output area (must be large enough!)
23		* src_encoding is the PG identifier for the source encoding
24		* dest_encoding is the PG identifier for the target encoding
25		* tab holds conversion entries for the source charset
26		* starting from 128 (0x80). each entry in the table holds the corresponding
27		* code point for the target charset, or 0 if there is no equivalent code.
28		*/
29		void
30		local2local(const unsigned char *l,
31		unsigned char *p,
32		int len,
33		int src_encoding,
34		int dest_encoding,
35		const unsigned char *tab)
36	0	{
37	0	unsigned char c1,
38	0	c2;
39
40	0	while (len > 0)
41	0	{
42	0	c1 = *l;
43	0	if (c1 == 0)
44	0	report_invalid_encoding(src_encoding, (const char *) l, len);
45	0	if (!IS_HIGHBIT_SET(c1))
46	0	*p++ = c1;
47	0	else
48	0	{
49	0	c2 = tab[c1 - HIGHBIT];
50	0	if (c2)
51	0	*p++ = c2;
52	0	else
53	0	report_untranslatable_char(src_encoding, dest_encoding,
54	0	(const char *) l, len);
55	0	}
56	0	l++;
57	0	len--;
58	0	}
59	0	*p = '\0';
60	0	}
61
62		/*
63		* LATINn ---> MIC when the charset's local codes map directly to MIC
64		*
65		* l points to the source string of length len
66		* p is the output area (must be large enough!)
67		* lc is the mule character set id for the local encoding
68		* encoding is the PG identifier for the local encoding
69		*/
70		void
71		latin2mic(const unsigned char l, unsigned char p, int len,
72		int lc, int encoding)
73	0	{
74	0	int c1;
75
76	0	while (len > 0)
77	0	{
78	0	c1 = *l;
79	0	if (c1 == 0)
80	0	report_invalid_encoding(encoding, (const char *) l, len);
81	0	if (IS_HIGHBIT_SET(c1))
82	0	*p++ = lc;
83	0	*p++ = c1;
84	0	l++;
85	0	len--;
86	0	}
87	0	*p = '\0';
88	0	}
89
90		/*
91		* MIC ---> LATINn when the charset's local codes map directly to MIC
92		*
93		* mic points to the source string of length len
94		* p is the output area (must be large enough!)
95		* lc is the mule character set id for the local encoding
96		* encoding is the PG identifier for the local encoding
97		*/
98		void
99		mic2latin(const unsigned char mic, unsigned char p, int len,
100		int lc, int encoding)
101	0	{
102	0	int c1;
103
104	0	while (len > 0)
105	0	{
106	0	c1 = *mic;
107	0	if (c1 == 0)
108	0	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
109	0	if (!IS_HIGHBIT_SET(c1))
110	0	{
111		/* easy for ASCII */
112	0	*p++ = c1;
113	0	mic++;
114	0	len--;
115	0	}
116	0	else
117	0	{
118	0	int l = pg_mic_mblen(mic);
119
120	0	if (len < l)
121	0	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
122	0	len);
123	0	if (l != 2 \|\| c1 != lc \|\| !IS_HIGHBIT_SET(mic[1]))
124	0	report_untranslatable_char(PG_MULE_INTERNAL, encoding,
125	0	(const char *) mic, len);
126	0	*p++ = mic[1];
127	0	mic += 2;
128	0	len -= 2;
129	0	}
130	0	}
131	0	*p = '\0';
132	0	}
133
134
135		/*
136		* ASCII ---> MIC
137		*
138		* While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
139		* characters, here we must take a hard line because we don't know
140		* the appropriate MIC equivalent.
141		*/
142		void
143		pg_ascii2mic(const unsigned char l, unsigned char p, int len)
144	0	{
145	0	int c1;
146
147	0	while (len > 0)
148	0	{
149	0	c1 = *l;
150	0	if (c1 == 0 \|\| IS_HIGHBIT_SET(c1))
151	0	report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
152	0	*p++ = c1;
153	0	l++;
154	0	len--;
155	0	}
156	0	*p = '\0';
157	0	}
158
159		/*
160		* MIC ---> ASCII
161		*/
162		void
163		pg_mic2ascii(const unsigned char mic, unsigned char p, int len)
164	0	{
165	0	int c1;
166
167	0	while (len > 0)
168	0	{
169	0	c1 = *mic;
170	0	if (c1 == 0 \|\| IS_HIGHBIT_SET(c1))
171	0	report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
172	0	(const char *) mic, len);
173	0	*p++ = c1;
174	0	mic++;
175	0	len--;
176	0	}
177	0	*p = '\0';
178	0	}
179
180		/*
181		* latin2mic_with_table: a generic single byte charset encoding
182		* conversion from a local charset to the mule internal code.
183		*
184		* l points to the source string of length len
185		* p is the output area (must be large enough!)
186		* lc is the mule character set id for the local encoding
187		* encoding is the PG identifier for the local encoding
188		* tab holds conversion entries for the local charset
189		* starting from 128 (0x80). each entry in the table holds the corresponding
190		* code point for the mule encoding, or 0 if there is no equivalent code.
191		*/
192		void
193		latin2mic_with_table(const unsigned char *l,
194		unsigned char *p,
195		int len,
196		int lc,
197		int encoding,
198		const unsigned char *tab)
199	0	{
200	0	unsigned char c1,
201	0	c2;
202
203	0	while (len > 0)
204	0	{
205	0	c1 = *l;
206	0	if (c1 == 0)
207	0	report_invalid_encoding(encoding, (const char *) l, len);
208	0	if (!IS_HIGHBIT_SET(c1))
209	0	*p++ = c1;
210	0	else
211	0	{
212	0	c2 = tab[c1 - HIGHBIT];
213	0	if (c2)
214	0	{
215	0	*p++ = lc;
216	0	*p++ = c2;
217	0	}
218	0	else
219	0	report_untranslatable_char(encoding, PG_MULE_INTERNAL,
220	0	(const char *) l, len);
221	0	}
222	0	l++;
223	0	len--;
224	0	}
225	0	*p = '\0';
226	0	}
227
228		/*
229		* mic2latin_with_table: a generic single byte charset encoding
230		* conversion from the mule internal code to a local charset.
231		*
232		* mic points to the source string of length len
233		* p is the output area (must be large enough!)
234		* lc is the mule character set id for the local encoding
235		* encoding is the PG identifier for the local encoding
236		* tab holds conversion entries for the mule internal code's second byte,
237		* starting from 128 (0x80). each entry in the table holds the corresponding
238		* code point for the local charset, or 0 if there is no equivalent code.
239		*/
240		void
241		mic2latin_with_table(const unsigned char *mic,
242		unsigned char *p,
243		int len,
244		int lc,
245		int encoding,
246		const unsigned char *tab)
247	0	{
248	0	unsigned char c1,
249	0	c2;
250
251	0	while (len > 0)
252	0	{
253	0	c1 = *mic;
254	0	if (c1 == 0)
255	0	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
256	0	if (!IS_HIGHBIT_SET(c1))
257	0	{
258		/* easy for ASCII */
259	0	*p++ = c1;
260	0	mic++;
261	0	len--;
262	0	}
263	0	else
264	0	{
265	0	int l = pg_mic_mblen(mic);
266
267	0	if (len < l)
268	0	report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
269	0	len);
270	0	if (l != 2 \|\| c1 != lc \|\| !IS_HIGHBIT_SET(mic[1]) \|\|
271	0	(c2 = tab[mic[1] - HIGHBIT]) == 0)
272	0	{
273	0	report_untranslatable_char(PG_MULE_INTERNAL, encoding,
274	0	(const char *) mic, len);
275	0	break; /* keep compiler quiet */
276	0	}
277	0	*p++ = c2;
278	0	mic += 2;
279	0	len -= 2;
280	0	}
281	0	}
282	0	*p = '\0';
283	0	}
284
285		/*
286		* comparison routine for bsearch()
287		* this routine is intended for combined UTF8 -> local code
288		*/
289		static int
290		compare3(const void p1, const void p2)
291	0	{
292	0	uint32 s1,
293	0	s2,
294	0	d1,
295	0	d2;
296
297	0	s1 = (const uint32 ) p1;
298	0	s2 = ((const uint32 ) p1 + 1);
299	0	d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
300	0	d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
301	0	return (s1 > d1 \|\| (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
302	0	}
303
304		/*
305		* comparison routine for bsearch()
306		* this routine is intended for local code -> combined UTF8
307		*/
308		static int
309		compare4(const void p1, const void p2)
310	0	{
311	0	uint32 v1,
312	0	v2;
313
314	0	v1 = (const uint32 ) p1;
315	0	v2 = ((const pg_local_to_utf_combined *) p2)->code;
316	0	return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
317	0	}
318
319		/*
320		* store 32bit character representation into multibyte stream
321		*/
322		static inline unsigned char *
323		store_coded_char(unsigned char *dest, uint32 code)
324	0	{
325	0	if (code & 0xff000000)
326	0	*dest++ = code >> 24;
327	0	if (code & 0x00ff0000)
328	0	*dest++ = code >> 16;
329	0	if (code & 0x0000ff00)
330	0	*dest++ = code >> 8;
331	0	if (code & 0x000000ff)
332	0	*dest++ = code;
333	0	return dest;
334	0	}
335
336		/*
337		* Convert a character using a conversion radix tree.
338		*
339		* 'l' is the length of the input character in bytes, and b1-b4 are
340		* the input character's bytes.
341		*/
342		static inline uint32
343		pg_mb_radix_conv(const pg_mb_radix_tree *rt,
344		int l,
345		unsigned char b1,
346		unsigned char b2,
347		unsigned char b3,
348		unsigned char b4)
349	0	{
350	0	if (l == 4)
351	0	{
352		/* 4-byte code */
353
354		/* check code validity */
355	0	if (b1 < rt->b4_1_lower \|\| b1 > rt->b4_1_upper \|\|
356	0	b2 < rt->b4_2_lower \|\| b2 > rt->b4_2_upper \|\|
357	0	b3 < rt->b4_3_lower \|\| b3 > rt->b4_3_upper \|\|
358	0	b4 < rt->b4_4_lower \|\| b4 > rt->b4_4_upper)
359	0	return 0;
360
361		/* perform lookup */
362	0	if (rt->chars32)
363	0	{
364	0	uint32 idx = rt->b4root;
365
366	0	idx = rt->chars32[b1 + idx - rt->b4_1_lower];
367	0	idx = rt->chars32[b2 + idx - rt->b4_2_lower];
368	0	idx = rt->chars32[b3 + idx - rt->b4_3_lower];
369	0	return rt->chars32[b4 + idx - rt->b4_4_lower];
370	0	}
371	0	else
372	0	{
373	0	uint16 idx = rt->b4root;
374
375	0	idx = rt->chars16[b1 + idx - rt->b4_1_lower];
376	0	idx = rt->chars16[b2 + idx - rt->b4_2_lower];
377	0	idx = rt->chars16[b3 + idx - rt->b4_3_lower];
378	0	return rt->chars16[b4 + idx - rt->b4_4_lower];
379	0	}
380	0	}
381	0	else if (l == 3)
382	0	{
383		/* 3-byte code */
384
385		/* check code validity */
386	0	if (b2 < rt->b3_1_lower \|\| b2 > rt->b3_1_upper \|\|
387	0	b3 < rt->b3_2_lower \|\| b3 > rt->b3_2_upper \|\|
388	0	b4 < rt->b3_3_lower \|\| b4 > rt->b3_3_upper)
389	0	return 0;
390
391		/* perform lookup */
392	0	if (rt->chars32)
393	0	{
394	0	uint32 idx = rt->b3root;
395
396	0	idx = rt->chars32[b2 + idx - rt->b3_1_lower];
397	0	idx = rt->chars32[b3 + idx - rt->b3_2_lower];
398	0	return rt->chars32[b4 + idx - rt->b3_3_lower];
399	0	}
400	0	else
401	0	{
402	0	uint16 idx = rt->b3root;
403
404	0	idx = rt->chars16[b2 + idx - rt->b3_1_lower];
405	0	idx = rt->chars16[b3 + idx - rt->b3_2_lower];
406	0	return rt->chars16[b4 + idx - rt->b3_3_lower];
407	0	}
408	0	}
409	0	else if (l == 2)
410	0	{
411		/* 2-byte code */
412
413		/* check code validity - first byte */
414	0	if (b3 < rt->b2_1_lower \|\| b3 > rt->b2_1_upper \|\|
415	0	b4 < rt->b2_2_lower \|\| b4 > rt->b2_2_upper)
416	0	return 0;
417
418		/* perform lookup */
419	0	if (rt->chars32)
420	0	{
421	0	uint32 idx = rt->b2root;
422
423	0	idx = rt->chars32[b3 + idx - rt->b2_1_lower];
424	0	return rt->chars32[b4 + idx - rt->b2_2_lower];
425	0	}
426	0	else
427	0	{
428	0	uint16 idx = rt->b2root;
429
430	0	idx = rt->chars16[b3 + idx - rt->b2_1_lower];
431	0	return rt->chars16[b4 + idx - rt->b2_2_lower];
432	0	}
433	0	}
434	0	else if (l == 1)
435	0	{
436		/* 1-byte code */
437
438		/* check code validity - first byte */
439	0	if (b4 < rt->b1_lower \|\| b4 > rt->b1_upper)
440	0	return 0;
441
442		/* perform lookup */
443	0	if (rt->chars32)
444	0	return rt->chars32[b4 + rt->b1root - rt->b1_lower];
445	0	else
446	0	return rt->chars16[b4 + rt->b1root - rt->b1_lower];
447	0	}
448	0	return 0; /* shouldn't happen */
449	0	}
450
451		/*
452		* UTF8 ---> local code
453		*
454		* utf: input string in UTF8 encoding (need not be null-terminated)
455		* len: length of input string (in bytes)
456		* iso: pointer to the output area (must be large enough!)
457		(output string will be null-terminated)
458		* map: conversion map for single characters
459		* cmap: conversion map for combined characters
460		* (optional, pass NULL if none)
461		* cmapsize: number of entries in the conversion map for combined characters
462		* (optional, pass 0 if none)
463		* conv_func: algorithmic encoding conversion function
464		* (optional, pass NULL if none)
465		* encoding: PG identifier for the local encoding
466		*
467		* For each character, the cmap (if provided) is consulted first; if no match,
468		* the map is consulted next; if still no match, the conv_func (if provided)
469		* is applied. An error is raised if no match is found.
470		*
471		* See pg_wchar.h for more details about the data structures used here.
472		*/
473		void
474		UtfToLocal(const unsigned char *utf, int len,
475		unsigned char *iso,
476		const pg_mb_radix_tree *map,
477		const pg_utf_to_local_combined *cmap, int cmapsize,
478		utf_local_conversion_func conv_func,
479		int encoding)
480	0	{
481	0	uint32 iutf;
482	0	int l;
483	0	const pg_utf_to_local_combined *cp;
484
485	0	if (!PG_VALID_ENCODING(encoding))
486	0	ereport(ERROR,
487	0	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
488	0	errmsg("invalid encoding number: %d", encoding)));
489
490	0	for (; len > 0; len -= l)
491	0	{
492	0	unsigned char b1 = 0;
493	0	unsigned char b2 = 0;
494	0	unsigned char b3 = 0;
495	0	unsigned char b4 = 0;
496
497		/* "break" cases all represent errors */
498	0	if (*utf == '\0')
499	0	break;
500
501	0	l = pg_utf_mblen(utf);
502	0	if (len < l)
503	0	break;
504
505	0	if (!pg_utf8_islegal(utf, l))
506	0	break;
507
508	0	if (l == 1)
509	0	{
510		/* ASCII case is easy, assume it's one-to-one conversion */
511	0	iso++ = utf++;
512	0	continue;
513	0	}
514
515		/* collect coded char of length l */
516	0	if (l == 2)
517	0	{
518	0	b3 = *utf++;
519	0	b4 = *utf++;
520	0	}
521	0	else if (l == 3)
522	0	{
523	0	b2 = *utf++;
524	0	b3 = *utf++;
525	0	b4 = *utf++;
526	0	}
527	0	else if (l == 4)
528	0	{
529	0	b1 = *utf++;
530	0	b2 = *utf++;
531	0	b3 = *utf++;
532	0	b4 = *utf++;
533	0	}
534	0	else
535	0	{
536	0	elog(ERROR, "unsupported character length %d", l);
537	0	iutf = 0; /* keep compiler quiet */
538	0	}
539	0	iutf = (b1 << 24 \| b2 << 16 \| b3 << 8 \| b4);
540
541		/* First, try with combined map if possible */
542	0	if (cmap && len > l)
543	0	{
544	0	const unsigned char *utf_save = utf;
545	0	int len_save = len;
546	0	int l_save = l;
547
548		/* collect next character, same as above */
549	0	len -= l;
550
551	0	l = pg_utf_mblen(utf);
552	0	if (len < l)
553	0	break;
554
555	0	if (!pg_utf8_islegal(utf, l))
556	0	break;
557
558		/* We assume ASCII character cannot be in combined map */
559	0	if (l > 1)
560	0	{
561	0	uint32 iutf2;
562	0	uint32 cutf[2];
563
564	0	if (l == 2)
565	0	{
566	0	iutf2 = *utf++ << 8;
567	0	iutf2 \|= *utf++;
568	0	}
569	0	else if (l == 3)
570	0	{
571	0	iutf2 = *utf++ << 16;
572	0	iutf2 \|= *utf++ << 8;
573	0	iutf2 \|= *utf++;
574	0	}
575	0	else if (l == 4)
576	0	{
577	0	iutf2 = *utf++ << 24;
578	0	iutf2 \|= *utf++ << 16;
579	0	iutf2 \|= *utf++ << 8;
580	0	iutf2 \|= *utf++;
581	0	}
582	0	else
583	0	{
584	0	elog(ERROR, "unsupported character length %d", l);
585	0	iutf2 = 0; /* keep compiler quiet */
586	0	}
587
588	0	cutf[0] = iutf;
589	0	cutf[1] = iutf2;
590
591	0	cp = bsearch(cutf, cmap, cmapsize,
592	0	sizeof(pg_utf_to_local_combined), compare3);
593
594	0	if (cp)
595	0	{
596	0	iso = store_coded_char(iso, cp->code);
597	0	continue;
598	0	}
599	0	}
600
601		/* fail, so back up to reprocess second character next time */
602	0	utf = utf_save;
603	0	len = len_save;
604	0	l = l_save;
605	0	}
606
607		/* Now check ordinary map */
608	0	if (map)
609	0	{
610	0	uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
611
612	0	if (converted)
613	0	{
614	0	iso = store_coded_char(iso, converted);
615	0	continue;
616	0	}
617	0	}
618
619		/* if there's a conversion function, try that */
620	0	if (conv_func)
621	0	{
622	0	uint32 converted = (*conv_func) (iutf);
623
624	0	if (converted)
625	0	{
626	0	iso = store_coded_char(iso, converted);
627	0	continue;
628	0	}
629	0	}
630
631		/* failed to translate this character */
632	0	report_untranslatable_char(PG_UTF8, encoding,
633	0	(const char *) (utf - l), len);
634	0	}
635
636		/* if we broke out of loop early, must be invalid input */
637	0	if (len > 0)
638	0	report_invalid_encoding(PG_UTF8, (const char *) utf, len);
639
640	0	*iso = '\0';
641	0	}
642
643		/*
644		* local code ---> UTF8
645		*
646		* iso: input string in local encoding (need not be null-terminated)
647		* len: length of input string (in bytes)
648		* utf: pointer to the output area (must be large enough!)
649		(output string will be null-terminated)
650		* map: conversion map for single characters
651		* cmap: conversion map for combined characters
652		* (optional, pass NULL if none)
653		* cmapsize: number of entries in the conversion map for combined characters
654		* (optional, pass 0 if none)
655		* conv_func: algorithmic encoding conversion function
656		* (optional, pass NULL if none)
657		* encoding: PG identifier for the local encoding
658		*
659		* For each character, the map is consulted first; if no match, the cmap
660		* (if provided) is consulted next; if still no match, the conv_func
661		* (if provided) is applied. An error is raised if no match is found.
662		*
663		* See pg_wchar.h for more details about the data structures used here.
664		*/
665		void
666		LocalToUtf(const unsigned char *iso, int len,
667		unsigned char *utf,
668		const pg_mb_radix_tree *map,
669		const pg_local_to_utf_combined *cmap, int cmapsize,
670		utf_local_conversion_func conv_func,
671		int encoding)
672	0	{
673	0	uint32 iiso;
674	0	int l;
675	0	const pg_local_to_utf_combined *cp;
676
677	0	if (!PG_VALID_ENCODING(encoding))
678	0	ereport(ERROR,
679	0	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
680	0	errmsg("invalid encoding number: %d", encoding)));
681
682	0	for (; len > 0; len -= l)
683	0	{
684	0	unsigned char b1 = 0;
685	0	unsigned char b2 = 0;
686	0	unsigned char b3 = 0;
687	0	unsigned char b4 = 0;
688
689		/* "break" cases all represent errors */
690	0	if (*iso == '\0')
691	0	break;
692
693	0	if (!IS_HIGHBIT_SET(*iso))
694	0	{
695		/* ASCII case is easy, assume it's one-to-one conversion */
696	0	utf++ = iso++;
697	0	l = 1;
698	0	continue;
699	0	}
700
701	0	l = pg_encoding_verifymb(encoding, (const char *) iso, len);
702	0	if (l < 0)
703	0	break;
704
705		/* collect coded char of length l */
706	0	if (l == 1)
707	0	b4 = *iso++;
708	0	else if (l == 2)
709	0	{
710	0	b3 = *iso++;
711	0	b4 = *iso++;
712	0	}
713	0	else if (l == 3)
714	0	{
715	0	b2 = *iso++;
716	0	b3 = *iso++;
717	0	b4 = *iso++;
718	0	}
719	0	else if (l == 4)
720	0	{
721	0	b1 = *iso++;
722	0	b2 = *iso++;
723	0	b3 = *iso++;
724	0	b4 = *iso++;
725	0	}
726	0	else
727	0	{
728	0	elog(ERROR, "unsupported character length %d", l);
729	0	iiso = 0; /* keep compiler quiet */
730	0	}
731	0	iiso = (b1 << 24 \| b2 << 16 \| b3 << 8 \| b4);
732
733	0	if (map)
734	0	{
735	0	uint32 converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
736
737	0	if (converted)
738	0	{
739	0	utf = store_coded_char(utf, converted);
740	0	continue;
741	0	}
742
743		/* If there's a combined character map, try that */
744	0	if (cmap)
745	0	{
746	0	cp = bsearch(&iiso, cmap, cmapsize,
747	0	sizeof(pg_local_to_utf_combined), compare4);
748
749	0	if (cp)
750	0	{
751	0	utf = store_coded_char(utf, cp->utf1);
752	0	utf = store_coded_char(utf, cp->utf2);
753	0	continue;
754	0	}
755	0	}
756	0	}
757
758		/* if there's a conversion function, try that */
759	0	if (conv_func)
760	0	{
761	0	uint32 converted = (*conv_func) (iiso);
762
763	0	if (converted)
764	0	{
765	0	utf = store_coded_char(utf, converted);
766	0	continue;
767	0	}
768	0	}
769
770		/* failed to translate this character */
771	0	report_untranslatable_char(encoding, PG_UTF8,
772	0	(const char *) (iso - l), len);
773	0	}
774
775		/* if we broke out of loop early, must be invalid input */
776	0	if (len > 0)
777	0	report_invalid_encoding(encoding, (const char *) iso, len);
778
779	0	*utf = '\0';
780	0	}