/Users/deen/code/yugabyte-db/src/postgres/src/backend/parser/scansup.c

Source (jump to first uncovered line)
/*-------------------------------------------------------------------------
 *
 * scansup.c
 *    support routines for the lex/flex scanner, used by both the normal
 * backend as well as the bootstrap backend
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/parser/scansup.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <ctype.h>

#include "parser/scansup.h"
#include "mb/pg_wchar.h"


/* ----------------
 *    scanstr
 *
 * Produce a copy of the given string in which quoting and escape
 * sequences have been replaced by the characters they stand for:
 *
 *    ''          a doubled single quote becomes one single quote
 *    \b \f \n \r \t   become the corresponding control characters
 *    \o \oo \ooo     one to three octal digits become the byte with
 *                    that value (converted to char)
 *    \c          any other character after a backslash is copied as-is
 *
 * A NULL or empty input yields a freshly allocated empty string.
 *
 * The result is always palloc'd; the caller is responsible for
 * eventually pfree'ing it.
 * ----------------
 */

char *
scanstr(const char *s)
{
  char     *out;
  int     srclen;
  int     rd = 0;       /* next input position to read */
  int     wr = 0;       /* next output position to write */

  if (s == NULL || s[0] == '\0')
    return pstrdup("");

  srclen = strlen(s);

  /* Every input sequence shrinks or keeps its size, so this is enough */
  out = palloc(srclen + 1);

  while (rd < srclen)
  {
    char    cur = s[rd++];

    if (cur == '\'')
    {
      /*
       * A correctly working scanner only lets unescaped quotes through
       * in pairs, so a second quote is expected here.  The bootstrap
       * parser is less careful, hence the assertion.
       */
      Assert(s[rd] == '\'');
      out[wr++] = s[rd++];
    }
    else if (cur == '\\')
    {
      char    esc = s[rd];

      if (esc >= '0' && esc <= '7')
      {
        /* Octal escape: consume at most three octal digits */
        long    value = 0;
        int     ndigits = 0;

        while (ndigits < 3 && s[rd] >= '0' && s[rd] <= '7')
        {
          value = (value << 3) + (s[rd] - '0');
          rd++;
          ndigits++;
        }
        out[wr++] = (char) value;
      }
      else
      {
        char    decoded;

        /* Single-character escape: step over the escaped character */
        rd++;
        switch (esc)
        {
          case 'b':
            decoded = '\b';
            break;
          case 'f':
            decoded = '\f';
            break;
          case 'n':
            decoded = '\n';
            break;
          case 'r':
            decoded = '\r';
            break;
          case 't':
            decoded = '\t';
            break;
          default:
            /* Unrecognized escape: keep the character itself */
            decoded = esc;
            break;
        }
        out[wr++] = decoded;
      }
    }
    else
      out[wr++] = cur;
  }

  out[wr] = '\0';
  return out;
}


/*
 * downcase_truncate_identifier() --- fold an unquoted identifier to lower
 * case and truncate it if it is too long, optionally warning about the
 * truncation.
 *
 * This simply calls downcase_identifier() with truncation enabled and
 * hands back its result, which is a palloc'd string holding the adjusted
 * identifier.
 *
 * Note: some callers pass a string that is not null-terminated, so only
 * the first "len" bytes of "ident" are meaningful.
 *
 * Note: the API is shaped so that a downcasing transformation could make
 * the string longer, but that is not supported yet.  Anyone implementing
 * it will also need to fix SplitIdentifierString() in utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
  char     *folded;

  /* Truncation is always requested through this entry point */
  folded = downcase_identifier(ident, len, warn, true);

  return folded;
}

/*
 * downcase_identifier() --- the workhorse behind
 * downcase_truncate_identifier().
 *
 * Copies the first "len" bytes of "ident" into a new palloc'd,
 * null-terminated string, folding upper-case characters to lower case.
 * If "truncate" is true and the result is NAMEDATALEN bytes or longer,
 * it is passed to truncate_identifier(), with "warn" controlling whether
 * that function reports the truncation.
 */
char *
downcase_identifier(const char *ident, int len, bool warn, bool truncate)
{
  char     *folded = palloc(len + 1);
  bool    single_byte_enc = (pg_database_encoding_max_length() == 1);
  int     pos = 0;

  /*
   * SQL99 calls for Unicode-aware case normalization, for which the
   * infrastructure does not exist yet.  tolower() gives a locale-aware
   * translation instead, but even that is wrong in some locales (for
   * example, Turkish may do strange things with 'i' and 'I').  The
   * compromise used here: 7-bit characters get a plain ASCII-only
   * downcasing, while tolower() is applied only to characters with the
   * high bit set, and only when they cannot be part of a multi-byte
   * character (i.e. the database encoding is single-byte).
   */
  while (pos < len)
  {
    unsigned char c = (unsigned char) ident[pos];

    if (c >= 'A' && c <= 'Z')
      c += 'a' - 'A';
    else if (single_byte_enc && IS_HIGHBIT_SET(c) && isupper(c))
      c = tolower(c);

    folded[pos] = (char) c;
    pos++;
  }
  folded[pos] = '\0';

  if (truncate && pos >= NAMEDATALEN)
    truncate_identifier(folded, pos, warn);

  return folded;
}


/*
 * truncate_identifier() --- shorten an identifier to NAMEDATALEN-1 bytes.
 *
 * Nothing happens when "len" is below NAMEDATALEN.  Otherwise the string
 * is cut in place by storing a terminator at the length chosen by
 * pg_mbcliplen(), and a NOTICE showing both the full and the shortened
 * name is emitted if "warn" is true.
 *
 * The caller supplies the string length so that a strlen() call can be
 * avoided in some common usages.
 */
void
truncate_identifier(char *ident, int len, bool warn)
{
  int     cliplen;

  /* Short enough already: leave the string untouched */
  if (len < NAMEDATALEN)
    return;

  cliplen = pg_mbcliplen(ident, len, NAMEDATALEN - 1);

  if (warn)
  {
    /*
     * Build the shortened name in a local buffer rather than printing
     * with %.*s, which can misbehave if the data is not valid in what
     * libc thinks is the prevailing encoding.
     */
    char    clipped[NAMEDATALEN];

    memcpy(clipped, ident, cliplen);
    clipped[cliplen] = '\0';
    ereport(NOTICE,
        (errcode(ERRCODE_NAME_TOO_LONG),
         errmsg("identifier \"%s\" will be truncated to \"%s\"",
            ident, clipped)));
  }

  /* Cut only after the message, which still needs the full name */
  ident[cliplen] = '\0';
}

/*
 * scanner_isspace() --- report whether the flex scanner treats a character
 * as whitespace.
 *
 * Returns true for space, tab, newline, carriage return and form feed,
 * and false for every other character.
 *
 * Use this in place of isspace(), which may depend on the locale, whenever
 * the result has to agree with the lexer.
 *
 * Similar helpers for isalnum and friends might be needed in principle,
 * but so far only isspace has been required.
 */
bool
scanner_isspace(char ch)
{
  /* Keep this list in sync with the {space} characters in scan.l */
  switch (ch)
  {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
      return true;
    default:
      return false;
  }
}

YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

Line	Count	Source (jump to first uncovered line)
1		/*-------------------------------------------------------------------------
2		*
3		* scansup.c
4		* support routines for the lex/flex scanner, used by both the normal
5		* backend as well as the bootstrap backend
6		*
7		* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8		* Portions Copyright (c) 1994, Regents of the University of California
9		*
10		*
11		* IDENTIFICATION
12		* src/backend/parser/scansup.c
13		*
14		*-------------------------------------------------------------------------
15		*/
16		#include "postgres.h"
17
18		#include <ctype.h>
19
20		#include "parser/scansup.h"
21		#include "mb/pg_wchar.h"
22
23
24		/* ----------------
25		* scanstr
26		*
27		* if the string passed in has escaped codes, map the escape codes to actual
28		* chars
29		*
30		* the string returned is palloc'd and should eventually be pfree'd by the
31		* caller!
32		* ----------------
33		*/
34
35		char *
36		scanstr(const char *s)
37	0	{
38	0	char *newStr;
39	0	int len,
40	0	i,
41	0	j;
42
43	0	if (s == NULL \|\| s[0] == '\0')
44	0	return pstrdup("");
45
46	0	len = strlen(s);
47
48	0	newStr = palloc(len + 1); /* string cannot get longer */
49
50	0	for (i = 0, j = 0; i < len; i++)
51	0	{
52	0	if (s[i] == '\'')
53	0	{
54		/*
55		* Note: if scanner is working right, unescaped quotes can only
56		* appear in pairs, so there should be another character.
57		*/
58	0	i++;
59		/* The bootstrap parser is not as smart, so check here. */
60	0	Assert(s[i] == '\'');
61	0	newStr[j] = s[i];
62	0	}
63	0	else if (s[i] == '\\')
64	0	{
65	0	i++;
66	0	switch (s[i])
67	0	{
68	0	case 'b':
69	0	newStr[j] = '\b';
70	0	break;
71	0	case 'f':
72	0	newStr[j] = '\f';
73	0	break;
74	0	case 'n':
75	0	newStr[j] = '\n';
76	0	break;
77	0	case 'r':
78	0	newStr[j] = '\r';
79	0	break;
80	0	case 't':
81	0	newStr[j] = '\t';
82	0	break;
83	0	case '0':
84	0	case '1':
85	0	case '2':
86	0	case '3':
87	0	case '4':
88	0	case '5':
89	0	case '6':
90	0	case '7':
91	0	{
92	0	int k;
93	0	long octVal = 0;
94
95	0	for (k = 0;
96	0	s[i + k] >= '0' && s[i + k] <= '7' && k < 3;
97	0	k++)
98	0	octVal = (octVal << 3) + (s[i + k] - '0');
99	0	i += k - 1;
100	0	newStr[j] = ((char) octVal);
101	0	}
102	0	break;
103	0	default:
104	0	newStr[j] = s[i];
105	0	break;
106	0	} /* switch */
107	0	} /* s[i] == '\\' */
108	0	else
109	0	newStr[j] = s[i];
110	0	j++;
111	0	}
112	0	newStr[j] = '\0';
113	0	return newStr;
114	0	}
115
116
117		/*
118		* downcase_truncate_identifier() --- do appropriate downcasing and
119		* truncation of an unquoted identifier. Optionally warn of truncation.
120		*
121		* Returns a palloc'd string containing the adjusted identifier.
122		*
123		* Note: in some usages the passed string is not null-terminated.
124		*
125		* Note: the API of this function is designed to allow for downcasing
126		* transformations that increase the string length, but we don't yet
127		* support that. If you want to implement it, you'll need to fix
128		* SplitIdentifierString() in utils/adt/varlena.c.
129		*/
130		char *
131		downcase_truncate_identifier(const char *ident, int len, bool warn)
132	613k	{
133	613k	return downcase_identifier(ident, len, warn, true);
134	613k	}
135
136		/*
137		* a workhorse for downcase_truncate_identifier
138		*/
139		char *
140		downcase_identifier(const char *ident, int len, bool warn, bool truncate)
141	613k	{
142	613k	char *result;
143	613k	int i;
144	613k	bool enc_is_single_byte;
145
146	613k	result = palloc(len + 1);
147	613k	enc_is_single_byte = pg_database_encoding_max_length() == 1;
148
149		/*
150		* SQL99 specifies Unicode-aware case normalization, which we don't yet
151		* have the infrastructure for. Instead we use tolower() to provide a
152		* locale-aware translation. However, there are some locales where this
153		* is not right either (eg, Turkish may do strange things with 'i' and
154		* 'I'). Our current compromise is to use tolower() for characters with
155		* the high bit set, as long as they aren't part of a multi-byte
156		* character, and use an ASCII-only downcasing for 7-bit characters.
157		*/
158	5.03M	for (i = 0; i < len; i++)
159	4.42M	{
160	4.42M	unsigned char ch = (unsigned char) ident[i];
161
162	4.42M	if (ch >= 'A' && ch <= 'Z')
163	171k	ch += 'a' - 'A';
164	4.25M	else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
165	0	ch = tolower(ch);
166	4.42M	result[i] = (char) ch;
167	4.42M	}
168	613k	result[i] = '\0';
169
170	613k	if (i >= NAMEDATALEN && truncate)
171	0	truncate_identifier(result, i, warn);
172
173	613k	return result;
174	613k	}
175
176
177		/*
178		* truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
179		*
180		* The given string is modified in-place, if necessary. A warning is
181		* issued if requested.
182		*
183		* We require the caller to pass in the string length since this saves a
184		* strlen() call in some common usages.
185		*/
186		void
187		truncate_identifier(char *ident, int len, bool warn)
188	41.3k	{
189	41.3k	if (len >= NAMEDATALEN)
190	0	{
191	0	len = pg_mbcliplen(ident, len, NAMEDATALEN - 1);
192	0	if (warn)
193	0	{
194		/*
195		* We avoid using %.*s here because it can misbehave if the data
196		* is not valid in what libc thinks is the prevailing encoding.
197		*/
198	0	char buf[NAMEDATALEN];
199
200	0	memcpy(buf, ident, len);
201	0	buf[len] = '\0';
202	0	ereport(NOTICE,
203	0	(errcode(ERRCODE_NAME_TOO_LONG),
204	0	errmsg("identifier \"%s\" will be truncated to \"%s\"",
205	0	ident, buf)));
206	0	}
207	0	ident[len] = '\0';
208	0	}
209	41.3k	}
210
211		/*
212		* scanner_isspace() --- return true if flex scanner considers char whitespace
213		*
214		* This should be used instead of the potentially locale-dependent isspace()
215		* function when it's important to match the lexer's behavior.
216		*
217		* In principle we might need similar functions for isalnum etc, but for the
218		* moment only isspace seems needed.
219		*/
220		bool
221		scanner_isspace(char ch)
222	952k	{
223		/* This must match scan.l's list of {space} characters */
224	952k	if (ch == ' ' \|\|
225	939k	ch == '\t' \|\|
226	929k	ch == '\n' \|\|
227	918k	ch == '\r' \|\|
228	918k	ch == '\f')
229	34.3k	return true;
230	918k	return false;
231	918k	}