/Users/deen/code/yugabyte-db/src/yb/yql/cql/ql/parser/scanner_util.cc

Source (jump to first uncovered line)
//--------------------------------------------------------------------------------------------------
// NOTE: All entities in this module are copies of PostgreSQL's code. We made some minor changes
// to avoid lint errors such as using '{' for if blocks and change the comment style from '/**/'
// to '//'.
//--------------------------------------------------------------------------------------------------

//--------------------------------------------------------------------------------------------------
// The following only applies to changes made to this file as part of YugaByte development.
//
// Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied.  See the License for the specific language governing permissions and limitations
// under the License.
//
// Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
// Portions Copyright (c) 1994, Regents of the University of California
//
// Scanning utility functions.
//--------------------------------------------------------------------------------------------------
#include "yb/yql/cql/ql/parser/scanner_util.h"

#include <algorithm>

#include "yb/gutil/macros.h"

namespace yb {
namespace ql {

using std::min;

//--------------------------------------------------------------------------------------------------

// Converts one hexadecimal digit character to its numeric value (0-15).
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'. Any other character is logged as an error and the
// function returns 0.
unsigned int hexval(unsigned char c) {
  if ('0' <= c && c <= '9') {
    return c - '0';
  }

  // Setting bit 0x20 maps 'A'-'F' onto 'a'-'f' and leaves 'a'-'f' unchanged, so a single range
  // test covers both letter cases. No other byte value lands in 'a'-'f' after the fold.
  const unsigned char folded = static_cast<unsigned char>(c | 0x20);
  if ('a' <= folded && folded <= 'f') {
    return folded - 'a' + 0xA;
  }

  // Not a hexadecimal digit: report it and fall back to 0.
  LOG(ERROR) << "invalid hexadecimal digit";
  return 0;
}

//--------------------------------------------------------------------------------------------------

// Copies the first `len` bytes of `ident` into `result`, lower-casing the ASCII letters 'A'-'Z'
// and leaving every other byte (including bytes with the high bit set) untouched, then
// NUL-terminates `result`. In total `len` + 1 bytes are written to `result`.
//
// SQL99 specifies Unicode-aware case normalization; this implementation only performs the
// ASCII-only downcasing described above.
//
// When the copied length reaches NAMEDATALEN, the copy is passed to truncate_identifier()
// together with `warn`.
void downcase_truncate_identifier(char *result, const char *ident, int len, bool warn) {
  int copied = 0;

  while (copied < len) {
    const unsigned char ch = static_cast<unsigned char>(ident[copied]);
    const bool is_ascii_upper = (ch >= 'A' && ch <= 'Z');

    result[copied] = static_cast<char>(is_ascii_upper ? ch + ('a' - 'A') : ch);
    ++copied;
  }
  result[copied] = '\0';

  if (copied >= NAMEDATALEN) {
    truncate_identifier(result, copied, warn);
  }
}

// Truncates `ident` in place so that it occupies fewer than NAMEDATALEN bytes.
//
// ident: NUL-terminated identifier buffer; modified only when `len` >= NAMEDATALEN.
// len:   byte length of `ident`.
// warn:  when true, a warning showing the full and the truncated identifier is logged.
//
// The cut position comes from pg_encoding_mbcliplen() with a limit of NAMEDATALEN - 1 bytes, so
// the identifier is clipped at a character boundary rather than in the middle of a multi-byte
// sequence.
void truncate_identifier(char *ident, size_t len, bool warn) {
  if (len < NAMEDATALEN) {
    return;
  }

  const size_t clipped_len = pg_encoding_mbcliplen(ident, len, NAMEDATALEN - 1);
  if (warn) {
    // Build the truncated name in a scratch buffer so the untruncated `ident` can still be shown
    // in the same message. clipped_len <= NAMEDATALEN - 1, so the terminator always fits.
    char buf[NAMEDATALEN];

    memcpy(buf, ident, clipped_len);
    buf[clipped_len] = '\0';
    // The ". " separator keeps the error text from running into "Identifier", matching the
    // message layout used by report_invalid_encoding().
    LOG(WARNING) << "SQL Warning: " << ErrorText(ErrorCode::NAME_TOO_LONG)
                 << ". Identifier " << ident << " will be truncated to " << buf;
  }
  ident[clipped_len] = '\0';
}

//--------------------------------------------------------------------------------------------------

// Returns true when `ch` is one of the scanner's whitespace characters: space, tab, newline,
// carriage return or form feed. This set must match scan.l's list of {space} characters.
bool scanner_isspace(char ch) {
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
      return true;
    default:
      return false;
  }
}

// Returns true when `escape` is acceptable as an escape character, false otherwise.
// Rejected characters are: hexadecimal digits, '+', single quote, double quote, and anything
// scanner_isspace() treats as whitespace.
bool check_uescapechar(unsigned char escape) {
  const bool rejected = isxdigit(escape)
      || escape == '+'
      || escape == '\''
      || escape == '"'
      || scanner_isspace(escape);
  return !rejected;
}

// Validation hook for the Unicode code point `c`. The body is empty: neither `c` nor `loc` is
// inspected, and nothing is reported or modified.
// NOTE(review): presumably `loc` is the source position to use when reporting an invalid value
// -- confirm against the callers before relying on it.
void check_unicode_value(pg_wchar c, char *loc) {
}

// Encodes the code point `c` as UTF-8 into `utf8string` and returns `utf8string`.
//
// Bytes written, by value of `c`:
//   c <= 0x7F    -> 1 byte
//   c <= 0x7FF   -> 2 bytes
//   c <= 0xFFFF  -> 3 bytes
//   otherwise    -> 4 bytes (only the low 21 bits of `c` are encoded)
//
// No NUL terminator is written; the caller's buffer needs room for up to 4 bytes.
unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string) {
  // Continuation byte: the 10xxxxxx marker carrying the six bits of `c` that start at `shift`.
  const auto continuation = [c](int shift) { return 0x80 | ((c >> shift) & 0x3F); };

  if (c <= 0x7F) {
    utf8string[0] = c;
    return utf8string;
  }

  if (c <= 0x7FF) {
    utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
    utf8string[1] = continuation(0);
    return utf8string;
  }

  if (c <= 0xFFFF) {
    utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
    utf8string[1] = continuation(6);
    utf8string[2] = continuation(0);
    return utf8string;
  }

  utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
  utf8string[1] = continuation(12);
  utf8string[2] = continuation(6);
  utf8string[3] = continuation(0);
  return utf8string;
}

//--------------------------------------------------------------------------------------------------

// Returns true when `c` lies in the range 0xD800-0xDBFF, i.e. it is the first (high) half of a
// UTF-16 surrogate pair.
bool is_utf16_surrogate_first(pg_wchar c) {
  if (c < 0xD800) {
    return false;
  }
  return c <= 0xDBFF;
}

// Returns true when `c` lies in the range 0xDC00-0xDFFF, i.e. it is the second (low) half of a
// UTF-16 surrogate pair.
bool is_utf16_surrogate_second(pg_wchar c) {
  if (c < 0xDC00) {
    return false;
  }
  return c <= 0xDFFF;
}

// Combines a UTF-16 surrogate pair into a single code point.
// The low ten bits of `first` become bits 10-19 and the low ten bits of `second` become bits
// 0-9; 0x10000 is then added. The arguments are not checked for being valid surrogates.
pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) {
  const auto high_part = (first & 0x3FF) << 10;
  const auto low_part = second & 0x3FF;
  return high_part + 0x10000 + low_part;
}

//--------------------------------------------------------------------------------------------------

// Returns the byte length of the UTF-8 character that starts at `s`, judged from the lead byte
// alone (continuation bytes are not examined):
//   0xxxxxxx -> 1,  110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4.
// Any other lead byte (a stray continuation byte, or a 5/6-byte lead unless NOT_USED is defined)
// is treated as a single byte so callers always make forward progress.
size_t pg_utf_mblen(const unsigned char *s) {
  struct LeadPattern {
    unsigned char mask;
    unsigned char value;
    size_t length;
  };
  static constexpr LeadPattern kLeadPatterns[] = {
    {0x80, 0x00, 1},
    {0xe0, 0xc0, 2},
    {0xf0, 0xe0, 3},
    {0xf8, 0xf0, 4},
#ifdef NOT_USED
    {0xfc, 0xf8, 5},
    {0xfe, 0xfc, 6},
#endif
  };

  const unsigned char lead = *s;
  for (const auto &pattern : kLeadPatterns) {
    if ((lead & pattern.mask) == pattern.value) {
      return pattern.length;
    }
  }
  return 1;
}

// Counts the characters in the first `limit` bytes of `mbstr`, stopping early at a NUL byte.
// Character widths come from pg_utf_mblen(). A character that starts inside the limit but
// extends past it is still counted, and scanning stops there.
//
// mbstr: UTF-8 text; must be readable up to `limit` bytes or up to its NUL terminator.
// limit: maximum number of bytes to examine.
// Returns the number of characters found.
size_t pg_mbstrlen_with_len(const char *mbstr, size_t limit) {
  size_t len = 0;

  while (limit > 0 && *mbstr) {
    const size_t l = pg_utf_mblen(reinterpret_cast<const unsigned char *>(mbstr));

    len++;
    // `limit` is unsigned, so subtracting a width larger than what remains would wrap around to
    // a huge value and let the scan run past the requested range. Stop as soon as this character
    // uses up the remaining budget.
    if (l >= limit) {
      break;
    }
    limit -= l;
    mbstr += l;
  }
  return len;
}

// Verifies that the `len` bytes at `mbstr` are valid UTF-8 without embedded NUL bytes and counts
// the characters.
//
// mbstr:   bytes to verify; need not be NUL-terminated.
// len:     number of bytes to verify.
// noError: when false, the first invalid sequence is reported via report_invalid_encoding().
//
// Returns the number of characters when the whole input is valid. On the first invalid sequence
// it returns static_cast<size_t>(-1), whether or not the problem was reported.
size_t pg_verify_mbstr_len(const char *mbstr, size_t len, bool noError) {
  size_t mb_len = 0;

  while (len > 0) {
    ssize_t l = -1;

    if (!is_utf_highbit_set(static_cast<unsigned char>(*mbstr))) {
      // Fast path for ASCII-subset characters. An embedded NUL is not accepted, so `l` stays
      // negative for it.
      if (*mbstr != '\0') {
        l = 1;
      }
    } else {
      l = pg_utf8_verifier(reinterpret_cast<const unsigned char *>(mbstr), len);
    }

    if (l < 0) {
      if (!noError) {
        report_invalid_encoding(mbstr, len);
      }
      // report_invalid_encoding() returns to the caller, so bail out explicitly. Carrying on
      // would apply the negative length to `mbstr` and `len`, moving the cursor backwards.
      return static_cast<size_t>(-1);
    }

    mbstr += l;
    len -= l;
    mb_len++;
  }
  return mb_len;
}

//--------------------------------------------------------------------------------------------------

// Logs an error describing the invalid UTF-8 sequence that starts at `mbstr`.
//
// mbstr: points at the first byte of the offending sequence.
// len:   number of readable bytes at `mbstr`.
//
// The message lists, as space-separated "0xNN" values, the bytes of the character that the lead
// byte announces (per pg_utf_mblen()), clipped to `len` and to at most 8 bytes. With `len` == 0
// nothing is read from `mbstr` and the byte list is empty.
void report_invalid_encoding(const char *mbstr, size_t len) {
  // Each byte takes at most 5 characters ("0xNN" plus a separating space), plus the terminator.
  char buf[8 * 5 + 1];
  char *p = buf;
  char *const buf_end = buf + sizeof(buf);

  // Start from an empty string so the log statement never reads uninitialized memory when no
  // byte ends up being dumped.
  buf[0] = '\0';

  const size_t l = (len > 0) ? pg_utf_mblen(reinterpret_cast<const unsigned char *>(mbstr)) : 0;
  size_t jlimit = min(l, len);
  jlimit = min<size_t>(jlimit, 8);  // prevent buffer overrun

  for (size_t j = 0; j < jlimit; j++) {
    // Bounded formatting: the separator is emitted before every byte except the first, which
    // yields the same "0xNN 0xNN" layout without a trailing space.
    p += snprintf(p, static_cast<size_t>(buf_end - p), "%s0x%02x",
                  (j == 0) ? "" : " ", static_cast<unsigned char>(mbstr[j]));
  }

  LOG(ERROR) << "SQL Error: " << ErrorText(ErrorCode::CHARACTER_NOT_IN_REPERTOIRE)
             << ". Invalid byte sequence for UTF8 \"" << buf << "\"";
}

//--------------------------------------------------------------------------------------------------

// Checks the single UTF-8 character that starts at `s`, given that `len` bytes are available.
// Returns the character's byte length (from pg_utf_mblen()) when all of its bytes fit within
// `len` and pg_utf8_islegal() accepts them; returns -1 otherwise.
ssize_t pg_utf8_verifier(const unsigned char *s, size_t len) {
  const auto char_len = pg_utf_mblen(s);

  // The legality check is only attempted when the whole character is available, so it never
  // looks past the `len` bytes the caller supplied.
  const bool complete_and_legal = (len >= char_len) && pg_utf8_islegal(s, char_len);
  if (!complete_and_legal) {
    return -1;
  }
  return char_len;
}

//--------------------------------------------------------------------------------------------------

// Returns true when the `length` bytes at `source` form exactly one well-formed UTF-8 character.
//
// source: first byte of the candidate character; `length` bytes must be readable.
// length: claimed byte length of the character. Anything outside 1-4 is rejected.
//
// Checks run from the last byte towards the first:
//   - the 4th and 3rd bytes (when present) must be continuation bytes, 0x80-0xBF;
//   - the 2nd byte (when present) must be in a range that depends on the lead byte:
//     0xE0 -> 0xA0-0xBF, 0xED -> 0x80-0x9F, 0xF0 -> 0x90-0xBF, 0xF4 -> 0x80-0x8F,
//     any other lead -> 0x80-0xBF;
//   - the lead byte must not be in 0x80-0xC1 and must not exceed 0xF4.
bool pg_utf8_islegal(const unsigned char *source, size_t length) {
  if (length < 1 || length > 4) {
    return false;
  }

  const auto in_range = [](unsigned char byte, unsigned char lo, unsigned char hi) {
    return byte >= lo && byte <= hi;
  };
  const unsigned char lead = source[0];

  if (length == 4 && !in_range(source[3], 0x80, 0xBF)) {
    return false;
  }

  if (length >= 3 && !in_range(source[2], 0x80, 0xBF)) {
    return false;
  }

  if (length >= 2) {
    // Default continuation range, narrowed for the four lead bytes that have a restricted
    // second byte.
    unsigned char lo = 0x80;
    unsigned char hi = 0xBF;
    if (lead == 0xE0) {
      lo = 0xA0;
    } else if (lead == 0xED) {
      hi = 0x9F;
    } else if (lead == 0xF0) {
      lo = 0x90;
    } else if (lead == 0xF4) {
      hi = 0x8F;
    }
    if (!in_range(source[1], lo, hi)) {
      return false;
    }
  }

  // Lead byte: 0x80-0xC1 and anything above 0xF4 can never start a character.
  if (in_range(lead, 0x80, 0xC1)) {
    return false;
  }
  return lead <= 0xF4;
}

//--------------------------------------------------------------------------------------------------

// Computes how many leading bytes of `mbstr` can be kept without exceeding `limit` bytes and
// without cutting a character in half (character widths come from pg_utf_mblen()).
//
// mbstr: text to clip; scanning also stops at a NUL byte.
// len:   byte length of `mbstr`; scanning stops once at least this many bytes are accounted for.
// limit: maximum number of bytes to keep.
// Returns the clipped byte length, which never exceeds `limit`.
size_t pg_encoding_mbcliplen(const char *mbstr, size_t len, size_t limit) {
  size_t kept = 0;
  const char *cursor = mbstr;

  for (;;) {
    if (kept >= len || *cursor == '\0') {
      break;
    }

    const auto width = pg_utf_mblen(reinterpret_cast<const unsigned char *>(cursor));
    const auto next = kept + width;
    if (next > limit) {
      // The next character would not fit entirely; keep what we have.
      break;
    }

    kept = next;
    if (kept == limit) {
      break;
    }
    cursor += width;
  }
  return kept;
}

}  // namespace ql
}  // namespace yb

YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

Line	Count	Source (jump to first uncovered line)
1		//--------------------------------------------------------------------------------------------------
2		// NOTE: All entities in this modules are copies of PostgreQL's code. We made some minor changes
3		// to avoid lint errors such as using '{' for if blocks and change the comment style from '/**/'
4		// to '//'.
5		//--------------------------------------------------------------------------------------------------
6
7		//--------------------------------------------------------------------------------------------------
8		// The following only applies to changes made to this file as part of YugaByte development.
9		//
10		// Copyright (c) YugaByte, Inc.
11		//
12		// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
13		// in compliance with the License. You may obtain a copy of the License at
14		//
15		// http://www.apache.org/licenses/LICENSE-2.0
16		//
17		// Unless required by applicable law or agreed to in writing, software distributed under the License
18		// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
19		// or implied. See the License for the specific language governing permissions and limitations
20		// under the License.
21		//
22		// Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
23		// Portions Copyright (c) 1994, Regents of the University of California
24		//
25		// Scanning utility functions.
26		//--------------------------------------------------------------------------------------------------
27		#include "yb/yql/cql/ql/parser/scanner_util.h"
28
29		#include <algorithm>
30
31		#include "yb/gutil/macros.h"
32
33		namespace yb {
34		namespace ql {
35
36		using std::min;
37
38		//--------------------------------------------------------------------------------------------------
39
40	0	unsigned int hexval(unsigned char c) {
41	0	if (c >= '0' && c <= '9')
42	0	return c - '0';
43	0	if (c >= 'a' && c <= 'f')
44	0	return c - 'a' + 0xA;
45	0	if (c >= 'A' && c <= 'F')
46	0	return c - 'A' + 0xA;
47
48	0	LOG(ERROR) << "invalid hexadecimal digit";
49	0	return 0; /* not reached */
50	0	}
51
52		//--------------------------------------------------------------------------------------------------
53
54	0	void downcase_truncate_identifier(char *result, const char *ident, int len, bool warn) {
55	0	int i;
56
57		// SQL99 specifies Unicode-aware case normalization, which we don't yet
58		// have the infrastructure for. Instead we use tolower() to provide a
59		// locale-aware translation. However, there are some locales where this
60		// is not right either (eg, Turkish may do strange things with 'i' and
61		// 'I'). Our current compromise is to use tolower() for characters with
62		// the high bit set, as long as they aren't part of a multi-byte
63		// character, and use an ASCII-only downcasing for 7-bit characters.
64	0	for (i = 0; i < len; i++) {
65	0	unsigned char ch = static_cast<unsigned char>(ident[i]);
66
67	0	if (ch >= 'A' && ch <= 'Z') {
68	0	ch += 'a' - 'A';
69	0	}
70	0	result[i] = static_cast<char>(ch);
71	0	}
72	0	result[i] = '\0';
73
74	0	if (i >= NAMEDATALEN) {
75	0	truncate_identifier(result, i, warn);
76	0	}
77	0	}
78
79	0	void truncate_identifier(char *ident, size_t len, bool warn) {
80	0	if (len >= NAMEDATALEN) {
81	0	len = pg_encoding_mbcliplen(ident, len, NAMEDATALEN - 1);
82	0	if (warn) {
83		// We avoid using %.*s here because it can misbehave if the data
84		// is not valid in what libc thinks is the prevailing encoding.
85	0	char buf[NAMEDATALEN];
86
87	0	memcpy(buf, ident, len);
88	0	buf[len] = '\0';
89	0	LOG(WARNING) << "SQL Warning: " << ErrorText(ErrorCode::NAME_TOO_LONG)
90	0	<< "Identifier " << ident << " will be truncated to " << buf;
91	0	}
92	0	ident[len] = '\0';
93	0	}
94	0	}
95
96		//--------------------------------------------------------------------------------------------------
97
98	0	bool scanner_isspace(char ch) {
99		// This must match scan.l's list of {space} characters.
100	0	return (ch == ' ' \|\| ch == '\t' \|\| ch == '\n' \|\| ch == '\r' \|\| ch == '\f');
101	0	}
102
103	0	bool check_uescapechar(unsigned char escape) {
104	0	if (isxdigit(escape)
105	0	\|\| escape == '+'
106	0	\|\| escape == '\''
107	0	\|\| escape == '"'
108	0	\|\| scanner_isspace(escape)) {
109	0	return false;
110	0	} else {
111	0	return true;
112	0	}
113	0	}
114
115	0	void check_unicode_value(pg_wchar c, char *loc) {
116	0	}
117
118	0	unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string) {
119	0	if (c <= 0x7F) {
120	0	utf8string[0] = c;
121	0	} else if (c <= 0x7FF) {
122	0	utf8string[0] = 0xC0 \| ((c >> 6) & 0x1F);
123	0	utf8string[1] = 0x80 \| (c & 0x3F);
124	0	} else if (c <= 0xFFFF) {
125	0	utf8string[0] = 0xE0 \| ((c >> 12) & 0x0F);
126	0	utf8string[1] = 0x80 \| ((c >> 6) & 0x3F);
127	0	utf8string[2] = 0x80 \| (c & 0x3F);
128	0	} else {
129	0	utf8string[0] = 0xF0 \| ((c >> 18) & 0x07);
130	0	utf8string[1] = 0x80 \| ((c >> 12) & 0x3F);
131	0	utf8string[2] = 0x80 \| ((c >> 6) & 0x3F);
132	0	utf8string[3] = 0x80 \| (c & 0x3F);
133	0	}
134
135	0	return utf8string;
136	0	}
137
138		//--------------------------------------------------------------------------------------------------
139
140	0	bool is_utf16_surrogate_first(pg_wchar c) {
141	0	return (c >= 0xD800 && c <= 0xDBFF);
142	0	}
143
144	0	bool is_utf16_surrogate_second(pg_wchar c) {
145	0	return (c >= 0xDC00 && c <= 0xDFFF);
146	0	}
147
148	0	pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second) {
149	0	return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
150	0	}
151
152		//--------------------------------------------------------------------------------------------------
153
154	0	size_t pg_utf_mblen(const unsigned char *s) {
155	0	if ((*s & 0x80) == 0)
156	0	return 1;
157	0	else if ((*s & 0xe0) == 0xc0)
158	0	return 2;
159	0	else if ((*s & 0xf0) == 0xe0)
160	0	return 3;
161	0	else if ((*s & 0xf8) == 0xf0)
162	0	return 4;
163		#ifdef NOT_USED
164		else if ((*s & 0xfc) == 0xf8)
165		return 5;
166		else if ((*s & 0xfe) == 0xfc)
167		return 6;
168		#endif
169	0	else
170	0	return 1;
171	0	}
172
173	0	size_t pg_mbstrlen_with_len(const char *mbstr, size_t limit) {
174	0	size_t len = 0;
175
176	0	while (limit > 0 && *mbstr) {
177	0	auto l = pg_utf_mblen((const unsigned char *)mbstr);
178
179	0	limit -= l;
180	0	mbstr += l;
181	0	len++;
182	0	}
183	0	return len;
184	0	}
185
186	0	size_t pg_verify_mbstr_len(const char *mbstr, size_t len, bool noError) {
187	0	size_t mb_len = 0;
188	0	while (len > 0) {
189		/* fast path for ASCII-subset characters */
190	0	if (!is_utf_highbit_set(static_cast<unsigned char>(*mbstr))) {
191	0	if (*mbstr != '\0') {
192	0	mb_len++;
193	0	mbstr++;
194	0	len--;
195	0	continue;
196	0	}
197	0	if (noError) {
198	0	return -1;
199	0	}
200	0	report_invalid_encoding(mbstr, len);
201	0	}
202
203	0	auto l = pg_utf8_verifier((const unsigned char *) mbstr, len);
204
205	0	if (l < 0) {
206	0	if (noError)
207	0	return -1;
208	0	report_invalid_encoding(mbstr, len);
209	0	}
210
211	0	mbstr += l;
212	0	len -= l;
213	0	mb_len++;
214	0	}
215	0	return mb_len;
216	0	}
217
218		//--------------------------------------------------------------------------------------------------
219
220	0	void report_invalid_encoding(const char *mbstr, size_t len) {
221	0	auto l = pg_utf_mblen((const unsigned char *)mbstr);
222	0	char buf[8 * 5 + 1];
223	0	char *p = buf;
224
225	0	auto jlimit = min(l, len);
226	0	jlimit = min<size_t>(jlimit, 8); /* prevent buffer overrun */
227
228		// The following NOLINTs are used as I tried to leave PostgreQL's code as is. Eventually, when we
229		// use our own error reporting for QL interface, the NOLINTs should be gone then.
230	0	for (size_t j = 0; j < jlimit; j++) {
231	0	p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]); // NOLINT(*)
232	0	if (j < jlimit - 1)
233	0	p += sprintf(p, " "); // NOLINT(*)
234	0	}
235
236	0	LOG(ERROR) << "SQL Error: " << ErrorText(ErrorCode::CHARACTER_NOT_IN_REPERTOIRE)
237	0	<< ". Invalid byte sequence for UTF8 \"" << buf << "\"";
238	0	}
239
240		//--------------------------------------------------------------------------------------------------
241
242	0	ssize_t pg_utf8_verifier(const unsigned char *s, size_t len) {
243	0	auto l = pg_utf_mblen(s);
244
245	0	if (len < l)
246	0	return -1;
247
248	0	if (!pg_utf8_islegal(s, l))
249	0	return -1;
250
251	0	return l;
252	0	}
253
254		//--------------------------------------------------------------------------------------------------
255
256	0	bool pg_utf8_islegal(const unsigned char *source, size_t length) {
257	0	unsigned char a;
258
259	0	switch (length) {
260	0	default:
261		/* reject lengths 5 and 6 for now */
262	0	return false;
263
264	0	case 4:
265	0	a = source[3];
266	0	if (a < 0x80 \|\| a > 0xBF)
267	0	return false;
268	0	FALLTHROUGH_INTENDED;
269
270	0	case 3:
271	0	a = source[2];
272	0	if (a < 0x80 \|\| a > 0xBF)
273	0	return false;
274	0	FALLTHROUGH_INTENDED;
275
276	0	case 2:
277	0	a = source[1];
278	0	switch (*source) {
279	0	case 0xE0:
280	0	if (a < 0xA0 \|\| a > 0xBF)
281	0	return false;
282	0	break;
283	0	case 0xED:
284	0	if (a < 0x80 \|\| a > 0x9F)
285	0	return false;
286	0	break;
287	0	case 0xF0:
288	0	if (a < 0x90 \|\| a > 0xBF)
289	0	return false;
290	0	break;
291	0	case 0xF4:
292	0	if (a < 0x80 \|\| a > 0x8F)
293	0	return false;
294	0	break;
295	0	default:
296	0	if (a < 0x80 \|\| a > 0xBF)
297	0	return false;
298	0	break;
299	0	}
300	0	FALLTHROUGH_INTENDED;
301
302	0	case 1:
303	0	a = *source;
304	0	if (a >= 0x80 && a < 0xC2)
305	0	return false;
306	0	if (a > 0xF4)
307	0	return false;
308	0	break;
309	0	}
310	0	return true;
311	0	}
312
313		//--------------------------------------------------------------------------------------------------
314
315	0	size_t pg_encoding_mbcliplen(const char *mbstr, size_t len, size_t limit) {
316	0	size_t clen = 0;
317
318	0	while (clen < len && *mbstr) {
319	0	auto l = pg_utf_mblen((const unsigned char *) mbstr);
320	0	if ((clen + l) > limit)
321	0	break;
322	0	clen += l;
323	0	if (clen == limit)
324	0	break;
325	0	mbstr += l;
326	0	}
327	0	return clen;
328	0	}
329
330		} // namespace ql
331		} // namespace yb