/Users/deen/code/yugabyte-db/src/yb/yql/cql/ql/parser/scanner.cc
Line | Count | Source (jump to first uncovered line) |
1 | | //-------------------------------------------------------------------------------------------------- |
2 | | // The following only applies to changes made to this file as part of YugaByte development. |
3 | | // |
4 | | // Copyright (c) YugaByte, Inc. |
5 | | // |
6 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
7 | | // in compliance with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
12 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
13 | | // or implied. See the License for the specific language governing permissions and limitations |
14 | | // under the License. |
15 | | // |
16 | | // Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group |
17 | | // Portions Copyright (c) 1994, Regents of the University of California |
18 | | // |
// API for the core scanner (flex machine). Some of the functions are not following Yugabyte naming
// convention because they are PostgreSQL code.
21 | | //-------------------------------------------------------------------------------------------------- |
22 | | |
23 | | // #include <algorithm> |
24 | | #include <unordered_map> |
25 | | |
26 | | #include "yb/gutil/casts.h" |
27 | | |
28 | | #include "yb/yql/cql/ql/parser/parser.h" |
29 | | #include "yb/yql/cql/ql/parser/scanner.h" |
30 | | #include "yb/yql/cql/ql/parser/scanner_util.h" |
31 | | #include "yb/util/logging.h" |
32 | | |
33 | | namespace yb { |
34 | | namespace ql { |
35 | | |
36 | | using std::unordered_map; |
37 | | |
38 | | //-------------------------------------------------------------------------------------------------- |
39 | | // Class LexProcessor. |
40 | | //-------------------------------------------------------------------------------------------------- |
41 | | |
42 | | LexProcessor::LexProcessor() |
43 | | : literalbuf_(nullptr), |
44 | | literallen_(0), |
45 | | literalalloc_(0), |
46 | | backslash_quote_(BackslashQuoteType::SAFE_ENCODING), |
47 | | escape_string_warning_(true), |
48 | 18.0k | standard_conforming_strings_(true) { |
49 | 18.0k | } |
50 | | |
51 | 1 | LexProcessor::~LexProcessor() { |
52 | 1 | if (literalbuf_ != nullptr) { |
53 | 0 | free(literalbuf_); |
54 | 0 | } |
55 | 1 | } |
56 | | |
57 | | //-------------------------------------------------------------------------------------------------- |
58 | | |
// Resets all scanner state so that a new statement can be scanned from
// 'parse_context'. Must be called before the first Scan() of each statement.
void LexProcessor::ScanInit(ParseContext *parse_context) {
  // Point the flex machine at the new statement's input stream.
  yyrestart(parse_context->ql_file());

  // Reset source locations and drop any cached lookahead token (type 0 marks
  // the lookahead slot as empty; see Scan()).
  token_loc_.initialize();
  cursor_.initialize();
  lookahead_.type = 0;

  // Reset per-literal scanning state.
  literallen_ = 0;
  xcdepth_ = 0;
  dolqstart_ = nullptr;
  utf16_first_part_ = 0;
  warn_on_first_escape_ = false;
  saw_non_ascii_ = false;

  // String-handling options revert to the conservative defaults (same values
  // the constructor sets).
  backslash_quote_ = BackslashQuoteType::SAFE_ENCODING;
  escape_string_warning_ = true;
  standard_conforming_strings_ = true;

  parse_context_ = parse_context;
  if (parse_context_ != nullptr) {
    // Enable flex's own debug tracing when the context requests it.
    yy_flex_debug = parse_context->trace_scanning();
  }
}
82 | | |
83 | | //-------------------------------------------------------------------------------------------------- |
84 | | |
// Returns the next token for the grammar. Implements the one-token lookahead
// that PostgreSQL-style grammars need: GROUP, OFFSET, NOT, NULLS and WITH are
// replaced by their *_LA variants when the following token makes the intended
// production unambiguous. At most one token is ever cached in 'lookahead_'.
GramProcessor::symbol_type LexProcessor::Scan() {
  // Use the lookahead from the context if it's available. Otherwise, read the token.
  GramProcessor::symbol_type cur_token;
  ScanState scan_state;

  if (lookahead_.token() != 0) {
    // Remove lookahead from the context and reset it to type 0 (empty).
    cur_token.move(lookahead_);
    lookahead_.type = 0;
  } else {
    // Read the next token and save it to 'cur_token'.
    ScanNextToken(scan_state, &cur_token);
  }

  // Return the token if it doesn't require lookahead. Otherwise, set the token length.
  switch (cur_token.token()) {
    case GramProcessor::token::TOK_GROUP_P:
    case GramProcessor::token::TOK_OFFSET:
    case GramProcessor::token::TOK_NOT:
    case GramProcessor::token::TOK_NULLS_P:
    case GramProcessor::token::TOK_WITH: {
      break;
    }

    default: {
      // Return 'cur_token' as it does not require lookahead.
      return cur_token;
    }
  }

  // Cache the lookahead token; it will be consumed by the next Scan() call.
  ScanNextToken(scan_state, &lookahead_);

  // Replace cur_token if needed, based on lookahead.
  GramProcessor::token_type next_token_type = lookahead_.token();
  switch (cur_token.token()) {
    case GramProcessor::token::TOK_GROUP_P: {
      // Replace GROUP_P with GROUP_LA to support SELECT ... GROUP BY ...
      // - Token GROUP_P is accepted when being used as column name (practically all names).
      // - Token GROUP_LA is accepted when being used in GROUP BY clause.
      //     group_clause: GROUP_LA BY <group_by_list>
      int next_tok = static_cast<int>(next_token_type);
      if (next_tok == GramProcessor::token::TOK_BY) {
        return GramProcessor::make_GROUP_LA(cursor_);
      }
      break;
    }

    case GramProcessor::token::TOK_OFFSET: {
      // Replace OFFSET with OFFSET_LA to support SELECT ... OFFSET ...
      // - Token OFFSET is accepted when being used as column name (practically all names).
      // - Token OFFSET_LA is accepted when being used in OFFSET clause.
      //     offset_clause: OFFSET_LA <int constant>
      //                    OFFSET_LA '?'  --> Bind variable
      //                    OFFSET_LA ':'  --> Bind variable
      int next_tok = static_cast<int>(next_token_type);
      if (next_tok == GramProcessor::token::TOK_ICONST || next_tok == '?' || next_tok == ':') {
        return GramProcessor::make_OFFSET_LA(cursor_);
      }
      break;
    }

    case GramProcessor::token::TOK_NOT: {
      // Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc.
      switch (next_token_type) {
        case GramProcessor::token::TOK_BETWEEN:
        case GramProcessor::token::TOK_EXISTS:
        case GramProcessor::token::TOK_IN_P:
        case GramProcessor::token::TOK_LIKE:
        case GramProcessor::token::TOK_ILIKE:
        case GramProcessor::token::TOK_SIMILAR: {
          return GramProcessor::make_NOT_LA(cursor_);
        }

        default: {
          break;
        }
      }
      break;
    }

    case GramProcessor::token::TOK_NULLS_P: {
      // Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST.
      switch (next_token_type) {
        case GramProcessor::token::TOK_FIRST_P:
        case GramProcessor::token::TOK_LAST_P: {
          return GramProcessor::make_NULLS_LA(cursor_);
        }

        default: {
          break;
        }
      }
      break;
    }

    case GramProcessor::token::TOK_WITH: {
      // Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY.
      switch (next_token_type) {
        case GramProcessor::token::TOK_TIME:
        case GramProcessor::token::TOK_ORDINALITY: {
          return GramProcessor::make_WITH_LA(cursor_);
        }

        default: {
          break;
        }
      }
      break;
    }

    default: {
      break;
    }
  }

  return cur_token;
}
203 | | |
204 | | //-------------------------------------------------------------------------------------------------- |
205 | | |
206 | 680k | int LexProcessor::LexerInput(char* buf, int max_size) { |
207 | 680k | return narrow_cast<int>(parse_context_->Read(buf, max_size)); |
208 | 680k | } |
209 | | |
210 | | //-------------------------------------------------------------------------------------------------- |
211 | | |
212 | 2.27M | void LexProcessor::CountNewlineInToken(const string& token) { |
213 | 2.27M | const auto lines = |
214 | 2.27M | count(token.begin(), token.end(), '\n') + count(token.begin(), token.end(), '\r'); |
215 | 2.27M | cursor_.lines(narrow_cast<int>(lines)); |
216 | 2.27M | } |
217 | | |
218 | | //-------------------------------------------------------------------------------------------------- |
219 | | |
220 | 0 | GramProcessor::symbol_type LexProcessor::ScanError(const char *token) { |
221 | | // Flex scanner will raise exception by itself, so we don't return Status::Error here. |
222 | 0 | Status s = parse_context_->Error(token_loc_, |
223 | 0 | "Lexical error at or near ", |
224 | 0 | ErrorCode::LEXICAL_ERROR, |
225 | 0 | token); |
226 | 0 | VLOG(3) << s.ToString(); |
227 | 0 | return GramProcessor::make_SCAN_ERROR(cursor_); |
228 | 0 | } |
229 | | |
230 | 0 | GramProcessor::symbol_type LexProcessor::ScanError(const char *message, ErrorCode errcode) { |
231 | | // Flex scanner will raise exception by itself, so we don't return Status::Error here. |
232 | 0 | Status s = parse_context_->Error(token_loc_, message, errcode); |
233 | 0 | VLOG(3) << s.ToString(); |
234 | 0 | return GramProcessor::make_SCAN_ERROR(cursor_); |
235 | 0 | } |
236 | | |
237 | | //-------------------------------------------------------------------------------------------------- |
238 | | |
239 | | void LexProcessor::ScanNextToken(const ScanState& scan_state, |
240 | 4.45M | GramProcessor::symbol_type *next_token) { |
241 | 4.45M | GramProcessor::symbol_type new_token(yylex(scan_state)); |
242 | 4.45M | next_token->move(new_token); |
243 | 4.45M | } |
244 | | |
245 | | //-------------------------------------------------------------------------------------------------- |
246 | | |
247 | 216k | MCSharedPtr<MCString> LexProcessor::ScanLiteral() { |
248 | | // Convert the literal to string and count the newline character. |
249 | 216k | MCSharedPtr<MCString> value = MCMakeShared<MCString>(PTreeMem(), literalbuf_, literallen_); |
250 | | // Count newlines in this literal. |
251 | 216k | CountNewlineInToken(value->c_str()); |
252 | | |
253 | 216k | return value; |
254 | 216k | } |
255 | | |
256 | | //-------------------------------------------------------------------------------------------------- |
257 | | |
258 | 973k | MCSharedPtr<MCString> LexProcessor::MakeIdentifier(const char *text, int len, bool warn) { |
259 | | // SQL99 specifies Unicode-aware case normalization, which we don't yet |
260 | | // have the infrastructure for. Instead we use tolower() to provide a |
261 | | // locale-aware translation. However, there are some locales where this |
262 | | // is not right either (eg, Turkish may do strange things with 'i' and |
263 | | // 'I'). Our current compromise is to use tolower() for characters with |
264 | | // the high bit set, as long as they aren't part of a multi-byte |
265 | | // character, and use an ASCII-only downcasing for 7-bit characters. |
266 | 973k | MCSharedPtr<MCString> ident = MCMakeShared<MCString>(PTreeMem(), len, '\0'); |
267 | 973k | int i; |
268 | 8.62M | for (i = 0; i < len; i++7.64M ) { |
269 | 7.64M | unsigned char ch = static_cast<unsigned char>(text[i]); |
270 | 7.64M | if (ch >= 'A' && ch <= 'Z'7.35M ) { |
271 | 15.4k | ch += 'a' - 'A'; |
272 | 15.4k | } |
273 | 7.64M | (*ident)[i] = static_cast<char>(ch); |
274 | 7.64M | } |
275 | | |
276 | 973k | if (i >= NAMEDATALEN) { |
277 | 0 | TruncateIdentifier(ident, warn); |
278 | 0 | } |
279 | 973k | return ident; |
280 | 973k | } |
281 | | |
// Truncates 'ident' in place so it fits in NAMEDATALEN bytes, clipping on a
// multibyte-character boundary. When 'warn' is set, the user is notified of
// the truncation via the parse context.
void LexProcessor::TruncateIdentifier(const MCSharedPtr<MCString>& ident, bool warn) {
  auto len = ident->length();
  if (len >= NAMEDATALEN) {
    // Clip to at most NAMEDATALEN - 1 bytes without splitting a multibyte char.
    len = pg_encoding_mbcliplen(ident->c_str(), len, NAMEDATALEN - 1);
    if (warn) {
      // We avoid using %.*s here because it can misbehave if the data
      // is not valid in what libc thinks is the prevailing encoding.
      char buf[NAMEDATALEN];  // len <= NAMEDATALEN - 1 here, so buf fits len + NUL.
      memcpy(buf, ident->c_str(), len);
      buf[len] = '\0';
      char warn_msg[1024];
      snprintf(warn_msg, sizeof(warn_msg),
               "Identifier %s will be truncated to %s", ident->c_str(), buf);
      parse_context_->Warn(token_loc_, warn_msg, ErrorCode::NAME_TOO_LONG);
    }
    ident->resize(len);
  }
}
300 | | |
//--------------------------------------------------------------------------------------------------
// NOTE: All entities below this line in this module are copies of PostgreSQL's code. We made some
// minor changes to avoid lint errors such as using '{' for if blocks and changing the comment style
// from '/**/' to '//'.
//--------------------------------------------------------------------------------------------------
306 | | |
307 | 215k | inline void LexProcessor::EnlargeLiteralBuf(size_t bytes) { |
308 | | // Increase literalbuf by the given number of "bytes". |
309 | 215k | auto prev_literalalloc = literalalloc_; |
310 | 215k | if (prev_literalalloc == 0) { |
311 | 16.5k | literalalloc_ = 4096; |
312 | 16.5k | } |
313 | 215k | while (literalalloc_ < literallen_ + bytes) { |
314 | 0 | literalalloc_ <<= 1; |
315 | 0 | } |
316 | 215k | if (prev_literalalloc != literalalloc_) { |
317 | 16.4k | literalbuf_ = reinterpret_cast<char *>(realloc(literalbuf_, literalalloc_)); |
318 | 16.4k | } |
319 | 215k | } |
320 | | |
// Resets the literal accumulation buffer to begin scanning a new literal.
// The allocation (literalalloc_/literalbuf_) is kept and reused.
void LexProcessor::startlit() {
  literallen_ = 0;
}
324 | | |
325 | 215k | void LexProcessor::addlit(char *ytext, size_t yleng) { |
326 | | // Enlarge buffer if needed. |
327 | 215k | EnlargeLiteralBuf(yleng); |
328 | | |
329 | | // Append new data. |
330 | 215k | memcpy(literalbuf_ + literallen_, ytext, yleng); |
331 | 215k | literallen_ += yleng; |
332 | 215k | } |
333 | | |
334 | 0 | void LexProcessor::addlitchar(unsigned char ychar) { |
335 | | // Enlarge buffer if needed. |
336 | 0 | EnlargeLiteralBuf(1); |
337 | | |
338 | | // Append new data. |
339 | 0 | literalbuf_[literallen_++] = ychar; |
340 | 0 | } |
341 | | |
// Decodes Unicode escape sequences in literalbuf_ (the body of a U&'...' or
// U&"..." literal) into UTF-8, returning a new NUL-terminated buffer allocated
// from temporary parse memory. 'escape' is the escape character (backslash by
// default, or the one given by UESCAPE). Handles:
//   <esc><esc>        -> a literal escape character
//   <esc>XXXX         -> 4-hex-digit code point (possibly a UTF-16 surrogate)
//   <esc>+XXXXXX      -> 6-hex-digit code point (possibly a UTF-16 surrogate)
// Surrogate halves must come in a valid first/second pair; anything else
// reports "invalid Unicode surrogate pair" via ScanError().
char *LexProcessor::litbuf_udeescape(unsigned char escape) {
  char *new_litbuf;
  char *litbuf, *in, *out;
  pg_wchar pair_first = 0;  // Pending first half of a surrogate pair, else 0.

  // Make literalbuf null-terminated to simplify the scanning loop.
  litbuf = literalbuf_;
  litbuf[literallen_] = '\0';

  // This relies on the subtle assumption that a UTF-8 expansion
  // cannot be longer than its escaped representation.
  new_litbuf = static_cast<char *>(PTempMem()->AllocateBytes(literallen_ + 1));

  in = litbuf;
  out = new_litbuf;
  while (*in) {
    if (in[0] == escape) {
      if (in[1] == escape) {
        // Doubled escape character denotes the escape character itself. A
        // dangling surrogate first-half before it is an error.
        if (pair_first) {
          AdvanceCursor(narrow_cast<int>(in - litbuf + 3));  // 3 for U&".
          ScanError("invalid Unicode surrogate pair");
        }
        *out++ = escape;
        in += 2;

      } else if (isxdigit((unsigned char) in[1]) &&
                 isxdigit((unsigned char) in[2]) &&
                 isxdigit((unsigned char) in[3]) &&
                 isxdigit((unsigned char) in[4])) {
        // 4-digit form: <esc>XXXX.
        pg_wchar unicode;
        unicode = ((hexval(in[1]) << 12) +
                   (hexval(in[2]) << 8) +
                   (hexval(in[3]) << 4) +
                   hexval(in[4]));
        check_unicode_value(unicode, in);

        if (pair_first) {
          // We are expecting the second half of a surrogate pair.
          if (is_utf16_surrogate_second(unicode)) {
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
            pair_first = 0;
          } else {
            AdvanceCursor(narrow_cast<int>(in - litbuf + 3)); /* 3 for U&" */
            ScanError("invalid Unicode surrogate pair");
          }
        } else if (is_utf16_surrogate_second(unicode)) {
          // Second half without a first half is an error.
          ScanError("invalid Unicode surrogate pair");
        }

        if (is_utf16_surrogate_first(unicode)) {
          // Remember the first half and wait for the second.
          pair_first = unicode;
        } else {
          unicode_to_utf8(unicode, (unsigned char *) out);
          out += pg_utf_mblen((unsigned char *)out);
        }
        in += 5;

      } else if (in[1] == '+' &&
                 isxdigit((unsigned char) in[2]) &&
                 isxdigit((unsigned char) in[3]) &&
                 isxdigit((unsigned char) in[4]) &&
                 isxdigit((unsigned char) in[5]) &&
                 isxdigit((unsigned char) in[6]) &&
                 isxdigit((unsigned char) in[7])) {
        // 6-digit form: <esc>+XXXXXX. Same surrogate handling as above.
        pg_wchar unicode;
        unicode = ((hexval(in[2]) << 20) +
                   (hexval(in[3]) << 16) +
                   (hexval(in[4]) << 12) +
                   (hexval(in[5]) << 8) +
                   (hexval(in[6]) << 4) +
                   hexval(in[7]));
        check_unicode_value(unicode, in);

        if (pair_first) {
          if (is_utf16_surrogate_second(unicode)) {
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
            pair_first = 0;
          } else {
            AdvanceCursor(narrow_cast<int>(in - litbuf + 3)); /* 3 for U&" */
            ScanError("invalid Unicode surrogate pair");
          }
        } else if (is_utf16_surrogate_second(unicode)) {
          ScanError("invalid Unicode surrogate pair");
        }

        if (is_utf16_surrogate_first(unicode)) {
          pair_first = unicode;
        } else {
          unicode_to_utf8(unicode, (unsigned char *) out);
          out += pg_utf_mblen((unsigned char *)out);
        }
        in += 8;
      } else {
        // Escape followed by anything else is malformed.
        AdvanceCursor(narrow_cast<int>(in - litbuf + 3)); /* 3 for U&" */
        ScanError("invalid Unicode escape value");
      }
    } else {
      // Ordinary character; a pending surrogate first-half here is an error.
      if (pair_first) {
        AdvanceCursor(narrow_cast<int>(in - litbuf + 3)); /* 3 for U&" */
        ScanError("invalid Unicode surrogate pair");
      }
      *out++ = *in++;
    }
  }
  *out = '\0';

  // We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
  // codes; but it's probably not worth the trouble, since this isn't
  // likely to be a performance-critical path.
  pg_verify_mbstr_len(new_litbuf, out - new_litbuf, false);
  return new_litbuf;
}
453 | | |
454 | | //-------------------------------------------------------------------------------------------------- |
455 | | |
456 | 0 | void LexProcessor::check_string_escape_warning(unsigned char ychar) { |
457 | 0 | if (ychar == '\'') { |
458 | 0 | if (warn_on_first_escape_ && escape_string_warning_) |
459 | 0 | parse_context_->Warn(token_loc_, |
460 | 0 | "Nonstandard use of \\' in a string literal. Use '' to write quotes in " |
461 | 0 | "strings, or use the escape string syntax (E'...').", |
462 | 0 | ErrorCode::NONSTANDARD_USE_OF_ESCAPE_CHARACTER); |
463 | 0 | warn_on_first_escape_ = false; // Warn only once per string. |
464 | 0 | } else if (ychar == '\\') { |
465 | 0 | if (warn_on_first_escape_ && escape_string_warning_) |
466 | 0 | parse_context_->Warn(token_loc_, |
467 | 0 | "(Nonstandard use of \\\\ in a string literal. Use the escape string " |
468 | 0 | "syntax for backslashes, e.g., E'\\\\'.", |
469 | 0 | ErrorCode::NONSTANDARD_USE_OF_ESCAPE_CHARACTER); |
470 | 0 | warn_on_first_escape_ = false; // Warn only once per string. |
471 | 0 | } else { |
472 | 0 | check_escape_warning(); |
473 | 0 | } |
474 | 0 | } |
475 | | |
476 | 0 | void LexProcessor::check_escape_warning() { |
477 | 0 | if (warn_on_first_escape_ && escape_string_warning_) |
478 | 0 | parse_context_->Warn(token_loc_, |
479 | 0 | "Nonstandard use of escape in a string literal. Use the escape string " |
480 | 0 | "syntax for escapes, e.g., E'\\r\\n'.", |
481 | 0 | ErrorCode::NONSTANDARD_USE_OF_ESCAPE_CHARACTER); |
482 | 0 | warn_on_first_escape_ = false; // Warn only once per string. |
483 | 0 | } |
484 | | |
485 | 0 | unsigned char LexProcessor::unescape_single_char(unsigned char c) { |
486 | 0 | switch (c) { |
487 | 0 | case 'b': |
488 | 0 | return '\b'; |
489 | 0 | case 'f': |
490 | 0 | return '\f'; |
491 | 0 | case 'n': |
492 | 0 | return '\n'; |
493 | 0 | case 'r': |
494 | 0 | return '\r'; |
495 | 0 | case 't': |
496 | 0 | return '\t'; |
497 | 0 | default: |
498 | | /* check for backslash followed by non-7-bit-ASCII */ |
499 | 0 | if (c == '\0' || is_utf_highbit_set(c)) { |
500 | 0 | saw_non_ascii_ = true; |
501 | 0 | } |
502 | 0 | return c; |
503 | 0 | } |
504 | 0 | } |
505 | | |
506 | | //-------------------------------------------------------------------------------------------------- |
507 | | |
508 | 0 | void LexProcessor::addunicode(pg_wchar c) { |
509 | 0 | char buf[8]; |
510 | |
|
511 | 0 | if (c == 0 || c > 0x10FFFF) |
512 | 0 | ScanError("invalid Unicode escape value"); |
513 | 0 | if (c > 0x7F) { |
514 | 0 | saw_non_ascii_ = true; |
515 | 0 | } |
516 | 0 | unicode_to_utf8(c, (unsigned char *)buf); |
517 | 0 | addlit(buf, pg_utf_mblen((unsigned char *)buf)); |
518 | 0 | } |
519 | | |
520 | | //-------------------------------------------------------------------------------------------------- |
521 | | |
// Sentinel returned by ScanKeywordLookup() when the text is not a keyword;
// callers detect this via the INVALID_KEYWORD category.
static const ScanKeyword& kInvalidKeyword {
  "", GramProcessor::token::TOK_NULL_P, ScanKeyword::KeywordCategory::INVALID_KEYWORD
};

// Expand each PG_KEYWORD(text, token, category) entry from kwlist.h into an
// initializer of the keyword hash table, keyed by the lowercase keyword text.
#define PG_KEYWORD(a, b, c) \
    {a, {a, GramProcessor::token::TOK_##b, ScanKeyword::KeywordCategory::c}},
const unordered_map<string, const ScanKeyword> kScanKeywords {
#include "yb/yql/cql/ql/kwlist.h"
};
531 | | |
532 | 2.23M | const ScanKeyword& LexProcessor::ScanKeywordLookup(const char *text) { |
533 | 2.23M | static const int kMaxKeywordBytes = 4096; |
534 | 2.23M | char word[kMaxKeywordBytes]; |
535 | 2.23M | size_t word_bytes = strlen(text); |
536 | | |
537 | | // PostgreQL Note: Apply an ASCII-only downcasing. We must not use tolower() since it may |
538 | | // produce the wrong translation in some locales (eg, Turkish). |
539 | 16.3M | for (size_t i = 0; i < word_bytes; i++14.1M ) { |
540 | 14.1M | char ch = text[i]; |
541 | 14.1M | if (ch >= 'A' && ch <= 'Z'13.8M ) { |
542 | 3.76M | ch += 'a' - 'A'; |
543 | 3.76M | } |
544 | 14.1M | word[i] = ch; |
545 | 14.1M | } |
546 | 2.23M | word[word_bytes] = '\0'; |
547 | | |
548 | 2.23M | unordered_map<string, const ScanKeyword>::const_iterator iter = kScanKeywords.find(word); |
549 | 2.23M | if (iter != kScanKeywords.end()) { |
550 | 1.25M | return iter->second; |
551 | 1.25M | } |
552 | 977k | return kInvalidKeyword; |
553 | 2.23M | } |
554 | | |
555 | | //-------------------------------------------------------------------------------------------------- |
556 | | // Class ScanStatus - Not yet implemented. |
557 | | //-------------------------------------------------------------------------------------------------- |
// ScanState is currently a placeholder (see "Not yet implemented" above);
// there is no state to initialize yet.
ScanState::ScanState() {
}
560 | | |
// Nothing to release; ScanState owns no resources yet.
ScanState::~ScanState() {
}
563 | | |
564 | | } // namespace ql |
565 | | } // namespace yb |