/Users/deen/code/yugabyte-db/src/yb/yql/cql/ql/parser/scanner.cc
Line | Count | Source (jump to first uncovered line) |
1 | | //-------------------------------------------------------------------------------------------------- |
2 | | // The following only applies to changes made to this file as part of YugaByte development. |
3 | | // |
4 | | // Copyright (c) YugaByte, Inc. |
5 | | // |
6 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
7 | | // in compliance with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
12 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
13 | | // or implied. See the License for the specific language governing permissions and limitations |
14 | | // under the License. |
15 | | // |
16 | | // Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group |
17 | | // Portions Copyright (c) 1994, Regents of the University of California |
18 | | // |
// API for the core scanner (flex machine). Some of the functions are not following Yugabyte naming
// convention because they are PostgreSQL code.
21 | | //-------------------------------------------------------------------------------------------------- |
22 | | |
23 | | // #include <algorithm> |
24 | | #include <unordered_map> |
25 | | |
26 | | #include "yb/gutil/casts.h" |
27 | | |
28 | | #include "yb/yql/cql/ql/parser/parser.h" |
29 | | #include "yb/yql/cql/ql/parser/scanner.h" |
30 | | #include "yb/yql/cql/ql/parser/scanner_util.h" |
31 | | #include "yb/util/logging.h" |
32 | | |
33 | | namespace yb { |
34 | | namespace ql { |
35 | | |
36 | | using std::unordered_map; |
37 | | |
38 | | //-------------------------------------------------------------------------------------------------- |
39 | | // Class LexProcessor. |
40 | | //-------------------------------------------------------------------------------------------------- |
41 | | |
42 | | LexProcessor::LexProcessor() |
43 | | : literalbuf_(nullptr), |
44 | | literallen_(0), |
45 | | literalalloc_(0), |
46 | | backslash_quote_(BackslashQuoteType::SAFE_ENCODING), |
47 | | escape_string_warning_(true), |
48 | 18.0k | standard_conforming_strings_(true) { |
49 | 18.0k | } |
50 | | |
51 | 1 | LexProcessor::~LexProcessor() { |
52 | 1 | if (literalbuf_ != nullptr) { |
53 | 0 | free(literalbuf_); |
54 | 0 | } |
55 | 1 | } |
56 | | |
57 | | //-------------------------------------------------------------------------------------------------- |
58 | | |
// Resets all scanner state so that a new statement can be scanned from
// 'parse_context'. Must be called before the first Scan() of each statement.
void LexProcessor::ScanInit(ParseContext *parse_context) {
  // Point the flex machine at the new statement's input stream.
  yyrestart(parse_context->ql_file());

  // Reset source locations and drop any cached lookahead token (type 0 marks
  // the lookahead slot as empty; see Scan()).
  token_loc_.initialize();
  cursor_.initialize();
  lookahead_.type = 0;

  // Reset per-literal scanning state.
  literallen_ = 0;
  xcdepth_ = 0;
  dolqstart_ = nullptr;
  utf16_first_part_ = 0;
  warn_on_first_escape_ = false;
  saw_non_ascii_ = false;

  // String-handling options revert to the conservative defaults (same values
  // the constructor sets).
  backslash_quote_ = BackslashQuoteType::SAFE_ENCODING;
  escape_string_warning_ = true;
  standard_conforming_strings_ = true;

  parse_context_ = parse_context;
  if (parse_context_ != nullptr) {
    // Enable flex's own debug tracing when the context requests it.
    yy_flex_debug = parse_context->trace_scanning();
  }
}
82 | | |
83 | | //-------------------------------------------------------------------------------------------------- |
84 | | |
// Returns the next token for the grammar. Implements the one-token lookahead
// that PostgreSQL-style grammars need: GROUP, OFFSET, NOT, NULLS and WITH are
// replaced by their *_LA variants when the following token makes the intended
// production unambiguous. At most one token is ever cached in 'lookahead_'.
GramProcessor::symbol_type LexProcessor::Scan() {
  // Use the lookahead from the context if it's available. Otherwise, read the token.
  GramProcessor::symbol_type cur_token;
  ScanState scan_state;

  if (lookahead_.token() != 0) {
    // Remove lookahead from the context and reset it to type 0 (empty).
    cur_token.move(lookahead_);
    lookahead_.type = 0;
  } else {
    // Read the next token and save it to 'cur_token'.
    ScanNextToken(scan_state, &cur_token);
  }

  // Return the token if it doesn't require lookahead. Otherwise, set the token length.
  switch (cur_token.token()) {
    case GramProcessor::token::TOK_GROUP_P:
    case GramProcessor::token::TOK_OFFSET:
    case GramProcessor::token::TOK_NOT:
    case GramProcessor::token::TOK_NULLS_P:
    case GramProcessor::token::TOK_WITH: {
      break;
    }

    default: {
      // Return 'cur_token' as it does not require lookahead.
      return cur_token;
    }
  }

  // Cache the lookahead token; it will be consumed by the next Scan() call.
  ScanNextToken(scan_state, &lookahead_);

  // Replace cur_token if needed, based on lookahead.
  GramProcessor::token_type next_token_type = lookahead_.token();
  switch (cur_token.token()) {
    case GramProcessor::token::TOK_GROUP_P: {
      // Replace GROUP_P with GROUP_LA to support SELECT ... GROUP BY ...
      // - Token GROUP_P is accepted when being used as column name (practically all names).
      // - Token GROUP_LA is accepted when being used in GROUP BY clause.
      //     group_clause: GROUP_LA BY <group_by_list>
      int next_tok = static_cast<int>(next_token_type);
      if (next_tok == GramProcessor::token::TOK_BY) {
        return GramProcessor::make_GROUP_LA(cursor_);
      }
      break;
    }

    case GramProcessor::token::TOK_OFFSET: {
      // Replace OFFSET with OFFSET_LA to support SELECT ... OFFSET ...
      // - Token OFFSET is accepted when being used as column name (practically all names).
      // - Token OFFSET_LA is accepted when being used in OFFSET clause.
      //     offset_clause: OFFSET_LA <int constant>
      //                    OFFSET_LA '?'  --> Bind variable
      //                    OFFSET_LA ':'  --> Bind variable
      int next_tok = static_cast<int>(next_token_type);
      if (next_tok == GramProcessor::token::TOK_ICONST || next_tok == '?' || next_tok == ':') {
        return GramProcessor::make_OFFSET_LA(cursor_);
      }
      break;
    }

    case GramProcessor::token::TOK_NOT: {
      // Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc.
      switch (next_token_type) {
        case GramProcessor::token::TOK_BETWEEN:
        case GramProcessor::token::TOK_EXISTS:
        case GramProcessor::token::TOK_IN_P:
        case GramProcessor::token::TOK_LIKE:
        case GramProcessor::token::TOK_ILIKE:
        case GramProcessor::token::TOK_SIMILAR: {
          return GramProcessor::make_NOT_LA(cursor_);
        }

        default: {
          break;
        }
      }
      break;
    }

    case GramProcessor::token::TOK_NULLS_P: {
      // Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST.
      switch (next_token_type) {
        case GramProcessor::token::TOK_FIRST_P:
        case GramProcessor::token::TOK_LAST_P: {
          return GramProcessor::make_NULLS_LA(cursor_);
        }

        default: {
          break;
        }
      }
      break;
    }

    case GramProcessor::token::TOK_WITH: {
      // Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY.
      switch (next_token_type) {
        case GramProcessor::token::TOK_TIME:
        case GramProcessor::token::TOK_ORDINALITY: {
          return GramProcessor::make_WITH_LA(cursor_);
        }

        default: {
          break;
        }
      }
      break;
    }

    default: {
      break;
    }
  }

  return cur_token;
}
203 | | |
204 | | //-------------------------------------------------------------------------------------------------- |
205 | | |
206 | 680k | int LexProcessor::LexerInput(char* buf, int max_size) { |
207 | 680k | return narrow_cast<int>(parse_context_->Read(buf, max_size)); |
208 | 680k | } |
209 | | |
210 | | //-------------------------------------------------------------------------------------------------- |
211 | | |
212 | 2.27M | void LexProcessor::CountNewlineInToken(const string& token) { |
213 | 2.27M | const auto lines = |
214 | 2.27M | count(token.begin(), token.end(), '\n') + count(token.begin(), token.end(), '\r'); |
215 | 2.27M | cursor_.lines(narrow_cast<int>(lines)); |
216 | 2.27M | } |
217 | | |
218 | | //-------------------------------------------------------------------------------------------------- |
219 | | |
220 | 0 | GramProcessor::symbol_type LexProcessor::ScanError(const char *token) { |
221 | | // Flex scanner will raise exception by itself, so we don't return Status::Error here. |
222 | 0 | Status s = parse_context_->Error(token_loc_, |
223 | 0 | "Lexical error at or near ", |
224 | 0 | ErrorCode::LEXICAL_ERROR, |
225 | 0 | token); |
226 | 0 | VLOG(3) << s.ToString(); |
227 | 0 | return GramProcessor::make_SCAN_ERROR(cursor_); |
228 | 0 | } |
229 | | |
230 | 0 | GramProcessor::symbol_type LexProcessor::ScanError(const char *message, ErrorCode errcode) { |
231 | | // Flex scanner will raise exception by itself, so we don't return Status::Error here. |
232 | 0 | Status s = parse_context_->Error(token_loc_, message, errcode); |
233 | 0 | VLOG(3) << s.ToString(); |
234 | 0 | return GramProcessor::make_SCAN_ERROR(cursor_); |
235 | 0 | } |
236 | | |
237 | | //-------------------------------------------------------------------------------------------------- |
238 | | |
239 | | void LexProcessor::ScanNextToken(const ScanState& scan_state, |
240 | 4.45M | GramProcessor::symbol_type *next_token) { |
241 | 4.45M | GramProcessor::symbol_type new_token(yylex(scan_state)); |
242 | 4.45M | next_token->move(new_token); |
243 | 4.45M | } |
244 | | |
245 | | //-------------------------------------------------------------------------------------------------- |
246 | | |
247 | 216k | MCSharedPtr<MCString> LexProcessor::ScanLiteral() { |
248 | | // Convert the literal to string and count the newline character. |
249 | 216k | MCSharedPtr<MCString> value = MCMakeShared<MCString>(PTreeMem(), literalbuf_, literallen_); |
250 | | // Count newlines in this literal. |
251 | 216k | CountNewlineInToken(value->c_str()); |
252 | | |
253 | 216k | return value; |
254 | 216k | } |
255 | | |
256 | | //-------------------------------------------------------------------------------------------------- |
257 | | |
258 | 973k | MCSharedPtr<MCString> LexProcessor::MakeIdentifier(const char *text, int len, bool warn) { |
259 | | // SQL99 specifies Unicode-aware case normalization, which we don't yet |
260 | | // have the infrastructure for. Instead we use tolower() to provide a |
261 | | // locale-aware translation. However, there are some locales where this |
262 | | // is not right either (eg, Turkish may do strange things with 'i' and |
263 | | // 'I'). Our current compromise is to use tolower() for characters with |
264 | | // the high bit set, as long as they aren't part of a multi-byte |
265 | | // character, and use an ASCII-only downcasing for 7-bit characters. |
266 | 973k | MCSharedPtr<MCString> ident = MCMakeShared<MCString>(PTreeMem(), len, '\0'); |
267 | 973k | int i; |
268 | 8.62M | for (i = 0; i < len; i++7.64M ) { |
269 | 7.64M | unsigned char ch = static_cast<unsigned char>(text[i]); |
270 | 7.64M | if (ch >= 'A' && ch <= 'Z'7.35M ) { |
271 | 15.4k | ch += 'a' - 'A'; |
272 | 15.4k | } |
273 | 7.64M | (*ident)[i] = static_cast<char>(ch); |
274 | 7.64M | } |
275 | | |
276 | 973k | if (i >= NAMEDATALEN) { |
277 | 0 | TruncateIdentifier(ident, warn); |
278 | 0 | } |
279 | 973k | return ident; |
280 | 973k | } |
281 | | |
// Truncates 'ident' in place so it fits in NAMEDATALEN bytes, clipping on a
// multibyte-character boundary. When 'warn' is set, the user is notified of
// the truncation via the parse context.
void LexProcessor::TruncateIdentifier(const MCSharedPtr<MCString>& ident, bool warn) {
  auto len = ident->length();
  if (len >= NAMEDATALEN) {
    // Clip to at most NAMEDATALEN - 1 bytes without splitting a multibyte char.
    len = pg_encoding_mbcliplen(ident->c_str(), len, NAMEDATALEN - 1);
    if (warn) {
      // We avoid using %.*s here because it can misbehave if the data
      // is not valid in what libc thinks is the prevailing encoding.
      char buf[NAMEDATALEN];  // len <= NAMEDATALEN - 1 here, so buf fits len + NUL.
      memcpy(buf, ident->c_str(), len);
      buf[len] = '\0';
      char warn_msg[1024];
      snprintf(warn_msg, sizeof(warn_msg),
               "Identifier %s will be truncated to %s", ident->c_str(), buf);
      parse_context_->Warn(token_loc_, warn_msg, ErrorCode::NAME_TOO_LONG);
    }
    ident->resize(len);
  }
}
300 | | |
//--------------------------------------------------------------------------------------------------
// NOTE: All entities below this line in this module are copies of PostgreSQL's code. We made some
// minor changes to avoid lint errors such as using '{' for if blocks and changing the comment style
// from '/**/' to '//'.
//--------------------------------------------------------------------------------------------------
306 | | |
307 | 215k | inline void LexProcessor::EnlargeLiteralBuf(size_t bytes) { |
308 | | // Increase literalbuf by the given number of "bytes". |
309 | 215k | auto prev_literalalloc = literalalloc_; |
310 | 215k | if (prev_literalalloc == 0) { |
311 | 16.5k | literalalloc_ = 4096; |
312 | 16.5k | } |
313 | 215k | while (literalalloc_ < literallen_ + bytes) { |
314 | 0 | literalalloc_ <<= 1; |
315 | 0 | } |
316 | 215k | if (prev_literalalloc != literalalloc_) { |
317 | 16.4k | literalbuf_ = reinterpret_cast<char *>(realloc(literalbuf_, literalalloc_)); |
318 | 16.4k | } |
319 | 215k | } |
320 | | |
// Resets the literal accumulation buffer to begin scanning a new literal.
// The allocation (literalalloc_/literalbuf_) is kept and reused.
void LexProcessor::startlit() {
  literallen_ = 0;
}
324 | | |
325 | 215k | void LexProcessor::addlit(char *ytext, size_t yleng) { |
326 | | // Enlarge buffer if needed. |
327 | 215k | EnlargeLiteralBuf(yleng); |
328 | | |
329 | | // Append new data. |
330 | 215k | memcpy(literalbuf_ + literallen_, ytext, yleng); |
331 | 215k | literallen_ += yleng; |
332 | 215k | } |
333 | | |
334 | 0 | void LexProcessor::addlitchar(unsigned char ychar) { |
335 | | // Enlarge buffer if needed. |
336 | 0 | EnlargeLiteralBuf(1); |
337 | | |
338 | | // Append new data. |
339 | 0 | literalbuf_[literallen_++] = ychar; |
340 | 0 | } |
341 | | |
// Decodes Unicode escape sequences in literalbuf_ (the body of a U&'...' or
// U&"..." literal) into UTF-8, returning a new NUL-terminated buffer allocated
// from temporary parse memory. 'escape' is the escape character (backslash by
// default, or the one given by UESCAPE). Handles:
//   <esc><esc>        -> a literal escape character
//   <esc>XXXX         -> 4-hex-digit code point (possibly a UTF-16 surrogate)
//   <esc>+XXXXXX      -> 6-hex-digit code point (possibly a UTF-16 surrogate)
// Surrogate halves must come in a valid first/second pair; anything else
// reports "invalid Unicode surrogate pair" via ScanError().
char *LexProcessor::litbuf_udeescape(unsigned char escape) {
  char *new_litbuf;
  char *litbuf, *in, *out;
  pg_wchar pair_first = 0;  // Pending first half of a surrogate pair, else 0.

  // Make literalbuf null-terminated to simplify the scanning loop.
  litbuf = literalbuf_;
  litbuf[literallen_] = '\0';

  // This relies on the subtle assumption that a UTF-8 expansion
  // cannot be longer than its escaped representation.
  new_litbuf = static_cast<char *>(PTempMem()->AllocateBytes(literallen_ + 1));

  in = litbuf;
  out = new_litbuf;
  while (*in) {
    if (in[0] == escape) {
      if (in[1] == escape) {
        // Doubled escape character denotes the escape character itself. A
        // dangling surrogate first-half before it is an error.
        if (pair_first) {
          AdvanceCursor(narrow_cast<int>(in - litbuf + 3));  // 3 for U&".
          ScanError("invalid Unicode surrogate pair");
        }
        *out++ = escape;
        in += 2;

      } else if (isxdigit((unsigned char) in[1]) &&
                 isxdigit((unsigned char) in[2]) &&
                 isxdigit((unsigned char) in[3]) &&
                 isxdigit((unsigned char) in[4])) {
        // 4-digit form: <esc>XXXX.
        pg_wchar unicode;
        unicode = ((hexval(in[1]) << 12) +
                   (hexval(in[2]) << 8) +
                   (hexval(in[3]) << 4) +
                   hexval(in[4]));
        check_unicode_value(unicode, in);

        if (pair_first) {
          // We are expecting the second half of a surrogate pair.
          if (is_utf16_surrogate_second(unicode)) {
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
            pair_first = 0;
          } else {
            AdvanceCursor(narrow_cast<int>(in - litbuf + 3)); /* 3 for U&" */
            ScanError("invalid Unicode surrogate pair");
          }
        } else if (is_utf16_surrogate_second(unicode)) {
          // Second half without a first half is an error.
          ScanError("invalid Unicode surrogate pair");
        }

        if (is_utf16_surrogate_first(unicode)) {
          // Remember the first half and wait for the second.
          pair_first = unicode;
        } else {
          unicode_to_utf8(unicode, (unsigned char *) out);
          out += pg_utf_mblen((unsigned char *)out);
        }
        in += 5;

      } else if (in[1] == '+' &&
                 isxdigit((unsigned char) in[2]) &&
                 isxdigit((unsigned char) in[3]) &&
                 isxdigit((unsigned char) in[4]) &&
                 isxdigit((unsigned char) in[5]) &&
                 isxdigit((unsigned char) in[6]) &&
                 isxdigit((unsigned char) in[7])) {
        // 6-digit form: <esc>+XXXXXX. Same surrogate handling as above.
        pg_wchar unicode;
        unicode = ((hexval(in[2]) << 20) +
                   (hexval(in[3]) << 16) +
                   (hexval(in[4]) << 12) +
                   (hexval(in[5]) << 8) +
                   (hexval(in[6]) << 4) +
                   hexval(in[7]));
        check_unicode_value(unicode, in);

        if (pair_first) {
          if (is_utf16_surrogate_second(unicode)) {
            unicode = surrogate_pair_to_codepoint(pair_first, unicode);
            pair_first = 0;
          } else {
            AdvanceCursor(narrow_cast<int>(in - litbuf + 3)); /* 3 for U&" */
            ScanError("invalid Unicode surrogate pair");
          }
        } else if (is_utf16_surrogate_second(unicode)) {
          ScanError("invalid Unicode surrogate pair");
        }

        if (is_utf16_surrogate_first(unicode)) {
          pair_first = unicode;
        } else {
          unicode_to_utf8(unicode, (unsigned char *) out);
          out += pg_utf_mblen((unsigned char *)out);
        }
        in += 8;
      } else {
        // Escape followed by anything else is malformed.
        AdvanceCursor(narrow_cast<int>(in - litbuf + 3)); /* 3 for U&" */
        ScanError("invalid Unicode escape value");
      }
    } else {
      // Ordinary character; a pending surrogate first-half here is an error.
      if (pair_first) {
        AdvanceCursor(narrow_cast<int>(in - litbuf + 3)); /* 3 for U&" */
        ScanError("invalid Unicode surrogate pair");
      }
      *out++ = *in++;
    }
  }
  *out = '\0';

  // We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
  // codes; but it's probably not worth the trouble, since this isn't
  // likely to be a performance-critical path.
  pg_verify_mbstr_len(new_litbuf, out - new_litbuf, false);
  return new_litbuf;
}
453 | | |
454 | | //-------------------------------------------------------------------------------------------------- |
455 | | |
456 | 0 | void LexProcessor::check_string_escape_warning(unsigned char ychar) { |
457 | 0 | if (ychar == '\'') { |
458 | 0 | if (warn_on_first_escape_ && escape_string_warning_) |
459 | 0 | parse_context_->Warn(token_loc_, |
460 | 0 | "Nonstandard use of \\' in a string literal. Use '' to write quotes in " |
461 | 0 | "strings, or use the escape string syntax (E'...').", |
462 | 0 | ErrorCode::NONSTANDARD_USE_OF_ESCAPE_CHARACTER); |
463 | 0 | warn_on_first_escape_ = false; // Warn only once per string. |
464 | 0 | } else if (ychar == '\\') { |
465 | 0 | if (warn_on_first_escape_ && escape_string_warning_) |
466 | 0 | parse_context_->Warn(token_loc_, |
467 | 0 | "(Nonstandard use of \\\\ in a string literal. Use the escape string " |
468 | 0 | "syntax for backslashes, e.g., E'\\\\'.", |
469 | 0 | ErrorCode::NONSTANDARD_USE_OF_ESCAPE_CHARACTER); |
470 | 0 | warn_on_first_escape_ = false; // Warn only once per string. |
471 | 0 | } else { |
472 | 0 | check_escape_warning(); |
473 | 0 | } |
474 | 0 | } |
475 | | |
476 | 0 | void LexProcessor::check_escape_warning() { |
477 | 0 | if (warn_on_first_escape_ && escape_string_warning_) |
478 | 0 | parse_context_->Warn(token_loc_, |
479 | 0 | "Nonstandard use of escape in a string literal. Use the escape string " |
480 | 0 | "syntax for escapes, e.g., E'\\r\\n'.", |
481 | 0 | ErrorCode::NONSTANDARD_USE_OF_ESCAPE_CHARACTER); |
482 | 0 | warn_on_first_escape_ = false; // Warn only once per string. |
483 | 0 | } |
484 | | |
485 | 0 | unsigned char LexProcessor::unescape_single_char(unsigned char c) { |
486 | 0 | switch (c) { |
487 | 0 | case 'b': |
488 | 0 | return '\b'; |
489 | 0 | case 'f': |
490 | 0 | return '\f'; |
491 | 0 | case 'n': |
492 | 0 | return '\n'; |
493 | 0 | case 'r': |
494 | 0 | return '\r'; |
495 | 0 | case 't': |
496 | 0 | return '\t'; |
497 | 0 | default: |
498 | | /* check for backslash followed by non-7-bit-ASCII */ |
499 | 0 | if (c == '\0' || is_utf_highbit_set(c)) { |
500 | 0 | saw_non_ascii_ = true; |
501 | 0 | } |
502 | 0 | return c; |
503 | 0 | } |
504 | 0 | } |
505 | | |
506 | | //-------------------------------------------------------------------------------------------------- |
507 | | |
508 | 0 | void LexProcessor::addunicode(pg_wchar c) { |
509 | 0 | char buf[8]; |
510 | |
|
511 | 0 | if (c == 0 || c > 0x10FFFF) |
512 | 0 | ScanError("invalid Unicode escape value"); |
513 | 0 | if (c > 0x7F) { |
514 | 0 | saw_non_ascii_ = true; |
515 | 0 | } |
516 | 0 | unicode_to_utf8(c, (unsigned char *)buf); |
517 | 0 | addlit(buf, pg_utf_mblen((unsigned char *)buf)); |
518 | 0 | } |
519 | | |
520 | | //-------------------------------------------------------------------------------------------------- |
521 | | |
// Sentinel returned by ScanKeywordLookup() when the text is not a keyword;
// callers detect this via the INVALID_KEYWORD category.
static const ScanKeyword& kInvalidKeyword {
  "", GramProcessor::token::TOK_NULL_P, ScanKeyword::KeywordCategory::INVALID_KEYWORD
};

// Expand each PG_KEYWORD(text, token, category) entry from kwlist.h into an
// initializer of the keyword hash table, keyed by the lowercase keyword text.
#define PG_KEYWORD(a, b, c) \
    {a, {a, GramProcessor::token::TOK_##b, ScanKeyword::KeywordCategory::c}},
const unordered_map<string, const ScanKeyword> kScanKeywords {
#include "yb/yql/cql/ql/kwlist.h"
};
531 | | |
532 | 2.23M | const ScanKeyword& LexProcessor::ScanKeywordLookup(const char *text) { |
533 | 2.23M | static const int kMaxKeywordBytes = 4096; |
534 | 2.23M | char word[kMaxKeywordBytes]; |
535 | 2.23M | size_t word_bytes = strlen(text); |
536 | | |
537 | | // PostgreQL Note: Apply an ASCII-only downcasing. We must not use tolower() since it may |
538 | | // produce the wrong translation in some locales (eg, Turkish). |
539 | 16.3M | for (size_t i = 0; i < word_bytes; i++14.1M ) { |
540 | 14.1M | char ch = text[i]; |
541 | 14.1M | if (ch >= 'A' && ch <= 'Z'13.8M ) { |
542 | 3.76M | ch += 'a' - 'A'; |
543 | 3.76M | } |
544 | 14.1M | word[i] = ch; |
545 | 14.1M | } |
546 | 2.23M | word[word_bytes] = '\0'; |
547 | | |
548 | 2.23M | unordered_map<string, const ScanKeyword>::const_iterator iter = kScanKeywords.find(word); |
549 | 2.23M | if (iter != kScanKeywords.end()) { |
550 | 1.25M | return iter->second; |
551 | 1.25M | } |
552 | 977k | return kInvalidKeyword; |
553 | 2.23M | } |
554 | | |
555 | | //-------------------------------------------------------------------------------------------------- |
556 | | // Class ScanStatus - Not yet implemented. |
557 | | //-------------------------------------------------------------------------------------------------- |
// ScanState is currently a placeholder (see "Not yet implemented" above);
// there is no state to initialize yet.
ScanState::ScanState() {
}
560 | | |
// Nothing to release; ScanState owns no resources yet.
ScanState::~ScanState() {
}
563 | | |
564 | | } // namespace ql |
565 | | } // namespace yb |