/Users/deen/code/yugabyte-db/src/yb/yql/cql/ql/parser/scanner.h

Source (jump to first uncovered line)
//--------------------------------------------------------------------------------------------------
// The following only applies to changes made to this file as part of YugaByte development.
//
// Copyright (c) YugaByte, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
// in compliance with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied.  See the License for the specific language governing permissions and limitations
// under the License.
//
// Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
// Portions Copyright (c) 1994, Regents of the University of California
//
// API for the scanner.
//
// The core scanner is also used by PL/pgsql, so we provide a public API
// for it.  However, the rest of the backend is only expected to use the
// higher-level API provided by parser.h.
//--------------------------------------------------------------------------------------------------
#ifndef YB_YQL_CQL_QL_PARSER_SCANNER_H_
#define YB_YQL_CQL_QL_PARSER_SCANNER_H_

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>

// Include base lexer class. FLEX might or might not "include <FlexLexer.h>" in its code and
// generated code. The macro "yyFlexLexerOnce" is used here to guard duplicate includes.
#ifndef yyFlexLexerOnce
#include <FlexLexer.h>
#endif

#include "yb/util/memory/mc_types.h"

#include "yb/yql/cql/ql/parser/parse_context.h"

// Include auto-generated file from YACC.
#include "yb/yql/cql/ql/parser/parser_gram.y.final.hh"

namespace yb {
namespace ql {

//--------------------------------------------------------------------------------------------------
// Various declarations that are used for keyword, identifier, and text.
//--------------------------------------------------------------------------------------------------

// Unicode support: unsigned 32-bit type used for wide-character values.
using pg_wchar = uint32_t;

// Size limit, in bytes, for identifiers such as table names, column names, and function names.
// The usable name length is one byte less than this value because the limit also counts the
// trailing zero byte.
//
// Changing this requires an initdb.
constexpr int NAMEDATALEN{64};

// UTF high bit: mask for the most significant bit (0x80) of a byte.
constexpr int UTF_HIGHBIT = 0x80;

// Returns 1 when the high bit of 'ch' is set, and 0 otherwise.
constexpr int is_utf_highbit_set(unsigned char ch) {
  // UTF_HIGHBIT is a single-bit mask, so comparing the masked value against the mask itself is the
  // same as testing it for non-zero.
  return (ch & UTF_HIGHBIT) == UTF_HIGHBIT ? 1 : 0;
}

// Keywords.
// Describes one keyword: its name, the grammar token code it stands for, and its category.
class ScanKeyword {
 public:
  //------------------------------------------------------------------------------------------------
  // Keyword categories. These values sort keywords into groups. The group assigned to a keyword
  // here must agree with how that keyword is defined in the file "parser_gram.y".
  enum class KeywordCategory : int16_t {
    UNRESERVED_KEYWORD = 0,
    COL_NAME_KEYWORD,
    TYPE_FUNC_NAME_KEYWORD,
    RESERVED_KEYWORD,
    INVALID_KEYWORD,
  };

  //------------------------------------------------------------------------------------------------
  // Public functions.
  // Builds a keyword from its name, grammar token, and category. The token is kept as a 16-bit
  // integer code and the name pointer is stored as given (the string is not copied).
  ScanKeyword(const char* name, GramProcessor::token_type value, KeywordCategory category)
      : name_(name),
        value_(static_cast<int16_t>(value)),
        category_(category) {
  }

  // True unless this keyword is in the INVALID_KEYWORD category.
  bool is_valid() const { return !(category_ == KeywordCategory::INVALID_KEYWORD); }

  // The grammar token for this keyword, converted back from the stored 16-bit code.
  GramProcessor::token_type token() const {
    const auto code = static_cast<GramProcessor::token_type>(value_);
    return code;
  }

  // The keyword's name, exactly as passed to the constructor.
  const char* name() const { return name_; }

 private:
  //------------------------------------------------------------------------------------------------
  const char *name_;          // Name in lower case.
  int16_t value_;             // Grammar's token code.
  KeywordCategory category_;  // See codes above for different keyword categories.
};

// Scan state.
// Scanning a single token may take several scan calls, and each of those calls may be handed a
// different ScanState. This call-stack variable was needed in PostgreSQL's C code, but we might
// not need it in our C++ code.
class ScanState {
 public:
  //------------------------------------------------------------------------------------------------
  // Public types.
  using UniPtr = std::unique_ptr<ScanState>;
  using UniPtrConst = std::unique_ptr<const ScanState>;

  //------------------------------------------------------------------------------------------------
  // Public functions.
  // Constructor and (virtual) destructor.
  ScanState();
  virtual ~ScanState();
};

// LexProcessor class.
// The scanner: extends the FLEX base class yyFlexLexer and produces GramProcessor symbols.
class LexProcessor : public yyFlexLexer {
 public:
  //------------------------------------------------------------------------------------------------
  // Public types.
  using UniPtr = std::unique_ptr<LexProcessor>;
  using UniPtrConst = std::unique_ptr<const LexProcessor>;

  //------------------------------------------------------------------------------------------------
  // Public functions.
  // Constructor and destructor.
  LexProcessor();
  virtual ~LexProcessor();

  // Resets all scanning state variables so that processing a SQL statement is not affected by an
  // erroneous state left behind by the preceding statements.
  void ScanInit(ParseContext *parse_context);

  // Memory pool for allocating and deallocating working memory during the parsing process.
  // Forwarded from the current parse context.
  MemoryContext *PTempMem() const { return parse_context_->PTempMem(); }

  // Memory pool for constructing the parse tree of a statement.
  // Forwarded from the current parse context.
  MemoryContext *PTreeMem() const { return parse_context_->PTreeMem(); }

  // Entry point for lexical analysis. Scans and returns one token at a time. This is a wrapper
  // around yylex(), and it might call yylex() more than once to process a token.
  GramProcessor::symbol_type Scan();

  // Counts the newline characters in the current token and sets the token location accordingly.
  void CountNewlineInToken(const std::string& token);

  // Reports an error and returns SCAN_ERROR to instruct the parser to stop the parsing process.
  GramProcessor::symbol_type ScanError(const char *token);
  GramProcessor::symbol_type ScanError(const char *message, ErrorCode errcode);

  // Reads the literal value accumulated during a scan and converts it to MCString.
  MCSharedPtr<MCString> ScanLiteral();

  // Accessor for the current token location.
  const location &token_loc() const { return token_loc_; }

  // Builds a symbol from a raw token code and a location.
  GramProcessor::symbol_type make_symbol(int16_t token, location l) {
    return GramProcessor::symbol_type(
        static_cast<GramProcessor::token_type>(token),
        std::move(l));
  }

  // Builds a symbol from a keyword (its token and its name) and a location.
  GramProcessor::symbol_type make_symbol(const ScanKeyword &keyword, location l) {
    return GramProcessor::symbol_type(
        keyword.token(),
        keyword.name(),
        std::move(l));
  }

 private:
  //------------------------------------------------------------------------------------------------
  // Private types.
  enum class BackslashQuoteType {
    OFF,
    ON,
    SAFE_ENCODING
  };

  //------------------------------------------------------------------------------------------------
  // Private functions.
  // Looks up the given text as a keyword. Returns a valid keyword if one exists; otherwise,
  // returns an invalid one (see ScanKeyword::is_valid()).
  static const ScanKeyword& ScanKeywordLookup(const char *text);

  // This using-declaration tells the compiler that we are aware of the yylex() inherited from the
  // FLEX base class, but that we intend to define and use our own yylex().
  using yyFlexLexer::yylex;

  // Runs lexical analysis.
  GramProcessor::symbol_type yylex(const ScanState& scan_state);

  // Returns the number of bytes that were read from the input stream or string. The lexer calls
  // this function to collect tokens from the input.
  int LexerInput(char* buf, int max_size) override;

  // Scans the input statement for the next token.
  void ScanNextToken(const ScanState& scan_state, GramProcessor::symbol_type *next_token);

  // Converts text into MCString and truncates it to the allowable length, NAMEDATALEN, if needed.
  MCSharedPtr<MCString> MakeIdentifier(const char *text, int len, bool warn);

  // Truncates an identifier to the allowable length, NAMEDATALEN, if necessary.
  void TruncateIdentifier(const MCSharedPtr<MCString>& ident, bool warn);

  // Converts a char* to an MCString allocated from the parse-tree memory pool.
  MCSharedPtr<MCString> MakeString(const char *str) {
    return MCMakeShared<MCString>(PTreeMem(), str);
  }

  // Advances the current scanning location by the given number of bytes.
  void AdvanceCursor(int bytes) { cursor_ += bytes; }

  //------------------------------------------------------------------------------------------------
  // NOTE: All entities below this line in this module are copies of PostgreSQL's code. We made
  // some minor changes to avoid lint errors, such as using '{' for if blocks, changing the comment
  // style from '/**/' to '//', and post-fixing data members with "_".
  //------------------------------------------------------------------------------------------------
  // Operations on literal buffers.
  void EnlargeLiteralBuf(size_t bytes);
  void startlit();
  void addlit(char *ytext, size_t yleng);
  void addlitchar(unsigned char ychar);
  char *litbuf_udeescape(unsigned char escape);

  // Unicode support.
  unsigned char unescape_single_char(unsigned char c);
  void addunicode(pg_wchar c);
  void check_string_escape_warning(unsigned char ychar);
  void check_escape_warning();

  //------------------------------------------------------------------------------------------------
  // The context in which the scanning process is running.
  ParseContext *parse_context_;

  // The rest of this class consists of scanning state variables, which are declared or used by
  // PostgreSQL structures, functions, and operations.
  location token_loc_;  // Current token location.
  GramProcessor::symbol_type lookahead_;  // Lookahead token.
  location cursor_;  // The current scanning location.

  // Literalbuf is used to accumulate literal values when multiple rules are needed to parse a
  // single literal.  Call startlit() to reset the buffer to empty, addlit() to add text.
  // NOTE: The string in literalbuf is NOT necessarily null-terminated, but there always IS room to
  // add a trailing null at offset literallen.  We store a null only when we need it.
  char *literalbuf_;  // Temporary buffer for literal.
  size_t literallen_;  // Temporary buffer length.
  size_t literalalloc_;  // Temporary buffer size.
  int xcdepth_;  // Depth of nesting in slash-star comments.
  char *dolqstart_;  // Current $foo$ quote start string.
  int32_t utf16_first_part_;  // First of UTF16 surrogate unicode escape pair.
  bool warn_on_first_escape_;  // Literal-lexing warning for escape.
  bool saw_non_ascii_;  // Literal-lexing warning for non ascii.

  // Scanner settings to use.  These are initialized from the corresponding GUC variables by
  // scanner_init().  Callers can modify them after scanner_init() if they don't want the scanner's
  // behavior to follow the prevailing GUC settings.
  BackslashQuoteType backslash_quote_;  // State when scanning backslash.
  bool escape_string_warning_;  // State when scanning escape.
  bool standard_conforming_strings_;  // State when scanning standard string.
};

}  // namespace ql
}  // namespace yb

#endif  // YB_YQL_CQL_QL_PARSER_SCANNER_H_

YugabyteDB (2.13.1.0-b60, 21121d69985fbf76aa6958d8f04a9bfa936293b5)

Coverage Report

Created: 2022-03-22 16:43

Line	Count	Source (jump to first uncovered line)
1		//--------------------------------------------------------------------------------------------------
2		// The following only applies to changes made to this file as part of YugaByte development.
3		//
4		// Copyright (c) YugaByte, Inc.
5		//
6		// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
7		// in compliance with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing, software distributed under the License
12		// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
13		// or implied. See the License for the specific language governing permissions and limitations
14		// under the License.
15		//
16		// Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
17		// Portions Copyright (c) 1994, Regents of the University of California
18		//
19		// API for the scanner.
20		//
21		// The core scanner is also used by PL/pgsql, so we provide a public API
22		// for it. However, the rest of the backend is only expected to use the
23		// higher-level API provided by parser.h.
24		//--------------------------------------------------------------------------------------------------
25		#ifndef YB_YQL_CQL_QL_PARSER_SCANNER_H_
26		#define YB_YQL_CQL_QL_PARSER_SCANNER_H_
27
28		#include <cstddef>
29		#include <cstdint>
30
31		// Include base lexer class. FLEX might or might not "include <FlexLexer.h>" in its code and
32		// generated code. The macro "yyFlexLexerOnce" is used here to guard duplicate includes.
33		#ifndef yyFlexLexerOnce
34		#include <FlexLexer.h>
35		#endif
36
37		#include "yb/util/memory/mc_types.h"
38
39		#include "yb/yql/cql/ql/parser/parse_context.h"
40
41		// Include auto-generated file from YACC.
42		#include "yb/yql/cql/ql/parser/parser_gram.y.final.hh"
43
44		namespace yb {
45		namespace ql {
46
47		//--------------------------------------------------------------------------------------------------
48		// Various declarations that are used for keyword, identifier, and text.
49		//--------------------------------------------------------------------------------------------------
50
51		// Unicode support.
52		typedef uint32_t pg_wchar;
53
54		// Maximum length for identifiers (e.g. table names, column names, function names). Names actually
55		// are limited to one less byte than this, because the length must include a trailing zero byte.
56		//
57		// Changing this requires an initdb.
58		constexpr int NAMEDATALEN = 64;
59
60		// UTF bit.
61		constexpr int UTF_HIGHBIT = 0x80;
62	0	constexpr int is_utf_highbit_set(unsigned char ch) { return (ch & UTF_HIGHBIT) != 0; }
63
64		// Keywords.
65		class ScanKeyword {
66		public:
67		//------------------------------------------------------------------------------------------------
68		// Keyword categories. The value in this enum is used to characterize the keywords into different
69		// groups. The group that a keyword belongs to must match with their definitions in the file
70		// "parser_gram.y".
71		enum class KeywordCategory : int16_t {
72		UNRESERVED_KEYWORD = 0,
73		COL_NAME_KEYWORD,
74		TYPE_FUNC_NAME_KEYWORD,
75		RESERVED_KEYWORD,
76		INVALID_KEYWORD,
77		};
78
79		//------------------------------------------------------------------------------------------------
80		// Public functions.
81		ScanKeyword(const char* name, GramProcessor::token_type value, KeywordCategory category)
82	4.25M	: name_(name), value_(value), category_(category) {
83	4.25M	}
84
85	2.22M	bool is_valid() const {
86	2.22M	return category_ != KeywordCategory::INVALID_KEYWORD;
87	2.22M	}
88
89	1.25M	GramProcessor::token_type token() const {
90	1.25M	return static_cast<GramProcessor::token_type>(value_);
91	1.25M	}
92
93	1.25M	const char* name() const {
94	1.25M	return name_;
95	1.25M	}
96
97		private:
98		//------------------------------------------------------------------------------------------------
99		const char *name_; // Name in lower case.
100		int16_t value_; // Grammar's token code.
101		KeywordCategory category_; // See codes above for different keyword categories.
102		};
103
104		// Scan state.
105		// A token might require multiple scans, and each of these calls might be passed a different
106		// ScanState. This callstack variable was needed in PostgreQL C-code, but we might not needed in
107		// our C++ code.
108		class ScanState {
109		public:
110		//------------------------------------------------------------------------------------------------
111		// Public types.
112		typedef std::unique_ptr<ScanState> UniPtr;
113		typedef std::unique_ptr<const ScanState> UniPtrConst;
114
115		//------------------------------------------------------------------------------------------------
116		// Public functions.
117		ScanState();
118		virtual ~ScanState();
119		};
120
121		// LexProcessor class.
122		class LexProcessor : public yyFlexLexer {
123		public:
124		//------------------------------------------------------------------------------------------------
125		// Public types.
126		typedef std::unique_ptr<LexProcessor> UniPtr;
127		typedef std::unique_ptr<const LexProcessor> UniPtrConst;
128
129		//------------------------------------------------------------------------------------------------
130		// Public functions.
131		// Constructor and destructor.
132		LexProcessor();
133		virtual ~LexProcessor();
134
135		// Reset all scanning state variables such that processing a SQL statement should not be affected
136		// by the erroneous state of the precedent statements.
137		void ScanInit(ParseContext *parse_context);
138
139		// Memory pool for allocating and deallocating operating memory spaces during parsing process.
140	0	MemoryContext *PTempMem() const {
141	0	return parse_context_->PTempMem();
142	0	}
143
144		// Memory pool for constructing the parse tree of a statement.
145	1.35M	MemoryContext *PTreeMem() const {
146	1.35M	return parse_context_->PTreeMem();
147	1.35M	}
148
149		// Entry point for lexical analysis. Scanns and return one token at a time. This is a wrapper
150		// around yylex(), and it might call yylex more than once to process a token.
151		GramProcessor::symbol_type Scan();
152
153		// Counts number of newline characters in the current token and set token location accordingly.
154		void CountNewlineInToken(const std::string& token);
155
156		// Reports error and returns SCAN_ERROR to instruct the parser to stop the parsing process.
157		GramProcessor::symbol_type ScanError(const char *token);
158		GramProcessor::symbol_type ScanError(const char *message, ErrorCode errcode);
159
160		// Read literal value during a scan and convert it to MCString.
161		MCSharedPtr<MCString> ScanLiteral();
162
163		// Access function for current token location.
164	0	const location &token_loc() const {
165	0	return token_loc_;
166	0	}
167
168	1.50M	GramProcessor::symbol_type make_symbol(int16_t token, location l) {
169	1.50M	return GramProcessor::symbol_type(static_cast<GramProcessor::token_type>(token), std::move(l));
170	1.50M	}
171
172	1.24M	GramProcessor::symbol_type make_symbol(const ScanKeyword &keyword, location l) {
173	1.24M	return GramProcessor::symbol_type(keyword.token(), keyword.name(), std::move(l));
174	1.24M	}
175
176		private:
177		//------------------------------------------------------------------------------------------------
178		// Private types.
179		enum class BackslashQuoteType {
180		OFF,
181		ON,
182		SAFE_ENCODING
183		};
184
185		//------------------------------------------------------------------------------------------------
186		// Private functions.
187		// Returns a valid keyword value if it exists. Otherwise, returns an invalid value.
188		static const ScanKeyword& ScanKeywordLookup(const char *text);
189
190		// The following line lets the compilers know that we know of the existing yylex() from FLEX base
191		// class, but we intend to define and use our own yylex().
192		using yyFlexLexer::yylex;
193
194		// Run lexical analysis.
195		GramProcessor::symbol_type yylex(const ScanState& scan_state);
196
197		// Returns the number of bytes that was read from the input stream or string. Lexer will call
198		// this function to collect tokens from the input.
199		int LexerInput(char* buf, int max_size) override;
200
201		// Scans the input statement for the next token.
202		void ScanNextToken(const ScanState& scan_state, GramProcessor::symbol_type *next_token);
203
204		// Converts text into MCString and truncates it to allowable length, NAMEDATALEN, if needed.
205		MCSharedPtr<MCString> MakeIdentifier(const char *text, int len, bool warn);
206
207		// Truncates identifier to allowable length, NAMEDATALEN, if necessary.
208		void TruncateIdentifier(const MCSharedPtr<MCString>& ident, bool warn);
209
210		// Converts a char* to MCString.
211	165k	MCSharedPtr<MCString> MakeString(const char *str) {
212	165k	return MCMakeShared<MCString>(PTreeMem(), str);
213	165k	}
214
215		// Advance current token location by the given number of bytes.
216	0	void AdvanceCursor(int bytes) {
217	0	cursor_ += bytes;
218	0	}
219
220		//------------------------------------------------------------------------------------------------
221		// NOTE: All entities below this line in this modules are copies of PostgreQL's code. We made
222		// some minor changes to avoid lint errors such as using '{' for if blocks, change the comment
223		// style from '/**/' to '//', and post-fix data members with "_".
224		//------------------------------------------------------------------------------------------------
225		// Operations on literal buffers.
226		void EnlargeLiteralBuf(size_t bytes);
227		void startlit();
228		void addlit(char *ytext, size_t yleng);
229		void addlitchar(unsigned char ychar);
230		char *litbuf_udeescape(unsigned char escape);
231
232		// Unicode support.
233		unsigned char unescape_single_char(unsigned char c);
234		void addunicode(pg_wchar c);
235		void check_string_escape_warning(unsigned char ychar);
236		void check_escape_warning();
237
238		//------------------------------------------------------------------------------------------------
239		// The context in which the scanning process is running.
240		ParseContext *parse_context_;
241
242		// The rest of this class are scanning state variables, which are declared or used by PostgreQL
243		// structures, functions, and operations.
244		location token_loc_; // Current token location.
245		GramProcessor::symbol_type lookahead_; // Lookahead token.
246		location cursor_; // The current scanning location.
247
248		// Literalbuf is used to accumulate literal values when multiple rules are needed to parse a
249		// single literal. Call startlit() to reset buffer to empty, addlit() to add text.
250		// NOTE: The string in literalbuf is NOT necessarily null-terminated, but there always IS room to
251		// add a trailing null at offset literallen. We store a null only when we need it.
252		char *literalbuf_; // Temporary buffer for literal.
253		size_t literallen_; // Temporary buffer length.
254		size_t literalalloc_; // Temporary buffer size.
255		int xcdepth_; // Depth of nesting in slash-star comments.
256		char *dolqstart_; // Current $foo$ quote start string.
257		int32_t utf16_first_part_; // First of UTF16 surrogate unicode escape pair.
258		bool warn_on_first_escape_; // Literal-lexing warning for escape.
259		bool saw_non_ascii_; // Literal-lexing warning for non ascii.
260
261		// Scanner settings to use. These are initialized from the corresponding GUC variables by
262		// scanner_init(). Callers can modify them after scanner_init() if they don't want the scanner's
263		// behavior to follow the prevailing GUC settings.
264		BackslashQuoteType backslash_quote_; // State when scaning backslash.
265		bool escape_string_warning_; // State when scaning escape.
266		bool standard_conforming_strings_; // State when scaning standard string.
267		};
268
269		} // namespace ql
270		} // namespace yb
271
272		#endif // YB_YQL_CQL_QL_PARSER_SCANNER_H_