/Users/deen/code/yugabyte-db/src/yb/gutil/strings/split.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2008 and onwards Google, Inc. |
2 | | // |
3 | | // #status: RECOMMENDED |
4 | | // #category: operations on strings |
5 | | // #summary: Functions for splitting strings into substrings. |
6 | | // |
7 | | // This file contains functions for splitting strings. The new and recommended |
8 | | // API for string splitting is the strings::Split() function. The old API is a |
9 | | // large collection of standalone functions declared at the bottom of this file |
10 | | // in the global scope. |
11 | | // |
12 | | // TODO(user): Rough migration plan from old API to new API |
13 | | // (1) Add comments to old Split*() functions showing how to do the same things |
14 | | // with the new API. |
15 | | // (2) Reimplement some of the old Split*() functions in terms of the new |
16 | | // Split() API. This will allow deletion of code in split.cc. |
17 | | // (3) (Optional) Replace old Split*() API calls at call sites with calls to new |
18 | | // Split() API. |
19 | | // |
20 | | // The following only applies to changes made to this file as part of YugaByte development. |
21 | | // |
22 | | // Portions Copyright (c) YugaByte, Inc. |
23 | | // |
24 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except |
25 | | // in compliance with the License. You may obtain a copy of the License at |
26 | | // |
27 | | // http://www.apache.org/licenses/LICENSE-2.0 |
28 | | // |
29 | | // Unless required by applicable law or agreed to in writing, software distributed under the License |
30 | | // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express |
31 | | // or implied. See the License for the specific language governing permissions and limitations |
32 | | // under the License. |
33 | | // |
34 | | #ifndef YB_GUTIL_STRINGS_SPLIT_H |
35 | | #define YB_GUTIL_STRINGS_SPLIT_H |
36 | | |
37 | | #include <stddef.h> |
38 | | #include <algorithm> |
39 | | using std::copy; |
40 | | using std::max; |
41 | | using std::min; |
42 | | using std::reverse; |
43 | | using std::sort; |
44 | | using std::swap; |
45 | | #include <iterator> |
46 | | using std::back_insert_iterator; |
47 | | using std::iterator_traits; |
48 | | #include <map> |
49 | | using std::map; |
50 | | using std::multimap; |
51 | | #include <set> |
52 | | using std::multiset; |
53 | | using std::set; |
54 | | #include <string> |
55 | | using std::string; |
56 | | #include <utility> |
57 | | using std::make_pair; |
58 | | using std::pair; |
59 | | #include <vector> |
60 | | using std::vector; |
61 | | |
62 | | #include <glog/logging.h> |
63 | | |
64 | | #include "yb/gutil/integral_types.h" |
65 | | #include "yb/gutil/logging-inl.h" |
66 | | #include "yb/gutil/strings/charset.h" |
67 | | #include "yb/gutil/strings/split_internal.h" |
68 | | #include "yb/gutil/strings/stringpiece.h" |
69 | | #include "yb/gutil/strings/strip.h" |
70 | | |
71 | | namespace strings { |
72 | | |
73 | | // The new Split API |
74 | | // aka Split2 |
75 | | // aka strings::Split() |
76 | | // |
77 | | // This string splitting API consists of a Split() function in the ::strings |
78 | | // namespace and a handful of delimiter objects in the ::strings::delimiter |
79 | | // namespace (more on delimiter objects below). The Split() function always |
80 | | // takes two arguments: the text to be split and the delimiter on which to split |
81 | | // the text. An optional third argument may also be given, which is a Predicate |
82 | | // functor that will be used to filter the results, e.g., to skip empty strings |
83 | | // (more on predicates below). The Split() function adapts the returned |
84 | | // collection to the type specified by the caller. |
85 | | // |
86 | | // Example 1: |
87 | | // // Splits the given string on commas. Returns the results in a |
88 | | // // vector of strings. |
89 | | // vector<string> v = strings::Split("a,b,c", ","); |
90 | | // assert(v.size() == 3); |
91 | | // |
92 | | // Example 2: |
93 | | // // By default, empty strings are *included* in the output. See the |
94 | | // // strings::SkipEmpty predicate below to omit them. |
95 | | // vector<string> v = strings::Split("a,b,,c", ","); |
96 | | // assert(v.size() == 4); // "a", "b", "", "c" |
97 | | // v = strings::Split("", ","); |
98 | | // assert(v.size() == 1); // v contains a single "" |
99 | | // |
100 | | // Example 3: |
101 | | // // Splits the string as in the previous example, except that the results |
102 | | // // are returned as GStringPiece objects. Note that because we are storing |
103 | | // // the results within GStringPiece objects, we have to ensure that the input |
104 | | // // string outlives any results. |
105 | | // vector<GStringPiece> v = strings::Split("a,b,c", ","); |
106 | | // assert(v.size() == 3); |
107 | | // |
108 | | // Example 4: |
109 | | // // Stores results in a set<string>. |
110 | | // set<string> a = strings::Split("a,b,c,a,b,c", ","); |
111 | | // assert(a.size() == 3); |
112 | | // |
113 | | // Example 5: |
114 | | // // Stores results in a map. The map implementation assumes that the input |
115 | | // // is provided as a series of key/value pairs. For example, the 0th element |
116 | | // // resulting from the split will be stored as a key to the 1st element. If |
117 | | // // an odd number of elements are resolved, the last element is paired with |
118 | | // // a default-constructed value (e.g., empty string). |
119 | | // map<string, string> m = strings::Split("a,b,c", ","); |
120 | | // assert(m.size() == 2); |
121 | | // assert(m["a"] == "b"); |
122 | | // assert(m["c"] == ""); // last component value equals "" |
123 | | // |
124 | | // Example 6: |
125 | | // // Splits on the empty string, which results in each character of the input |
126 | | // // string becoming one element in the output collection. |
127 | | // vector<string> v = strings::Split("abc", ""); |
128 | | // assert(v.size() == 3); |
129 | | // |
130 | | // Example 7: |
131 | | // // Stores first two split strings as the members in an std::pair. |
132 | | // std::pair<string, string> p = strings::Split("a,b,c", ","); |
133 | | // EXPECT_EQ("a", p.first); |
134 | | // EXPECT_EQ("b", p.second); |
135 | | // // "c" is omitted because std::pair can hold only two elements. |
136 | | // |
137 | | // As illustrated above, the Split() function adapts the returned collection to |
138 | | // the type specified by the caller. The returned collections may contain |
139 | | // string, GStringPiece, Cord, or any object that has a constructor (explicit or |
140 | | // not) that takes a single GStringPiece argument. This pattern works for all |
141 | | // standard STL containers including vector, list, deque, set, multiset, map, |
142 | | // and multimap, non-standard containers including hash_set and hash_map, and |
143 | | // even std::pair which is not actually a container. |
144 | | // |
145 | | // Splitting to std::pair is an interesting case because it can hold only two |
146 | | // elements and is not a collection type. When splitting to an std::pair the |
147 | | // first two split strings become the std::pair's .first and .second members |
148 | | // respectively. The remaining split substrings are discarded. If there are fewer |
149 | | // than two split substrings, the empty string is used for the corresponding |
150 | | // std::pair member. |
151 | | // |
152 | | // The strings::Split() function can be used multiple times to perform more |
153 | | // complicated splitting logic, such as intelligently parsing key-value pairs. |
154 | | // For example |
155 | | // |
156 | | // // The input string "a=b=c,d=e,f=,g" becomes |
157 | | // // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" } |
158 | | // map<string, string> m; |
159 | | // for (GStringPiece sp : strings::Split("a=b=c,d=e,f=,g", ",")) { |
160 | | // m.insert(strings::Split(sp, strings::delimiter::Limit("=", 1))); |
161 | | // } |
162 | | // EXPECT_EQ("b=c", m.find("a")->second); |
163 | | // EXPECT_EQ("e", m.find("d")->second); |
164 | | // EXPECT_EQ("", m.find("f")->second); |
165 | | // EXPECT_EQ("", m.find("g")->second); |
166 | | // |
167 | | // The above example stores the results in an std::map. But depending on your |
168 | | // data requirements, you can just as easily store the results in an |
169 | | // std::multimap or even a vector<std::pair<>>. |
170 | | // |
171 | | // |
172 | | // Delimiters |
173 | | // |
174 | | // The Split() function also takes a second argument that is a delimiter. This |
175 | | // delimiter is actually an object that defines the boundaries between elements |
176 | | // in the provided input. If a string (const char*, ::string, or GStringPiece) is |
177 | | // passed in place of an explicit Delimiter object, the argument is implicitly |
178 | | // converted to a ::strings::delimiter::Literal. |
179 | | // |
180 | | // With this split API comes the formal concept of a Delimiter (big D). A |
181 | | // Delimiter is an object with a Find() function that knows how to find the first |
182 | | // occurrence of itself in a given GStringPiece. Models of the Delimiter concept |
183 | | // represent specific kinds of delimiters, such as single characters, |
184 | | // substrings, or even regular expressions. |
185 | | // |
186 | | // The following Delimiter objects are provided as part of the Split() API: |
187 | | // |
188 | | // - Literal (default) |
189 | | // - AnyOf |
190 | | // - Limit |
191 | | // |
192 | | // The following are examples of using some provided Delimiter objects: |
193 | | // |
194 | | // Example 1: |
195 | | // // Because a string literal is converted to a strings::delimiter::Literal, |
196 | | // // the following two splits are equivalent. |
197 | | // vector<string> v1 = strings::Split("a,b,c", ","); // (1) |
198 | | // using ::strings::delimiter::Literal; |
199 | | // vector<string> v2 = strings::Split("a,b,c", Literal(",")); // (2) |
200 | | // |
201 | | // Example 2: |
202 | | // // Splits on any of the characters specified in the delimiter string. |
203 | | // using ::strings::delimiter::AnyOf; |
204 | | // vector<string> v = strings::Split("a,b;c-d", AnyOf(",;-")); |
205 | | // assert(v.size() == 4); |
206 | | // |
207 | | // Example 3: |
208 | | // // Uses the Limit meta-delimiter to limit the number of matches a delimiter |
209 | | // // can have. In this case, the delimiter of a Literal comma is limited to |
210 | | // matching at most one time. The last element in the returned |
211 | | // // collection will contain all unsplit pieces, which may contain instances |
212 | | // // of the delimiter. |
213 | | // using ::strings::delimiter::Limit; |
214 | | // vector<string> v = strings::Split("a,b,c", Limit(",", 1)); |
215 | | // assert(v.size() == 2); // Limited to 1 delimiter; so two elements found |
216 | | // assert(v[0] == "a"); |
217 | | // assert(v[1] == "b,c"); |
218 | | // |
219 | | // |
220 | | // Predicates |
221 | | // |
222 | | // Predicates can filter the results of a Split() operation by determining |
223 | | // whether or not a resultant element is included in the result set. A predicate |
224 | | // may be passed as an *optional* third argument to the Split() function. |
225 | | // |
226 | | // Predicates are unary functions (or functors) that take a single GStringPiece |
227 | | // argument and return bool indicating whether the argument should be included |
228 | | // (true) or excluded (false). |
229 | | // |
230 | | // One example where this is useful is when filtering out empty substrings. By |
231 | | // default, empty substrings may be returned by strings::Split(), which is |
232 | | // similar to the way split functions work in other programming languages. For |
233 | | // example: |
234 | | // |
235 | | // // Empty strings *are* included in the returned collection. |
236 | | // vector<string> v = strings::Split(",a,,b,", ","); |
237 | | // assert(v.size() == 5); // v[0] == "", v[1] == "a", v[2] == "", ... |
238 | | // |
239 | | // These empty strings can be filtered out of the results by simply passing the |
240 | | // provided SkipEmpty predicate as the third argument to the Split() function. |
241 | | // SkipEmpty does not consider a string containing all whitespace to be empty. |
242 | | // For that behavior use the SkipWhitespace predicate. For example: |
243 | | // |
244 | | // Example 1: |
245 | | // // Uses SkipEmpty to omit empty strings. Strings containing whitespace are |
246 | | // // not empty and are therefore not skipped. |
247 | | // using strings::SkipEmpty; |
248 | | // vector<string> v = strings::Split(",a, ,b,", ",", SkipEmpty()); |
249 | | // assert(v.size() == 3); |
250 | | // assert(v[0] == "a"); |
251 | | // assert(v[1] == " "); // <-- The whitespace makes the string not empty. |
252 | | // assert(v[2] == "b"); |
253 | | // |
254 | | // Example 2: |
255 | | // // Uses SkipWhitespace to skip all strings that are either empty or contain |
256 | | // // only whitespace. |
257 | | // using strings::SkipWhitespace; |
258 | | // vector<string> v = strings::Split(",a, ,b,", ",", SkipWhitespace()); |
259 | | // assert(v.size() == 2); |
260 | | // assert(v[0] == "a"); |
261 | | // assert(v[1] == "b"); |
262 | | // |
263 | | // |
264 | | // Differences between Split1 and Split2 |
265 | | // |
266 | | // Split2 is the strings::Split() API described above. Split1 is a name for the |
267 | | // collection of legacy Split*() functions declared later in this file. Most of |
268 | | // the Split1 functions follow a set of conventions that don't necessarily match |
269 | | // the conventions used in Split2. The following are some of the important |
270 | | // differences between Split1 and Split2: |
271 | | // |
272 | | // Split1 -> Split2 |
273 | | // ---------------- |
274 | | // Append -> Assign: |
275 | | // The Split1 functions all returned their output collections via a pointer to |
276 | | // an out parameter as is typical in Google code. In some cases the comments |
277 | | // explicitly stated that results would be *appended* to the output |
278 | | // collection. In some cases it was ambiguous whether results were appended. |
279 | | // This ambiguity is gone in the Split2 API as results are always assigned to |
280 | | // the output collection, never appended. |
281 | | // |
282 | | // AnyOf -> Literal: |
283 | | // Most Split1 functions treated their delimiter argument as a string of |
284 | | // individual byte delimiters. For example, a delimiter of ",;" would split on |
285 | | // "," and ";", not the substring ",;". This behavior is equivalent to the |
286 | | // Split2 delimiter strings::delimiter::AnyOf, which is *not* the default. By |
287 | | // default, strings::Split() splits using strings::delimiter::Literal() which |
288 | | // would treat the whole string ",;" as a single delimiter string. |
289 | | // |
290 | | // SkipEmpty -> allow empty: |
291 | | // Most Split1 functions omitted empty substrings in the results. To keep |
292 | | // empty substrings one would have to use an explicitly named |
293 | | // Split*AllowEmpty() function. This behavior is reversed in Split2. By |
294 | | // default, strings::Split() *allows* empty substrings in the output. To skip |
295 | | // them, use the strings::SkipEmpty predicate. |
296 | | // |
297 | | // string -> user's choice: |
298 | | // Most Split1 functions return collections of string objects. Some return |
299 | | // char*, but the type returned is dictated by each Split1 function. With |
300 | | // Split2 the caller can choose which string-like object to return. (Note: |
301 | | // char* C-strings are not supported in Split2--use GStringPiece instead). |
302 | | // |
303 | | |
304 | | // Main Split() templates: each forwards its arguments to an internal::Splitter and returns it. |
305 | | template <typename Delimiter> |
306 | 0 | inline internal::Splitter<Delimiter> Split(GStringPiece text, Delimiter d) { |
307 | 0 | return internal::Splitter<Delimiter>(text, d); |
308 | 0 | } Unexecuted instantiation: strings::internal::Splitter<strings::delimiter::AnyOf, strings::internal::NoFilter> strings::Split<strings::delimiter::AnyOf>(GStringPiece, strings::delimiter::AnyOf) Unexecuted instantiation: strings::internal::Splitter<strings::delimiter::LimitImpl<strings::delimiter::AnyOf>, strings::internal::NoFilter> strings::Split<strings::delimiter::LimitImpl<strings::delimiter::AnyOf> >(GStringPiece, strings::delimiter::LimitImpl<strings::delimiter::AnyOf>) |
309 | | |
310 | | template <typename Delimiter, typename Predicate> |
311 | | inline internal::Splitter<Delimiter, Predicate> Split( |
312 | 0 | GStringPiece text, Delimiter d, Predicate p) { |
313 | 0 | return internal::Splitter<Delimiter, Predicate>(text, d, p); |
314 | 0 | } |
315 | | |
316 | | namespace delimiter { |
317 | | // A Delimiter object represents a single separator, such as a character, |
318 | | // literal string, or regular expression. A Delimiter object must have the |
319 | | // following member: |
320 | | // |
321 | | // GStringPiece Find(GStringPiece text); |
322 | | // |
323 | | // This Find() member function should return a GStringPiece referring to the next |
324 | | // occurrence of the represented delimiter within the given string text. If no |
325 | | // delimiter is found in the given text, a zero-length GStringPiece referring to |
326 | | // text.end() should be returned (e.g., GStringPiece(text.end(), 0)). It is |
327 | | // important that the returned GStringPiece always be within the bounds of the |
328 | | // GStringPiece given as an argument--it must not refer to a string that is |
329 | | // physically located outside of the given string. The following example is a |
330 | | // simple Delimiter object that is created with a single char and will look for |
331 | | // that char in the text given to the Find() function: |
332 | | // |
333 | | // struct SimpleDelimiter { |
334 | | // const char c_; |
335 | | // explicit SimpleDelimiter(char c) : c_(c) {} |
336 | | // GStringPiece Find(GStringPiece text) { |
337 | | // int pos = text.find(c_); |
338 | | // if (pos == GStringPiece::npos) return GStringPiece(text.end(), 0); |
339 | | // return GStringPiece(text, pos, 1); |
340 | | // } |
341 | | // }; |
342 | | |
343 | | // Delimiter that matches one exact string, kept in the member delimiter_. Examples: |
344 | | // |
345 | | // using ::strings::delimiter::Literal; |
346 | | // vector<string> v = strings::Split("a=>b=>c", Literal("=>")); |
347 | | // assert(v.size() == 3); |
348 | | // assert(v[0] == "a"); |
349 | | // assert(v[1] == "b"); |
350 | | // assert(v[2] == "c"); |
351 | | // |
352 | | // With an empty delimiter, each character of the input becomes its own element: |
353 | | // |
354 | | // using ::strings::delimiter::Literal; |
355 | | // vector<string> v = strings::Split("abc", Literal("")); |
356 | | // assert(v.size() == 3); |
357 | | // assert(v[0] == "a"); |
358 | | // assert(v[1] == "b"); |
359 | | // assert(v[2] == "c"); |
360 | | // |
361 | | class Literal { |
362 | | public: |
363 | | explicit Literal(GStringPiece sp); |
364 | | GStringPiece Find(GStringPiece text) const; |
365 | | |
366 | | private: |
367 | | const string delimiter_; |
368 | | }; |
369 | | |
370 | | // Delimiter that matches any single byte out of a given set of characters. |
371 | | // AnyOf parallels Literal, except that AnyOf searches with |
372 | | // GStringPiece::find_first_of() where Literal uses GStringPiece::find(). AnyOf |
373 | | // examples: |
374 | | // |
375 | | // using ::strings::delimiter::AnyOf; |
376 | | // vector<string> v = strings::Split("a,b=c", AnyOf(",=")); |
377 | | // |
378 | | // assert(v.size() == 3); |
379 | | // assert(v[0] == "a"); |
380 | | // assert(v[1] == "b"); |
381 | | // assert(v[2] == "c"); |
382 | | // |
383 | | // Given the empty string, AnyOf behaves exactly like Literal: each individual |
384 | | // character of the input string is matched and becomes its own element. |
385 | | // |
386 | | // Note: The string passed to AnyOf is assumed to consist of single-byte |
387 | | // ASCII characters. AnyOf does not work with multi-byte characters. |
388 | | class AnyOf { |
389 | | public: |
390 | | explicit AnyOf(GStringPiece sp); |
391 | | GStringPiece Find(GStringPiece text) const; |
392 | | |
393 | | private: |
394 | | const string delimiters_; |
395 | | }; |
396 | | |
397 | | // Wraps another delimiter and caps how many times it may match. Find() is stateful: it |
398 | | // counts its calls, and the call made after `limit` earlier ones returns an empty piece at |
399 | | // text.end() without consulting the wrapped delimiter. NOTE(review): the test is ==, so any |
400 | | // later call is forwarded again. Create LimitImpls using the Limit() function. Example: |
401 | | // using ::strings::delimiter::Limit; |
402 | | // vector<string> v = strings::Split("a,b,c,d", Limit(",", 2)); |
403 | | // assert(v.size() == 3); // Split on 2 commas, giving a vector with 3 items |
404 | | // assert(v[0] == "a"); |
405 | | // assert(v[1] == "b"); |
406 | | // assert(v[2] == "c,d"); |
407 | | // |
408 | | template <typename Delimiter> |
409 | | class LimitImpl { |
410 | | public: |
411 | | LimitImpl(Delimiter delimiter, size_t limit) |
412 | 0 | : delimiter_(std::move(delimiter)), limit_(limit), count_(0) {} |
413 | 0 | GStringPiece Find(GStringPiece text) { |
414 | 0 | if (count_++ == limit_) { |
415 | 0 | return GStringPiece(text.end(), 0); // No more matches. |
416 | 0 | } |
417 | 0 | return delimiter_.Find(text); |
418 | 0 | } |
419 | | |
420 | | private: |
421 | | Delimiter delimiter_; |
422 | | const size_t limit_; |
423 | | size_t count_; |
424 | | }; |
425 | | |
426 | | // Factory overloads that build LimitImpl<> objects. The template accepts any |
427 | | // Delimiter; the const char*, const string&, and GStringPiece overloads wrap the |
428 | | // text in a Literal first, mirroring the string overloads of Split() below. |
429 | | template <typename Delimiter> |
430 | 0 | inline LimitImpl<Delimiter> Limit(Delimiter delim, size_t limit) { |
431 | 0 | return LimitImpl<Delimiter>(delim, limit); |
432 | 0 | } |
433 | | |
434 | 0 | inline LimitImpl<Literal> Limit(const char* s, size_t limit) { |
435 | 0 | return LimitImpl<Literal>(Literal(s), limit); |
436 | 0 | } |
437 | | |
438 | 0 | inline LimitImpl<Literal> Limit(const string& s, size_t limit) { |
439 | 0 | return LimitImpl<Literal>(Literal(s), limit); |
440 | 0 | } |
441 | | |
442 | 0 | inline LimitImpl<Literal> Limit(GStringPiece s, size_t limit) { |
443 | 0 | return LimitImpl<Literal>(Literal(s), limit); |
444 | 0 | } |
445 | | |
446 | | } // namespace delimiter |
447 | | |
448 | | // |
449 | | // Predicates are functors that return bool indicating whether the given |
450 | | // GStringPiece should be included in the split output. If the predicate returns |
451 | | // false then the string will be excluded from the output from strings::Split(). |
452 | | // |
453 | | |
454 | | // Predicate that accepts every piece, empty ones included; the argument is |
455 | | // ignored and the result is always true. strings::Split() already behaves |
456 | | // this way by default, so the predicate is never strictly required, but |
457 | | // passing it can make the intent explicit at a call site. |
458 | | // Example: |
459 | | // |
460 | | // vector<string> v = Split(" a , ,,b,", ",", AllowEmpty()); |
461 | | // EXPECT_THAT(v, ElementsAre(" a ", " ", "", "b", "")); |
462 | | struct AllowEmpty { |
463 | 212 | bool operator()(GStringPiece sp) const { |
464 | 212 | return true; |
465 | 212 | } |
466 | | }; |
467 | | |
468 | | // Predicate that rejects empty pieces: returns !sp.empty(), so strings::Split() |
469 | | // drops "" but keeps whitespace-only pieces such as " ". |
470 | | // |
471 | | // vector<string> v = Split(" a , ,,b,", ",", SkipEmpty()); |
472 | | // EXPECT_THAT(v, ElementsAre(" a ", " ", "b")); |
473 | | struct SkipEmpty { |
474 | 412k | bool operator()(GStringPiece sp) const { |
475 | 412k | return !sp.empty(); |
476 | 412k | } |
477 | | }; |
478 | | |
479 | | // Predicate that rejects empty or whitespace-only pieces. sp is a by-value copy, so |
480 | | // StripWhiteSpace() trims only that copy; kept pieces (" a " below) stay untrimmed. |
481 | | // |
482 | | // vector<string> v = Split(" a , ,,b,", ",", SkipWhitespace()); |
483 | | // EXPECT_THAT(v, ElementsAre(" a ", "b")); |
484 | | struct SkipWhitespace { |
485 | 0 | bool operator()(GStringPiece sp) const { |
486 | 0 | StripWhiteSpace(&sp); |
487 | 0 | return !sp.empty(); |
488 | 0 | } |
489 | | }; |
490 | | |
491 | | // Split() overloads that make Literal the effective default Delimiter type. |
492 | | // When the delimiter argument is a string rather than a Delimiter object, one |
493 | | // of these overloads is selected; it wraps the string in a delimiter::Literal |
494 | | // and returns a Splitter<Literal> built from the text and that delimiter. |
495 | | // |
496 | | // Because the generic Split() above is a function template, these signatures |
497 | | // spell out the exact string type so that they are a better match than the |
498 | | // template. Overloads are provided for: |
499 | | // |
500 | | // - const char* |
501 | | // - const string& |
502 | | // - GStringPiece |
503 | | |
504 | | inline internal::Splitter<delimiter::Literal> Split( |
505 | 222k | GStringPiece text, const char* delimiter) { |
506 | 222k | return internal::Splitter<delimiter::Literal>( |
507 | 222k | text, delimiter::Literal(delimiter)); |
508 | 222k | } |
509 | | |
510 | | inline internal::Splitter<delimiter::Literal> Split( |
511 | 0 | GStringPiece text, const string& delimiter) { |
512 | 0 | return internal::Splitter<delimiter::Literal>( |
513 | 0 | text, delimiter::Literal(delimiter)); |
514 | 0 | } |
515 | | |
516 | | inline internal::Splitter<delimiter::Literal> Split( |
517 | 0 | GStringPiece text, GStringPiece delimiter) { |
518 | 0 | return internal::Splitter<delimiter::Literal>( |
519 | 0 | text, delimiter::Literal(delimiter)); |
520 | 0 | } |
521 | | |
522 | | // Same overloads as above, each additionally passing a Predicate p on to the Splitter. |
523 | | template <typename Predicate> |
524 | | inline internal::Splitter<delimiter::Literal, Predicate> Split( |
525 | 332k | GStringPiece text, const char* delimiter, Predicate p) { |
526 | 332k | return internal::Splitter<delimiter::Literal, Predicate>( |
527 | 332k | text, delimiter::Literal(delimiter), p); |
528 | 332k | } strings::internal::Splitter<strings::delimiter::Literal, strings::SkipEmpty> strings::Split<strings::SkipEmpty>(GStringPiece, char const*, strings::SkipEmpty) Line | Count | Source | 525 | 332k | GStringPiece text, const char* delimiter, Predicate p) { | 526 | 332k | return internal::Splitter<delimiter::Literal, Predicate>( | 527 | 332k | text, delimiter::Literal(delimiter), p); | 528 | 332k | } |
strings::internal::Splitter<strings::delimiter::Literal, strings::AllowEmpty> strings::Split<strings::AllowEmpty>(GStringPiece, char const*, strings::AllowEmpty) Line | Count | Source | 525 | 101 | GStringPiece text, const char* delimiter, Predicate p) { | 526 | 101 | return internal::Splitter<delimiter::Literal, Predicate>( | 527 | 101 | text, delimiter::Literal(delimiter), p); | 528 | 101 | } |
|
529 | | |
530 | | template <typename Predicate> |
531 | | inline internal::Splitter<delimiter::Literal, Predicate> Split( |
532 | | GStringPiece text, const string& delimiter, Predicate p) { |
533 | | return internal::Splitter<delimiter::Literal, Predicate>( |
534 | | text, delimiter::Literal(delimiter), p); |
535 | | } |
536 | | |
537 | | template <typename Predicate> |
538 | | inline internal::Splitter<delimiter::Literal, Predicate> Split( |
539 | | GStringPiece text, GStringPiece delimiter, Predicate p) { |
540 | | return internal::Splitter<delimiter::Literal, Predicate>( |
541 | | text, delimiter::Literal(delimiter), p); |
542 | | } |
543 | | |
544 | | } // namespace strings |
545 | | |
546 | | // |
547 | | // ==================== LEGACY SPLIT FUNCTIONS ==================== |
548 | | // |
549 | | |
550 | | // NOTE: The instruction below creates a Module titled |
551 | | // GlobalSplitFunctions within the auto-generated Doxygen documentation. |
552 | | // This instruction is needed to expose global functions that are not |
553 | | // within a namespace. |
554 | | // |
555 | | // START DOXYGEN SplitFunctions grouping |
556 | | /* @defgroup SplitFunctions |
557 | | * @{ */ |
558 | | |
559 | | // ---------------------------------------------------------------------- |
560 | | // ClipString |
561 | | // Clip 'str' to a maximum length of 'max_len'. We clip on a word |
562 | | // boundary when that is possible. If the string is clipped, an |
563 | | // ellipsis is appended. |
564 | | // |
565 | | // ***NOTE*** |
566 | | // ClipString counts length with strlen, i.e. in bytes. For non-ASCII |
567 | | // strings such as UTF-8 that is wrong. If you are displaying the |
568 | | // clipped strings to users in a frontend, consider using |
569 | | // ClipStringOnWordBoundary in |
570 | | // webserver/util/snippets/rewriteboldtags, which considers the width |
571 | | // of the string, not just the number of bytes. |
572 | | // |
573 | | // TODO(user): Move ClipString back to strutil. The problem with this is |
574 | | // that ClipStringHelper is used behind the scenes by SplitStringToLines, but |
575 | | // probably shouldn't be exposed in the .h files. |
576 | | // ---------------------------------------------------------------------- |
577 | | void ClipString(char* str, size_t max_len); |
578 | | |
579 | | // ---------------------------------------------------------------------- |
580 | | // ClipString |
581 | | // Overload of ClipString() that takes a string* instead of a char*. |
582 | | // NOTE: See the comment on the char* overload above. |
583 | | // ---------------------------------------------------------------------- |
584 | | void ClipString(string* full_str, size_t max_len); |
585 | | |
586 | | // ---------------------------------------------------------------------- |
587 | | // SplitStringToLines() Split 'full' into lines of maximum length |
588 | | // 'max_len' and append the resulting lines to 'result'. Will attempt |
589 | | // to split on word boundaries. If 'num_lines' is zero, the whole |
590 | | // string is split up regardless of its length. If 'num_lines' is |
591 | | // positive, at most 'num_lines' lines are returned, and a "..." is |
592 | | // appended to the end of the last line if the string is too long |
593 | | // to fit completely into 'num_lines' lines. |
594 | | // ---------------------------------------------------------------------- |
595 | | void SplitStringToLines(const char* full, |
596 | | size_t max_len, |
597 | | size_t num_lines, |
598 | | vector<string>* result); |
599 | | |
600 | | // ---------------------------------------------------------------------- |
601 | | // SplitOneStringToken() |
602 | | // Returns the first "delim"-delimited string from "*source" and advances |
603 | | // *source to point just past the delimiter that was found. If no delimiter |
604 | | // is found, *source is set to NULL. |
605 | | // |
606 | | // If the start of *source is a delimiter, an empty string is returned. |
607 | | // If *source is NULL, an empty string is returned. |
608 | | // |
609 | | // "delim" is treated as a sequence of 1 or more character delimiters. Any one |
610 | | // of the characters present in "delim" is considered to be a single |
611 | | // delimiter; the delimiter is not "delim" as a whole. For example: |
612 | | // |
613 | | // const char* s = "abc=;de"; |
614 | | // string r = SplitOneStringToken(&s, ";="); |
615 | | // // r = "abc" |
616 | | // // s points to ";de" |
617 | | // ---------------------------------------------------------------------- |
618 | | string SplitOneStringToken(const char** source, const char* delim); |
619 | | |
620 | | // ---------------------------------------------------------------------- |
621 | | // SplitUsing() |
622 | | // Split a string into substrings based on the nul-terminated list |
623 | | // of bytes at 'delimiters' (uses strsep) and return a vector of |
624 | | // those strings. Modifies 'full'. We allocate the return vector, |
625 | | // and the caller should free it. Note that empty fields are ignored. |
626 | | // Use SplitToVector with last argument 'false' if you want the |
627 | | // empty fields. |
628 | | // ---------------------------------------------------------------------- |
629 | | vector<char*>* SplitUsing(char* full, const char* delimiters); |
630 | | |
631 | | // ---------------------------------------------------------------------- |
632 | | // SplitToVector() |
633 | | // Split a string into substrings based on the nul-terminated list |
634 | | // of bytes at delim (uses strsep) and appends the split |
635 | | // strings to 'vec'. Modifies "full". If omit empty strings is |
636 | | // true, empty strings are omitted from the resulting vector. |
637 | | // ---------------------------------------------------------------------- |
638 | | void SplitToVector(char* full, const char* delimiters, |
639 | | vector<char*>* vec, |
640 | | bool omit_empty_strings); |
641 | | void SplitToVector(char* full, const char* delimiters, |
642 | | vector<const char*>* vec, |
643 | | bool omit_empty_strings); |
644 | | |
645 | | // ---------------------------------------------------------------------- |
646 | | // SplitGStringPieceToVector |
647 | | // Split a GStringPiece into sub-GStringPieces based on the |
648 | | // nul-terminated list of bytes at delim and appends the |
649 | | // pieces to 'vec'. If omit empty strings is true, empty strings |
650 | | // are omitted from the resulting vector. |
651 | | // Expects the original string (from which 'full' is derived) to exist |
652 | | // for the full lifespan of 'vec'. |
653 | | // ---------------------------------------------------------------------- |
654 | | void SplitGStringPieceToVector(const GStringPiece& full, |
655 | | const char* delim, |
656 | | vector<GStringPiece>* vec, |
657 | | bool omit_empty_strings); |
658 | | |
659 | | // ---------------------------------------------------------------------- |
660 | | // SplitStringUsing() |
661 | | // SplitStringToHashsetUsing() |
662 | | // SplitStringToSetUsing() |
663 | | // SplitStringToMapUsing() |
664 | | // SplitStringToHashmapUsing() |
665 | | |
666 | | // Splits a string using one or more byte delimiters, presented as a |
667 | | // nul-terminated c string. Append the components to 'result'. If there are |
668 | | // consecutive delimiters, this function skips over all of them: in other words, |
669 | | // empty components are dropped. If you want to keep empty components, try |
670 | | // SplitStringAllowEmpty(). |
671 | | // |
672 | | // NOTE: Do not use this for multi-byte delimiters such as UTF-8 strings. Use |
673 | | // strings::Split() with strings::delimiter::Literal as the delimiter. |
674 | | // |
675 | | // ==> NEW API: Consider using the new Split API defined above. <== |
676 | | // Example: |
677 | | // |
678 | | // using strings::SkipEmpty; |
679 | | // using strings::Split; |
680 | | // using strings::delimiter::AnyOf; |
681 | | // |
682 | | // vector<string> v = Split(full, AnyOf(delimiter), SkipEmpty()); |
683 | | // |
684 | | // For even better performance, store the result in a vector<GStringPiece> |
685 | | // to avoid string copies. |
686 | | // ---------------------------------------------------------------------- |
687 | | void SplitStringUsing(const string& full, const char* delimiters, |
688 | | vector<string>* result); |
689 | | void SplitStringToSetUsing(const string& full, const char* delimiters, |
690 | | set<string>* result); |
691 | | // The even-positioned (0-based) components become the keys for the |
692 | | // odd-positioned components that follow them. When there is an odd |
693 | | // number of components, the value for the last key will be unchanged |
694 | | // if the key was already present in the hash table, or will be the |
695 | | // empty string if the key is a newly inserted key. |
696 | | void SplitStringToMapUsing(const string& full, const char* delim, |
697 | | map<string, string>* result); |
698 | | |
699 | | // ---------------------------------------------------------------------- |
700 | | // SplitStringAllowEmpty() |
701 | | // |
702 | | // Split a string using one or more byte delimiters, presented as a |
703 | | // nul-terminated c string. Append the components to 'result'. If there are |
704 | | // consecutive delimiters, this function will return corresponding empty |
705 | | // strings. If you want to drop the empty strings, try SplitStringUsing(). |
706 | | // |
707 | | // If "full" is the empty string, yields an empty string as the only value. |
708 | | // |
709 | | // ==> NEW API: Consider using the new Split API defined above. <== |
710 | | // |
711 | | // using strings::Split; |
712 | | // using strings::delimiter::AnyOf; |
713 | | // |
714 | | // vector<string> v = Split(full, AnyOf(delimiter)); |
715 | | // |
716 | | // For even better performance, store the result in a vector<GStringPiece> to |
717 | | // avoid string copies. |
718 | | // ---------------------------------------------------------------------- |
719 | | void SplitStringAllowEmpty(const string& full, const char* delim, |
720 | | vector<string>* result); |
721 | | |
722 | | // ---------------------------------------------------------------------- |
723 | | // SplitStringWithEscaping() |
724 | | // SplitStringWithEscapingAllowEmpty() |
725 | | // SplitStringWithEscapingToSet() |
726 | | // SplitStringWithEscapingToHashset() |
727 | | |
728 | | // Split the string using the specified delimiters, taking escaping into |
729 | | // account. '\' is not allowed as a delimiter. |
730 | | // |
731 | | // Within the string, preserve a delimiter preceded by a backslash as a |
732 | | // literal delimiter. In addition, preserve two consecutive backslashes as |
733 | | // a single literal backslash. Do not unescape any other backslash-character |
734 | | // sequence. |
735 | | // |
736 | | // Eg. 'foo\=bar=baz\\qu\ux' split on '=' becomes ('foo=bar', 'baz\qu\ux') |
737 | | // |
738 | | // All versions other than "AllowEmpty" discard any empty substrings. |
739 | | // ---------------------------------------------------------------------- |
740 | | void SplitStringWithEscaping(const string& full, |
741 | | const strings::CharSet& delimiters, |
742 | | vector<string>* result); |
743 | | void SplitStringWithEscapingAllowEmpty(const string& full, |
744 | | const strings::CharSet& delimiters, |
745 | | vector<string>* result); |
746 | | void SplitStringWithEscapingToSet(const string& full, |
747 | | const strings::CharSet& delimiters, |
748 | | set<string>* result); |
749 | | |
750 | | // ---------------------------------------------------------------------- |
751 | | // SplitStringIntoNPiecesAllowEmpty() |
752 | | |
753 | | // Split a string using a nul-terminated list of byte |
754 | | // delimiters. Append the components to 'result'. If there are |
755 | | // consecutive delimiters, this function will return corresponding |
756 | | // empty strings. The string is split into at most the specified |
757 | | // number of pieces greedily. This means that the last piece may |
758 | | // possibly be split further. To split into as many pieces as |
759 | | // possible, specify 0 as the number of pieces. |
760 | | // |
761 | | // If "full" is the empty string, yields an empty string as the only value. |
762 | | // ---------------------------------------------------------------------- |
763 | | void SplitStringIntoNPiecesAllowEmpty(const string& full, |
764 | | const char* delimiters, |
765 | | size_t pieces, |
766 | | vector<string>* result); |
767 | | |
768 | | // ---------------------------------------------------------------------- |
769 | | // SplitStringAndParse() |
770 | | // SplitStringAndParseToContainer() |
771 | | // SplitStringAndParseToList() |
772 | | // Split a string using a nul-terminated list of character |
773 | | // delimiters. For each component, parse using the provided |
774 | | // parsing function and if successful, append it to 'result'. |
775 | | // Return true if and only if all components parse successfully. |
776 | | // If there are consecutive delimiters, this function skips over |
777 | | // all of them. This function will correctly handle parsing |
778 | | // strings that have embedded \0s. |
779 | | // |
780 | | // SplitStringAndParse fills into a vector. |
781 | | // SplitStringAndParseToContainer fills into any container that implements |
782 | | // a single-argument insert function. (i.e. insert(const value_type& x) ). |
783 | | // SplitStringAndParseToList fills into any container that implements a single- |
784 | | // argument push_back function (i.e. push_back(const value_type& x) ), plus |
785 | | // value_type& back() and pop_back(). |
786 | | // NOTE: The current implementation parses each component into a temporary
787 | | // value_type and push_back()s a copy; it does not call back() or pop_back().
788 | | // |
789 | | // Example Usage: |
790 | | // vector<double> values; |
791 | | // CHECK(SplitStringAndParse("1.0,2.0,3.0", ",", &safe_strtod, &values)); |
792 | | // CHECK_EQ(3, values.size()); |
793 | | // |
794 | | // vector<int64> values; |
795 | | // CHECK(SplitStringAndParse("1M,2M,3M", ",", |
796 | | // &HumanReadableNumBytes::ToInt64, &values)); |
797 | | // CHECK_EQ(3, values.size()); |
798 | | // |
799 | | // set<int64> values; |
800 | | // CHECK(SplitStringAndParseToContainer("3,1,1,2", ",", |
801 | | // &safe_strto64, &values)); |
802 | | // CHECK_EQ(3, values.size()); // The duplicate "1" is stored only once.
803 | | // |
804 | | // deque<int64> values; |
805 | | // CHECK(SplitStringAndParseToList("3,1,1,2", ",", &safe_strto64, &values)); |
806 | | // CHECK_EQ(4, values.size()); |
807 | | // ---------------------------------------------------------------------- |
808 | | template <class T> |
809 | | bool SplitStringAndParse(GStringPiece source, GStringPiece delim, |
810 | | bool (*parse)(const string& str, T* value), |
811 | | vector<T>* result); |
812 | | template <class Container> |
813 | | bool SplitStringAndParseToContainer( |
814 | | GStringPiece source, GStringPiece delim, |
815 | | bool (*parse)(const string& str, typename Container::value_type* value), |
816 | | Container* result); |
817 | | |
818 | | template <class List> |
819 | | bool SplitStringAndParseToList( |
820 | | GStringPiece source, GStringPiece delim, |
821 | | bool (*parse)(const string& str, typename List::value_type* value), |
822 | | List* result); |
823 | | // ---------------------------------------------------------------------- |
824 | | // SplitRange() |
825 | | // Splits a string of the form "<from>-<to>". Either or both can be |
826 | | // missing. A raw number (<to>) is interpreted as "<to>-". Modifies |
827 | | // parameters insofar as they're specified by the string. RETURNS |
828 | | // true iff the input is a well-formed range. If it RETURNS false, |
829 | | // from and to remain unchanged. The range in rangestr should be |
830 | | // terminated either by "\0" or by whitespace. |
831 | | // ---------------------------------------------------------------------- |
832 | | bool SplitRange(const char* rangestr, int* from, int* to); |
833 | | |
834 | | // ---------------------------------------------------------------------- |
835 | | // SplitCSVLineWithDelimiter() |
836 | | // CSV lines come in many guises. There's the Comma Separated Values |
837 | | // variety, in which fields are separated by (surprise!) commas. There's |
838 | | // also the tab-separated values variant, in which tabs separate the |
839 | | // fields. This routine handles both, which makes it almost like |
840 | | // SplitUsing(line, delimiter), but for some special processing. For both |
841 | | // delimiters, whitespace is trimmed from either side of the field value. |
842 | | // If the delimiter is ',', we play additional games with quotes. A |
843 | | // field value surrounded by double quotes is allowed to contain commas, |
844 | | // which are not treated as field separators. Within a double-quoted |
845 | | // string, a series of two double quotes signals an escaped single double |
846 | | // quote. It'll be clearer in the examples. |
847 | | // Example: |
848 | | // Google , x , "Buchheit, Paul", "string with "" quote in it" |
849 | | // --> [Google], [x], [Buchheit, Paul], [string with " quote in it] |
850 | | // |
851 | | // SplitCSVLine() |
852 | | // A convenience wrapper around SplitCSVLineWithDelimiter which uses |
853 | | // ',' as the delimiter. |
854 | | // |
855 | | // The following variants of SplitCSVLine() are not recommended for new code. |
856 | | // Please consider the CSV parser in //util/csv as an alternative. Examples: |
857 | | // To parse a single line: |
858 | | // #include "yb/util/csv/parser.h" |
859 | | // vector<string> fields = util::csv::ParseLine(line).fields(); |
860 | | // |
861 | | // To parse an entire file: |
862 | | // #include "yb/util/csv/parser.h" |
863 | | // for (Record rec : Parser(source)) { |
864 | | // vector<string> fields = rec.fields(); |
865 | | // } |
866 | | // |
867 | | // See //util/csv/parser.h for more complete documentation. |
868 | | // |
869 | | // ---------------------------------------------------------------------- |
870 | | void SplitCSVLine(char* line, vector<char*>* cols); |
871 | | void SplitCSVLineWithDelimiter(char* line, char delimiter, |
872 | | vector<char*>* cols); |
873 | | // SplitCSVLine string wrapper that internally makes a copy of string line. |
874 | | void SplitCSVLineWithDelimiterForStrings(const string& line, char delimiter, |
875 | | vector<string>* cols); |
876 | | |
877 | | // ---------------------------------------------------------------------- |
878 | | // SplitStructuredLine() |
879 | | // Splits a line using the given delimiter, and places the columns |
880 | | // into 'cols'. This is unlike 'SplitUsing(line, ",")' because you can |
881 | | // define pairs of opening closing symbols inside which the delimiter should |
882 | | // be ignored. If the symbol_pair string has an odd number of characters, |
883 | | // the last character (which cannot be paired) will be assumed to be both an |
884 | | // opening and closing symbol. |
885 | | // WARNING : The input string 'line' is destroyed in the process. |
886 | | // The function returns 0 if the line was parsed correctly (i.e all the |
887 | | // opened braces had their closing braces) otherwise, it returns the position |
888 | | // of the error. |
889 | | // Example: |
890 | | // SplitStructuredLine("item1,item2,{subitem1,subitem2},item4,[5,{6,7}]", |
891 | | // ',', |
892 | | // "{}[]", &output) |
893 | | // --> output = { "item1", "item2", "{subitem1,subitem2}", "item4", |
894 | | // "[5,{6,7}]" } |
895 | | // Example2: trying to split "item1,[item2,{4,5],5}" will fail and the |
896 | | // function will return the position of the problem : ] |
897 | | // |
898 | | // ---------------------------------------------------------------------- |
899 | | char* SplitStructuredLine(char* line, |
900 | | char delimiter, |
901 | | const char* symbol_pairs, |
902 | | vector<char*>* cols); |
903 | | |
904 | | // Similar to the function with the same name above, but splits a GStringPiece |
905 | | // into GStringPiece parts. Returns true if successful. |
906 | | bool SplitStructuredLine(GStringPiece line, |
907 | | char delimiter, |
908 | | const char* symbol_pairs, |
909 | | vector<GStringPiece>* cols); |
910 | | |
911 | | // ---------------------------------------------------------------------- |
912 | | // SplitStructuredLineWithEscapes() |
913 | | // Like SplitStructuredLine but also allows characters to be escaped. |
914 | | // |
915 | | // WARNING: the escape characters will be replicated in the output |
916 | | // columns rather than being consumed, i.e. if {} were the opening and |
917 | | // closing symbols, using \{ to quote a curly brace in the middle of |
918 | | // an option would pass this unchanged. |
919 | | // |
920 | | // Example: |
921 | | // SplitStructuredLineWithEscapes( |
922 | | // "\{item1\},it\\em2,{\{subitem1\},sub\\item2},item4\,item5,[5,{6,7}]", |
923 | | // ',', |
924 | | // "{}[]", |
925 | | // &output) |
926 | | // --> output = { "\{item1\}", "it\\em2", "{\{subitem1\},sub\\item2}", |
927 | | // "item4\,item5", "[5,{6,7}]" } |
928 | | // |
929 | | // ---------------------------------------------------------------------- |
930 | | char* SplitStructuredLineWithEscapes(char* line, |
931 | | char delimiter, |
932 | | const char* symbol_pairs, |
933 | | vector<char*>* cols); |
934 | | |
935 | | // Similar to the function with the same name above, but splits a GStringPiece |
936 | | // into GStringPiece parts. Returns true if successful. |
937 | | bool SplitStructuredLineWithEscapes(GStringPiece line, |
938 | | char delimiter, |
939 | | const char* symbol_pairs, |
940 | | vector<GStringPiece>* cols); |
941 | | |
942 | | // ---------------------------------------------------------------------- |
943 | | // DEPRECATED(jgm): See the "NEW API" comment about this function below for |
944 | | // example code showing an alternative. |
945 | | // |
946 | | // SplitStringIntoKeyValues() |
947 | | // Split a line into a key string and a vector of value strings. The line has |
948 | | // the following format: |
949 | | // |
950 | | // <key><kvsep>+<vvsep>*<value1><vvsep>+<value2><vvsep>+<value3>...<vvsep>* |
951 | | // |
952 | | // where key and value are strings; */+ means zero/one or more; <kvsep> is |
953 | | // a delimiter character to separate key and value; and <vvsep> is a delimiter |
954 | | // character to separate between values. The user can specify a bunch of |
955 | | // delimiter characters using a string. For example, if the user specifies |
956 | | // the separator string as "\t ", then either ' ' or '\t' or any combination |
957 | | // of them will be treated as a separator. For <vvsep>, the user can specify an
958 | | // empty string to indicate there is only one value. |
959 | | // |
960 | | // Note: this function assumes the input string begins exactly with a |
961 | | // key. Therefore, if you use whitespaces to separate key and value, you |
962 | | // should not let whitespace precede the key in the input. Otherwise, you
963 | | // will get an empty string as the key. |
964 | | // |
965 | | // A line with no <kvsep> will return an empty string as the key, even if |
966 | | // <key> is non-empty! |
967 | | // |
968 | | // The syntax makes it impossible for a value to be the empty string. |
969 | | // It is possible for the number of values to be zero. |
970 | | // |
971 | | // Returns false if the line has no <kvsep> or if the number of values is |
972 | | // zero. |
973 | | // |
974 | | // ==> NEW API: Consider using the new Split API defined above. <== |
975 | | // |
976 | | // The SplitStringIntoKeyValues() function has some subtle and surprising |
977 | | // semantics in various corner cases. To avoid this the strings::Split API is |
978 | | // recommended. The following example shows how to split a string of delimited |
979 | | // key-value pairs into a vector of pairs using the strings::Split API. |
980 | | // |
981 | | // using strings::Split; |
982 | | // using strings::delimiter::AnyOf; |
983 | | // using strings::delimiter::Limit; |
984 | | // |
985 | | // pair<string, GStringPiece> key_values = |
986 | | // Split(line, Limit(AnyOf(kv_delim), 1)); |
987 | | // string key = key_values.first; |
988 | | // vector<string> values = Split(key_values.second, AnyOf(vv_delim)); |
989 | | // |
990 | | // ---------------------------------------------------------------------- |
991 | | bool SplitStringIntoKeyValues(const string& line, |
992 | | const string& key_value_delimiters, |
993 | | const string& value_value_delimiters, |
994 | | string* key, vector<string>* values); |
995 | | |
996 | | // ---------------------------------------------------------------------- |
997 | | // SplitStringIntoKeyValuePairs() |
998 | | // Split a line into a vector of <key, value> pairs. The line has |
999 | | // the following format: |
1000 | | // |
1001 | | // <kvpsep>*<key1><kvsep>+<value1><kvpsep>+<key2><kvsep>+<value2>...<kvpsep>* |
1002 | | // |
1003 | | // Where key and value are strings; */+ means zero/one or more. <kvsep> is |
1004 | | // a delimiter character to separate key and value and <kvpsep> is a delimiter |
1005 | | // character to separate key value pairs. The user can specify a bunch of |
1006 | | // delimiter characters using a string. |
1007 | | // |
1008 | | // Note: this function assumes each key-value pair begins exactly with a |
1009 | | // key. Therefore, if you use whitespaces to separate key and value, you |
1010 | | // should not let whitespace precede the key in the pair. Otherwise, you |
1011 | | // will get an empty string as the key. |
1012 | | // |
1013 | | // A pair with no <kvsep> will return empty strings as the key and value, |
1014 | | // even if <key> is non-empty! |
1015 | | // |
1016 | | // Returns false for pairs with no <kvsep> specified and for pairs with |
1017 | | // empty strings as values. |
1018 | | // |
1019 | | // ==> NEW API: Consider using the new Split API defined above. <== |
1020 | | // |
1021 | | // The SplitStringIntoKeyValuePairs() function has some subtle and surprising |
1022 | | // semantics in various corner cases. To avoid this the strings::Split API is |
1023 | | // recommended. The following example shows how to split a string of delimited |
1024 | | // key-value pairs into a vector of pairs using the strings::Split API. |
1025 | | // |
1026 | | // using strings::SkipEmpty; |
1027 | | // using strings::Split; |
1028 | | // using strings::delimiter::AnyOf; |
1029 | | // using strings::delimiter::Limit; |
1030 | | // |
1031 | | // vector<pair<string, string>> pairs; // or even map<string, string> |
1032 | | // for (GStringPiece sp : Split(line, AnyOf(pair_delim), SkipEmpty())) { |
1033 | | // pairs.push_back(Split(sp, Limit(AnyOf(kv_delim), 1), SkipEmpty())); |
1034 | | // } |
1035 | | // |
1036 | | // ---------------------------------------------------------------------- |
1037 | | bool SplitStringIntoKeyValuePairs(const string& line, |
1038 | | const string& key_value_delimiters, |
1039 | | const string& key_value_pair_delimiters, |
1040 | | vector<pair<string, string> >* kv_pairs); |
1041 | | |
1042 | | |
1043 | | // ---------------------------------------------------------------------- |
1044 | | // SplitLeadingDec32Values() |
1045 | | // SplitLeadingDec64Values() |
1046 | | // A simple parser for space-separated decimal int32/int64 values. |
1047 | | // Appends parsed integers to the end of the result vector, stopping |
1048 | | // at the first unparsable spot. Skips past leading and repeated |
1049 | | // whitespace (does not consume trailing whitespace), and returns |
1050 | | // a pointer beyond the last character parsed. |
1051 | | // -------------------------------------------------------------------- |
1052 | | const char* SplitLeadingDec32Values(const char* next, vector<int32>* result); |
1053 | | const char* SplitLeadingDec64Values(const char* next, vector<int64>* result); |
1054 | | |
1055 | | // ---------------------------------------------------------------------- |
1056 | | // SplitOneIntToken() |
1057 | | // SplitOneInt32Token() |
1058 | | // SplitOneUint32Token() |
1059 | | // SplitOneInt64Token() |
1060 | | // SplitOneUint64Token() |
1061 | | // SplitOneDoubleToken() |
1062 | | // SplitOneFloatToken() |
1063 | | // Parse a single "delim" delimited number from "*source" into "*value". |
1064 | | // Modify *source to point after the delimiter. |
1065 | | // If no delimiter is present after the number, set *source to NULL. |
1066 | | // |
1067 | | // If the start of *source is not a number, return false.
1068 | | // If the int is followed by the null character, return true. |
1069 | | // If the int is not followed by a character from delim, return false. |
1070 | | // If *source is NULL, return false. |
1071 | | // |
1072 | | // They cannot handle decimal numbers with leading 0s, since they will be |
1073 | | // treated as octal. |
1074 | | // ---------------------------------------------------------------------- |
1075 | | bool SplitOneIntToken(const char** source, const char* delim, |
1076 | | int* value); |
1077 | | bool SplitOneInt32Token(const char** source, const char* delim, |
1078 | | int32* value); |
1079 | | bool SplitOneUint32Token(const char** source, const char* delim, |
1080 | | uint32* value); |
1081 | | bool SplitOneInt64Token(const char** source, const char* delim, |
1082 | | int64* value); |
1083 | | bool SplitOneUint64Token(const char** source, const char* delim, |
1084 | | uint64* value); |
1085 | | bool SplitOneDoubleToken(const char** source, const char* delim, |
1086 | | double* value); |
1087 | | bool SplitOneFloatToken(const char** source, const char* delim, |
1088 | | float* value); |
1089 | | |
1090 | | // Some aliases, so that the function names are standardized against the names |
1091 | | // of the reflection setters/getters in proto2. This makes it easier to use |
1092 | | // certain macros with reflection when creating custom text formats for protos. |
1093 | | |
1094 | | inline bool SplitOneUInt32Token(const char** source, const char* delim,
1095 | 0 | uint32* value) {
1096 | 0 | return SplitOneUint32Token(source, delim, value);  // Alias: forwards all arguments unchanged.
1097 | 0 | }
1098 | | |
1099 | | inline bool SplitOneUInt64Token(const char** source, const char* delim,
1100 | 0 | uint64* value) {
1101 | 0 | return SplitOneUint64Token(source, delim, value);  // Alias: forwards all arguments unchanged.
1102 | 0 | }
1103 | | |
1104 | | // ---------------------------------------------------------------------- |
1105 | | // SplitOneDecimalIntToken() |
1106 | | // SplitOneDecimalInt32Token() |
1107 | | // SplitOneDecimalUint32Token() |
1108 | | // SplitOneDecimalInt64Token() |
1109 | | // SplitOneDecimalUint64Token() |
1110 | | // Parse a single "delim"-delimited number from "*source" into "*value". |
1111 | | // Unlike SplitOneIntToken, etc., this function always interprets |
1112 | | // the numbers as decimal. |
1113 | | bool SplitOneDecimalIntToken(const char** source, const char* delim, |
1114 | | int* value); |
1115 | | bool SplitOneDecimalInt32Token(const char** source, const char* delim, |
1116 | | int32* value); |
1117 | | bool SplitOneDecimalUint32Token(const char** source, const char* delim, |
1118 | | uint32* value); |
1119 | | bool SplitOneDecimalInt64Token(const char** source, const char* delim, |
1120 | | int64* value); |
1121 | | bool SplitOneDecimalUint64Token(const char** source, const char* delim, |
1122 | | uint64* value); |
1123 | | |
1124 | | // ---------------------------------------------------------------------- |
1125 | | // SplitOneHexUint32Token() |
1126 | | // SplitOneHexUint64Token() |
1127 | | // Once more, for hexadecimal numbers (unsigned only). |
1128 | | bool SplitOneHexUint32Token(const char** source, const char* delim, |
1129 | | uint32* value); |
1130 | | bool SplitOneHexUint64Token(const char** source, const char* delim, |
1131 | | uint64* value); |
1132 | | |
1133 | | |
1134 | | // ###################### TEMPLATE INSTANTIATIONS BELOW ####################### |
1135 | | |
1136 | | // SplitStringAndParse() -- see description above; forwards to SplitStringAndParseToList().
1137 | | template <class T>
1138 | | bool SplitStringAndParse(GStringPiece source, GStringPiece delim,
1139 | | bool (*parse)(const string& str, T* value),
1140 | | vector<T>* result) {
1141 | | return SplitStringAndParseToList(source, delim, parse, result);
1142 | | }
1143 | | |
1144 | | namespace strings { |
1145 | | namespace internal { |
1146 | | |
1147 | | template <class Container, class InsertPolicy>  // Shared worker for the ToContainer/ToList wrappers.
1148 | | bool SplitStringAndParseToInserter(
1149 | | GStringPiece source, GStringPiece delim,
1150 | | bool (*parse)(const string& str, typename Container::value_type* value),
1151 | | Container* result, InsertPolicy insert_policy) {
1152 | | CHECK(NULL != parse);  // Preconditions: parse and result must be non-null,
1153 | | CHECK(NULL != result);  // and delim must be a non-empty piece with non-null
1154 | | CHECK(NULL != delim.data());  // data; a violation fails the CHECK instead of
1155 | | CHECK_GT(delim.size(), 0);  // making this function return false.
1156 | | bool retval = true;  // Cleared if any piece fails to parse.
1157 | | vector<GStringPiece> pieces = strings::Split(source,
1158 | | strings::delimiter::AnyOf(delim),  // Any one byte of delim separates pieces.
1159 | | strings::SkipEmpty());  // Empty pieces are dropped.
1160 | | for (const auto& piece : pieces) {
1161 | | typename Container::value_type t;  // Parsed into a temporary, not directly into *result.
1162 | | if (parse(piece.as_string(), &t)) {
1163 | | insert_policy(result, t);  // Only successfully parsed values reach *result.
1164 | | } else {
1165 | | retval = false;  // Remember the failure but keep processing later pieces.
1166 | | }
1167 | | }
1168 | | return retval;  // True iff every piece parsed successfully.
1169 | | }
1170 | | |
1171 | | // Insertion policies passed to SplitStringAndParseToInserter(). Output iterators
1172 | | // (e.g. std::inserter, std::back_inserter) cannot be used because some callers
1173 | | // pass non-standard containers that have no iterators, only insert()/push_back().
1174 | | struct BasicInsertPolicy {
1175 | | template <class C, class V>
1176 | | void operator()(C* c, const V& v) const { c->insert(v); }  // Single-argument insert().
1177 | | };
1178 | | |
1179 | | struct BackInsertPolicy {
1180 | | template <class C, class V>
1181 | | void operator()(C* c, const V& v) const { c->push_back(v); }  // Appends v via push_back().
1182 | | };
1183 | | |
1184 | | } // namespace internal |
1185 | | } // namespace strings |
1186 | | |
1187 | | // SplitStringAndParseToContainer() -- see description above; values are added via insert().
1188 | | template <class Container>
1189 | | bool SplitStringAndParseToContainer(
1190 | | GStringPiece source, GStringPiece delim,
1191 | | bool (*parse)(const string& str, typename Container::value_type* value),
1192 | | Container* result) {
1193 | | return strings::internal::SplitStringAndParseToInserter(
1194 | | source, delim, parse, result, strings::internal::BasicInsertPolicy());
1195 | | }
1196 | | |
1197 | | // SplitStringAndParseToList() -- see description above; values are added via push_back().
1198 | | template <class List>
1199 | | bool SplitStringAndParseToList(
1200 | | GStringPiece source, GStringPiece delim,
1201 | | bool (*parse)(const string& str, typename List::value_type* value),
1202 | | List* result) {
1203 | | return strings::internal::SplitStringAndParseToInserter(
1204 | | source, delim, parse, result, strings::internal::BackInsertPolicy());
1205 | | }
1206 | | |
1207 | | // END DOXYGEN SplitFunctions grouping |
1208 | | /* @} */ |
1209 | | |
1210 | | #endif // YB_GUTIL_STRINGS_SPLIT_H |