// trie.h
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iosfwd>
#include <limits>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "arrow/status.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace internal {
// A non-zero-terminated small string class.
// std::string usually has a small string optimization
// (see review at https://shaharmike.com/cpp/std-string/)
// but this one allows tight control and optimization of memory layout.
//
// The object stores a one-byte length followed by an inline buffer of N chars.
// Only the first `length()` bytes of the buffer are meaningful; the rest are
// left uninitialized.  Callers must never assign more than N bytes: this
// precondition is only checked by assertions in debug builds.
template <uint8_t N>
class SmallString {
 public:
  // Construct an empty string.
  SmallString() : length_(0) {}

  // Construct from anything explicitly convertible to std::string_view
  // (string literals, std::string, std::string_view, ...).
  template <typename T>
  SmallString(const T& v) {  // NOLINT implicit constructor
    *this = std::string_view(v);
  }

  // Copy the bytes of `s` into the inline buffer.
  // Precondition: s.size() <= N (asserted in debug builds).
  SmallString& operator=(const std::string_view s) {
#ifndef NDEBUG
    CheckSize(s.size());
#endif
    length_ = static_cast<uint8_t>(s.size());
    // An empty string_view may carry a null data() pointer, which must not be
    // handed to memcpy even for a zero-byte copy.
    if (length_ != 0) {
      std::memcpy(data_, s.data(), length_);
    }
    return *this;
  }

  SmallString& operator=(const std::string& s) {
    *this = std::string_view(s);
    return *this;
  }

  SmallString& operator=(const char* s) {
    *this = std::string_view(s);
    return *this;
  }

  // View over the stored bytes; valid as long as this object is alive and
  // not reassigned.
  explicit operator std::string_view() const { return std::string_view(data_, length_); }

  // Pointer to the first stored byte (not zero-terminated).
  const char* data() const { return data_; }
  // Number of stored bytes.
  size_t length() const { return length_; }
  bool empty() const { return length_ == 0; }

  // Return the byte at `pos`.
  // Precondition: pos < length() (asserted in debug builds).
  char operator[](size_t pos) const {
    // The check must be active when NDEBUG is *not* defined, like the one in
    // operator=; index `length_` itself is already past the last valid byte.
#ifndef NDEBUG
    assert(pos < length_);
#endif
    return data_[pos];
  }

  // Suffix starting at `pos`; same semantics as std::string_view::substr.
  SmallString substr(size_t pos) const {
    return SmallString(std::string_view(*this).substr(pos));
  }

  // At most `count` bytes starting at `pos`; same semantics as
  // std::string_view::substr.
  SmallString substr(size_t pos, size_t count) const {
    return SmallString(std::string_view(*this).substr(pos, count));
  }

  // Byte-wise comparison against anything convertible to std::string_view.
  template <typename T>
  bool operator==(T&& other) const {
    return std::string_view(*this) == std::string_view(std::forward<T>(other));
  }

  template <typename T>
  bool operator!=(T&& other) const {
    return std::string_view(*this) != std::string_view(std::forward<T>(other));
  }

 protected:
  uint8_t length_;
  char data_[N];

  // Debug-only capacity check used by operator=.
  void CheckSize(size_t n) { assert(n <= N); }
};
// Stream insertion for SmallString: writes exactly the stored bytes
// (there is no terminator to rely on) and hands the stream back for chaining.
template <uint8_t N>
std::ostream& operator<<(std::ostream& os, const SmallString<N>& str) {
  const auto view = static_cast<std::string_view>(str);
  os << view;
  return os;
}
// A trie class for byte strings, optimized for small sets of short strings.
// This class is immutable by design, use a TrieBuilder to construct it.
//
// Layout: every node holds an inline substring (path compression) plus an
// optional base index into `lookup_table_`, where a span of 256 entries maps
// the next input byte to a child node index (or -1).
class ARROW_EXPORT Trie {
  using index_type = int16_t;
  using fast_index_type = int_fast16_t;
  static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();

 public:
  // Construct an empty trie with no nodes; Find() on it always returns -1.
  Trie() : size_(0) {}
  Trie(Trie&&) = default;
  Trie& operator=(Trie&&) = default;

  // Look up `s` in the trie.
  //
  // Returns the `found_index_` of the node matched exactly by `s`
  // (which is -1 if that node is not a valid end of string), or -1 if
  // `s` does not match any path in the trie.
  int32_t Find(std::string_view s) const {
    // A default-constructed or moved-from Trie has no root node;
    // indexing nodes_[0] below would be undefined behaviour.
    if (nodes_.empty()) {
      return -1;
    }
    const Node* node = &nodes_[0];
    fast_index_type pos = 0;
    // Inputs longer than an index can represent cannot be stored in the trie
    if (s.length() > static_cast<size_t>(kMaxIndex)) {
      return -1;
    }
    fast_index_type remaining = static_cast<fast_index_type>(s.length());

    while (remaining > 0) {
      auto substring_length = node->substring_length();
      if (substring_length > 0) {
        // First consume the substring stored inline in this node
        auto substring_data = node->substring_data();
        if (remaining < substring_length) {
          // Input too short
          return -1;
        }
        for (fast_index_type i = 0; i < substring_length; ++i) {
          if (s[pos++] != substring_data[i]) {
            // Mismatching substring
            return -1;
          }
          --remaining;
        }
        if (remaining == 0) {
          // Matched node exactly
          return node->found_index_;
        }
      }
      // Lookup child using next input character
      if (node->child_lookup_ == -1) {
        // Input too long
        return -1;
      }
      auto c = static_cast<uint8_t>(s[pos++]);
      --remaining;
      // Each node with children owns a span of 256 entries, one per byte value
      auto child_index = lookup_table_[node->child_lookup_ * 256 + c];
      if (child_index == -1) {
        // Child not found
        return -1;
      }
      node = &nodes_[child_index];
    }

    // Input exhausted
    if (node->substring_.empty()) {
      // Matched node exactly
      return node->found_index_;
    } else {
      // The node still expects more characters than the input provides
      return -1;
    }
  }

  // Defined out of line (not visible in this header).
  Status Validate() const;

  // Defined out of line (not visible in this header).
  void Dump() const;

 protected:
  // Each node is sized to exactly kNodeSize bytes (see static_assert below).
  static constexpr size_t kNodeSize = 16;
  // Bytes left for the inline substring once the two indices and the
  // one-byte SmallString length are accounted for.
  static constexpr auto kMaxSubstringLength =
      kNodeSize - 2 * sizeof(index_type) - sizeof(int8_t);

  struct Node {
    // If this node is a valid end of string, index of found string, otherwise -1
    index_type found_index_;
    // Base index for child lookup in lookup_table_ (-1 if no child nodes)
    index_type child_lookup_;
    // The substring for this node.
    SmallString<kMaxSubstringLength> substring_;

    fast_index_type substring_length() const {
      return static_cast<fast_index_type>(substring_.length());
    }
    const char* substring_data() const { return substring_.data(); }
  };

  static_assert(sizeof(Node) == kNodeSize, "Unexpected node size");

  ARROW_DISALLOW_COPY_AND_ASSIGN(Trie);

  void Dump(const Node* node, const std::string& indent) const;

  // Node table: entry 0 is the root node
  std::vector<Node> nodes_;

  // Indexed lookup structure: gives index in node table, or -1 if not found
  std::vector<index_type> lookup_table_;

  // Number of entries
  index_type size_;

  friend class TrieBuilder;
};
// Mutable companion of Trie: owns a Trie under construction (`trie_`) and
// hands it out through Finish().  All member functions are defined out of
// line, so the notes below describe only what the declarations establish.
class ARROW_EXPORT TrieBuilder {
// Reuse Trie's non-public index types (accessible because Trie declares
// TrieBuilder a friend).
using index_type = Trie::index_type;
using fast_index_type = Trie::fast_index_type;
public:
TrieBuilder();
// Add the string `s` to the trie being built; failures are reported via Status.
// NOTE(review): `allow_duplicate` presumably controls whether appending an
// already-present string is accepted -- confirm against the definition.
Status Append(std::string_view s, bool allow_duplicate = false);
// Return the built Trie by value.
// NOTE(review): Trie is move-only, so this presumably moves `trie_` out and
// leaves the builder without a usable trie -- confirm against the definition.
Trie Finish();
protected:
// Extend the lookup table by 256 entries, return the index of the new span
Status ExtendLookupTable(index_type* out_lookup_index);
// Split the node given by the index at the substring index `split_at`
Status SplitNode(fast_index_type node_index, fast_index_type split_at);
// Append an already constructed child node to the parent
Status AppendChildNode(Trie::Node* parent, uint8_t ch, Trie::Node&& node);
// Create a matching child node from this parent
Status CreateChildNode(Trie::Node* parent, uint8_t ch, std::string_view substring);
// Same as above, taking the child byte as a plain `char`.
// NOTE(review): presumably forwards to the uint8_t overload -- confirm.
Status CreateChildNode(Trie::Node* parent, char ch, std::string_view substring);
// The trie under construction.
Trie trie_;
// Largest value representable by index_type.
static constexpr auto kMaxIndex = std::numeric_limits<index_type>::max();
};
} // namespace internal
} // namespace arrow