Unit Hat_Trie;
{
* This file is part of hat-trie.
*
* Copyright (c) 2011 by Daniel C. Jones
*
}
Interface
Uses hat_table;
Const
// maximum number of keys that may be stored in a bucket before it is burst
MAX_BUCKET_SIZE = 16384;
NODE_MAXCHAR = 255; // 0x7f for 7-bit ASCII
NODE_CHILDS = NODE_MAXCHAR+1;
Type
PTrieNode = ^trie_node;
// Node's may be trie nodes or buckets. This union allows us to keep non-specific pointer.
node_ptr = record
case Byte of
0:(b:TahTable);
1:(t:PTrieNode);
//2:(flag:PHatNodeSet);
end;
Pnode_ptr = ^node_ptr;
trie_node = record
flag:THatNodeSet;
// the value for the key that is consumed on a trie node
val:Integer;
// Map a character to either a trie_node_t or a ahtable_t. The first byte
// must be examined to determine which.
xs:Array[0..NODE_CHILDS-1] of node_ptr;
end;
hattrie = record
root:node_ptr; // root node
m:Cardinal; // number of stored keys
end;
Phattrie = ^hattrie;
// plan for iteration:
// This is tricky, as we have no parent pointers currently, and I would like to
// avoid adding them. That means maintaining a stack
Phattrie_node_stack = ^hattrie_node_stack;
hattrie_node_stack = record
c:Char;
level:Cardinal;
node:node_ptr;
next:Phattrie_node_stack;
end;
{
hattrie_iter = record
key:PAnsiChar;
keysize:Cardinal; // space reserved for the key
level:Cardinal;
// keep track of keys stored in trie nodes
has_nil_key:Boolean;
nil_val:Integer;
T:Phattrie;
i:Pahtable_iter;
stack:Phattrie_node_stack;
end;
Phattrie_iter = ^hattrie_iter;
}
THatTrie = class
Private
FHat:Phattrie;
Function ClearVal (n:node_ptr):Integer;
Function FindNode (Var key:PAnsiChar; Var len:Cardinal):node_ptr;
Function UseVal (n:node_ptr):PInteger;
Public
Constructor Create;
Destructor Destroy; Override;
Procedure Clear;
// Find the given key in the trie, inserting it if it does not exist, and
// returning a pointer to it's key.
// This pointer is not guaranteed to be valid after additional calls to
// GetValPtr, Delete, Clear, or other functions that modifies the trie.
Function Get (key:PAnsiChar; len:Cardinal):PInteger;
// Find a given key in the table, returning a NULL pointer if it does not exist
Function Find (key:PAnsiChar; len:Cardinal):PInteger;
// Delete a given key from trie. Returns 0 if successful or -1 if not found
Function Delete (key:PAnsiChar; len:Cardinal):Integer;
end;
{
Function hattrie_iter_Begin (Const T:Phattrie):Phattrie_iter;
Function hattrie_iter_finished (i:Phattrie_iter):Boolean;
Procedure hattrie_iter_next (i:Phattrie_iter);
Procedure hattrie_iter_free (i:Phattrie_iter);
}
Implementation
// ============= utility functions =============
// Create a new trie node with all pointers pointing to the given child (which can be NULL).
Function AllocTrieNode(child:node_ptr):PTrieNode;
Var
i:Integer;
Begin
New(Result);
Result.flag := [NODE_TYPE_TRIE];
Result.val := 0;
for i := 0 to NODE_CHILDS-1 do Result.xs[i] := child;
End;
Procedure FreeNode(node:node_ptr);
var
i:Integer;
Begin
if NODE_TYPE_TRIE in node.t.flag then
begin
for i := 0 to NODE_CHILDS-1 do
begin
if (i > 0) and (node.t.xs[i].t = node.t.xs[i - 1].t) then continue;
// XXX: recursion might not be the best choice here. It is possible
// to build a very deep trie.
if Assigned(node.t.xs[i].t) then FreeNode(node.t.xs[i]);
end;
Dispose(node.t);
end
else node.b.Free;
End;
// iterate trie nodes until string is consumed or bucket is found
Function hattrie_consume(var p:node_ptr; var k:PAnsiChar; var l:Cardinal; brk:Cardinal):node_ptr;
Begin
Result:= p.t.xs[Byte(k^)];
while (NODE_TYPE_TRIE in Result.t.flag) and (l > brk) do
Begin
Inc(k);
Dec(l);
p := Result;
Result := Result.t.xs[Byte(k^)];
End;
// copy and writeback variables if it's faster
assert(NODE_TYPE_TRIE in p.t.flag);
End;
// Perform one split operation on the given node with the given parent.
Procedure Split(parent,node:node_ptr);
Var
val:PInteger;
len,num_slots,c:Cardinal;
key:PAnsiChar;
iter:TahIterator;
cs:Array[0..NODE_CHILDS-1] of Cardinal; // occurance count for leading chars
left_m, right_m, all_m:Cardinal;
j:Byte;
d:Integer;
left, right:node_ptr;
u,v:PInteger;
Begin
// only buckets may be split
assert((NODE_TYPE_PURE_BUCKET in node.b.flag) or (NODE_TYPE_HYBRID_BUCKET in node.b.flag));
assert(NODE_TYPE_TRIE in parent.t.flag);
if NODE_TYPE_PURE_BUCKET in node.b.flag then
begin
// turn the pure bucket into a hybrid bucket
parent.t.xs[node.b.c0].t := AllocTrieNode(node);
// if the bucket had an empty key, move it to the new trie node
val := node.b.Find(Nil, 0);
if Assigned(val) then
begin
parent.t.xs[node.b.c0].t.val := val^;
Include(parent.t.xs[node.b.c0].t.flag, NODE_HAS_VAL);
val^ := 0;
node.b.Delete(Nil, 0);
End;
with node.b do
begin
c0 := 0;
c1 := NODE_MAXCHAR;
flag := [NODE_TYPE_HYBRID_BUCKET];
end; ;
Exit;
End;
// This is a hybrid bucket. Perform a proper split.
// count the number of occourances of every leading character
FillChar(cs[0], Length(cs) * SizeOf(Cardinal),0);
iter := TahIterator.Create(node.b);
while not iter.Finished do
begin
key := iter.GetKey(len);
assert(len > 0);
Inc(cs[Byte(key[0])], 1);
iter.Next;
end;
iter.Free;
// choose a split point
j := node.b.c0;
all_m := node.b.ms;
left_m := cs[j];
right_m := all_m - left_m;
while j + 1 < node.b.c1 do
begin
d := abs(Integer(left_m + cs[j + 1]) - Integer(right_m - cs[j + 1]));
if (d <= abs(left_m - right_m)) and (left_m + cs[j + 1] < all_m) then
begin
Inc(j, 1);
Inc(left_m, cs[j]);
Dec(right_m, cs[j]);
end
else break;
end;
// now split into two node cooresponding to ranges [0, j] and [j + 1, NODE_MAXCHAR], respectively.
// create new left and right nodes
// TODO: Add a special case if either node is a hybrid bucket containing all
// the keys. In such a case, do not build a new table, just use the old one.
num_slots := ahtable_initial_size;
while left_m > ahtable_max_load_factor * num_slots do num_slots:= num_slots * 2;
left.b := TahTable.Create(num_slots);
left.b.c0 := node.b.c0;
left.b.c1 := j;
if left.b.c0 = left.b.c1 then left.b.flag := [NODE_TYPE_PURE_BUCKET]
else left.b.flag:= [NODE_TYPE_HYBRID_BUCKET];
num_slots := ahtable_initial_size;
while right_m > ahtable_max_load_factor * num_slots do num_slots:=num_slots * 2;
right.b := TahTable.Create(num_slots);
right.b.c0 := j + 1;
right.b.c1 := node.b.c1;
if right.b.c0 = right.b.c1 then right.b.flag := [NODE_TYPE_PURE_BUCKET]
else right.b.flag:= [NODE_TYPE_HYBRID_BUCKET];
// update the parent's pointer
c := node.b.c0;
while c<=j do
begin
parent.t.xs[c] := left;
Inc(c);
end;
while c <= node.b.c1 do
begin
parent.t.xs[c] := right;
Inc(c);
end;
// distribute keys to the new left or right node
iter := TahIterator.Create(node.b);
while not iter.Finished do
begin
key := iter.GetKey(len);
u := iter.GetVal;
assert(len > 0);
// left
if Byte(key[0]) <= j then
begin
if NODE_TYPE_PURE_BUCKET in left.b.flag then
v := left.b.Get(key + 1, len - 1)
else
v := left.b.Get(key, len);
v^ := u^;
end
// right
else
begin
if NODE_TYPE_PURE_BUCKET in right.b.flag then
v := right.b.Get(key + 1, len - 1)
else
v := right.b.Get(key, len);
v^ := u^;
end;
iter.Next;
end;
iter.Free;
node.b.Free;
End;
// ============= Methos =============
Constructor THatTrie.Create;
Var
node:node_ptr;
Begin
New(FHat);
FHat.m := 0;
node.b := TahTable.Create;
with node.b do
begin
flag := [NODE_TYPE_HYBRID_BUCKET];
c0 := 0;
c1 := NODE_MAXCHAR;
end;
FHat.root.t := AllocTrieNode(node);
End;
Destructor THatTrie.Destroy;
Begin
FreeNode(FHat.root);
Dispose(FHat);
Inherited;
End;
Procedure THatTrie.Clear;
var
node:node_ptr;
Begin
FreeNode(FHat.root);
node.b := TahTable.Create;
with node.b do
begin
flag := [NODE_TYPE_HYBRID_BUCKET];
c0 := 0;
c1 := 255;
end;
FHat.root.t := AllocTrieNode(node);
End;
// use node value and return pointer to it
Function THatTrie.UseVal(n:node_ptr):PInteger;
Begin
if not (NODE_HAS_VAL in n.t.flag) then
begin
Include(n.t.flag, NODE_HAS_VAL);
Inc(FHat.m);
end;
Result:= @n.t.val;
End;
// clear node value if exists
Function THatTrie.ClearVal(n:node_ptr):Integer;
Begin
if NODE_HAS_VAL in n.t.flag then
begin
Exclude(n.t.flag, NODE_HAS_VAL);
n.t.val := 0;
Dec(FHat.m);
Result:= 0;
end
else Result:= -1;
End;
// find node in trie
Function THatTrie.FindNode(var key:PAnsiChar; var len:Cardinal):node_ptr;
var
parent:node_ptr;
Begin
parent := FHat.root;
assert(NODE_TYPE_TRIE in parent.t.flag);
if len = 0 then
begin
Result:=parent;
Exit;
end;
Result:= hattrie_consume(parent, key, len, 1);
// if the trie node consumes value, use it
if NODE_TYPE_TRIE in Result.t.flag then
begin
if not(NODE_HAS_VAL in Result.t.flag) then Result.t.flag := [];
Exit;
End;
// pure bucket holds only key suffixes, skip current char
if NODE_TYPE_PURE_BUCKET in Result.b.flag then
begin
Inc(key, 1);
Dec(len, 1);
end;
// do not scan bucket, it's not needed for this operation
End;
Function THatTrie.Get(key:PAnsiChar; len:Cardinal):PInteger;
var
node,parent:node_ptr;
m_old:Cardinal;
val:PInteger;
Begin
parent := FHat.root;
assert(NODE_TYPE_TRIE in parent.t.flag);
if len = 0 then
begin
Result:= @parent.t.val;
Exit;
End;
// consume all trie nodes, now parent must be trie and child anything
node := hattrie_consume(parent, key, len, 0);
assert(NODE_TYPE_TRIE in parent.t.flag);
// if the key has been consumed on a trie node, use its value
if len = 0 then
begin
if NODE_TYPE_TRIE in node.t.flag then
begin
Result:= UseVal(node);
Exit;
end
else if NODE_TYPE_HYBRID_BUCKET in node.b.flag then
begin
Result:=UseVal(parent);
Exit;
end;
end;
// preemptively split the bucket if it is full
while node.b.ms >= MAX_BUCKET_SIZE do
begin
Split(parent, node);
// after the split, the node pointer is invalidated, so we search from
// the parent again.
node := hattrie_consume(parent, key, len, 0);
// if the key has been consumed on a trie node, use its value
if len = 0 then
begin
if NODE_TYPE_TRIE in node.t.flag then
begin
Result:= UseVal(node);
Exit;
end
else if NODE_TYPE_HYBRID_BUCKET in node.b.flag then
begin
Result:=UseVal(parent);
Exit;
end;
end;
end;
assert((NODE_TYPE_PURE_BUCKET in node.b.flag) or (NODE_TYPE_HYBRID_BUCKET in node.b.flag));
assert(len > 0);
m_old := node.b.ms;
if NODE_TYPE_PURE_BUCKET in node.b.flag then
val := node.b.Get(key + 1, len - 1)
else
val := node.b.Get(key, len);
Inc(FHat.m, node.b.ms - m_old);
Result:=val;
End;
Function THatTrie.Find(key:PAnsiChar; len:Cardinal):PInteger;
var
node:node_ptr;
Begin
// find node for given key
node := FindNode(key, len);
if node.b = Nil then
begin
Result:=Nil;
Exit;
End;
// if the trie node consumes value, use it
if NODE_TYPE_TRIE in node.t.flag then
begin
Result:= @node.t.val;
Exit;
End;
Result:= node.b.Find(key, len);
End;
Function THatTrie.Delete(key:PAnsiChar; len:Cardinal):Integer;
var
node,parent:node_ptr;
m_old:Cardinal;
ret:Integer;
Begin
parent := FHat.root;
assert(NODE_TYPE_TRIE in parent.t.flag);
// find node for deletion
node := FindNode(key, len);
if node.b = Nil then
begin
Result:= -1;
Exit;
End;
// if consumed on a trie node, clear the value
if NODE_TYPE_TRIE in node.t.flag then
begin
Result:= ClearVal(node);
Exit;
End;
// remove from bucket
m_old := node.b.ms;
ret := node.b.Delete(key, len);
Dec(FHat.m, m_old - node.b.ms);
// TODO - merge empty buckets
Result:=ret;
End;
{
Procedure hattrie_iter_pushchar(i:Phattrie_iter; level:Cardinal; c:Char);
Begin
if i.keysize < level then
begin
i.keysize:=i.keysize * 2;
ReallocMem(i.key, i.keysize * SizeOf(Char));
end;
if level > 0 then i.key[level - 1] := c;
i.level := level;
End;
Procedure hattrie_iter_nextnode(i:Phattrie_iter);
var
node:node_ptr;
next:Phattrie_node_stack;
c:Char;
level:Cardinal;
j:Integer;
Begin
if not Assigned(i.stack) then Exit;
// pop the stack
node := i.stack.node;
next := i.stack.next;
c := i.stack.c;
level := i.stack.level;
Dispose(i.stack);
i.stack := next;
if NODE_TYPE_TRIE in node.flag^ then
begin
hattrie_iter_pushchar(i, level, c);
if NODE_HAS_VAL in node.t.flag then
begin
i.has_nil_key := true;
i.nil_val := node.t.val;
end;
// push all child nodes from right to left
for j := NODE_MAXCHAR downto 0 do
begin
// skip repeated pointers to hybrid bucket
if (j < NODE_MAXCHAR) and (node.t.xs[j].t = node.t.xs[j + 1].t) then continue;
// push stack
next := i.stack;
New(i.stack);
i.stack.node := node.t.xs[j];
i.stack.next := next;
i.stack.level := level + 1;
i.stack.c := Chr(j);
end;
end
else
begin
if NODE_TYPE_PURE_BUCKET in node.flag^ then
hattrie_iter_pushchar(i, level, c)
else
i.level := level - 1;
i.i := ahtable_iter_begin(node.b);
end;
End;
function hattrie_iter_Begin(Const T:Phattrie): Phattrie_iter;
Begin
New(Result);
Result.T := T;
Result.i := Nil;
Result.keysize := 16;
GetMem(Result.key,Result.keysize * SizeOf(Char));
Result.level := 0;
Result.has_nil_key := false;
Result.nil_val := 0;
New(Result.stack);
Result.stack.next := Nil;
Result.stack.node := T.root;
Result.stack.c := #0;
Result.stack.level := 0;
while ((Result.i = Nil) or ahtable_iter_finished(Result.i)) and not Result.has_nil_key and Assigned(Result.stack) do
begin
ahtable_iter_free(Result.i);
Result.i := Nil;
hattrie_iter_nextnode(Result);
end;
if Assigned(Result.i) and ahtable_iter_finished(Result.i) then
begin
ahtable_iter_free(Result.i);
Result.i := Nil;
end;
End;
Function hattrie_iter_finished(i:Phattrie_iter):Boolean;
Begin
Result:= (i.stack = Nil) and (i.i = Nil) and not i.has_nil_key;
End;
Procedure hattrie_iter_next(i:Phattrie_iter);
Begin
if hattrie_iter_finished(i) then Exit;
if Assigned(i.i) and not ahtable_iter_finished(i.i) then ahtable_iter_next(i.i)
else if i.has_nil_key then
begin
i.has_nil_key := false;
i.nil_val := 0;
hattrie_iter_nextnode(i);
end;
while ((i.i = Nil) or ahtable_iter_finished(i.i)) and not i.has_nil_key and Assigned(i.stack) do
begin
ahtable_iter_free(i.i);
i.i := Nil;
hattrie_iter_nextnode(i);
end;
if Assigned(i.i) and ahtable_iter_finished(i.i) then
begin
ahtable_iter_free(i.i);
i.i := Nil;
end;
End;
Procedure hattrie_iter_free(i:Phattrie_iter);
var
next:Phattrie_node_stack;
Begin
if not Assigned(i) then exit;
if Assigned(i.i) then ahtable_iter_free(i.i);
while Assigned(i.stack) do
begin
next := i.stack.next;
Dispose(i.stack);
i.stack := next;
end;
FreeMem(i.key);
Dispose(i);
End;
Function hattrie_iter_key(i:Phattrie_iter; len:PCardinal):PAnsiChar;
var
sublen:Cardinal;
subkey:PAnsiChar;
Begin
Result:=Nil;
if hattrie_iter_finished(i) then Exit;
if i.has_nil_key then
begin
subkey := Nil;
sublen := 0;
end
else subkey := ahtable_iter_key(i.i, @sublen);
if i.keysize < i.level + sublen + 1 then
begin
while i.keysize < i.level + sublen + 1 do i.keysize:=i.keysize * 2;
ReallocMem(i.key, i.keysize * SizeOf(Char));
end;
Move(subkey^,(i.key + i.level)^, sublen);
i.key[i.level + sublen] := #0;
if Assigned(len) then len^ := i.level + sublen;
Result:= i.key;
End;
Function hattrie_iter_val(i:Phattrie_iter):PInteger;
Begin
if i.has_nil_key then Result:= @i.nil_val
else if hattrie_iter_finished(i) then Result:=Nil
else Result:= ahtable_iter_val(i.i);
End;
Function hattrie_iter_equal(a,b:Phattrie_iter):Boolean;
Begin
Result:= (a.T = b.T) and (a.i = b.i);
End;
}
End.