using System; using System.Collections; using System.Text; /* * $Id: TernaryTree.cs,v 1.2 2005/06/18 08:17:05 psoares33 Exp $ * Copyright (C) 2001 The Apache Software Foundation. All rights reserved. * For details on use and redistribution please refer to the * LICENSE file included with these sources. */ namespace iTextSharp.text.pdf.hyphenation { /** *
A ternary search tree is a hibrid between a binary tree and * a digital search tree (trie). Keys are limited to strings. * A data value of type char is stored in each leaf node. * It can be used as an index (or pointer) to the data. * Branches that only contain one key are compressed to one node * by storing a pointer to the trailer substring of the key. * This class is intended to serve as base class or helper class * to implement Dictionary collections or the like. Ternary trees * have some nice properties as the following: the tree can be * traversed in sorted order, partial matches (wildcard) can be * implemented, retrieval of all keys within a given distance * from the target, etc. The storage requirements are higher than * a binary tree but a lot less than a trie. Performance is * comparable with a hash table, sometimes it outperforms a hash * function (most of the time can determine a miss faster than a hash).
* *The main purpose of this java port is to serve as a base for * implementing TeX's hyphenation algorithm (see The TeXBook, * appendix H). Each language requires from 5000 to 15000 hyphenation * patterns which will be keys in this tree. The strings patterns * are usually small (from 2 to 5 characters), but each char in the * tree is stored in a node. Thus memory usage is the main concern. * We will sacrify 'elegance' to keep memory requirenments to the * minimum. Using java's char type as pointer (yes, I know pointer * it is a forbidden word in java) we can keep the size of the node * to be just 8 bytes (3 pointers and the data char). This gives * room for about 65000 nodes. In my tests the english patterns * took 7694 nodes and the german patterns 10055 nodes, * so I think we are safe.
* *All said, this is a map with strings as keys and char as value. * Pretty limited!. It can be extended to a general map by * using the string representation of an object and using the * char value as an index to an array that contains the object * values.
* * @author cav@uniscope.co.jp */ public class TernaryTree : ICloneable { /** * We use 4 arrays to represent a node. I guess I should have created * a proper node class, but somehow Knuth's pascal code made me forget * we now have a portable language with memory management and * automatic garbage collection! And now is kind of late, furthermore, * if it ain't broken, don't fix it. */ /** * Pointer to low branch and to rest of the key when it is * stored directly in this node, we don't have unions in java! */ protected char[] lo; /** * Pointer to high branch. */ protected char[] hi; /** * Pointer to equal branch and to data when this node is a string terminator. */ protected char[] eq; /** *The character stored in this node: splitchar * Two special values are reserved:
*This shouldn't be a problem if we give the usual semantics to * strings since 0xFFFF is garanteed not to be an Unicode character.
*/ protected char[] sc; /** * This vector holds the trailing of the keys when the branch is compressed. */ protected CharVector kv; protected char root; protected char freenode; protected int length; // number of items in tree protected static int BLOCK_SIZE = 2048; // allocation size for arrays internal TernaryTree() { Init(); } protected void Init() { root = (char)0; freenode = (char)1; length = 0; lo = new char[BLOCK_SIZE]; hi = new char[BLOCK_SIZE]; eq = new char[BLOCK_SIZE]; sc = new char[BLOCK_SIZE]; kv = new CharVector(); } /** * Branches are initially compressed, needing * one node per key plus the size of the string * key. They are decompressed as needed when * another key with same prefix * is inserted. This saves a lot of space, * specially for long keys. */ public void Insert(string key, char val) { // make sure we have enough room in the arrays int len = key.Length + 1; // maximum number of nodes that may be generated if (freenode + len > eq.Length) RedimNodeArrays(eq.Length + BLOCK_SIZE); char[] strkey = new char[len--]; key.CopyTo(0, strkey, 0, len); strkey[len] = (char)0; root = Insert(root, strkey, 0, val); } public void Insert(char[] key, int start, char val) { int len = Strlen(key) + 1; if (freenode + len > eq.Length) RedimNodeArrays(eq.Length + BLOCK_SIZE); root = Insert(root, key, start, val); } /** * The actual insertion function, recursive version. */ private char Insert(char p, char[] key, int start, char val) { int len = Strlen(key, start); if (p == 0) { // this means there is no branch, this node will start a new branch. // Instead of doing that, we store the key somewhere else and create // only one node with a pointer to the key p = freenode++; eq[p] = val; // holds data length++; hi[p] = (char)0; if (len > 0) { sc[p] = (char)0xFFFF; // indicates branch is compressed lo[p] = (char)kv.Alloc(len + 1); // use 'lo' to hold pointer to key Strcpy(kv.Arr, lo[p], key, start); } else { sc[p] = (char)0; lo[p] = (char)0; } return p; } if (sc[p] == 0xFFFF) { // branch is compressed: need to decompress // this will generate garbage in the external key array // but we can do some garbage collection later char pp = freenode++; lo[pp] = lo[p]; // previous pointer to key eq[pp] = eq[p]; // previous pointer to data lo[p] = (char)0; if (len > 0) { sc[p] = kv[lo[pp]]; eq[p] = pp; lo[pp]++; if (kv[lo[pp]] == 0) { // key completly decompressed leaving garbage in key array lo[pp] = (char)0; sc[pp] = (char)0; hi[pp] = (char)0; } else sc[pp] = (char)0xFFFF; // we only got first char of key, rest is still there } else { // In this case we can save a node by swapping the new node // with the compressed node sc[pp] = (char)0xFFFF; hi[p] = pp; sc[p] = (char)0; eq[p] = val; length++; return p; } } char s = key[start]; if (s < sc[p]) lo[p] = Insert(lo[p], key, start, val); else if (s == sc[p]) { if (s != 0) eq[p] = Insert(eq[p], key, start + 1, val); else { // key already in tree, overwrite data eq[p] = val; } } else hi[p] = Insert(hi[p], key, start, val); return p; } /** * Compares 2 null terminated char arrays */ public static int Strcmp(char[] a, int startA, char[] b, int startB) { for (; a[startA] == b[startB]; startA++, startB++) if (a[startA] == 0) return 0; return a[startA] - b[startB]; } /** * Compares a string with null terminated char array */ public static int Strcmp(string str, char[] a, int start) { int i, d, len = str.Length; for (i = 0; i < len; i++) { d = (int)str[i] - a[start + i]; if (d != 0) return d; if (a[start + i] == 0) return d; } if (a[start + i] != 0) return (int)-a[start + i]; return 0; } public static void Strcpy(char[] dst, int di, char[] src, int si) { while (src[si] != 0) dst[di++] = src[si++]; dst[di] = (char)0; } public static int Strlen(char[] a, int start) { int len = 0; for (int i = start; i < a.Length && a[i] != 0; i++) len++; return len; } public static int Strlen(char[] a) { return Strlen(a, 0); } public int Find(string key) { int len = key.Length; char[] strkey = new char[len + 1]; key.CopyTo(0, strkey, 0, len); strkey[len] = (char)0; return Find(strkey, 0); } public int Find(char[] key, int start) { int d; char p = root; int i = start; char c; while (p != 0) { if (sc[p] == 0xFFFF) { if (Strcmp(key, i, kv.Arr, lo[p]) == 0) return eq[p]; else return -1; } c = key[i]; d = c - sc[p]; if (d == 0) { if (c == 0) return eq[p]; i++; p = eq[p]; } else if (d < 0) p = lo[p]; else p = hi[p]; } return -1; } public bool Knows(string key) { return (Find(key) >= 0); } // redimension the arrays private void RedimNodeArrays(int newsize) { int len = newsize < lo.Length ? newsize : lo.Length; char[] na = new char[newsize]; Array.Copy(lo, 0, na, 0, len); lo = na; na = new char[newsize]; Array.Copy(hi, 0, na, 0, len); hi = na; na = new char[newsize]; Array.Copy(eq, 0, na, 0, len); eq = na; na = new char[newsize]; Array.Copy(sc, 0, na, 0, len); sc = na; } public int Size { get { return length; } } public Object Clone() { TernaryTree t = new TernaryTree(); t.lo = (char[])this.lo.Clone(); t.hi = (char[])this.hi.Clone(); t.eq = (char[])this.eq.Clone(); t.sc = (char[])this.sc.Clone(); t.kv = (CharVector)this.kv.Clone(); t.root = this.root; t.freenode = this.freenode; t.length = this.length; return t; } /** * Recursively insert the median first and then the median of the * lower and upper halves, and so on in order to get a balanced * tree. The array of keys is assumed to be sorted in ascending * order. */ protected void InsertBalanced(string[] k, char[] v, int offset, int n) { int m; if (n < 1) return; m = n >> 1; Insert(k[m + offset], v[m + offset]); InsertBalanced(k, v, offset, m); InsertBalanced(k, v, offset + m + 1, n - m - 1); } /** * Balance the tree for best search performance */ public void Balance() { // System.out.Print("Before root splitchar = "); System.out.Println(sc[root]); int i = 0, n = length; string[] k = new string[n]; char[] v = new char[n]; Iterator iter = new Iterator(this); while (iter.HasMoreElements()) { v[i] = iter.Value; k[i++] = (string)iter.NextElement(); } Init(); InsertBalanced(k, v, 0, n); // With uniform letter distribution sc[root] should be around 'm' // System.out.Print("After root splitchar = "); System.out.Println(sc[root]); } /** * Each node stores a character (splitchar) which is part of * some Key(s). In a compressed branch (one that only contain * a single string key) the trailer of the key which is not * already in nodes is stored externally in the kv array. * As items are inserted, key substrings decrease. * Some substrings may completely disappear when the whole * branch is totally decompressed. * The tree is traversed to find the key substrings actually * used. In addition, duplicate substrings are removed using * a map (implemented with a TernaryTree!). * */ public void TrimToSize() { // first balance the tree for best performance Balance(); // redimension the node arrays RedimNodeArrays(freenode); // ok, compact kv array CharVector kx = new CharVector(); kx.Alloc(1); TernaryTree map = new TernaryTree(); Compact(kx, map, root); kv = kx; kv.TrimToSize(); } private void Compact(CharVector kx, TernaryTree map, char p) { int k; if (p == 0) return; if (sc[p] == 0xFFFF) { k = map.Find(kv.Arr, lo[p]); if (k < 0) { k = kx.Alloc(Strlen(kv.Arr, lo[p]) + 1); Strcpy(kx.Arr, k, kv.Arr, lo[p]); map.Insert(kx.Arr, k, (char)k); } lo[p] = (char)k; } else { Compact(kx, map, lo[p]); if (sc[p] != 0) Compact(kx, map, eq[p]); Compact(kx, map, hi[p]); } } public Iterator Keys { get { return new Iterator(this); } } public class Iterator { /** * current node index */ int cur; /** * current key */ string curkey; /** * TernaryTree parent */ TernaryTree parent; private class Item : ICloneable { internal char parent; internal char child; public Item() { parent = (char)0; child = (char)0; } public Item(char p, char c) { parent = p; child = c; } public Object Clone() { return new Item(parent, child); } } /** * Node stack */ Stack ns; /** * key stack implemented with a StringBuilder */ StringBuilder ks; public Iterator(TernaryTree parent) { this.parent = parent; cur = -1; ns = new Stack(); ks = new StringBuilder(); Rewind(); } public void Rewind() { ns.Clear(); ks.Length = 0; cur = parent.root; Run(); } public Object NextElement() { string res = curkey; cur = Up(); Run(); return res; } public char Value { get { if (cur >= 0) return this.parent.eq[cur]; return (char)0; } } public bool HasMoreElements() { return (cur != -1); } /** * traverse upwards */ private int Up() { Item i = new Item(); int res = 0; if (ns.Count == 0) return -1; if (cur != 0 && parent.sc[cur] == 0) return parent.lo[cur]; bool climb = true; while (climb) { i = (Item)ns.Pop(); i.child++; switch (i.child) { case (char)1: if (parent.sc[i.parent] != 0) { res = parent.eq[i.parent]; ns.Push(i.Clone()); ks.Append(parent.sc[i.parent]); } else { i.child++; ns.Push(i.Clone()); res = parent.hi[i.parent]; } climb = false; break; case (char)2: res = parent.hi[i.parent]; ns.Push(i.Clone()); if (ks.Length > 0) ks.Length = ks.Length - 1; // pop climb = false; break; default: if (ns.Count == 0) return -1; climb = true; break; } } return res; } /** * traverse the tree to find next key */ private int Run() { if (cur == -1) return -1; bool leaf = false; for (; ; ) { // first go down on low branch until leaf or compressed branch while (cur != 0) { if (parent.sc[cur] == 0xFFFF) { leaf = true; break; } ns.Push(new Item((char)cur, '\u0000')); if (parent.sc[cur] == 0) { leaf = true; break; } cur = parent.lo[cur]; } if (leaf) break; // nothing found, go up one node and try again cur = Up(); if (cur == -1) { return -1; } } // The current node should be a data node and // the key should be in the key stack (at least partially) StringBuilder buf = new StringBuilder(ks.ToString()); if (parent.sc[cur] == 0xFFFF) { int p = parent.lo[cur]; while (parent.kv[p] != 0) buf.Append(parent.kv[p++]); } curkey = buf.ToString(); return 0; } } public virtual void PrintStats() { Console.Error.WriteLine("Number of keys = " + length.ToString()); Console.Error.WriteLine("Node count = " + freenode.ToString()); // Console.Error.WriteLine("Array length = " + int.ToString(eq.Length)); Console.Error.WriteLine("Key Array length = " + kv.Length.ToString()); /* * for (int i=0; i