I asked how to paginate a trie a while back, and got this elegant and simple Trie structure in the answer:
class TrieNode {
constructor(key) {
// the "key" value will be the character in sequence
this.key = key;
// we keep a reference to parent
this.parent = null;
// we have hash of children
this.children = {};
// check to see if the node is at the end
this.end = false;
// count how many words are at this node or below.
this.count = 0;
// Ensure children are in the right order.
this.lastChildKey = '';
}
getWord() {
let output = [];
let node = this;
while (node !== null) {
output.unshift(node.key)
node = node.parent
}
return output.join('')
}
}
class Trie {
constructor() {
this.base = new TrieNode(null)
}
insert(word) {
let node = this.base
const points = Array.from(word)
for (const i in points) {
const point = points[i]
node.count++
if (!node.children[point]) {
const child = node.children[point] = new TrieNode(point)
child.parent = node
}
node = node.children[point]
if (i == word.length - 1) {
node.end = true
}
// If words were inserted out of order, fix order of children.
if (node.lastChildKey <= point) {
node.lastChildKey = point
}
else {
// Need to fix order of children.
let keys = Object.keys(node.children)
keys.sort()
let orderedChildren = {}
for (const key in keys) {
orderedChildren[key] = node.children[key]
}
node.children = orderedChildren
}
}
}
contains(word) {
let node = this.base
const points = Array.from(word)
for (const i in points) {
const point = points[i]
if (node.children[point]) {
node = node.children[point]
} else {
return false
}
}
return node.end;
}
find(prefix, startAt, maxCount) {
let node = this.base
let output = []
const points = Array.from(prefix)
for (const i in points) {
const point = points[i]
// make sure prefix actually has words
if (node.children[point]) {
node = node.children[point]
} else {
// there's none. just return it.
return output
}
}
const stack = [node]
while (stack.length) {
if (maxCount <= output.length) {
return output
}
node = stack.shift()
if (node.count < startAt) {
startAt -= node.count
}
else {
// base case, if node is at a word, push to output
if (node.end) {
if (0 < startAt) {
startAt--;
}
else {
output.push(node.getWord())
}
}
// iterate through each children, call recursive findAllWords
for (var child in node.children) {
stack.push(node.children[child])
}
}
}
return output
}
}
You can use it like this:
const fs = require('fs')
const Trie = require('./Trie')
const words = fs.readFileSync('tmp/scrabble.csv', 'utf-8')
.trim()
.split(/n+/)
.map(x => x.trim())
const trie = new Trie.Trie()
words.forEach(word => trie.insert(word))
console.log(trie.find('a', 0, 12))
console.log(trie.find('a', 6, 6))
Asking AI about fuzzy string matching on Tries, they gave this implementation:
class TrieNode {
children: Map<string, TrieNode> = new Map();
isEndOfWord: boolean = false;
word: string | null = null; // To store the word at leaf nodes
}
class Trie {
root: TrieNode = new TrieNode();
insert(word: string) {
let node = this.root;
for (const char of word) {
if (!node.children.has(char)) {
node.children.set(char, new TrieNode());
}
node = node.children.get(char)!;
}
node.isEndOfWord = true;
node.word = word; // Store the complete word at the end node
}
fuzzySearch(query: string): {word: string, score: number}[] {
let results: {word: string, score: number}[] = [];
// Initial call with max distance, here it's set to a portion of the query length for flexibility
const maxDistance = Math.floor(query.length / 2);
this.searchRecursive(this.root, query, "", 0, maxDistance, results);
// Sort results based on score, higher first
return results.sort((a, b) => b.score - a.score);
}
private searchRecursive(node: TrieNode, query: string, currentWord: string, currentScore: number, maxDistance: number, results: {word: string, score: number}[]): void {
if (query.length === 0 || node.isEndOfWord) {
let finalScore = currentScore;
if (node.word) {
// Adjust score based on the length difference, assuming a penalty for each missing character
finalScore -= Math.abs(node.word.length - currentWord.length) * 5;
if (query.length === 0 && node.isEndOfWord) {
results.push({ word: node.word, score: finalScore });
}
}
}
if (maxDistance < 0) return; // Stop if max edits exceeded
for (let i = 0; i < query.length; i++) {
const char = query[i];
if (node.children.has(char)) {
// Exact match, continue with the rest of the query
const nextNode = node.children.get(char)!;
this.searchRecursive(nextNode, query.substring(i + 1), currentWord + char, currentScore + 10, maxDistance, results);
}
// Edit operations with penalties
node.children.forEach((childNode, childChar) => {
// Substitution (replace char)
if (childChar !== char) {
this.searchRecursive(childNode, query.substring(i + 1), currentWord + childChar, currentScore - 5, maxDistance - 1, results);
}
// Insertion (add a char)
this.searchRecursive(childNode, query.substring(i), currentWord + childChar, currentScore - 5, maxDistance - 1, results);
});
// Deletion (skip a char in query)
this.searchRecursive(node, query.substring(i + 1), currentWord, currentScore - 5, maxDistance - 1, results);
break; // Only apply one edit per recursion level
}
}
}
// Example enhanced usage
let trie = new Trie();
trie.insert("VisualStudioCode");
trie.insert("VisualStudioCommunity");
trie.insert("VerySimpleCode");
trie.insert("VisualSpaceCoordinator");
const searchResults = trie.fuzzySearch("VSC");
console.log(searchResults);
If you implement “pruning” of the scored results (sorted by score), how can you also implement pagination, without going through the entire set of trie nodes? That is, say I want to return 100 results, if I use the second Trie example, it will return possibly 1000’s of results (assuming a large dictionary like the 100k+ scrabble dictionary, or a million word dictionary). Then I can paginate those results, but that is a waste of computational effort. How can you avoid processing all the nodes in the trie, while implementing VSCode-like fuzzy text matching (with scoring), AND implementing pagination, where you can jump to a specific page and get say 100 nodes from that point forward?
Note: It appears to get really high quality fuzzy matching you need machine learning, AI, and user data input (like most recently accessed files, for VSCode), etc.. I don’t intend on creating the perfect paginated fuzzy matching Trie out there, just something that works reasonably well with your standard algorithms. Just not sure yet how / if it’s possible to combine pagination with the fuzzy matching scoring aspect, on a Trie.