EDIT: Dequeu was the correct guess
EDIT: I did the BFS method for the suffix link/output link construction
I'm doing a hackerrank challenge and I came up with this implementation of the aho corasick algorithm. Everything works fine but my suffix construction is too slow and it's taking ages to build with 100 000 inputs (the tree construction is quite fast tho 1s~ for 100 000 inputs but 3min for the suffix construction).
What should I do to improve my createLink function ?
I put a simple test at the end to show how to use it, the getHealth function launch the algorithm and take a string in first parameter and two numbers that is there to only take a part of the inputs.
Let's say you have:
inputs = ['a', 'b', 'c', 'd']
health = [1, 2, 3, 4]
if you do getHealth with this:
getHealth('aadbancjlav', 0, 2); // inputs.slice(0, 2 + 1) (+1 because it's exclusive)
The algorithm will skip the letter 'd' and his corresponding health.
const ROOT_TREE = 0;
const ALPHABET_SIZE = 26;
class TreeNode {
next;
isLeaf = false;
character = '$';
parentNode = -1;
outputLink = -1;
health = [];
link = -1;
constructor(p = -1, character = '$') {
this.parentNode = p;
this.character = character;
this.next = new Array(ALPHABET_SIZE).fill(-1);
}
}
class AhoCorasick {
trie = [new TreeNode()];
addString(word, health, index) {
let treeIndex = 0;
for (let index = 0; index < word.length; index += 1) {
const character = word[index];
const char = character.charCodeAt(0) - 'a'.charCodeAt(0);
if (this.trie[treeIndex].next[char] === -1) {
this.trie[treeIndex].next[char] = this.trie.length;
this.trie.push(new TreeNode(treeIndex, character));
}
treeIndex = this.trie[treeIndex].next[char];
}
this.trie[treeIndex].health.push({ health, index });
this.trie[treeIndex].isLeaf = true;
}
createLink() { // <===== THIS IS THE SLOW ONE, any advice on this will be appreciate
const queue = [];
const treeIndex = 0;
queue.push(treeIndex);
while (queue.length > 0) {
const trieToCheck = this.trie[queue.shift()];
const nodeToCheck = trieToCheck.next.filter((i) => i !== -1);
nodeToCheck.forEach((nextIndex) => {
let childNode = trieToCheck;
queue.push(nextIndex);
if (childNode.link !== -1) {
childNode = this.trie[childNode.link];
}
/*const index = childNode.next.find( <== old way of finding index
(next) =>
next !== -1 &&
next !== nextIndex &&
this.trie[next].character === this.trie[nextIndex].character,
);*/
let index; // <=== EDIT: new index calculation
index =
childNode.next[
this.trie[nextIndex].character.charCodeAt(0) - 'a'.charCodeAt(0)
];
if (index === nextIndex) {
index = null;
}
if (!index) {
this.trie[nextIndex].link = treeIndex;
} else {
this.trie[nextIndex].link = index;
if (this.trie[index].isLeaf) {
this.trie[nextIndex].outputLink = index;
} else if (this.trie[index].outputLink !== -1) {
this.trie[nextIndex].outputLink = this.trie[index].outputLink;
}
}
});
}
}
constructor(words, health) {
console.time('words');
words.forEach((word, index) => this.addString(word, health[index], index));
console.timeEnd('words');
console.time('link');
this.createLink();
console.timeEnd('link');
}
processOutputLink(outputLink, health, first, last) {
if (outputLink === -1) {
return health;
}
const newHealth = this.trie[outputLink].health
.filter((e) => e.index >= first && e.index <= last)
.reduce((acc, e) => acc + e.health, 0);
return this.processOutputLink(
this.trie[outputLink].outputLink,
health + newHealth,
);
}
processLink(link, character) {
if (link !== -1 && this.trie[link].next[character] === -1) {
return this.processLink(this.trie[link].link, character);
}
if (link !== -1 && this.trie[link].next[character] !== -1) {
return this.trie[link].next[character];
}
return link === -1 ? ROOT_TREE : link;
}
processNode(index, health, first, last) {
const node = this.trie[index];
let newHealth = health;
if (node.isLeaf) {
const healthCalc = node.health
.filter((e) => e.index >= first && e.index <= last)
.reduce((acc, e) => acc + e.health, 0);
newHealth += healthCalc;
}
newHealth = this.processOutputLink(node.outputLink, newHealth, first, last); // FINITO
return newHealth;
}
getHealth(word, first, last) {
let health = 0;
let index = 0;
for (const c of word) {
const character = c.charCodeAt(0) - 97;
const parentNode = this.trie[index];
const nextIndex = this.trie[index].next[character];
if (nextIndex !== -1) {
health = this.processNode(nextIndex, health, first, last);
index = nextIndex;
} else {
const indexSuffix = this.processLink(parentNode.link, character);
const indexNextNode =
indexSuffix !== ROOT_TREE
? indexSuffix
: this.trie[indexSuffix].next[character];
if (indexNextNode !== -1) {
health = this.processNode(indexNextNode, health, first, last);
index = indexNextNode;
} else {
index = ROOT_TREE;
}
}
}
return health;
}
}
const algo = new AhoCorasick(['a', 'aa', 'b', 'bca', 'd'], [5, 10, 15, 20, 25, 30]);
algo.getHealth("aabcaccdabaa", 0, 5);
-
1\$\begingroup\$ Welcome to Code Review! While it appears that the code changes were reversed, please be aware that after getting an answer you are not allowed to change your code anymore. This ensures that answers do not get invalidated and have to hit a moving target. If you have changed your code you can either post it as an answer (if it would constitute a code review) or ask a new question with your changed code (linking back to this one as reference). See this post for more information \$\endgroup\$Sᴀᴍ Onᴇᴌᴀ– Sᴀᴍ Onᴇᴌᴀ ♦2022年11月29日 00:10:34 +00:00Commented Nov 29, 2022 at 0:10
2 Answers 2
queue.shift()
runs in linear time. You need a deque data structure in order to remove the first element efficiently in constant time.
-
\$\begingroup\$ Perfect, I did a small implementation of dequeu and my link construction went down to 70ms instead of 3min.. Amazing, I didn't know that shift was so bad with the JS array implementation \$\endgroup\$Nicolas Menettrier– Nicolas Menettrier2022年11月28日 21:34:16 +00:00Commented Nov 28, 2022 at 21:34
Not the best but easy implementation of Dequeue I used in my code to replace the array based implementation (based on @coderodde answer).
What changed in my code:
const queue = [];
replaced by
const queue = new Dequeue();
and
while (queue.length > 0)
replaced by
while (queue.size > 0)
I cannot use the queue.data.length because the length of the data array is corrupted by the delete use (inside the shift function). That's why I need to keep track of the size.
I put the transformation in create link below:
class Dequeue {
front;
end;
size = 0;
data = [];
push(number) {
if (typeof this.front === "undefined") {
this.front = 0;
this.end = this.size;
this.data[this.front] = number;
this.data[this.end] = number;
} else {
this.end = this.end + 1;
this.data[this.end] = number;
}
this.size += 1;
}
shift() {
const value = this.data[this.front];
delete this.data[this.front];
this.size -= 1;
this.front = this.front + 1;
if (this.size === 0) {
this.front = undefined;
this.back = undefined;
this.data = [];
}
return value;
}
display() {
console.log(this.data, this.size);
}
}
// EDIT: What changed in my previous code, I changed two line (previously queue = [] and queue.length > 0)
createLink() {
const queue = new Dequeue(); // Using the new class
const treeIndex = 0;
queue.push(treeIndex);
while (queue.size > 0) { // Check on the size since the length is not relatable anymore
const trieToCheck = this.trie[queue.shift()]; // shift and push as before
const nodeToCheck = trieToCheck.next.filter((i) => i !== -1);
nodeToCheck.forEach((nextIndex) => {
let childNode = trieToCheck;
queue.push(nextIndex);
if (childNode.link !== -1) {
childNode = this.trie[childNode.link];
}
let index;
index =
childNode.next[
this.trie[nextIndex].character.charCodeAt(0) - 'a'.charCodeAt(0)
];
if (index === nextIndex) {
index = null;
}
if (!index) {
this.trie[nextIndex].link = treeIndex;
} else {
this.trie[nextIndex].link = index;
if (this.trie[index].isLeaf) {
this.trie[nextIndex].outputLink = index;
} else if (this.trie[index].outputLink !== -1) {
this.trie[nextIndex].outputLink = this.trie[index].outputLink;
}
}
});
}
}
-
2\$\begingroup\$ Please note that the data structure is called a deque (stems from "double-ended queue"), not a dequeue. \$\endgroup\$coderodde– coderodde2023年01月11日 17:35:33 +00:00Commented Jan 11, 2023 at 17:35