I had this idea of trying to re-implement the function String.prototype.indexOf
in a way that could potentially be faster than the native function.
With some testing, I've doodled the following function in less than 1 hour:
function strpos(haystack, needle, offset)
{
// Accessing <object>.length is slower than storing and
// reading from a local variable
var h_length = haystack.length;
var n_length = needle.length;
// Skips checking strings that have the same length (since they may be equal)
// or strings where the needle is larger (a large needle won't fit in a small haystack)
if(n_length >= h_length)
{
return needle === haystack ? 0 : -1;
}
// If the offset is too big, we will check outside the haystack
// which means the string isn't there and we must quit
if(offset >= h_length - n_length || offset < 0)
{
return -1;
}
// small performance boost
var i_limit = h_length - n_length;
next_char:
for(var i = (offset || 0); i < i_limit; i++)
{
// Check if the first and last character from i are different
// it still works for 1-character long needles
if(needle[0] !== haystack[i] || needle[n_length - 1] !== haystack[i + n_length - 1])
{
continue;
}
// Check all characters inside needle, excluding the first and last
for(var j = 1; j < n_length - 1; j++)
{
if(needle[j] !== haystack[i + j])
{
// The needle isn't in this position, so, check the next character
continue next_char;
}
}
// If you reached here, you found the needle
return i;
}
// If you reached here, the haystack doesn't have the needle
return -1;
}
The idea here was to skip checking as many characters as possible, and only checking the characters inside the needle if strictly necessary.
To test the speed of the function, I're ran the following code a few times:
console.time('indexOf'); '<haystack>'.indexOf('<needle>'); console.timeEnd('indexOf'); console.time('strpos'); strpos('<haystack>','<needle>'); console.timeEnd('strpos');
Based on what I've seen on Firefox 53.0.3(32-bit), on Windows 10, my implementation is around 5-20% faster most of the time.
You can try it bellow:
function strpos(haystack, needle, offset)
{
// Accessing <object>.length is slower than storing and
// reading from a local variable
var h_length = haystack.length;
var n_length = needle.length;
// Skips checking strings that have the same length (since they may be equal)
// or strings where the needle is larger (a large needle won't fit in a small haystack)
if(n_length >= h_length)
{
return needle === haystack ? 0 : -1;
}
// If the offset is too big, we will check outside the haystack
// which means the string isn't there and we must quit
if(offset >= h_length - n_length || offset < 0)
{
return -1;
}
// small performance boost
var i_limit = h_length - n_length;
next_char:
for(var i = (offset || 0); i < i_limit; i++)
{
// Check if the first and last character from i are different
// it still works for 1-character long needles
if(needle[0] !== haystack[i] || needle[n_length - 1] !== haystack[i + n_length - 1])
{
continue;
}
// Check all characters inside needle, excluding the first and last
for(var j = 1; j < n_length - 1; j++)
{
if(needle[j] !== haystack[i + j])
{
// The needle isn't in this position, so, check the next character
continue next_char;
}
}
// If you reached here, you found the needle
return i;
}
// If you reached here, the haystack doesn't have the needle
return -1;
}
var haystack = prompt('Haystack:', '');
var needle = prompt('Needle:', '');
for(var i = 0; i < 10; i++)
{
console.log('Iteration: ' + (i + 1));
console.time('indexOf');
var x = haystack.indexOf(needle);
console.timeEnd('indexOf');
console.log('position: ' + x);
console.time('strpos');
var y = strpos(haystack, needle);
console.timeEnd('strpos');
console.log('position: ' + y);
}
As far as I know, it is working as it should, returning -1
when expected.
Is there anything else I can improve on this code, regarding performance or readability? Or even a better way to do this, without that pesky continue next_char;
?
2 Answers 2
[...] that could potentially be faster than the native function.
The native String.indexOf
is fast, but comes with a certain constant setup cost that makes it slower than your implementation for very small strings. As soon as the input strings get longer however the built-in String.indexOf
outperforms any other solution:
Small strings:
strpos 55,056,671
String.indexOf 23,497,940
Large strings:
String.indexOf 5,132,350
strpos 572,222
See https://jsperf.com/string-indexof-vs-strpos/1
The JavaScript specification doesn't mandate a specific implementation for String.indexOf
, but I assume that the chosen string searching algorithm comes with higher setup cost but better runtime complexity or simply benefits from its native implementation speedup.
I know it's a late answer, but I just stumbled over the question and I have a suggestion of how to get rid of the "pesky continue next_char;
"
// Check all characters inside needle, excluding the first and last
for(var j = 1; j < n_length - 1; j++) {
if(needle[j] !== haystack[i + j]) {
// The needle isn't in this position, end the inner for loop
break;
}
}
if (j === n_length) {
// we looped through to the end of the needle, which means we found a match
return i;
}
Explore related questions
See similar questions with these tags.
O(nm)
with no preprocessing. Perhaps the built-in features an implementation with higher setup cost but lower runtime complexity for the search afterwards? \$\endgroup\$