I'm looking for feedback on my library for extracting words from a text: https://npmjs.org/package/uwords
The extracted word is defined as sequence of Unicode characters from Lu, Ll, Lt, Lm, Lo groups. So the code of the main part is (https://github.com/AlexAtNet/uwords/blob/master/index.js#L9):
module.exports = function (text) {
var words, word, index, limit, code;
words = [ ];
word = null;
for (index = 0, limit = text.length; index < limit; index += 1) {
code = text.charCodeAt(index);
if (-1 === _.indexOf(letters, code, true)) {
if (null !== word) {
words.push(word.join(''));
word = null;
}
} else {
if (null === word) {
word = [ ];
}
word.push(String.fromCharCode(code));
}
}
if (null !== word) {
words.push(word.join(''));
}
return words;
};
and the array letters
was created as follows (https://github.com/AlexAtNet/uwords/blob/master/gruntfile.js#L59):
grunt.registerTask('create-letters-json', 'letters.json', function () {
var letters, compacted;
letters = [
require('unicode/category/Lu'),
require('unicode/category/Ll'),
require('unicode/category/Lt'),
require('unicode/category/Lm'),
require('unicode/category/Lo')
].reduce(function (list, item) {
list.push.apply(list, Object.keys(item).map(function (value) {
return parseInt(value, 10);
}));
return list;
}, [ ]).sort(function (a, b) { return a - b; });
compacted = (function (list) {
var result, item, idx, value;
result = [ ];
item = { begin : list[0], end : list[0] };
result.push(item);
for (idx = 1; idx < list.length; idx += 1) {
value = list[idx];
if (item.end + 1 === value) {
item.end = value;
} else {
item = { begin : list[idx], end : list[idx] };
result.push(item);
}
}
for (idx = 0; idx < result.length; idx += 1) {
item = result[idx];
if (item.begin === item.end) {
result[idx] = item.begin;
} else {
result[idx] = [ item.begin, item.end ];
}
}
return result;
}(letters));
require('fs').writeFileSync(__dirname + '/letters.json',
JSON.stringify(compacted, null, 2));
});
It is quite naive approach but I think that it will work in most of the cases. What do you think?
-
\$\begingroup\$ I have not used node js. However, I wonder if you could simply use string replace? \$\endgroup\$James– James2014年02月01日 00:47:33 +00:00Commented Feb 1, 2014 at 0:47
-
\$\begingroup\$ Sorry, did not get it - what do you mean by "use string replace"? \$\endgroup\$Alex Netkachov– Alex Netkachov2014年02月01日 01:58:28 +00:00Commented Feb 1, 2014 at 1:58
2 Answers 2
The top part looks clean, personally I would
- Not compare with
null
all the time, just checkword.length
and haveword
be an array at all times. - Not initialize
word
andwords
separately - Not use
String.fromCharCode(code)
, I would usetext[index]
instead - I would use the
~
operator instead of comparing to-1
- I would first deal with finding a match , and then with not finding a match ( switch the
if
blocks in other words ), my mind had to do a double take when I was reading your code - if would add a space to the end of
text
, so that I would not need the last if statement
All that would give me something like:
module.exports = function (text){
text += " ";
var words = [],
word = [],
limit = text.length, code, index;
for (index = 0; index < limit; index += 1) {
code = text.charCodeAt(index);
if (~_.indexOf(letters, code, true)) {
word.push( text[index] )
} else {
if (word.length) {
words.push(word.join(''));
word = [];
}
}
}
return words;
};
Finally, I think collecting char-codes in the 2nd script, then taking chars, converting those to char-codes and then use the chars again might not be the best approach.
Personally, I would letters
be an object where each letter ( not the char-code ) would be a property of the object set to true. No more char-code conversions, and most likely it would beat the sorted lookup table. ( To be tested.. )
-
\$\begingroup\$ I've tried to use properties instead of the binary search, and it is much faster - but it is not consistent. For some reason
console.log(String.fromCharCode(195101) === String.fromCharCode(64029))
and as the result theletters.length - Object.keys(obj).length = 1583
obj = letters.reduce(function (tmp, item) { tmp[String.fromCharCode(item)] = item; return tmp; }, { }))
\$\endgroup\$Alex Netkachov– Alex Netkachov2014年02月01日 23:41:12 +00:00Commented Feb 1, 2014 at 23:41 -
\$\begingroup\$ Happy that it is faster, disconcerting that you found that behaviour, I am fairly certain that you will win a ton of rep if you ask about this on stackoverflow ;) \$\endgroup\$konijn– konijn2014年02月02日 00:12:02 +00:00Commented Feb 2, 2014 at 0:12
I have not used node js. However, I wonder if you could simply use string replace? For example,
v = 'I was walking down the park on day. I was running down the block'
j = message.replace(/was/g, '')
The output would be the following.
'I walking down the park one day. I here'
-
1\$\begingroup\$ Hi James, please post comments as comments. I flagged this post as "not an answer". \$\endgroup\$Mathieu Guindon– Mathieu Guindon2014年02月01日 00:44:06 +00:00Commented Feb 1, 2014 at 0:44