node.js library for extracting words from a text

Question 1

I'm looking for feedback on my library for extracting words from a text: https://npmjs.org/package/uwords

A word is defined as a sequence of Unicode characters from the Lu, Ll, Lt, Lm, and Lo categories. The main part of the code is (https://github.com/AlexAtNet/uwords/blob/master/index.js#L9):

module.exports = function (text) {
 var words, word, index, limit, code;
 words = [ ];
 word = null;
 for (index = 0, limit = text.length; index < limit; index += 1) {
 code = text.charCodeAt(index);
 if (-1 === _.indexOf(letters, code, true)) {
 if (null !== word) {
 words.push(word.join(''));
 word = null;
 }
 } else {
 if (null === word) {
 word = [ ];
 }
 word.push(String.fromCharCode(code));
 }
 }
 if (null !== word) {
 words.push(word.join(''));
 }
 return words;
};

and the array letters was created as follows (https://github.com/AlexAtNet/uwords/blob/master/gruntfile.js#L59):

 /**
  * Grunt task "create-letters-json": regenerates letters.json next to this
  * file.
  *
  * It loads the Lu, Ll, Lt, Lm and Lo tables from the `unicode` package,
  * parses the keys of each table as base-10 integers (presumably the code
  * points of that category -- confirm against the package), sorts them
  * ascending, and collapses consecutive values into [begin, end] pairs.
  * Isolated values are written as plain numbers.
  */
 grunt.registerTask('create-letters-json', 'letters.json', function () {
     var letters, compacted;
     letters = [
         require('unicode/category/Lu'),
         require('unicode/category/Ll'),
         require('unicode/category/Lt'),
         require('unicode/category/Lm'),
         require('unicode/category/Lo')
     ].reduce(function (list, item) {
         // concat rather than push.apply: apply passes every element as a
         // separate argument and can exceed the engine's argument limit
         // when a table is large.
         return list.concat(Object.keys(item).map(function (value) {
             return parseInt(value, 10);
         }));
     }, [ ]).sort(function (a, b) { return a - b; });
     compacted = (function (list) {
         var result, item, idx, value;
         result = [ ];
         // Nothing to compact; avoids emitting a bogus [undefined] entry.
         if (0 === list.length) {
             return result;
         }
         // First pass: grow the current range while values are consecutive.
         item = { begin : list[0], end : list[0] };
         result.push(item);
         for (idx = 1; idx < list.length; idx += 1) {
             value = list[idx];
             if (item.end + 1 === value) {
                 item.end = value;
             } else {
                 item = { begin : value, end : value };
                 result.push(item);
             }
         }
         // Second pass: single values become numbers, ranges become pairs.
         for (idx = 0; idx < result.length; idx += 1) {
             item = result[idx];
             if (item.begin === item.end) {
                 result[idx] = item.begin;
             } else {
                 result[idx] = [ item.begin, item.end ];
             }
         }
         return result;
     }(letters));
     require('fs').writeFileSync(__dirname + '/letters.json',
         JSON.stringify(compacted, null, 2));
 });

It is quite a naive approach, but I think that it will work in most cases. What do you think?

Question 2

I have not used node js. However, I wonder if you could simply use string replace?

Question 3

Sorry, did not get it - what do you mean by "use string replace"?

Question 4

The top part looks clean, personally I would

Not compare with null all the time, just check word.length and have word be an array at all times.
Not initialize word and words separately
Not use String.fromCharCode(code), I would use text[index] instead
I would use the ~ operator instead of comparing to -1
I would first deal with finding a match, and then with not finding a match (in other words, switch the if blocks); my mind had to do a double take when I was reading your code
I would add a space to the end of text, so that I would not need the last if statement

All that would give me something like:

module.exports = function (text){
 text += " "; 
 var words = [], 
 word = [], 
 limit = text.length, code, index;
 for (index = 0; index < limit; index += 1) {
 code = text.charCodeAt(index);
 if (~_.indexOf(letters, code, true)) {
 word.push( text[index] )
 } else {
 if (word.length) {
 words.push(word.join(''));
 word = [];
 }
 }
 }
 return words;
};

Finally, I think collecting char-codes in the 2nd script, then taking chars, converting those to char-codes, and then using the chars again might not be the best approach.

Personally, I would make letters an object where each letter (not the char-code) is a property of the object set to true. No more char-code conversions, and most likely it would beat the sorted lookup table. (To be tested.)

Question 5

I've tried to use properties instead of the binary search, and it is much faster - but it is not consistent. For some reason console.log(String.fromCharCode(195101) === String.fromCharCode(64029)) prints true, and as a result letters.length - Object.keys(obj).length = 1583, where obj = letters.reduce(function (tmp, item) { tmp[String.fromCharCode(item)] = item; return tmp; }, { }).

Question 6

Happy that it is faster, disconcerting that you found that behaviour, I am fairly certain that you will win a ton of rep if you ask about this on stackoverflow ;)

Question 7

I have not used node js. However, I wonder if you could simply use string replace? For example,

// The string must live in the same variable that .replace() is called on.
var message = 'I was walking down the park on day. I was running down the block';
// Remove every occurrence of "was"; the g flag replaces all matches, not just the first.
var j = message.replace(/was/g, '');

The output would be the following.

'I  walking down the park on day. I  running down the block'

Question 8

Hi James, please post comments as comments. I flagged this post as "not an answer".

konijn konijn 34.3k5 gold badges70 silver badges267 bronze badges · Accepted Answer · 2014-02-01 16:55:47Z

The top part looks clean, personally I would

Not compare with null all the time, just check word.length and have word be an array at all times.
Not initialize word and words separately
Not use String.fromCharCode(code), I would use text[index] instead
I would use the ~ operator instead of comparing to -1
I would first deal with finding a match, and then with not finding a match (in other words, switch the if blocks); my mind had to do a double take when I was reading your code
I would add a space to the end of text, so that I would not need the last if statement

All that would give me something like:

module.exports = function (text){
 text += " "; 
 var words = [], 
 word = [], 
 limit = text.length, code, index;
 for (index = 0; index < limit; index += 1) {
 code = text.charCodeAt(index);
 if (~_.indexOf(letters, code, true)) {
 word.push( text[index] )
 } else {
 if (word.length) {
 words.push(word.join(''));
 word = [];
 }
 }
 }
 return words;
};

Finally, I think collecting char-codes in the 2nd script, then taking chars, converting those to char-codes, and then using the chars again might not be the best approach.

Personally, I would make letters an object where each letter (not the char-code) is a property of the object set to true. No more char-code conversions, and most likely it would beat the sorted lookup table. (To be tested.)

I've tried to use properties instead of the binary search, and it is much faster - but it is not consistent. For some reason console.log(String.fromCharCode(195101) === String.fromCharCode(64029)) prints true, and as a result letters.length - Object.keys(obj).length = 1583, where obj = letters.reduce(function (tmp, item) { tmp[String.fromCharCode(item)] = item; return tmp; }, { }).
Happy that it is faster, disconcerting that you found that behaviour, I am fairly certain that you will win a ton of rep if you ask about this on stackoverflow ;)

Stack Exchange Network

node.js library for extracting words from a text

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

node.js library for extracting words from a text

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions