0
\$\begingroup\$

I'm looking for feedback on my library for extracting words from a text: https://npmjs.org/package/uwords

The extracted word is defined as sequence of Unicode characters from Lu, Ll, Lt, Lm, Lo groups. So the code of the main part is (https://github.com/AlexAtNet/uwords/blob/master/index.js#L9):

module.exports = function (text) {
 var words, word, index, limit, code;
 words = [ ];
 word = null;
 for (index = 0, limit = text.length; index < limit; index += 1) {
 code = text.charCodeAt(index);
 if (-1 === _.indexOf(letters, code, true)) {
 if (null !== word) {
 words.push(word.join(''));
 word = null;
 }
 } else {
 if (null === word) {
 word = [ ];
 }
 word.push(String.fromCharCode(code));
 }
 }
 if (null !== word) {
 words.push(word.join(''));
 }
 return words;
};

and the array letters was created as follows (https://github.com/AlexAtNet/uwords/blob/master/gruntfile.js#L59):

 grunt.registerTask('create-letters-json', 'letters.json', function () {
 var letters, compacted;
 letters = [
 require('unicode/category/Lu'),
 require('unicode/category/Ll'),
 require('unicode/category/Lt'),
 require('unicode/category/Lm'),
 require('unicode/category/Lo')
 ].reduce(function (list, item) {
 list.push.apply(list, Object.keys(item).map(function (value) {
 return parseInt(value, 10);
 }));
 return list;
 }, [ ]).sort(function (a, b) { return a - b; });
 compacted = (function (list) {
 var result, item, idx, value;
 result = [ ];
 item = { begin : list[0], end : list[0] };
 result.push(item);
 for (idx = 1; idx < list.length; idx += 1) {
 value = list[idx];
 if (item.end + 1 === value) {
 item.end = value;
 } else {
 item = { begin : list[idx], end : list[idx] };
 result.push(item);
 }
 }
 for (idx = 0; idx < result.length; idx += 1) {
 item = result[idx];
 if (item.begin === item.end) {
 result[idx] = item.begin;
 } else {
 result[idx] = [ item.begin, item.end ];
 }
 }
 return result;
 }(letters));
 require('fs').writeFileSync(__dirname + '/letters.json',
 JSON.stringify(compacted, null, 2));
 });

It is quite naive approach but I think that it will work in most of the cases. What do you think?

rolfl
98.1k17 gold badges219 silver badges419 bronze badges
asked Jan 31, 2014 at 21:24
\$\endgroup\$
2
  • \$\begingroup\$ I have not used node js. However, I wonder if you could simply use string replace? \$\endgroup\$ Commented Feb 1, 2014 at 0:47
  • \$\begingroup\$ Sorry, did not get it - what do you mean by "use string replace"? \$\endgroup\$ Commented Feb 1, 2014 at 1:58

2 Answers 2

2
\$\begingroup\$

The top part looks clean, personally I would

  • Not compare with null all the time, just check word.length and have word be an array at all times.
  • Not initialize word and words separately
  • Not use String.fromCharCode(code), I would use text[index] instead
  • I would use the ~ operator instead of comparing to -1
  • I would first deal with finding a match , and then with not finding a match ( switch the if blocks in other words ), my mind had to do a double take when I was reading your code
  • if would add a space to the end of text, so that I would not need the last if statement

All that would give me something like:

module.exports = function (text){
 text += " "; 
 var words = [], 
 word = [], 
 limit = text.length, code, index;
 for (index = 0; index < limit; index += 1) {
 code = text.charCodeAt(index);
 if (~_.indexOf(letters, code, true)) {
 word.push( text[index] )
 } else {
 if (word.length) {
 words.push(word.join(''));
 word = [];
 }
 }
 }
 return words;
};

Finally, I think collecting char-codes in the 2nd script, then taking chars, converting those to char-codes and then use the chars again might not be the best approach.

Personally, I would letters be an object where each letter ( not the char-code ) would be a property of the object set to true. No more char-code conversions, and most likely it would beat the sorted lookup table. ( To be tested.. )

answered Feb 1, 2014 at 16:55
\$\endgroup\$
2
  • \$\begingroup\$ I've tried to use properties instead of the binary search, and it is much faster - but it is not consistent. For some reason console.log(String.fromCharCode(195101) === String.fromCharCode(64029)) and as the result the letters.length - Object.keys(obj).length = 1583 obj = letters.reduce(function (tmp, item) { tmp[String.fromCharCode(item)] = item; return tmp; }, { })) \$\endgroup\$ Commented Feb 1, 2014 at 23:41
  • \$\begingroup\$ Happy that it is faster, disconcerting that you found that behaviour, I am fairly certain that you will win a ton of rep if you ask about this on stackoverflow ;) \$\endgroup\$ Commented Feb 2, 2014 at 0:12
0
\$\begingroup\$

I have not used node js. However, I wonder if you could simply use string replace? For example,

v = 'I was walking down the park on day. I was running down the block'
j = message.replace(/was/g, '')

The output would be the following.

'I walking down the park one day. I here'
answered Feb 1, 2014 at 0:41
\$\endgroup\$
1
  • 1
    \$\begingroup\$ Hi James, please post comments as comments. I flagged this post as "not an answer". \$\endgroup\$ Commented Feb 1, 2014 at 0:44

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.