4
\$\begingroup\$

I've been trying to rewrite the perl transliteration function in javascript. This isn't a complete replication, and I haven't looked at the perl source code... I took inspiration from this question for my source code.

some unit tests (not all):

describe('without flags', () => {
 it('should produce a function that transliterates abcd to dcba when search is abcd and replacement is dcba', () => {
 let text = 'abcd';
 let search = 'abcd';
 let replace = 'dcba';
 let expected = 'dcba';
 let actual = tr(text, search, replace);
 expect(actual).to.be.equal(expected);
 });
 it('should produce a function that transliterates ruby to perl when search is bury and replacement is repl', () => {
 let text = 'ruby';
 let search = 'bury';
 let replace = 'repl';
 let expected = 'perl';
 let actual = tr(text, search, replace);
 expect(actual).to.be.equal(expected);
 });
});
describe('with s flag', () => {
 before(() => {
 flags = 's';
 });
 it('should produce a function that transliterates abba to pop when search is ab and replacement is pop', () => {
 let text = 'abba';
 let search = 'ab';
 let replace = 'po';
 let expected = 'pop';
 let actual = tr(text, search, replace, flags);
 expect(actual).to.be.equal(expected);
 });
});
});
 describe('with d flag', () => {
 before(() => {
 flags = 'd';
 });
 it('should produce a function that transliterates abba to aa when search is b and replacement is null', () => {
 let text = 'abba';
 let search = 'b';
 let replace = '';
 let expected = 'aa';
 let actual = tr(text, search, replace, flags);
 expect(actual).to.be.equal(expected);
 });
 it('should produce a function that transliterates adam to eve when search is adm and replacement is ev', () => {
 let text = 'adam';
 let search = 'adm';
 let replace = 'ev';
 let expected = 'eve';
 let actual = tr(text, search, replace, flags);
 expect(actual).to.be.equal(expected);
 });
});
describe('with ds flags', () => {
 before(() => {
 flags = 'ds';
 });
 it('should produce a function that transliterates abba to p when search is ab and replacement is p', () => {
 let text = 'abba';
 let search = 'ab';
 let replace = 'p';
 let expected = 'p';
 let actual = tr(text, search, replace, flags);
 expect(actual).to.be.equal(expected);
 });
 });
});
 describe('characters that would need escaping e.g. "()[]{}..."', () => {
describe('without flags', () => {
 it('should produce a function that transliterates ( to ) when search [({< is and replacement is ])}>', () => {
 let text = '(';
 let search = '[({<';
 let replace = '])}>';
 let expected = ')';
 let actual = tr(text, search, replace);
 expect(actual).to.be.equal(expected);
 });
 it('should produce a function that transliterates ()abc to [)qbc when search is (a and replacement is [q', () => {
 let text = '()abc';
 let search = '(a';
 let replace = '[q';
 let expected = '[)qbc';
 let actual = tr(text, search, replace);
 expect(actual).to.be.equal(expected);
 });
});
describe('with s flag', () => {
 beforeEach(() => {
 flags = 's';
 });
 it('should produce a function that transliterates () to ( when search is [](){}<> and replacement is [[(({{<<', () => {
 let text = '()';
 let search = '[](){}<>';
 let replace = '[[(({{<<';
 let expected = '(';
 let actual = tr(text, search, replace, flags);
 expect(actual).to.be.equal(expected);
 });
 });
});
describe('with d flag', () => {
 beforeEach(() => {
 flags = 'd';
 });
 it('should produce a function that transliterates ()[] to ){} when search is []( and replacement is {}', () => {
 let text = '()[]';
 let search = '[](';
 let replace = '{}';
 let expected = '){}';
 let actual = tr(text, search, replace, flags);
 expect(actual).to.be.equal(expected);
 });
 });
 describe('with ds flags', () => {
 beforeEach(() => {
 flags = 'ds';
 });
 it('should produce a function that transliterates ()a to (a when search is [](){}<> and replacement is [[(({{<<', () => {
 let text = '()a';
 let search = '[](){}<>';
 let replace = '[[(({{<<';
 let expected = '(a';
 let actual = tr(text, search, replace, flags);
 expect(actual).to.be.equal(expected);
 });
 it('should produce a function that transliterates ()a to ( when search is [](){}<>\0円-\377円 and replacement is [[(({{<<', () => {
 let text = '()a';
 let search = '[](){}<>\0円-\377円';
 let replace = '[[(({{<<';
 let expected = '(';
 let actual = tr(text, search, replace, flags);
 expect(actual).to.be.equal(expected);
 });
 });

And the actual function:

export function tr(text, search, replace, flags) {
 let escapedObj = _charsNeedEscaping(search);
 let escaped = escapedObj.replaced;
 let escapedSearch = escapedObj.text;
 let replacementRegex = new RegExp('[' + escapedSearch + ']', 'g');
 let obj = {};
 let pos = 0;
 let t = text.replace(replacementRegex, function (chr) {
 let r = '';
 if (flags) {
 if (flags.match(/ds/)) {
 r = _dFlag(chr, pos, search, replace, obj, escaped);
 if (r) {
 let retDeets = _sFlag(chr, pos, search, replace, obj, escaped);
 r = retDeets.r;
 obj = retDeets.charKeeper;
 pos = retDeets.pos;
 }
 } else if (flags.match(/s/)) {
 let retDeets = _sFlag(chr, pos, search, replace, obj, escaped);
 r = retDeets.r;
 obj = retDeets.charKeeper;
 pos = retDeets.pos;
 }
 else if (flags.match(/d/)) {
 r = _dFlag(chr, pos, search, replace, obj, escaped);
 }
 } else {
 let ind = search.indexOf(chr);
 r = replace.charAt(ind);
 if (r === '') {
 r = replace.charAt(replace.length - 1);
 }
 }
 return r;
 });
 return t;
}
function _dFlag(chr, pos, search, replace, obj) {
 let r = '';
 if (replace) {
 let ind = search.indexOf(chr);
 if (replace.length >= ind) {
 r = replace.charAt(ind);
 }
 }
 return r;
}
function _sFlag(chr, pos, search, replace, obj, escaped) {
 let escapedChrDeets = _charsNeedEscaping(chr);
 let escapedChr = escapedChrDeets.text;
 let searchRegExp = new RegExp(escapedChr, 'y');
 if (escaped) {
 pos = search.indexOf(chr);
 }
 searchRegExp.lastIndex = pos;
 let searchMatch = search.match(searchRegExp);
 let r = '';
 if (searchMatch) {
 let searchChr = searchMatch[0];
 if (searchChr in obj) {
 r = replace.charAt(obj[searchChr]);
 if (obj[searchChr]+1 === searchMatch.index) {
 r = '';
 }
 } else {
 let replacementIndex = searchMatch.index;
 obj[searchChr] = replacementIndex;
 r = replace.charAt(replacementIndex);
 if (r === '') {
 r = searchChr;
 } else if (r === replace.charAt(replacementIndex-1)) {
 r = '';
 }
 }
 pos++;
 } else {
 r = replace.charAt(obj[chr]);
 if (pos-1 === obj[chr]) {
 r = '';
 }
 pos++;
 }
 return {
 r: r,
 pos: pos,
 charKeeper: obj
 };
}
function _charsNeedEscaping(src) {
 let text = src;
 let res = {
 'text': text,
 'replaced': false,
 };
 if (res.text.match(/\[/) ) {
 res.text = res.text.replace(/\[/g, '\\[');
 res.replaced = true;
 }
 if (text.match(/\]/) ) {
 res.text = res.text.replace(/\]/g, '\\]');
 res.replaced = true;
 }
 if (res.text.match(/\(/) ) {
 res.text = res.text.replace(/\(/g, '\\(');
 res.replaced = true;
 }
 if (text.match(/\)/) ) {
 res.text = res.text.replace(/\)/g, '\\)');
 res.replaced = true;
 }
 if (res.text.match(/\</) ) {
 res.text = res.text.replace(/\</g, '\\<');
 res.replaced = true;
 }
 if (text.match(/\>/) ) {
 res.text = res.text.replace(/\>/g, '\\>');
 res.replaced = true;
 }
 if (res.text.match(/\{/) ) {
 res.text = res.text.replace(/\{/g, '\\{');
 res.replaced = true;
 }
 if (text.match(/\}/) ) {
 res.text = res.text.replace(/\}/g, '\\}');
 res.replaced = true;
 }
 return res;
}
200_success
146k22 gold badges190 silver badges479 bronze badges
asked Apr 18, 2018 at 16:09
\$\endgroup\$

2 Answers 2

1
\$\begingroup\$

Good work with the unit tests! They made it much simpler to check if I broke something when reworking some of your logic.

I've never used Perl, so had to play around with a repl for a while in order to understand the tr function. I may certainly still be missing some functionality here!

While playing with tr, I noticed a couple discrepancies between your version and the built in version.

  1. The s flag doesn't always squash replaced characters.

    my $string = '()aa))';
    $string =~ tr/()/((/ds;
    print "$string\n"; # => (aa(
    

    With tr('()aa))', '()', ((', 'ds') I receive (aa(( instead of the expected (aa(.

  2. Squashing isn't greedy enough.

    my $string = 'abbba';
    $string =~ tr/b/a/s;
    print "$string\n"; # => aaa
    

    With tr('abbba', 'b', 'a', 's') I receive aaaa.

With this in mind I added three new unit tests:

it('Should handle multiple locations for squashed characters', () => {
 const expected = '(aa(';
 const actual = tr('()aa))', '()', '((', 'ds');
 expect(actual).to.equal(expected);
});
it('Should be greedy when squashing replaced characters', () => {
 const expected = 'aaa';
 const actual = tr('abbba', 'b', 'a', 's');
 expect(actual).to.equal(expected);
});
it('Should handle squashing characters with multiple source characters', () => {
 const expected = 'aaa';
 const actual = tr('abccbba', 'bc', 'a', 's');
 expect(actual).to.equal(expected);
});

Now, to your code!

  1. In the unit tests, you have a lot of duplication in defining text, search, replace, expected, and actual in every test. I believe it helps the readability to drop text, search, and replace as I did in the new tests shown above. You could further reduce the amount of code by defining an array of tests that you loop through, but there are few enough tests that that may not be helpful here.

  2. I expected to be able to pass in sd or ds flags for the same effect. Order shouldn't matter here.

  3. _sFlag, _dFlag and _charsNeedEscaping don't tell me anything about what the function will do - try to be more descriptive in your variable names. obj is even more cryptic. It's fine to use non-descriptive names in very short functions, but in longer functions, it makes the logic incredibly difficult to follow.

  4. You should lint your code, it helps find potential errors. ESLint points out a few problems with the default configuration.

    • _dFlag's obj parameter isn't used.
    • You unnecessarily escape characters in the regular expressions in _charsNeedEscaping, this makes the regex harder to read.
  5. The first thing that stuck out to me when reading through your code is that _charsNeedEscaping is much longer than it needs to be. You can use $& in your replace string to refer to the match text. With this knowledge, _charsNeedEscaping can be trivially written as a single replace statement. I have added a few missing regex special characters that were missing from your function. (Only - and \ are left out)

    const escapeRequiredChars = s => s.replace(/[\/^$*+?.()|[\]{}]/g, '\\$&');
    
  6. Don't unnecessarily quote object keys.

    let res = {
     'text': text,
     'replaced': false,
    };
    

    Is equivalent to:

    let res = {
     text: text,
     replaced: false,
    };
    

    In this case, since text is named the same as the property name, it can be further simplified to:

    let res = {
     text,
     replaced: false,
    };
    
  7. There is a str.includes function. When possible, use it instead of /regex/.match. When you just care if a string matches a regex, use /regex/.test instead of /regex/.match.

  8. Don't assign unnecessary variables, t in the tr function is assigned then immediately returned. In _charsNeedEscaping, text is just an alias to src, just rename src to text.

  9. Prefer const to let when possible - this makes it possible to immediately tell when a variable will be redefined and when it won't change.

  10. You might be interested in learning about destructuring, it could help with some of your object handling code.

With all this in mind, here is how I would implement tr. It passes all of your provided tests and the three extra I wrote. I took a slightly different approach than your solution, though the general idea is the same.

const escapeRequiredChars = s => s.replace(/[\/^$*+?.()|[\]{}]/g, '\\$&');
function tr(text, search, replace, flags = '') {
 const escaped = escapeRequiredChars(search);
 let lastReplaceChar = '';
 let lastReplaceEnd = 0;
 return text.replace(new RegExp(`([${escaped}])\1円*`, 'g'), (chars, char, offset) => {
 // Reset lastReplaceChar after passing something that hasn't been replaced
 if (lastReplaceEnd < offset) {
 lastReplaceChar = '';
 }
 lastReplaceEnd = offset + chars.length;
 // Find replacement
 const replaceIndex = search.indexOf(char);
 let replacement = replace[replaceIndex];
 if (!replacement) {
 if (flags.includes('d')) return '';
 replacement = replace[replace.length - 1] || char;
 }
 // Handle squashing when the squashed character has already been output.
 if (lastReplaceChar == replacement && flags.includes('s')) {
 return '';
 }
 lastReplaceChar = replacement;
 const returnCount = flags.includes('s') ? 1 : chars.length;
 return replacement.repeat(returnCount);
 });
}
answered Apr 19, 2018 at 5:12
\$\endgroup\$
3
  • \$\begingroup\$ Thank you so much for the improvements. I'm still having a good read through of them, but this helps me a lot. \$\endgroup\$ Commented Apr 19, 2018 at 9:27
  • \$\begingroup\$ Ok, one failure from a test i didn't include: text = 'abba', search = 'b', replace = '', flags = 's'. should return aba, yours returns aa. I think on line 24, if the replacement is '', then you should return only one of the search characters. \$\endgroup\$ Commented Apr 19, 2018 at 11:05
  • 1
    \$\begingroup\$ Good catch! I've updated the code to handle that test. \$\endgroup\$ Commented Apr 19, 2018 at 13:15
1
\$\begingroup\$

5 years later, I too find myself looking for an equivalent for Perl's tr/ operator in JS, only to find the solutions here, but sadly with a severe lack of tests with ranges (like a-z) and as a result @Gerrit0's final code indeed does not work with them:

A simple Perl sample: perl -e '$s = "abc"; $s =~ tr/a-c/d-f/; print "$s\n";' Yields def, as expected.

Calling Gerrit0's solution with the equivalent: tr('abc', 'a-c', 'd-f') Yields ddf, clearly wrong.

This happens because the solution only worried about ranges in the search, by not escaping the - character, but when indexing the replacement ranges were not taken into account.

The line: const replaceIndex = search.indexOf(char);

...attempts to index b from the search pattern a-c, which naturally fails and returns -1.

This cascades into: let replacement = replace[replaceIndex];

...getting undefined, and then the fallback intended only for searches larger than the replacers gets triggered, which inserts the unexpected f in place of the b.

With this in mind I've created my own range-aware solution, also using TypeScript:

export function tr(str: string, search: string, replace: string): string {
 const buildRangesMap = (from: string, searchToReplacementMode: boolean) => {
 const ranges = new Map<number, { index: number, baseCharcode: number, size: number; }>();
 let idxBias = 0;
 from = from.replaceAll(/([^\\-]|(?<=[^\\](?:\\\\)*)\\[^-])-([^-])/ug, (match, start, end, idx) => {
 const startCode = start.codePointAt(0)!;
 const endCode = end.codePointAt(0)!;
 if (startCode > endCode) throw new Error(`Invalid range specified: ${start}-${end} (${startCode}-${endCode})`);
 const rangeSize = endCode - startCode + 1;
 idx += idxBias;
 idxBias += rangeSize - 3;
 const key = searchToReplacementMode ? startCode : idx;
 ranges.set(key, { index: idx, baseCharcode: startCode, size: rangeSize }); 
 return String.fromCodePoint(0xF0000 + key);
 });
 let idx = 0;
 for (const char of from) {
 const code = char.codePointAt(0)!;
 if (code >= 0xF0000) {
 idx += ranges.get(searchToReplacementMode ? code - 0xF0000 : idx)!.size;
 continue;
 }
 const key = searchToReplacementMode ? code : idx;
 if (searchToReplacementMode && ranges.has(key)) {
 idx++;
 continue;
 }
 ranges.set(key, { index: idx, baseCharcode: code, size: 1 });
 idx++;
 }
 return ranges;
 };
 const searchRanges = buildRangesMap(search, true);
 const replaceRanges = buildRangesMap(replace, false);
 const escapedSearch = search.replaceAll(/(?!-)[\/^$*+?.()|[\]{}]/ug, '\\$&');
 return str.replaceAll(new RegExp(`[${escapedSearch}]`, 'ug'), (char) => {
 // Find replacement
 const charCode = char.codePointAt(0)!;
 let searchRange = searchRanges.get(charCode);
 if (!searchRange) {
 for (const { index, baseCharcode, size } of searchRanges.values()) {
 if (charCode >= baseCharcode && charCode < baseCharcode + size) {
 searchRange = { index, baseCharcode, size };
 break;
 }
 }
 }
 if (!searchRange) throw new Error(`Could not find range for ${charCode} in ${searchRanges}`);
 const replaceIndexToFind = searchRange.index + (charCode - searchRange.baseCharcode);
 let replaceRange = replaceRanges.get(replaceIndexToFind);
 if (!replaceRange) {
 for (const { index, baseCharcode, size } of replaceRanges.values()) {
 if (replaceIndexToFind >= index && replaceIndexToFind < index + size) {
 replaceRange = { index, baseCharcode, size };
 break;
 }
 }
 }
 if (!replaceRange) return [...replace].at(-1) ?? char; // If replace is empty apparently we just keep the chars
 const replaceCharCode = replaceIndexToFind - replaceRange.index + replaceRange.baseCharcode;
 return String.fromCodePoint(replaceCharCode);
 });
}

Disclaimer: Since for my needs I did not need any of the flags, and they seem trivially easy to implement externally as pre/post processing steps on the string, I've not included the flags functionality. For this reason I've also not tested this against all of the original tests on this question, only the ones with no flags, and included some tests of my own with a more barebones approach but still gets the job done:

function test(text: string, search: string, replace: string, expect: string) {
 const out = tr(text, search, replace);
 console.log(out, '===', expect, '->', out === expect);
}
// expected outputs taken directly from perl results of running the equivalent tr/ operation
test('abcd', 'abcd', 'dcba', 'dcba');
test('ruby', 'bury', 'repl', 'perl');
test('(', '[({<', '])}>', ')');
test('()abc', '(a', '[q', '[)qbc');
test('abcghi', 'a-cg-i', 'd-f', 'deffff');
test('10fedcba98', '01a-f89', '-+A-CD-F<>', '+-FEDCBA><');
test('10fedcba98', '01a-f89', '22A-F33', '22FEDCBA33');
test('10fedcba98', 'abccba', 'ABCDEF', '10fedCBA98');
test('0円abc\u{10FFFF}', '0円-\u{10FFFF}', 'x', 'xxxxx');

There is nothing platform-specific in the code, so it should work on any recent enough JS engine (as some fairly recent features are used like unicode regexes), but I'll include here for completion that I tested on Windows with Node.js 19.9.0.

As a final disclaimer, I did not test this very extensively against original Perl behavior around rarer quirks and edge cases, or a wide variety of possible inputs outside few basic ranges. I do not guarantee this to be 1:1 matching with Perl's tr/ on all scenarios, maybe it could be (ignoring uses of flags), but I won't claim to guarantee it because I can't know for sure. If anyone does use this and finds a mismatch between it and Perl, I'd be happy to hear about it in the comments here.

answered May 15, 2023 at 9:25
\$\endgroup\$
2
  • 1
    \$\begingroup\$ Thats awesome. I did actually release a library npmjs.com/package/perl-transliterate I'm not 100% sure if i still have access to that git though... might be worth you raising a PR or creating your own library for it \$\endgroup\$ Commented May 17, 2023 at 8:24
  • \$\begingroup\$ Oh cool! Somehow I did not find that library when searching... it's unfortunate you don't have access to it anymore. Once I'm done using my implementation for my needs in a larger codebase (gets it a little battle testing in the process) I may go back and reimplement flags support and post it as a separate npm library, I'll update back here with a link whenever I do! \$\endgroup\$ Commented May 18, 2023 at 6:09

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.