I've been trying to rewrite the perl transliteration function in javascript. This isn't a complete replication, and I haven't looked at the perl source code... I took inspiration from this question for my source code.
some unit tests (not all):
describe('without flags', () => {
it('should produce a function that transliterates abcd to dcba when search is abcd and replacement is dcba', () => {
let text = 'abcd';
let search = 'abcd';
let replace = 'dcba';
let expected = 'dcba';
let actual = tr(text, search, replace);
expect(actual).to.be.equal(expected);
});
it('should produce a function that transliterates ruby to perl when search is bury and replacement is repl', () => {
let text = 'ruby';
let search = 'bury';
let replace = 'repl';
let expected = 'perl';
let actual = tr(text, search, replace);
expect(actual).to.be.equal(expected);
});
});
describe('with s flag', () => {
before(() => {
flags = 's';
});
it('should produce a function that transliterates abba to pop when search is ab and replacement is pop', () => {
let text = 'abba';
let search = 'ab';
let replace = 'po';
let expected = 'pop';
let actual = tr(text, search, replace, flags);
expect(actual).to.be.equal(expected);
});
});
});
describe('with d flag', () => {
before(() => {
flags = 'd';
});
it('should produce a function that transliterates abba to aa when search is b and replacement is null', () => {
let text = 'abba';
let search = 'b';
let replace = '';
let expected = 'aa';
let actual = tr(text, search, replace, flags);
expect(actual).to.be.equal(expected);
});
it('should produce a function that transliterates adam to eve when search is adm and replacement is ev', () => {
let text = 'adam';
let search = 'adm';
let replace = 'ev';
let expected = 'eve';
let actual = tr(text, search, replace, flags);
expect(actual).to.be.equal(expected);
});
});
describe('with ds flags', () => {
before(() => {
flags = 'ds';
});
it('should produce a function that transliterates abba to p when search is ab and replacement is p', () => {
let text = 'abba';
let search = 'ab';
let replace = 'p';
let expected = 'p';
let actual = tr(text, search, replace, flags);
expect(actual).to.be.equal(expected);
});
});
});
describe('characters that would need escaping e.g. "()[]{}..."', () => {
describe('without flags', () => {
it('should produce a function that transliterates ( to ) when search [({< is and replacement is ])}>', () => {
let text = '(';
let search = '[({<';
let replace = '])}>';
let expected = ')';
let actual = tr(text, search, replace);
expect(actual).to.be.equal(expected);
});
it('should produce a function that transliterates ()abc to [)qbc when search is (a and replacement is [q', () => {
let text = '()abc';
let search = '(a';
let replace = '[q';
let expected = '[)qbc';
let actual = tr(text, search, replace);
expect(actual).to.be.equal(expected);
});
});
describe('with s flag', () => {
beforeEach(() => {
flags = 's';
});
it('should produce a function that transliterates () to ( when search is [](){}<> and replacement is [[(({{<<', () => {
let text = '()';
let search = '[](){}<>';
let replace = '[[(({{<<';
let expected = '(';
let actual = tr(text, search, replace, flags);
expect(actual).to.be.equal(expected);
});
});
});
describe('with d flag', () => {
beforeEach(() => {
flags = 'd';
});
it('should produce a function that transliterates ()[] to ){} when search is []( and replacement is {}', () => {
let text = '()[]';
let search = '[](';
let replace = '{}';
let expected = '){}';
let actual = tr(text, search, replace, flags);
expect(actual).to.be.equal(expected);
});
});
describe('with ds flags', () => {
beforeEach(() => {
flags = 'ds';
});
it('should produce a function that transliterates ()a to (a when search is [](){}<> and replacement is [[(({{<<', () => {
let text = '()a';
let search = '[](){}<>';
let replace = '[[(({{<<';
let expected = '(a';
let actual = tr(text, search, replace, flags);
expect(actual).to.be.equal(expected);
});
it('should produce a function that transliterates ()a to ( when search is [](){}<>\0円-\377円 and replacement is [[(({{<<', () => {
let text = '()a';
let search = '[](){}<>\0円-\377円';
let replace = '[[(({{<<';
let expected = '(';
let actual = tr(text, search, replace, flags);
expect(actual).to.be.equal(expected);
});
});
And the actual function:
export function tr(text, search, replace, flags) {
let escapedObj = _charsNeedEscaping(search);
let escaped = escapedObj.replaced;
let escapedSearch = escapedObj.text;
let replacementRegex = new RegExp('[' + escapedSearch + ']', 'g');
let obj = {};
let pos = 0;
let t = text.replace(replacementRegex, function (chr) {
let r = '';
if (flags) {
if (flags.match(/ds/)) {
r = _dFlag(chr, pos, search, replace, obj, escaped);
if (r) {
let retDeets = _sFlag(chr, pos, search, replace, obj, escaped);
r = retDeets.r;
obj = retDeets.charKeeper;
pos = retDeets.pos;
}
} else if (flags.match(/s/)) {
let retDeets = _sFlag(chr, pos, search, replace, obj, escaped);
r = retDeets.r;
obj = retDeets.charKeeper;
pos = retDeets.pos;
}
else if (flags.match(/d/)) {
r = _dFlag(chr, pos, search, replace, obj, escaped);
}
} else {
let ind = search.indexOf(chr);
r = replace.charAt(ind);
if (r === '') {
r = replace.charAt(replace.length - 1);
}
}
return r;
});
return t;
}
function _dFlag(chr, pos, search, replace, obj) {
let r = '';
if (replace) {
let ind = search.indexOf(chr);
if (replace.length >= ind) {
r = replace.charAt(ind);
}
}
return r;
}
function _sFlag(chr, pos, search, replace, obj, escaped) {
let escapedChrDeets = _charsNeedEscaping(chr);
let escapedChr = escapedChrDeets.text;
let searchRegExp = new RegExp(escapedChr, 'y');
if (escaped) {
pos = search.indexOf(chr);
}
searchRegExp.lastIndex = pos;
let searchMatch = search.match(searchRegExp);
let r = '';
if (searchMatch) {
let searchChr = searchMatch[0];
if (searchChr in obj) {
r = replace.charAt(obj[searchChr]);
if (obj[searchChr]+1 === searchMatch.index) {
r = '';
}
} else {
let replacementIndex = searchMatch.index;
obj[searchChr] = replacementIndex;
r = replace.charAt(replacementIndex);
if (r === '') {
r = searchChr;
} else if (r === replace.charAt(replacementIndex-1)) {
r = '';
}
}
pos++;
} else {
r = replace.charAt(obj[chr]);
if (pos-1 === obj[chr]) {
r = '';
}
pos++;
}
return {
r: r,
pos: pos,
charKeeper: obj
};
}
function _charsNeedEscaping(src) {
let text = src;
let res = {
'text': text,
'replaced': false,
};
if (res.text.match(/\[/) ) {
res.text = res.text.replace(/\[/g, '\\[');
res.replaced = true;
}
if (text.match(/\]/) ) {
res.text = res.text.replace(/\]/g, '\\]');
res.replaced = true;
}
if (res.text.match(/\(/) ) {
res.text = res.text.replace(/\(/g, '\\(');
res.replaced = true;
}
if (text.match(/\)/) ) {
res.text = res.text.replace(/\)/g, '\\)');
res.replaced = true;
}
if (res.text.match(/\</) ) {
res.text = res.text.replace(/\</g, '\\<');
res.replaced = true;
}
if (text.match(/\>/) ) {
res.text = res.text.replace(/\>/g, '\\>');
res.replaced = true;
}
if (res.text.match(/\{/) ) {
res.text = res.text.replace(/\{/g, '\\{');
res.replaced = true;
}
if (text.match(/\}/) ) {
res.text = res.text.replace(/\}/g, '\\}');
res.replaced = true;
}
return res;
}
2 Answers 2
Good work with the unit tests! They made it much simpler to check if I broke something when reworking some of your logic.
I've never used Perl, so had to play around with a repl for a while in order to understand the tr
function. I may certainly still be missing some functionality here!
While playing with tr
, I noticed a couple discrepancies between your version and the built in version.
The
s
flag doesn't always squash replaced characters.my $string = '()aa))'; $string =~ tr/()/((/ds; print "$string\n"; # => (aa(
With
tr('()aa))', '()', ((', 'ds')
I receive(aa((
instead of the expected(aa(
.Squashing isn't greedy enough.
my $string = 'abbba'; $string =~ tr/b/a/s; print "$string\n"; # => aaa
With
tr('abbba', 'b', 'a', 's')
I receiveaaaa
.
With this in mind I added three new unit tests:
it('Should handle multiple locations for squashed characters', () => {
const expected = '(aa(';
const actual = tr('()aa))', '()', '((', 'ds');
expect(actual).to.equal(expected);
});
it('Should be greedy when squashing replaced characters', () => {
const expected = 'aaa';
const actual = tr('abbba', 'b', 'a', 's');
expect(actual).to.equal(expected);
});
it('Should handle squashing characters with multiple source characters', () => {
const expected = 'aaa';
const actual = tr('abccbba', 'bc', 'a', 's');
expect(actual).to.equal(expected);
});
Now, to your code!
In the unit tests, you have a lot of duplication in defining
text
,search
,replace
,expected
, andactual
in every test. I believe it helps the readability to droptext
,search
, andreplace
as I did in the new tests shown above. You could further reduce the amount of code by defining an array of tests that you loop through, but there are few enough tests that that may not be helpful here.I expected to be able to pass in
sd
ords
flags for the same effect. Order shouldn't matter here._sFlag
,_dFlag
and_charsNeedEscaping
don't tell me anything about what the function will do - try to be more descriptive in your variable names.obj
is even more cryptic. It's fine to use non-descriptive names in very short functions, but in longer functions, it makes the logic incredibly difficult to follow.You should lint your code, it helps find potential errors. ESLint points out a few problems with the default configuration.
_dFlag
'sobj
parameter isn't used.- You unnecessarily escape characters in the regular expressions in
_charsNeedEscaping
, this makes the regex harder to read.
The first thing that stuck out to me when reading through your code is that
_charsNeedEscaping
is much longer than it needs to be. You can use$&
in your replace string to refer to the match text. With this knowledge,_charsNeedEscaping
can be trivially written as a single replace statement. I have added a few missing regex special characters that were missing from your function. (Only-
and\
are left out)const escapeRequiredChars = s => s.replace(/[\/^$*+?.()|[\]{}]/g, '\\$&');
Don't unnecessarily quote object keys.
let res = { 'text': text, 'replaced': false, };
Is equivalent to:
let res = { text: text, replaced: false, };
In this case, since
text
is named the same as the property name, it can be further simplified to:let res = { text, replaced: false, };
There is a
str.includes
function. When possible, use it instead of/regex/.match
. When you just care if a string matches a regex, use/regex/.test
instead of/regex/.match
.Don't assign unnecessary variables,
t
in thetr
function is assigned then immediately returned. In_charsNeedEscaping
,text
is just an alias tosrc
, just renamesrc
totext
.Prefer
const
tolet
when possible - this makes it possible to immediately tell when a variable will be redefined and when it won't change.You might be interested in learning about destructuring, it could help with some of your object handling code.
With all this in mind, here is how I would implement tr
. It passes all of your provided tests and the three extra I wrote. I took a slightly different approach than your solution, though the general idea is the same.
const escapeRequiredChars = s => s.replace(/[\/^$*+?.()|[\]{}]/g, '\\$&');
function tr(text, search, replace, flags = '') {
const escaped = escapeRequiredChars(search);
let lastReplaceChar = '';
let lastReplaceEnd = 0;
return text.replace(new RegExp(`([${escaped}])\1円*`, 'g'), (chars, char, offset) => {
// Reset lastReplaceChar after passing something that hasn't been replaced
if (lastReplaceEnd < offset) {
lastReplaceChar = '';
}
lastReplaceEnd = offset + chars.length;
// Find replacement
const replaceIndex = search.indexOf(char);
let replacement = replace[replaceIndex];
if (!replacement) {
if (flags.includes('d')) return '';
replacement = replace[replace.length - 1] || char;
}
// Handle squashing when the squashed character has already been output.
if (lastReplaceChar == replacement && flags.includes('s')) {
return '';
}
lastReplaceChar = replacement;
const returnCount = flags.includes('s') ? 1 : chars.length;
return replacement.repeat(returnCount);
});
}
-
\$\begingroup\$ Thank you so much for the improvements. I'm still having a good read through of them, but this helps me a lot. \$\endgroup\$Jarede– Jarede2018年04月19日 09:27:35 +00:00Commented Apr 19, 2018 at 9:27
-
\$\begingroup\$ Ok, one failure from a test i didn't include: text = 'abba', search = 'b', replace = '', flags = 's'. should return
aba
, yours returnsaa
. I think on line 24, if the replacement is '', then you should return only one of the search characters. \$\endgroup\$Jarede– Jarede2018年04月19日 11:05:50 +00:00Commented Apr 19, 2018 at 11:05 -
1\$\begingroup\$ Good catch! I've updated the code to handle that test. \$\endgroup\$Gerrit0– Gerrit02018年04月19日 13:15:49 +00:00Commented Apr 19, 2018 at 13:15
5 years later, I too find myself looking for an equivalent for Perl's tr/
operator in JS, only to find the solutions here, but sadly with a severe lack of tests with ranges (like a-z
) and as a result @Gerrit0's final code indeed does not work with them:
A simple Perl sample:
perl -e '$s = "abc"; $s =~ tr/a-c/d-f/; print "$s\n";'
Yields def
, as expected.
Calling Gerrit0's solution with the equivalent:
tr('abc', 'a-c', 'd-f')
Yields ddf
, clearly wrong.
This happens because the solution only worried about ranges in the search, by not escaping the -
character, but when indexing the replacement ranges were not taken into account.
The line: const replaceIndex = search.indexOf(char);
...attempts to index b
from the search pattern a-c
, which naturally fails and returns -1
.
This cascades into: let replacement = replace[replaceIndex];
...getting undefined
, and then the fallback intended only for searches larger than the replacers gets triggered, which inserts the unexpected f
in place of the b
.
With this in mind I've created my own range-aware solution, also using TypeScript:
export function tr(str: string, search: string, replace: string): string {
const buildRangesMap = (from: string, searchToReplacementMode: boolean) => {
const ranges = new Map<number, { index: number, baseCharcode: number, size: number; }>();
let idxBias = 0;
from = from.replaceAll(/([^\\-]|(?<=[^\\](?:\\\\)*)\\[^-])-([^-])/ug, (match, start, end, idx) => {
const startCode = start.codePointAt(0)!;
const endCode = end.codePointAt(0)!;
if (startCode > endCode) throw new Error(`Invalid range specified: ${start}-${end} (${startCode}-${endCode})`);
const rangeSize = endCode - startCode + 1;
idx += idxBias;
idxBias += rangeSize - 3;
const key = searchToReplacementMode ? startCode : idx;
ranges.set(key, { index: idx, baseCharcode: startCode, size: rangeSize });
return String.fromCodePoint(0xF0000 + key);
});
let idx = 0;
for (const char of from) {
const code = char.codePointAt(0)!;
if (code >= 0xF0000) {
idx += ranges.get(searchToReplacementMode ? code - 0xF0000 : idx)!.size;
continue;
}
const key = searchToReplacementMode ? code : idx;
if (searchToReplacementMode && ranges.has(key)) {
idx++;
continue;
}
ranges.set(key, { index: idx, baseCharcode: code, size: 1 });
idx++;
}
return ranges;
};
const searchRanges = buildRangesMap(search, true);
const replaceRanges = buildRangesMap(replace, false);
const escapedSearch = search.replaceAll(/(?!-)[\/^$*+?.()|[\]{}]/ug, '\\$&');
return str.replaceAll(new RegExp(`[${escapedSearch}]`, 'ug'), (char) => {
// Find replacement
const charCode = char.codePointAt(0)!;
let searchRange = searchRanges.get(charCode);
if (!searchRange) {
for (const { index, baseCharcode, size } of searchRanges.values()) {
if (charCode >= baseCharcode && charCode < baseCharcode + size) {
searchRange = { index, baseCharcode, size };
break;
}
}
}
if (!searchRange) throw new Error(`Could not find range for ${charCode} in ${searchRanges}`);
const replaceIndexToFind = searchRange.index + (charCode - searchRange.baseCharcode);
let replaceRange = replaceRanges.get(replaceIndexToFind);
if (!replaceRange) {
for (const { index, baseCharcode, size } of replaceRanges.values()) {
if (replaceIndexToFind >= index && replaceIndexToFind < index + size) {
replaceRange = { index, baseCharcode, size };
break;
}
}
}
if (!replaceRange) return [...replace].at(-1) ?? char; // If replace is empty apparently we just keep the chars
const replaceCharCode = replaceIndexToFind - replaceRange.index + replaceRange.baseCharcode;
return String.fromCodePoint(replaceCharCode);
});
}
Disclaimer: Since for my needs I did not need any of the flags, and they seem trivially easy to implement externally as pre/post processing steps on the string, I've not included the flags functionality. For this reason I've also not tested this against all of the original tests on this question, only the ones with no flags, and included some tests of my own with a more barebones approach but still gets the job done:
function test(text: string, search: string, replace: string, expect: string) {
const out = tr(text, search, replace);
console.log(out, '===', expect, '->', out === expect);
}
// expected outputs taken directly from perl results of running the equivalent tr/ operation
test('abcd', 'abcd', 'dcba', 'dcba');
test('ruby', 'bury', 'repl', 'perl');
test('(', '[({<', '])}>', ')');
test('()abc', '(a', '[q', '[)qbc');
test('abcghi', 'a-cg-i', 'd-f', 'deffff');
test('10fedcba98', '01a-f89', '-+A-CD-F<>', '+-FEDCBA><');
test('10fedcba98', '01a-f89', '22A-F33', '22FEDCBA33');
test('10fedcba98', 'abccba', 'ABCDEF', '10fedCBA98');
test('0円abc\u{10FFFF}', '0円-\u{10FFFF}', 'x', 'xxxxx');
There is nothing platform-specific in the code, so it should work on any recent enough JS engine (as some fairly recent features are used like unicode regexes), but I'll include here for completion that I tested on Windows with Node.js 19.9.0.
As a final disclaimer, I did not test this very extensively against original Perl behavior around rarer quirks and edge cases, or a wide variety of possible inputs outside few basic ranges. I do not guarantee this to be 1:1 matching with Perl's tr/ on all scenarios, maybe it could be (ignoring uses of flags), but I won't claim to guarantee it because I can't know for sure. If anyone does use this and finds a mismatch between it and Perl, I'd be happy to hear about it in the comments here.
-
1\$\begingroup\$ Thats awesome. I did actually release a library npmjs.com/package/perl-transliterate I'm not 100% sure if i still have access to that git though... might be worth you raising a PR or creating your own library for it \$\endgroup\$Jarede– Jarede2023年05月17日 08:24:00 +00:00Commented May 17, 2023 at 8:24
-
\$\begingroup\$ Oh cool! Somehow I did not find that library when searching... it's unfortunate you don't have access to it anymore. Once I'm done using my implementation for my needs in a larger codebase (gets it a little battle testing in the process) I may go back and reimplement flags support and post it as a separate npm library, I'll update back here with a link whenever I do! \$\endgroup\$jhm2k– jhm2k2023年05月18日 06:09:58 +00:00Commented May 18, 2023 at 6:09
Explore related questions
See similar questions with these tags.