VB.NET has a Like
operator with a paradigm similar to the standard SQL LIKE
expression (SQL Server, SQLite, MySQL), but with a different syntax:
Dim result As Boolean = "abcd" Like "a*"
It supports:
- multiple-character matches --
"abcd" Like "a*"
- single-character matches --
"abcd" Like "a??d"
- digit matches --
"ab12cd" Like "ab##cd"
- alternate character matches --
"abcd" Like "[aeiou]bcd"
- ranges within the alternate character matches --
"abcd" Like "[a-n]bcd"
- negation of alternate characters --
"abcd" Like "[!e-z]bcd"
I've written code that tokenizes these patterns into a data structure, using the OneOf library which provides discriminated unions for C#. (The ultimate purpose is to convert the VB Like syntax to SQL LIKE syntax.)
I'm looking for feedback in the following areas:
- correctness, even in edge cases
- clarity / readability
The function returns a List<T>; each element in the list represents a token that consists of one of the following:
- a char
- a Wildcards enum value (defined below)
- an instance of PatternGroup -- an alternates grouping
Each alternates grouping contains one or more ranges, represented using a value tuple; a single character is represented as a range with the same start and end character.
using OneOf;
using System;
using System.Collections.Generic;
namespace Shared {
/// <summary>
/// Wildcard tokens produced when tokenizing a VB Like pattern.
/// </summary>
public enum Wildcards {
    /// <summary>Emitted for <c>?</c> in a VB Like pattern.</summary>
    SingleCharacter = 0,

    /// <summary>Emitted for <c>*</c> in a VB Like pattern.</summary>
    MultipleCharacter = 1
}
/// <summary>
/// An alternates grouping: a list of inclusive character ranges.
/// A single character within a group is represented by a range whose start and end are the same character.
/// </summary>
public class PatternGroup : List<(char start, char end)> {
    /// <summary>
    /// <c>true</c> for a group that lists the accepted characters;
    /// <c>false</c> for a negated group.
    /// </summary>
    public bool IsPositive { get; set; }

    /// <summary>Creates an empty group; groups are positive unless stated otherwise.</summary>
    public PatternGroup(bool isPositive = true) {
        IsPositive = isPositive;
    }
}
/// <summary>
/// A tokenized VB Like pattern. Each element is one of: a literal <see cref="char"/>,
/// a <see cref="Wildcards"/> value, or a <see cref="PatternGroup"/>.
/// </summary>
public class LikePattern : List<OneOfBase<char, Wildcards, PatternGroup>> {
    /// <summary>
    /// Tokenizes a VB Like pattern.
    /// </summary>
    /// <remarks>
    /// Outside of brackets: <c>?</c> and <c>*</c> become wildcards, <c>#</c> becomes a positive
    /// group holding the single range 0-9, <c>[</c> opens a group (a directly following <c>!</c>
    /// negates it), and every other character is a literal.
    /// Inside brackets: <c>]</c> closes the group; <c>x-y</c> becomes a range unless the character
    /// after the <c>-</c> is <c>]</c>; every other character (including a <c>-</c> that cannot
    /// form a range) becomes a single-character range.
    /// </remarks>
    /// <param name="pattern">The VB Like pattern to tokenize.</param>
    /// <returns>The tokens in pattern order; empty for an empty pattern.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// A group is never closed, or the pattern ends in the middle of a range (e.g. <c>[a-</c>).
    /// </exception>
    public static LikePattern ParseVBLike(string pattern) {
        // Fail with a descriptive exception instead of a NullReferenceException on pattern.Length.
        if (pattern == null) {
            throw new ArgumentNullException(nameof(pattern));
        }

        var ret = new LikePattern();
        int pos = -1;
        int lastPos = pattern.Length - 1;
        PatternGroup currentGroup = null; // non-null while we are between '[' and ']'
        char ch = '\x0';

        while (pos < lastPos) {
            advanceChar();

            if (currentGroup != null) {
                parseWithinGroup();
                continue;
            }

            switch (ch) {
                case '?':
                    ret.Add(Wildcards.SingleCharacter);
                    break;
                case '*':
                    ret.Add(Wildcards.MultipleCharacter);
                    break;
                case '#':
                    // A digit placeholder is just a positive group with the range 0-9.
                    ret.Add(new PatternGroup { ('0', '9') });
                    break;
                case '[':
                    currentGroup = new PatternGroup();
                    if (nextChar() == '!') {
                        advanceChar();
                        currentGroup.IsPositive = false;
                    }
                    break;
                default:
                    ret.Add(ch);
                    break;
            }
        }

        if (currentGroup != null) {
            throw new ArgumentException("Missing group end.");
        }
        return ret;

        // Handles the current character while inside a bracketed group.
        void parseWithinGroup() {
            if (ch == ']') {
                ret.Add(currentGroup);
                currentGroup = null;
            } else if (nextChar() == '-' && nextChar(2) != ']') {
                var start = ch;
                advanceChar(); // move onto the '-'
                advanceChar(); // move onto the end of the range; throws if the pattern ends here
                currentGroup.Add((start, ch));
            } else {
                // Plain character, or a '-' in a position where it cannot form a range.
                currentGroup.Add((ch, ch));
            }
        }

        // Moves to the next character and loads it into ch.
        void advanceChar() {
            pos += 1;
            if (pos > lastPos) {
                throw new ArgumentException("Unexpected end of text");
            }
            ch = pattern[pos];
        }

        // Peeks ahead without consuming; yields '\x0' when peeking past the end of the pattern.
        char nextChar(int offset = 1) => pos + offset > lastPos ? '\x0' : pattern[pos + offset];
    }
}
}
Usage
Usage looks like this (source of functions, usage):
// Returns the text to emit into a SQL LIKE pattern for a literal character:
// '%', '_' and '[' are wrapped in square brackets, anything else is returned unchanged.
private static string MapSqlSpecialCharacters(char ch) {
    bool needsBrackets = ch == '%' || ch == '_' || ch == '[';
    if (!needsBrackets) {
        return ch.ToString();
    }
    return "[" + ch + "]";
}
// Renders a tokenized VB Like pattern as a SQL LIKE pattern:
//  - literal characters are passed through MapSqlSpecialCharacters,
//  - Wildcards.SingleCharacter becomes '_' and any other wildcard becomes '%',
//  - a group becomes a bracketed character set, prefixed with '^' when the group is negated.
// NOTE(review): Joined is an extension method defined outside this snippet; it presumably
// concatenates the projected strings using the given separator -- confirm.
// NOTE(review): bracketed character sets are SQL Server LIKE syntax; confirm the target
// dialect supports them before relying on the group translation.
public static string GetSQLLike(LikePattern pattern) => pattern.Joined("", x => x.Match(
    ch => MapSqlSpecialCharacters(ch),
    wildcard => (wildcard == Wildcards.SingleCharacter ? '_' : '%').ToString(),
    patternGroup => {
        // '^' marks a *negated* set, so it belongs to groups that are not positive.
        string opening = patternGroup.IsPositive ? "[" : "[^";
        var ranges = patternGroup.Select(range =>
            range.start == range.end ?
                $"{range.start}" :
                $"{range.start}-{range.end}"
        );
        // The set must be enclosed in brackets; otherwise SQL reads it as literal text.
        return opening + String.Join("", ranges) + "]";
    }
));
// Two-step conversion: tokenize the VB Like pattern, then render the tokens as a SQL LIKE pattern.
// oldPattern is the VB Like pattern string; it is not defined in this snippet.
var tokenized = ParseVBLike(oldPattern);
var newPattern = GetSQLLike(tokenized);
The following XUnit test passes:
[Fact]
public void Test1() {
    // Checks the polarity of a group and that it holds exactly the one expected range.
    void AssertSingleRange(PatternGroup group, bool expectPositive, char first, char last) {
        Assert.Equal(expectPositive, group.IsPositive);
        Assert.Equal(
            new List<(char, char)> { (first, last) },
            group.ToList()
        );
    }

    // literal, positive range, digit placeholder, negated range
    var tokens = ParseVBLike("a[L-P]#[!c-e]");

    Assert.Collection(
        tokens,
        token => Assert.Equal('a', token),
        token => AssertSingleRange(token.AsT2, true, 'L', 'P'),
        token => AssertSingleRange(token.AsT2, true, '0', '9'),
        token => AssertSingleRange(token.AsT2, false, 'c', 'e')
    );
}
as does this one:
[Fact]
public void TestWithAsterisk() {
    // A '*' between two literals must yield: literal, multi-character wildcard, literal.
    var tokens = ParseVBLike("a*b");

    Assert.Collection(
        tokens,
        first => Assert.Equal('a', first),
        middle => Assert.Equal(Wildcards.MultipleCharacter, middle),
        last => Assert.Equal('b', last)
    );
}
-
Comments are not for extended discussion; this conversation has been moved to chat. – Vogel612, Aug 27, 2019 at 21:41
1 Answer 1
IMO the use of OneOfBase
as a kind of imitation of discriminated unions makes a lot of noise in your code and makes it more difficult to read than it has to be.
Instead I would define a more traditional class structure, where each class holds information about each pattern or token type:
// Base class for a single token of a parsed Like pattern.
// Tokens are chained through Next so that a token can hand matching over to its successor.
abstract class LikeToken
{
// Tries to match this token against value, beginning at index start.
// start is passed by reference so an implementation can advance it past what it consumed
// (SingleChar does this before delegating to Next).
public abstract bool Match(string value, ref int start);
// Returns this token written in SQL LIKE syntax.
public abstract string ToSQL();
// Returns this token written in VB Like syntax.
public abstract string ToVB();
// The token that follows this one in the pattern.
// NOTE(review): presumably null for the last token -- confirm once the list is wired up.
internal LikeToken Next { get; set; }
}
// Any constant (literal) char, e.g. 'a'
class SingleChar : LikeToken
{
    public SingleChar(char value)
    {
        Value = value;
    }

    // The literal character this token stands for.
    public char Value { get; }

    // Matches when value[start] equals Value; on success start is advanced by one and
    // the rest of the input is handed to the next token.
    public override bool Match(string value, ref int start)
    {
        if (start < value.Length && value[start] == Value)
        {
            start++;
            // The last token has no successor: the overall match then succeeds only if
            // the whole input has been consumed (avoids dereferencing a null Next).
            return Next == null ? start == value.Length : Next.Match(value, ref start);
        }
        return false;
    }

    // In SQL LIKE the characters %, _ and [ are special; enclosing them in brackets makes them literal.
    public override string ToSQL()
    {
        switch (Value)
        {
            case '%':
            case '_':
            case '[':
                return "[" + Value + "]";
            default:
                return Value.ToString();
        }
    }

    // In VB Like the characters ?, *, # and [ are special; enclosing them in brackets makes them literal.
    public override string ToVB()
    {
        switch (Value)
        {
            case '?':
            case '*':
            case '#':
            case '[':
                return "[" + Value + "]";
            default:
                return Value.ToString();
        }
    }
}
// Token for the VB '?' wildcard (LikePattern.Parse creates one for every '?').
// Stub: it does not override the abstract members of LikeToken yet, so it will not compile as-is.
class AnySingleChar : LikeToken
{
// TODO implement the behavior
}
// Token for the VB '*' wildcard (LikePattern.Parse creates one for every '*').
// Stub: it does not override the abstract members of LikeToken yet, so it will not compile as-is.
class ZeroOrMoreChars : LikeToken
{
// TODO implement the behavior
}
// Token for a single digit 0 - 9 (LikePattern.Parse creates one for every '#').
// Stub: it does not override the abstract members of LikeToken yet, so it will not compile as-is.
class DigitChar : LikeToken
{
// TODO implement the behavior
}
// Token for a bracketed character list: [a-z0-9], or negated [^a-z0-9] in SQL syntax.
// NOTE(review): VB Like writes negation as '!' ([!a-z0-9]) -- the parser for each dialect has to account for that.
// Stub: it does not override the abstract members of LikeToken yet, so it will not compile as-is.
class CharList : LikeToken
{
// charList is the raw list text; LikePattern.Parse passes it including the surrounding brackets.
// NOTE(review): the argument is currently ignored -- nothing is parsed or stored.
public CharList(string charList)
{
}
// Whether the list is a positive (non-negated) one.
// NOTE(review): never assigned at the moment, so it always has the default value false.
public bool IsPositive { get; private set; }
// TODO implement the behavior
}
As shown above, the Match()
method can be used to match each char in the string under test, and it works independently of whether the pattern was originally a VB or a SQL pattern.
The methods ToSQL()
and ToVB()
should be used to reconstruct the pattern and hence act as a conversion mechanism between the two languages.
The Next
member can be used to chain the tokens in a linked list, which can be useful in the match process because some of the patterns have to look ahead to find the optimal match - but you surely already have a design for that.
Parsing the pattern could then for VB be something like:
// A parsed Like pattern: a chain of LikeToken instances linked through their Next member.
public class LikePattern
{
    // First token of the chain; stays null when the pattern was built from an empty token list.
    private readonly LikeToken head;

    // Links the tokens into a singly linked list (in list order) and remembers the first one.
    internal LikePattern(IReadOnlyList<LikeToken> tokens)
    {
        if (tokens == null)
        {
            throw new ArgumentNullException(nameof(tokens));
        }

        for (int i = 0; i < tokens.Count; i++)
        {
            // The last token gets no successor.
            tokens[i].Next = i + 1 < tokens.Count ? tokens[i + 1] : null;
        }

        if (tokens.Count > 0)
        {
            head = tokens[0];
        }
    }

    // Information about the last failed match.
    // NOTE(review): nothing in this class assigns it yet.
    public string Message { get; private set; }

    // Tests value against this pattern.
    public bool Match(string value)
    {
        // TODO Implement (presumably by starting the token chain at head with index 0 -- confirm).
        throw new NotImplementedException();
    }

    // Parses a VB Like pattern into a LikePattern.
    // Throws ArgumentException for a null or empty pattern, and InvalidOperationException
    // when a '[' is never closed by a ']'.
    public static LikePattern Parse(string pattern)
    {
        if (string.IsNullOrEmpty(pattern))
        {
            throw new ArgumentException("Can not be an empty string", nameof(pattern));
        }

        List<LikeToken> tokens = new List<LikeToken>();
        int index = 0;

        while (index < pattern.Length)
        {
            char current = pattern[index];

            switch (current)
            {
                case '?':
                    tokens.Add(new AnySingleChar());
                    break;
                case '*':
                    tokens.Add(new ZeroOrMoreChars());
                    break;
                case '#':
                    tokens.Add(new DigitChar());
                    break;
                case '[':
                    {
                        // Scan forward to the closing bracket; the char list is passed on including both brackets.
                        int start = index;
                        while (index < pattern.Length && pattern[index] != ']')
                        {
                            index++;
                        }
                        if (index >= pattern.Length)
                        {
                            throw new InvalidOperationException("Missing a closing square bracket for last char list");
                        }
                        tokens.Add(new CharList(pattern.Substring(start, index - start + 1)));
                        break;
                    }
                default:
                    tokens.Add(new SingleChar(current));
                    break;
            }

            index++;
        }

        return new LikePattern(tokens);
    }
}
And a similar method could easily be made for SQL.
Some convenient extension methods could be:
// Convenience extensions so a string can be tested against a Like pattern directly.
public static class LikeExtensions
{
    // Parses pattern, matches value against it and reports the outcome.
    // message receives the pattern's Message when the match fails and an empty string otherwise.
    public static bool Like(this string value, string pattern, out string message)
    {
        LikePattern parsed = LikePattern.Parse(pattern);
        bool isMatch = parsed.Match(value);
        message = isMatch ? "" : parsed.Message;
        return isMatch;
    }

    // Matches value against an already parsed pattern.
    public static bool Like(this string value, LikePattern pattern) => pattern.Match(value);
}
And usage:
// Parse the pattern once, then test a value against it through the extension method.
// "[!e-z]bcd" accepts any first character outside e-z followed by "bcd", so "abcd" is
// expected to match once the token behavior is implemented.
LikePattern likePattern = LikePattern.Parse("[!e-z]bcd");
string value = "abcd";
bool result = value.Like(likePattern);