Commit 2615df9

Support simple_pattern and simple_pattern_split tokenizers (#5789) (#5791)

Authored by github-actions[bot] and stevejgordon

* Support simple pattern tokenizer
* Remove incorrect serializer attribute
* Support simple_pattern_split tokenizer

Co-authored-by: Steve Gordon <sgordon@hotmail.co.uk>

1 parent 27e1a18 commit 2615df9

5 files changed: +151 additions, −103 deletions
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System.Runtime.Serialization;

namespace Nest
{
	/// <summary>
	/// The simple_pattern_split tokenizer uses a regular expression to split the input into terms at pattern matches.
	/// </summary>
	public interface ISimplePatternSplitTokenizer : ITokenizer
	{
		/// <summary>
		/// Lucene regular expression, defaults to the empty string.
		/// </summary>
		[DataMember(Name = "pattern")]
		string Pattern { get; set; }
	}

	/// <inheritdoc />
	public class SimplePatternSplitTokenizer : TokenizerBase, ISimplePatternSplitTokenizer
	{
		public SimplePatternSplitTokenizer() => Type = "simple_pattern_split";

		/// <inheritdoc />
		public string Pattern { get; set; }
	}

	/// <inheritdoc />
	public class SimplePatternSplitTokenizerDescriptor
		: TokenizerDescriptorBase<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer>, ISimplePatternSplitTokenizer
	{
		protected override string Type => "simple_pattern_split";

		string ISimplePatternSplitTokenizer.Pattern { get; set; }

		/// <inheritdoc cref="ISimplePatternSplitTokenizer.Pattern" />
		public SimplePatternSplitTokenizerDescriptor Pattern(string pattern) => Assign(pattern, (a, v) => a.Pattern = v);
	}
}
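
For context, the initializer and descriptor pair above serialize to the same analysis JSON. A minimal usage sketch, assuming the types added in this commit; the "_" pattern value is illustrative, not part of the diff:

// Object-initializer form; serializes as { "type": "simple_pattern_split", "pattern": "_" }.
var tokenizer = new SimplePatternSplitTokenizer { Pattern = "_" };

// Equivalent fluent form; the descriptor implements ISimplePatternSplitTokenizer directly.
ISimplePatternSplitTokenizer fluent = new SimplePatternSplitTokenizerDescriptor().Pattern("_");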
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System.Runtime.Serialization;

namespace Nest
{
	/// <summary>
	/// The simple_pattern tokenizer uses a regular expression to capture matching text as terms.
	/// </summary>
	public interface ISimplePatternTokenizer : ITokenizer
	{
		/// <summary>
		/// Lucene regular expression, defaults to the empty string.
		/// </summary>
		[DataMember(Name = "pattern")]
		string Pattern { get; set; }
	}

	/// <inheritdoc />
	public class SimplePatternTokenizer : TokenizerBase, ISimplePatternTokenizer
	{
		public SimplePatternTokenizer() => Type = "simple_pattern";

		/// <inheritdoc />
		public string Pattern { get; set; }
	}

	/// <inheritdoc />
	public class SimplePatternTokenizerDescriptor
		: TokenizerDescriptorBase<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer>, ISimplePatternTokenizer
	{
		protected override string Type => "simple_pattern";

		string ISimplePatternTokenizer.Pattern { get; set; }

		/// <inheritdoc cref="ISimplePatternTokenizer.Pattern" />
		public SimplePatternTokenizerDescriptor Pattern(string pattern) => Assign(pattern, (a, v) => a.Pattern = v);
	}
}
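
The simple_pattern tokenizer is the inverse of simple_pattern_split: the regex matches themselves become the terms, rather than acting as separators. A hedged sketch; the three-digit pattern mirrors the Elasticsearch reference example and is not part of this commit:

// Keeps runs of three digits as terms; serializes as { "type": "simple_pattern", "pattern": "[0123456789]{3}" }.
var tokenizer = new SimplePatternTokenizer { Pattern = "[0123456789]{3}" };
// Against the text "fd-786-335-514-x", this would produce the terms "786", "335", "514".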

src/Nest/Analysis/Tokenizers/Tokenizers.cs

Lines changed: 8 additions & 0 deletions
@@ -132,5 +132,13 @@ public TokenizersDescriptor Nori(string name, Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
 		/// >
 		public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
 			Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
+
+		/// <inheritdoc cref="ISimplePatternTokenizer"/>
+		public TokenizersDescriptor SimplePattern(string name, Func<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer> selector) =>
+			Assign(name, selector?.Invoke(new SimplePatternTokenizerDescriptor()));
+
+		/// <inheritdoc cref="ISimplePatternSplitTokenizer"/>
+		public TokenizersDescriptor SimplePatternSplit(string name, Func<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer> selector) =>
+			Assign(name, selector?.Invoke(new SimplePatternSplitTokenizerDescriptor()));
 	}
 }
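
These descriptor methods hang off the analysis settings used at index creation. A sketch of how they would typically be wired up, assuming a connected ElasticClient; the index, tokenizer, and analyzer names ("my-index", "underscore_split", "underscore_analyzer") are illustrative, not part of this commit:

var client = new ElasticClient();

// Register the new tokenizer in index analysis settings and use it from a custom analyzer.
var createIndexResponse = client.Indices.Create("my-index", c => c
	.Settings(s => s
		.Analysis(a => a
			.Tokenizers(t => t
				.SimplePatternSplit("underscore_split", sp => sp.Pattern("_"))
			)
			.Analyzers(an => an
				.Custom("underscore_analyzer", ca => ca
					.Tokenizer("underscore_split")
					.Filters("lowercase")
				)
			)
		)
	)
);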

src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs

Lines changed: 8 additions & 1 deletion
@@ -104,8 +104,15 @@ public ITokenizer Nori(Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
 			selector.Invoke(new NoriTokenizerDescriptor());
 
 		/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
-		/// >
 		public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
 			selector?.Invoke(new CharGroupTokenizerDescriptor());
+
+		/// <inheritdoc cref="ISimplePatternTokenizer"/>
+		public ITokenizer SimplePattern(Func<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer> selector) =>
+			selector?.Invoke(new SimplePatternTokenizerDescriptor());
+
+		/// <inheritdoc cref="ISimplePatternSplitTokenizer"/>
+		public ITokenizer SimplePatternSplit(Func<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer> selector) =>
+			selector?.Invoke(new SimplePatternSplitTokenizerDescriptor());
 	}
 }
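
AnalyzeTokenizersDescriptor backs the _analyze API's inline tokenizer definition, so the new tokenizers can be tried without creating an index first. A sketch under the same assumptions as above; the text and pattern are illustrative:

// Exercise simple_pattern_split ad hoc via the analyze API.
var analyzeResponse = client.Indices.Analyze(an => an
	.Tokenizer(t => t.SimplePatternSplit(sp => sp.Pattern("_")))
	.Text("an_underscored_phrase")
);
// Expected tokens: "an", "underscored", "phrase"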

tests/Tests/Analysis/Tokenizers/TokenizerTests.cs

Lines changed: 53 additions & 102 deletions
@@ -22,18 +22,10 @@ public class EdgeNGramTests : TokenizerAssertionBase<EdgeNGramTests>
 
 		public override ITokenizer Initializer => new EdgeNGramTokenizer
 		{
-			MaxGram = 2,
-			MinGram = 1,
-			TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
+			MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
 		};
 
-		public override object Json => new
-		{
-			min_gram = 1,
-			max_gram = 2,
-			token_chars = new[] { "digit", "letter" },
-			type = "edge_ngram"
-		};
+		public override object Json => new { min_gram = 1, max_gram = 2, token_chars = new[] { "digit", "letter" }, type = "edge_ngram" };
 
 		public override string Name => "endgen";
 	}
@@ -50,10 +42,7 @@ public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCustomTokenCharsTests>
 
 		public override ITokenizer Initializer => new EdgeNGramTokenizer
 		{
-			MaxGram = 2,
-			MinGram = 1,
-			TokenChars = new[] { TokenChar.Custom },
-			CustomTokenChars = "+-_"
+			MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Custom }, CustomTokenChars = "+-_"
 		};
 
 		public override object Json => new
@@ -62,7 +51,7 @@ public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCustomTokenCharsTests>
 			max_gram = 2,
 			token_chars = new[] { "custom" },
 			custom_token_chars = "+-_",
-			type = "edge_ngram"
+			type = "edge_ngram"
 		};
 
 		public override string Name => "endgen_custom";
@@ -78,18 +67,10 @@ public class NGramTests : TokenizerAssertionBase<NGramTests>
 
 		public override ITokenizer Initializer => new NGramTokenizer
 		{
-			MaxGram = 2,
-			MinGram = 1,
-			TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
+			MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
 		};
 
-		public override object Json => new
-		{
-			min_gram = 1,
-			max_gram = 2,
-			token_chars = new[] { "digit", "letter" },
-			type = "ngram"
-		};
+		public override object Json => new { min_gram = 1, max_gram = 2, token_chars = new[] { "digit", "letter" }, type = "ngram" };
 
 		public override string Name => "ng";
 	}
@@ -106,10 +87,7 @@ public class NGramCustomTokenCharsTests : TokenizerAssertionBase<NGramCustomTokenCharsTests>
 
 		public override ITokenizer Initializer => new NGramTokenizer
 		{
-			MaxGram = 2,
-			MinGram = 1,
-			TokenChars = new[] { TokenChar.Custom },
-			CustomTokenChars = "+-_"
+			MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Custom }, CustomTokenChars = "+-_"
 		};
 
 		public override object Json => new
@@ -164,16 +142,9 @@ public class IcuTests : TokenizerAssertionBase<IcuTests>
 			.RuleFiles(RuleFiles)
 		);
 
-		public override ITokenizer Initializer => new IcuTokenizer
-		{
-			RuleFiles = RuleFiles,
-		};
+		public override ITokenizer Initializer => new IcuTokenizer { RuleFiles = RuleFiles, };
 
-		public override object Json => new
-		{
-			rule_files = RuleFiles,
-			type = "icu_tokenizer"
-		};
+		public override object Json => new { rule_files = RuleFiles, type = "icu_tokenizer" };
 
 		public override string Name => "icu";
 	}
@@ -198,7 +169,7 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
 			DiscardPunctuation = true,
 			NBestExamples = Example,
 			NBestCost = 1000,
-			UserDictionaryRules = new[] { Inline }
+			UserDictionaryRules = new[] { Inline }
 		};
 
 		public override object Json => new
@@ -208,7 +179,7 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
 			nbest_cost = 1000,
 			nbest_examples = Example,
 			type = "kuromoji_tokenizer",
-			user_dictionary_rules = new[] { Inline }
+			user_dictionary_rules = new[] { Inline }
 		};
 
 		public override string Name => "kuro";
@@ -228,18 +199,9 @@ public class KuromojiDiscardCompoundTokenTests : TokenizerAssertionBase<KuromojiDiscardCompoundTokenTests>
 			.DiscardCompoundToken()
 		);
 
-		public override ITokenizer Initializer => new KuromojiTokenizer
-		{
-			Mode = KuromojiTokenizationMode.Search,
-			DiscardCompoundToken = true,
-		};
+		public override ITokenizer Initializer => new KuromojiTokenizer { Mode = KuromojiTokenizationMode.Search, DiscardCompoundToken = true, };
 
-		public override object Json => new
-		{
-			discard_compound_token = true,
-			mode = "search",
-			type = "kuromoji_tokenizer",
-		};
+		public override object Json => new { discard_compound_token = true, mode = "search", type = "kuromoji_tokenizer", };
 
 		public override string Name => "kuro_discard_compound_token";
 	}
@@ -252,11 +214,7 @@ public class UaxTests : TokenizerAssertionBase<UaxTests>
 
 		public override ITokenizer Initializer => new UaxEmailUrlTokenizer { MaxTokenLength = 12 };
 
-		public override object Json => new
-		{
-			max_token_length = 12,
-			type = "uax_url_email"
-		};
+		public override object Json => new { max_token_length = 12, type = "uax_url_email" };
 
 		public override string Name => "uax";
 	}
@@ -269,20 +227,9 @@ public class PatternTests : TokenizerAssertionBase<PatternTests>
 			.Pattern(@"\W+")
 		);
 
-		public override ITokenizer Initializer => new PatternTokenizer
-		{
-			Flags = "CASE_INSENSITIVE",
-			Group = 1,
-			Pattern = @"\W+"
-		};
+		public override ITokenizer Initializer => new PatternTokenizer { Flags = "CASE_INSENSITIVE", Group = 1, Pattern = @"\W+" };
 
-		public override object Json => new
-		{
-			pattern = @"\W+",
-			flags = "CASE_INSENSITIVE",
-			group = 1,
-			type = "pattern"
-		};
+		public override object Json => new { pattern = @"\W+", flags = "CASE_INSENSITIVE", group = 1, type = "pattern" };
 
 		public override string Name => "pat";
 	}
@@ -312,10 +259,7 @@ public class NoriTests : TokenizerAssertionBase<NoriTests>
 			.DecompoundMode(NoriDecompoundMode.Mixed)
 		);
 
-		public override ITokenizer Initializer => new NoriTokenizer
-		{
-			DecompoundMode = NoriDecompoundMode.Mixed
-		};
+		public override ITokenizer Initializer => new NoriTokenizer { DecompoundMode = NoriDecompoundMode.Mixed };
 
 		public override object Json => new { type = "nori_tokenizer", decompound_mode = "mixed" };
 		public override string Name => "nori";
@@ -331,16 +275,14 @@ public class NoriWithUserDictionaryTests : TokenizerAssertionBase<NoriWithUserDictionaryTests>
 
 		public override ITokenizer Initializer => new NoriTokenizer
 		{
-			DecompoundMode = NoriDecompoundMode.Mixed,
-			UserDictionaryRules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
+			DecompoundMode = NoriDecompoundMode.Mixed, UserDictionaryRules = new[] { "c++", "C샤프", "세종", "세종시 세종 시" }
 		};
 
 		public override object Json => new
 		{
-			type = "nori_tokenizer",
-			decompound_mode = "mixed",
-			user_dictionary_rules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
+			type = "nori_tokenizer", decompound_mode = "mixed", user_dictionary_rules = new[] { "c++", "C샤프", "세종", "세종시 세종 시" }
 		};
+
 		public override string Name => "nori_userdictionary";
 	}
 
@@ -353,16 +295,9 @@ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
 			.TokenizeOnCharacters(_chars)
 		);
 
-		public override ITokenizer Initializer => new CharGroupTokenizer
-		{
-			TokenizeOnCharacters = _chars
-		};
+		public override ITokenizer Initializer => new CharGroupTokenizer { TokenizeOnCharacters = _chars };
 
-		public override object Json => new
-		{
-			tokenize_on_chars = _chars,
-			type = "char_group"
-		};
+		public override object Json => new { tokenize_on_chars = _chars, type = "char_group" };
 
 		public override string Name => "char_group";
 	}
@@ -377,18 +312,9 @@ public class CharGroupMaxTokenLengthTests : TokenizerAssertionBase<CharGroupMaxTokenLengthTests>
 			.MaxTokenLength(255)
 		);
 
-		public override ITokenizer Initializer => new CharGroupTokenizer
-		{
-			TokenizeOnCharacters = _chars,
-			MaxTokenLength = 255
-		};
+		public override ITokenizer Initializer => new CharGroupTokenizer { TokenizeOnCharacters = _chars, MaxTokenLength = 255 };
 
-		public override object Json => new
-		{
-			tokenize_on_chars = _chars,
-			type = "char_group",
-			max_token_length = 255
-		};
+		public override object Json => new { tokenize_on_chars = _chars, type = "char_group", max_token_length = 255 };
 
 		public override string Name => "char_group_max_token_length";
 	}
@@ -400,13 +326,38 @@ public class DiscardPunctuationTests : TokenizerAssertionBase<DiscardPunctuationTests>
 			.DiscardPunctuation()
 		);
 
-		public override ITokenizer Initializer => new NoriTokenizer
-		{
-			DiscardPunctuation = true
-		};
+		public override ITokenizer Initializer => new NoriTokenizer { DiscardPunctuation = true };
 
 		public override object Json => new { type = "nori_tokenizer", discard_punctuation = true };
 		public override string Name => "nori-discard";
 	}
+
+	[SkipVersion("<7.7.0", "simple_pattern experimental until 7.7.0")]
+	public class SimplePatternTests : TokenizerAssertionBase<SimplePatternTests>
+	{
+		public override FuncTokenizer Fluent => (n, t) => t.SimplePattern(n, e => e
+			.Pattern(@"\W+")
+		);
+
+		public override ITokenizer Initializer => new SimplePatternTokenizer { Pattern = @"\W+" };
+
+		public override object Json => new { pattern = @"\W+", type = "simple_pattern" };
+
+		public override string Name => "simple-pattern";
+	}
+
+	[SkipVersion("<7.7.0", "simple_pattern_split experimental until 7.7.0")]
+	public class SimplePatternSplitTests : TokenizerAssertionBase<SimplePatternSplitTests>
+	{
+		public override FuncTokenizer Fluent => (n, t) => t.SimplePatternSplit(n, e => e
+			.Pattern(@"\W+")
+		);
+
+		public override ITokenizer Initializer => new SimplePatternSplitTokenizer { Pattern = @"\W+" };
+
+		public override object Json => new { pattern = @"\W+", type = "simple_pattern_split" };
+
+		public override string Name => "simple-pattern-split";
+	}
 }
 }

0 commit comments