// Demo: for every byte budget from 0 up to the full UTF-8 size of Input, print the budget,
// the truncated string in quotes, and whether that result is still an ordinal prefix of Input.
const string Input = "a\u0304\u0308bc\u0327";
var bytes = Encoding.UTF8.GetByteCount(Input);
Console.WriteLine("{0} ({1} bytes in UTF-8)", Input, bytes);
for (var i = 0; i <= bytes; i++)
{
    var result = Input.OfMaxBytes(i);
    // Ordinal comparison: the check is on the raw characters, with no culture-sensitive
    // treatment of the combining marks in Input.
    Console.WriteLine("{0} \"{1}\" {2}", i, result, Input.StartsWith(result, StringComparison.Ordinal));
}
ā̈bç (9 bytes in UTF-8) 0 "" True 1 "a" True 2 "ab" False 3 "ā" True 4 "āb" False 5 "ā̈" True 6 "ā̈b" True 7 "ā̈bc" True 8 "ā̈bc" True 9 "ā̈bç" True
ā̈bç (9 bytes in UTF-8) 0 "" True 1 "" True 2 "" True 3 "" True 4 "" True 5 "ā̈" True 6 "ā̈b" True 7 "ā̈b" True 8 "ā̈b" True 9 "ā̈bç" True
// Print the sample text with its UTF-8 size, then the truncation produced
// for each byte budget from 0 up to that size.
const string Input = "a\u0304\u0308bc\u0327";
var totalBytes = Encoding.UTF8.GetByteCount(Input);
Console.WriteLine("{0} ({1} bytes in UTF-8)", Input, totalBytes);
var budget = 0;
while (budget <= totalBytes)
{
    var truncated = Input.OfMaxBytes(budget);
    Console.WriteLine("{0} {1}", budget, truncated);
    budget++;
}
ā̈bç (9 bytes in UTF-8) 0 1 a 2 ab 3 ā 4 āb 5 ā̈ 6 ā̈b 7 ā̈bc 8 ā̈bc 9 ā̈bç
ā̈bç (9 bytes in UTF-8) 0 1 2 3 4 5 ā̈ 6 ā̈b 7 ā̈b 8 ā̈b 9 ā̈bç
// For each byte budget 0..size, print the budget, the quoted truncation,
// and whether the truncation is an ordinal prefix of the sample text.
const string Input = "a\u0304\u0308bc\u0327";
var totalBytes = Encoding.UTF8.GetByteCount(Input);
Console.WriteLine("{0} ({1} bytes in UTF-8)", Input, totalBytes);
for (var limit = 0; limit <= totalBytes; limit++)
{
    var prefix = Input.OfMaxBytes(limit);
    var isPrefix = Input.StartsWith(prefix, StringComparison.Ordinal);
    Console.WriteLine("{0} \"{1}\" {2}", limit, prefix, isPrefix);
}
ā̈bç (9 bytes in UTF-8) 0 "" True 1 "a" True 2 "ab" False 3 "ā" True 4 "āb" False 5 "ā̈" True 6 "ā̈b" True 7 "ā̈bc" True 8 "ā̈bc" True 9 "ā̈bç" True
ā̈bç (9 bytes in UTF-8) 0 "" True 1 "" True 2 "" True 3 "" True 4 "" True 5 "ā̈" True 6 "ā̈b" True 7 "ā̈b" True 8 "ā̈b" True 9 "ā̈bç" True
It depends what you mean by correct. Consider this program:
// Sample text: "a" with two combining marks, "b", then "c" with a combining cedilla.
const string Input = "a\u0304\u0308bc\u0327";
var utf8Length = Encoding.UTF8.GetByteCount(Input);
Console.WriteLine("{0} ({1} bytes in UTF-8)", Input, utf8Length);

// Show the truncation produced for every byte limit from 0 through the full length.
for (var limit = 0; limit <= utf8Length; ++limit)
{
    var shortened = Input.OfMaxBytes(limit);
    Console.WriteLine("{0} {1}", limit, shortened);
}
Here is what your solution gives:
ā̈bç (9 bytes in UTF-8) 0 1 a 2 ab 3 ā 4 āb 5 ā̈ 6 ā̈b 7 ā̈bc 8 ā̈bc 9 ā̈bç
Granted, you might not come across such an input very often, but I don't think that's the result you want.
Two other points:
- You are building up a lot of intermediate strings. When you catch yourself doing that, see if you can use a `StringBuilder` instead.
- You are iterating through the entire string, regardless of the value of `maxBytes`.
Here is what I would suggest:
/// <summary>
/// Returns the leading text elements of <paramref name="input"/> whose combined UTF-8
/// size does not exceed <paramref name="maxBytes"/>. A text element is never split.
/// </summary>
/// <param name="input">The string to truncate; may be null.</param>
/// <param name="maxBytes">The UTF-8 byte budget for the result.</param>
/// <returns>
/// An empty string when <paramref name="input"/> is null or empty or the budget is 0;
/// <paramref name="input"/> itself when it fits entirely; otherwise the fitting prefix.
/// </returns>
public static string OfMaxBytes(this string input, int maxBytes)
{
    if (string.IsNullOrEmpty(input) || maxBytes == 0)
    {
        return string.Empty;
    }

    var utf8 = Encoding.UTF8;

    // Fast path: the whole string already fits, so nothing needs to be rebuilt.
    if (utf8.GetByteCount(input) <= maxBytes)
    {
        return input;
    }

    var kept = new StringBuilder();
    var elements = StringInfo.GetTextElementEnumerator(input);
    for (var used = 0; elements.MoveNext();)
    {
        var element = elements.GetTextElement();
        used += utf8.GetByteCount(element);
        if (used > maxBytes)
        {
            // The first element that overflows the budget ends the prefix.
            break;
        }

        kept.Append(element);
    }

    return kept.ToString();
}
Which gives this output:
ā̈bç (9 bytes in UTF-8) 0 1 2 3 4 5 ā̈ 6 ā̈b 7 ā̈b 8 ā̈b 9 ā̈bç
It depends what you mean by correct. Consider this program:
// Print the sample text with its UTF-8 size, then one line per byte budget
// showing what OfMaxBytes returns for that budget.
const string Input = "a\u0304\u0308bc\u0327";
var totalBytes = Encoding.UTF8.GetByteCount(Input);
Console.WriteLine("{0} ({1} bytes in UTF-8)", Input, totalBytes);
var budget = 0;
while (budget <= totalBytes)
{
    var truncated = Input.OfMaxBytes(budget);
    Console.WriteLine("{0} {1}", budget, truncated);
    budget++;
}
Here is what your solution gives:
ā̈bç (9 bytes in UTF-8) 0 1 a 2 ab 3 ā 4 āb 5 ā̈ 6 ā̈b 7 ā̈bc 8 ā̈bc 9 ā̈bç
Granted, you might not come across such an input very often, but I don't think that's the result you want.
Two other points:
- You are building up a lot of intermediate strings. When you catch yourself doing that, see if you can use a `StringBuilder` instead.
- You are iterating through the entire string, regardless of the value of `maxBytes`.
Here is what I would suggest:
/// <summary>
/// Truncates <paramref name="input"/> to the longest run of leading text elements
/// whose UTF-8 encoding fits within <paramref name="maxBytes"/> bytes.
/// </summary>
/// <param name="input">The string to truncate; may be null.</param>
/// <param name="maxBytes">The UTF-8 byte budget for the result.</param>
/// <returns>
/// An empty string when <paramref name="input"/> is null or empty or the budget is 0;
/// otherwise the text elements that fit, in their original order.
/// </returns>
public static string OfMaxBytes(this string input, int maxBytes)
{
    if (string.IsNullOrEmpty(input) || maxBytes == 0)
    {
        return string.Empty;
    }

    var output = new StringBuilder();
    var walker = StringInfo.GetTextElementEnumerator(input);
    var total = 0;
    while (walker.MoveNext())
    {
        var chunk = walker.GetTextElement();
        total += Encoding.UTF8.GetByteCount(chunk);

        // Stop at the first text element that would push the result past the budget.
        if (total > maxBytes)
        {
            break;
        }

        output.Append(chunk);
    }

    return output.ToString();
}
Which gives this output:
ā̈bç (9 bytes in UTF-8) 0 1 2 3 4 5 ā̈ 6 ā̈b 7 ā̈b 8 ā̈b 9 ā̈bç
It depends what you mean by correct. Consider this program:
// Sample text: "a" with two combining marks, "b", then "c" with a combining cedilla.
const string Input = "a\u0304\u0308bc\u0327";
var utf8Length = Encoding.UTF8.GetByteCount(Input);
Console.WriteLine("{0} ({1} bytes in UTF-8)", Input, utf8Length);

// Show the truncation produced for every byte limit from 0 through the full length.
for (var limit = 0; limit <= utf8Length; ++limit)
{
    var shortened = Input.OfMaxBytes(limit);
    Console.WriteLine("{0} {1}", limit, shortened);
}
Here is what your solution gives:
ā̈bç (9 bytes in UTF-8) 0 1 a 2 ab 3 ā 4 āb 5 ā̈ 6 ā̈b 7 ā̈bc 8 ā̈bc 9 ā̈bç
Granted, you might not come across such an input very often, but I don't think that's the result you want.
Two other points:
- You are building up a lot of intermediate strings. When you catch yourself doing that, see if you can use a `StringBuilder` instead.
- You are iterating through the entire string, regardless of the value of `maxBytes`.
Here is what I would suggest:
/// <summary>
/// Cuts <paramref name="input"/> down to at most <paramref name="maxBytes"/> bytes of
/// UTF-8, keeping only whole text elements from the start of the string.
/// </summary>
/// <param name="input">The string to truncate; may be null.</param>
/// <param name="maxBytes">The UTF-8 byte budget for the result.</param>
/// <returns>
/// An empty string when <paramref name="input"/> is null or empty or the budget is 0;
/// <paramref name="input"/> itself when it fits entirely; otherwise the fitting prefix.
/// </returns>
public static string OfMaxBytes(this string input, int maxBytes)
{
    if (string.IsNullOrEmpty(input) || maxBytes == 0)
    {
        return string.Empty;
    }

    var utf8 = Encoding.UTF8;
    var fitsWhole = utf8.GetByteCount(input) <= maxBytes;
    if (fitsWhole)
    {
        // Nothing to trim: hand back the original string.
        return input;
    }

    var prefix = new StringBuilder();
    var consumed = 0;
    var cursor = StringInfo.GetTextElementEnumerator(input);
    var fits = true;

    // The loop ends as soon as one text element overflows the budget.
    while (fits && cursor.MoveNext())
    {
        var piece = cursor.GetTextElement();
        consumed += utf8.GetByteCount(piece);
        fits = consumed <= maxBytes;
        if (fits)
        {
            prefix.Append(piece);
        }
    }

    return prefix.ToString();
}
Which gives this output:
ā̈bç (9 bytes in UTF-8) 0 1 2 3 4 5 ā̈ 6 ā̈b 7 ā̈b 8 ā̈b 9 ā̈bç
It depends what you mean by correct. Consider this program:
// Print the sample text with its UTF-8 size, then one line per byte budget
// showing what OfMaxBytes returns for that budget.
const string Input = "a\u0304\u0308bc\u0327";
var totalBytes = Encoding.UTF8.GetByteCount(Input);
Console.WriteLine("{0} ({1} bytes in UTF-8)", Input, totalBytes);
var budget = 0;
while (budget <= totalBytes)
{
    var truncated = Input.OfMaxBytes(budget);
    Console.WriteLine("{0} {1}", budget, truncated);
    budget++;
}
Here is what your solution gives:
ā̈bç (9 bytes in UTF-8) 0 1 a 2 ab 3 ā 4 āb 5 ā̈ 6 ā̈b 7 ā̈bc 8 ā̈bc 9 ā̈bç
Granted, you might not come across such an input very often, but I don't think that's the result you want.
Two other points:
- You are also building up a lot of intermediate strings. When you catch yourself doing that, see if you can use a `StringBuilder` instead.
- You are iterating through the entire string, regardless of the value of `maxBytes`.
Here is what I would suggest:
/// <summary>
/// Builds the longest prefix of <paramref name="input"/>, made of whole text elements,
/// whose UTF-8 size stays within <paramref name="maxBytes"/>.
/// </summary>
/// <param name="input">The string to truncate; may be null.</param>
/// <param name="maxBytes">The UTF-8 byte budget for the result.</param>
/// <returns>
/// An empty string when <paramref name="input"/> is null or empty or the budget is 0;
/// otherwise the text elements that fit, in their original order.
/// </returns>
public static string OfMaxBytes(this string input, int maxBytes)
{
    var nothingToDo = maxBytes == 0 || input == null || input.Length == 0;
    if (nothingToDo)
    {
        return string.Empty;
    }

    var utf8 = Encoding.UTF8;
    var buffer = new StringBuilder();
    var source = StringInfo.GetTextElementEnumerator(input);
    var runningTotal = 0;

    while (source.MoveNext())
    {
        var current = source.GetTextElement();
        runningTotal += utf8.GetByteCount(current);

        // An element is kept only if the running total is still within budget.
        if (runningTotal > maxBytes)
        {
            break;
        }

        buffer.Append(current);
    }

    return buffer.ToString();
}
Which gives this output:
ā̈bç (9 bytes in UTF-8) 0 1 2 3 4 5 ā̈ 6 ā̈b 7 ā̈b 8 ā̈b 9 ā̈bç
It depends what you mean by correct. Consider this program:
// Sample text: "a" with two combining marks, "b", then "c" with a combining cedilla.
const string Input = "a\u0304\u0308bc\u0327";
var utf8Length = Encoding.UTF8.GetByteCount(Input);
Console.WriteLine("{0} ({1} bytes in UTF-8)", Input, utf8Length);

// Show the truncation produced for every byte limit from 0 through the full length.
for (var limit = 0; limit <= utf8Length; ++limit)
{
    var shortened = Input.OfMaxBytes(limit);
    Console.WriteLine("{0} {1}", limit, shortened);
}
Here is what your solution gives:
ā̈bç (9 bytes in UTF-8) 0 1 a 2 ab 3 ā 4 āb 5 ā̈ 6 ā̈b 7 ā̈bc 8 ā̈bc 9 ā̈bç
Granted, you might not come across such an input very often, but I don't think that's the result you want.
Two other points:
- You are also building up a lot of intermediate strings. When you catch yourself doing that, see if you can use a `StringBuilder` instead.
- You are iterating through the entire string, regardless of the value of `maxBytes`.
Here is what I would suggest:
/// <summary>
/// Returns the leading text elements of <paramref name="input"/> whose combined UTF-8
/// size does not exceed <paramref name="maxBytes"/>. A text element is never split.
/// </summary>
/// <param name="input">The string to truncate; may be null.</param>
/// <param name="maxBytes">The UTF-8 byte budget for the result.</param>
/// <returns>
/// An empty string when <paramref name="input"/> is null or empty or the budget is 0;
/// otherwise the text elements that fit, in their original order.
/// </returns>
public static string OfMaxBytes(this string input, int maxBytes)
{
    // Guard first: a null input would otherwise reach GetTextElementEnumerator and throw.
    if (maxBytes == 0 || string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    var encoding = Encoding.UTF8;
    var sb = new StringBuilder();
    var bytes = 0;
    var enumerator = StringInfo.GetTextElementEnumerator(input);
    while (enumerator.MoveNext())
    {
        var textElement = enumerator.GetTextElement();
        bytes += encoding.GetByteCount(textElement);
        if (bytes > maxBytes)
        {
            // The first element that overflows the budget ends the prefix.
            break;
        }

        sb.Append(textElement);
    }

    return sb.ToString();
}
Which gives this output:
ā̈bç (9 bytes in UTF-8) 0 1 2 3 4 5 ā̈ 6 ā̈b 7 ā̈b 8 ā̈b 9 ā̈bç
It depends what you mean by correct. Consider this program:
// Print the sample text with its UTF-8 size, then one line per byte budget
// showing what OfMaxBytes returns for that budget.
const string Input = "a\u0304\u0308bc\u0327";
var totalBytes = Encoding.UTF8.GetByteCount(Input);
Console.WriteLine("{0} ({1} bytes in UTF-8)", Input, totalBytes);
var budget = 0;
while (budget <= totalBytes)
{
    var truncated = Input.OfMaxBytes(budget);
    Console.WriteLine("{0} {1}", budget, truncated);
    budget++;
}
Here is what your solution gives:
ā̈bç (9 bytes in UTF-8) 0 1 a 2 ab 3 ā 4 āb 5 ā̈ 6 ā̈b 7 ā̈bc 8 ā̈bc 9 ā̈bç
Granted, you might not come across such an input very often, but I don't think that's the result you want.
Two other points:
- You are building up a lot of intermediate strings. When you catch yourself doing that, see if you can use a `StringBuilder` instead.
- You are iterating through the entire string, regardless of the value of `maxBytes`.
Here is what I would suggest:
/// <summary>
/// Truncates <paramref name="input"/> on a text-element boundary so that the result
/// encodes to at most <paramref name="maxBytes"/> bytes of UTF-8.
/// </summary>
/// <param name="input">The string to truncate; may be null.</param>
/// <param name="maxBytes">The UTF-8 byte budget for the result.</param>
/// <returns>
/// An empty string when <paramref name="input"/> is null or empty or the budget is 0;
/// otherwise the text elements that fit, in their original order.
/// </returns>
public static string OfMaxBytes(this string input, int maxBytes)
{
    if (string.IsNullOrEmpty(input) || maxBytes == 0)
    {
        return string.Empty;
    }

    var utf8 = Encoding.UTF8;
    var result = new StringBuilder();
    var elements = StringInfo.GetTextElementEnumerator(input);
    for (var spent = 0; elements.MoveNext();)
    {
        var element = elements.GetTextElement();
        spent += utf8.GetByteCount(element);
        if (spent > maxBytes)
        {
            // This element does not fit, so the prefix is complete.
            break;
        }

        result.Append(element);
    }

    return result.ToString();
}
Which gives this output:
ā̈bç (9 bytes in UTF-8) 0 1 2 3 4 5 ā̈ 6 ā̈b 7 ā̈b 8 ā̈b 9 ā̈bç
- 16.3k
- 2
- 30
- 73