I've recently been assigned to write up a CSV writer with "as much flexibility" as possible concerning pretty much all (output stream, string fields with potential new line characters, separator collisions, etc. you name it...)
No specification, no design guidelines, no nothing and I'm not expecting any any time soon (let's not dwell on the situation I'm currently in...). So, I've quickly built a test implementation which is as follows:
public sealed class CsvWriter<T> : MarshalByRefObject, IDisposable
{
private const int MIN_BUFFER = 8;
private const char ESCAPE_CHAR = '\\';
private const char ESCAPED_SEPARATOR_CHAR = 's';
private Stream output;
private readonly StringBuilder buffer;
private bool isDisposed;
private readonly IEnumerable<Func<T, string>> projections;
private readonly List<char> ignoredCharacters;
private readonly char listSeparator;
private readonly bool escapeListSeparator;
private readonly Encoding encoding;
private readonly bool escapeNewLineCharacters;
private readonly int bufferLength = 512;
public CsvWriter(Stream output, IEnumerable<Func<T, string>> projections, Encoding encoding = null, int bufferLength = 512, bool escapeListSeparatorInsideFields = true,
bool escapeNewLineCharacters = true, IEnumerable<char> ignoredCharacters = null)
: this(output, projections, CultureInfo.CurrentCulture.TextInfo.ListSeparator[0], encoding, bufferLength, escapeListSeparatorInsideFields, escapeNewLineCharacters, ignoredCharacters)
{
}
public CsvWriter(Stream output, IEnumerable<Func<T, string>> projections, char listSeparator, Encoding encoding = null, int bufferLength = 512, bool escapeListSeparatorInsideFields = true,
bool escapeNewLineCharacters = true, IEnumerable<char> ignoredCharacters = null)
{
ArgumentValidator.ArgumentNotNull(output, "output");
ArgumentValidator.ArgumentNotNull(projections, "projections");
ArgumentValidator.ArgumentNotValid(output, b => output.CanWrite, "output", "Can not write in specified stream.");
this.bufferLength = bufferLength < MIN_BUFFER ? MIN_BUFFER : bufferLength;
this.output = output;
this.projections = projections;
this.listSeparator = listSeparator;
this.encoding = encoding ?? Encoding.Default;
this.ignoredCharacters = ignoredCharacters == null ? new List<char>() : ignoredCharacters.ToList();
this.buffer = new StringBuilder();
this.escapeListSeparator = escapeListSeparatorInsideFields;
this.escapeNewLineCharacters = escapeNewLineCharacters;
}
public bool EscapeListSeparatorInsideField
{
get
{
return this.escapeListSeparator;
}
}
public bool EscapeNewLineCharactersInsideField
{
get
{
return this.escapeNewLineCharacters;
}
}
public IEnumerable<char> IgnoredCharacters
{
get
{
return this.ignoredCharacters;
}
}
public char ListSeparatorCharacter
{
get
{
return this.listSeparator;
}
}
public void WriteLine(T value)
{
ArgumentValidator.ArgumentNotNull(value, "value");
if (this.isDisposed)
throw new ObjectDisposedException(this.GetType().Name);
Debug.Assert(this.buffer != null);
Debug.Assert(this.projections != null);
Debug.Assert(this.ignoredCharacters != null);
using (var projectionEnumerator = projections.GetEnumerator())
{
if (projectionEnumerator.MoveNext())
{
while (true)
{
var projection = projectionEnumerator.Current;
if (projection != null)
{
var field = projection(value);
if (!string.IsNullOrEmpty(field))
{
bool escapeCharacters = this.escapeNewLineCharacters || this.escapeListSeparator;
if (escapeCharacters || this.ignoredCharacters.Count > 0)
{
foreach (var c in field)
{
if (escapeCharacters && c == ESCAPE_CHAR)
{
this.buffer.Append(ESCAPE_CHAR);
this.buffer.Append(c);
}
else if (this.escapeListSeparator && c == this.listSeparator)
{
this.buffer.Append(ESCAPE_CHAR);
this.buffer.Append(ESCAPED_SEPARATOR_CHAR);
}
else if (this.ignoredCharacters.Contains(c))
{
continue;
}
else if (this.escapeNewLineCharacters && c == '\n')
{
this.buffer.Append(ESCAPE_CHAR);
this.buffer.Append('n');
}
else if (this.escapeNewLineCharacters && c == '\r')
{
this.buffer.Append(ESCAPE_CHAR);
this.buffer.Append('r');
}
else
{
this.buffer.Append(c);
}
}
}
else
{
this.buffer.Append(field);
}
}
}
if (projectionEnumerator.MoveNext())
{
if (projection != null)
{
this.buffer.Append(this.listSeparator);
}
}
else
{
break;
}
}
}
}
buffer.AppendLine();
checkBuffer();
}
public void WriteLines(IEnumerable<T> values)
{
ArgumentValidator.ArgumentNotNull(values, "values");
foreach (var value in values)
{
WriteLine(value);
}
}
public void Dispose()
{
if (this.isDisposed)
return;
if (this.output != null)
{
if (this.buffer != null && this.buffer.Length > 0)
{
flushBuffer();
}
this.output = null;
}
GC.SuppressFinalize(this);
this.isDisposed = true;
}
private void checkBuffer()
{
Debug.Assert(this.output != null);
Debug.Assert(!this.isDisposed);
Debug.Assert(this.buffer != null);
if (this.buffer.Length >= bufferLength)
{
flushBuffer();
}
}
private void flushBuffer()
{
Debug.Assert(this.output != null);
Debug.Assert(!this.isDisposed);
Debug.Assert(this.buffer != null);
this.output.Write(this.encoding.GetBytes(this.buffer.ToString()), 0, this.buffer.Length);
this.buffer.Clear();
}
}
And because I think it will be useful to use this class with anonymous types (from Linq
queries for example), I've implemented a static class that takes advantage of the compiler's type inference:
public static class CsvWriter
{
public static IList<Func<T, string>> CreateEmptyProjectionList<T>(T value)
{
return new List<Func<T, string>>();
}
public static CsvWriter<T> CreateWriter<T>(this T value, Stream output, IEnumerable<Func<T, string>> projections, Encoding encoding = null, int bufferLength = 512, bool escapeListSeparatorInsideFields = true,
bool escapeNewLineCharacters = true, IEnumerable<char> ignoredCharacters = null)
{
return new CsvWriter<T>(output, projections, CultureInfo.CurrentCulture.TextInfo.ListSeparator[0], encoding, bufferLength, escapeListSeparatorInsideFields, escapeNewLineCharacters, ignoredCharacters);
}
public static CsvWriter<T> CreateWriter<T>(this T value, Stream output, IEnumerable<Func<T, string>> projections, char listSeparator, Encoding encoding = null, int bufferLength = 512, bool escapeListSeparatorInsideFields = true,
bool escapeNewLineCharacters = true, IEnumerable<char> ignoredCharacters = null)
{
return new CsvWriter<T>(output, projections, listSeparator, encoding, bufferLength, escapeListSeparatorInsideFields, escapeNewLineCharacters, ignoredCharacters);
}
}
I'm not really set on these methods being extension methods. I most likely will decide to implement them as regular static methods.
I'd really appreciate any peer review pointing out design flaws that can become important when parsing the generates .csv files (I'm expecting I'll be implementing a CsvReader
too) which can correctly handle and take advantage of the writer's features.
Am I going in a completely wrong direction here or this a reasonable implementation?
1 Answer 1
- You should use auto-properties instead of
public IEnumerable<char> IgnoredCharacters { get { return this.ignoredCharacters; } }
It will improve the readability of your code.
The recommended naming convention for constants in
.Net
is Pascal case. So it should beMinBuffer
, for example. Same goes for methods (CheckBuffer()
).Instead of manually moving your iterator, you should use a
foreach
loop.Your
WriteLine
method is too complex. You should probably break it down to smaller parts, so it's easier to understand. I would also encapsulate parsing logic into different class.Your constructor is too complex as well, you should encapsulate those parameters into some class (
CswWriterSettings
?).BufferedStream
andStreamWriter
classes (evenStream
class to some degree) cache data internally. I suggest you use either of those classes instead of building and maintaining your own cache. Edit: Actually,FileStream
already has an internal buffer, so there is no point in usingBufferedStream
either. It will only result in double buffering.I did not quite understand what
projection
is, so I'm not sure if it's really needed, but in my experience - passing a delegate to a constructor (let alone a collection of delegates) is almost always a bad idea. :)
-
\$\begingroup\$ Thanks for the inputs. Concerning #2: I have always named my private methods that way, didn't know it wasn't recommended in NET. #3: I'm doing it that way to avoid adding an unecessary separator at the end of each line but its true that the result is more clunky than simply removing the last character in
buffer
after each iteration. #5. Absolutely correct. #6. You are probably right but my buffer logic is very simple, not sure ifBufferedStream
is an overkill here. \$\endgroup\$InBetween– InBetween2014年11月14日 13:01:58 +00:00Commented Nov 14, 2014 at 13:01 -
\$\begingroup\$ @InBetween, you can learn more about naming conventions on msdn: msdn.microsoft.com/en-us/library/ms229002%28v=vs.110%29.aspx . As for buffered strem - I don think its an overkill. It was built for that purpose alone, it does the same thing you do, and it's bulletproof when it comes to debugging. :) \$\endgroup\$Nikita B– Nikita B2014年11月14日 13:13:10 +00:00Commented Nov 14, 2014 at 13:13
-
\$\begingroup\$ @InBetween, never mind my last point. See the edit to #6. \$\endgroup\$Nikita B– Nikita B2014年11月14日 13:23:08 +00:00Commented Nov 14, 2014 at 13:23
-
\$\begingroup\$ @InBetween Just in case it's useful, you can remove the last char of your StringBuilder with
--buffer.Length;
. \$\endgroup\$RobH– RobH2014年11月14日 14:16:46 +00:00Commented Nov 14, 2014 at 14:16 -
1\$\begingroup\$ I disagree with #1. Because the backing field is
readonly
, this is the way it must be exposed as a property. An auto-property (even with aprivate
setter) would make it non-readonly
. Note, C# version 6 will allow for read-only auto properties. \$\endgroup\$Jesse C. Slicer– Jesse C. Slicer2014年11月14日 15:53:51 +00:00Commented Nov 14, 2014 at 15:53
3,"a","...""...",6
),\r\n
as record terminator,\n
for multiline text. Could make a test case for that. \$\endgroup\$