This is the successor of my previous CSV reader. I implemented quite some useful suggestions.
I will give explanation on a few parts:
Prefixing constant identifiers with
Sym_
(abbreviation for symbol) is a practice I learned from writing parsers for more complex formats. Such formats contain alot more symbols than just"
, therefore I add a prefix to make other kinds of constants distict. I understand that it looks odd in a context where there is only one fixed symbol, but it makes sense to me. Also in any format, fixed symbols do never change, doing so would transform the format, so they are justified constants.I am aware that the method
GetCurrentRow
could have been a property calledCurrentRow
. I decided not to do this because of the small amount of public members.
The code may look too spacious, that's because of the great amount of spacing between lines. I advice pasting the code in Visual Studio if it bothers you.
public class CsvReader : IDisposable
{
private const char Sym_Escape = '"';
private static int standardRowSize = 16;
private StreamReader reader;
private char delimiter;
private char[] buffer;
private int bufferSize;
private int bufferBound;
private int bufferPos;
private bool endReached;
private bool boundReached;
private bool newlineReached;
private bool returnImplicitRow;
private bool finishRow;
private string[] row;
private int rowSize;
private int rowPos;
private int valuePos;
private StringBuilder valueBuilder;
public CsvReader(Stream stream, char delimiter = ',', int bufferSize = 4096)
{
#region check
if (stream == null)
{
throw new ArgumentNullException("stream");
}
if (delimiter == Sym_Escape || delimiter == '\r')
{
throw new ArgumentException("Invalid delimiter", "delimiter");
}
if (bufferSize < 128)
{
throw new ArgumentException("Invalid buffer size", "bufferSize");
}
#endregion
this.reader = new StreamReader(stream, Encoding.UTF8, true, bufferSize);
this.delimiter = delimiter;
this.buffer = new char[bufferSize];
this.bufferSize = bufferSize;
this.row = new string[standardRowSize];
this.rowSize = standardRowSize;
this.rowPos = -1;
this.valueBuilder = new StringBuilder(128);
if (stream.Length == 0)
{
returnImplicitRow = true;
}
}
public void Dispose()
{
reader.Dispose();
reader = null;
buffer = null;
valueBuilder = null;
}
public string[] GetCurrentRow()
{
if (rowPos == -1)
{
throw new InvalidOperationException("No row is available at this time");
}
string[] sx = new string[rowPos];
Array.Copy(row, 0, sx, 0, rowPos);
return sx;
}
public bool Read()
{
if (reader == null)
{
throw new ObjectDisposedException(null);
}
rowPos = 0;
while (ReadValue())
{
SetValue(valueBuilder.ToString());
valueBuilder.Length = 0;
if (newlineReached)
{
newlineReached = false;
break;
}
}
if (rowPos == 0)
{
if (returnImplicitRow)
{
SetValue(String.Empty);
returnImplicitRow = false;
}
else
{
rowPos = -1;
return false;
}
}
else if (finishRow)
{
SetValue(String.Empty);
finishRow = false;
}
return true;
}
private void SetValue(string value)
{
if (rowPos == rowSize)
{
rowSize *= 2;
Array.Resize(ref row, rowSize);
}
row[rowPos++] = value;
}
private bool ReadValue()
{
CheckBuffer();
if (endReached)
{
return false;
}
finishRow = false;
char chr = buffer[bufferPos++];
if (chr == Sym_Escape)
{
ReadEnclosedValue();
if (endReached)
{
return true;
}
chr = buffer[bufferPos++];
}
while (true)
{
CheckBuffer();
if (chr == delimiter)
{
if (!boundReached)
{
valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
valuePos = bufferPos;
}
finishRow = true;
break;
}
else if (chr == '\r' && !endReached && buffer[bufferPos] == '\n')
{
if (!boundReached)
{
valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
}
bufferPos++;
valuePos = bufferPos;
CheckBuffer();
if (endReached)
{
returnImplicitRow = true;
}
newlineReached = true;
break;
}
else if (boundReached)
{
valueBuilder.Append(chr);
}
if (endReached)
{
break;
}
chr = buffer[bufferPos++];
}
return true;
}
private void ReadEnclosedValue()
{
CheckBuffer();
if (endReached)
{
return;
}
valuePos = bufferPos;
char chr = buffer[bufferPos++];
while (true)
{
CheckBuffer();
if (chr == Sym_Escape)
{
if (endReached)
{
return;
}
chr = buffer[bufferPos];
if (chr == Sym_Escape)
{
if (boundReached)
{
valueBuilder.Append(Sym_Escape);
}
else
{
valueBuilder.Append(buffer, valuePos, bufferPos - valuePos);
}
bufferPos++;
valuePos = bufferPos;
CheckBuffer();
}
else
{
if (!boundReached)
{
valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
valuePos = bufferPos;
}
return;
}
}
else if (boundReached)
{
valueBuilder.Append(chr);
}
if (endReached)
{
return;
}
chr = buffer[bufferPos++];
}
}
private void CheckBuffer()
{
if (bufferPos == bufferBound)
{
if (valuePos < bufferPos)
{
valueBuilder.Append(buffer, valuePos, (bufferPos - valuePos) - 1);
}
if (reader.EndOfStream)
{
endReached = true;
}
else
{
bufferBound = reader.Read(buffer, 0, bufferSize);
bufferPos = 0;
valuePos = 0;
}
boundReached = true;
}
else
{
boundReached = false;
}
}
}
3 Answers 3
Constructor
- Use a constant to define the minimum buffer size
- Use descriptive exception messages; the more descriptive the message is, the easier it is for a programmer to determine the root cause of problems should they appear.
- Because the minimum buffer size and escape character are part of the contract, I would make them public constants as supposed to private.
- I believe the initial capacity of the
StringBuilder
should be set to the buffer size?
Before
private const char Sym_Escape = '"';
if (delimiter == Sym_Escape || delimiter == '\r')
throw new ArgumentException("Invalid delimiter", "delimiter");
if (bufferSize < 128)
throw new ArgumentException("Invalid buffer size", "bufferSize");
this.valueBuilder = new StringBuilder(128);
After
public const char Sym_Escape = '"';
public const int MinBufferSize = 128;
if (delimiter == Sym_Escape || delimiter == '\r')
throw new ArgumentException("Delimiter must not be an escape or return character.", "delimiter");
if (bufferSize < MinBufferSize)
throw new ArgumentException(String.Format("Buffer size must not be less than {0}.", MinBufferSize), "bufferSize");
this.valueBuilder = new StringBuilder(bufferSize);
Dispose
Firstly, check out the awesome answer to this question regarding the proper implementation of IDisposable
. Secondly, check out this question regarding setting variables to null
after you're done with them. Thirdly, see the final result :)
Before
public void Dispose()
{
reader.Dispose();
reader = null;
buffer = null;
valueBuilder = null;
}
After
~CsvReader()
{
Dispose(false);
}
protected void Dispose(bool disposing)
{
if (disposing)
reader.Dispose();
}
public void Dispose()
{
Dispose(true);
GC.SuppressFinalize(this);
}
-
\$\begingroup\$
bufferSize
specifies the initial size for the buffer which replaces the internal buffer ofStreamReader
. This way I can avoid callingStreamReader.Read
too many times. Is there any particular reason why the "internal" Dispose method is protected instead of private? Very useful information btw. \$\endgroup\$Leopold Asperger– Leopold Asperger2014年07月03日 20:22:55 +00:00Commented Jul 3, 2014 at 20:22 -
3\$\begingroup\$ What is the purpose of calling
Disposing(false)
in the finalizer? The "proper" implementation ofIDisposable
is only required if you're dealing with unmanaged resources, which OP isn't. \$\endgroup\$Matthew– Matthew2014年07月03日 20:39:00 +00:00Commented Jul 3, 2014 at 20:39 -
1\$\begingroup\$ The current implementation of
Dispose
is perfect (well, after correcting Chris's point about aNullReferenceException
on the second pass). The link you have is completely unrelated to implementation ofDispose
. In fact, the linked answer assumes that disposing objects is sufficient; that's only true if those objects don't hold onto anything past theDispose
call. \$\endgroup\$Ben Voigt– Ben Voigt2014年07月03日 21:02:23 +00:00Commented Jul 3, 2014 at 21:02 -
\$\begingroup\$ @LeopoldAsperger It's there mainly for the benefit of the
Dispose
call in the finalizer (and also inheritors). The GC runs on a background thread so when you callDispose
in the finalizer, the program may crash when your try to release managed resources as they may have already been collected by the GC. As others have pointed out though, if you only have managed resources, I guess you don't need to go this far. \$\endgroup\$George Howarth– George Howarth2014年07月04日 07:25:02 +00:00Commented Jul 4, 2014 at 7:25
A few minors for now:
When throwing an exception because the argument violated a constraint then you should consider mentioning that in the message. Simply "Invalid foobar" is quite useless to anyone using it. A message like "Buffer size must be 128 or larger" would be infinitely more helpful to the programmer using this class.
Dispose
should be able to be called more than once without any side effects. In your case it would throw aNullReferenceException
because you setreader
to null. Anull
check should suffice. Adding adisposed
flag is another option.In the times of Linq as a user of your class I'd appreciate it if it would implement
IEnumerable<string[]>
to iterate over the rows.I'm not 100% of the full purpose of
rowPos
. It looks like it could simply be expressed as boolean flag to indicate whether or not there a row is currently available.I think the design of the private methods needs to be cleaned up. They have some inter-dependencies for the various state variable (like buffer and row positions) which are not easy to follow.
-
\$\begingroup\$ Good points, especially the second. \$\endgroup\$Leopold Asperger– Leopold Asperger2014年07月03日 19:48:07 +00:00Commented Jul 3, 2014 at 19:48
Your implementation looks really complicated and (more importantly) error-prone. To be honest: i stared at your code for ten minutes and i still failed to follow the workflow. And it should not be that way. I mean, parsing a csv file in simple case is a three liner:
using(var sr = new StreamReader(stream, encoding))
{
while(!sr.EndOfStream)
{
strings.Add(sr.ReadLine().Split(delimiters));
}
}
And thats it. Make it ten lines, if you want a custom "end of line" character.
You do not need to implement a buffer. It is already implemented on both Stream
level (see BufferedStream
) and on StreamReader level (see constructor which takes buffer size).
You do not need to implement a dynamic array (your SetValue
method) it is already implemented and it is called List<T>
.
To me it looks like you either do not know of the tools already available to you (in which case you should do some research) or you are trying to "optimize" things which do not have to be optimized (which is a bad attitude towards coding).
P.S. if you want your class to be reusable by others, you should follow an existing API for various stream readers: expose BaseStream
property, expose EndOfStream
property, overload the constructor to accept file path instead of a stream and a flag indicating whether th stream should be disposed along with your reader, etc. If this class is for "personal use", then any API you are comfortable with is fine, i guess.
-
\$\begingroup\$ @NikitaBrizhak - Thanks for you time, but I can't use anything from your answer. 1: Parsing a string with
String.Split
will ignore enclosed values. It seems like you tried to make a point in an extravagant manner with just 7 lines of code, but thats not how real-world implementations work. 2: I'm familiar with the .NET framework, which means that I intended any optimizations. 3: I suggest that you try to point out what can be improved, not what you think is easier for you. I know its tempting, but it doesn't help. \$\endgroup\$Leopold Asperger– Leopold Asperger2014年07月04日 10:06:58 +00:00Commented Jul 4, 2014 at 10:06