Please review the code and suggest a better algorithm to efficiently extract key/value pairs from files using multithreading.
Input : FormNGFAddTab:First add Tab control inside Tabs control.परीक्षण
Output :
Key : FormNGFAddTab
Value : First add Tab control inside Tabs control.परीक्षण
Separator : ':'
DataHolder/DataStructure : Any[Dictionary/Hash Table/Array/Linked List]
Solution 1:
public static ConcurrentDictionary<string, string> LoadActivityLookupParallelOptimized(string filePath)
{
    Stopwatch sw = Stopwatch.StartNew();
    var lineCollection = File.ReadAllLines(filePath);
    var newLookup = new System.Collections.Concurrent.ConcurrentDictionary<string, string>();
    int ParallelThreads = 100;
    Parallel.ForEach(lineCollection, new ParallelOptions() { MaxDegreeOfParallelism = ParallelThreads }, currentLine =>
    {
        if (String.IsNullOrEmpty(currentLine))
        {
            return;
        }
        // Ignore comment lines. StartsWith is safe here; the original
        // Substring(0, 2) throws on a one-character line.
        if (currentLine.StartsWith("//"))
        {
            return;
        }
        int iPos = currentLine.IndexOf(":", StringComparison.OrdinalIgnoreCase);
        int iiPos = currentLine.IndexOf("::", StringComparison.OrdinalIgnoreCase);
        string currentKey, currentValue;
        if (iPos > 0)
        {
            if (iiPos > 0)
            {
                // "Key::...": the key ends at the "::"; the value starts
                // after the next ":" that follows it.
                currentKey = currentLine.Substring(0, iiPos);
                if ((iPos = currentLine.IndexOf(":", iiPos + 2, StringComparison.OrdinalIgnoreCase)) > 0)
                {
                    // Substring(iPos + 1) is always valid here; when ':' is
                    // the last character it simply yields the empty string.
                    currentValue = currentLine.Substring(iPos + 1);
                }
                else
                {
                    return;
                }
            }
            else
            {
                // Plain "Key:Value" line.
                currentKey = currentLine.Substring(0, iPos);
                currentValue = currentLine.Substring(iPos + 1);
            }
            newLookup.TryAdd(currentKey.Trim(), currentValue.TrimEnd());
        }
    });
    sw.Stop();
    Console.WriteLine("Time taken: {0}ms", sw.Elapsed.TotalMilliseconds);
    return newLookup;
}
I am using Parallel.ForEach to process all the lines.
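As a side note, Parallel.ForEach also has a localInit/localFinally overload that lets each worker thread fill a private Dictionary and merge it into the shared map once at the end, which reduces contention on the ConcurrentDictionary. A minimal sketch of that pattern, with the parsing reduced to a single ':' lookup for brevity (the merged/local names are illustrative, not from the original code):

var merged = new ConcurrentDictionary<string, string>();
Parallel.ForEach(
    lineCollection,
    () => new Dictionary<string, string>(),            // one private map per worker thread
    (line, loopState, local) =>
    {
        int sep = line.IndexOf(':');
        if (sep > 0)
            local[line.Substring(0, sep).Trim()] = line.Substring(sep + 1).TrimEnd();
        return local;                                  // carried into the next iteration
    },
    local =>
    {
        // Merge each thread's private map into the shared dictionary once.
        foreach (var kv in local)
            merged.TryAdd(kv.Key, kv.Value);
    });

Whether this pays off depends on the key count per thread; for very small files the plain body is fine.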
Solution 2:
private static Dictionary<string, string> ProcessLine(string SourceFile)
{
    Stopwatch sw2 = Stopwatch.StartNew();
    Dictionary<string, string> keyValue = new Dictionary<string, string>();
    using (FileStream fs = new FileStream(SourceFile, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (StreamReader sr = new StreamReader(fs))
    {
        string sLine = string.Empty;
        string sKey = string.Empty, sValue = string.Empty;
        try
        {
            while ((sLine = sr.ReadLine()) != null)
            {
                int iPos, iiPos;
                sLine = sLine.Trim();
                if (String.IsNullOrEmpty(sLine))
                {
                    continue;
                }
                // Ignore comments. StartsWith avoids the exception that
                // Substring(0, 2) throws on a one-character line.
                if (sLine.StartsWith("//"))
                {
                    continue;
                }
                // Ignore the line if it does not contain ':'.
                if ((iPos = sLine.IndexOf(":", StringComparison.OrdinalIgnoreCase)) > 0)
                {
                    if ((iiPos = sLine.IndexOf("::", StringComparison.OrdinalIgnoreCase)) > 0)
                    {
                        // "Key::...": the key ends at "::"; the value starts
                        // after the next ':' that follows it.
                        sKey = sLine.Substring(0, iiPos);
                        if ((iPos = sLine.IndexOf(":", iiPos + 2, StringComparison.OrdinalIgnoreCase)) > 0)
                        {
                            // Substring(iPos + 1) yields "" when ':' is the last character.
                            sValue = sLine.Substring(iPos + 1);
                        }
                        else
                        {
                            continue;
                        }
                    }
                    else
                    {
                        // Plain "Key:Value" line.
                        sKey = sLine.Substring(0, iPos);
                        sValue = sLine.Substring(iPos + 1);
                    }
                    // Note: Add throws on a duplicate key, which the catch
                    // below silently swallows, ending the loop early.
                    keyValue.Add(sKey.Trim(), sValue.TrimEnd());
                }
            }
            sw2.Stop();
            Console.WriteLine("Time taken: {0}ms", sw2.Elapsed.TotalMilliseconds);
        }
        catch (Exception ex)
        {
            //Workflow.NET.Log logger = new Log();
            //logger.LogError(ex, "Could not add key (" + sKey + ") from folder (" + SourceFile + "), Error:" + ex.Message);
        }
    }
    return keyValue;
}
Main():
var filePath = @"M:\Dev3.0\Locales\hi-IN\NGF\NGFStandardMessages.txt";
var result = LoadActivityLookupParallelOptimized(filePath);
var result2 = ProcessLine(filePath);
Console.ReadLine();
Results:
Algo 1: 3.1717 ms
Algo 2: 0.6234 ms
Lines processed: 196
Please share your thoughts on writing better algorithms for file processing.
My thoughts:
- Using IndexOf is costly, but it is still the most efficient approach I have found.
Sample Key/Value in a line:
Line1 : FormNGFAddTab:First add Tab control inside Tabs control.परीक्षण
Key : FormNGFAddTab
Value : First add Tab control inside Tabs control.परीक्षण
Common prefix in keys : FormNGF
1 Answer
- You should use multithreading for reading if you are after performance, so the solution which uses Parallel.ForEach() is definitely better.
- For really big files, you should not read all the text into memory. Instead, you should create multiple file streams, split your file into reading sections, and read those asynchronously; a sketch of this idea follows below. For smaller files (200 lines is really small) your approach is fine.
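To make the "split the file into reading sections" idea concrete, here is a minimal sketch. It assumes a large UTF-8 file (no BOM handling) whose size dwarfs the chunk count; the names ChunkedReader and ReadLinesInChunks are illustrative, not from the original post:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Threading.Tasks;

public static class ChunkedReader
{
    // Reads a large text file as independent byte ranges, one stream per range,
    // aligning each boundary to the next '\n' so no line is split across chunks.
    // Scanning raw bytes for '\n' is safe in UTF-8, since 0x0A never occurs
    // inside a multi-byte sequence.
    public static List<string>[] ReadLinesInChunks(string path, int chunkCount)
    {
        long length = new FileInfo(path).Length;
        var bounds = new long[chunkCount + 1];
        bounds[chunkCount] = length;

        // Align the interior boundaries to line breaks with one cheap scan each.
        using (var fs = File.OpenRead(path))
        {
            for (int i = 1; i < chunkCount; i++)
            {
                fs.Position = i * (length / chunkCount);
                int b;
                while ((b = fs.ReadByte()) != -1 && b != '\n') { }
                bounds[i] = fs.Position;               // first byte after the newline
            }
        }

        var results = new List<string>[chunkCount];
        Parallel.For(0, chunkCount, i =>
        {
            var buffer = new byte[Math.Max(0, bounds[i + 1] - bounds[i])];
            int read = 0;
            using (var fs = File.OpenRead(path))       // each chunk gets its own stream
            {
                fs.Position = bounds[i];
                while (read < buffer.Length)
                {
                    int n = fs.Read(buffer, read, buffer.Length - read);
                    if (n == 0) break;
                    read += n;
                }
            }
            string text = Encoding.UTF8.GetString(buffer, 0, read);
            results[i] = new List<string>(
                text.Split(new[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries));
        });
        return results;
    }
}

Whether this beats a plain sequential read depends heavily on the storage; on a spinning disk the competing seeks can make it slower, so measure before adopting it.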
Your algorithm can be simplified if you'd use the Split method. For example:

//remove comments (FirstOrDefault requires System.Linq)
var lineWithoutComments = line.Split(new[] { "//" }, StringSplitOptions.None).FirstOrDefault();
if (string.IsNullOrWhiteSpace(lineWithoutComments))
    continue;

//split key and value
var split = lineWithoutComments.Split(new[] { ":" }, StringSplitOptions.None);
if (split.Length < 2)
    continue;

var key = split[0].Trim();
var value = split[1].Trim();
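One caveat worth adding to the snippet above: split[1] stops at the second colon, so a value that itself contains ':' would be truncated. The count-limited Split overload keeps the rest of the line intact:

//at most 2 parts: everything after the first ':' stays in split[1]
var split = lineWithoutComments.Split(new[] { ':' }, 2);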
Comments:
- Thank you for the comments. I am concerned about performance, as the file size is close to 10 MB, generating close to 100,000 keys. I tried the string.Split operation, but it is costly when it comes to performance. I am also trying to avoid the IndexOf method, which is costly. I am searching for a better algorithm to extract the key/value pairs. – user3089806, Dec 12, 2013 at 9:07
- Is it possible to write the algo in C++ and call it from C# to get the best performance? – user3089806, Dec 12, 2013 at 9:15
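Regarding the first comment (a ~10 MB file with roughly 100,000 keys, avoiding Split's allocations): below is a minimal sketch of a single-pass streaming parse. It assumes File.ReadLines is available (.NET 4.0+), the capacity hint is illustrative, and the "::" special case from the original code is omitted for brevity. As the posted timings already hint, at this size a tuned sequential pass can beat the parallel version, so it is worth benchmarking before reaching for more threads, let alone a C++ rewrite, whose per-string interop cost would likely cancel any gains.

using System;
using System.Collections.Generic;
using System.IO;

static Dictionary<string, string> LoadLookupStreaming(string path)
{
    // Pre-sizing avoids repeated rehashing while inserting ~100,000 keys.
    var lookup = new Dictionary<string, string>(100000);

    // File.ReadLines streams lazily, so the whole file is never held as one array.
    foreach (var raw in File.ReadLines(path))
    {
        var line = raw.Trim();
        if (line.Length == 0 || line.StartsWith("//"))
            continue;

        // One IndexOf per line: a single scan, no intermediate string[].
        int sep = line.IndexOf(':');
        if (sep <= 0)
            continue;

        // Substring(sep + 1) yields "" when ':' is the last character.
        lookup[line.Substring(0, sep).Trim()] = line.Substring(sep + 1).TrimEnd();
    }
    return lookup;
}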