I would like to optimize this code for efficiency. It computes a one-way ANOVA table, including the p-value, from a text file of comma-delimited data.
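For example, an input file might look like this (hypothetical values for illustration only; each comma-delimited line holds the observations for one treatment group):
3.1, 2.8, 3.5, 3.0
2.2, 2.6, 2.1
4.0, 3.7, 3.9, 4.2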
This is the main form, which calls the methods in the library file:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;

namespace OneWayAnovaTable
{
    public partial class OneWayAnovaTable : Form
    {
        public OneWayAnovaTable()
        {
            InitializeComponent();
        }

        static string TSS, ESS, TotSS, TDF, EDF, TotDF, TMS, EMS, F, p;

        private void ReadFile()
        {
            // Each line of the file is one treatment group.
            List<List<double>> numbers = new List<List<double>>();
            foreach (string line in File.ReadAllLines(@"data.txt"))
            {
                var list = new List<double>();
                foreach (string s in line.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries))
                {
                    double i;
                    if (Double.TryParse(s, out i))
                    {
                        list.Add(i);
                    }
                }
                numbers.Add(list);
            }

            double[] rowTotal = new double[numbers.Count];
            double[] squareRowTotal = new double[numbers.Count];
            double[] rowMean = new double[numbers.Count];
            int totalElements = 0;
            int[] totalInRow = new int[numbers.Count];
            double grandTotalMean = 0; // accumulated below but never used afterwards
            double grandMean = 0;      // accumulated below but never used afterwards
            double grandTotal = 0;

            // Per-row totals, sums of squared values, and counts.
            for (int row = 0; row < numbers.Count; row++)
            {
                var values = numbers[row].ToArray();
                rowTotal[row] = values.Sum();
                squareRowTotal[row] = values.Select(v => v * v).Sum();
                rowMean[row] = rowTotal[row] / values.Length;
                totalInRow[row] = values.Length;
                totalElements += totalInRow[row];
                grandTotalMean += rowMean[row];
                grandMean += rowMean[row] / numbers.Count;
            }

            for (int j = 0; j < rowTotal.Length; j++)
            {
                grandTotal += rowTotal[j];
            }

            double sumOfSquares = OneWayAnovaClassLibrary.OneWayAnova.totalSumOfSquares(squareRowTotal, grandTotal, totalElements);
            double treatmentSumOfSquares = OneWayAnovaClassLibrary.OneWayAnova.treatmentSumOfSquares(rowTotal, totalInRow, grandTotal, totalElements);
            double errorSumOfSquares = OneWayAnovaClassLibrary.OneWayAnova.errorSumOfSquares(sumOfSquares, treatmentSumOfSquares);
            double meanTreatmentSumOfSquares = OneWayAnovaClassLibrary.OneWayAnova.meanTreatmentSumOfSquares(treatmentSumOfSquares, totalInRow);
            double meanErrorSumOfSquares = OneWayAnovaClassLibrary.OneWayAnova.meanErrorSumOfSquares(errorSumOfSquares, numbers.Count - 1, totalElements - 1);
            double fStatistic = OneWayAnovaClassLibrary.OneWayAnova.testStatistic(meanTreatmentSumOfSquares, meanErrorSumOfSquares);
            // Denominator degrees of freedom are N - k, i.e. totalElements - numbers.Count,
            // matching the EDF value displayed below.
            double pValue = OneWayAnovaClassLibrary.OneWayAnova.pValue(fStatistic, numbers.Count - 1, totalElements - numbers.Count);

            TSS = treatmentSumOfSquares.ToString();
            ESS = errorSumOfSquares.ToString();
            TotSS = sumOfSquares.ToString();
            TDF = (numbers.Count - 1).ToString();
            EDF = (totalElements - numbers.Count).ToString();
            TotDF = (totalElements - 1).ToString();
            TMS = meanTreatmentSumOfSquares.ToString();
            EMS = meanErrorSumOfSquares.ToString();
            F = fStatistic.ToString();
            p = pValue.ToString();
        }

        private void button2_Click(object sender, EventArgs e)
        {
            ReadFile();
            display();
        }

        private void display()
        {
            textBoxTSS.Text = TSS;
            textBoxESS.Text = ESS;
            textBoxTotSS.Text = TotSS;
            textBoxTDF.Text = TDF;
            textBoxEDF.Text = EDF;
            textBoxTotDF.Text = TotDF;
            textBoxTMS.Text = TMS;
            textBoxEMS.Text = EMS;
            textBoxF.Text = F;
            textBoxp.Text = p;
        }
    }
}
The library file is here:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace OneWayAnovaClassLibrary
{
    public class OneWayAnova
    {
        // Total sum of squares: all squared observations minus the
        // correction term G^2 / N.
        public static double totalSumOfSquares(double[] squareRowTotal, double grandTotal, int totalOfAllElements)
        {
            double sumOfSquares = 0;
            for (int i = 0; i < squareRowTotal.Length; i++)
            {
                sumOfSquares += squareRowTotal[i];
            }
            return sumOfSquares - (grandTotal * grandTotal / totalOfAllElements);
        }

        // Treatment (between-groups) sum of squares.
        public static double treatmentSumOfSquares(double[] rowTotal, int[] totalInRow, double grandTotal, int totalOfAllElements)
        {
            double treatmentSumOfSquares = 0;
            for (int i = 0; i < totalInRow.Length; i++)
            {
                treatmentSumOfSquares += rowTotal[i] * rowTotal[i] / totalInRow[i];
            }
            return treatmentSumOfSquares - (grandTotal * grandTotal / totalOfAllElements);
        }

        // Error (within-groups) sum of squares: SSE = SSTotal - SSTreatment.
        public static double errorSumOfSquares(double sumOfSquares, double treatmentSumOfSquares)
        {
            return sumOfSquares - treatmentSumOfSquares;
        }

        // Treatment mean square: SSTreatment over its k - 1 degrees of freedom.
        public static double meanTreatmentSumOfSquares(double treatmentSumOfSquares, int[] totalInRow)
        {
            return treatmentSumOfSquares / (totalInRow.Length - 1);
        }

        // Error mean square. Called with a = k - 1 and b = N - 1,
        // so b - a = N - k, the error degrees of freedom.
        public static double meanErrorSumOfSquares(double errorSumOfSquares, int a, int b)
        {
            return errorSumOfSquares / (double)(b - a);
        }

        // F statistic: ratio of treatment mean square to error mean square.
        public static double testStatistic(double meanTreatmentSumOfSquares, double meanErrorSumOfSquares)
        {
            return meanTreatmentSumOfSquares / meanErrorSumOfSquares;
        }

        // The p-value of the F-test is the upper-tail probability P(F > f).
        // Integrating the density from 0 to f gives the CDF, so subtract it from 1.
        public static double pValue(double fStatistic, int degreeNum, int degreeDenom)
        {
            return 1.0 - integrate(0, fStatistic, degreeNum, degreeDenom);
        }

        // Numerical integration by the composite Simpson's rule:
        // (h / 6) * (f(a) + f(b) + 2 * (interior points) + 4 * (midpoints)).
        public static double integrate(double start, double end, int degreeFreedomT, int degreeFreedomE)
        {
            int iterations = 100000;
            double x, dist, sum = 0, sumT = 0;
            dist = (end - start) / iterations;
            for (int i = 1; i <= iterations; i++)
            {
                x = start + i * dist;
                sumT += integralFunction(x - dist / 2, degreeFreedomT, degreeFreedomE);
                if (i < iterations)
                {
                    sum += integralFunction(x, degreeFreedomT, degreeFreedomE);
                }
            }
            sum = (dist / 6) * (integralFunction(start, degreeFreedomT, degreeFreedomE) + integralFunction(end, degreeFreedomT, degreeFreedomE) + 2 * sum + 4 * sumT);
            return sum;
        }

        // Density of the F distribution with degreeFreedomT numerator and
        // degreeFreedomE denominator degrees of freedom. The factorial-based
        // gamma values and the integer divisions by 2 are only exact when
        // both degrees of freedom are even.
        public static double integralFunction(double x, int degreeFreedomT, int degreeFreedomE)
        {
            return ((Math.Pow(degreeFreedomT, degreeFreedomT / 2) * Math.Pow(degreeFreedomE, degreeFreedomE / 2))
                / (factorial(degreeFreedomT / 2 - 1) * factorial(degreeFreedomE / 2 - 1)))
                * factorial((degreeFreedomT + degreeFreedomE) / 2 - 1)
                * Math.Pow(x, degreeFreedomT / 2 - 1)
                / Math.Pow(degreeFreedomE + degreeFreedomT * x, (degreeFreedomT + degreeFreedomE) / 2);
        }

        // n! computed recursively; only valid for non-negative integer n,
        // which holds here whenever the degrees of freedom are even.
        public static double factorial(double n)
        {
            if (n == 0)
            {
                return 1.0;
            }
            return n * factorial(n - 1);
        }
    }
}
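For reference, the shortcut formulas the library implements are the standard one-way ANOVA identities (with \$k\$ rows, row totals \$T_i\$, row sizes \$n_i\$, grand total \$G\$, and \$N\$ observations in all):
$$SS_\text{Total} = \sum_{i=1}^{k}\sum_{j=1}^{n_i} x_{ij}^2 - \frac{G^2}{N},\qquad SS_\text{Treatment} = \sum_{i=1}^{k}\frac{T_i^2}{n_i} - \frac{G^2}{N},\qquad SS_\text{Error} = SS_\text{Total} - SS_\text{Treatment}$$
$$F = \frac{SS_\text{Treatment}/(k-1)}{SS_\text{Error}/(N-k)},\qquad p = P\left(F_{k-1,\,N-k} > F\right)$$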
asked Jul 25, 2012 at 6:35 by Next Door Engineer
- Quick question: how much data is expected to be in your file? In the millions? In the thousands? – Dave New, Jul 25, 2012 at 7:47
- @davenewza: The data is currently in the high thousands; it may grow into the millions in the coming days. – Next Door Engineer, Jul 25, 2012 at 8:12
- And the number of columns in each row? – Dave New, Jul 25, 2012 at 8:22
- Understood, but do you expect the number of columns to also possibly reach into the millions? The reason I ask these questions is that it is extremely important to understand the data before parallelizing (for example, take note of my comment on ANeves's post). – Dave New, Jul 25, 2012 at 8:32
- @davenewza: The number of rows will not go into the millions, it will be in the hundreds, while the columns could go into the millions. – Next Door Engineer, Jul 25, 2012 at 8:34
2 Answers
Some advice:
- Replace File.ReadAllLines with File.ReadLines: ReadAllLines reads the entire file into memory before the iteration starts, while foreach is really designed for a lazily evaluated IEnumerable. (A sketch follows this list.)
- Definitely replace for (int row = 0; row < numbers.Count; row++), for (int j = 0; j < rowTotal.Length; j++) and for (int i = 1; i <= iterations; i++) with PLINQ, as @ANeves suggested.
- Not sure that parallelizing foreach (string line in File.ReadAllLines(@"data.txt")) will help you, as it is heavily dependent on disk I/O.
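To illustrate the first point, here is a minimal sketch of a streaming, single-pass read (assuming the question's data.txt format and its using directives; the accumulator names are illustrative, not the asker's):
// File.ReadLines returns a lazy IEnumerable<string>, so each line is
// parsed and folded into running totals as it is read, and the full
// matrix never has to be held in memory.
var rowTotals = new List<double>();
var rowCounts = new List<int>();
double grandTotal = 0, sumOfSquaredValues = 0;
int totalElements = 0;
foreach (string line in File.ReadLines(@"data.txt"))
{
    double rowTotal = 0;
    int count = 0;
    foreach (string s in line.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries))
    {
        double value;
        if (Double.TryParse(s, out value))
        {
            rowTotal += value;
            sumOfSquaredValues += value * value;
            count++;
        }
    }
    rowTotals.Add(rowTotal);
    rowCounts.Add(count);
    grandTotal += rowTotal;
    totalElements += count;
}
All of the sums of squares in the question can then be computed from these accumulators.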
– Adam
Use Parallel.For instead of for, and Parallel.ForEach instead of foreach.
Example:
var lines = File.ReadAllLines("data.txt");
List<List<double>> numbers = new List<List<double>>();
char[] separators = { ',', ' ' };
/*System.Threading.Tasks.*/Parallel.ForEach<string>(lines, line => {
var list = new List<double>();
foreach (string s in line.Split(separators, StringSplitOptions.RemoveEmptyEntries)) {
double i;
if (Double.TryParse(s, out i)) {
list.Add(i);
}
}
numbers.Add(list);
});
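As the comments below point out, List<T>.Add is not thread-safe. A minimal sketch of one way around that (an editorial variant, assuming the Task Parallel Library's Parallel.For; each iteration writes only its own array slot, so no lock is needed):
var lines = File.ReadAllLines("data.txt");
char[] separators = { ',', ' ' };
// One slot per line: no two iterations ever touch the same element.
var numbers = new List<double>[lines.Length];
Parallel.For(0, lines.Length, i => {
    var list = new List<double>();
    foreach (string s in lines[i].Split(separators, StringSplitOptions.RemoveEmptyEntries)) {
        double value;
        if (Double.TryParse(s, out value)) {
            list.Add(value);
        }
    }
    numbers[i] = list;
});
This also preserves the original row order, which Parallel.ForEach adding to a shared list would not guarantee.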
answered Jul 25, 2012 at 8:00 by ANeves
- This certainly could work, but it really depends on the data. If the inner loop is the expensive part (i.e. it runs through a huge number of costly iterations), then this method may work. If the inner loop is relatively light but the outer loop runs many times, then parallelizing it might not even be worth it (it could even be slower). Remember that when using Parallel.ForEach, there is additional overhead spent on each iteration. – Dave New, Jul 25, 2012 at 8:21
- I believe you will need to lock around the numbers.Add(list) line, or use one of the concurrent collections. You cannot safely write to a List from two separate threads. – pstrjds, Jul 26, 2012 at 18:03
- This will only make it run slower. Splitting a single string is not an operation which deserves to be on a separate thread. – vgru, Jul 29, 2012 at 6:54
- @Groo I would really like to know why. If one reads hundreds of lines with millions of columns (see the OP's comment on the question), and splits and parses them into an unordered list of numbers, how can threading make it slower? Is the overhead so big that it is not offset by the time it takes to do a split and a million parses? I don't see how. – ANeves, Jul 30, 2012 at 7:47
- @ANeves: that's true, I didn't expect the number of columns to be that large; it's usually the other way around. But the main issue is that parsing is actually the simplest (and fastest) part of the algorithm anyway. I would use File.ReadLines to load lines sequentially and then process them on the fly, which would make the multithreading more worthwhile. I actually suggested this to the OP in his/her previous question, but I see s/he has returned to the initial approach. – vgru, Jul 30, 2012 at 11:31