I would like to optimize this code for efficiency. It computes a one-way ANOVA table, including the p-value, from a text file of comma-delimited data.
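For example, an input file might look like this (hypothetical values for illustration only; each comma-delimited line holds the observations for one treatment group):
3.1, 2.8, 3.5, 3.0
2.2, 2.6, 2.1
4.0, 3.7, 3.9, 4.2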
This is the main form, which calls the methods in the library file:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.IO;

namespace OneWayAnovaTable
{
    public partial class OneWayAnovaTable : Form
    {
        public OneWayAnovaTable()
        {
            InitializeComponent();
        }

        static string TSS, ESS, TotSS, TDF, EDF, TotDF, TMS, EMS, F, p;

        private void ReadFile()
        {
            // Each line of the file is one treatment group.
            List<List<double>> numbers = new List<List<double>>();
            foreach (string line in File.ReadAllLines(@"data.txt"))
            {
                var list = new List<double>();
                foreach (string s in line.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries))
                {
                    double i;
                    if (Double.TryParse(s, out i))
                    {
                        list.Add(i);
                    }
                }
                numbers.Add(list);
            }

            double[] rowTotal = new double[numbers.Count];
            double[] squareRowTotal = new double[numbers.Count];
            double[] rowMean = new double[numbers.Count];
            int totalElements = 0;
            int[] totalInRow = new int[numbers.Count];
            double grandTotalMean = 0; // accumulated below but never used afterwards
            double grandMean = 0;      // accumulated below but never used afterwards
            double grandTotal = 0;

            // Per-row totals, sums of squared values, and counts.
            for (int row = 0; row < numbers.Count; row++)
            {
                var values = numbers[row].ToArray();
                rowTotal[row] = values.Sum();
                squareRowTotal[row] = values.Select(v => v * v).Sum();
                rowMean[row] = rowTotal[row] / values.Length;
                totalInRow[row] = values.Length;
                totalElements += totalInRow[row];
                grandTotalMean += rowMean[row];
                grandMean += rowMean[row] / numbers.Count;
            }

            for (int j = 0; j < rowTotal.Length; j++)
            {
                grandTotal += rowTotal[j];
            }

            double sumOfSquares = OneWayAnovaClassLibrary.OneWayAnova.totalSumOfSquares(squareRowTotal, grandTotal, totalElements);
            double treatmentSumOfSquares = OneWayAnovaClassLibrary.OneWayAnova.treatmentSumOfSquares(rowTotal, totalInRow, grandTotal, totalElements);
            double errorSumOfSquares = OneWayAnovaClassLibrary.OneWayAnova.errorSumOfSquares(sumOfSquares, treatmentSumOfSquares);
            double meanTreatmentSumOfSquares = OneWayAnovaClassLibrary.OneWayAnova.meanTreatmentSumOfSquares(treatmentSumOfSquares, totalInRow);
            double meanErrorSumOfSquares = OneWayAnovaClassLibrary.OneWayAnova.meanErrorSumOfSquares(errorSumOfSquares, numbers.Count - 1, totalElements - 1);
            double fStatistic = OneWayAnovaClassLibrary.OneWayAnova.testStatistic(meanTreatmentSumOfSquares, meanErrorSumOfSquares);
            // Denominator degrees of freedom are N - k, i.e. totalElements - numbers.Count,
            // matching the EDF value displayed below.
            double pValue = OneWayAnovaClassLibrary.OneWayAnova.pValue(fStatistic, numbers.Count - 1, totalElements - numbers.Count);

            TSS = treatmentSumOfSquares.ToString();
            ESS = errorSumOfSquares.ToString();
            TotSS = sumOfSquares.ToString();
            TDF = (numbers.Count - 1).ToString();
            EDF = (totalElements - numbers.Count).ToString();
            TotDF = (totalElements - 1).ToString();
            TMS = meanTreatmentSumOfSquares.ToString();
            EMS = meanErrorSumOfSquares.ToString();
            F = fStatistic.ToString();
            p = pValue.ToString();
        }

        private void button2_Click(object sender, EventArgs e)
        {
            ReadFile();
            display();
        }

        private void display()
        {
            textBoxTSS.Text = TSS;
            textBoxESS.Text = ESS;
            textBoxTotSS.Text = TotSS;
            textBoxTDF.Text = TDF;
            textBoxEDF.Text = EDF;
            textBoxTotDF.Text = TotDF;
            textBoxTMS.Text = TMS;
            textBoxEMS.Text = EMS;
            textBoxF.Text = F;
            textBoxp.Text = p;
        }
    }
}
The library file is here:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace OneWayAnovaClassLibrary
{
    public class OneWayAnova
    {
        // Total sum of squares: all squared observations minus the
        // correction term G^2 / N.
        public static double totalSumOfSquares(double[] squareRowTotal, double grandTotal, int totalOfAllElements)
        {
            double sumOfSquares = 0;
            for (int i = 0; i < squareRowTotal.Length; i++)
            {
                sumOfSquares += squareRowTotal[i];
            }
            return sumOfSquares - (grandTotal * grandTotal / totalOfAllElements);
        }

        // Treatment (between-groups) sum of squares.
        public static double treatmentSumOfSquares(double[] rowTotal, int[] totalInRow, double grandTotal, int totalOfAllElements)
        {
            double treatmentSumOfSquares = 0;
            for (int i = 0; i < totalInRow.Length; i++)
            {
                treatmentSumOfSquares += rowTotal[i] * rowTotal[i] / totalInRow[i];
            }
            return treatmentSumOfSquares - (grandTotal * grandTotal / totalOfAllElements);
        }

        // Error (within-groups) sum of squares: SSE = SSTotal - SSTreatment.
        public static double errorSumOfSquares(double sumOfSquares, double treatmentSumOfSquares)
        {
            return sumOfSquares - treatmentSumOfSquares;
        }

        // Treatment mean square: SSTreatment over its k - 1 degrees of freedom.
        public static double meanTreatmentSumOfSquares(double treatmentSumOfSquares, int[] totalInRow)
        {
            return treatmentSumOfSquares / (totalInRow.Length - 1);
        }

        // Error mean square. Called with a = k - 1 and b = N - 1,
        // so b - a = N - k, the error degrees of freedom.
        public static double meanErrorSumOfSquares(double errorSumOfSquares, int a, int b)
        {
            return errorSumOfSquares / (double)(b - a);
        }

        // F statistic: ratio of treatment mean square to error mean square.
        public static double testStatistic(double meanTreatmentSumOfSquares, double meanErrorSumOfSquares)
        {
            return meanTreatmentSumOfSquares / meanErrorSumOfSquares;
        }

        // The p-value of the F-test is the upper-tail probability P(F > f).
        // Integrating the density from 0 to f gives the CDF, so subtract it from 1.
        public static double pValue(double fStatistic, int degreeNum, int degreeDenom)
        {
            return 1.0 - integrate(0, fStatistic, degreeNum, degreeDenom);
        }

        // Numerical integration by the composite Simpson's rule:
        // (h / 6) * (f(a) + f(b) + 2 * (interior points) + 4 * (midpoints)).
        public static double integrate(double start, double end, int degreeFreedomT, int degreeFreedomE)
        {
            int iterations = 100000;
            double x, dist, sum = 0, sumT = 0;
            dist = (end - start) / iterations;
            for (int i = 1; i <= iterations; i++)
            {
                x = start + i * dist;
                sumT += integralFunction(x - dist / 2, degreeFreedomT, degreeFreedomE);
                if (i < iterations)
                {
                    sum += integralFunction(x, degreeFreedomT, degreeFreedomE);
                }
            }
            sum = (dist / 6) * (integralFunction(start, degreeFreedomT, degreeFreedomE) + integralFunction(end, degreeFreedomT, degreeFreedomE) + 2 * sum + 4 * sumT);
            return sum;
        }

        // Density of the F distribution with degreeFreedomT numerator and
        // degreeFreedomE denominator degrees of freedom. The factorial-based
        // gamma values and the integer divisions by 2 are only exact when
        // both degrees of freedom are even.
        public static double integralFunction(double x, int degreeFreedomT, int degreeFreedomE)
        {
            return ((Math.Pow(degreeFreedomT, degreeFreedomT / 2) * Math.Pow(degreeFreedomE, degreeFreedomE / 2))
                / (factorial(degreeFreedomT / 2 - 1) * factorial(degreeFreedomE / 2 - 1)))
                * factorial((degreeFreedomT + degreeFreedomE) / 2 - 1)
                * Math.Pow(x, degreeFreedomT / 2 - 1)
                / Math.Pow(degreeFreedomE + degreeFreedomT * x, (degreeFreedomT + degreeFreedomE) / 2);
        }

        // n! computed recursively; only valid for non-negative integer n,
        // which holds here whenever the degrees of freedom are even.
        public static double factorial(double n)
        {
            if (n == 0)
            {
                return 1.0;
            }
            return n * factorial(n - 1);
        }
    }
}
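For reference, the shortcut formulas the library implements are the standard one-way ANOVA identities (with \$k\$ rows, row totals \$T_i\$, row sizes \$n_i\$, grand total \$G\$, and \$N\$ observations in all):
$$SS_\text{Total} = \sum_{i=1}^{k}\sum_{j=1}^{n_i} x_{ij}^2 - \frac{G^2}{N},\qquad SS_\text{Treatment} = \sum_{i=1}^{k}\frac{T_i^2}{n_i} - \frac{G^2}{N},\qquad SS_\text{Error} = SS_\text{Total} - SS_\text{Treatment}$$
$$F = \frac{SS_\text{Treatment}/(k-1)}{SS_\text{Error}/(N-k)},\qquad p = P\left(F_{k-1,\,N-k} > F\right)$$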
asked Jul 25, 2012 at 6:35 by Next Door Engineer
- Quick question: how much data is expected to be in your file? In the millions? In the thousands? – Dave New, Jul 25, 2012 at 7:47
- @davenewza: The data is currently in the high thousands; it may grow into the millions in the coming days. – Next Door Engineer, Jul 25, 2012 at 8:12
- And the number of columns in each row? – Dave New, Jul 25, 2012 at 8:22
- Understood, but do you expect the number of columns to also possibly reach into the millions? The reason I ask these questions is that it is extremely important to understand the data before parallelizing (for example, take note of my comment on ANeves's post). – Dave New, Jul 25, 2012 at 8:32
- @davenewza: The number of rows will not go into the millions, it will be in the hundreds, while the columns could go into the millions. – Next Door Engineer, Jul 25, 2012 at 8:34
2 Answers
Some advice:
- Replace File.ReadAllLines with File.ReadLines: ReadAllLines reads the entire file into memory before the iteration starts, while foreach is really designed for a lazily evaluated IEnumerable. (A sketch follows this list.)
- Definitely replace for (int row = 0; row < numbers.Count; row++), for (int j = 0; j < rowTotal.Length; j++) and for (int i = 1; i <= iterations; i++) with PLINQ, as @ANeves suggested.
- Not sure that parallelizing foreach (string line in File.ReadAllLines(@"data.txt")) will help you, as it is heavily dependent on disk I/O.
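To illustrate the first point, here is a minimal sketch of a streaming, single-pass read (assuming the question's data.txt format and its using directives; the accumulator names are illustrative, not the asker's):
// File.ReadLines returns a lazy IEnumerable<string>, so each line is
// parsed and folded into running totals as it is read, and the full
// matrix never has to be held in memory.
var rowTotals = new List<double>();
var rowCounts = new List<int>();
double grandTotal = 0, sumOfSquaredValues = 0;
int totalElements = 0;
foreach (string line in File.ReadLines(@"data.txt"))
{
    double rowTotal = 0;
    int count = 0;
    foreach (string s in line.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries))
    {
        double value;
        if (Double.TryParse(s, out value))
        {
            rowTotal += value;
            sumOfSquaredValues += value * value;
            count++;
        }
    }
    rowTotals.Add(rowTotal);
    rowCounts.Add(count);
    grandTotal += rowTotal;
    totalElements += count;
}
All of the sums of squares in the question can then be computed from these accumulators.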
– Adam
Use Parallel.For instead of for, and Parallel.ForEach instead of foreach.
Example:
var lines = File.ReadAllLines("data.txt");
List<List<double>> numbers = new List<List<double>>();
char[] separators = { ',', ' ' };
/*System.Threading.Tasks.*/Parallel.ForEach<string>(lines, line => {
var list = new List<double>();
foreach (string s in line.Split(separators, StringSplitOptions.RemoveEmptyEntries)) {
double i;
if (Double.TryParse(s, out i)) {
list.Add(i);
}
}
numbers.Add(list);
});
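As the comments below point out, List<T>.Add is not thread-safe. A minimal sketch of one way around that (an editorial variant, assuming the Task Parallel Library's Parallel.For; each iteration writes only its own array slot, so no lock is needed):
var lines = File.ReadAllLines("data.txt");
char[] separators = { ',', ' ' };
// One slot per line: no two iterations ever touch the same element.
var numbers = new List<double>[lines.Length];
Parallel.For(0, lines.Length, i => {
    var list = new List<double>();
    foreach (string s in lines[i].Split(separators, StringSplitOptions.RemoveEmptyEntries)) {
        double value;
        if (Double.TryParse(s, out value)) {
            list.Add(value);
        }
    }
    numbers[i] = list;
});
This also preserves the original row order, which Parallel.ForEach adding to a shared list would not guarantee.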
answered Jul 25, 2012 at 8:00 by ANeves
- This certainly could work, but it really depends on the data. If the inner loop is the expensive part (i.e. it runs through a huge number of costly iterations), then this method may work. If the inner loop is relatively light but the outer loop runs many times, then parallelizing it might not even be worth it (it could even be slower). Remember that when using Parallel.ForEach, there is additional overhead spent on each iteration. – Dave New, Jul 25, 2012 at 8:21
- I believe you will need to lock around the numbers.Add(list) line, or use one of the concurrent collections. You cannot safely write to a List from two separate threads. – pstrjds, Jul 26, 2012 at 18:03
- This will only make it run slower. Splitting a single string is not an operation which deserves to be on a separate thread. – vgru, Jul 29, 2012 at 6:54
- @Groo I would really like to know why. If one reads hundreds of lines with millions of columns (see the OP's comment on the question), and splits and parses them into an unordered list of numbers, how can threading make it slower? Is the overhead so big that it is not offset by the time it takes to do a split and a million parses? I don't see how. – ANeves, Jul 30, 2012 at 7:47
- @ANeves: that's true, I didn't expect the number of columns to be that large; it's usually the other way around. But the main issue is that parsing is actually the simplest (and fastest) part of the algorithm anyway. I would use File.ReadLines to load lines sequentially and then process them on the fly, which would make the multithreading more worthwhile. I actually suggested this to the OP in his/her previous question, but I see s/he has returned to the initial approach. – vgru, Jul 30, 2012 at 11:31