2
\$\begingroup\$

This is my code for clustering 111272 by 29987 tfidf vector but it is taking long time .How can I speed up the code execution. The tfidf matrix is sparse and works well for 10000 records.

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package singlepass;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import com.opencsv.CSVReader;
import org.apache.commons.lang3.ArrayUtils;
import java.util.Arrays;
/**
 *
 * @author User
 */
public class SinglePass {
 /**
 * @param args the command line arguments
 */
 public static void main(String[] args) throws IOException {
 int noOfDocuments = 5;
 int noOfTokens = 5;
 float threshhold = 0.6f;
 SinglePassAlgorithm(threshhold);
 }
 private static void SinglePassAlgorithm(float threshhold) throws IOException {
 int docNo = 0;
 //variable to store the clusters
 ArrayList<int[]> cluster = new ArrayList<int[]>();
 //variable to store the clusterRepresentations-centroids
 ArrayList< Float[]> clusterRepresentative = new ArrayList< Float[]>();
 //initial no of cluster
 int noOfClusters = 1;
 //declared but intialised later on first read
 int noOfTokens;
 //Code to read from a file one line at a time without consuming memory
 //"C:\\Users\\User\\Desktop\\IITpatna\\code\\SinglePass\\src\\singlepass\\foo.csv"
 // "C:\\Users\\User\\Desktop\\IITpatna\\code\\twitter_streams\\foo.csv"
 CSVReader reader = new CSVReader(new FileReader("C:\\Users\\User\\Desktop\\IITpatna\\code\\twitter_streams\\foo-comma_old.csv"), ',');
 //stores the current line read from the stream
 String[] nextLine;
 //loop until no more entry exists in the stream
 while ((nextLine = reader.readNext()) != null) {
 //TODO may be check for the number of dimension for every record
 //handle blanks and nulls
 nextLine = Arrays.stream(nextLine)
 .filter(s -> (s != null && s.length() > 0))
 .toArray(String[]::new);
 //set no of tokens from single feature
 noOfTokens = nextLine.length;
 //if first record
 if (docNo == 0) {
 float fResult[] = new float[nextLine.length];
 //parse the string feature into Float
 for (int i = 0; i < nextLine.length; i++) {
 fResult[i] = Float.parseFloat(nextLine[i]);
 }
 //add the parsed record as the zeroth record
 // i.e. since it is the first document read it can put into cluster 
 //as first cluster without any harm
 cluster.add(new int[]{docNo});
 //convert the read features into float and add them to the clusterRepresentative Store as the first centroid 
 Float[] temp = new Float[noOfTokens];
 temp = convertintArrToFloatArr(fResult);
 clusterRepresentative.add(temp);
 } else {
 //it is not the first record...any other record
 //parse it into float
 float fResult[] = new float[nextLine.length];
 for (int i = 0; i < nextLine.length; i++) {
 fResult[i] = Float.parseFloat(nextLine[i]);
 }
 //variable to capture the max similarity till now
 float max = -1;
 //variable to capture the max similarity clusterId
 int clusterId = -1;
 //since we are in else part we assume there are other cluster
 //loop through every current cluster found till now to calculate the similarity and the cluster id
 for (int j = 0; j < noOfClusters; ++j) {
 //compute the cosine similarity
 float similarity = calculateSimilarity(convertintArrToFloatArr(fResult), clusterRepresentative.get(j));
 //check if greater than the threshold
 if (similarity > threshhold) {
 //check if greater than max
 if (similarity > max) {
 max = similarity;
 clusterId = j;
 }
 }
 }
 if (max == -1) {
 //case when the similarity value never crossed the threshold 
 //it means new cluster needs to be created
 //add the current doc as new entry in to the cluster
 cluster.add(new int[]{docNo});
 noOfClusters++;
 //add the current doc as new represenation for itself
 clusterRepresentative.add(convertintArrToFloatArr(fResult));
 } else {
 //else we found a candidate for merging with existing cluster
 //cluster contains other docs so fetch them
 int[] values = cluster.get(clusterId);
 //create a new array with size one
 int[] newValue = new int[1];
 //add the newly found doc into the newValue 
 newValue[0] = docNo;
 //merge both the values from the cluster ..old and the latest found
 cluster.set(clusterId, ArrayUtils.addAll(values, newValue));
 //compute the new centroid representation for the newly modified cluster
 clusterRepresentative.set(clusterId,
 calculateClusterRepresentative(cluster.get(clusterId), fResult, clusterId, clusterRepresentative, noOfTokens));
 }
 }
 System.out.println(docNo);
 docNo += 1;
 }
 for (int i = 0; i < noOfClusters; ++i) {
 System.out.print("\n" + i + "\t");
 for (int j = 0; j < cluster.get(i).length; ++j) {
 System.out.print(" " + cluster.get(i)[j]);
 }
 }
 }
 private static Float[] convertintArrToFloatArr(float[] input) {
 int size = input.length;
 Float[] answer = new Float[size];
 for (int i = 0; i < input.length; ++i) {
 answer[i] = (float) input[i];
 }
 return answer;
 }
 private static float calculateSimilarity(Float[] vectorA, Float[] vectorB) {
 float answer = 0;
 double dotProduct = 0.0;
 double normA = 0.0;
 double normB = 0.0;
 for (int i = 0; i < vectorA.length; i++) {
 dotProduct += vectorA[i] * vectorB[i];
 normA += Math.pow(vectorA[i], 2);
 normB += Math.pow(vectorB[i], 2);
 }
 answer = (float) ((float) dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)));
 return answer;
 }
 private static Float[] calculateClusterRepresentative(int[] cluster,
 float[] input,
 int clusterId,
 ArrayList< Float[]> clusterRepresentative,
 int noOFTokens) {
 //create a answer variable equal to the dimension of the noOFTokens
 Float[] answer = new Float[noOFTokens];
 for (int i = 0; i < noOFTokens; ++i) {
 answer[i] = Float.parseFloat("0");
 }
 //get the cluster representation
 Float[] clusRepresent = clusterRepresentative.get(clusterId);
 //get the number of members in the cluster
 int clusterMemberSize = cluster.length;
 for (int i = 0; i < noOFTokens; ++i) {
 //so we multiply the previous cluster represenation by one number less and add it to new member features
 answer[i] = clusRepresent[i] * (clusterMemberSize - 1) + input[i];
 }
 for (int i = 0; i < noOFTokens; ++i) {
 //divide the sum of all the cluster members by the total number of memebers to calculate the new centroid
 answer[i] /= clusterMemberSize;
 }
 return answer;
 }
}
asked Mar 1, 2017 at 7:55
\$\endgroup\$

2 Answers 2

2
\$\begingroup\$

You should avoid autoboxing and use native types wherever you can.

  • Change ArrayList< Float[]> to ArrayList< float[]> and avoid the autoboxing that is happening all over the place.
  • 'convertintArrToFloatArr' is actually converting a float[] to a Float[] so you can get rid of calling that completely.
  • calculateSimilarity(Float[] vectorA, Float[] vectorB) will speed up a little if it doesn't have to autobox so change to calculateSimilarity(float[] vectorA, float[] vectorB)
  • You are using Float.parseFloat(..), this is good because it already returns a float. So don't autobox that to a Float again.
  • You have a couple of double to float casts in calculateSimilarity that you may be able to simplify. Alternatively, if memory is not a concern, why not use doubles all over the place to avoid all the conversions?
answered Mar 1, 2017 at 12:39
\$\endgroup\$
0
\$\begingroup\$

Another improvement that I just noticed is this line:

cluster.set(clusterId, ArrayUtils.addAll(values, newValue));

You are appending a single element to an existing array using ArrayUtils.addAll. So basically you are merging a lot of arrays just to append a single element. That probably adds up. Why not make cluster a list of lists and simply add the newValue. ArrayList will internally also copy the array once in a while when it needs to but a lot less often then you are doing here.

t3chb0t
44.6k9 gold badges84 silver badges190 bronze badges
answered Mar 2, 2017 at 11:14
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.