Implementation of Kosaraju's algorithm for strongly connected components

Question 1

To learn and practice coding in C++, I wrote an implementation of Kosaraju's two-pass algorithm for computing the strongly connected components in a directed graph, using depth-first search.

This was my first time touching any C++ code in a while (I mainly program in Python or C#). As such, any critique or advice on style, layout, readability, maintainability, and best practice would be greatly appreciated. And, although performance wasn't a primary goal, I am highly interested in any optimizations that could be made (currently it can process about 800 thousand nodes with 5 million edges in around 10 seconds).

#include <iostream>
#include <fstream>
#include <vector>
#include <map>
#include <list>
using std::vector;
using std::map;
using std::list;
using std::ifstream;
using std::cout;
using std::endl;
// Constants
//-------------------------------
const char FILENAME[] = "SCC.txt";
// Prototypes
//-------------------------------
long get_node_count(const char filename[]);
vector< vector<long> > parse_file(const char filename[]);
map< long, vector<long> > compute_scc(vector< vector<long> > &graph);
vector< vector<long> > reverse_graph(const vector< vector<long> > &graph);
void dfs_loop(const vector< vector<long> > &graph, vector<long> &finishTime, vector<long> &leader);
long dfs(const vector< vector<long> > &graph, long nodeIndex, vector<bool> &expanded, vector<long> &finishTime, long t, vector<long> &leader, long s);
list<unsigned long> get_largest_components(const map< long, vector<long> > scc, long size);
/**
 * Main
 */
int main() {
 // Get the sequential graph representation from the file
 vector< vector<long> > graph = parse_file(FILENAME);
 // Compute the strongly-connected components
 map< long, vector<long> > scc = compute_scc(graph);
 // Compute the largest 5 components and print them out
 list<unsigned long> largestComponents = get_largest_components(scc, 5);
 list<unsigned long>::iterator it;
 for (it = largestComponents.begin(); it != largestComponents.end(); it++) {
 cout << *it << ' ';
 }
 cout << endl;
 return 0;
}
/**
 * Parse an input file as a graph, and return the graph.
 */
vector< vector<long> > parse_file(const char filename[]) {
 // Get the node count and prepare the graph
 long nodeCount = get_node_count(filename);
 vector< vector<long> > graph(nodeCount);
 // Open file and extract the data
 ifstream graphFile(filename);
 long nodeIndex;
 long outIndex;
 while (graphFile) {
 graphFile >> nodeIndex;
 graphFile >> outIndex;
 // Add the new outgoing edge to the node
 graph[nodeIndex - 1].push_back(outIndex - 1);
 }
 return graph;
}
/**
 * Get the count of nodes from a graph file representation
 */
long get_node_count(const char filename[]) {
 // Open file and keep track of how many times the value changes
 ifstream graphFile(filename);
 long maxNodeIndex = 0;
 long nodeIndex = 0;
 while (graphFile) {
 // Check the node index
 graphFile >> nodeIndex;
 if (nodeIndex > maxNodeIndex) {
 maxNodeIndex = nodeIndex;
 }
 // Check the outgoing edge
 graphFile >> nodeIndex;
 if (nodeIndex > maxNodeIndex) {
 maxNodeIndex = nodeIndex;
 }
 }
 return maxNodeIndex;
}
/**
 * Compute all of the strongly-connected components of a graph
 * using depth-first search, Kosaraju's 2-pass method
 */
map< long, vector<long> > compute_scc(vector< vector<long> > &graph) {
 // Create finishing time and leader vectors to record the data
 // from the search
 vector<long> finishTime(graph.size(), 0);
 vector<long> leader(graph.size(), 0);
 // Initialize the finish time initially to be the numbers of the graph
 vector<long>::iterator it;
 long index = 0;
 for (it = finishTime.begin(); it != finishTime.end(); it++) {
 *it = index;
 index++;
 }
 // Reverse the graph, to compute the 'magic' finishing times
 vector< vector<long> > reversed = reverse_graph(graph);
 dfs_loop(reversed, finishTime, leader);
 // Compute the SCC leaders using the finishing times
 dfs_loop(graph, finishTime, leader);
 // Distribute nodes to SCCs
 map< long, vector<long> > scc;
 vector<long>::iterator lit;
 for (lit = leader.begin(); lit != leader.end(); lit++) {
 long nodeIndex = lit - leader.begin();
 // Append node to SCC
 scc[*lit].push_back(nodeIndex);
 }
 return scc;
}
/**
 * Reverse a directed graph by looping through each node/edge pair
 * and recording the reverse
 */
vector< vector<long> > reverse_graph(const vector< vector<long> > &graph) {
 // Create new graph
 vector< vector<long> > reversed(graph.size());
 // Loop through all elements and fill new graph with reversed endpoints
 vector< vector<long> >::const_iterator it;
 for (it = graph.begin(); it != graph.end(); it++) {
 long nodeIndex = it - graph.begin();
 // Loop through all outgoing edges, and reverse them in new graph
 vector<long>::const_iterator eit;
 for (eit = graph[nodeIndex].begin(); eit != graph[nodeIndex].end(); eit++) {
 reversed[*eit].push_back(nodeIndex);
 }
 }
 return reversed;
}
/**
 * Compute a depth-first search through all nodes of a graph
 */
void dfs_loop(const vector< vector<long> > &graph, vector<long> &finishTime, vector<long> &leader) {
 // Create expanded tracker and copied finishing time tracker
 vector<bool> expanded(graph.size(), 0);
 vector<long> loopFinishTime = finishTime;
 long t = 0;
 vector<long>::reverse_iterator it;
 // Outer loop through all nodes in order to cover disconnected
 // sections of the graph
 for (it = loopFinishTime.rbegin(); it != loopFinishTime.rend(); it++) {
 // Compute a depth-first search if the node hasn't
 // been expanded yet
 if (!expanded[*it]) {
 t = dfs(graph, *it, expanded, finishTime, t, leader, *it);
 }
 }
}
/**
 * Search through a directed graph recursively, beginning at node 'nodeIndex',
 * until no more node can be searched, recording the finishing times and the 
 * leaders
 */
long dfs(
 const vector< vector<long> > &graph,
 long nodeIndex,
 vector<bool> &expanded,
 vector<long> &finishTime,
 long t,
 vector<long> &leader,
 long s
) {
 // Mark the current node as explored
 expanded[nodeIndex] = true;
 // Set the leader to the given leader
 leader[nodeIndex] = s;
 // Loop through outgoing edges
 vector<long>::const_iterator it;
 for (it = graph[nodeIndex].begin(); it != graph[nodeIndex].end(); it++) {
 // Recursively call DFS if not explored
 if (!expanded[*it]) {
 t = dfs(graph, *it, expanded, finishTime, t, leader, s);
 }
 }
 // Update the finishing time
 finishTime[t] = nodeIndex;
 t++;
 return t;
}
/**
 * Computes the largest 'n' of a strongly-connected component list
 * and return them
 */
list<unsigned long> get_largest_components(const map< long, vector<long> > scc, long size) {
 // Create vector to hold the largest components
 list<unsigned long> largest(size, 0);
 // Iterate through map and keep track of largest components
 map< long, vector<long> >::const_iterator it;
 for (it = scc.begin(); it != scc.end(); it++) {
 // Search through the current largest list to see if there exists
 // an SCC with less elements than the current one
 list<unsigned long>::iterator lit;
 for (lit = largest.begin(); lit != largest.end(); lit++) {
 // Compare size and change largest if needed, inserting
 // the new one at the proper position, and popping off the old
 if (*lit < it->second.size()) {
 largest.insert(lit, it->second.size());
 largest.pop_back();
 break;
 }
 }
 }
 return largest;
}

An example input file (SCC.txt) is composed of edges represented by begin-end node numbers. It could look like this:

Question 2

Fair enough. Lazy but OK I suppose. Personally when using using like this I bind it to the tightest scope possible.

using std::vector;
using std::map;
using std::list;
using std::ifstream;
using std::cout;
using std::endl;

The idea of making it std and not standard was so that it would not be too obnoxious when going std::cout in the code.

This is very obnoxious to read. A bit of work formatting these lines to make it easy to read would have gone a long way:

long get_node_count(const char filename[]);
vector< vector<long> > parse_file(const char filename[]);
map< long, vector<long> > compute_scc(vector< vector<long> > &graph);
vector< vector<long> > reverse_graph(const vector< vector<long> > &graph);
void dfs_loop(const vector< vector<long> > &graph, vector<long> &finishTime, vector<long> &leader);
long dfs(const vector< vector<long> > &graph, long nodeIndex, vector<bool> &expanded, vector<long> &finishTime, long t, vector<long> &leader, long s);
list<unsigned long> get_largest_components(const map< long, vector<long> > scc, long size);

Nothing wrong with this:

// Get the sequential graph representation from the file
vector< vector<long> > graph = parse_file(FILENAME);

But you don' think it would have been more intuitive to read as:

MyGraph graph(FILENAME);

A tiny bit of work wrapping your data into a class goes a long way to make the code more readable. Your style you are using the classes available in C++ but really your style is more C than C++.

Learn to use the standard algorithms:

list<unsigned long>::iterator it;
for (it = largestComponents.begin(); it != largestComponents.end(); it++) {
 cout << *it << ' ';
}
cout << endl;
// Easier to write as
std::copy(largestComponents.begin(), largestComponents.end(),
 std::ostream_iterator<unsigned long>(std::cout, " ");
std::cout << std::endl;

Main is special (you don't actually need to return anything. If there is no return in main() it is equivalent to return 0).

return 0;

If you're code never returns anything but 0 then I would not return anything (let the language do it stuff). Do this to distinguish it from code that can return an error code.

I hope you profiled to make sure it was worth reading the file twice.

long nodeCount = get_node_count(filename);
vector< vector<long> > graph(nodeCount);

The vector automatically re-sizes as required. It uses a heuristic to prevent it re-sizing too many times. If you are using a C++11 compiler then the cost of resizing will be minimized as it will be using move rather than copy to reduce the cost.

This is almost always wrong:

while (graphFile) {

The last successfull read will read up-to but not past the end of file. Thus leaving the stream in a good state even though there is no data left to read from the stream. Thus the loop will be enetered but the first read will fail. But inside your loop you don't check for failure. As a result you push_back() one more node than exists in the file (though the last node is a copy of the previous node).

The correct way to write the loop is:

while(graphFile >> nodeIndex >> outIndex)
{
 // Read of both values was successful
 // Add it to the graph.
 graph[nodeIndex - 1].push_back(outIndex - 1);
}

Learn the standard functions:

 if (nodeIndex > maxNodeIndex) {
 maxNodeIndex = nodeIndex;
 }
 // Easier to read as:
 maxNodeIndex = std::max(maxNodeIndex, nodeIndex);

There is no point in calculating nodeIndex. Either use the iterators, or use the index into the array the combination of both techniques is untidy.

for (it = graph.begin(); it != graph.end(); it++) {
 long nodeIndex = it - graph.begin();
 // Loop through all outgoing edges, and reverse them in new graph
 vector<long>::const_iterator eit;
 for (eit = graph[nodeIndex].begin(); eit != graph[nodeIndex].end(); eit++) {
 reversed[*eit].push_back(nodeIndex);
 }

Question 3

I have a few comments.

In the parse_file function, you need to close the file before you return the graph by using
```
graphFile.close()
```
Your reverse_graph function could be better. I think you can parse the file again but this time reverse the head and tail of the node to get a reversed graph. For example, instead of read in head first, you can read in the tail and then the head.
Your graph is big (with 800 thousand nodes). It would be better to use pass by reference extensively. For example, in your parse_file function, you construct a very big graph in the function, and return it. It would be better if you pass in a graph by reference and construct it while you read file. In this way, you create fewer temporary objects (graph in this case) and it saves you some memory. The same idea applies in other functions like compute_scc and, if you want, get_largest_component. For example:
```
void parse_file(const char filename[], Graph & graph);
```

score 8 · Accepted Answer · 2012-04-17 08:21:28Z

Fair enough. Lazy but OK I suppose. Personally when using using like this I bind it to the tightest scope possible.

using std::vector;
using std::map;
using std::list;
using std::ifstream;
using std::cout;
using std::endl;

The idea of making it std and not standard was so that it would not be too obnoxious when going std::cout in the code.

This is very obnoxious to read. A bit of work formatting these lines to make it easy to read would have gone a long way:

long get_node_count(const char filename[]);
vector< vector<long> > parse_file(const char filename[]);
map< long, vector<long> > compute_scc(vector< vector<long> > &graph);
vector< vector<long> > reverse_graph(const vector< vector<long> > &graph);
void dfs_loop(const vector< vector<long> > &graph, vector<long> &finishTime, vector<long> &leader);
long dfs(const vector< vector<long> > &graph, long nodeIndex, vector<bool> &expanded, vector<long> &finishTime, long t, vector<long> &leader, long s);
list<unsigned long> get_largest_components(const map< long, vector<long> > scc, long size);

Nothing wrong with this:

// Get the sequential graph representation from the file
vector< vector<long> > graph = parse_file(FILENAME);

But you don' think it would have been more intuitive to read as:

MyGraph graph(FILENAME);

A tiny bit of work wrapping your data into a class goes a long way to make the code more readable. Your style you are using the classes available in C++ but really your style is more C than C++.

Learn to use the standard algorithms:

list<unsigned long>::iterator it;
for (it = largestComponents.begin(); it != largestComponents.end(); it++) {
 cout << *it << ' ';
}
cout << endl;
// Easier to write as
std::copy(largestComponents.begin(), largestComponents.end(),
 std::ostream_iterator<unsigned long>(std::cout, " ");
std::cout << std::endl;

Main is special (you don't actually need to return anything. If there is no return in main() it is equivalent to return 0).

return 0;

If you're code never returns anything but 0 then I would not return anything (let the language do it stuff). Do this to distinguish it from code that can return an error code.

I hope you profiled to make sure it was worth reading the file twice.

long nodeCount = get_node_count(filename);
vector< vector<long> > graph(nodeCount);

The vector automatically re-sizes as required. It uses a heuristic to prevent it re-sizing too many times. If you are using a C++11 compiler then the cost of resizing will be minimized as it will be using move rather than copy to reduce the cost.

This is almost always wrong:

while (graphFile) {

The last successfull read will read up-to but not past the end of file. Thus leaving the stream in a good state even though there is no data left to read from the stream. Thus the loop will be enetered but the first read will fail. But inside your loop you don't check for failure. As a result you push_back() one more node than exists in the file (though the last node is a copy of the previous node).

The correct way to write the loop is:

while(graphFile >> nodeIndex >> outIndex)
{
 // Read of both values was successful
 // Add it to the graph.
 graph[nodeIndex - 1].push_back(outIndex - 1);
}

Learn the standard functions:

 if (nodeIndex > maxNodeIndex) {
 maxNodeIndex = nodeIndex;
 }
 // Easier to read as:
 maxNodeIndex = std::max(maxNodeIndex, nodeIndex);

There is no point in calculating nodeIndex. Either use the iterators, or use the index into the array the combination of both techniques is untidy.

for (it = graph.begin(); it != graph.end(); it++) {
 long nodeIndex = it - graph.begin();
 // Loop through all outgoing edges, and reverse them in new graph
 vector<long>::const_iterator eit;
 for (eit = graph[nodeIndex].begin(); eit != graph[nodeIndex].end(); eit++) {
 reversed[*eit].push_back(nodeIndex);
 }

Stack Exchange Network

Implementation of Kosaraju's algorithm for strongly connected components

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Implementation of Kosaraju's algorithm for strongly connected components

2 Answers 2

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions