
// Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#include "tester.h"
#include <dlib/manifold_regularization.h>
#include <dlib/svm.h>
#include <dlib/rand.h>
#include <dlib/string.h>
#include <dlib/graph_utils_threaded.h>
#include <vector>
#include <sstream>
#include <ctime>
namespace 
{
 using namespace test;
 using namespace dlib;
 using namespace std;
 dlib::logger dlog("test.linear_manifold_regularizer");
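 // Check that the LSH based k-nearest-neighbor search agrees with the exact brute
 // force search: both should produce the same set of edges with nearly identical
 // distances.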
 template <typename hash_type, typename samples_type>
 void test_find_k_nearest_neighbors_lsh(
 const samples_type& samples
 )
 {
 std::vector<sample_pair> edges1, edges2;
 find_k_nearest_neighbors(samples, cosine_distance(), 2, edges1);
 find_k_nearest_neighbors_lsh(samples, cosine_distance(), hash_type(), 2, 6, edges2, 2);
 std::sort(edges1.begin(), edges1.end(), order_by_index<sample_pair>);
 std::sort(edges2.begin(), edges2.end(), order_by_index<sample_pair>);
 DLIB_TEST_MSG(edges1.size() == edges2.size(), edges1.size() << " " << edges2.size());
 for (unsigned long i = 0; i < edges1.size(); ++i)
 {
 DLIB_TEST(edges1[i] == edges2[i]);
 DLIB_TEST_MSG(std::abs(edges1[i].distance() - edges2[i].distance()) < 1e-7,
 edges1[i].distance() - edges2[i].distance());
 }
 }
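 // Run the LSH knn check on a small set of randomly generated sparse vectors, once
 // for each supported hash size.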
 template <typename scalar_type>
 void test_knn_lsh_sparse()
 {
 dlib::rand rnd;
 std::vector<std::map<unsigned long,scalar_type> > samples;
 samples.resize(20);
 for (unsigned int i = 0; i < samples.size(); ++i)
 {
 samples[i][0] = rnd.get_random_gaussian();
 samples[i][2] = rnd.get_random_gaussian();
 }
 test_find_k_nearest_neighbors_lsh<hash_similar_angles_64>(samples);
 test_find_k_nearest_neighbors_lsh<hash_similar_angles_128>(samples);
 test_find_k_nearest_neighbors_lsh<hash_similar_angles_256>(samples);
 test_find_k_nearest_neighbors_lsh<hash_similar_angles_512>(samples);
 }
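 // Same as above but using dense column vector samples.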
 template <typename scalar_type>
 void test_knn_lsh_dense()
 {
 dlib::rand rnd;
 std::vector<matrix<scalar_type,0,1> > samples;
 samples.resize(20);
 for (unsigned int i = 0; i < samples.size(); ++i)
 {
 samples[i].set_size(2);
 samples[i](0) = rnd.get_random_gaussian();
 samples[i](1) = rnd.get_random_gaussian();
 }
 test_find_k_nearest_neighbors_lsh<hash_similar_angles_64>(samples);
 test_find_k_nearest_neighbors_lsh<hash_similar_angles_128>(samples);
 test_find_k_nearest_neighbors_lsh<hash_similar_angles_256>(samples);
 test_find_k_nearest_neighbors_lsh<hash_similar_angles_512>(samples);
 }
 class linear_manifold_regularizer_tester : public tester
 {
 /*!
 WHAT THIS OBJECT REPRESENTS
 This object represents a unit test. When it is constructed
 it adds itself into the testing framework.
 !*/
 public:
 linear_manifold_regularizer_tester (
 ) :
 tester (
 "test_linear_manifold_regularizer", // the command line argument name for this test
 "Run tests on the linear_manifold_regularizer object.", // the command line argument description
 0 // the number of command line arguments for this test
 )
 {
 seed = 1;
 }
 dlib::rand rnd;
 unsigned long seed;
 typedef matrix<double, 0, 1> sample_type;
 typedef radial_basis_kernel<sample_type> kernel_type;
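 // Build a dataset of two concentric circles, run the linear_manifold_regularizer
 // on it, check the resulting transformation matrix against one computed with a
 // straightforward dense formula, and finally verify that the transformed space
 // separates the two circles.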
 void do_the_test()
 {
 print_spinner();
 std::vector<sample_type> samples;
 // Declare an instance of the kernel we will be using. 
 const kernel_type kern(0.1);
 const unsigned long num_points = 200;
 // create a large dataset with two concentric circles. 
 generate_circle(samples, 1, num_points); // circle of radius 1
 generate_circle(samples, 5, num_points); // circle of radius 5
 std::vector<sample_pair> edges;
 find_percent_shortest_edges_randomly(samples, squared_euclidean_distance(0.1, 4), 1, 10000, "random seed", edges);
 dlog << LTRACE << "number of edges generated: " << edges.size();
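 // Create an empirical kernel map from 100 randomly selected basis samples so that
 // each sample can be represented as an ordinary column vector.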
 empirical_kernel_map<kernel_type> ekm;
 ekm.load(kern, randomly_subsample(samples, 100));
 // Project all the samples into the span of our 100 basis samples
 for (unsigned long i = 0; i < samples.size(); ++i)
 samples[i] = ekm.project(samples[i]);
 // Now create the manifold regularizer. The result is a transformation matrix that
 // embodies the manifold assumption, i.e. that samples connected by an edge should
 // map to similar outputs.
 linear_manifold_regularizer<sample_type> lmr;
 lmr.build(samples, edges, use_gaussian_weights(0.1));
 matrix<double> T = lmr.get_transformation_matrix(10000);
 print_spinner();
 // Generate the T matrix manually and make sure it matches. The point of this test
 // is to verify that the more complex computation performed inside the
 // linear_manifold_regularizer is correct. That version uses a tedious block of
 // loops to compute the same thing in a way that is much faster for sparse
 // W matrices but isn't very straightforward.
 matrix<double> X(samples[0].size(), samples.size());
 for (unsigned long i = 0; i < samples.size(); ++i)
 set_colm(X,i) = samples[i];
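 // Build the symmetric edge weight matrix W implied by the graph edges, using the
 // same gaussian weighting that was given to lmr.build().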
 matrix<double> W(samples.size(), samples.size());
 W = 0;
 for (unsigned long i = 0; i < edges.size(); ++i)
 {
 W(edges[i].index1(), edges[i].index2()) = use_gaussian_weights(0.1)(edges[i]);
 W(edges[i].index2(), edges[i].index1()) = use_gaussian_weights(0.1)(edges[i]);
 }
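 // L is the graph Laplacian, i.e. the diagonal degree matrix minus W. The expected
 // transformation is the inverse of the lower triangular Cholesky factor of
 // I + (lambda/sum(lowerm(W)))*X*L*trans(X), with lambda = 10000 matching the value
 // passed to get_transformation_matrix() above.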
 matrix<double> L = diagm(sum_rows(W)) - W;
 matrix<double> trueT = inv_lower_triangular(chol(identity_matrix<double>(X.nr()) + (10000.0/sum(lowerm(W)))*X*L*trans(X)));
 dlog << LTRACE << "T error: "<< max(abs(T - trueT));
 DLIB_TEST(max(abs(T - trueT)) < 1e-7);
 print_spinner();
 // Apply the transformation generated by the linear_manifold_regularizer to 
 // all our samples.
 for (unsigned long i = 0; i < samples.size(); ++i)
 samples[i] = T*samples[i];
 // For convenience, generate a projection_function and merge the transformation
 // matrix T into it. 
 projection_function<kernel_type> proj = ekm.get_projection_function();
 proj.weights = T*proj.weights;
 // Pick two test points, one on the inner circle and one on the outer circle.
 // For each such pair we check whether classifying every sample by which of the
 // two points it is closest to (i.e. the single plane separating them) correctly
 // separates the concentric circles. We repeat this several times with different
 // randomly chosen points to see how robust the result is.
 for (int itr = 0; itr < 10; ++itr)
 {
 print_spinner();
 std::vector<sample_type> test_points;
 // generate a random point from the radius 1 circle
 generate_circle(test_points, 1, 1);
 // generate a random point from the radius 5 circle
 generate_circle(test_points, 5, 1);
 // project the two test points into kernel space. Recall that this projection_function
 // has the manifold regularizer incorporated into it. 
 const sample_type class1_point = proj(test_points[0]);
 const sample_type class2_point = proj(test_points[1]);
 double num_wrong = 0;
 // Now attempt to classify all the data samples according to which point
 // they are closest to. Without manifold regularization this check would fail,
 // but with it every point should be classified correctly.
 for (unsigned long i = 0; i < samples.size(); ++i)
 {
 double distance_to_class1 = length(samples[i] - class1_point);
 double distance_to_class2 = length(samples[i] - class2_point);
 bool predicted_as_class_1 = (distance_to_class1 < distance_to_class2);
 bool really_is_class_1 = (i < num_points);
 // now count how many times we make a mistake
 if (predicted_as_class_1 != really_is_class_1)
 ++num_wrong;
 }
 DLIB_TEST_MSG(num_wrong == 0, num_wrong);
 }
 }
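 // Append num random points lying on the circle of the given radius centered at the
 // origin. The x coordinate is drawn uniformly from [-radius, radius] and the sign of
 // y is chosen at random so the point lies exactly on the circle.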
 void generate_circle (
 std::vector<sample_type>& samples,
 double radius,
 const long num
 )
 {
 sample_type m(2,1);
 for (long i = 0; i < num; ++i)
 {
 double sign = 1;
 if (rnd.get_random_double() < 0.5)
 sign = -1;
 m(0) = 2*radius*rnd.get_random_double()-radius;
 m(1) = sign*sqrt(radius*radius - m(0)*m(0));
 samples.push_back(m);
 }
 }
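 // Exact knn tests on a 5 point dataset: the origin plus the four corners of a square.
 // Check the edge sets for k=1 and k=3, and also that the distance thresholds of
 // squared_euclidean_distance prune edges as expected.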
 void test_knn1()
 {
 std::vector<matrix<double,2,1> > samples;
 matrix<double,2,1> test;
 
 test = 0,0; samples.push_back(test);
 test = 1,1; samples.push_back(test);
 test = 1,-1; samples.push_back(test);
 test = -1,1; samples.push_back(test);
 test = -1,-1; samples.push_back(test);
 std::vector<sample_pair> edges;
 find_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, edges);
 DLIB_TEST(edges.size() == 4);
 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
 DLIB_TEST(edges[0] == sample_pair(0,1,0));
 DLIB_TEST(edges[1] == sample_pair(0,2,0));
 DLIB_TEST(edges[2] == sample_pair(0,3,0));
 DLIB_TEST(edges[3] == sample_pair(0,4,0));
 find_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, edges);
 DLIB_TEST(edges.size() == 8);
 find_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, edges);
 DLIB_TEST(edges.size() == 4);
 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
 DLIB_TEST(edges[0] == sample_pair(1,2,0));
 DLIB_TEST(edges[1] == sample_pair(1,3,0));
 DLIB_TEST(edges[2] == sample_pair(2,4,0));
 DLIB_TEST(edges[3] == sample_pair(3,4,0));
 find_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, edges);
 DLIB_TEST(edges.size() == 0);
 }
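 // Same checks as test_knn1 but using the randomized approximate knn search. With
 // 10000 sampled pairs on such a tiny dataset the approximate result should be exact.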
 void test_knn1_approx()
 {
 std::vector<matrix<double,2,1> > samples;
 matrix<double,2,1> test;
 
 test = 0,0; samples.push_back(test);
 test = 1,1; samples.push_back(test);
 test = 1,-1; samples.push_back(test);
 test = -1,1; samples.push_back(test);
 test = -1,-1; samples.push_back(test);
 std::vector<sample_pair> edges;
 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 1, 10000, seed, edges);
 DLIB_TEST(edges.size() == 4);
 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
 DLIB_TEST(edges[0] == sample_pair(0,1,0));
 DLIB_TEST(edges[1] == sample_pair(0,2,0));
 DLIB_TEST(edges[2] == sample_pair(0,3,0));
 DLIB_TEST(edges[3] == sample_pair(0,4,0));
 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 3, 10000, seed, edges);
 DLIB_TEST(edges.size() == 8);
 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(3.9, 4.1), 3, 10000, seed, edges);
 DLIB_TEST(edges.size() == 4);
 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
 DLIB_TEST(edges[0] == sample_pair(1,2,0));
 DLIB_TEST(edges[1] == sample_pair(1,3,0));
 DLIB_TEST(edges[2] == sample_pair(2,4,0));
 DLIB_TEST(edges[3] == sample_pair(3,4,0));
 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(30000, 4.1), 3, 10000, seed, edges);
 DLIB_TEST(edges.size() == 0);
 }
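 // Exact knn tests on just the four corners of a square: with k=2 each point links to
 // its two adjacent corners, and asking for more neighbors than exist yields every
 // possible pair.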
 void test_knn2()
 {
 std::vector<matrix<double,2,1> > samples;
 matrix<double,2,1> test;
 
 test = 1,1; samples.push_back(test);
 test = 1,-1; samples.push_back(test);
 test = -1,1; samples.push_back(test);
 test = -1,-1; samples.push_back(test);
 std::vector<sample_pair> edges;
 find_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, edges);
 DLIB_TEST(edges.size() == 4);
 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
 DLIB_TEST(edges[0] == sample_pair(0,1,0));
 DLIB_TEST(edges[1] == sample_pair(0,2,0));
 DLIB_TEST(edges[2] == sample_pair(1,3,0));
 DLIB_TEST(edges[3] == sample_pair(2,3,0));
 find_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, edges);
 DLIB_TEST(edges.size() == 4*3/2);
 }
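 // Same checks as test_knn2 but using the approximate knn search.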
 void test_knn2_approx()
 {
 std::vector<matrix<double,2,1> > samples;
 matrix<double,2,1> test;
 
 test = 1,1; samples.push_back(test);
 test = 1,-1; samples.push_back(test);
 test = -1,1; samples.push_back(test);
 test = -1,-1; samples.push_back(test);
 std::vector<sample_pair> edges;
 // For this simple graph and the large number of random pairs we sample, the
 // approximate search should recover the exact knn solution.
 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 2, 10000, seed, edges);
 DLIB_TEST(edges.size() == 4);
 std::sort(edges.begin(), edges.end(), &order_by_index<sample_pair>);
 DLIB_TEST(edges[0] == sample_pair(0,1,0));
 DLIB_TEST(edges[1] == sample_pair(0,2,0));
 DLIB_TEST(edges[2] == sample_pair(1,3,0));
 DLIB_TEST(edges[3] == sample_pair(2,3,0));
 find_approximate_k_nearest_neighbors(samples, squared_euclidean_distance(), 200, 10000, seed, edges);
 DLIB_TEST(edges.size() == 4*3/2);
 }
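 // Run do_the_test() and the approximate knn tests several times with different
 // random seeds, then run the exact and LSH knn tests once.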
 void perform_test (
 )
 {
 for (int i = 0; i < 5; ++i)
 {
 do_the_test();
 ++seed;
 test_knn1_approx();
 test_knn2_approx();
 }
 test_knn1();
 test_knn2();
 test_knn_lsh_sparse<double>();
 test_knn_lsh_sparse<float>();
 test_knn_lsh_dense<double>();
 test_knn_lsh_dense<float>();
 }
 };
 // Create an instance of this object. Doing this causes this test
 // to be automatically inserted into the testing framework whenever this cpp file
 // is linked into the project. Note that since we are inside an unnamed-namespace 
 // we won't get any linker errors about the symbol a being defined multiple times. 
 linear_manifold_regularizer_tester a;
}
