LALInferenceClusteredKDE.h 6.92 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
/*
 *  LALInferenceClusteredKDE.h:  Bayesian Followup, kernel density estimator.
 *
 *  Copyright (C) 2013 Ben Farr
 *
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with with program; see the file COPYING. If not, write to the
 *  Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 *  MA  02111-1307  USA
 */
#ifndef LALInferenceClusteredKDE_h
#define LALInferenceClusteredKDE_h

25 26 27 28
#include <lal/LALStdlib.h>
#include <lal/LALConstants.h>
#include <lal/LALDatatypes.h>

29 30
#include <lal/LALInferenceKDE.h>

31 32 33 34 35 36 37 38
struct tagkmeans;

/**
 * Structure for performing the kmeans clustering algorithm on a set of samples.
 */
typedef struct
tagkmeans
{
39
    gsl_matrix *data;                    /**< Data to be clustered (typically whitened) */
40 41 42 43
    INT4 dim;                            /**< Dimension of data */
    INT4 npts;                           /**< Number of points being clustered */
    INT4 k;                              /**< Number of clusters */
    INT4 has_changed;                    /**< Flag indicating a change to cluster assignmens */
44 45

    REAL8 (*dist) (gsl_vector *x, gsl_vector *y);                           /**< Distance funtion */
46
    void (*centroid) (gsl_vector *centroid, gsl_matrix *data, INT4 *mask); /**< Find centroid */
47

48 49 50
    INT4 *assignments;                   /**< Cluster assignments */
    INT4 *sizes;                         /**< Cluster sizes */
    INT4 *mask;                          /**< Mask used to select data from individual clusters */
51 52
    REAL8 *weights;                      /**< Fraction of data points in each cluster */
    gsl_vector *mean;                    /**< Mean of unwhitened data (for tranforming points) */
53 54
    gsl_vector *std;                     /**< Std deviations of unwhitened data */
    gsl_matrix *cov;                     /**< Covariance matrix of data */
55 56 57
    gsl_matrix *centroids;               /**< Array with rows containing cluster centroids */
    gsl_matrix *recursive_centroids;     /**< Matrix used to accumulate tested centroids */
    gsl_rng *rng;                        /**< Random number generator */
58

59
    REAL8 error;                         /**< Error of current clustering */
60

61
    LALInferenceKDE **KDEs;              /**< Array of KDEs, one for each cluster */
62 63 64
} LALInferenceKmeans;

/* Kmeans cluster data, increasing k until the Bayes Information Criteria stop increasing. */
65
LALInferenceKmeans *LALInferenceIncrementalKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng);
66

67
/* Kmeans cluster data, find k that maximizes BIC assuming BIC(k) is concave-down. */
68
LALInferenceKmeans *LALInferenceOptimizedKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng);
69

70
/* Xmeans cluster data, splitting centroids in kmeans according to the Bayes Information Criteria. */
71
LALInferenceKmeans *LALInferenceXmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng);
72 73

/* Run kmeans recursively, splitting each centroid recursively, accepting if the BIC increases. */
74
LALInferenceKmeans *LALInferenceRecursiveKmeans(gsl_matrix *data, INT4 ntrials, gsl_rng *rng);
75 76

/* Recursively split a k=1 kmeans. */
77
void LALInferenceKmeansRecursiveSplit(LALInferenceKmeans *kmeans, INT4 ntrials, gsl_rng *rng);
78 79 80 81 82

/* Run the kmeans algorithm until cluster assignments don't change. */
void LALInferenceKmeansRun(LALInferenceKmeans *kmeans);

/* Run a kmeans several times and return the best. */
83
LALInferenceKmeans *LALInferenceKmeansRunBestOf(INT4 k, gsl_matrix *samples, INT4 ntrials, gsl_rng *rng);
84 85

/* Generate a new kmeans struct from a set of data. */
86
LALInferenceKmeans * LALInferenceCreateKmeans(INT4 k, gsl_matrix *data, gsl_rng *rng);
87

88 89 90
/* A 'k-means++'-like seeded initialization of centroids for a kmeans run. */
void LALInferenceKmeansSeededInitialize(LALInferenceKmeans *kmeans);

91 92 93 94 95 96 97 98 99 100 101 102 103
/* Randomly select points from the data as initial centroids for a kmeans run. */
void LALInferenceKmeansForgyInitialize(LALInferenceKmeans *kmeans);

/* Use the resulting centroids from a random cluster assignment as the initial centroids of a kmeans run. */
void LALInferenceKmeansRandomPartitionInitialize(LALInferenceKmeans *kmeans);

/* The assignment step of the kmeans algorithm. */
void LALInferenceKmeansAssignment(LALInferenceKmeans *kmeans);

/* The update step of the kmeans algorithm. */
void LALInferenceKmeansUpdate(LALInferenceKmeans *kmeans);

/* Construct a mask to select only the data assigned to a single cluster. */
104
void LALInferenceKmeansConstructMask(LALInferenceKmeans *kmeans, INT4 *mask, INT4 cluster_id);
105 106

/* Extract a single cluster from an existing kmeans as a new 1-means. */
107
LALInferenceKmeans *LALInferenceKmeansExtractCluster(LALInferenceKmeans *kmeans, INT4 cluster_id);
108

Ben Farr's avatar
Ben Farr committed
109
/* Impose boundaries on KDEs. */
110
void LALInferenceKmeansImposeBounds(LALInferenceKmeans *kmeans, LALInferenceVariables *params, LALInferenceVariables *priorArgs, INT4 cyclic_reflective);
111

112
/* Generate a new matrix by masking an existing one. */
113
gsl_matrix *mask_data(gsl_matrix *data, INT4 *mask);
114 115 116 117 118 119 120

/* Add a vector to the end of a matrix, allocating if necessary. */
void accumulate_vectors(gsl_matrix **mat_ptr, gsl_vector *vec);

/* Add a vector to the end of a matrix. */
void append_vec_to_mat(gsl_matrix **mat_ptr, gsl_vector *vec);

121 122 123
/* Draw a sample from a kmeans-KDE estimate of a distribution. */
REAL8 *LALInferenceKmeansDraw(LALInferenceKmeans *kmeans);

124 125 126
/* Evaluate the estimated (log) PDF from a clustered-KDE at a point. */
REAL8 LALInferenceKmeansPDF(LALInferenceKmeans *kmeans, REAL8 *pt);

127
/* Evaluate the estimated (log) PDF from a clustered-KDE at an already whitened point. */
128
REAL8 LALInferenceWhitenedKmeansPDF(LALInferenceKmeans *kmeans, REAL8 *pt);
129

130 131 132
/* Transform a data set to obtain a 0-mean and identity covariance matrix. */
gsl_matrix * LALInferenceWhitenSamples(gsl_matrix *samples);

133 134 135 136
/* Calculate the squared Euclidean distance bewteen two points. */
REAL8 euclidean_dist_squared(gsl_vector *x, gsl_vector *y);

/* Find the centroid of a masked data set. */
137
void euclidean_centroid(gsl_vector *centroid, gsl_matrix *data, INT4 *mask);
138 139 140 141 142 143 144 145 146 147 148 149

/* Build the kernel density estimate from a kmeans clustering. */
void LALInferenceKmeansBuildKDE(LALInferenceKmeans *kmeans);

/* Calculate the maximum likelihood of a given kmeans assuming spherical Gaussian clusters. */
REAL8 LALInferenceKmeansMaxLogL(LALInferenceKmeans *kmeans);

/* Calculate the BIC using the KDEs of the cluster distributions. */
REAL8 LALInferenceKmeansBIC(LALInferenceKmeans *kmeans);

/* Destroy a kmeans instance. */
void LALInferenceKmeansDestroy(LALInferenceKmeans *kmeans);
150 151

#endif