Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
APCluster.hh
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file protocols/cluster/APCluster.hh
11 ///
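12 /// @brief Affinity propagation clustering (APCluster) and its supporting data structures (Exemplar, DataPoint).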
13 /// @author Ian W. Davis
14 
15 
16 #ifndef INCLUDED_protocols_cluster_APCluster_hh
17 #define INCLUDED_protocols_cluster_APCluster_hh
18 
20 #include <utility/pointer/ReferenceCount.hh>
21 
22 #include <core/types.hh>
23 // AUTO-REMOVED #include <utility/vector1.hh>
24 
25 #include <utility/vector1_bool.hh>
26 
27 
28 namespace protocols {
29 namespace cluster {
30 
31 
32 ///@brief Data structure for one similarity measurement (s_ik) for affinity propagation clustering.
33 ///
34 ///@details There will be one instance of this class for each (finite) similarity between two input points,
35 /// up to a maximum of N*N instances if the similarity matrix is fully populated.
36 ///
37 class Exemplar //: public utility::pointer::ReferenceCount
38 {
39 public:
40 
41  Exemplar(core::Size k_in, core::Real s_ik_in): // stores the label and similarity supplied by the caller
42  k(k_in),
43  s_ik(s_ik_in),
44  r_ik(0), // responsibility starts at zero
45  a_ik(0) // availability starts at zero
46  {}
47 
48  ~Exemplar() {}
49 
50  /// @brief "Less than" (actually greater than) comparator for making a heap of exemplars. Only declared here; presumably it compares s_ik -- confirm in the definition.
51  static bool min_heap(Exemplar a, Exemplar b);
52 
53 public:
54  core::Size k; ///< label for this data point, 1 .. N
55  core::Real s_ik; ///< similarity: higher value = point k is a better exemplar for point i
56  core::Real r_ik; ///< responsibility: higher value = point k is a better exemplar for point i relative to other possible exemplars for i
57  core::Real a_ik; ///< availability: higher value = point k would be a good exemplar for many points i
58 
59 }; // Exemplar
60 
61 
62 ///@brief Data structure for one input data point for affinity propagation clustering.
63 ///
64 ///@details There should be one instance of this class for each input point.
65 /// Fields are public because it's a glorified struct -- clients shouldn't use this directly.
66 ///
67 class DataPoint //: public utility::pointer::ReferenceCount
68 {
69 public:
70 
72  i(i_in),
73  s_kk(0),
74  r_kk(0),
75  a_kk(0),
76  sum(0),
77  curr_exemplar(i_in), // starts equal to i_in; a value != 0 is used as the flag that s_kk hasn't been set yet (see is_set_s_kk())
78  best_exemplar(0),
79  candidates(),
81  {}
82 
84 
85  /// @brief Set similarity s(i,k), the suitability of point k to be an exemplar for this point. max_sims is presumably the cap on stored similarities -- confirm in the definition.
86  void add_similarity(core::Size k, core::Real s_ik, core::Size max_sims);
87 
88  bool is_set_s_kk() const { return curr_exemplar == 0; } // true only once curr_exemplar has been reset to 0
89 
90 public:
91  core::Size i; ///< label for this data point, 1 .. N
92  core::Real s_kk; ///< self-similarity, aka "preference". Higher values = more likely to be a cluster center (exemplar).
93  core::Real r_kk; ///< self-responsibility
94  core::Real a_kk; ///< self-availability
95  core::Real sum; ///< = sum(e.r_ik for e in candidate_for if e.r_ik > 0); a.k.a. the update value for a(k,k); cached for efficiency
96  core::Size curr_exemplar; ///< label of point that was best exemplar for this in last round; also doubles as the "s_kk not yet set" flag while nonzero before clustering
97  core::Size best_exemplar; ///< exemplar from the best round (largest net_sim) in case we want to go back to it
98  utility::vector1< Exemplar > candidates; ///< candidate exemplars: s(i,k) exists with i = this. Organized as a heap using STL functions. NOTE(review): originally described as a max-heap, but the comparator is named Exemplar::min_heap ("actually greater than") -- confirm the heap order in the implementation.
99  utility::vector1< Exemplar* > candidate_for; ///< reverse lookup for s(i,k), with k = this
100 
101 }; // DataPoint
102 
103 
104 ///@brief Public interface for doing affinity propagation clustering.
105 ///
106 ///@details Based on Frey and Dueck, "Clustering by Passing Messages Between Data Points", Science 315 (2007).
107 /// Useful for choosing a set of representative data points (exemplars)
108 /// out of a large set (e.g. all decoys from a large Rosetta run)
109 /// given a measure of similarity (e.g. RMSD, maxsub, GDT, ...).
110 ///
111 /// As I understand it, this procedure tries to maximize the sum of similarities between
112 /// each data point and its exemplar, while balancing that with total number of clusters.
113 /// Reasonable measures of similarity would be negative RMSD, log-likelihoods,
114 /// or squared distance (i.e. squared error), depending on what the points represent.
115 /// Note there is no requirement for symmetry: s(i,j) need not equal s(j,i).
116 /// The self-similarity s(k,k) ("preference") for each point controls the likelihood it will be selected as an exemplar,
117 /// and thus indirectly controls the total number of clusters.
118 /// There is no way to directly specify a specific number of clusters.
119 /// The authors suggest that using the median of all other similarities will give a moderate number of clusters,
120 /// and using the minimum of the other similarities will give a small number of clusters.
121 ///
122 /// This implementation is designed for clustering very large numbers of data points
123 /// with sparse similarity [ s(i,k) = -Inf for most i,k ].
124 /// Similarities for each input point are kept in a heap so that you can limit to only the L highest for each.
125 /// (This scheme is quite likely to break symmetry, as some points will have more close neighbors than others.)
126 /// Alternately, you may choose to do your own pre-filtering and only enter the G globally highest similarities
127 /// between any points in the data set.
128 /// Run time (per cycle) is linear in the number of similarities, or O(N^2) in the limit of a dense similarity matrix.
129 ///
130 /// I follow the conventions of the original paper, where "i" is the index of some generic data point,
131 /// and "k" is the index of a data point being considered as an exemplar (cluster center).
132 ///
134 {
135 public:
136 
137  /// @brief Create new clustering class for total_pts input data points.
138  /// Optionally, set a limit on the number of similarities stored per point (defaults to 0; presumably 0 means "no limit" -- confirm in the implementation).
139  APCluster(core::Size total_pts, core::Size max_sims_per_pt = 0);
140  virtual ~APCluster();
141 
142  /// @brief How appropriate is k as an exemplar for i?
143  virtual void set_sim(core::Size i, core::Size k, core::Real sim);
144  /// @brief Run the actual clustering algorithm. NOTE(review): presumably maxits = maximum iterations, convits = unchanged iterations required for convergence, lambda = damping factor, and the return value reports convergence -- confirm in the implementation.
145  virtual bool cluster(core::Size maxits, core::Size convits, core::Real lambda);
146 
147  virtual core::Size num_pts() const { return pts_.size(); } // number of data points held in pts_
148  /// @brief Return the index of the point that is the exemplar for point i (reads curr_exemplar of pts_[i]).
149  virtual core::Size get_exemplar_for(core::Size i) const { return pts_[i].curr_exemplar; }
150  /// @brief The number of exemplars selected (number of clusters).
151  /// Monotonically related to the self-preferences s(k,k).
152  virtual core::Size get_num_exemplars() const;
153  /// @brief Return the indices of data points chosen as exemplars (cluster centers) via the output parameter exemplars.
154  virtual void get_all_exemplars(utility::vector1< core::Size > & exemplars) const;
155  /// @brief Returns the indices of points with the specified exemplar k via the output parameter cluster.
156  /// Note that k is the index of an (input) data point that was chosen as an exemplar,
157  /// not some "cluster index" between 1 and get_num_exemplars().
158  virtual void get_cluster_for(core::Size k, utility::vector1< core::Size > & cluster) const;
159  /// @brief The sum of similarities s(i,k) between every data point i and its exemplar k,
160  /// plus the self preferences of the exemplars.
161  /// NOTE(review): this text originally said the algorithm *should* "minimize" this value, but the class description says the procedure tries to maximize the sum of similarities and DataPoint::best_exemplar tracks the round with the largest net_sim -- presumably "maximize" is meant. If it dips and climbs again, increase lambda.
162  virtual core::Real get_net_sim() const;
163 
164  /// @brief Saves the (sparse) similarity matrix and current cluster assignments (if any),
165  /// but not the accumulated evidence from the last clustering [ r(i,k) and a(i,k) ].
166  /// File format is custom binary and is not portable (host endian-ness).
167  virtual bool save_binary(std::string const & filename) const;
168  /// @brief Wipes all currently held data and reads in similarity values and cluster assignments.
169  /// Afterwards, points may be re-clustered with different parameters if desired.
170  /// File format is custom binary and is not portable (host endian-ness).
171  virtual bool load_binary(std::string const & filename);
172 
173 protected:
174  virtual void freeze(); // presumably fills the DataPoint.candidate_for vectors and sets is_frozen_ -- confirm in the implementation
175  virtual void reinitialize();
176  virtual void update_r_ik(core::Real lambda); // presumably the responsibility update step, with lambda as damping -- confirm
177  virtual void update_a_ik(core::Real lambda); // presumably the availability update step, with lambda as damping -- confirm
178  virtual core::Size assign_exemplars();
179  virtual void save_best_exemplars();
180  virtual void restore_best_exemplars();
181 
182 private:
183  utility::vector1< DataPoint > pts_; ///< the data points to be clustered
184  core::Size max_sims_per_pt_; ///< if more than this many similarities for some point i, discard the lowest ones
185  bool is_frozen_; ///< have the DataPoint.candidate_for vectors been filled in yet?
186 
187 }; // APCluster
188 
189 
190 } // namespace cluster
191 } // namespace protocols
192 
193 #endif // INCLUDED_protocols_cluster_APCluster_hh