Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
APCluster.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file protocols/cluster/APCluster.cc
11 ///
12 /// @brief
13 /// @author Ian W. Davis
14 
15 
17 
18 #include <basic/Tracer.hh>
19 
20 #include <utility/exit.hh>
21 
22 // AUTO-REMOVED #include <algorithm>
23 // AUTO-REMOVED
24 #include <cstdio>
25 
26 static basic::Tracer TR("protocols.cluster.APCluster");
27 
28 namespace protocols {
29 namespace cluster {
30 
31 
32 /// @details Less-than comparator works backwards on similarity values,
33 /// to effectively implement a min-heap instead of a max-heap.
35 {
36  return a.s_ik > b.s_ik;
37 }
38 
39 
40 /// @details If this point has more than max_sims similarities already stored,
41 /// the lowest one (most negative) will be discarded.
42 ///
43 /// There is currently no protection against adding s(i,k) twice,
44 /// which will not be caught and will screw up the computation royally.
46 {
47  if( k == i ) {
48  s_kk = s_ik;
49  curr_exemplar = 0; // used as a flag that s_kk has been set
50  } else {
51  Exemplar ex(k, s_ik);
52  candidates.push_back(ex); // add element
53  std::push_heap( candidates.begin(), candidates.end(), Exemplar::min_heap ); // sort smallest s_ik to front
54  while( candidates.size() > max_sims ) {
55  std::pop_heap( candidates.begin(), candidates.end(), Exemplar::min_heap ); // sort smallest s_ik to back
56  candidates.pop_back(); // remove element
57  }
58  }
59 }
60 
61 
62 APCluster::APCluster(core::Size total_pts, core::Size max_sims_per_pt /*= 0*/):
63  utility::pointer::ReferenceCount(),
64  pts_(),
65  is_frozen_(false)
66 {
67  if( max_sims_per_pt > 0 ) max_sims_per_pt_ = max_sims_per_pt;
68  else max_sims_per_pt_ = total_pts;
69 
70  for(core::Size i = 1; i <= total_pts; ++i) {
71  pts_.push_back( DataPoint(i) );
72  }
73 }
74 
75 
77 
78 
79 /// @details Adding s(i,j) is not the same as adding s(j,i) -- you must do both if you want symmetry.
80 ///
81 /// There is currently no protection against adding s(i,k) twice,
82 /// which will not be caught and will screw up the computation royally.
84 {
85  // If we've already filled in the DataPoint.candidate_for arrays,
86  // we'll have to go back and recompute them before we can cluster again.
87  // As a special case, we can change the self preferences without trouble.
88  if( i != k && is_frozen_ ) is_frozen_ = false;
89  pts_[i].add_similarity(k, sim, max_sims_per_pt_);
90 }
91 
92 
/// @details Run damped affinity-propagation message passing until the
/// assignments stabilize or maxits is exhausted.
/// @param maxits maximum number of iterations, period; 100 - 4000 reasonable.
/// @param convits terminate after clusters don't change for this many iterations, 10 - 400 reasonable.
/// @param lambda damping factor, 0.50 - 0.95 is a reasonable range
/// @return true if the clustering converges and false otherwise.
/// Failure to converge is not necessarily a problem -- you may be close to a good solution.
/// Try increasing maxits and/or making lambda closer to 1 if you want convergence.
bool APCluster::cluster(core::Size maxits, core::Size convits, core::Real lambda)
{
	runtime_assert( 0 <= lambda && lambda < 1 );
	// Rebuild the reverse candidate_for index if similarities changed since the last run.
	if( !is_frozen_ ) freeze();
	// Zero all messages (r, a) and exemplar assignments; keeps similarity data.
	reinitialize();

	bool is_converged = false;
	core::Size no_change_its = 0; // consecutive iterations with zero assignment changes
	//core::Real last_net_sim = -1e99;
	core::Real max_net_sim = -1e99; // best net similarity seen over all iterations
	for(core::Size itr = 1; itr <= maxits; ++itr) {
		// One round of message passing, damped by lambda.
		update_r_ik(lambda);
		update_a_ik(lambda);
		core::Size const changes = assign_exemplars();
		// Check for early termination if exemplar assignments haven't changed for several cycles.
		// I find that sometimes the run gets stuck with a few unassigned points (which count as changes)
		// and if you continue on, you shoot off to a less-optimal solution. Here, "a few" is < 5%.
		// This heuristic seems to hacky though, so instead I'm just going to track net_sim
		// and return the solution with the lowest value of that.
		core::Real const net_sim = get_net_sim();
		if( net_sim > max_net_sim ) {
			// NOTE(review): one statement inside this branch was lost in extraction --
			// given the comment above and the best_exemplar field reset in
			// reinitialize(), it presumably snapshots the current (best-so-far)
			// assignments; confirm against the original APCluster.cc.
			max_net_sim = net_sim;
		}
		if( changes == 0 /*|| (net_sim == last_net_sim && changes <= pts_.size()/20 )*/ ) {
			no_change_its += 1;
			if( no_change_its >= convits ) {
				is_converged = true;
				break;
			}
		} else {
			no_change_its = 0;
		}
		TR.Debug << "Iteration " << itr << ", " << get_num_exemplars() << " clusters, net_sim " << net_sim << ", "
			<< changes << " changes, stable for " << no_change_its << std::endl;
		//last_net_sim = net_sim;
	}
	// NOTE(review): one line between the loop and here was lost in extraction --
	// presumably it restores the best-scoring assignments saved above; confirm
	// against the original APCluster.cc.
	core::Real const net_sim = get_net_sim();
	TR << "Finished, " << get_num_exemplars() << " clusters, net_sim " << net_sim << std::endl;
	return is_converged;
}
142 
143 
145 {
146  core::Size count = 0;
147  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
148  DataPoint const & p = pts_[i];
149  if( p.i == p.curr_exemplar ) count += 1;
150  }
151  return count;
152 }
153 
154 
156 {
157  exemplars.clear();
158  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
159  DataPoint const & p = pts_[i];
160  if( p.i == p.curr_exemplar ) exemplars.push_back(i);
161  }
162 }
163 
164 
166 {
167  cluster.clear();
168  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
169  DataPoint const & p = pts_[i];
170  if( k == p.curr_exemplar ) cluster.push_back(i);
171  }
172 }
173 
174 
176 {
177  core::Real sum = 0;
178  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
179  DataPoint const & p = pts_[i];
180  if( p.curr_exemplar == i ) {
181  sum += p.s_kk; // Should this be included? Yes.
182  continue;
183  }
184  for(core::Size j = 1, j_end = p.candidates.size(); j <= j_end; ++j) {
185  Exemplar const & e = p.candidates[j];
186  if( p.curr_exemplar == e.k ) {
187  sum += e.s_ik;
188  break;
189  }
190  }
191  }
192  return sum;
193 }
194 
195 
197 {
198  if( is_frozen_ ) return;
199 
200  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
201  DataPoint & p = pts_[i];
202  p.candidate_for.clear();
203  }
204  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
205  DataPoint & p = pts_[i];
206  for(core::Size j = 1, j_end = p.candidates.size(); j <= j_end; ++j) {
207  // Tell point j that s(i,j) exists -- point i already knows by traversing candidates
208  Exemplar & e = p.candidates[j];
209  pts_[e.k].candidate_for.push_back(&e);
210  }
211  }
212 
213  is_frozen_ = true;
214 }
215 
216 
217 /// @details Prepares the data points for another clustering run.
218 /// Does not erase similarity data, etc.
220 {
221  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
222  DataPoint & p = pts_[i];
223  p.r_kk = 0;
224  p.a_kk = 0;
225  p.curr_exemplar = 0;
226  p.best_exemplar = 0;
227  for(core::Size j = 1, j_end = p.candidates.size(); j <= j_end; ++j) {
228  Exemplar & e = p.candidates[j];
229  e.r_ik = 0;
230  e.a_ik = 0;
231  }
232  }
233 }
234 
235 
237 {
238  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
239  DataPoint & p = pts_[i];
240 
241  // Compute the max and runner-up just once per point.
242  // Reduces time complexity from O(N^3) to O(N^2)
243  core::Real max1 = p.a_kk + p.s_kk, max2 = p.a_kk + p.s_kk;
244  core::Size max1_k = i/*, max2_k = i*/;
245  for(core::Size kk = 1, kk_end = p.candidates.size(); kk <= kk_end; ++kk) {
246  Exemplar const & e_kk = p.candidates[kk];
247  core::Real const as = e_kk.a_ik + e_kk.s_ik;
248  if( as >= max2 ) {
249  if( as >= max1 ) {
250  max2 = max1;
251  //max2_k = max1_k; // set but never used ~Labonte
252  max1 = as;
253  max1_k = e_kk.k;
254  } else {
255  max2 = as;
256  //max2_k = e_kk.k; // set but never used ~Labonte
257  }
258  }
259  }
260 
261  // General update rule for r(i,k)
262  // r(i,k) = s(i,k) - max{ a(i,k') + s(i,k') }, k' != k
263  for(core::Size k = 1, k_end = p.candidates.size(); k <= k_end; ++k) {
264  Exemplar & e = p.candidates[k];
265 
266  //core::Real max_as = p.a_kk + p.s_kk;
267  //for(core::Size kk = 1, kk_end = p.candidates.size(); kk <= kk_end; ++kk) {
268  // Exemplar const & e_kk = p.candidates[kk];
269  // core::Real const as = e_kk.a_ik + e_kk.s_ik;
270  // if( as >= max_as && k != kk ) max_as = as;
271  //}
272  // Global max unless that was from this k, then runner up.
273  core::Real const max_as = ( max1_k == e.k ? max2 : max1 );
274 
275  core::Real const new_r_ik = e.s_ik - max_as;
276  e.r_ik = (lambda)*e.r_ik + (1-lambda)*new_r_ik;
277  }
278  // Special update rule for r(k,k)
279  {
280  //core::Real max_as = -1e99;
281  //for(core::Size kk = 1, kk_end = p.candidates.size(); kk <= kk_end; ++kk) {
282  // Exemplar const & e_kk = p.candidates[kk];
283  // core::Real const as = e_kk.a_ik + e_kk.s_ik;
284  // if( as >= max_as ) max_as = as;
285  //}
286  core::Real const max_as = ( max1_k == i ? max2 : max1 );
287 
288  core::Real const new_r_kk = p.s_kk - max_as;
289  p.r_kk = (lambda)*p.r_kk + (1-lambda)*new_r_kk;
290  }
291  }
292 }
293 
294 
296 {
297  // Precalculate the sum term to reduce complexity from O(N^3) to O(N^2)
298  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
299  DataPoint & p = pts_[i];
300  p.sum = 0;
301  for(core::Size ii = 1, ii_end = p.candidate_for.size(); ii <= ii_end; ++ii) {
302  Exemplar const & e_ii = *( p.candidate_for[ii] );
303  // Can't ever have k == i because the diagonal elements have their own vars
304  if( e_ii.r_ik > 0 /*&& e_ii.k != i*/ ) p.sum += e_ii.r_ik;
305  }
306  runtime_assert( p.sum >= 0 );
307  }
308  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
309  DataPoint & p = pts_[i];
310  // General update rule for a(i,k)
311  // a(i,k) = min{ 0, r(k,k) + sum{ max{0, r(i',k)} } }, i' != i and i' != k
312  for(core::Size k = 1, k_end = p.candidates.size(); k <= k_end; ++k) {
313  Exemplar & e = p.candidates[k];
314  DataPoint const & p_k = pts_[e.k];
315 
316  //core::Real sum = 0;
317  //for(core::Size ii = 1, ii_end = p_k.candidate_for.size(); ii <= ii_end; ++ii) {
318  // Exemplar const & e_ii = *( p_k.candidate_for[ii] );
319  // if( e_ii.r_ik > 0 && e_ii.k != i && e_ii.k != e.k ) sum += e_ii.r_ik;
320  //}
321  core::Real sum = p_k.sum;
322  // We know i' == k has been excluded from the sum because that would be r(k,k),
323  // and diagonal terms are stored in their own vars in the DataPoints rather than in Exemplars.
324  // If i' == i, then r(i',k) -> r(i,k), which we have access to as e.r_ik.
325  if( e.r_ik > 0 ) sum -= e.r_ik;
326  runtime_assert( sum >= 0 );
327 
328  core::Real new_a_ik = p_k.r_kk + sum;
329  if( new_a_ik > 0 ) new_a_ik = 0;
330  e.a_ik = (lambda)*e.a_ik + (1-lambda)*new_a_ik;
331  }
332  // Special update rule for a(k,k)
333  {
334  core::Real const new_a_kk = p.sum;
335  p.a_kk = (lambda)*p.a_kk + (1-lambda)*new_a_kk;
336  }
337  // It appears that a(i,k) <= 0 while a(k,k) >= 0 ... strange.
338  }
339 }
340 
341 
343 {
344  core::Size changes = 0;
345 
346  // The paper claims that the exemplar for each point i is the point k
347  // that maximizes a(i,k)+r(i,k), where i and k may be equal.
348  // HOWEVER, this can lead to the case where a point k is the exemplar
349  // for other point and yet not for itself.
350  // Also, it's at odds with what's actually in their MATLAB code -- see below.
351 
352  //for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
353  // DataPoint & p = pts_[i];
354  // core::Size new_exemplar = i;
355  // core::Real max_ar = p.a_kk + p.r_kk;
356  // for(core::Size k = 1, k_end = p.candidates.size(); k <= k_end; ++k) {
357  // Exemplar const & e = p.candidates[k];
358  // core::Real const ar = e.a_ik + e.r_ik;
359  // if( ar > max_ar ) {
360  // new_exemplar = e.k;
361  // max_ar = ar;
362  // }
363  // }
364  // if( new_exemplar != p.curr_exemplar ) changes += 1;
365  // p.curr_exemplar = new_exemplar;
366  //}
367 
368  // The MATLAB code says that exemplars are points for which a(k,k)+r(k,k) > 0,
369  // and all other points i are assigned to the exemplar they are closest to,
370  // i.e. the exemplar k that maximizes s(i,k)
371 
372  // 1. Find exemplars
373  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
374  DataPoint & p = pts_[i];
375  if( p.a_kk + p.r_kk > 0 ) {
376  if( p.curr_exemplar != i ) changes += 1;
377  p.curr_exemplar = i;
378  } else if( p.curr_exemplar == i ) {
379  changes += 1;
380  p.curr_exemplar = 0; // to avoid confusion below if this point was an exemplar last time
381  }
382  }
383  // 2. Assign other points to nearest exemplar.
384  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
385  DataPoint & p = pts_[i];
386  if( p.curr_exemplar == i ) continue; // already an exemplar itself
387  core::Size new_exemplar = 0;
388  core::Real max_s = -1e99;
389  for(core::Size k = 1, k_end = p.candidates.size(); k <= k_end; ++k) {
390  Exemplar const & e = p.candidates[k];
391  DataPoint const & p_k = pts_[e.k];
392  if( p_k.curr_exemplar != p_k.i ) continue; // not an exemplar
393  if( e.s_ik > max_s ) {
394  max_s = e.s_ik;
395  new_exemplar = e.k; // == p_k.i
396  }
397  }
398  if( new_exemplar != p.curr_exemplar && p.curr_exemplar != 0 ) changes += 1;
399  p.curr_exemplar = new_exemplar;
400  }
401  // 3. Mop up unassigned points by putting them in their own cluster.
402  // This is possible in sparse cases; shouldn't be in dense cases.
403  // Due to the logic above, these points are ALWAYS counted as changes,
404  // even if they ended up in this predicament during the last cycle too.
405  for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
406  DataPoint & p = pts_[i];
407  if( p.curr_exemplar != 0 ) continue; // already has an exemplar
408  //changes += 1; already counted in other steps
409  p.curr_exemplar = i;
410  }
411 
412 
413 
414  return changes;
415 }
416 
417 
// NOTE(review): this definition was damaged in extraction:  its signature line
// and the single statement inside the loop are both missing.  Given that
// reinitialize() clears both curr_exemplar and best_exemplar, and that
// cluster() tracks a best-so-far net_sim, this is presumably the helper that
// snapshots each point's current exemplar into best_exemplar
// (p.best_exemplar = p.curr_exemplar;) -- confirm against the original
// APCluster.cc before relying on this.
{
	for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
		DataPoint & p = pts_[i];
	}
}
426 
// NOTE(review): this definition was damaged in extraction:  its signature line
// and the single statement inside the loop are both missing.  By symmetry with
// the helper above, this presumably restores the snapshot
// (p.curr_exemplar = p.best_exemplar;) after the iteration loop in cluster()
// -- confirm against the original APCluster.cc before relying on this.
{
	for(core::Size i = 1, N = pts_.size(); i <= N; ++i) {
		DataPoint & p = pts_[i];
	}
}
434 
435 
// Helper functions for IO
// I'm not too worried about efficiency as files should be block-buffered by the OS by default.
/// @brief Write one raw (trivially-copyable) value to f; trips an assert on a short write.
template<typename T> inline void write1(FILE /*const*/ * f, T t)
{
	size_t const n_written = fwrite(&t, sizeof(T), 1, f);
	if( n_written != 1 ) { assert(false); }
}
/// @brief Read one raw (trivially-copyable) value from f into t; trips an assert on a short read.
template<typename T> inline void read1(FILE /*const*/ * f, T & t)
{
	size_t const n_read = fread(&t, sizeof(T), 1, f);
	if( n_read != 1 ) { assert(false); }
}
442 
443 
445 {
446  using namespace std;
447  FILE* f = fopen( filename.c_str(), "wb" );
448  if( f == NULL ) return false;
449  core::Size const N = pts_.size(); write1(f, N);
450  for(core::Size i = 1; i <= N; ++i) {
451  DataPoint const & p = pts_[i];
452  write1(f, p.i);
453  write1(f, p.s_kk);
454  write1(f, p.curr_exemplar);
455  core::Size const k_end = p.candidates.size(); write1(f, k_end);
456  for(core::Size k = 1; k <= k_end; ++k) {
457  Exemplar const & e = p.candidates[k];
458  write1(f, e.k);
459  write1(f, e.s_ik);
460  }
461  }
463  fclose(f);
464  return true;
465 }
466 
467 
/// @brief Inverse of save_binary():  discard all current points and repopulate
/// pts_ from a raw binary dump previously written by save_binary().
/// @return false if the file cannot be opened, true otherwise.
// NOTE(review): two lines were lost from this definition in extraction:  the
// signature (presumably "bool APCluster::load_binary(std::string const &
// filename)", matching the fopen(...,"rb") / bool-return body) and one line
// just before "is_frozen_ = false" (likely a blank line or comment, but
// confirm against the original APCluster.cc that it was not a statement,
// e.g. one restoring max_sims_per_pt_).
{
	using namespace std;
	FILE* f = fopen( filename.c_str(), "rb" );
	if( f == NULL ) return false;
	pts_.clear();
	core::Size N; read1(f, N);
	for(core::Size i = 1; i <= N; ++i) {
		// Create the point, then overwrite its fields from the file.
		pts_.push_back( DataPoint(i) );
		DataPoint & p = pts_[i];
		read1(f, p.i);
		read1(f, p.s_kk);
		read1(f, p.curr_exemplar);
		// Per-point candidate count, then (k, s_ik) pairs.
		core::Size k_end; read1(f, k_end);
		for(core::Size k = 1; k <= k_end; ++k) {
			Exemplar e(0,0);
			read1(f, e.k);
			read1(f, e.s_ik);
			p.candidates.push_back(e);
		}
	}
	// Deserialized points are not frozen:  the candidate_for reverse index
	// must be rebuilt (freeze()) before the next clustering run.
	is_frozen_ = false;
	fclose(f);
	return true;
}
494 
495 
496 } // namespace cluster
497 } // namespace protocols