Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
SequenceComparison.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 //////////////////////////////////////////////////////////////////////
11 /// @begin SequenceComparison
12 ///
13 /// @brief
14 /// Compare the sequences between a native and designed protein
15 ///
16 /// @detailed
17 /// This is an implementation taken from Ron Jacak, Douglas Renfrew, and Matt O'Meara.
18 /// The main function that is called is the get_sequence_recovery() function. You can
19 /// pass this function a list of native pdbs and designed pdbs, or just 1 native and 1
20 /// designed pdb. The sequence recovery will be output in a file called sequencerecovery.txt
21 /// along with a substitution matrix in a file called submatrix.txt
22 ///
23 ///
24 /// @references
25 /// "Native sequences are close to optimal" paper
26 ///
27 ///
28 /// @authors
29 /// Ron Jacak,
30 /// Douglas Renfrew (renfrew@unc.edu) ( added rotamer recovery, cleanup )
31 /// Steven Combs (moved it into a general use class)
32 ///
33 /// @last_modified October 20 2010
34 /////////////////////////////////////////////////////////////////////////
35 
36 // Unit headers
37 //#include <devel/init.hh>
38 
39 //project Headers
42 // AUTO-REMOVED #include <core/io/pdb/pose_io.hh>
43 // AUTO-REMOVED #include <basic/options/util.hh>
46 // AUTO-REMOVED #include <core/pack/task/operation/TaskOperationFactory.hh>
48 #include <core/pose/Pose.hh>
50 // AUTO-REMOVED #include <core/pose/metrics/CalculatorFactory.hh>
51 // AUTO-REMOVED #include <core/scoring/Energies.hh>
52 // AUTO-REMOVED #include <core/scoring/EnergyMap.hh>
53 // AUTO-REMOVED #include <core/scoring/ScoreType.hh>
54 // AUTO-REMOVED #include <core/scoring/ScoreFunction.hh>
55 // AUTO-REMOVED #include <core/scoring/ScoreFunctionFactory.hh>
56 // AUTO-REMOVED #include <core/scoring/TenANeighborGraph.hh>
58 
59 // AUTO-REMOVED #include <protocols/simple_moves/PackRotamersMover.hh>
60 // AUTO-REMOVED #include <core/pose/PDBInfo.hh>
61 
62 // Utility Headers
63 #include <basic/Tracer.hh>
64 // AUTO-REMOVED #include <basic/MetricValue.hh>
65 #include <basic/prof.hh>
66 // AUTO-REMOVED #include <utility/file/file_sys_util.hh>
67 #include <utility/io/ozstream.hh>
68 #include <utility/vector1.hh>
69 
70 
71 
72 // Numeric Headers
73 
74 // ObjexxFCL Headers
75 #include <ObjexxFCL/format.hh>
76 #include <ObjexxFCL/FArray2D.hh>
77 
78 // C++ headers
79 #include <sstream>
80 #include <map>
81 
82 //Auto Headers
85 #include <utility/vector0.hh>
86 
87 
88 
89 
90 namespace protocols{
91 namespace toolbox{
92 namespace pose_metric_calculators{
93 
// File-local tracer; log messages in this file are written through TR under the channel name "seqrecovery".
94 static basic::Tracer TR("seqrecovery");
95 
// Namespace shortcuts relied on by the unqualified names below (Size, chemical::, pack::, ...).
96 using namespace core;
97 using namespace protocols;
98 using namespace basic::options;
99 using namespace basic::options::OptionKeys;
100 
// Presumably the source of the F(...) number formatter used when writing the report files -- confirm.
101 using namespace ObjexxFCL::fmt;
102 
103 
104 
105 ///@brief append an InitializeFromCommandline task operation to task_factory_ and return the factory
/// @details The visible body registers exactly one operation; nothing here reads an xml-like tag file.
/// NOTE(review): the function header (listing line 106) is missing from this extract. Judging by the
/// call `setup_tf( task_factory )` further down, this is presumably setup_tf -- confirm.
107  using namespace core::pack::task::operation;
108 
 // register the command-line initialization operation with the factory
109  task_factory_->push_back( new pack::task::operation::InitializeFromCommandline );
110 
111 
 // hand the same factory object back to the caller
112  return task_factory_;
113 
114 }
115 
116 ///@brief return the set of residue positions that are designable for the given pose
/// @details Builds a packer task from `tf` for `pose` and collects every position the task reports
/// as being designed.
/// @return std::set< Size > of designable positions (indices start at 1, as iterated below).
/// NOTE(review): the function header (listing line 117) is missing from this extract; `pose` and `tf`
/// are presumably its parameters -- confirm.
118 
119  // NOTE(review): the original note here said the pose needs to be scored for many of the task operations passed from the command line, but no scoring call is visible in this body -- confirm the caller takes care of it
120  std::set< Size > designable_set;
121  core::pack::task::PackerTaskOP design_task( tf->create_task_and_apply_taskoperations( pose ) );
122 
 // debug builds only: dump the generated task to the tracer
123 #ifndef NDEBUG
124  TR<< "Task is: \n" << *(design_task) << std::endl;
125 #endif
126 
127  // iterate over all residues of the task and keep the ones flagged as being designed
128  for ( Size ii = 1; ii<= design_task->total_residue(); ++ii ) {
129  if( design_task->being_designed( ii ) )
130  designable_set.insert( ii );
131  }
132 
133  return designable_set;
134 
135 }
136 
137 
138 ///@brief helper method which builds a point graph with a 10 A neighbor cutoff and fills a vector with nb counts
/// @details num_nbs is resized to the number of residues in the pose and entry ii receives the
/// neighbor count (counting the residue itself) of graph vertex ii.
/// NOTE(review): the function header and the line(s) that populate the graph from the pose
/// (listing lines 139, 141-142, 145) are missing from this extract; `pose` and `num_nbs` are
/// presumably the parameters -- confirm.
140 
143 
144  PointGraphOP pg( new PointGraph ); // create an empty graph
146  core::conformation::find_neighbors<core::conformation::PointGraphVertexData,core::conformation::PointGraphEdgeData>( pg, 10.0 /* ten angstrom distance */ ); // create edges between points within the cutoff
147 
 // NOTE(review): sized with pose.n_residue() but iterated with pose.total_residue(); presumably the same value -- confirm
148  num_nbs.resize( pose.n_residue(), 0 );
149  for ( core::Size ii=1; ii <= pose.total_residue(); ++ii ) {
150 
151  // a PointGraph is a typedef of UpperEdgeGraph< PointGraphVertexData, PointGraphEdgeData >
152  // so any of the methods in UpperEdgeGraph should be available here. The UpperEdgeGraph provides access to nodes
153  // via a get_vertex() method, and each vertex can report back how many nbs it has.
154  // So something that before was really complicated (nb count calculation) is done in <10 lines of code.
155  // the assumption we're making here is that a pose residue position ii is the same index as the point graph vertex;
156  // that is indeed the case if you look at what the function residue_point_graph_from_pose() does.
157  num_nbs[ ii ] = pg->get_vertex(ii).num_neighbors_counting_self();
158  }
159 
160  return;
161 }
162 
163 
164 
165 ///@brief iterates over all designed positions and determines identity to native. outputs recoveries to file.
/// @details For every native/redesign pose pair this tallies, per amino acid, how often it occurs in
/// the native, how often in the redesign, and how often the redesign matches the native -- overall
/// and separately for "core" (neighbor count >= core_cutoff_) and "surface"
/// (neighbor count < surface_exposure_) positions. It also accumulates a native-vs-redesign
/// substitution matrix. Results are written to "sequencerecovery.txt" and "submatrix.txt".
/// NOTE(review): the function header (listing line 166), the declarations of the n_* count arrays
/// (listing lines 169-171, 173-175, 177-179) and the declaration of task_factory (listing line 201)
/// are missing from this extract.
167 
168  // setup main arrays used for calculation
172 
176 
180 
 // substitution counts, indexed ( native aa, redesigned aa ); square over the canonical amino acids, zero-initialized
181  ObjexxFCL::FArray2D_int sub_matrix( chemical::num_canonical_aas, chemical::num_canonical_aas, 0 );
182 
 // running totals across all pose pairs
183  Size n_correct_total(0); Size n_total(0);
184  Size n_correct_total_core(0); Size n_total_core(0);
185  Size n_correct_total_surface(0); Size n_total_surface(0);
186 
 // local copies of the member cutoffs used for the core/surface classification
187  Size surface_exposed_cutoff(surface_exposure_);
188  Size core_cutoff(core_cutoff_);
189 
190  // iterate through all the structures, walking both lists in lock-step
191  utility::vector1< core::pose::Pose >::iterator native_itr( native_poses.begin() ), native_last( native_poses.end() );
192  utility::vector1< core::pose::Pose >::iterator redesign_itr( redesign_poses.begin() ), redesign_last( redesign_poses.end() );
193 
 // stops as soon as either list is exhausted
194  while( ( native_itr != native_last ) && (redesign_itr != redesign_last ) ) {
195 
196  // get local copies of the poses
197  core::pose::Pose native_pose( *native_itr );
198  core::pose::Pose redesign_pose( *redesign_itr );
199 
200  // figure out the task & neighbor info
202  std::set< Size > design_set;
203  utility::vector1< core::Size > num_neighbors;
204 
205  // setup what residues we are going to look at...
 // both the designable set and the neighbor counts are derived from the NATIVE pose
206  setup_tf( task_factory );
207  design_set = fill_designable_set( native_pose, task_factory );
208  fill_num_neighbors( native_pose, num_neighbors );
209 
210  // record native sequence
211  // native_sequence vector is sized for the WHOLE pose not just those being designed
212  // it doesn't matter because we only iterate over the number of designed positions
213  Size const nres( native_pose.total_residue() );
214  utility::vector1< chemical::AA > native_sequence( nres );
215 
216  // iterate over designable positions
217  for ( std::set< core::Size >::const_iterator it = design_set.begin(), end = design_set.end(); it != end; ++it ) {
218 
 // non-protein native positions are recorded as aa_unk and left out of every native count
219  if ( ! native_pose.residue(*it).is_protein() ) {
220  native_sequence[ *it ] = chemical::aa_unk;
221  continue;
222  }
223 
224  native_sequence[ *it ] = native_pose.residue( *it ).aa();
225  n_native[ native_pose.residue(*it).aa() ]++;
226 
227  //determine core/surface; the two tests are independent, so a position may count as neither (or both, depending on the cutoffs)
228  if ( num_neighbors[*it] >= core_cutoff ) {
229  n_native_core[ native_pose.residue(*it).aa() ]++;
230  n_total_core++;
231  }
232 
233  if ( num_neighbors[*it] < surface_exposed_cutoff ) {
234  n_native_surface[ native_pose.residue(*it).aa() ]++;
235  n_total_surface++;
236  }
237 
238  } // end finding native seq
239 
240  /// measure seq recov
241  for ( std::set< core::Size >::const_iterator it = design_set.begin(), end = design_set.end(); it != end; ++it ) {
242 
243  // don't worry about recovery of non-protein residues
 // note: this test looks at the REDESIGNED residue, whereas the loop above tested the native one
244  if ( redesign_pose.residue( *it ).is_protein() ) {
 // n_total counts designed protein positions in the redesign (n_total_core / n_total_surface were counted from the native above)
245  n_total++;
246 
247  // increment the designed count
248  n_designed[ redesign_pose.residue(*it).aa() ]++;
249 
250  if ( num_neighbors[*it] >= core_cutoff ) { n_designed_core[ redesign_pose.residue(*it).aa() ]++; }
251  if ( num_neighbors[*it] < surface_exposed_cutoff ) { n_designed_surface[ redesign_pose.residue(*it).aa() ]++; }
252 
253  // then check if it's the same
254  if ( native_sequence[ *it ] == redesign_pose.residue(*it).aa() ) {
255  n_correct[ redesign_pose.residue(*it).aa() ]++;
256 
257  if ( num_neighbors[*it] >= core_cutoff ) {
258  n_correct_core[ redesign_pose.residue(*it).aa() ]++;
259  n_correct_total_core++;
260  }
261  if ( num_neighbors[*it] < surface_exposed_cutoff ) {
262  n_correct_surface[ redesign_pose.residue(*it).aa() ]++;
263  n_correct_total_surface++;
264  }
265  n_correct_total++;
266  }
267 
268  // set the substitution matrix for this go round
 // NOTE(review): indexed by the native residue's aa() even when that position was recorded as aa_unk above, and sub_matrix only spans the canonical amino acids -- confirm that out-of-range types cannot reach this line
269  sub_matrix( native_pose.residue(*it).aa(), redesign_pose.residue(*it).aa() )++;
270  }
271 
272  } // end measure seq recovery
273 
274  // increment iterators
275  native_itr++; redesign_itr++;
276  }
277 
278  // open sequence recovery file stream (file name is hard-coded)
279  utility::io::ozstream outputFile( "sequencerecovery.txt" ) ;
280 
281  // write header: three tab-separated column groups -- core, all positions, surface
282  outputFile << "Residue\tNo.correct core\tNo.native core\tNo.designed core\tNo.correct/ No.native core\tNo.correct/ No.designed core\t"
283  << "No.correct\tNo.native\tNo.designed\tNo.correct/ No.native\tNo.correct/ No.designed\t"
284  << "Residue\tNo.correct surface\tNo.native surface\tNo.designed surface\tNo.correct/ No.native\tNo.correct/ No.designed" << std::endl;
285 
286  // write AA data: one row per canonical amino acid; a ratio with a zero denominator is printed as "---"
287  for ( Size ii = 1; ii <= chemical::num_canonical_aas; ++ii ) {
288 
 // core columns
289  outputFile << chemical::name_from_aa( chemical::AA(ii) ) << "\t"
290  << n_correct_core[ ii ] << "\t" << n_native_core[ ii ] << "\t" << n_designed_core[ ii ] << "\t";
291 
292  if ( n_native_core[ii] != 0 ) outputFile << F(4,2, (float)n_correct_core[ii]/n_native_core[ii] ) << "\t";
293  else outputFile << "---\t";
294  if ( n_designed_core[ii] != 0 ) outputFile << F(4,2, (float)n_correct_core[ii]/n_designed_core[ii] ) << "\t";
295  else outputFile << "---\t";
296 
297  // debug
298  //if ( n_native_core[ii] != 0 ) std::cout << F(4,2, (float)n_correct_core[ii]/n_native_core[ii] ) << "\t";
299  //if ( n_designed_core[ii] != 0 ) std::cout << F(4,2, (float)n_correct_core[ii]/n_designed_core[ii] ) << "\t";
300 
 // all-position columns
301  outputFile << n_correct[ ii ] << "\t" << n_native[ ii ] << "\t" << n_designed[ ii ] << "\t";
302  if ( n_native[ii] != 0 ) outputFile << F(4,2, (float)n_correct[ii]/n_native[ii] ) << "\t";
303  else outputFile << "---\t";
304  if ( n_designed[ii] != 0 ) outputFile << F(4,2, (float)n_correct[ii]/n_designed[ii] ) << "\t";
305  else outputFile << "---\t";
306 
307  // debug
308  //if ( n_native[ii] != 0 ) std::cout << F(4,2, (float)n_correct[ii]/n_native[ii] ) << "\t";
309  //if ( n_designed[ii] != 0 ) std::cout << F(4,2, (float)n_correct[ii]/n_designed[ii] ) << "\t";
310 
 // surface columns (the residue name is repeated to start the group)
311  outputFile << chemical::name_from_aa( chemical::AA(ii) ) << "\t"
312  << n_correct_surface[ ii ] << "\t" << n_native_surface[ ii ] << "\t" << n_designed_surface[ ii ] << "\t";
313 
314  if ( n_native_surface[ii] != 0 ) outputFile << F(4,2, (float)n_correct_surface[ii]/n_native_surface[ii] ) << "\t";
315  else outputFile << "---\t";
316  if ( n_designed_surface[ii] != 0 ) outputFile << F(4,2, (float)n_correct_surface[ii]/n_designed_surface[ii] ) << "\t";
317  else outputFile << "---\t";
318 
319  // debug
320  //if ( n_native_surface[ii] != 0 ) std::cout << F(4,2, (float)n_correct_surface[ii]/n_native_surface[ii] ) << "\t";
321  //if ( n_designed_surface[ii] != 0 ) std::cout << F(4,2, (float)n_correct_surface[ii]/n_designed_surface[ii] ) << "\t";
322 
323  outputFile << std::endl;
324  }
325 
326  // write totals
 // NOTE(review): unlike the per-residue rows above, these three ratios are not guarded against a zero denominator (e.g. no core positions at all) -- confirm whether that case needs a "---" as well
327  outputFile << "Total\t"
328  << n_correct_total_core << "\t" << n_total_core << "\t\t" << F(5,3, (float)n_correct_total_core/n_total_core ) << "\t\t"
329  << n_correct_total << "\t" << n_total << "\t\t" << F(5,3, (float)n_correct_total/n_total ) << "\t\tTotal\t"
330  << n_correct_total_surface << "\t" << n_total_surface << "\t\t" << F(5,3, (float)n_correct_total_surface/n_total_surface )
331  << std::endl;
332 
333 
334  // output the sequence substitution file
335  utility::io::ozstream matrixFile( "submatrix.txt" ) ; // file name is hard-coded to submatrix.txt
336 
337  // write the header: one "nat_<AA>" column per canonical amino acid
338  matrixFile << "AA_TYPE" << "\t" ;
339  for ( Size ii = 1; ii <= chemical::num_canonical_aas; ++ii ) {
340  matrixFile << "nat_"<<chemical::name_from_aa( chemical::AA(ii) ) << "\t";
341  }
342  matrixFile<<std::endl;
343 
344  // now write the numbers: one "sub_<AA>" row per redesigned amino acid, columns run over the native amino acid
345  for ( Size ii = 1; ii <= chemical::num_canonical_aas; ++ii ) { //redesigns
346  matrixFile << "sub_" << chemical::name_from_aa( chemical::AA(ii) );
347  for ( Size jj = 1; jj <= chemical::num_canonical_aas; ++jj ) { //natives
348  //std::cout<<"Native: "<< jj << " Sub: " << ii << " Value: "<<sub_matrix( jj, ii ) << std::endl;
349  matrixFile<< "\t" << sub_matrix( jj, ii );
350  }
351  matrixFile << std::endl;
352  }
353 
354 
355 }
356 
357 
358 
359 
360 
361 
// Single-pair convenience path: wraps one native pose and one designed pose in one-element
// vectors and forwards them to the list-based get_sequence_recovery().
// NOTE(review): the function header and the declaration of native_poses (listing lines 362-363)
// are missing from this extract; `native` and `designed` are presumably the parameters -- confirm.
364  utility::vector1<core::pose::Pose> redesign_poses;
365 
366  native_poses.push_back(native);
367  redesign_poses.push_back(designed);
368 
 // delegate to the list version, which performs the size check and the measurement
369  get_sequence_recovery(native_poses, redesign_poses);
370 }
371 
372 ///@brief main method for the sequence recovery protocol
/// @details Requires the native and redesign pose lists to have the same length (otherwise exits via
/// utility_exit_with_message), logs a progress message, then runs measure_sequence_recovery() on the lists.
/// NOTE(review): the function header (listing line 373) is missing from this extract; given the call
/// in the single-pair wrapper above, this is presumably get_sequence_recovery( native_poses, redesign_poses ) -- confirm.
374 
 // the two lists are walked pairwise later on, so a length mismatch is treated as a fatal input error
375  if ( native_poses.size() != redesign_poses.size() ) {
376  utility_exit_with_message( "Size of native pdb list file does not equal size of redesign pdb list! \n" );
377  }
378 
379  TR << "Measuring sequence recovery" << std::endl;
380  measure_sequence_recovery( native_poses, redesign_poses );
381 
382 }
383 
384 
385 
386 
387 }
388 }
389 }