Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
SequenceProfile.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file SequenceProfile.cc
11 /// @brief class definition for a given scoring scheme for an alignment.
12 /// @detailed Simply based on comparing single characters from two protein
13 /// sequences, along with affine gap penalties of the form penalty = A + Bk, where
14 /// A represents the penalty for starting a gap, and B represents the penalty for
15 /// extending a previously opened gap by k characters.
16 /// @author James Thompson
17 
18 #include <core/types.hh>
19 #include <basic/Tracer.hh>
23 
24 #include <utility/exit.hh>
25 #include <utility/io/izstream.hh>
26 #include <utility/file/FileName.hh>
27 #include <utility/pointer/owning_ptr.hh>
28 
29 #include <core/chemical/AA.hh>
30 
31 // AUTO-REMOVED #include <ObjexxFCL/string.functions.hh>
32 #include <iostream>
33 #include <string>
34 
35 #include <utility/vector1.hh>
36 #include <ObjexxFCL/format.hh>
37 
38 
39 namespace core {
40 namespace sequence {
41 
42 static basic::Tracer tr( "core.sequence.SequenceProfile" );
43 
45  utility::file::FileName const & fn,
46  bool negative_better
47 ) {
48  utility::io::izstream input( fn );
50  // order of amino acids in the .checkpoint file
52  order.resize( 20 );
53  order[ 1] = core::chemical::aa_from_oneletter_code( 'A' );
54  order[ 2] = core::chemical::aa_from_oneletter_code( 'C' );
55  order[ 3] = core::chemical::aa_from_oneletter_code( 'D' );
56  order[ 4] = core::chemical::aa_from_oneletter_code( 'E' );
57  order[ 5] = core::chemical::aa_from_oneletter_code( 'F' );
58  order[ 6] = core::chemical::aa_from_oneletter_code( 'G' );
59  order[ 7] = core::chemical::aa_from_oneletter_code( 'H' );
60  order[ 8] = core::chemical::aa_from_oneletter_code( 'I' );
61  order[ 9] = core::chemical::aa_from_oneletter_code( 'K' );
62  order[10] = core::chemical::aa_from_oneletter_code( 'L' );
63  order[11] = core::chemical::aa_from_oneletter_code( 'M' );
64  order[12] = core::chemical::aa_from_oneletter_code( 'N' );
65  order[13] = core::chemical::aa_from_oneletter_code( 'P' );
66  order[14] = core::chemical::aa_from_oneletter_code( 'Q' );
67  order[15] = core::chemical::aa_from_oneletter_code( 'R' );
68  order[16] = core::chemical::aa_from_oneletter_code( 'S' );
69  order[17] = core::chemical::aa_from_oneletter_code( 'T' );
70  order[18] = core::chemical::aa_from_oneletter_code( 'V' );
71  order[19] = core::chemical::aa_from_oneletter_code( 'W' );
72  order[20] = core::chemical::aa_from_oneletter_code( 'Y' );
73 
74  std::string aa_seq;
76  // profile is indexed by the order of amino acids in core::chemical::AA
77 
78  if ( !input ) {
79  utility_exit_with_message( "ERROR: Unable to open file!" );
80  }
81  std::string line;
82  getline( input, line ); // SKIPS FIRST LINE
83  while( getline( input, line ) ) {
84  if ( line.substr(0,3) == "END" ) break; // end of profile
85  std::string aa;
87  prof_row.resize( order.size() );
88 
89  std::istringstream ls( line );
90  ls >> aa;
91 
92  core::Real aa_prob;
93  ls >> aa_prob;
94  Size index(1);
95  while ( !ls.fail() ) {
96  //prof_row.push_back( aa_prob );
97  prof_row[ order[index] ] = aa_prob;
98 
99  ls >> aa_prob;
100  ++index;
101  }
102 
103  aa_seq += aa;
104  new_prof.push_back( prof_row );
105  }
106 
107  sequence( aa_seq );
108  profile( new_prof );
109 }
110 
112  utility::file::FileName const & fn
113 ) {
114  negative_better_ = false;
115  // order of amino acids in the .pssm file
117  order.resize( 20 );
118  order[ 1] = core::chemical::aa_from_oneletter_code( 'A' );
119  order[ 2] = core::chemical::aa_from_oneletter_code( 'R' );
120  order[ 3] = core::chemical::aa_from_oneletter_code( 'N' );
121  order[ 4] = core::chemical::aa_from_oneletter_code( 'D' );
122  order[ 5] = core::chemical::aa_from_oneletter_code( 'C' );
123  order[ 6] = core::chemical::aa_from_oneletter_code( 'Q' );
124  order[ 7] = core::chemical::aa_from_oneletter_code( 'E' );
125  order[ 8] = core::chemical::aa_from_oneletter_code( 'G' );
126  order[ 9] = core::chemical::aa_from_oneletter_code( 'H' );
127  order[10] = core::chemical::aa_from_oneletter_code( 'I' );
128  order[11] = core::chemical::aa_from_oneletter_code( 'L' );
129  order[12] = core::chemical::aa_from_oneletter_code( 'K' );
130  order[13] = core::chemical::aa_from_oneletter_code( 'M' );
131  order[14] = core::chemical::aa_from_oneletter_code( 'F' );
132  order[15] = core::chemical::aa_from_oneletter_code( 'P' );
133  order[16] = core::chemical::aa_from_oneletter_code( 'S' );
134  order[17] = core::chemical::aa_from_oneletter_code( 'T' );
135  order[18] = core::chemical::aa_from_oneletter_code( 'W' );
136  order[19] = core::chemical::aa_from_oneletter_code( 'Y' );
137  order[20] = core::chemical::aa_from_oneletter_code( 'V' );
138 
139  utility::io::izstream input( fn );
140 
141  if ( !input ) {
142  std::string msg(
143  "ERROR: Unable to open file " +
144  static_cast< std::string > (fn) +
145  "!"
146  );
147  utility_exit_with_message( msg );
148  }
149  std::string line;
150 
151  tr.Debug << "reading from " << fn << std::endl;
152 
153  // read in two header lines
154  getline( input, line );
155  getline( input, line );
156 
157  // initialize headers
158  getline( input, line );
159  std::istringstream line_stream( line );
160  while ( !line_stream.fail() ) {
161  std::string aa;
162  line_stream >> aa;
163  if ( line_stream.fail() ) continue;
164 
165  alphabet_.push_back( aa );
166  if ( alphabet_.size() >= 20 ) break; // super-hack for pssm file format! bad james, bad!
167  }
168 
169  std::string seq;
170  while( getline( input, line ) ) {
171  std::istringstream line_stream( line );
172  //std::cout << "line = " << line << std::endl;
173  Size pos;
174  string aa;
175 
176  line_stream >> pos >> aa;
177  if ( line_stream.fail() ) continue;
178 
180  prof_row.resize( order.size() );
181 
182  Real score;
183  line_stream >> score;
184  Size index(1);
185  while ( !line_stream.fail() && index <= order.size() ) {
186  prof_row[ order[index] ] = score;
187  line_stream >> score;
188  ++index;
189  }
190 
191  profile_.push_back( prof_row );
192 
193  seq += aa;
194  } // while( getline( input, line ) )
195 
196  tr.Debug << "Read sequence " << seq << " from " << fn << std::endl;
197  tr.Debug << "profile dimensions are " << profile_.size() << "x"
198  << profile_.front().size() << "." << std::endl;
199  sequence( seq );
200  id( std::string(fn) );
201 } // read_from_file
202 
205  mat.read_from_database(matrix);
206 
208  for (core::Size ii(1), end(seq.length()); ii <= end; ++ii) {
211  new_prof.push_back(prof_row);
212  }
213 
214  sequence( seq.sequence() );
215  profile( new_prof );
216  negative_better_ = false;
217 }
218 
219 /// @brief Returns the 2D vector1 of Real-values representing this profile.
222  return profile_;
223 }
224 
225 /// @brief Multiply all profile weights by factor
227  for ( Size ii = 1; ii <= profile().size(); ++ii ) {
229  it = profile_[ii].begin(), end = profile_[ii].end(); it != end; ++it ) {
230  *it *= factor;
231  }
232  }
233  if ( factor < 0 ) {
234  tr << "Flipping sense of negative_better." << std::endl;
236  }
237 }
238 
240  temp_ = temp;
242  for ( Size ii = 1; ii <= profile().size(); ++ii ) {
243  utility::vector1< core::Real > new_prof_row = prof_row(ii);
244  scores_to_probs_( new_prof_row, temp, negative_better_ );
245  new_prof.push_back( new_prof_row );
246  }
247  profile( new_prof );
248  negative_better_ = false;
249 }
250 
252  core::Real maxval(0.0);
253  for ( Size ii = 1; ii <= profile_.size(); ++ii ) {
254  for ( Size jj = 1; jj <= profile_[ii].size(); ++jj ) {
255  if ( maxval < std::abs(profile_[ii][jj]) ) {
256  maxval = std::abs(profile_[ii][jj]);
257  }
258  }
259  }
260  rescale( 1/maxval );
261 }
262 
263 
266 ) {
267  profile_ = new_prof;
268 }
269 
271  profile_.resize(pos+1);
272  profile_[pos] = new_prof_row;
273 }
274 
276  core::Size pos,
277  char new_char
278 ) {
279  using core::Real;
280  using utility::vector1;
281  // add in a profile column of zeroes.
282  vector1< Real > zero_col( width(), 0.0 );
283  vector1< vector1< Real > > new_prof, old_prof( profile() );
284  for ( Size i = 0; i <= length() + 1; ++i ) {
285  if ( i == pos ) new_prof.push_back( zero_col );
286  if ( i >= 1 && i <= length() ) new_prof.push_back( old_prof[i] );
287  }
288 
289  profile( new_prof );
290  Sequence::insert_char( pos, new_char );
291 }
292 
294  core::Size pos
295 ) {
296  //std::cout << "sequence is " << sequence() << std::endl;
297  using core::Real;
298  using utility::vector1;
299 
300  runtime_assert( pos <= length() );
301 
302  vector1< vector1< Real > > new_prof( profile() );
303  vector1< vector1< Real > >::iterator it
304  = new_prof.begin() + pos - start();
305 
306  runtime_assert( it != new_prof.end() );
307 
308  new_prof.erase( it );
309  profile( new_prof );
310 
312  //std::cout << "sequence is " << sequence() << std::endl;
313 }
314 
315 /// @brief Returns the number of distinct values at each position in this profile.
317  assert( check_internals_() );
318  return profile_[1].size();
319 }
320 
321 /// @brief Returns the vector1 of values at this position.
324  runtime_assert( pos <= profile_.size() );
325  return profile_[ pos ];
326 }
327 
330  core::Real kT,
331  bool negative_better /* = false */
332 ) const {
333  using std::exp;
334  using utility::vector1;
335 
336  // calculate partition (aka Z), with this definition:
337  // Z = sum( exp( score / kT ) )
338  core::Real partition( 0.0 );
340  it = scores.begin(), end = scores.end(); it != end; ++it
341  ) {
342  if ( negative_better ) {
343  *it = exp( -1 * *it / kT );
344  } else {
345  *it = exp( *it / kT );
346  }
347  partition += *it;
348  }
349 
350  // transform scores using the partition calculated above:
351  // P(s) = exp( -1 * score / kT ) ) / Z
353  it = scores.begin(), end = scores.end(); it != end; ++it
354  ) {
355  *it = *it / partition;
356  }
357 } // scores_to_probs_
358 
359 std::ostream & operator<<( std::ostream & out, const SequenceProfile & p ) {
360  Size width = 8;
361  Size precision = 3;
362 
363  out << p.to_string() << std::endl;
364  for ( Size i = 1; i <= p.length(); ++i ) {
365  for ( Size j = 1; j <= p.width(); ++j ) {
366  out << ObjexxFCL::fmt::F( width, precision, p.prof_row(i)[j] );
367  }
368  out << std::endl;
369  }
370 
371  return out;
372 }
373 
375  using core::Real;
376  using utility::vector1;
377 
378  runtime_assert( profile_.size() == length() );
379 
380  for ( Size i = 1; i <= profile_.size(); ++i ) {
381  runtime_assert( profile_[i].size() == alphabet_.size() );
382  }
383 
384  return true;
385 }
386 
387 } // sequence
388 } // core