Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
SequenceAlignment.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file SequenceAlignment.cc
11 /// @brief method implementations for a multiple sequence alignment class
12 /// @author James Thompson
13 
14 #include <core/types.hh>
15 
20 
21 #include <utility/exit.hh>
22 #include <utility/io/izstream.hh>
23 
24 #include <ObjexxFCL/string.functions.hh>
25 
26 // AUTO-REMOVED #include <numeric>
27 #include <iostream>
28 #include <string>
29 
30 #include <basic/Tracer.hh>
31 
32 #include <utility/vector1.hh>
33 
34 
35 namespace core {
36 namespace sequence {
37 
38 /// @details moving from .cc file
40  // clear(); // APL NOTE: originally, this dstor called clear(), but clear() doesn't do anything that the dstor itself doesn't do
41 }
42 
43 static basic::Tracer tr( "core.sequence.SequenceAlignment" );
44 
46  sequences_.push_back( myseq );
47 }
48 
50  return sequences_.size();
51 }
52 
54  if ( size() < 1 ) return 0;
55  else return sequences_[1]->length();
56 }
57 
59  if ( idx > size() ){
60  using ObjexxFCL::string_of;
61  std::string msg("");
62  msg += "Requested sequence " + string_of(idx)
63  + " but alignment only has " + string_of(size()) + " sequences\n";
64  msg += "Alignments:\n" + to_string() + "\n";
65  utility_exit_with_message( msg );
66  //runtime_assert( idx <= size() );
67  }
68  return sequences_[idx];
69 }
70 
71 std::map< std::string, core::Real > SequenceAlignment::scores() const {
72  return scores_;
73 }
74 
75 void SequenceAlignment::scores( std::map< std::string, core::Real > new_scores ) {
76  scores_ = new_scores;
77 }
78 
80  static std::string const name("score");
81  return score(name);
82 }
83 
84 void SequenceAlignment::score( Real const & sc ) {
85  score( "score", sc );
86 }
87 
88 Real SequenceAlignment::score( std::string const & name ) const {
89 
90  std::map< std::string, core::Real >::const_iterator it(
91  scores_.find(name)
92  );
93 
94  if ( it == scores_.end() ) return 0;
95  return it->second;
96 }
97 
98 void SequenceAlignment::score( std::string const & name, Real const value ) {
99  scores_[name] = value;
100 }
101 
102 
103 
105  std::string retval("");
106  retval += "# score " + ObjexxFCL::string_of(score()) + "\n";
107  // add comments here
108  for ( Size i = 1; i <= size(); ++i ) {
109  retval += sequence(i)->to_string() + "\n";
110  }
111  retval += "--\n";
112  return retval;
113 }
114 
116  return ObjexxFCL::uppercased( sequence(2)->id() );
117 }
118 
119 /// @brief super-general alignment format reader. This will never ever ever
120 /// have to be improved or extended.
122  utility::io::izstream data( filename );
123  if ( !data ) {
124  utility_exit_with_message(
125  "ERROR: Unable to open alignment file: " + filename
126  );
127  }
128 
129  std::string line;
130  while ( getline( data, line ) ) {
131  std::istringstream input( line );
132  SequenceOP temp_seq( new Sequence );
133  temp_seq->read_data( input );
134  add_sequence( temp_seq );
135  }
136 } // read_from_file
137 
138 void SequenceAlignment::read_data( std::istream & in ) {
139  std::string line;
140  while ( getline( in, line ) ) {
141  std::istringstream input( line );
142  if ( line.substr(0,1) == "#" ) {
143  if ( line.substr(2,5) == "score" ) {
144  std::string dummy;
145  Real sc;
146  input >> dummy >> dummy >> dummy >> sc;
147  score( sc );
148  }
149  } else if ( line.substr(0,2) == "--" ) {
150  break;
151  }
152 
153  SequenceOP temp_seq( new Sequence );
154  temp_seq->read_data( input );
155  add_sequence( temp_seq );
156  }
157 } // read_data
158 
159 /// @brief returns a SequenceMapping of the sequence at index idx1 mapped to
160 /// the sequence at idx2.
162  Size const idx1,
163  Size const idx2
164 ) const {
165  runtime_assert( idx1 <= size() && idx1 > 0 );
166  runtime_assert( idx2 <= size() && idx2 > 0);
167 
169 
171  it1( sequences_.begin() + (idx1 - 1) ),
172  it2( sequences_.begin() + (idx2 - 1) );
173 
174  SequenceMapping mapping( (*it1)->ungapped_length(), (*it2)->ungapped_length() );
175 
176  for ( Size pos = 1; pos <= length(); ++pos ) {
177  Size const seq1_pos( (*it1)->resnum(pos) );
178  Size const seq2_pos( (*it2)->resnum(pos) );
179  if ( seq1_pos != 0 && seq2_pos != 0 ) {
180  mapping[ seq1_pos ] = seq2_pos;
181 
182  //char const aa1( (*it1)->sequence().at(seq1_pos-1) );
183  //char const aa2( (*it2)->sequence().at(seq2_pos-2) );
184  //std::cout << "texdebug: adding mapping of "
185  // << seq1_pos << "," << aa1 << " => "
186  // << seq2_pos << "," << aa2 << std::endl;
187  }
188  }
189 
190  return mapping;
191 } // sequence_mapping
192 
194  Size pos(1);
195  while ( pos <= length() ) {
196  // delete this new position if the entire column is gapped.
197  bool delete_column( true );
199  end = sequences_.end();
200  it != end; ++it
201  ) {
202  if ( !(*it)->is_gap(pos) ) {
203  // std::cout << "not deleting column because sequence " << *it
204  // << " has no gap at position " << pos << std::endl;
205  delete_column = false;
206  }
207  } // for sequences
208 
209  if ( delete_column ) {
211  it != end; ++it
212  ) {
213  // std::cout << "deleting column " << pos << " from " << *it << std::endl;
214  (*it)->delete_position( pos );
215  }
216  } else {
217  ++pos;
218  }
219 
220  } // while pos <= length()
221 } // remove_gapped_positions
222 
224  ScoringSchemeOP ss
225 ) const {
226  using core::Real;
227  using utility::vector1;
228 
229  Real score( 0.0 );
231  // something mis-understood about std::accumulate?
232  //std::accumulate( scores.begin(), scores.end(), 0 );
233  typedef vector1< Real >::const_iterator iter;
234  for ( iter it = scores.begin(), end = scores.end(); it != end; ++it ) {
235  score += *it;
236  }
237 
238  return score;
239 } // calculate_score_sum_of_pairs
240 
242  ScoringSchemeOP ss
243 ) const {
245  for ( Size i = 1; i <= size(); ++i ) {
246  for ( Size j = i + 1; j <= size(); ++j ) {
247  bool last_pos_was_gapped( false );
248  for ( Size k = 1; k <= length(); ++k ) {
249  if ( sequence(i)->is_gap(k) && sequence(j)->is_gap(k) ) {
250  // maybe rethink this? you could argue for a positive score as well,
251  // such as 2 * gap_open or 2 * gap_extend, depending on
252  // last_pos_was_gapped.
253  last_pos_was_gapped = true;
254  scores[k] += 0.0;
255  } else if ( sequence(i)->is_gap(k) || sequence(j)->is_gap(k) ) {
256  if ( last_pos_was_gapped ) scores[k] += ss->gap_extend();
257  else scores[k] += ss->gap_open();
258  last_pos_was_gapped = true;
259  } else {
260  bool over_run(false);
261  if ( sequences_[i]->length() < k ) {
262  over_run = true;
263  tr.Error << "Attempting to run off the end of sequence!" << std::endl;
264  tr.Error << "asked for position " << k << " in seq:" << std::endl;
265  tr.Error << sequences_[i]->to_string() << std::endl;
266  }
267  if ( sequences_[j]->length() < k ) {
268  over_run = true;
269  tr.Error << "Attempting to run off the end of sequence!" << std::endl;
270  tr.Error << "asked for position " << k << " in seq:" << std::endl;
271  tr.Error << sequences_[j]->to_string() << std::endl;
272  }
273  if ( !over_run ) {
274  scores[k] += ss->score( sequences_[i], sequences_[j], k, k );
275  last_pos_was_gapped = false;
276  }
277  }
278  } // col k
279  } // seq j
280  } // seq i
281 
282  return scores;
283 }
284 
286  for ( Size jj = 1; jj <= size(); ++jj ) {
287  if ( sequences_[jj]->length() != length() ) {
288  std::string msg( "Error: length mismatch between sequence and alignment" );
289  msg += "problem with sequence: " + sequences_[jj]->to_string();
290  msg += "alignment: " + to_string();
291  utility_exit_with_message( msg );
292  }
293  } // sequence jj
294 }
295 
297  Size n_ident(0);
298 
299  for ( Size i = 1; i <= length(); ++i ) {
300  bool ident(true);
302 
303  for ( Size j = 2; j <= sequences_.size(); ++j ) {
304  if ( (*sequences_[j])[i] != (*sequences_[1])[i] ) {
305  ident = false;
306  }
307  } // for sequences
308 
309  if ( ident ) ++n_ident;
310  } // for ( size i = 1 )
311  return n_ident;
312 } // identities
313 
315  Size n_gapped(0);
316 
317  for ( Size ii = 1; ii <= length(); ++ii ) {
318  if ( is_gapped(ii) ) ++n_gapped;
319  } // for ( size i = 1 )
320  return n_gapped;
321 } // gapped_positions
322 
324  bool in_frame( true );
325 
326  for ( Size i = 1; i <= length(); ++i ) {
328 
329  seq1 = sequences_.begin();
330  if ( (*seq1)->is_gap( i ) ) continue;
331 
332  for ( it = seq1 + 1, end = sequences_.end(); it != end; ++it ) {
333  if ( (*it)->is_gap( i ) ) continue;
334 
335  if ( (*seq1)->resnum(i) != (*it)->resnum(i) ) {
336  in_frame = false;
337  break; // out of inner loop
338  }
339  }
340 
341  if ( in_frame == false ) break; // out of outer loop
342  }
343 
344  return in_frame;
345 }
346 
348  core::Size const column
349 ) const {
350  runtime_assert( column <= length() );
351  runtime_assert( column > 0 );
352 
354 
355  typedef utility::vector1< SequenceOP > seqlist;
356  for ( seqlist::const_iterator it = sequences_.begin(), end = sequences_.end();
357  it != end; ++it
358  ) {
359  indices.push_back( (*it)->resnum(column) );
360  }
361 
362  return indices;
363 }
364 
365 
366 void SequenceAlignment::comment( std::string const & comment ) {
367  comments_.push_back( comment );
368 }
369 
371  return comments_;
372 }
373 
375  typedef utility::vector1< SequenceOP > seqlist;
376  Real max_gp(0.0);
377  for ( seqlist::const_iterator it = sequences_.begin(), end = sequences_.end();
378  it != end; ++it
379  ) {
380  Real gap_percentage = static_cast< Real >
381  ( (*it)->length() - (*it)->ungapped_length() );
382  gap_percentage = gap_percentage / static_cast< Real > ( length() );
383  max_gp = std::max( max_gp, gap_percentage );
384  //std::cout << "length = " << (*it)->length() << std::endl;
385  //std::cout << "ungapped_length = " << (*it)->ungapped_length() << std::endl;
386  }
387 
388  return max_gp;
389 }
390 
391 bool SequenceAlignment::is_gapped( Size const col_idx ) const {
392  bool gapped( false );
393  for ( Size jj = 1; jj <= size(); ++jj ) {
394  if ( sequences_[jj]->is_gap( col_idx ) ) {
395  gapped = true;
396  break;
397  }
398  } // for sequences
399  return gapped;
400 }
401 
402 //void SequenceAlignment::trim_terminal_gaps() {
403 // // find the first non-gap character
404 // utility::vector1< core::Size > nterm_gaps;
405 // typedef utility::vector1< SequenceOP > seqlist;
406 // for ( seqlist::const_iterator it = sequences_.begin(), end = sequences_.end();
407 // it != end; ++it
408 // ) {
409 //
410 // }
411 //} // trim_terminal_gaps
412 
413 std::ostream & operator << (
414  std::ostream & out,
415  const SequenceAlignment & sa
416 ) {
417  out << "score: " << sa.score()
418  << " identities: " << sa.identities()
419  << "/" << sa.length()
420  << " gaps: " << sa.gapped_positions()
421  << "/" << sa.length()
422  << std::endl;
423 
424  for ( Size i = 1; i <= sa.size(); ++i ) {
425  out << (*sa.sequence(i)) << std::endl;
426  }
427  return out;
428 }
429 
430 
431 std::istream & operator>> (
432  std::istream & in,
433  SequenceAlignment & aln
434 ) {
435  aln.read_data( in );
436  return in;
437 }
438 
440  SequenceAlignment const & src
441 ) :
442  ReferenceCount ( src )
443 {
444  *this = src;
445 }
446 
449  SequenceAlignment const & src
450 ) {
451  clear();
452 
453  scores_ = src.scores();
454 
455  // copy sequences, be sure to call clone manually
456  for ( Size i = 1; i <= src.size(); ++i ) {
458  = src.sequence(i)->clone();
459  add_sequence( copy );
460  }
461 
462  comments_ = src.comments();
463  return *this;
464 }
465 
466 bool
468  SequenceAlignment const & lhs, SequenceAlignment const & rhs
469 ) {
470  // compare number of sequences first
471  if ( lhs.size() < rhs.size() ) { return true; }
472 
473  // if sequences are the same, compare concatenation of
474  // all sequences from each alignment lexicographically
475  std::string l_str(""), r_str("");
476  for ( Size ii = 1; ii <= lhs.size(); ++ii ) {
477  l_str += lhs.sequence(ii)->sequence();
478  r_str += rhs.sequence(ii)->sequence();
479  }
480 
481  return ( l_str < r_str );
482 }
483 
485  std::ostream & out
486 ) const
487 {
488 
489  //if ( size() < 2 ) return;
490 
491  out << "## " << sequence(1)->id() << " " << sequence(2)->id() << std::endl;
492  out << "# " << std::endl;
493  //out << "scores_from_program: 0.000000 " << score() << std::endl;
494  out << "scores_from_program:";
495  // print scores in order of sorted keys
496  using std::map;
497  using std::string;
498  using utility::vector1;
499 
500  vector1< string > keys;
501  for ( map< string, Real >::const_iterator it = scores_.begin(), end = scores_.end(); it != end; ++it )
502  keys.push_back( it->first );
503  std::sort( keys.begin(), keys.end() );
504 
505  for ( vector1< string >::const_iterator it = keys.begin(), end = keys.end(); it != end; ++it ) {
506  out << " " << score(*it);
507  //std::cout << "score(" << *it << "," << score(*it) << ")" << std::endl;
508  }
509  out << std::endl;
510 
511  // print out sequences
512  for ( Size i = 1; i <= size(); ++i ) {
513  out << sequence(i)->start()-1 << " " << (*sequence(i)).sequence() << std::endl;
514  }
515  out << "--" << std::endl;
516 }
517 
518 } // sequence
519 } // core