Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
ProfSimScoringScheme.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file ProfSimScoringScheme.cc
11 /// @brief class definition for a given scoring scheme for an alignment.
12 /// @details Simply based on comparing single profiles from two protein
13 /// sequences, along with affine gap penalties of the form penalty = A + Bk, where
14 /// A represents the penalty for starting a gap, and B represents the penalty for
15 /// extending a previously opened gap by k characters.
16 /// @author James Thompson
17 
18 #include <core/types.hh>
19 // AUTO-REMOVED #include <basic/Tracer.hh>
25 
26 #include <utility/exit.hh>
27 // AUTO-REMOVED #include <utility/vector1.hh>
28 // AUTO-REMOVED #include <utility/io/izstream.hh>
29 // AUTO-REMOVED #include <utility/file/FileName.hh>
30 
31 #include <core/chemical/AA.hh>
32 
33 // AUTO-REMOVED #include <ObjexxFCL/format.hh>
34 // AUTO-REMOVED #include <iostream>
35 #include <string>
36 
37 #include <utility/vector1.hh>
38 #include <complex>
39 #include <map>
40 
41 
42 namespace core {
43 namespace sequence {
44 
46  // TODO: refactor this hard-coded table into a database file someday. NOTE(review): the enclosing function's signature line is not visible in this listing.
47  // log-probabilities (base e) for each of the 20 amino acids, keyed by one-letter code, calculated from
48  // frequencies measured in the SwissProt database on 10/10/08.
49  std::map< char, Real > log_p_aa_; // local table (despite the trailing underscore); rebuilt on every call
50  log_p_aa_['A'] = -2.50775275017596;
51  log_p_aa_['C'] = -4.25443370709812;
52  log_p_aa_['D'] = -2.91635539230326;
53  log_p_aa_['E'] = -2.69728310210688;
54  log_p_aa_['F'] = -3.24970340180708;
55  log_p_aa_['G'] = -2.6536761536201;
56  log_p_aa_['H'] = -3.77884402329716;
57  log_p_aa_['I'] = -2.82635555265843;
58  log_p_aa_['K'] = -2.83305056323539;
59  log_p_aa_['L'] = -2.33558638262581;
60  log_p_aa_['M'] = -3.72639555635987;
61  log_p_aa_['N'] = -3.20234385542140;
62  log_p_aa_['P'] = -3.0416188351623;
63  log_p_aa_['Q'] = -3.22656571396179;
64  log_p_aa_['R'] = -2.89952136047537;
65  log_p_aa_['S'] = -2.70771691574528;
66  log_p_aa_['T'] = -2.9259691035378;
67  log_p_aa_['V'] = -2.68564236933079;
68  log_p_aa_['W'] = -4.51369283589433;
69  log_p_aa_['Y'] = -3.53006197206897;
70 
71  // Convert each log-probability back to a plain probability with std::exp and store it in
72  // prior_probs_ at index `aa`. NOTE(review): `aa` is declared on lines missing from this listing; presumably the AA enum value for it->first -- confirm.
73  prior_probs_.resize( log_p_aa_.size() ); // one slot per table entry (20 entries above)
74  using std::map;
75  for ( map< char, Real >::const_iterator it = log_p_aa_.begin(),
76  end = log_p_aa_.end();
77  it != end; ++it
78  ) {
81  prior_probs_[ aa ] = std::exp( it->second ); // it->second is the natural-log probability for key it->first
82  }
83 }
84 
86  SequenceOP seq1, // first sequence; treated as a SequenceProfile below without a runtime type check
87  SequenceOP seq2, // second sequence; same treatment as seq1
88  Size pos1, // position in seq1 whose profile row is compared
89  Size pos2 // position in seq2 whose profile row is compared
90 ) {
91  SequenceProfileOP prof1
92  = SequenceProfileOP( static_cast < SequenceProfile * > ( seq1() ) ); // unchecked downcast: caller must pass a SequenceProfile
93  SequenceProfileOP prof2
94  = SequenceProfileOP( static_cast < SequenceProfile * > ( seq2() ) ); // unchecked downcast: caller must pass a SequenceProfile
95 
96  runtime_assert( pos1 <= prof1->length() ); // only the upper bound is checked
97  runtime_assert( pos2 <= prof2->length() ); // only the upper bound is checked
98  //runtime_assert( (*prof1)[pos1].size() == (*prof2)[pos2].size() );
99  runtime_assert( prof1->prof_row( pos1 ).size() == prof2->prof_row(pos2).size() ); // both rows must have the same number of columns
100 
101  // accumulators for the two per-column sums computed in the loop below
102  Real divergence_score( 0.0 );
103  Real similarity_score( 0.0 );
104  Real const base( 2.0 ); // calculate logarithms in base 2
105 
106  Size n_aa( prof1->prof_row(pos1).size() ); // number of columns in the profile row
107  //Size n_aa( prof1->alphabet().size() );
108  for ( Size i = 1; i <= n_aa; ++i ) {
109  // divergence_score accumulates 0.5*p*log2(p/m) + 0.5*q*log2(q/m), with p, q the two profile entries and m their mean (the Jensen-Shannon divergence of the two rows, in bits)
110  Real div_avg( ( prof1->prof_row(pos1)[i] + prof2->prof_row(pos2)[i] ) / 2 ); // m: mean of the two profile entries for column i
111  divergence_score += 0.5 * prof1->prof_row(pos1)[i]
112  * log( prof1->prof_row(pos1)[i] / div_avg ) / log( base ); // NOTE(review): a zero profile entry gives 0 * log(0), which is NaN in IEEE arithmetic -- confirm entries are strictly positive
113  divergence_score += 0.5 * prof2->prof_row(pos2)[i]
114  * log( prof2->prof_row(pos2)[i] / div_avg ) / log( base ); // same zero-entry caveat as the prof1 term
115 
116  // similarity_score compares the mean of prof1 and prof2 (div_avg) against the prior for column i
117  Real prior_prob = prior_probs_[i]; // NOTE(review): assumes profile columns share the ordering and count of prior_probs_ -- confirm
118  Real sim_avg( (div_avg + prior_prob) / 2 ); // mean of div_avg and the prior
119  similarity_score += 0.5 * sim_avg * log( sim_avg / prior_prob ) / log( base ); // NOTE(review): the form used for divergence_score would give div_avg * log( div_avg / sim_avg ) here -- confirm this term is intended
120  similarity_score += 0.5 * prior_prob * log( prior_prob / sim_avg ) / log( base );
121 
122  //std::cout << "comparing " << prof1->prof_row(pos1)[i] << " with " << prof2->prof_row(pos2)[i] << std::endl;
123  }
124 
125  Real score = 0.5 * ( 1 - divergence_score ) * ( 1 + similarity_score ); // low divergence and high similarity_score both raise the result
126  //std::cout << "divergence = " << divergence_score
127  // << ", similarity = " << similarity_score
128  // << ", score = " << score
129  // << std::endl;
130  return score;
131 } // score
132 
133 } // sequence
134 } // core