Rosetta 3.5
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
VallData.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 
11 // Rosetta Headers
13 
14 
16 #include <protocols/frags/heap.hh>
17 
18 
19 // utility headers
20 // AUTO-REMOVED #include <ObjexxFCL/ObjexxFCL.hh>
21 #include <ObjexxFCL/FArray1A.hh>
22 #include <utility/io/izstream.hh>
23 
24 #include <numeric/random/random.fwd.hh>
25 #include <utility/exit.hh>
26 
27 // C++ Headers
28 //#include <cmath>
29 //#include <cstdlib>
30 #include <iostream>
31 //#include <fstream>
32 //#include <sstream>
33 #include <basic/Tracer.hh> // tracer output
34 
35 #include <utility/vector1.hh>
36 
37 
38 //Auto using namespaces
39 namespace ObjexxFCL { } using namespace ObjexxFCL; // AUTO USING NS
40 //Auto using namespaces end
41 
42 
43 static basic::Tracer TR( "protocols.frags.VallData" );
44 
45 namespace protocols {
46 namespace frags {
47 
48 ///////////////////////////////////////////////////////////////////////////////
49 ///\brief read from vall database file "filename"
50 ///
51 ///support both old format and new format, i.e., with "trimmed" in filename
52 void
53 VallData::read_file(
54  std::string const & filename
55 )
56 {
57 
58  utility::io::izstream data ( filename );
59  if ( !data ) {
60  utility_exit_with_message( "cant open vall file: " + filename );
61  }
62 
63  //////////////////////////////////////////////
64  // this file parsing should be very fast since
65  // the vall is enormous
66  bool const new_format( filename.find( "trimmed" ) != std::string::npos );
67 
68  if ( new_format ) {
69  char line[250];
70  float phi,psi,omega; // should match types in sscanf conversion
71  char seq,ss;
72  while ( data ) {
73  data.getline( line, 250 );
74  if ( data.eof() ) break;
75 
76  std::sscanf( line , "%1c", &seq);
77  std::sscanf( line+ 1, "%1c", &ss);
78 
79 
80  std::sscanf( line+ 2, "%9f", &phi);
81  std::sscanf( line+11, "%9f", &psi);
82  std::sscanf( line+20, "%9f", &omega);
83 
84  add_line( seq, ss, phi, psi, omega );
85  }
86  } else {
87  char line[250];
88  float phi,psi,omega;
89  char seq,ss;
90  while ( data ) {
91  data.getline( line, 250 );
92  if ( data.eof() ) break;
93 
94  std::sscanf( line+6 , "%1c", &seq);
95  std::sscanf( line+8 , "%1c", &ss);
96 
97  std::sscanf( line+52, "%9f", &phi);
98  std::sscanf( line+61, "%9f", &psi);
99  std::sscanf( line+70, "%9f", &omega);
100 
101  add_line( seq, ss, phi, psi, omega );
102  }
103  }
104  data.close();
105 
106  // remove excess capacity
107  shrink();
108 
109 }
110 
111 ///////////////////////////////////////////////////////////////////////////////
112 ///\brief pick fragments in a single window
113 ///
114 ///\li nfrags -- number of fragment pieces to be picked
115 ///\li target_seq -- query sequence and this also defines fragment length,
116 /// such as 3mer or 9mer
117 ///\li target_ss -- secondary structure of query sequence.
118 ///\li seq_weight and ss_weight -- weight for sequence match and secstruct match
119 ///\li exclude_* -- exclude such squence pattern when picking fragments
120 ///\li library -- output as a collection of fragments for that residue window
121 ///
122 ///\detail scan through vall database, for each position within each window,
123 ///if sequence does not match target sequence, penalize by seq_weight; if secstruct
124 ///does not match target secstruct, penalize by ss_weight. In the end, picking top "nfrags"
125 ///fragments with lowest penalty score (this internally uses a "heap" implementation).
126 ///
127 void
128 VallData::get_frags(
129  Size const nfrags,
130  std::string const & target_seq,
131  std::string const & target_ss,
132  Real const seq_weight,
133  Real const ss_weight,
134  bool const exclude_gly,
135  bool const exclude_pro,
136  bool const exclude_cys_peptides,
138 ) const
139 {
140  Size const frag_size( target_seq.size() );
141  runtime_assert( frag_size == target_ss.size() && ss_weight >= 0.0 && seq_weight >= 0.0 );
142 
143  // for randomizing the order of frags with identical score
144  Real const min_nonzero_weight
145  ( ss_weight == 0.0 ? seq_weight : ( seq_weight == 0.0 ? ss_weight : std::min( ss_weight, seq_weight ) ) );
146 
147  // reset heaps
148  FArray1D_int heap( nfrags + 2 );
149  FArray1D_float coheap( nfrags + 2 );
150  heap_init( heap, coheap, nfrags );
151 
152  Size const my_size( size() );
153 
154  for ( Size vall_pos=1; vall_pos <= my_size - frag_size + 1; ++vall_pos ) {
155  // score this position
156 
157  bool bad_frag( false );
158 
159  Real score(0.0); // bigger is worse
160 
161  for ( Size k=0; k< frag_size; ++k ) {
162  Real const phi ( phi_ [ vall_pos+k ] );
163  Real const psi ( psi_ [ vall_pos+k ] );
164  Real const omega( omega_ [ vall_pos+k ] );
165  char const seq ( sequence_ [ vall_pos+k ] );
166  char const ss ( secstruct_[ vall_pos+k ] );
167  if ( ( std::abs( phi ) < 0.01 ) ||
168  ( std::abs( psi ) + std::abs( omega ) < 0.01 ) ||
169  ( seq == 'G' && exclude_gly && seq != target_seq[k] ) ||
170  ( seq == 'P' && exclude_pro && seq != target_seq[k] ) ||
171  ( std::abs( omega ) < 90.0 && exclude_cys_peptides ) ) {
172  bad_frag = true;
173  break;
174  }
175  if ( ss != target_ss[k] ) {
176  score += ss_weight;
177  }
178  if ( seq != target_seq[k] ) {
179  score += seq_weight;
180  }
181  }
182 
183  if ( bad_frag ) continue;
184 
185  // randomly reorder frags with the same score
186  score += min_nonzero_weight * 0.1 * numeric::random::uniform();
187 
188  // insert into heap, with negative score! since bigger==better for heaps
189  bool err;
190  heap_insert( heap, coheap, vall_pos, -score, err );
191  }
192 
193  // now extract top nfrags matches, copy Sizeo frag array
194  // fragments come out of the heap from worst to best
195  Size exact_matches(0);
196  Real worst_score(999), best_score(999);
197 
198  for ( Size nn= nfrags; nn >=1; --nn ) {
199  bool err;
200  int vall_pos;
201  float score;// heaps use float!!!
202  heap_extract( heap, coheap, vall_pos, score, err);
203  runtime_assert( !err );
204 
205  if ( score >= -0.1 * min_nonzero_weight ) ++exact_matches;
206 
207  if ( nn == nfrags ) worst_score = -score;
208  else if ( nn == 1 ) best_score = -score;
209 
210  //
211  TorsionFragmentOP fragment( new TorsionFragment( frag_size, 3 /* #bb torsions */ ) );
212  for ( Size k=0; k< frag_size; ++k ) {
213  fragment->set_torsion ( k+1, 1, phi_ [ vall_pos + k ] );
214  fragment->set_torsion ( k+1, 2, psi_ [ vall_pos + k ] );
215  fragment->set_torsion ( k+1, 3, omega_ [ vall_pos + k ] );
216  fragment->set_secstruct( k+1, secstruct_[ vall_pos + k ] );
217  }
218  // goes at the beginning
219  library.insert_fragment( fragment );
220  } // nn
221 
222 
223  TR.Trace << "ss-frags: " <<
224  " target_ss: " << target_ss <<
225  " target_seq: " << target_seq <<
226  " exact_matches: " << exact_matches <<
227  " best_score: " << best_score <<
228  " worst_score: " << worst_score << std::endl;
229 
230 }
231 
232 
233 
234 
235 
236 } // ns frags
237 } // ns protocols