Rosetta
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
rna_thread.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet:
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file
11 /// @brief
12 
13 
14 // libRosetta headers
15 #include <core/types.hh>
16 #include <core/chemical/AA.hh>
17 #include <core/conformation/Residue.hh>
18 #include <core/chemical/ResidueTypeSet.hh>
19 #include <core/chemical/VariantType.hh>
20 #include <core/chemical/util.hh>
21 #include <core/chemical/ChemicalManager.hh>
22 #include <core/import_pose/import_pose.hh>
23 #include <core/scoring/ScoreFunction.hh>
24 #include <core/scoring/ScoreFunctionFactory.hh>
25 #include <core/sequence/Sequence.hh>
26 #include <core/sequence/util.hh>
27 #include <utility/string_util.hh>
28 #include <core/kinematics/FoldTree.hh>
29 #include <core/kinematics/tree/Atom.hh>
30 #include <core/id/AtomID_Map.hh>
31 #include <core/id/AtomID.hh>
32 #include <core/id/DOF_ID.hh>
33 #include <core/kinematics/AtomTree.hh>
34 #include <core/kinematics/Jump.hh>
35 #include <core/kinematics/MoveMap.hh>
36 #include <core/pose/PDBInfo.hh>
37 #include <protocols/farna/util.hh>
38 #include <protocols/viewer/viewers.hh>
39 #include <core/pose/Pose.hh>
40 #include <core/pose/util.hh>
41 #include <core/pose/rna/util.hh>
42 #include <devel/init.hh>
43 
44 #include <core/io/pdb/pose_io.hh>
45 
46 #include <utility/vector1.hh>
47 #include <utility/io/ozstream.hh>
48 #include <utility/io/izstream.hh>
49 
50 #include <numeric/xyzVector.hh>
51 #include <numeric/conversions.hh>
52 
53 #include <ObjexxFCL/FArray1D.hh>
54 #include <ObjexxFCL/format.hh>
56 
57 // C++ headers
58 //#include <cstdlib>
59 #include <fstream>
60 #include <iostream>
61 #include <string>
62 
63 // option key includes
64 
65 #include <basic/options/keys/out.OptionKeys.gen.hh>
66 #include <basic/options/keys/score.OptionKeys.gen.hh>
67 #include <basic/options/keys/in.OptionKeys.gen.hh>
69 
71 
72 using namespace core;
73 using namespace basic;
74 using namespace protocols;
75 using namespace basic::options::OptionKeys;
76 
77 using utility::vector1;
78 
79 using io::pdb::dump_pdb;
80 
82 
83 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
84 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
85 // Adapted from easy_target_test.cc (threading routine for CASP8 proteins, 2008) to RNA
86 // for RNA_puzzle GIR1 in 2012.
87 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
88 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
89 
90 //Definition of new OptionKeys
91 // these will be available in the top-level OptionKey namespace:
92 // i.e., OPT_KEY( Type, key ) --> OptionKey::key
93 // to have them in a namespace use OPT_1GRP_KEY( Type, grp, key ) --> OptionKey::grp::key
94 OPT_KEY( String, seq )
95 OPT_KEY( Integer,seq_offset )
96 OPT_KEY( String, sequence_mask_file )
97 
98 
99 ///////////////////////////////////////////////////////////////////////
100 void
102 ObjexxFCL::FArray1D_bool & sequence_mask,
103 utility::vector1< sequence::SequenceOP > const & sequences_from_alignment )
104 {
105  using namespace basic::options;
106  using namespace basic::options::OptionKeys;
107 
109  for ( Size n=1; n <= sequences_from_alignment.size(); n++ ) sequences.push_back( sequences_from_alignment[n]->sequence() );
110 
111  Size const alignment_length = sequences[1].size();
112  sequence_mask = ObjexxFCL::FArray1D_bool( alignment_length, false );
113 
114  ///////////////////////////////////////////////////////////////
115  // First pass -- rule out any part that is gapped in the alignment.
116  for ( Size i = 1; i <= alignment_length; i++ ) {
117 
118  bool found_a_gap( false );
119  for ( Size n = 1; n <= sequences.size(); n++ ) {
120  if ( sequences[n][i-1] == '-' ) {
121  found_a_gap = true;
122  break;
123  }
124  }
125 
126  if ( !found_a_gap ) {
127  sequence_mask( i ) = true;
128  }
129 
130  }
131 
132  ///////////////////////////////////////////////////////////////
133  // Second pass -- look for ungapped part that are bracketed by gapped regions
134  bool look_for_fishy_gaps( false ); //this was from protein stuff, and is not actually in use for RNA.
135 
136  if ( look_for_fishy_gaps ) {
137  Size const look_for_gap( 4 );
138  for ( int i = 1; i <= int(alignment_length); i++ ) {
139 
140  if ( sequence_mask(i) == false ) continue; //Don't worry about it.
141 
142  bool found_gap_before( false ), found_gap_after( false );
143  for ( int offset = -1 * (int)look_for_gap; offset < 0; ++offset ) {
144  if ( i+offset > 1 && sequence_mask( i+offset ) == false ) {
145  found_gap_before = true;
146  break;
147  }
148  }
149  if ( !found_gap_before ) continue;
150 
151  for ( int offset = 1; Size(offset) <= look_for_gap; offset ++ ) {
152  if ( i+offset <= int(alignment_length) && sequence_mask( i+offset ) == false ) {
153  found_gap_after = true;
154  break;
155  }
156  }
157  if ( !found_gap_after ) continue;
158 
159  std::cout << "MASK: Region that is not nominally gapped but looks fishy: " << i << std::endl;
160  sequence_mask( i ) = false;
161  }
162  }
163 
164  // Save into a file.
165  Size count( 0 );
166  if ( option[ sequence_mask_file ].user() ) {
167  utility::io::ozstream out( option[ OptionKeys::sequence_mask_file ]() );
168  for ( Size i = 1; i <= alignment_length; i++ ) {
169  if ( sequences[1][i-1] == '-' ) continue; //Assume native numbering
170  count++;
171  out << sequence_mask(i);
172  }
173  out << std::endl;
174  out.close();
175  }
176 
177 }
178 
179 ////////////////////////////////////////////////////////////////////////////
180 void
181 setup_alignment_map( std::map< Size, Size > & mapping,
182  std::string const & sequence_from_alignment ){
183  Size count( 0 );
184  for ( Size i = 1; i <= sequence_from_alignment.size(); i++ ) {
185  if ( sequence_from_alignment[ i-1 ] != '-' ) {
186  count++;
187  mapping[ i ] = count;
188  } else {
189  mapping[ i ] = 0;
190  }
191  }
192 }
193 
194 //////////////////////////////////////////////////////////////////////////////////////////////////////
195 std::string
196 remove_dashes( std::string const & s ){
197  std::string s_out( "" );
198  for ( Size i = 0; i < s.size(); i++ ) {
199  if ( s[i] != '-' ) s_out += s[i];
200  }
201  return s_out;
202 }
203 
204 //////////////////////////////////////////////////////////////////////////////////////////////////////
205 void
207  pose::Pose & pose,
208  std::string const & target_sequence_from_alignment,
209  std::string const & template_sequence_from_alignment,
210  ObjexxFCL::FArray1D_bool & sequence_mask /* should make this optional! */,
211  Size offset = 0
212 )
213 {
214 
215  using namespace core::chemical;
216  using namespace core::conformation;
217  using namespace core::pose;
218  using namespace protocols::farna;
219  using namespace utility;
220 
221  // following creates pose from scratch! Assumes that we need to add/delete, and it changes the
222  // fold-tree -- both are probably bad ideas for a general threading routine.
223  // Would be better to add or delete residues as needed, trying to be smart about fold_tree [how?].
224  // And better to skip this part if add/delete not necessary -- i.e., just isolate the "mutate sequence" part.
225  Pose const template_pose = pose;
226  pose.clear();
227 
228  Size const alignment_length = target_sequence_from_alignment.size();
229  runtime_assert( alignment_length == template_sequence_from_alignment.size() );
230 
231  if ( sequence_mask.size() == 0 ) sequence_mask = ObjexxFCL::FArray1D_bool( alignment_length, true );
232  runtime_assert( alignment_length == sequence_mask.size() );
233 
234  std::map <Size, Size> template_alignment2sequence, target_alignment2sequence;
235  // Note that the Sequence object already has a nice function for this called resnum() -- we should just use it.
236  setup_alignment_map( template_alignment2sequence, template_sequence_from_alignment );
237  setup_alignment_map( target_alignment2sequence, target_sequence_from_alignment );
238 
239  std::string target_sequence;
240  utility::vector1< Size > working_res;
241  for ( Size i = 1; i <= alignment_length; i++ ) {
242 
243  if ( !sequence_mask( i ) ) continue;
244 
245  Size const pdb_number( template_alignment2sequence[i] );
246 
247  if ( i == 1 || ( sequence_mask( i-1 ) && pdb_number > 1 && !template_pose.fold_tree().is_cutpoint( pdb_number-1 ) ) ) {
248  pose.append_residue_by_bond( template_pose.residue( pdb_number ) );
249  } else {
250  pose.append_residue_by_jump( template_pose.residue( pdb_number ), pose.total_residue() );
251  }
252 
253  //std::cout << "creating alignment ==> template:" << pdb_number << " target: " << target_alignment2sequence[i] << std::endl;
254  //std::cout << target_sequence_from_alignment[i-1] << ' ' << template_pose.residue( pdb_number ).name1() << std::endl;
255 
256  target_sequence += target_sequence_from_alignment[i-1];
257  working_res.push_back( target_alignment2sequence[i] + offset); // will be used for PDB numbering
258  }
259 
260  // fix up numbering.
261  PDBInfoOP pdb_info( new PDBInfo( pose ) );
262  pdb_info->set_numbering( working_res );
263  pose.pdb_info( pdb_info );
264 
265  std::string current_sequence = pose.sequence();
266  std::cout << "TARGET: " << target_sequence << std::endl;
267  std::cout << "CURRENT: " << current_sequence << std::endl;
268  utility::vector1< Size > changed_pos;
269 
270  std::cout << "WORKING_RES " << make_tag( working_res ) << std::endl;
271  std::cout << "WORKING_RES " << make_tag_with_dashes( working_res ) << std::endl;
272 
273  /////////////////////
274  //Mutate sequence
275  /////////////////////
276  utility::vector1< int> changed_pos_working;
277  for ( Size i = 1; i <= target_sequence.size(); i++ ) {
278 
279  char const new_seq = target_sequence[i-1];
280  if ( pose::rna::mutate_position( pose, i, new_seq ) ) {
281  changed_pos.push_back( i );
282  changed_pos_working.push_back( working_res[i] );
283  }
284 
285  }
286 
287  std::cout << "Changed residues (without offset applied): " << make_tag_with_dashes( changed_pos ) << std::endl;
288  std::cout << "Changed residues (in residue numbering): " << make_tag_with_dashes( changed_pos_working ) << std::endl;
289 
290 }
291 
292 
293 //////////////////////////////////////////////////////////////////////////////////////////////////////
294 // Find which template to use, based on input pdb name, or, if that fails, the template sequence.
295 Size
297  pose::Pose const & template_pose,
298  std::string const & template_file_name ){
299 
300  Size which_sequence( 0 );
301 
302  for ( Size n=1; n<=sequences_from_alignment.size(); n++ ) {
303  if ( sequences_from_alignment[n]->id() == template_file_name ) {
304  which_sequence = n;
305  break;
306  }
307  }
308 
309  //////////////////////////////////////////////////////////////////
310  // If could not find template based on name, try based on sequence...
311  std::string const template_sequence = template_pose.sequence();
312  if ( which_sequence == 0 ) {
313  for ( Size n=1; n<=sequences_from_alignment.size(); n++ ) {
314  if ( sequences_from_alignment[n]->ungapped_sequence() == template_sequence ) {
315  which_sequence = n;
316  break;
317  }
318  }
319  if ( which_sequence > 0 ) std::cout << "Using " << sequences_from_alignment[which_sequence]->id() << " from alignment FASTA file, as its sequence corresponds to input template PDB!" << std::endl;
320  }
321 
322  if ( which_sequence == 0 ) utility_exit_with_message( "Could not figure out which sequence in fasta file was the template based on name or on sequence!!" );
323 
324  if ( which_sequence == 1 ) std::cout << "WARNING! WARNING! Assuming template corresponds to first sequence in the alignment FASTA file " << sequences_from_alignment[which_sequence]->id() << ". But that first sequence should be the target sequence!" << std::endl;
325 
326  runtime_assert( template_sequence == remove_dashes( sequences_from_alignment[which_sequence]->ungapped_sequence() ) );
327 
328  return which_sequence;
329 
330 }
331 
332 
333 ////////////////////////////////////////////////////////////////////////////
334 void
336 
337  using namespace basic::options;
338  using namespace basic::options::OptionKeys;
339  using namespace core::scoring;
340  using namespace core::pose;
341  using namespace core::sequence;
342 
343  ////////////////////////////////////////////////
344  //Read in template pdb.
345  Pose pose;
346  std::string template_file = option[ in::file::s ][1];
347  core::chemical::ResidueTypeSetCOP rsd_set = core::chemical::ChemicalManager::get_instance()->residue_type_set( core::chemical::FA_STANDARD );
348  core::import_pose::pose_from_pdb( pose, *rsd_set, template_file );
349  core::pose::rna::figure_out_reasonable_rna_fold_tree( pose );
350 
351  ////////////////////////////////////////////////
352  //Read in fasta file or user-inputted sequence
353  std::string template_sequence_from_alignment, target_sequence_from_alignment;
354  ObjexxFCL::FArray1D_bool sequence_mask;
355  if ( option[ seq ].user() ) {
356 
357  runtime_assert( ! option[ in::file::fasta ].user() );
358  template_sequence_from_alignment = pose.sequence();
359  target_sequence_from_alignment = option[ seq ]();
360  if ( target_sequence_from_alignment.size() != template_sequence_from_alignment.size() ) utility_exit_with_message( "Input sequence from -seq should match size of template pose specified by -s!" );
361 
362  } else {
363 
364  std::string const fasta_file ( option[ in::file::fasta ]()[1] );
365  utility::vector1< std::string > sequences, pdb_names;
366  utility::vector1< SequenceOP > sequences_from_alignment = read_fasta_file( fasta_file );
367  Size const which_sequence = figure_out_which_sequence_is_template( sequences_from_alignment, pose, template_file );
368  template_sequence_from_alignment = sequences_from_alignment[ which_sequence ]->sequence();
369  target_sequence_from_alignment = sequences_from_alignment[ 1 ]->sequence();
370 
371  // We can/should replace above with standard Rosetta reader!
372  // And if we use Sequence object, that will return useful info like ungapped_sequence()...
373  // Also cool -- Sequence can include start numbering [= offset + 1].
374 
375  //Figure out a mask ... this is currently not doing anything if we just have a template and a target sequence.
376  setup_mask( sequence_mask, sequences_from_alignment );
377 
378  }
379 
380  prepare_threaded_model( pose, target_sequence_from_alignment,
381  template_sequence_from_alignment,
382  sequence_mask, option[ seq_offset ]() );
383  core::pose::rna::virtualize_5prime_phosphates( pose );
384 
385  std::string outfile( "threaded.pdb" );
386  if ( option[ out::file::o ].user() ) outfile = option[ out::file::o ]();
387 
388  std::cout << "Outputting: " << outfile << std::endl;
389  pose.dump_pdb( outfile );
390 
391 }
392 
393 
394 ///////////////////////////////////////////////////////////////
395 void*
396 my_main( void* )
397 {
398 
399  using namespace basic::options;
400  rna_thread_test();
401  exit( 0 );
402 
403 }
404 
405 
406 ///////////////////////////////////////////////////////////////////////////////
407 int
408 main( int argc, char * argv [] )
409 {
410  try {
411  using namespace basic::options;
412 
413  //Uh, options?
414  NEW_OPT( seq, "target sequence (can include dashes). Must specify either this or -fasta. Length of this sequence must exactly equal length of template pose specified by -s", "");
415  NEW_OPT( seq_offset, "Integer to add to all residue numbers in output PDB", 0 );
416  NEW_OPT( sequence_mask_file, "Output name for sequence mask file (not in use at the moment)", "sequence_mask.txt" );
417  ////////////////////////////////////////////////////////////////////////////
418  // setup
419  ////////////////////////////////////////////////////////////////////////////
420  devel::init(argc, argv);
421 
422 
423  ////////////////////////////////////////////////////////////////////////////
424  // end of setup
425  ////////////////////////////////////////////////////////////////////////////
426 
427  protocols::viewer::viewer_main( my_main );
428  } catch ( utility::excn::EXCN_Base const & e ) {
429  std::cout << "caught exception " << e.msg() << std::endl;
430  return -1;
431  }
432 }
#define utility_exit_with_message(m)
Exit with file + line + message.
Definition: exit.hh:47
virtual std::string const msg() const
Definition: EXCN_Base.hh:70
int main(int argc, char *argv[])
Definition: rna_thread.cc:408
std::string String
Definition: remodel.cc:69
void setup_mask(ObjexxFCL::FArray1D_bool &sequence_mask, utility::vector1< sequence::SequenceOP > const &sequences_from_alignment)
Definition: rna_thread.cc:101
void init(int argc, char *argv[])
Command line init() version.
Definition: init.cc:23
BooleanOptionKey const user("options:user")
Definition: OptionKeys.hh:40
core::pose::Pose Pose
Definition: supercharge.cc:101
#define runtime_assert(_Expression)
Assert that the condition holds. Evaluated for both debug and release builds.
Definition: exit.hh:63
Conversions between degrees and radians.
common derived classes for thrown exceptions
void rna_thread_test()
Definition: rna_thread.cc:335
FArray1D< bool > FArray1D_bool
Definition: FArray1D.fwd.hh:31
Input file stream wrapper for uncompressed and compressed files.
std::vector with 1-based indexing
Definition: vector1.fwd.hh:44
std::string make_tag_with_dashes(utility::vector1< int > res_vector, char const delimiter)
Compactifies vectors of ints: 1 2 3 9 10 11 to "1-3 9-11".
Definition: string_util.cc:449
rule< Scanner, options_closure::context_t > options
Definition: Tag.cc:377
#define OPT_KEY(type, key)
Output file stream wrapper for uncompressed and compressed files.
void close()
Close the ofstream and reset the state.
Definition: ozstream.hh:430
std::string remove_dashes(std::string const &s)
Definition: rna_thread.cc:196
#define NEW_OPT(akey, help, adef)
void setup_alignment_map(std::map< Size, Size > &mapping, std::string const &sequence_from_alignment)
Definition: rna_thread.cc:181
Size figure_out_which_sequence_is_template(utility::vector1< sequence::SequenceOP > const &sequences_from_alignment, pose::Pose const &template_pose, std::string const &template_file_name)
Definition: rna_thread.cc:296
ocstream cout(std::cout)
Wrapper around std::cout.
Definition: ocstream.hh:287
BooleanOptionKey const exit("options:exit")
Definition: OptionKeys.hh:51
vector1: std::vector with 1-based indexing
xyzMatrix: Fast 3x3 xyz matrix template
std::string make_tag(utility::vector1< int > res_vector)
Definition: string_util.cc:552
ozstream: Output file stream wrapper for uncompressed and compressed files
Definition: ozstream.hh:53
tuple seq
Definition: PyMOL_demo.py:72
Some std::string helper functions.
void prepare_threaded_model(pose::Pose &pose, std::string const &target_sequence_from_alignment, std::string const &template_sequence_from_alignment, ObjexxFCL::FArray1D_bool &sequence_mask, Size offset=0)
Definition: rna_thread.cc:206
Fast (x,y,z)-coordinate numeric vector.
void * my_main(void *)
Definition: rna_thread.cc:396
platform::Size Size
Definition: random.fwd.hh:30
rule< Scanner, option_closure::context_t > option
Definition: Tag.cc:378
size_type size() const
Active Array Size.
Definition: FArray.hh:621
FArray1D: Fortran-Compatible 1D Array.
Definition: FArray1.hh:30
numeric::xyzMatrix< Real > Matrix
Definition: rna_thread.cc:81