Rosetta
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
sewing_hasher.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet;
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file Apps/sewing_hasher.cc
11 ///
12 /// @brief An MPI-enabled application that reads in a Hasher hashtable, scores each model in that table against all others, and
13 /// generates a SewGraph file.
14 ///
15 /// @author Tim Jacobs
16 
17 //Package headers
18 #include <devel/init.hh>
19 #include <protocols/sewing/hashing/Hasher.hh>
20 #include <protocols/sewing/conformation/Model.hh>
21 #include <protocols/sewing/util/io.hh>
22 
23 //Protocol headers
24 #include <core/types.hh>
25 #include <core/id/AtomID.hh>
26 #include <core/pose/Pose.hh>
27 #include <core/import_pose/import_pose.hh>
28 
29 //Utility headers
30 #include <utility/io/izstream.hh>
31 #include <utility/mpi_util.hh>
32 #include <utility/string_util.hh>
33 #include <utility/vector1.hh>
35 #include <utility/file/FileName.hh>
36 
37 #include <basic/options/option.hh>
41 #include <basic/Tracer.hh>
43 
44 //C++ headers
45 #include <map>
46 #include <set>
47 #include <iterator>
48 
49 static basic::Tracer TR("sewing_hasher");
50 
51 int
52 main( int argc, char * argv [] ) {
53  using namespace basic::options;
54  using namespace basic::options::OptionKeys;
55  using namespace protocols::sewing;
56 
57  try {
58  // initialize core and read options
59  devel::init(argc, argv);
60 
61  if ( !option[sewing::mode].user() ) {
62  std::stringstream err;
63  err << "You must provide a mode for sewing_hasher to run in using the -sewing:mode flag. Valid options are" << std::endl;
64  err << " -generate: generates a model file from an sqlite database" << std::endl;
65  err << " -generate_five_ss_model: generates a 3~5 ss model file from an sqlite database" << std::endl;
66  err << " -hash: score all models against each other and create a plain text score file (MPI required)" << std::endl;
67  err << " -convert: convert a plain text score file to a binary score file. This is required by the SEWING movers" << std::endl;
68  utility_exit_with_message(err.str());
69  }
70 
71  //Check for model file (either for reading or writing)
72  std::map< int, Model > models;
73  std::string model_filename = option[sewing::model_file_name];
75  std::stringstream err;
76  err << "You must provide a model file name to the sewing_hasher using the -model_file_name flag. To generate a new model file use the "
77  << "-generate_models_from_db flag and a new model file with that name will be written. Otherwise, the model file will be read";
78  utility_exit_with_message(err.str());
79  }
80 
81  //////////////////////////// MODEL GENERATION ///////////////////////////////////
82 
83  //Are we just generating a model file?
84  if ( option[sewing::mode].value() == "generate" ) {
85 
86  //Create comments stream for model file and add the date
87  std::stringstream comments;
88  //timestamp became commented by Vikram to prevent daily integration failure
89  //time_t t = time(0); // get time now
90  //struct tm * now = localtime( & t );
91  //comments << "#Model file created on " << (now->tm_year + 1900) << '-'
92  // << (now->tm_mon + 1) << '-'
93  // << now->tm_mday
94  // << std::endl;
95 
97  TR << "hash_tag_only_terminal_Es: " << hash_tag_only_terminal_Es << std::endl;
98  std::string hash_between;
99  //std::string model_three_ss_filename;
100 
101  //Generate model file from a list of PDBs. All models will have 1 segment
102  if ( option[ in::file::l ].user() ) {
103  comments << "#Models generated from PDB input (-l flag)" << std::endl;
106  for ( core::Size i = 1; i <= input_lists.size(); i++ ) {
107  utility::io::izstream current_input_list( input_lists[i] );
108  if ( !current_input_list.good() ) {
109  utility_exit_with_message("unable to open input file file: "+input_lists[i].name()+"\n");
110  }
111  while ( current_input_list.good() ) {
112  std::string name;
113  current_input_list.getline(name);
114  if ( current_input_list.good() ) pdb_library.push_back( utility::file::FileName(name) );
115  }
116  }
117 
118  for ( core::Size i=1; i<=pdb_library.size(); ++i ) {
120  core::import_pose::pose_from_file(pose, pdb_library[i], core::import_pose::PDB_file);
122  segments.push_back(std::make_pair(1, pose.total_residue()));
123  Model pdb_model = create_model_from_pose(pose, segments, (int)i);
124  models.insert(std::make_pair(i, pdb_model));
125  }
126 
127  } else {
128  //Generate models from a features database. Each segment is a single piece of secondary structure
129 
130  if ( hash_tag_only_terminal_Es ) {
131  hash_between = "hash_tag_only_terminal_Es";
132  comments << "#Only terminal Es are hash bool true to be merged with other nodes later (but as of 2015/11/14, later model assembly has 'ERROR: alignment_scores.size() == 1'" << std::endl;
133  // model_three_ss_filename = model_filename + "_three_ss_will_be_hashed_only_between_Es";
134  //(not possible due to integration test)
135  } else {
136  hash_between = "hash_between_any_HEs";
137  comments << "# hash between any HEs are bool true to be merged with other nodes" << std::endl;
138  // model_three_ss_filename = model_filename + "_three_ss_will_be_hashed_between_any_HEs";
139  //(not possible due to integration test)
140  }
141 
142  if ( !option[ sewing::assembly_type ].user() ) {
143  std::stringstream err;
144  err << "You must provide an assembly_type (continuous or discontinuous) with the -sewing:assembly_type flag in order to extract models";
145  utility_exit_with_message(err.str());
146  }
147  if ( option[ sewing::assembly_type ].value() == "discontinuous" ) {
148  comments << "#Discontinuous models generated from sqlite database " << option[inout::dbms::database_name].value() << std::endl;
149  models = get_discontinuous_models_from_db();
150  } else if ( option[ sewing::assembly_type ].value() == "continuous" ) {
151  comments << "#Continuous models generated from sqlite database " << option[inout::dbms::database_name].value() << std::endl;
152  models = get_continuous_models_from_db(hash_between);
153  }
154  }
155 
156  write_model_file(comments.str(), models, model_filename);
157  TR << "New model file " << model_filename << " successfully written." << std::endl;
158  std::exit(0);
159  } else if ( option[sewing::mode].value() == "generate_five_ss_model" ) {
160 
161  //Create comments stream for model file and add the date
162  std::stringstream comments;
163  //timestamp became commented by Vikram to prevent daily integration failure
164  //time_t t = time(0); // get time now
165  //struct tm * now = localtime( & t );
166  //comments << "#Model file created on " << (now->tm_year + 1900) << '-'
167  // << (now->tm_mon + 1) << '-'
168  // << now->tm_mday
169  // << std::endl;
170 
171  //Generate models from a features database. Each segment is a single piece of secondary structure
172  comments << "# 3 or 5 secondary structures based models generated from sqlite database " << option[inout::dbms::database_name].value() << std::endl;
173 
175  TR << "hash_tag_only_terminal_Es: " << hash_tag_only_terminal_Es << std::endl;
176  std::string hash_between;
177  std::string model_five_ss_filename;
178  if ( hash_tag_only_terminal_Es ) {
179  hash_between = "hash_tag_only_terminal_Es";
180  comments << "# only_terminal_Es_are_hashed " << std::endl;
181  model_five_ss_filename = model_filename + "_three_or_five_ss_will_be_hashed_only_between_Es";
182  } else {
183  hash_between = "hash_between_any_HEs";
184  comments << "# will_be_merged_between_any_HEs " << std::endl;
185  model_five_ss_filename = model_filename + "_three_or_five_ss_will_be_hashed_between_any_HEs";
186  }
187 
188  std::map< int, Model > models = get_5_ss_models_from_db(hash_between);
189 
190  write_model_file(comments.str(), models, model_five_ss_filename);
191  TR << "New model file with 3~5 ss " << model_five_ss_filename << " successfully written." << std::endl;
192 
193  std::exit(0);
194  }
195 
196  //If we aren't generating models, then we need to read them
197  models = read_model_file(model_filename);
198 
199  ///////////////////////// BINARY FILE TESTING //////////////////////////////
200 
201  if ( option[sewing::mode].value() == "test" ) {
202  std::string binary_filename = option[sewing::score_file_name].value();
203  SewGraphOP graph( new SewGraph(models, 1) );
204  graph->report_binary_stats(models, binary_filename);
205  }
206 
207  ///////////////////////// MODEL CONVERSION TO BINARY //////////////////////////////
208 
209  //If we are generating a binary file then do that and exit
210  if ( ( option[sewing::mode].value() == "convert" ) ) {
211  if ( !option[sewing::score_file_name].user() ) {
212  std::stringstream err;
213  err << "You must provide a score file name to the sewing_hasher for binary conversion.";
214  utility_exit_with_message(err.str());
215 
216  }
217  std::string binary_filename = utility::to_string(option[sewing::score_file_name].value())+utility::to_string(".bin");
218 
219  SewGraphOP graph;
220  if ( option[ sewing::assembly_type ].value() == "discontinuous" ) {
221  graph = SewGraphOP( new SewGraph(models, 2) );
222  } else if ( option[ sewing::assembly_type ].value() == "continuous" ) {
223  graph = SewGraphOP( new SewGraph(models, 1) );
224  }
225  graph->generate_binary_score_file(option[sewing::score_file_name].value(), binary_filename);
226  std::exit(0);
227  }
228 
229 
230  //////////// MODEL COMPARISON USING GEOMETRIC HASHING /////////////////
231 
232  if ( ( option[sewing::mode].value() == "hash" ) ) {
233  if ( !option[sewing::score_file_name].user() ) {
234  utility_exit_with_message("You must provide a graph file name to the sewing_hasher using -score_file_name");
235  }
240  // box_length 3 is for neighborhood lookup box size 27 (=3^3), while box_length 5 is for neighborhood lookup box size 125 (=5^3)
241 
243 
244 
245  ///// Size to string reference: http://www.cplusplus.com/articles/D9j2Nwbp/
246  std::ostringstream convert_min_hash_score;
247  std::string min_hash_score_string;
248  convert_min_hash_score << min_hash_score;
249  min_hash_score_string = convert_min_hash_score.str();
250 
251  std::ostringstream convert_max_clash_score; // using single 'std::ostringstream convert' many times stopped to initialize convert, so I begin to use many 'std::ostringstream convert' since 2016/1/19
252  std::string max_clash_score_string;
253  convert_max_clash_score << max_clash_score;
254  max_clash_score_string = convert_max_clash_score.str();
255 
256  std::ostringstream convert_num_segments_to_match;
257  std::string num_segments_to_match_string;
258  convert_num_segments_to_match << num_segments_to_match;
259  num_segments_to_match_string = convert_num_segments_to_match.str();
260 
261  std::ostringstream convert_box_length;
262  std::string box_length_string;
263  convert_box_length << box_length;
264  box_length_string = convert_box_length.str();
265 
266  /*
267  OK with release compilation, but not ok with mac clang debug
268  "src/apps/public/sewing/sewing_hasher.cc:249:74: error: cannot take the address of an rvalue of type 'std::__1::basic_ostringstream<char, std::__1::char_traits<char>, std::__1::allocator<char> >'
269  std::string min_hash_score_string = static_cast<std::ostringstream*>( &(std::ostringstream() << min_hash_score) )->str();
270  ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
271 
272  std::string min_hash_score_string = static_cast<std::ostringstream*>( &(std::ostringstream() << min_hash_score) )->str();
273  std::string max_clash_score_string = static_cast<std::ostringstream*>( &(std::ostringstream() << max_clash_score) )->str();
274  std::string num_segments_to_match_string = static_cast<std::ostringstream*>( &(std::ostringstream() << num_segments_to_match) )->str();
275  std::string box_length_string = static_cast<std::ostringstream*>( &(std::ostringstream() << box_length) )->str();
276  */
277 
278  std::string hash_target = option[sewing::mode].value();
279 
280 
281  // Keep this comment! Customized_score_file_name is very useful, Doonam just commented here to avoid integration test failure
282  // score_file_name = score_file_name + "_" + hash_target + "_" + min_hash_score_string + "_min_hash_score_" + max_clash_score_string + "_max_clash_score_" + num_segments_to_match_string + "_num_segments_to_match_" + box_length_string + "_box_length";
283  ///////////////////
284 
285 
286  TR << "Bundle Hasher options:" << std::endl;
287  TR << "\tMinimum Score: " << min_hash_score << std::endl;
288  TR << "\tMaximum Clash Score: " << max_clash_score << std::endl;
289  TR << "\tNumber of segments to match: " << num_segments_to_match << std::endl;
290  TR << "\tScore file name: " << score_file_name << std::endl;
291  TR << "\tNeighborhood lookup box_length: " << box_length << std::endl;
292 
293 #ifdef USEMPI
294  //Now, score models. Split this work between multiple processors if we have them
295  core::Size n_models;
296  if ( option[sewing::max_models].user() ) {
297  n_models = std::min( (core::Size)option[sewing::max_models], models.size() );
298  } else {
299  n_models = models.size();
300  }
301 
302  int starting_model = 0;
304  starting_model = option[sewing::starting_model].value();
305  }
306 
308  if(rank == 0) {
310 
311  TR << "Master node has " << n_models << " jobs to submit" << std::endl;
312 
313  //send out initial jobs
314  std::map< int, Model >::const_iterator it = models.begin();
315  ++it; //Increment the iterator (no need to score the first model against just itself)
316 
317  std::map< int, Model >::const_iterator it_end = models.begin();
318  std::advance(it_end, n_models);
319 
320  if(num_procs > n_models) {
321  utility_exit_with_message("You have more processors than number of models to hash. Reduce your number of processors for efficiency");
322  }
323 
324  core::Size curr_proc = 1;
325  for(; it != it_end; ++it) {
326  //send *it to curr_proc
327  if(it->first >= starting_model) {
328  utility::send_integer_to_node(curr_proc,it->first);
329  TR << "Master node sent a job for model " << it->first << " to processor " << curr_proc << std::endl;
330  ++curr_proc;
331  if(curr_proc == num_procs) { break; }
332  }
333  }
334 
335  ++it;//Increment iterator once more to account for the last model sent in the above loop
336  while( it != it_end ){
337  //wait for a processor to send you a message
338  //send more work
340  TR << "Master node received a message from processor " << received_node << std::endl;
341  utility::send_integer_to_node(received_node,it->first);
342  TR << "Master node sent a new job for model " << it->first << " to processor " << received_node << std::endl;
343  ++it;
344  }
345 
346  //Once we're done with all the work, tell each processor to spin down
347  core::Size counter = 1;
348  while(counter != num_procs){
350  utility::send_integer_to_node(received_node,0);
351  ++counter;
352  }
353  TR << "Master node finished sending jobs" << std::endl;
354  }
355 
356  else {
357  while(true) {
358  //Recieve message from master node
359  int model_id = utility::receive_integer_from_node(0);
360  if(model_id == 0){ break; }
361 
362  TR << "Processor " << rank << " received job for model number " << model_id << std::endl;
363 
364  //Breakup the models into groups of 1000 for scoring. This make the memory demand *much* less
365  //with a minor hit to scoring speed.
366  std::map< int, Model >::const_iterator it = models.begin();
367  std::map< int, Model >::const_iterator it_end = models.find(model_id);
368  while(true) {
369  core::Size counter = 0;
370  Hasher hasher;
371  for(; it != it_end; ++it) {
372  hasher.insert(it->second);
373  ++counter;
374  if(counter > 100) {
375  ++it;
376  break;
377  }
378  }
379  ScoreResults group_scores = hasher.score(models[model_id], num_segments_to_match, min_hash_score, max_clash_score, false, box_length);
380  hasher.remove_connection_inconsistencies(models, group_scores);
381  std::string node_score_file_name = score_file_name + "." + utility::to_string(rank);
382  write_hashing_scores_to_file(group_scores, node_score_file_name);
383  TR << "Processor " << rank << " done scoring. Found " << group_scores.size() << " valid comparisons" << std::endl;
384  if(it == it_end) {
385  break;
386  }
387  }
388 
389 // //Begin work
390 // Hasher hasher;
391 // ScoreResults scores;
392 // for(; it != it_end; ++it) {
393 // if(it->first == model_id) {
394 // TR << "Processor " << rank << " begin scoring " << model_id << std::endl;
395 // scores = hasher.score(it->second, 1, min_hash_score, 0, false);
396 // break;
397 // }
398 // hasher.insert(it->second);
399 // }
400 // hasher.remove_connection_inconsistencies(models, scores);
401 
402  //std::string node_score_file_name = score_file_name + "." + utility::to_string(rank);
403  //write_hashing_scores_to_file(group_scores, node_score_file_name);
404 
405  //Send message back to master node
407  }
408  TR << "Processor " << rank << " was told to stop working" << std::endl;
409  }
410  MPI_Barrier( MPI_COMM_WORLD ); // make all nodes reach this point together.
411  MPI_Finalize();
412 #else // so use 1 cpu
413  Hasher hasher;
414  std::map< int, Model >::const_iterator it1 = models.begin();
415  std::map< int, Model >::const_iterator it_end = models.end();
416  std::map< int, Model >::const_iterator it2 = models.begin();
417  for ( ; it1 != it_end; ++it1 ) {
418  ScoreResults scores;
419  for ( ; it2 != it1; ++it2 ) {
420  hasher.insert(it2->second); // it2->second is Model itself
421  }
422  TR << "current model id (it1): " << (it1->second).model_id_ << std::endl;
423  TR << "current model id (it2): " << (it2->second).model_id_ << std::endl;
424  scores = hasher.score(it1->second, num_segments_to_match, min_hash_score, max_clash_score, true, box_length);
427  }
429  TR << "do_not_remove_connection_inconsistencies: " << do_not_remove_connection_inconsistencies << std::endl;
430  if ( (!do_not_remove_connection_inconsistencies) ) {
431  //remove edges between segments that both have 'next' or 'previous' segments
432  hasher.remove_connection_inconsistencies(models, scores);
433  }
434  TR << "Done scoring the " << it2->first << "th model (this is not necessariliy a model_id) found (" << scores.size() << ") valid comparisons" << std::endl;
435  if ( scores.size() > 0 && TR.Debug ) {
436  BasisPair bp = scores.begin()->first;
437  std::map< SegmentPair, core::Size > segment_matches = scores.begin()->second.segment_match_counts;
438  std::map< SegmentPair, AtomMap > segment_matches2 = scores.begin()->second.segment_matches;
439  TR.Debug << "After scoring." << std::endl;
440  TR.Debug << "\tModels: " << bp.first.model_id << " " << bp.second.model_id << std::endl;
441  TR.Debug << "\tBasis Residues: " << bp.first.resnum << " " << bp.second.resnum << std::endl;
442  TR.Debug << "\tNumber of matched segments: " << segment_matches.size() << std::endl;
443  std::map< SegmentPair, core::Size >::const_iterator it = segment_matches.begin();
444  std::map< SegmentPair, core::Size >::const_iterator it_end = segment_matches.end();
445  std::map< SegmentPair, AtomMap >::const_iterator it2 = segment_matches2.begin();
446  for ( ; it != it_end; ++it ) {
447  TR.Debug << "\tSegments " << it->first.first << " and " << it->first.second << " have " << it->second << " overlapping atoms." << std::endl;
448  TR.Debug << "\tSegments " << it2->first.first << " and " << it2->first.second << " have " << it2->second.size() << " overlapping atoms." << std::endl;
449  ++it2;
450  }
451  }
452  write_hashing_scores_to_file(scores, score_file_name);
453  } //for ( ; it1 != it_end; ++it1 ) {
454 #endif
455  }// if ( ( option[sewing::mode].value() == "hash" ) )
456  } catch ( utility::excn::EXCN_Base& excn ) {
457  std::cerr << "Exception : " << std::endl;
458  excn.show( std::cerr );
459  return -1;
460  }
461  return 0;
462 }
ocstream cerr(std::cerr)
Wrapper around std::cerr.
Definition: ocstream.hh:290
IntegerOptionKey const num_procs
#define utility_exit_with_message(m)
Exit with file + line + message.
Definition: exit.hh:47
static T min(T x, T y)
Definition: Svm.cc:16
int main(int argc, char *argv[])
static basic::Tracer TR("sewing_hasher")
FileOptionKey const score_file_name
BooleanOptionKey const sewing
bool good() const
Good?
Definition: izstream.hh:572
int receive_integer_from_node(int source)
Use MPI to receive a single integer from a particular node.
Definition: mpi_util.cc:336
BooleanOptionKey const do_not_remove_connection_inconsistencies
IntegerOptionKey const num_segments_to_match
IntegerOptionKey const max_models
utility::keys::KeyLookup< KeyType >::const_iterator const_iterator
Key collection iterators.
izstream & getline(char *line, std::streamsize const count)
Get the rest of the line.
Definition: izstream.hh:330
IntegerOptionKey const box_length
void init(int argc, char *argv[])
Command line init() version.
Definition: init.cc:23
core::pose::Pose Pose
Definition: supercharge.cc:101
StringOptionKey const mode
Definition: contacts.py:32
File name class supporting Windows and UN*X/Linux format names.
Definition: FileName.hh:37
basic::options::OptionKeys collection
IntegerOptionKey const max_clash_score
IntegerOptionKey const rank
BooleanOptionKey const hash_tag_only_terminal_Es
member1 value
Definition: Tag.cc:296
virtual void show(std::ostream &) const =0
static THREAD_LOCAL basic::Tracer TR("basic.options")
FileOptionKey const model_file_name
IntegerOptionKey const min_hash_score
Tracer IO system.
izstream: Input file stream wrapper for uncompressed and compressed files
Definition: izstream.hh:44
Input file stream wrapper for uncompressed and compressed files.
void send_integer_to_node(int destination, int message)
Definition: mpi_util.cc:348
std::vector with 1-based indexing
Definition: vector1.fwd.hh:44
File name class supporting Windows and UN*X/Linux format names.
int receive_integer_from_anyone()
Use MPI to wait until some node sends an integer – usually its own mpi_rank so that it can send furt...
Definition: mpi_util.cc:276
IntegerOptionKey const starting_model
basic::options::OptionKeys collection
rule< Scanner, options_closure::context_t > options
Definition: Tag.cc:377
utility::options::OptionCollection option
OptionCollection global.
Definition: option.cc:46
BooleanOptionKey const exit("options:exit")
Definition: OptionKeys.hh:51
vector1: std::vector with 1-based indexing
int mpi_rank()
Definition: mpi_util.cc:254
Class for handling user debug/warnings/errors. Use instance of this class instead of 'std::cout' for ...
Definition: Tracer.hh:134
StringVectorOptionKey const scores
rule< Scanner, string_closure::context_t > name
Definition: Tag.cc:376
basic::options::OptionKeys collection
std::string to_string(const T &t)
Definition: string_util.hh:52
Some std::string helper functions.
Program options global and initialization function.
StringOptionKey const assembly_type
int mpi_nprocs()
Definition: mpi_util.cc:265
platform::Size Size
Definition: random.fwd.hh:30
TracerProxy Debug
Definition: Tracer.hh:262
FileVectorOptionKey const l