Rosetta
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
sewing_hasher.cc
Go to the documentation of this file.
1 // -*- mode:c++;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t -*-
2 // vi: set ts=2 noet;
3 //
4 // (c) Copyright Rosetta Commons Member Institutions.
5 // (c) This file is part of the Rosetta software suite and is made available under license.
6 // (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
7 // (c) For more information, see http://www.rosettacommons.org. Questions about this can be
8 // (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
9 
10 /// @file Apps/sewing_hasher.cc
11 ///
12 /// @brief An MPI-enabled application that reads in a Hasher hashtable, scores each model in that table against all others, and
13 /// generates a SewGraph file.
14 ///
15 /// @author Tim Jacobs
16 
17 //Package headers
18 #include <devel/init.hh>
19 #include <protocols/sewing/hashing/Hasher.hh>
20 #include <protocols/sewing/conformation/Model.hh>
21 #include <protocols/sewing/util/io.hh>
22 
23 //Protocol headers
24 #include <core/types.hh>
25 #include <core/id/AtomID.hh>
26 #include <core/pose/Pose.hh>
27 #include <core/import_pose/import_pose.hh>
28 
29 //Utility headers
30 #include <utility/io/izstream.hh>
31 #include <utility/mpi_util.hh>
32 #include <utility/string_util.hh>
33 #include <utility/vector1.hh>
35 #include <utility/file/FileName.hh>
36 
37 #include <basic/options/option.hh>
38 #include <basic/options/keys/sewing.OptionKeys.gen.hh>
39 #include <basic/options/keys/in.OptionKeys.gen.hh>
40 #include <basic/options/keys/inout.OptionKeys.gen.hh>
41 #include <basic/Tracer.hh>
43 
44 //C++ headers
45 #include <map>
46 #include <set>
47 #include <iterator>
48 
49 static basic::Tracer TR("sewing_hasher");
50 
51 int
52 main( int argc, char * argv [] ) {
53  using namespace basic::options;
54  using namespace basic::options::OptionKeys;
55  using namespace protocols::sewing;
56 
57  try {
58  // initialize core and read options
59  devel::init(argc, argv);
60 
61  if ( !option[sewing::mode].user() ) {
62  std::stringstream err;
63  err << "You must provide a mode for sewing_hasher to run in using the -sewing:mode flag. Valid options are" << std::endl;
64  err << " -generate: generates a model file from an sqlite database" << std::endl;
65  err << " -generate_five_ss_model: generates a 3~5 ss model file from an sqlite database" << std::endl;
66  err << " -hash: score all models against each other and create a plain text score file (MPI required)" << std::endl;
67  err << " -convert: convert a plain text score file to a binary score file. This is required by the SEWING movers" << std::endl;
68  utility_exit_with_message(err.str());
69  }
70 
71  //Check for model file (either for reading or writing)
72  std::map< int, Model > models;
73  std::string model_filename = option[sewing::model_file_name];
74  if ( !option[sewing::model_file_name].user() ) {
75  std::stringstream err;
76  err << "You must provide a model file name to the sewing_hasher using the -model_file_name flag. To generate a new model file use the "
77  << "-generate_models_from_db flag and a new model file with that name will be written. Otherwise, the model file will be read";
78  utility_exit_with_message(err.str());
79  }
80 
81  //////////////////////////// MODEL GENERATION ///////////////////////////////////
82 
83  //Are we just generating a model file?
84  if ( option[sewing::mode].value() == "generate" ) {
85 
86  //Create comments stream for model file and add the date
87  std::stringstream comments;
88  //timestamp became commented by Vikram to prevent daily integration failure
89  //time_t t = time(0); // get time now
90  //struct tm * now = localtime( & t );
91  //comments << "#Model file created on " << (now->tm_year + 1900) << '-'
92  // << (now->tm_mon + 1) << '-'
93  // << now->tm_mday
94  // << std::endl;
95 
96  bool hash_tag_only_terminal_Es = option[sewing::hash_tag_only_terminal_Es].def(false);
97  TR << "hash_tag_only_terminal_Es: " << hash_tag_only_terminal_Es << std::endl;
98  std::string hash_between;
99  //std::string model_three_ss_filename;
100 
101  //Generate model file from a list of PDBs. All models will have 1 segment
102  if ( option[ in::file::l ].user() ) {
103  comments << "#Models generated from PDB input (-l flag)" << std::endl;
104  utility::vector1<utility::file::FileName> input_lists( option[ in::file::l ]() );
106  for ( core::Size i = 1; i <= input_lists.size(); i++ ) {
107  utility::io::izstream current_input_list( input_lists[i] );
108  if ( !current_input_list.good() ) {
109  utility_exit_with_message("unable to open input file file: "+input_lists[i].name()+"\n");
110  }
111  while ( current_input_list.good() ) {
112  std::string name;
113  current_input_list.getline(name);
114  if ( current_input_list.good() ) pdb_library.push_back( utility::file::FileName(name) );
115  }
116  }
117 
118  for ( core::Size i=1; i<=pdb_library.size(); ++i ) {
120  core::import_pose::pose_from_pdb(pose, pdb_library[i]);
122  segments.push_back(std::make_pair(1, pose.total_residue()));
123  Model pdb_model = create_model_from_pose(pose, segments, (int)i);
124  models.insert(std::make_pair(i, pdb_model));
125  }
126 
127  } else {
128  //Generate models from a features database. Each segment is a single piece of secondary structure
129 
130  if ( hash_tag_only_terminal_Es ) {
131  hash_between = "hash_tag_only_terminal_Es";
132  comments << "#Only terminal Es are hash bool true to be merged with other nodes later (but as of 2015/11/14, later model assembly has 'ERROR: alignment_scores.size() == 1'" << std::endl;
133  // model_three_ss_filename = model_filename + "_three_ss_will_be_hashed_only_between_Es";
134  //(not possible due to integration test)
135  } else {
136  hash_between = "hash_between_any_HEs";
137  comments << "# hash between any HEs are bool true to be merged with other nodes" << std::endl;
138  // model_three_ss_filename = model_filename + "_three_ss_will_be_hashed_between_any_HEs";
139  //(not possible due to integration test)
140  }
141 
142  if ( !option[ sewing::assembly_type ].user() ) {
143  std::stringstream err;
144  err << "You must provide an assembly_type (continuous or discontinuous) with the -sewing:assembly_type flag in order to extract models";
145  utility_exit_with_message(err.str());
146  }
147  if ( option[ sewing::assembly_type ].value() == "discontinuous" ) {
148  comments << "#Discontinuous models generated from sqlite database " << option[inout::dbms::database_name].value() << std::endl;
149  models = get_discontinuous_models_from_db();
150  } else if ( option[ sewing::assembly_type ].value() == "continuous" ) {
151  comments << "#Continuous models generated from sqlite database " << option[inout::dbms::database_name].value() << std::endl;
152  models = get_continuous_models_from_db(hash_between);
153  }
154  }
155 
156  write_model_file(comments.str(), models, model_filename);
157  TR << "New model file " << model_filename << " successfully written." << std::endl;
158  std::exit(0);
159  } else if ( option[sewing::mode].value() == "generate_five_ss_model" ) {
160 
161  //Create comments stream for model file and add the date
162  std::stringstream comments;
163  //timestamp became commented by Vikram to prevent daily integration failure
164  //time_t t = time(0); // get time now
165  //struct tm * now = localtime( & t );
166  //comments << "#Model file created on " << (now->tm_year + 1900) << '-'
167  // << (now->tm_mon + 1) << '-'
168  // << now->tm_mday
169  // << std::endl;
170 
171  //Generate models from a features database. Each segment is a single piece of secondary structure
172  comments << "# 3 or 5 secondary structures based models generated from sqlite database " << option[inout::dbms::database_name].value() << std::endl;
173 
174  bool hash_tag_only_terminal_Es = option[sewing::hash_tag_only_terminal_Es].def(false);
175  TR << "hash_tag_only_terminal_Es: " << hash_tag_only_terminal_Es << std::endl;
176  std::string hash_between;
177  std::string model_five_ss_filename;
178  if ( hash_tag_only_terminal_Es ) {
179  hash_between = "hash_tag_only_terminal_Es";
180  comments << "# only_terminal_Es_are_hash_bool_true_to_be_merged_with_other_node " << std::endl;
181  model_five_ss_filename = model_filename + "_three_or_five_ss_will_be_hashed_only_between_Es";
182  } else {
183  hash_between = "hash_between_any_HEs";
184  comments << "# hash_between_any_HEs_are_bool_true_to_be_merged_with_other_node " << std::endl;
185  model_five_ss_filename = model_filename + "_three_or_five_ss_will_be_hashed_between_any_HEs";
186  }
187 
188  std::map< int, Model > models = get_5_ss_models_from_db(hash_between);
189 
190  write_model_file(comments.str(), models, model_five_ss_filename);
191  TR << "New model file with 3~5 ss " << model_five_ss_filename << " successfully written." << std::endl;
192 
193  std::exit(0);
194  }
195 
196  //If we aren't generating models, then we need to read them
197  models = read_model_file(model_filename);
198 
199  ///////////////////////// BINARY FILE TESTING //////////////////////////////
200 
201  if ( option[sewing::mode].value() == "test" ) {
202  std::string binary_filename = option[sewing::score_file_name].value();
203  SewGraphOP graph( new SewGraph(models, 1) );
204  graph->report_binary_stats(models, binary_filename);
205  }
206 
207  ///////////////////////// MODEL CONVERSION TO BINARY //////////////////////////////
208 
209  //If we are generating a binary file then do that and exit
210  if ( option[sewing::mode].value() == "convert" ) {
211  if ( !option[sewing::score_file_name].user() ) {
212  std::stringstream err;
213  err << "You must provide a score file name to the sewing_hasher for binary conversion.";
214  utility_exit_with_message(err.str());
215 
216  }
217  std::string binary_filename = utility::to_string(option[sewing::score_file_name].value())+utility::to_string(".bin");
218 
219  SewGraphOP graph;
220  if ( option[ sewing::assembly_type ].value() == "discontinuous" ) {
221  graph = SewGraphOP( new SewGraph(models, 2) );
222  } else if ( option[ sewing::assembly_type ].value() == "continuous" ) {
223  graph = SewGraphOP( new SewGraph(models, 1) );
224  }
225  graph->generate_binary_score_file(option[sewing::score_file_name].value(), binary_filename);
226  std::exit(0);
227  }
228 
229 
230  //////////// MODEL COMPARISON USING GEOMETRIC HASHING /////////////////
231 
232  if ( option[sewing::mode].value() == "hash" ) {
233 
234  if ( !option[sewing::score_file_name].user() ) {
235  utility_exit_with_message("You must provide a graph file name to the sewing_hasher using -score_file_name");
236  }
237  std::string score_file_name = option[sewing::score_file_name];
238 
239  core::Size min_score = option[sewing::min_hash_score].def(10);
240  core::Size max_clash_score = option[sewing::max_clash_score].def(0);
241  core::Size num_segments_to_match = option[sewing::num_segments_to_match].def(0);
242  core::Size box_length = option[sewing::box_length].def(3);
243 
244  TR << "Bundle Hasher options:" << std::endl;
245  TR << "\tMinimum Score: " << min_score << std::endl;
246  TR << "\tMaximum Clash Score: " << max_clash_score << std::endl;
247  TR << "\tNumber of segments to match: " << num_segments_to_match << std::endl;
248  TR << "\tScore file name: " << score_file_name << std::endl;
249  TR << "\tNeighborhood lookup box_length: " << box_length << std::endl;
250 
251 #ifdef USEMPI
252  //Now, score models. Split this work between multiple processors if we have them
253  core::Size n_models;
254  if ( option[sewing::max_models].user() ) {
255  n_models = std::min( (core::Size)option[sewing::max_models], models.size() );
256  } else {
257  n_models = models.size();
258  }
259 
260  int starting_model = 0;
261  if ( option[sewing::starting_model].user() ) {
262  starting_model = option[sewing::starting_model].value();
263  }
264 
265  core::Size rank = utility::mpi_rank();
266  if(rank == 0) {
267  core::Size num_procs = utility::mpi_nprocs();
268 
269  TR << "Master node has " << n_models << " jobs to submit" << std::endl;
270 
271  //send out initial jobs
272  std::map< int, Model >::const_iterator it = models.begin();
273  ++it; //Increment the iterator (no need to score the first model against just itself)
274 
275  std::map< int, Model >::const_iterator it_end = models.begin();
276  std::advance(it_end, n_models);
277 
278  if(num_procs > n_models) {
279  utility_exit_with_message("You have more processors than number of models to hash. Reduce your number of processors for efficiency");
280  }
281 
282  core::Size curr_proc = 1;
283  for(; it != it_end; ++it) {
284  //send *it to curr_proc
285  if(it->first >= starting_model) {
286  utility::send_integer_to_node(curr_proc,it->first);
287  TR << "Master node sent a job for model " << it->first << " to processor " << curr_proc << std::endl;
288  ++curr_proc;
289  if(curr_proc == num_procs) { break; }
290  }
291  }
292 
293  ++it;//Increment iterator once more to account for the last model sent in the above loop
294  while( it != it_end ){
295  //wait for a processor to send you a message
296  //send more work
298  TR << "Master node received a message from processor " << received_node << std::endl;
299  utility::send_integer_to_node(received_node,it->first);
300  TR << "Master node sent a new job for model " << it->first << " to processor " << received_node << std::endl;
301  ++it;
302  }
303 
304  //Once we're done with all the work, tell each processor to spin down
305  core::Size counter = 1;
306  while(counter != num_procs){
308  utility::send_integer_to_node(received_node,0);
309  ++counter;
310  }
311  TR << "Master node finished sending jobs" << std::endl;
312  }
313 
314  else {
315  while(true) {
316  //Recieve message from master node
317  int model_id = utility::receive_integer_from_node(0);
318  if(model_id == 0){ break; }
319 
320  TR << "Processor " << rank << " received job for model number " << model_id << std::endl;
321 
322  //Breakup the models into groups of 1000 for scoring. This make the memory demand *much* less
323  //with a minor hit to scoring speed.
324  std::map< int, Model >::const_iterator it = models.begin();
325  std::map< int, Model >::const_iterator it_end = models.find(model_id);
326  while(true) {
327  core::Size counter = 0;
328  Hasher hasher;
329  for(; it != it_end; ++it) {
330  hasher.insert(it->second);
331  ++counter;
332  if(counter > 100) {
333  ++it;
334  break;
335  }
336  }
337  //ScoreResults group_scores = hasher.score(models[model_id], num_segments_to_match, min_score, max_clash_score, false);
338  ScoreResults group_scores = hasher.score(models[model_id], num_segments_to_match, min_score, max_clash_score, false, box_length);
339  hasher.remove_connection_inconsistencies(models, group_scores);
340  std::string node_score_file_name = score_file_name + "." + utility::to_string(rank);
341  write_hashing_scores_to_file(group_scores, node_score_file_name);
342  TR << "Processor " << rank << " done scoring. Found " << group_scores.size() << " valid comparisons" << std::endl;
343  if(it == it_end) {
344  break;
345  }
346  }
347 
348 // //Begin work
349 // Hasher hasher;
350 // ScoreResults scores;
351 // for(; it != it_end; ++it) {
352 // if(it->first == model_id) {
353 // TR << "Processor " << rank << " begin scoring " << model_id << std::endl;
354 // scores = hasher.score(it->second, 1, min_score, 0, false);
355 // break;
356 // }
357 // hasher.insert(it->second);
358 // }
359 // hasher.remove_connection_inconsistencies(models, scores);
360 
361  //std::string node_score_file_name = score_file_name + "." + utility::to_string(rank);
362  //write_hashing_scores_to_file(group_scores, node_score_file_name);
363 
364  //Send message back to master node
366  }
367  TR << "Processor " << rank << " was told to stop working" << std::endl;
368  }
369  MPI_Barrier( MPI_COMM_WORLD ); // make all nodes reach this point together.
370  MPI_Finalize();
371 #else
372  Hasher hasher;
373  std::map< int, Model >::const_iterator it1 = models.begin();
374  std::map< int, Model >::const_iterator it_end = models.end();
375  std::map< int, Model >::const_iterator it2 = models.begin();
376  for ( ; it1 != it_end; ++it1 ) {
377  ScoreResults scores;
378  for ( ; it2 != it1; ++it2 ) {
379  hasher.insert(it2->second);
380  }
381  //scores = hasher.score(it1->second, num_segments_to_match, min_score, max_clash_score, true);
382  scores = hasher.score(it1->second, num_segments_to_match, min_score, max_clash_score, true, box_length);
383  hasher.remove_connection_inconsistencies(models, scores);
384  TR << "Done scoring " << it2->first << " found " << scores.size() << " valid comparisons" << std::endl;
385  if ( scores.size() > 0 && TR.Debug ) {
386  BasisPair bp = scores.begin()->first;
387  std::map< SegmentPair, core::Size > segment_matches = scores.begin()->second.segment_match_counts;
388  std::map< SegmentPair, AtomMap > segment_matches2 = scores.begin()->second.segment_matches;
389  TR.Debug << "After scoring." << std::endl;
390  TR.Debug << "\tModels: " << bp.first.model_id << " " << bp.second.model_id << std::endl;
391  TR.Debug << "\tBasis Residues: " << bp.first.resnum << " " << bp.second.resnum << std::endl;
392  TR.Debug << "\tNumber of matched segments: " << segment_matches.size() << std::endl;
393  std::map< SegmentPair, core::Size >::const_iterator it = segment_matches.begin();
394  std::map< SegmentPair, core::Size >::const_iterator it_end = segment_matches.end();
395  std::map< SegmentPair, AtomMap >::const_iterator it2 = segment_matches2.begin();
396  for ( ; it != it_end; ++it ) {
397  TR.Debug << "\tSegments " << it->first.first << " and " << it->first.second << " have " << it->second << " overlapping atoms." << std::endl;
398  TR.Debug << "\tSegments " << it2->first.first << " and " << it2->first.second << " have " << it2->second.size() << " overlapping atoms." << std::endl;
399  ++it2;
400  }
401  }
402  write_hashing_scores_to_file(scores, score_file_name);
403  }
404 #endif
405  }
406  } catch ( utility::excn::EXCN_Base& excn ) {
407  std::cerr << "Exception : " << std::endl;
408  excn.show( std::cerr );
409  return -1;
410  }
411  return 0;
412 }
ocstream cerr(std::cerr)
Wrapper around std::cerr.
Definition: ocstream.hh:290
#define utility_exit_with_message(m)
Exit with file + line + message.
Definition: exit.hh:47
static T min(T x, T y)
Definition: Svm.cc:16
int main(int argc, char *argv[])
static basic::Tracer TR("sewing_hasher")
bool good() const
Good?
Definition: izstream.hh:572
int receive_integer_from_node(int source)
Use MPI to receive a single integer from a particular node.
Definition: mpi_util.cc:336
utility::keys::KeyLookup< KeyType >::const_iterator const_iterator
Key collection iterators.
izstream & getline(char *line, std::streamsize const count)
Get the rest of the line.
Definition: izstream.hh:330
void init(int argc, char *argv[])
Command line init() version.
Definition: init.cc:23
BooleanOptionKey const user("options:user")
Definition: OptionKeys.hh:40
core::pose::Pose Pose
Definition: supercharge.cc:101
File name class supporting Windows and UN*X/Linux format names.
Definition: FileName.hh:37
member1 value
Definition: Tag.cc:296
virtual void show(std::ostream &) const =0
static THREAD_LOCAL basic::Tracer TR("basic.options")
Tracer IO system.
izstream: Input file stream wrapper for uncompressed and compressed files
Definition: izstream.hh:44
Input file stream wrapper for uncompressed and compressed files.
void send_integer_to_node(int destination, int message)
Definition: mpi_util.cc:348
std::vector with 1-based indexing
Definition: vector1.fwd.hh:44
File name class supporting Windows and UN*X/Linux format names.
int receive_integer_from_anyone()
Use MPI to wait until some node sends an integer – usually its own mpi_rank so that it can send furt...
Definition: mpi_util.cc:276
rule< Scanner, options_closure::context_t > options
Definition: Tag.cc:377
utility::options::OptionCollection option
OptionCollection global.
Definition: option.cc:45
BooleanOptionKey const exit("options:exit")
Definition: OptionKeys.hh:51
vector1: std::vector with 1-based indexing
int mpi_rank()
Definition: mpi_util.cc:254
Class for handling user debug/warnings/errors. Use instance of this class instead of 'std::cout' for ...
Definition: Tracer.hh:134
rule< Scanner, string_closure::context_t > name
Definition: Tag.cc:376
std::string to_string(const T &t)
Definition: string_util.hh:205
string mode
Definition: contacts.py:32
Some std::string helper functions.
Program options global and initialization function.
int mpi_nprocs()
Definition: mpi_util.cc:265
platform::Size Size
Definition: random.fwd.hh:30
TracerProxy Debug
Definition: Tracer.hh:262