Rosetta
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
arls_impl.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # (c) Copyright Rosetta Commons Member Institutions.
3 # (c) This file is part of the Rosetta software suite and is made available under license.
4 # (c) The Rosetta software is developed by the contributing members of the Rosetta Commons.
5 # (c) For more information, see http://www.rosettacommons.org. Questions about this can be
6 # (c) addressed to University of Washington UW TechTransfer, email: license@u.washington.edu.
7 
8 import sys, os, re
9 from optparse import OptionParser, IndentedHelpFormatter
10 
11 try: set
12 except: from sets import Set as set
13 
14 # Better handle multiple paragraph descriptions.
15 class PreformattedDescFormatter (IndentedHelpFormatter):
16  def format_description(self, description):
17  return description.strip() + "\n" # Remove leading/trailing whitespace
18 
19 
20 def chain_from_iterables(iterables):
21  for it in iterables:
22  for el in it:
23  yield el
24 
25 
26 def find_file(regex, paths, required=True):
27  if isinstance(regex, str): regex = re.compile(regex)
28  for path in paths:
29  if not os.path.isdir(path): continue
30  hits = [ f for f in os.listdir(path) if regex.match(f) is not None ]
31  if len(hits) == 1:
32  return os.path.join(path, hits[0])
33  elif len(hits) > 1:
34  if required: raise ValueError("Multiple hits for '%s' in '%s' (try using the --compile-tag option)" % (regex.pattern, path))
35  else: return None
36  if required: raise ValueError("'%s' not found" % regex.pattern)
37  else: return None
38 
39 
40 class DockingCase(object):
41  # Caching ensures that each file gets prepared only once,
42  # and that cases that share files share DockingFile objects.
43  # Classes of files are separated b/c e.g. ligands and cofactors are treated differently
44  _protein_cache = {}
45  _cofactor_cache = {}
46  _ligand_cache = {}
47 
48  def __init__(self, paths, prot_cofs_lig):
49  protein_exts = ['pdb']
50  chemical_exts = ['mol', 'mol2', 'sdf']
51  self.protein = self._pcache(DockingFile(paths, prot_cofs_lig[0], protein_exts))
52  self.cofactors = [ self._ccache(DockingFile(paths, c, chemical_exts)) for c in prot_cofs_lig[1:-1] ]
53  self.ligand = self._lcache(DockingFile(paths, prot_cofs_lig[-1], chemical_exts))
54 
55  def _pcache(self, docking_file):
56  return self._protein_cache.setdefault(docking_file, docking_file)
57  def _ccache(self, docking_file):
58  return self._cofactor_cache.setdefault(docking_file, docking_file)
59  def _lcache(self, docking_file):
60  return self._ligand_cache.setdefault(docking_file, docking_file)
61 
62 
63 class DockingFile(object):
64  def __init__(self, paths, base, exts):
65  self.base = base
66  ext_re = r'\.(%s)$' % '|'.join(exts) # == e.g. r'\.(pdb|mol|mol2|sdf)$'
67  self.path = find_file(re.escape(base)+ext_re, paths)
68 
69  def __cmp__(self, other):
70  return cmp(self.path, other.path)
71  def __hash__(self):
72  return hash(self.path)
73 
74  def __str__(self):
75  return self.path
76  def __repr__(self):
77  return self.path
78 
79 
80 def find_programs(options):
81  PATH_paths = os.environ['PATH'].split(os.pathsep)
82  ros_paths = [os.path.join(options.mini,'bin')] + PATH_paths
83  if options.compile_tag is not None:
84  comp_tag = options.compile_tag.replace('.',r'\.')
85  else:
86  comp_tag = r'.+release'
87  scr_paths = [os.path.join(options.mini,'src','apps','public','ligand_docking'),
88  os.path.join(options.mini,'src','python','apps','public')] + PATH_paths
89  oe_paths = [os.path.join(options.openeye,'bin')] + PATH_paths
90  if 'OE_LICENSE' in os.environ:
91  oe_paths.append( os.path.join(os.path.dirname(os.environ['OE_LICENSE']),'bin') )
92  progs = {
93  'dock': find_file(r'ligand_dock\.'+comp_tag, ros_paths),
94  'rpkmin': find_file(r'ligand_rpkmin\.'+comp_tag, ros_paths),
95  'confidence': find_file(r'ligdock_confidence\.'+comp_tag, ros_paths, required=False),
96  'extract': find_file(r'extract_atomtree_diffs\.'+comp_tag, ros_paths),
97  'best_poses': find_file(r'select_best_unique_ligand_poses\.'+comp_tag, ros_paths, required=False),
98  'assign_charges': find_file(r'assign_charges\.py', scr_paths, required=not options.skip_charges),
99  'molfile_to_params': find_file(r'molfile_to_params\.py', scr_paths),
100  'pdb_to_molfile': find_file(r'pdb_to_molfile\.py', scr_paths),
101  'get_scores': find_file(r'get_scores\.py', scr_paths),
102  'plot_funnels': find_file(r'plot_funnels\.R', scr_paths),
103  'parallel': find_file(r'parallel\.py', scr_paths),
104  'omega': find_file(r'omega2', oe_paths, required=not options.skip_omega),
105  }
106  if options.skip_omega: progs['omega'] = None
107  if options.skip_charges: progs['assign_charges'] = None
108  return progs
109 
110 
111 def read_input_list(infile):
112  inputs = []
113  paths = [ os.getcwd() ]
114  for line in infile:
115  line = line.rstrip()
116  if line.startswith("#") or line == "": continue
117  fields = line.split()
118  if len(fields) < 2:
119  raise ValueError("Not enough fields in list of input structures: "+line.rstrip())
120  inputs.append( DockingCase(paths, fields) )
121  return inputs
122 
123 
124 def get_procs_per_case(docking_cases, options):
125  return max(1, int(options.njobs) // len(docking_cases))
126 
127 
128 def write_setup_script(outfile, options, progs, docking_cases): #{{{
129  proteins = set([ d.protein for d in docking_cases ])
130  cofactors = set( chain_from_iterables([d.cofactors for d in docking_cases]) )
131  ligands = set([ d.ligand for d in docking_cases ])
132 
133  outfile.write('''#!/bin/bash
134 set -v
135 
136 # Set up directory structure
137 mkdir -p ligand cofactor unbound input native
138 
139 # Make ligands
140 pushd ligand
141 mkdir -p {fa,cen}/{conf1,confs,kins,withxtal}
142 
143 ''')
144  if progs['omega'] is not None:
145  outfile.write('omega="%s -includeInput -commentEnergy"\n' % progs['omega'])
146  if options.num_procs > 1: outfile.write("%s -j%i <<HEREDOC\n" % (progs['parallel'], options.num_procs))
147  for i, ligand in enumerate(ligands):
148  ligcode = "X%02X" % i # X01, X02, ..., XFF
149  ligbase = ligand.base
150  ligpath = ligand.path
151  if progs['omega'] is not None:
152  oldpath = ligpath
153  ligpath = "%s.omega.mol2" % ligbase
154  outfile.write("$omega -in %s -out %s -prefix _%s && " % (oldpath, ligpath, ligbase))
155  if progs['assign_charges'] is not None:
156  oldpath = ligpath
157  ligpath = "%s.am1bcc.mol2" % ligbase
158  outfile.write("%s %s < %s > %s && " % (options.python, progs['assign_charges'], oldpath, ligpath))
159  outfile.write("%s %s -c -n%s -p%s -k%s.kin %s" % (options.python, progs['molfile_to_params'], ligcode, ligbase, ligbase, ligpath))
160  for ext in ['fa', 'cen']:
161  outfile.write(" && cat %s_????.%s.pdb | gzip -c > %s/withxtal/%s_confs.%s.pdb.gz" % (ligbase, ext, ext, ligbase, ext))
162  # Force a _0002 conformer to exist, even if it's just a duplicate of _0001
163  outfile.write(" && ( [ -f %s_0002.%s.pdb ] || cp %s_0001.%s.pdb %s_0002.%s.pdb )" % (ligbase, ext, ligbase, ext, ligbase, ext))
164  outfile.write(" && mv %s_0001.%s.pdb %s/conf1/" % (ligbase, ext, ext))
165  outfile.write(" && cat %s_????.%s.pdb | gzip -c > %s/%s_confs.%s.pdb.gz" % (ligbase, ext, ext, ligbase, ext))
166  outfile.write(" && mv %s_????.%s.pdb %s/confs/" % (ligbase, ext, ext))
167  outfile.write(" && echo 'PDB_ROTAMERS %s_confs.%s.pdb' >> %s.%s.params" % (ligbase, ext, ligbase, ext))
168  outfile.write(" && cp %s.%s.params %s/withxtal/" % (ligbase, ext, ext))
169  outfile.write(" && mv %s.%s.params %s/" % (ligbase, ext, ext))
170  outfile.write(" && mv %s.%s.kin %s/kins/" % (ligbase, ext, ext))
171  outfile.write("\n")
172  if options.num_procs > 1: outfile.write("HEREDOC\n")
173  outfile.write('''
174 popd
175 
176 # Make cofactors
177 pushd cofactor
178 mkdir -p {fa,cen}/{conf1,confs,kins,withxtal}
179 
180 ''')
181  if options.num_procs > 1: outfile.write("%s -j%i <<HEREDOC\n" % (progs['parallel'], options.num_procs))
182  for i, ligand in enumerate(cofactors):
183  ligcode = "Q%02X" % i # Q01, Q02, ..., QFF
184  ligbase = ligand.base
185  ligpath = ligand.path
186  # Generate conformers for cofactors, but by default only use xtal conf
187  if progs['omega'] is not None:
188  oldpath = ligpath
189  ligpath = "%s.omega.mol2" % ligbase
190  outfile.write("$omega -in %s -out %s -prefix _%s && " % (oldpath, ligpath, ligbase))
191  if progs['assign_charges'] is not None:
192  oldpath = ligpath
193  ligpath = "%s.am1bcc.mol2" % ligbase
194  outfile.write("%s %s < %s > %s && " % (options.python, progs['assign_charges'], oldpath, ligpath))
195  outfile.write("%s %s -c -n%s -p%s -k%s.kin %s" % (options.python, progs['molfile_to_params'], ligcode, ligbase, ligbase, ligpath))
196  for ext in ['fa', 'cen']:
197  outfile.write(" && cat %s_????.%s.pdb | gzip -c > %s/withxtal/%s_confs.%s.pdb.gz" % (ligbase, ext, ext, ligbase, ext))
198  # Force a _0002 conformer to exist, even if it's just a duplicate of _0001
199  outfile.write(" && ( [ -f %s_0002.%s.pdb ] || cp %s_0001.%s.pdb %s_0002.%s.pdb )" % (ligbase, ext, ligbase, ext, ligbase, ext))
200  outfile.write(" && mv %s_0001.%s.pdb %s/conf1/" % (ligbase, ext, ext))
201  #outfile.write(" && cat %s_????.%s.pdb | gzip -c > %s/%s_confs.%s.pdb.gz" % (ligbase, ext, ext, ligbase, ext))
202  outfile.write(" && mv %s_????.%s.pdb %s/confs/" % (ligbase, ext, ext))
203  # Default is to generate the conformer libraries "just in case" but not enable them...
204  #outfile.write(" && echo 'PDB_ROTAMERS %s_confs.%s.pdb' >> %s.%s.params" % (ligbase, ext, ligbase, ext))
205  outfile.write(" && mv %s.%s.params %s/" % (ligbase, ext, ext))
206  outfile.write(" && mv %s.%s.kin %s/kins/" % (ligbase, ext, ext))
207  outfile.write("\n")
208  if options.num_procs > 1: outfile.write("HEREDOC\n")
209  outfile.write('''
210 popd
211 
212 # Make "unbound" proteins (never prepacked, never a ligand or cofactor)
213 ''')
214  for protein in proteins:
215  outfile.write("cp %s unbound/%s.pdb\n" % (protein.path, protein.base))
216  outfile.write('''
217 # Make native and input PDBs, with ligands and cofactors
218 # May later be replaced by repacked PDB files
219 ''')
220  for docking_case in docking_cases:
221  cofs = " ".join(["cofactor/fa/conf1/%s_0001.fa.pdb" % c.base for c in docking_case.cofactors])
222  nat_files = " ".join([docking_case.protein.path, cofs, "ligand/fa/conf1/%s_0001.fa.pdb" % docking_case.ligand.base])
223  inp_files = " ".join([docking_case.protein.path, cofs, "ligand/fa/confs/%s_0002.fa.pdb" % docking_case.ligand.base])
224  outfile.write("cat %s > native/%s_%s.pdb\n" % (nat_files, docking_case.protein.base, docking_case.ligand.base))
225  outfile.write("cat %s > input/%s_%s.pdb\n" % (inp_files, docking_case.protein.base, docking_case.ligand.base))
226 #}}}
227 
228 
229 def write_common_flags(outfile, options, docking_cases): #{{{
230  database = options.database
231  cofactors = set( chain_from_iterables([d.cofactors for d in docking_cases]) )
232  ligands = set([ d.ligand for d in docking_cases ])
233  extra_res_cen = " ".join(
234  ["ligand/cen/%s.cen.params" % l.base for l in ligands]
235  + ["cofactor/cen/%s.cen.params" % c.base for c in cofactors])
236  extra_res_fa = " ".join(
237  ["ligand/fa/%s.fa.params" % l.base for l in ligands]
238  + ["cofactor/fa/%s.fa.params" % c.base for c in cofactors])
239  outfile.write('''
240 -in
241  -path
242  ## On a large cluster, the database should be on a local scratch disk
243  ## to avoid over-taxing NFS.
244  #-database /scratch/ROSETTA/minirosetta_database
245  ## "Fallback" database locations can also be specified,
246  ## in case the primary database is missing on some nodes:
247  #-database /work/davis/rosetta/rosetta_database
248  ## Location provided by user:
249  -database %(database)s
250  -file
251  ## You must supply .params files for any residue types (ligands)
252  ## that are not present in the standard Rosetta database.
253  ## Centroid residues are not needed for docking,
254  ## but may be needed for other Rosetta protocols.
255  #-extra_res_cen %(extra_res_cen)s
256  -extra_res_fa %(extra_res_fa)s
257 -out
258  ## These channels generate a LOT of output,
259  ## especially with -improve_orientation.
260  -mute protocols.geometry.RB_geometry core.scoring.dunbrack.SingleLigandRotamerLibrary core.scoring.rms_util core.pack.dunbrack.SingleLigandRotamerLibrary
261  -file
262  ## I prefer output structures with Rosetta numbering, from 1 to N residues.
263  ## To keep the original PDB numbering, omit this flag:
264  -renumber_pdb
265 -run
266  ## Recording the SVN revision of the code in your output files
267  ## makes it easier to figure out what you did later.
268  -version
269  ## MT is now the default random number generator, actually.
270  #-rng mt19937
271  ## Rosetta's default behavior is to pretend that atoms with zero occupancy
272  ## don't exist at all, which can lead to whole residues disappearing...
273  -ignore_zero_occupancy false
274 -packing
275  ## Includes the input sidechain conformations as rotamers when repacking,
276  ## but they can be "lost" if any other rotamer is chosen at that position.
277  #-use_input_sc
278  ## Instead, use -unboundrot to permanently add the rotamers from one or more
279  ## PDB files. Their rotamer ("Dunbrack") energies are also adjusted to be
280  ## equal to the best rotamer at that position, thereby favoring the native.
281  ## Since most sidechains do not change position much upon ligand binding,
282  ## including this knowledge generally improves docking results:
283  #-unboundrot ...
284  ## Controls the number of (protein) rotamers used.
285  -ex1
286  -ex1aro
287  -ex2
288  ## Ensures that extra rotamers are used for surface residues too.
289  -extrachi_cutoff 1
290 -docking
291  -ligand
292  ## Use soft-repulsive scoring during search (but not final minimization).
293  ## This slightly improves search. Hard-rep is used for final scoring,
294  ## however, because it gives better discrimination.
295  -soft_rep
296  ## Like Rosetta++, only evaluate the Coloumb term between protein and ligand.
297  -old_estat
298 ''' % locals())
299 #}}}
300 
301 
302 def write_rpkmin_flags(outfile, options, docking_cases): #{{{
303  outfile.write('''
304 -in
305  -file
306  ## If you weren't supplying the input file(s) on the command line,
307  ## this is where you would put them:
308  #-s ...
309 -out
310  ## Number of trajectories to run (per input structure given with -s)
311  -nstruct 10
312  -path
313  ## Where to write the PDB files.
314  -pdb prepack/all_traj
315 -run
316  ## Do nothing for 0 - N seconds on startup. If many processes are started
317  ## at once, this helps avoid too much I/O as they all load the database.
318  #-random_delay 30
319 -packing
320  ## If your PDB file does not have hydrogens in the right places, use this:
321  -no_optH false
322  ## You may also want to fix Asn/Gln/His sidechains:
323  -flip_HNQ
324 ''')
325 #}}}
326 
327 
328 def write_dock_flags(outfile, options, docking_cases): #{{{
329  nstruct = 5000 // get_procs_per_case(docking_cases, options)
330  outfile.write('''
331 -in
332  -file
333  ## If you weren't supplying the input file(s) on the command line,
334  ## this is where you would put them:
335  #-s ...
336  ## To get meaningful RMS values in the output, either the input PDB must have
337  ## the ligand in the correct place, or you must supply another file that does:
338  #-native ...
339 -out
340  ## Number of trajectories to run (per input structure given with -s)
341  -nstruct %(nstruct)i
342  ## With multiple processes, ensures a unique name for every output structure.
343  ## Each process should get a different string here, so you can't really
344  ## put it in the FLAGS.txt file, it has to go in the Condor script:
345  #-suffix 3
346  -path
347  ## Where to write the silent.out file. Specified in my Condor script.
348  #-pdb work/jnk_pp_1_001/3
349 -run
350  ## Do nothing for 0 - N seconds on startup. If many processes are started
351  ## at once, this helps avoid too much I/O as they all load the database.
352  -random_delay 240
353 -packing
354  ## If your PDB file does not have hydrogens in the right places, use this:
355  #-no_optH false
356  ## You may also want to fix Asn/Gln/His sidechains:
357  #-flip_HNQ
358 -docking
359  ## Flags to control the initial perturbation of the ligand,
360  ## and thus how much of the binding pocket to explore.
361  ##
362  ## Random perturbation of up to N Angstroms in X, Y, and Z,
363  ## drawn from a uniform distribution.
364  ## This gives a cube, but points outside the sphere of radius N are discarded,
365  ## resulting in uniform positional sampling within the sphere.
366  -uniform_trans 5 # angstroms
367  ## Randomize the starting orientation and conformer (respectively) of the
368  ## ligand. Unnecessary if using -improve_orientation.
369  #-randomize2
370  #-random_conformer
371  ## An alternative to -uniform_trans and -randomize2.
372  ## Random perturbations in X, Y, and Z and the 3 Euler angles,
373  ## drawn from a Gaussian distribution.
374  ## I *believe* that the three Gaussians in X, Y, and Z actually give a
375  ## spherically isotropic distribution of positions, but large angles
376  ## clearly give unreasonable results (use -randomize2 instead).
377  ## This can be used for positional perturbation only (instead of
378  ## -uniform_trans) by setting the angle component to zero.
379  #-dock_pert 30 5 # rot degrees, trans angstroms
380  -ligand
381  ## Instead of choosing an orientation and conformer at random, try all
382  ## available conformers in N random orientations, and try to maximize
383  ## shape complementarity to the protein (keeping in mind that sidechains
384  ## may move later, etc etc. See the paper or code for details.)
385  ## We find this *significantly* improves search, and costs just a few seconds.
386  -improve_orientation 1000
387  ## Let ligand rotatable bonds minimize, with harmonic restraints
388  ## where one standard deviation is 15 degrees:
389  -minimize_ligand
390  -harmonic_torsions 15
391  ## Allows rotamers to "recombine" instead of always minimizing towards
392  ## the last rotamer selected by the packer.
393  -use_ambig_constraints
394  ## Introduces random torsion tweaks in the MCM cycle; may help large ligands.
395  ## Requires -use_ambig_constraints.
396  -shear_moves 5
397  ## In the final minimization, let the backbone minimize with harmonic
398  ## restraints on the Calphas (stddev = 0.3 A).
399  ## Only stretches of residues near the ligand are minimized,
400  ## typically 20 - 40 in total. (40 - 80 residues are repacked.)
401  -minimize_backbone
402  -harmonic_Calphas 0.3
403  ## The 6-cycle protocol with repacks in cycles 1 and 4.
404  -protocol abbrev2
405  ## In each trajectory, one of the following points will be chosen at random
406  ## as the starting point for the ligand centroid (followed by other
407  ## perturbations like -uniform_trans, etc).
408  #-start_from 20.0 28.4 26.0
409  #-start_from 23.3 18.9 26.0
410  #-start_from 28.7 14.0 19.6
411  #-start_from 16.9 16.0 32.7
412  #-start_from 21.3 10.8 31.1
413  #-start_from 25.6 4.4 32.5
414  ## By supplying multiple .params files that have the same 3-letter residue
415  ## name, the same heavy atoms, but different proton configurations, you can
416  ## sample different protomers / tautomers within the packing steps.
417  ## In many cases this gives a negligible increase in run time (nice).
418  ## However, I'm unaware of any cases (so far) where it actually helped
419  ## to improve docking results, so use at your own risk.
420  #-mutate_same_name3
421 ''' % locals())
422 #}}}
423 
424 
425 def write_minnat_flags(outfile, options, docking_cases): #{{{
426  outfile.write('''
427 ## Flags for generating "minimized natives", if applicable.
428 ## In docking benchmarks, these can be used to diagnose search problems vs scoring problems.
429 -out
430  -nstruct 10
431 -run
432  -random_delay 240
433 -packing
434  -use_input_sc # causes -use_ambig_constraints to include native rotamer!
435 -docking
436  -ligand
437  -minimize_ligand
438  -harmonic_torsions 15
439  -use_ambig_constraints
440  -shear_moves 5
441  -minimize_backbone
442  -harmonic_Calphas 0.3
443  -protocol abbrev2
444 ''' % locals())
445 #}}}
446 
447 
448 def write_rpkmin_script(outfile, options, progs, docking_cases): #{{{
449  # All unique protein/cofactor combinations
450  proteins_and_cofactors = set([ tuple([d.protein]+d.cofactors) for d in docking_cases ])
451  outfile.write('''#!/bin/bash
452 set -v
453 
454 # Set up directory structure
455 mkdir -p prepack/{input,all_traj}
456 
457 # Prepack and minimize protein receptors
458 
459 ''')
460  if options.num_procs > 1: outfile.write("%s -j%i <<'HEREDOC'\n" % (progs['parallel'], options.num_procs))
461  for p_and_c in proteins_and_cofactors:
462  protein = p_and_c[0]
463  cofactors = p_and_c[1:]
464  base = "_".join([pc.base for pc in p_and_c])
465  cof_files = " ".join(["cofactor/fa/conf1/%s_0001.fa.pdb" % c.base for c in cofactors])
466  outfile.write("cat %s %s > prepack/input/%s.pdb" % (protein.path, cof_files, base))
467  outfile.write(" && %s @COMMON.flags @RPKMIN.flags -s prepack/input/%s.pdb" % (progs['rpkmin'], base))
468  outfile.write(" && egrep '^(ATOM|HETATM)' $(awk '/^pose / {print FILENAME, $NF}' prepack/all_traj/%s_????.pdb | sort -nk2 | head -1 | cut -d' ' -f1) > prepack/%s.pdb" % (base, base))
469  outfile.write("\n")
470  if options.num_procs > 1: outfile.write("HEREDOC\n")
471  outfile.write('''
472 # Replace by old input PDB files with repacked ones
473 ''')
474  for d in docking_cases:
475  base = "_".join([pc.base for pc in [d.protein]+d.cofactors])
476  outfile.write("cat prepack/%s.pdb ligand/fa/confs/%s_0002.fa.pdb > input/%s_%s.pdb\n" % (base, d.ligand.base, d.protein.base, d.ligand.base))
477 #}}}
478 
479 
480 def write_tarball_pre_script(outfile, options, progs, docking_cases): #{{{
481  if not options.local :
482  outfile.write('''#!/bin/bash
483 #set -v # this messes up the nice echo output at the end
484 shopt -s nullglob # avoid error messages from tar about non-existant files
485 
486 # Make tarball of files needed for docking
487 tar cvzf $(basename $(pwd)).tgz {cofactor,ligand}/{fa,cen}/{withxtal/,}*.params {cofactor,ligand}/{fa,cen}/{withxtal/,}*_confs.*.pdb* input/ native/ unbound/ *.flags [45]_*.sh
488 
489 # Suggest copying
490 echo
491 echo " Now copy this file to the cluster where you'll do docking."
492 echo
493 echo " rsync -avxP $(basename $(pwd)).tgz syd:runs/$(basename $(pwd))/"
494 echo
495 echo " *** If necessary, adjust path to database in COMMON.flags"
496 echo " *** If necessary, adjust path to executable in 4_dock_condor.sh"
497 echo
498 ''')
499  else: # options.local = True
500  outfile.write('''#!/bin/bash
501 echo
502 echo " *** If necessary, adjust path to database in COMMON.flags"
503 echo " *** If necessary, adjust path to executable in 4_dock_condor.sh"
504 echo
505 ''')
506 #}}}
507 
508 
509 def write_dock_condor_script(outfile, options, progs, docking_cases): #{{{
510  ligand_dock = progs['dock']
511  #target_dir = os.path.join(os.getcwd(), options.target_dir)
512  outfile.write('''#!/bin/bash
513 
514 mkdir -p log
515 
516 # Create the header for the CONDOR files
517 minnatfile="CONDOR.minnat"
518 dockfile="CONDOR.dock"
519 
520 for f in "$minnatfile" "$dockfile"; do
521  cat > "$f" <<'HEREDOC'
522 Executable = %(ligand_dock)s
523 Universe = vanilla
524 Requirements = Memory > 256
525 Notification = Never
526 copy_to_spool = false
527 
528 Initialdir = .
529 HEREDOC
530 done
531 
532 cat >> "$minnatfile" <<'HEREDOC'
533 Error = log/err.minnat.$(Process).log
534 Log = log/condor.minnat.$(Process).log
535 Output = log/out.minnat.$(Process).log
536 
537 
538 HEREDOC
539 
540 cat >> "$dockfile" <<'HEREDOC'
541 Error = log/err.$(Process).log
542 Log = log/condor.$(Process).log
543 Output = log/out.$(Process).log
544 
545 
546 HEREDOC
547 
548 # Function to set up each docking case
549 # Takes protein_name ligand_name num_procs
550 setup_docking_case()
551 {
552  p="$1"
553  l="$2"
554  pl="${1}_${2}"
555  mkdir -p "work/minnat/$pl"
556  echo "Arguments = @COMMON.flags @MINNAT.flags -in:file:s native/$pl.pdb -in:file:native native/$pl.pdb -packing:unboundrot unbound/$p.pdb -out:path:pdb work/minnat/$pl" >> "$minnatfile"
557  echo "Queue" >> "$minnatfile"
558  for((i=0;i<$3;i++)); do
559  mkdir -p "work/$pl/$i"
560  echo "Arguments = @COMMON.flags @DOCK.flags -in:file:s input/$pl.pdb -in:file:native native/$pl.pdb -packing:unboundrot unbound/$p.pdb -out:path:pdb work/$pl/$i -run:seed_offset $i -out:suffix _$i" >> "$dockfile"
561  echo "Queue" >> "$dockfile"
562  done
563  echo >> "$dockfile"
564 }
565 
566 # List the individual docking cases
567 ''' % locals())
568  procs_per_case = get_procs_per_case(docking_cases, options)
569  outfile.write('procs_per_case=%i\n' % procs_per_case)
570  for d in docking_cases:
571  prot_lig = "%s_%s" % (d.protein.base, d.ligand.base)
572  outfile.write('setup_docking_case "%s" "%s" $procs_per_case\n' % (d.protein.base, d.ligand.base))
573  outfile.write('''
574 echo
575 echo " condor_submit $minnatfile # if you want minimized natives"
576 echo " condor_submit $dockfile # the actual docking calculations"
577 echo
578 ''')
579 #}}}
580 
581 def write_dock_bash_script(outfile, options, progs, docking_cases): #{{{
582  ligand_dock = progs['dock']
583  #target_dir = os.path.join(os.getcwd(), options.target_dir)
584  outfile.write('''#!/bin/bash
585 
586 Executable="%(ligand_dock)s"
587 
588 mkdir -p log
589 
590 # Create the header for the BASH files
591 minnatfile="BASH.minnat.sh"
592 dockfile="BASH.dock.sh"
593 
594 # Make sure that neither of them already exist.
595 rm -f "$minnatfile"
596 rm -f "$dockfile"
597 
598 # Function to set up each docking case
599 # Takes protein_name ligand_name num_procs
600 setup_docking_case()
601 {
602  p="$1"
603  l="$2"
604  pl="${1}_${2}"
605  mkdir -p "work/minnat/$pl"
606  echo "${Executable} @COMMON.flags @MINNAT.flags -in:file:s native/$pl.pdb -in:file:native native/$pl.pdb -packing:unboundrot unbound/$p.pdb -out:path:pdb work/minnat/$pl > log/err.minnat.${pl}.log 2> log/out.minnat.${pl}.log" >> "$minnatfile"
607  echo >> "$minnatfile"
608  for((i=0;i<$3;i++)); do
609  mkdir -p "work/$pl/$i"
610  echo "${Executable} @COMMON.flags @DOCK.flags -in:file:s input/$pl.pdb -in:file:native native/$pl.pdb -packing:unboundrot unbound/$p.pdb -out:path:pdb work/$pl/$i -run:seed_offset $i -out:suffix _$i > log/out.${pl}.${i}.log 2> log/err.${pl}.${i}.log" >> "$dockfile"
611  done
612  echo >> "$dockfile"
613 }
614 
615 # List the individual docking cases
616 ''' % locals())
617  procs_per_case = get_procs_per_case(docking_cases, options)
618  outfile.write('procs_per_case=%i\n' % procs_per_case)
619  for d in docking_cases:
620  prot_lig = "%s_%s" % (d.protein.base, d.ligand.base)
621  outfile.write('setup_docking_case "%s" "%s" $procs_per_case\n' % (d.protein.base, d.ligand.base))
622  outfile.write('''
623 
624 # Make scripts executable
625 chmod u+x $minnatfile $dockfile
626 
627 echo
628 echo " nohup nice $minnatfile # if you want minimized natives"
629 echo " nohup nice $dockfile # the actual docking calculations"
630 echo
631 echo " Or for multiple processors something like:"
632 echo
633 echo " %s -j %s < $minnatfile # if you want minimized natives"
634 echo " %s -j %s < $dockfile # the actual docking calculations"
635 ''' % (progs['parallel'], options.num_procs, progs['parallel'], options.num_procs) )
636 #}}}
637 
638 
639 def write_concat_script(outfile, options, progs, docking_cases): #{{{
640  outfile.write('''#!/bin/bash
641 mkdir -p out/minnat
642 for dir in work/minnat/*; do
643  d=$(basename $dir)
644  echo $d
645  f="out/minnat/${d}_silent.out"
646  mv "$dir"/silent*.out $f
647  # Remove new file if it is contains no data (0 size)
648  [ -s $f ] || rm $f
649 done
650 
651 for dir in work/*; do
652  d=$(basename $dir)
653  [ "$d" = minnat ] && continue
654  echo $d
655  f="out/${d}_silent.out"
656  # Remove old file first
657  [ -f $f ] && rm $f
658  find $dir -name 'silent*.out' -print0 | xargs -0 cat >> $f
659  # Remove new file if it is contains no data (0 size)
660  [ -s $f ] || rm $f
661 done
662 
663 for f in out/*_silent.out; do
664  echo -n "$f "
665  a=$(grep -c '^POSE_TAG ' $f)
666  b=$(grep -c '^END_POSE_TAG ' $f)
667  if ((a == b)); then
668  echo "ok $a $b"
669  else
670  echo "***ERROR*** $a $b"
671  fi
672 done
673 ''')
674  if not options.local:
675  outfile.write('''tar cvzf $(basename $(pwd))_out.tgz out/
676 
677 # Suggest copying
678 echo
679 echo " Now copy this file back to your computer for analysis."
680 echo
681 echo " rsync -avxP $(basename $(pwd))_out.tgz dig:"
682 echo " ssh dig"
683 echo " cd path/to/project/"
684 echo " tar xvzf ~/$(basename $(pwd))_out.tgz && rm ~/$(basename $(pwd))_out.tgz"
685 echo
686 ''')
687 #}}}
688 
689 
690 def write_analysis_script(outfile, options, progs, docking_cases): #{{{
691  outfile.write('''#!/bin/bash
692 
693 # Check for result files
694 if [ ! -d out ]; then
695  echo
696  echo " Can't find out/ directory. Have you untarred the results?"
697  echo
698  exit
699 fi
700 
701 # Make score files
702 mkdir -p {out,scores}/minnat
703 pushd out
704 pushd minnat
705 for f in *_silent.out; do
706  echo $f
707  %(get_scores)s < $f > ../../scores/minnat/${f/_silent.out/.tab}
708 done
709 popd
710 for f in *_silent.out; do
711  echo $f
712  %(get_scores)s < $f > ../scores/${f/_silent.out/.tab}
713 done
714 popd
715 
716 # Make PDF plots if R is on the PATH
717 pushd scores
718 if which R > /dev/null; then
719  R --vanilla < %(plot_funnels)s
720 else
721  echo
722  echo " Can't find R on the PATH. Skipping funnel plots..."
723  echo
724 fi
725 popd
726 ''' % progs)
727  if progs['confidence'] is not None:
728  outfile.write('''
729 # Estimate confidence in docking results
730 %(confidence)s @COMMON.flags -in:file:silent out/*.out
731 ''' % progs)
732  if progs['best_poses'] is not None:
733  outfile.write('''
734 # Extract best-scoring docked poses, skipping duplicates
735 # Wow, the select_best_unique_ligand_poses program is awkward. I should fix that.
736 mkdir -p pdbs
737 for f in out/*.out; do
738  %(best_poses)s @COMMON.flags -docking:ligand:max_poses 10 -docking:ligand:min_rms 1.0 -out:path:pdb pdbs -in:file:silent $f -out:file:silent default.out
739  tags=( $(head -1 cluster_rms.tab) ) # Bash array of selected tags
740  %(extract)s @COMMON.flags -out:path:pdb pdbs -in:file:s $f -in:file:tags "${tags[@]}"
741  # Prefix each extracted structure filename with its rank: 01, 02, 03, ...
742  for ((i=0; i<${#tags[@]}; ++i)); do
743  mv pdbs/${tags[$i]}.pdb pdbs/$(printf "%%02i_" $((i+1)))${tags[$i]}.pdb
744  done
745  # select_best_unique_ligand_poses seems to mangle the silentfile sometimes, so better to use the original
746  #%(extract)s @COMMON.flags -out:path:pdb pdbs -in:file:s default.out
747  rm default.out cluster_rms.tab
748 done
749 
750 # Convert extracted PDBs into mol2 files, for people who prefer that
751 mkdir -p mol2_out
752 pdb_to_molfile="%(pdb_to_molfile)s --comment -oMOL2"
753 ''' % progs)
754  for d in docking_cases:
755  prot_lig = "%s_%s" % (d.protein.base, d.ligand.base)
756  if progs['assign_charges'] is not None: lig_path = "ligand/%s.am1bcc.mol2" % d.ligand.base
757  elif progs['omega'] is not None: lig_path = "ligand/%s.omega.mol2" % d.ligand.base
758  else: lig_path = d.ligand.path
759  outfile.write('$pdb_to_molfile %s pdbs/%s*.pdb > mol2_out/%s.mol2\n' % (lig_path, prot_lig, prot_lig))
760 #}}}
761 
762 
763 def write_if_not_exists(filename, options, function, *func_args, **func_kwargs):
764  filename = os.path.join(options.target_dir, filename)
765  if os.path.exists(filename) and not options.clobber:
766  print "File %s exists, use --clobber to overwrite" % filename
767  else:
768  try:
769  outfile = open(filename, 'w')
770  function(outfile, *func_args, **func_kwargs)
771  outfile.close()
772  if filename.endswith('.sh'): os.chmod(filename, 0750)
773  print "Wrote %s" % filename
774  except Exception, ex:
775  print "Exception while writing %s: %s" % (filename, ex)
776  # Remove partially complete junk file
777  os.remove(filename)
778  # Re-raise the exception
779  raise # "raise ex" clobbers the original stack trace
780 
781 
782 def main(argv):
783  '''
784  Automatic RosettaLigand Setup (ARLS)
785 
786  Given a list of protein-ligand pairings, generates all the required input
787  files for RosettaLigand docking, along with suggested flags and submit scripts.
788  The scripts are numbered 1 - N and should be run in order.
789  See the Doxygen documentation for important information on using RosettaLigand.
790 
791  The script takes one argument, a list of protein-ligand pairs to dock.
792  Each line of the list should consist of the protein name, zero or more cofactor
793  names, and the ligand name (whitespace separated).
794  In the current directory should be files with the same name and an appropriate
795  extension: .pdb for proteins, and one of (.mol, .sdf, .mol2) for ligands and
796  cofactors. For example, the line "1abc cofactor mylig" could use files named
797  "1abc.pdb", "cofactor.mol", and "mylig.mol2". No unnecessary work will be done
798  even if a protein/cofactor/ligand appears in multiple lines.
799 
800  The protein file should only contain standard amino acid residues.
801  The cofactor and ligand files should contain a single conformation
802  of a single compound (unless using --skip-omega).
803  The MiniRosetta database should be in ~/rosetta/rosetta_database;
804  otherwise use --database.
805  OpenEye's Omega should be in ~/openeye or on your PATH;
806  otherwise use --openeye or --skip-omega.
807  The OpenEye QUACPAC toolkit should be on your PYTHONPATH
808  for charges to be assigned (else use --skip-charges).
809  You can specify which clustering software to set up the docking runs for with
810  --cluster. Specifying BASH will result in a standard shell script-type submission.
811  for those without clustering software (It is recommended to also set --njobs.) You
812  also can use the BASH output file with MOAB/TORQUE/PBS style clustering software.
813  ''' # Preformatted
814 
815  def up_dir(path, num_steps=1):
816  for i in xrange(num_steps):
817  path = os.path.dirname(path)
818  return path
819  MINI_HOME = up_dir(os.path.abspath(sys.path[0]), 4)
820 
821  parser = OptionParser(usage="usage: %prog [<] LIST_OF_PAIRS.txt", formatter=PreformattedDescFormatter())
822  parser.set_description(main.__doc__)
823  # parser.add_option("-short", ["--long"],
824  # action="store|store_true|store_false",
825  # default=True|False|...
826  # type="string|int|float",
827  # dest="opt_name",
828  # help="store value in PLACE",
829  # metavar="PLACE",
830  # )
831  parser.add_option("-j", "--num-procs",
832  default=1,
833  type="int",
834  help="Number of processors to use on local machine for non-CONDOR tasks (default: 1)",
835  )
836  parser.add_option("-m", "--mini",
837  default=MINI_HOME,
838  help="Directory where rosetta_source is found (default: %s)" % MINI_HOME,
839  )
840  parser.add_option("-d", "--database",
841  default=os.path.join( os.path.expanduser("~"), "rosetta", "rosetta_database"),
842  help="Directory where the Rosetta database is found (default: ~/rosetta/rosetta_database)",
843  )
844  parser.add_option("--openeye",
845  default=os.path.join( os.path.expanduser("~"), "openeye"),
846  help="Directory where OpenEye tools are found (default: ~/openeye)",
847  )
848  parser.add_option("-t", "--target-dir",
849  help="Directory where output should be created (default: ./arls_work)",
850  default="arls_work",
851  )
852  parser.add_option("--python",
853  help="Python executable (2.5/2.6 required for Omega) ",
854  default="/usr/bin/env python",
855  )
856  parser.add_option("--clobber",
857  action="store_true",
858  default=False,
859  help="Overwrite pre-existing output files",
860  )
861  parser.add_option("--skip-omega",
862  action="store_true",
863  default=False,
864  help="Do not use OpenEye's Omega to generate small molecule conformers. Assumes small molecule file already contains all desired conformers.",
865  )
866  parser.add_option("--skip-charges",
867  action="store_true",
868  default=False,
869  help="Do not assign partial charges with OpenEye's AM1BCC. Assumes small molecule file is in .mol2 format and already has charges, or that you want default Rosetta charges.",
870  )
871  parser.add_option("--local",
872  action="store_true",
873  default=False,
874  help="Run docking jobs locally (through current system), rather than on a remote cluster.",
875  )
876  parser.add_option("--cluster",
877  default="CONDOR",
878  help="Type of clustering software to use. Acceptable values: CONDOR, BASH",
879  )
880  parser.add_option("--njobs",
881  default=250,
882  help="Aproximate number of cluster jobs to split processing over (default: 250).",
883  )
884  parser.add_option("--compile-tag",
885  default=None,
886  help="""The compile tag (e.g. "static.linuxgccrelease") for Rosetta programs (default: use a very basic autodetect).""",
887  )
888  (options, args) = parser.parse_args(args=argv)
889 
890  if len(args) == 0:
891  infile = sys.stdin
892  elif len(args) == 1:
893  infile = open(args[0])
894  else:
895  parser.print_help()
896  print "Too many arguments!"
897  return 1
898 
899  try:
900  progs = find_programs(options)
901  docking_cases = read_input_list(infile)
902 
903  if len(docking_cases) == 0:
904  raise ValueError("No protein-ligand pairs were supplied!")
905  if len(set([(d.protein, d.ligand) for d in docking_cases])) < len(docking_cases):
906  raise ValueError("Some protein-ligand pairs are duplicated!")
907 
908  options.cluster = str(options.cluster).upper()
909  if options.cluster not in ["CONDOR","BASH"]:
910  raise ValueError('Invalid cluster type "%s"specified with --cluster'% options.cluster)
911  if not os.path.isdir(options.mini):
912  raise ValueError("Cannot find MiniRosetta source tree; please specify with --mini")
913  if not os.path.isdir(options.database):
914  raise ValueError("Cannot find Rosetta database; please specify with --database")
915  if not options.skip_omega and not os.path.isdir(options.openeye):
916  raise ValueError("Cannot find OpenEye tools; please specify with --openeye (or use --skip-omega)")
917 
918  if not os.path.isdir(options.target_dir):
919  os.mkdir(options.target_dir)
920 
921  write_if_not_exists("1_setup.sh", options,
922  write_setup_script, options, progs, docking_cases)
923  write_if_not_exists("COMMON.flags", options,
924  write_common_flags, options, docking_cases)
925  write_if_not_exists("RPKMIN.flags", options,
926  write_rpkmin_flags, options, docking_cases)
927  write_if_not_exists("DOCK.flags", options,
928  write_dock_flags, options, docking_cases)
929  write_if_not_exists("MINNAT.flags", options,
930  write_minnat_flags, options, docking_cases)
931  write_if_not_exists("2_prepack_minimize.sh", options,
932  write_rpkmin_script, options, progs, docking_cases)
933  write_if_not_exists("3_tarball_pre.sh", options,
934  write_tarball_pre_script, options, progs, docking_cases)
935 
936  if options.cluster == "CONDOR":
937  write_if_not_exists("4_dock_condor.sh", options,
938  write_dock_condor_script, options, progs, docking_cases)
939  elif options.cluster == "BASH":
940  write_if_not_exists("4_dock_bash.sh", options,
941  write_dock_bash_script, options, progs, docking_cases)
942  else:
943  raise ValueError("Shouldn't get here: invalid cluster type %s"%options.cluster)
944 
945  write_if_not_exists("5_concat.sh", options,
946  write_concat_script, options, progs, docking_cases)
947  write_if_not_exists("6_analyze_results.sh", options,
948  write_analysis_script, options, progs, docking_cases)
949 
950  except ValueError, v:
951  parser.print_help()
952  print
953  print v
954  return 2
955 
956  return 0
957 
958 
959 if __name__ == "__main__":
960  sys.exit(main(sys.argv[1:]))
961  print "You should really be running the wrapper script, arls.py, instead!"
def write_dock_flags
Definition: arls_impl.py:328
def write_analysis_script
Definition: arls_impl.py:690
def write_rpkmin_script
Definition: arls_impl.py:448
def write_common_flags
Definition: arls_impl.py:229
def chain_from_iterables
Definition: arls_impl.py:20
def find_file
Definition: arls_impl.py:26
def write_dock_condor_script
Definition: arls_impl.py:509
def get_procs_per_case
Definition: arls_impl.py:124
Fstring::size_type len(Fstring const &s)
Length.
Definition: Fstring.hh:2207
def read_input_list
Definition: arls_impl.py:111
utility::vector1< std::string > split(const std::string &s)
split given std::string using ' ' symbol.
Definition: string_util.cc:59
def write_tarball_pre_script
Definition: arls_impl.py:480
def write_concat_script
Definition: arls_impl.py:639
bool open(utility::io::izstream &db_stream, std::string const &db_file, bool warn)
Open a database file on a provided stream.
Definition: open.cc:55
def main
Definition: arls_impl.py:782
def write_minnat_flags
Definition: arls_impl.py:425
def write_dock_bash_script
Definition: arls_impl.py:581
def find_programs
Definition: arls_impl.py:80
def write_if_not_exists
Definition: arls_impl.py:763
static T max(T x, T y)
Definition: Svm.cc:19
def write_setup_script
Definition: arls_impl.py:128
def write_rpkmin_flags
Definition: arls_impl.py:302