Rosetta
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Pages
clean_pdb_keep_ligand.py
Go to the documentation of this file.
1 #!/usr/bin/python
2 ## Written and improved over the years by Phil, Rhiju, Olli and Mike
3 ## Lucas Nivon, Nov 2010, modifying to keep HETATMS which are not non-canonical ligands
4 ## Usage: python clean_pdb_keep_ligand.py 1dqb -ignorechain
5 ## Note that the database for the original pdbs is at local_pdb_database, change this as needed at top of script
6 ###############################################################
7 
8 import string
9 from sys import argv,stderr
10 from os import popen,system
11 from os.path import exists,basename
12 from amino_acids import longer_names
13 from amino_acids import modres
14 
15 ############################## set your scaffold source here #################################
16 local_pdb_database = "/lab/shared/scaffolds/"
17 keep_ligand = True # set to false to throw away all HETATMs, not just the confusing non-canonical Amino Acids
18 ###############################################################################################
19 
20 shit_stat_insres = False
21 shit_stat_altpos = False
22 shit_stat_modres = False
23 shit_stat_misdns = False
24 
25 hetatm_list = [] ### keep a record of the hetatms observed so far that are not in the amino acid dictionary
26 
27 CA_MODEL = True
28 NO_OCCUPANCY = True
29 
30 fastaseq = ""
31 pdbfile = ""
32 def check_and_print_pdb( count, residue_buffer, residue_letter ):
33  global fastaseq
34  global pdbfile
35  global CA_MODEL
36  global NO_OCCUPANCY
37 
38  ## Check that CA, N and C are present!def check_and_print_pdb( outid, residue_buffer )
39  hasCA = False
40  hasN = False
41  hasC = False
42  IsGoodLigand = False
43 
44  for line in residue_buffer:
45  atomname = line[12:16]
46  #Only add bb atoms if they have occupancy!
47  occupancy=float(line[55:60])
48 
49  if occupancy > 0.0 :
50  NO_OCCUPANCY = False
51  if atomname == " CA " and occupancy > 0.0: hasCA = True
52  if atomname == " N " and occupancy > 0.0: hasN = True
53  if atomname == " C " and occupancy > 0.0: hasC = True
54  if (line[0:6] == 'HETATM' and keep_ligand):
55  IsGoodLigand = True
56 
57  ## if all three backbone atoms are present withoccupancy proceed to print the residue
58  if hasN or hasC : CA_MODEL = False
59  if ((hasCA and hasN and hasC) or (IsGoodLigand)) :
60  for line in residue_buffer:
61  ## add linear residue count
62  newnum = '%4d ' % count
63  line_edit = line[0:22] + newnum + line[27:]
64  ## write the residue line
65  pdbfile = pdbfile + line_edit
66 
67  ## finally print residue letter into fasta stream
68  fastaseq = fastaseq + residue_letter
69 
70  ## count up residue number
71  count = count + 1
72  return True
73 
74  return False
75 
76 
77 assert( len(argv)>2)
78 pdbname = argv[1]
79 pdbcode = argv[1]
80 chainid = argv[2]
81 
82 if (pdbname[-4:] != '.pdb' and pdbname[-8:] != '.pdb1.gz'):
83  pdbname += '.pdb'
84 
85 outfile = pdbname
86 
87 nopdbout = 0
88 if argv.count('-nopdbout'):
89  nopdbout = 1
90 
91 removechain = 0
92 if argv.count('-nochain'):
93  removechain = 1
94 
95 ignorechain = 0
96 if argv.count('-ignorechain'):
97  ignorechain = 1
98 
99 netpdbname = local_pdb_database + pdbname[1:3] + '/' + pdbname[0:4] + '/' + argv[1] + '_0.pdb'
100 #print "getting the name right:", netpdbname
101 if not exists(netpdbname):
102  netpdbname = pdbname
103 
104 
105 if netpdbname[-3:]=='.gz':
106  lines = popen( 'zcat '+netpdbname,'r').readlines()
107 else:
108  lines = open(netpdbname,'r').readlines()
109 
110 
111 oldresnum = ' '
112 count = 1;
113 modifiedres = ''
114 
115 
116 residue_buffer = []
117 residue_letter = ''
118 residue_invalid = False
119 
120 if chainid == '_':
121  chainid = ' '
122 
123 for i in range(len(lines)):
124  line = lines[i]
125 
126  if len(line)>5 and line[:6]=='ENDMDL':break #Its an NMR model.
127 
128  if (chainid == line[21] or ignorechain):
129  line_edit = line
130  if line[0:3] == 'TER':
131  continue
132  elif (line[0:6] == 'HETATM'):
133  ok = False
134  #print "found a HETATM", line_edit
135 
136  ## Is it a modified residue ?
137  if modres.has_key( line[17:20] ):
138  ## if so replace it with its canonical equivalent !
139  line_edit = 'ATOM '+line[6:17]+modres[line[17:20]] +line[20:]
140  modifiedres = modifiedres + line[17:20] + ', '
141  ## dont count MSEs as modiied residues (cos they're so common and get_pdb deal with them previosuly)
142  if line[17:20] != "MSE":
143  shit_stat_modres = True
144  ok = True
145 
146  ## other substitution (of atoms mainly)
147  elif (line[17:20]=='MSE'): #Selenomethionine
148  if (line_edit[12:14] == 'SE'):
149  line_edit = line_edit[0:12]+' S'+line_edit[14:]
150  if len(line_edit)>75:
151  if (line_edit[76:78] == 'SE'):
152  line_edit = line_edit[0:76]+' S'+line_edit[78:]
153  else:
154  line_edit = line
155 # print "found a HETATM", line
156  if line[17:20] != "MSE":
157  shit_stat_modres = True
158  hetatm_list.append(line[17:20])
159  ok = True
160 
161  if not ok:
162  continue # skip this atom if we havnt found a conversion
163 
164  if (line_edit[0:4] == 'ATOM') or (line_edit[0:6] == 'HETATM' ):
165 ## if line_edit[13:14]=='P': #Nucleic acid? Skip.
166 ## resnum = line_edit[23:26]
167 ## oldresnum = resnum
168 ## while (resnum == oldresnum):
169 ## print "HERE"
170 ## i += 1
171 ## line = lines[i]
172 ## resnum = line_edit[23:26]
173 
174  resnum = line_edit[22:27]
175 
176  insres = line[26]
177  if insres != ' ': shit_stat_insres = True
178 
179  altpos = line[16]
180  if altpos != ' ': shit_stat_altpos = True
181  ## Print out the residue when the new line is for a new residue
182  if not resnum == oldresnum:
183  if residue_buffer != []: ## is there a residue in the buffer ? If so print it out, then reinitialize
184  if not residue_invalid:
185  ##print "call from the loop:\n" ## new
186  if not check_and_print_pdb( count, residue_buffer, residue_letter ):
187  ## if unsuccessful
188  shit_stat_misdns = True
189  else:
190  count = count + 1
191 
192  residue_buffer = []
193  residue_letter = ''
194  residue_invalid = False
195 
196  longname = line_edit[17:20]
197  if longer_names.has_key(longname):
198  residue_letter = longer_names[longname];
199  else:
200  residue_letter = 'X'
201  ######### residue_invalid = True
202 
203  oldresnum = resnum
204 
205  ## What does this do ?
206  if line_edit[16:17] == 'A':
207  line_edit = line_edit[:16]+' '+line_edit[17:]
208 
209  if line_edit[16:17] != ' ':
210  continue
211 
212  if removechain:
213  line_edit = line_edit[0:21]+' '+line_edit[22:]
214 
215  #print "about to append to residue_buffer####", line_edit
216  residue_buffer.append( line_edit )
217 
218  #outid.write(line_edit)
219 
220 
221 #print "calling after the loop\n" ## new
222 if not check_and_print_pdb( count, residue_buffer, residue_letter ):
223  ## if unsuccessful
224  shit_stat_misdns = True
225 else:
226  count = count + 1
227 
228 
229 flag_altpos = "---"
230 if shit_stat_altpos : flag_altpos = "ALT"
231 flag_insres = "---"
232 if shit_stat_insres : flag_insres = "INS"
233 flag_modres = "---"
234 if shit_stat_modres : flag_modres = "MOD"
235 flag_misdns = "---"
236 if shit_stat_misdns : flag_misdns = "DNS"
237 
238 nres = len(fastaseq)
239 flag_successful = "OK"
240 if nres <= 0:
241  flag_successful = "BAD"
242  if CA_MODEL: flag_successful = flag_successful + " (CA only model )"
243  if NO_OCCUPANCY: flag_successful = flag_successful + " (No occupancy )"
244 
245 print netpdbname, pdbname, chainid, "%5d"%nres, flag_altpos, flag_insres, flag_modres, flag_misdns, flag_successful
246 
247 #for item in hetatm_list:
248 # print item
249 
250 if chainid == ' ': chainid = '_'
251 print "chainid is", chainid
252 if chainid == "-ignorechain": chainid = '_00'
253 if nres > 0:
254  if( nopdbout == 0 ):
255  outfile = pdbcode + chainid + ".pdb"
256  outfile = outfile.replace('.pdb1.gz','.pdb')
257  outid = open( outfile, 'w')
258  outid.write(pdbfile)
259  outid.write("TER\n")
260  outid.close()
261 
262  fastaid = stderr
263  fastaid.write('>'+pdbname[0:4]+chainid+'\n');
264  fastaid.write( fastaseq )
265  fastaid.write('\n')
266  fastaid.close()
267 
Fstring::size_type len(Fstring const &s)
Length.
Definition: Fstring.hh:2207
bool open(utility::io::izstream &db_stream, std::string const &db_file, bool warn)
Open a database file on a provided stream.
Definition: open.cc:55