Skip to content

Commit 85dd432

Browse files
committed
submitting conf gen script
1 parent d154b68 commit 85dd432

File tree

2 files changed

+392
-0
lines changed

2 files changed

+392
-0
lines changed
Lines changed: 364 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,364 @@
1+
#
2+
# Copyright (c) 2015 The Cambridge Crystallographic Data Centre (CCDC)
3+
#
4+
# This script can be used for any purpose without limitation subject to the
5+
# conditions at http://www.ccdc.cam.ac.uk/Community/Licenses/pages/v1.aspx
6+
#
7+
# The above copyright statement and this permission notice must be included
8+
# in all copies or substantial portions of the script.
9+
#
10+
11+
import requests
12+
import urllib.request, urllib.parse, urllib.error
13+
import os
14+
import argparse
15+
16+
from ccdc import io
17+
from ccdc import conformer
18+
from ccdc.molecule import Molecule
19+
from ccdc.descriptors import MolecularDescriptors
20+
21+
__doc__ = """
22+
Starting from a list of PDB-codes, the script generates idealized conformers
23+
for ligands and evaluates their RMSD to the conformation in the PDB.
24+
"""
25+
26+
__author__ = 'Brandl, Giangreco, Higueruelo, Schaerfer and Sykes'
27+
28+
29+
excluded_hetids = set(['NAP', 'MA4', 'EOH', 'AGC', 'EOM', 'BGC', 'HEM', 'CPS', 'CPT',
30+
'TAU', 'TAR', 'TAS', 'SPK', '1FH', 'SPM', 'VA3', 'SPD',
31+
'XPE', 'GD', 'GA', 'EHN', 'CO5', 'LAK', 'V70', 'PE4',
32+
'MAC', 'LAT', 'CP2', 'CP3', 'AR', 'MAN', 'CMO', 'TSM',
33+
'DTN', 'TSD', 'TSE', 'YBT', 'EDO', 'PCL', 'CB5', 'BEQ',
34+
'F09', 'DTV', 'NO2', 'NO3', 'DZZ', 'IUM', 'UVW', 'NME',
35+
'ZN', 'NFC', 'NFB', 'NFO', 'K', 'OH', 'NFS', 'NFR',
36+
'MQD', 'MG', 'CBG', 'NOE', 'MO', 'MN', 'PC4', 'CBM',
37+
'SE', 'CBX', 'MXE', 'BE7', 'BCT', 'DCE', 'W', 'GLO',
38+
'MM4', 'GLC', 'DOD', 'SMO', 'GLY', 'DOX', 'FE', 'OF2',
39+
'DET', 'C2C', 'FCY', 'MDD', 'AU3', 'FCL', 'FCO', 'HTG',
40+
'AUC', 'VER', 'F', 'NFE', 'FMT', 'FMS', 'JEF', 'V',
41+
'TOU', 'SX', 'OF3', 'SBT', 'SR', 'SBY', 'LMT', 'LMU',
42+
'T3A', 'SM', 'SBE', 'SB', 'SBO', 'MMC', 'YH', 'DPR',
43+
'YB', 'CAT', 'DPG', 'DPO', 'CAC', 'TF4', 'CAD', 'NHE',
44+
'PGE', 'LA', 'PGO', 'PGL', 'HG2', 'LI', 'LU', 'PGR',
45+
'PGQ', 'NHY', '144', 'NMU', 'PG6', 'NH4', 'WCC',
46+
'PG5', 'ZRC', 'PG0', 'B7G', 'HGC', 'CFN', 'CFO', 'CFM',
47+
'TFH', 'NCP', 'I42', 'TFA', 'PNL', 'NCO', 'Y1', 'CFT',
48+
'KO4', 'SF4', 'ZNH', 'ZNO', 'SF3', 'FNE', 'EGL', '7PE',
49+
'RU', 'LBT', 'SAL', 'RE', 'RB', 'BF4', 'URE', 'MRY',
50+
'CHM', 'ACM', 'MRD', 'IPA', 'IPH', 'ZN3', 'ZN2', 'ZN1',
51+
'LI1', '4OA', 'RHD', 'OCM', 'OCL', 'OCO', 'OCN', 'NGN',
52+
'HDZ', 'P2K', 'V40', 'HDD', 'HDN', 'MG8', 'BMA', '2FU',
53+
'BME', 'WO5', 'WO4', 'OX', 'YT3', 'WO2', 'CEQ', '2FH',
54+
'AF3', 'ENC', 'GCD', '2HP', 'B3P', 'CE1', 'EU', 'XCC',
55+
'MGF', 'TBR', 'TBU', 'PG4', 'OC8', 'BTC', 'BTB', 'OC1',
56+
'OC3', 'OC2', 'OC5', 'OC4', 'OC7', 'OC6', 'NH3', '3OF',
57+
'ATO', 'HGI', 'DMF', 'AST', 'MTG', 'SEA', 'SEK', 'GBL',
58+
'EU3', 'CLN', 'CLO', 'DUM', 'UNX', 'BEF', 'CLF', 'PDT',
59+
'XE', 'PDO', 'TLA', 'CLX', 'DMN', 'FRU', 'BBX', 'BBU',
60+
'CLP', 'OMO', 'H2S', 'ZEM', 'KR', 'SRM', 'SE4', 'P6G',
61+
'2ME', '2MO', 'CYS', '3PO', 'POR', 'POP', 'PON', '2BM',
62+
'CYN', 'GOL', '202', 'F50', 'DHE', 'DHD', 'PO2', 'PO3',
63+
'MLP', 'OTE', 'TFP', 'E1H', 'MCR', 'NRU', '3CO', '3CN',
64+
'2PA', '2PE', 'NI2', 'NI3', 'NI1', '2PO', '2PN', 'TCN',
65+
'HSG', 'HSH', 'TZZ', '12P', '543', 'OF0', 'OF1', 'HSW',
66+
'C2O', 'MEE', 'N2O', 'PD', 'VXA', 'GPX', 'THJ', 'BNG',
67+
'NIK', 'F2O', 'SYL', 'CCH', 'FDE', 'FDD', 'EMC', 'MO3',
68+
'MO2', 'MO1', 'CRY', 'MO7', 'MO6', 'MO5', 'MO4', 'DMS',
69+
'DMU', 'DMX', 'RGI', 'AEM', 'PS5', 'PR', 'LDA', 'PT',
70+
'PB', 'ZO3', 'TCH', 'PI', 'MH2', 'MH3', 'MHM', 'BU2',
71+
'BU3', 'DDQ', 'BU1', 'MHA', 'S', 'DDH', 'T1A', 'LCP',
72+
'MOS', 'O4M', 'MOH', 'MOO', 'LCO', 'BCN', 'UMQ', 'NH2',
73+
'HNI', 'PEU', 'PER', 'FSO', 'TMA', 'ETA', 'ETF', 'CD1',
74+
'PEJ', 'CD3', 'FSX', 'DVT', 'PEG', 'COM', 'RU7', 'CON',
75+
'HE5', 'HE6', 'BF2', '16D', '16A', 'P33', 'ND4', 'HEV',
76+
'CO', 'CN', 'CM', 'HES', 'CA', 'HEZ', 'CD', 'HEG',
77+
'HEA', 'HEB', 'HEC', 'CS', 'CR', 'CU', 'IOD', 'PE8',
78+
'PSL', 'OES', 'PE5', 'PE7', 'PE3', 'FS1', 'FS2', 'FS3',
79+
'FS4', 'PC3', 'OEC', 'NMO', 'NML', 'DIO', '6MO', 'CIT',
80+
'DIA', 'C8E', '1PS', 'C10', 'C15', 'DIS', 'PPF', 'SOG',
81+
'PPK', 'SOH', 'PPI', 'IR3', 'SOM', 'TBA', 'PPM', 'SOR',
82+
'PPV', 'GAI', 'XL1', 'MBR', 'IR', '3NI', 'IRI', 'SO3',
83+
'SO2', 'SO4', 'MTL', 'IN', 'VX', 'PP9', 'MTF', 'CM5',
84+
'MTD', 'ARS', 'ART', 'SDS', 'ARF', '4MO', 'BA', 'NAO',
85+
'CCN', 'NAW', 'BR', 'EPE', 'O2', 'BOG', 'PIN', 'MYQ',
86+
'MYR', 'POL', 'PIG', 'N8E', 'F3S', 'IOH', 'PIS', 'CD5',
87+
'BO3', 'BO4', 'DR6', 'NA2', 'PEO', 'NA6', 'CXE', 'NA5',
88+
'OS', 'REO', 'U1', 'VO4', 'FLH', 'HO', 'FLO', 'COH',
89+
'EEE', 'HG', 'PTN', 'ETI', 'SFO', 'ETN', 'MNR', 'VO3',
90+
'MNH', 'OSM', 'ROP', 'SFN', 'MNC', 'PO4', '3BR', 'FMI',
91+
'MN3', 'TRE', 'MN5', 'O', 'PBM', 'TRS', 'PT4', 'TRT',
92+
'MW3', 'MW2', 'MPD', 'HTO', 'MPO', 'MTO', '6WO', 'MPR',
93+
'MPS', 'AU', 'OXY', 'ACA', 'GD3', 'SUC', 'NEH', 'IDT',
94+
'SUL', 'P4C', 'ACN', '2OF', 'S0H', 'IDO', 'HFM', 'ACU',
95+
'ACT', '2OP', 'ACY', '2OS', 'NI', 'CU1', 'NO', 'NA',
96+
'MW1', 'TEE', 'FE2', 'TEP', '1BO', 'FEC', 'FEA', 'FEO',
97+
'FEL', 'CUZ', 'FES', 'CUA', 'CUO', 'CUN', 'CUM', 'CUL',
98+
'PTL', 'MN6', 'BRP', 'BRO', 'BRM', 'BRJ', 'MES', 'OUT',
99+
'AZI', 'MP1', '1PG', '1PE', 'MSM', 'HLT', 'MSF', 'CN1',
100+
'CL', 'TL', 'SGM', 'HO3', 'TE', 'TB', 'ICI', 'AG',
101+
'FPO', '1CP', 'AL', 'HOH', 'CNF', 'SCN', 'CNB', 'CE',
102+
'CNN', 'CNM', 'ICT', '15P', 'ZH3', 'RTC', '2NO', 'DXG',
103+
'DXE', 'VSO', 'MSE', 'FAR', 'FII', 'FPP', 'HFP', 'A4P', 'NAD',
104+
'ADP', 'AMP', 'APC', 'ATP', 'IMP', 'RVP', 'XMP', 'SAH', 'SAM',
105+
'FAD', 'NAP', 'NDP', 'UDP', 'ANP', 'COA', 'CME', 'UMP', 'NAI'])
106+
107+
108+
109+
class SearchAPI:
110+
def __init__(self, search_url):
111+
self.search_url = search_url
112+
self.search_options = '&wt=json&rows=100000'
113+
114+
115+
def url_response(self, url):
116+
"""
117+
Getting JSON response from URL
118+
:param url: String
119+
:return: JSON
120+
"""
121+
r = requests.get(url=url)
122+
# Status code 200 means 'OK'
123+
if r.status_code == 200:
124+
json_result = r.json()
125+
return json_result
126+
else:
127+
print(r.status_code, r.reason)
128+
return None
129+
130+
def run_search(self):
131+
"""
132+
Running search with terms
133+
Check pdbe search api documentation for more details
134+
:param pdbe_search_term: String
135+
:return: JSON
136+
"""
137+
full_query = self.search_url
138+
response = self.url_response(full_query)
139+
return response
140+
141+
142+
def get_sdf_hetid(pdbid, hetid, pdb_lig_filename):
143+
144+
"""
145+
downloads ligand sdf file using RCSB ligand expo API
146+
"""
147+
148+
hetid = str(hetid)
149+
sflag = False
150+
url = "http://ligand-expo.rcsb.org/files/%s/%s/isdf/" % (
151+
hetid[0], hetid)
152+
153+
try:
154+
response = urllib.request.urlopen(url)
155+
except Exception:
156+
print(" can't find the ligand for " + str(pdbid))
157+
else:
158+
lines = response.readlines()
159+
sdf_file = ''
160+
for line in lines:
161+
try:
162+
sdf_filename = line.decode().split('<li><a href="')[1].split('"')[0]
163+
if sdf_filename.startswith(pdbid.lower()) and sdf_filename[-7] != 'D':
164+
sdf_url = url + sdf_filename
165+
sdf_file = urllib.request.urlopen(sdf_url).read()
166+
break
167+
except Exception:
168+
continue
169+
if sdf_file:
170+
with open(pdb_lig_filename, 'wb') as f:
171+
f.write(sdf_file)
172+
sflag = True
173+
else:
174+
print(" can't find a ligand for " + str(pdbid) + " or it is disordered")
175+
return sflag
176+
177+
178+
def get_smi_from_hetid(hetid):
179+
"""
180+
obtain SMILES string for hetid using PDBe REST API
181+
"""
182+
smi_query = SearchAPI(
183+
search_url='https://www.ebi.ac.uk/pdbe/api/pdb/compound/summary/{}'.format(hetid))
184+
try:
185+
smi_call = smi_query.run_search()
186+
except Exception:
187+
print("problem accessing the compound API service")
188+
else:
189+
for x, y in smi_call.items():
190+
smi = (y[0]['smiles'][0]['name'])
191+
192+
return smi
193+
194+
195+
def get_3d_sdf_from_smi(smi, sdf_filename):
196+
197+
"""
198+
Generate 3D coordinate from SMILES string
199+
"""
200+
201+
sflag = False
202+
conformer_generator = conformer.ConformerGenerator()
203+
conformer_generator.settings.max_conformers = 1
204+
m = Molecule.from_string(smi)
205+
conformers = conformer_generator.generate(m)
206+
with io.EntryWriter(sdf_filename) as writer:
207+
writer.write(conformers[0].molecule)
208+
sflag = True
209+
210+
return sflag
211+
212+
213+
def get_hetids_from_pdbid(pdbid):
214+
"""
215+
Find hetids in PDB entry using PDBe REST API and then return unique hetid that
216+
are not present in the excluded_hetids list above
217+
"""
218+
chem_comp_query = SearchAPI(
219+
search_url='https://www.ebi.ac.uk/pdbe/api/pdb/entry/ligand_monomers/{}'.format(pdbid))
220+
try:
221+
cc_call = chem_comp_query.run_search()
222+
except Exception:
223+
print("problem accesing the entry API")
224+
else:
225+
all_cc_list = []
226+
for x, y in cc_call.items():
227+
for z in y:
228+
cc_id = (z['chem_comp_id'])
229+
all_cc_list.append(cc_id)
230+
cc_list = list(set(all_cc_list))
231+
232+
hetids = [x for x in cc_list if x not in excluded_hetids]
233+
234+
return hetids
235+
236+
237+
def write_conformers(mol_input, mol_reference, conf_file):
238+
"""
239+
Generate conformer and calculate rmsd wrt reference
240+
"""
241+
top_rmsd = -1
242+
conformer_generator = conformer.ConformerGenerator()
243+
conformer_generator.settings.superimpose_conformers_onto_reference = True
244+
conformers = conformer_generator.generate(mol_input)
245+
if conformers:
246+
best_rmsd = [
247+
MolecularDescriptors.rmsd(mol_reference, c.molecule, None,
248+
True, True, True)
249+
for c in conformers
250+
]
251+
best_rmsd.sort()
252+
top_rmsd = best_rmsd[0]
253+
with io.MoleculeWriter(conf_file) as mol_writer:
254+
for c in conformers:
255+
mol_writer.write(c.molecule)
256+
pass
257+
rmsd, norm_score = round(conformers[0].rmsd(), 2), conformers[
258+
0].normalised_score
259+
else:
260+
rmsd, norm_score = '', ''
261+
return rmsd, norm_score, top_rmsd
262+
263+
264+
def generate_conformers_for_all_hetids(pdbid, folder_path='conformers_'):
265+
"""
266+
generate conformer for all hetid
267+
"""
268+
working_dir = folder_path + pdbid
269+
if not os.path.exists(working_dir):
270+
os.makedirs(working_dir)
271+
272+
list_hetids = get_hetids_from_pdbid(pdbid)
273+
results = []
274+
for hetid in list_hetids:
275+
print('doing hetid ' + str(hetid))
276+
reference_flag, input_flag = False, False
277+
pdb_lig_filename = os.path.join(working_dir, str(hetid) + '_reference.sdf')
278+
model_lig_filename = os.path.join(working_dir, str(hetid) + '_model.sdf')
279+
reference_flag = get_sdf_hetid(pdbid, hetid, pdb_lig_filename)
280+
input_smi = get_smi_from_hetid(hetid)
281+
if input_smi:
282+
input_flag = get_3d_sdf_from_smi(input_smi, model_lig_filename)
283+
if reference_flag:
284+
mol_reader_ref = io.MoleculeReader(pdb_lig_filename)
285+
try:
286+
mol_reference = mol_reader_ref[0]
287+
except Exception:
288+
continue
289+
if input_flag:
290+
mol_reader_input = io.MoleculeReader(model_lig_filename)
291+
try:
292+
mol_input = mol_reader_input[0]
293+
except Exception:
294+
continue
295+
count = 0
296+
if reference_flag and input_flag:
297+
if len(mol_input.heavy_atoms) == len(mol_reference.heavy_atoms):
298+
rb = len([b for b in mol_input.bonds if b.is_rotatable])
299+
if len(mol_input.heavy_atoms) > 6 and rb > 0:
300+
# molecule needs hydrogens, set atom types,
301+
# and aromatic bonds to match mogul libraries
302+
mol_input.add_hydrogens()
303+
mol_reference.add_hydrogens()
304+
mol_input.standardise_delocalised_bonds()
305+
mol_reference.standardise_delocalised_bonds()
306+
mol_input.assign_bond_types(which='unknown')
307+
mol_reference.assign_bond_types(which='unknown')
308+
mol_input.standardise_aromatic_bonds()
309+
mol_reference.standardise_aromatic_bonds()
310+
mw = int(round(mol_input.molecular_weight, 0))
311+
conf_file = model_lig_filename.replace('.sdf', '_conformers.sdf')
312+
rmsd, norm_score, top_rmsd = write_conformers(mol_input,
313+
mol_reference,
314+
conf_file)
315+
output_list = ','.join(map(str,
316+
[pdbid, hetid, mw, rmsd,
317+
norm_score, top_rmsd]))
318+
results.append(output_list)
319+
return results
320+
321+
322+
###############################################################################
323+
# main
324+
###############################################################################
325+
326+
327+
def main():
328+
parser = argparse.ArgumentParser(description=__doc__)
329+
parser.add_argument("targets", help="list of PDB-codes", type=str)
330+
331+
args = parser.parse_args()
332+
input_filename = args.targets
333+
base_name = os.path.splitext(input_filename)[0]
334+
output_filename = base_name + "_CSD_Conformer_Results.csv"
335+
336+
with open(input_filename) as input_file:
337+
pdbs_raw = input_file.readlines()
338+
339+
pdbs = []
340+
for pdb_raw in pdbs_raw:
341+
pdb = pdb_raw.strip()
342+
if pdb and len(pdb) == 4:
343+
pdbs.append(pdb)
344+
else:
345+
print('this line is not a pdbid ' + str(pdb_raw))
346+
347+
total = str(len(pdbs))
348+
i = 1
349+
350+
with open(output_filename, 'w') as output_file:
351+
output_file.write('pdbid,hetid,mw,rmsd_lowest_energy_conformer,norm_score,lowest_rmsd\n')
352+
for pdbid in pdbs:
353+
print('processing ' + str(i) + ' of ' + total + ' PDB: ' + pdbid)
354+
i += 1
355+
results = generate_conformers_for_all_hetids(pdbid.strip())
356+
for result in results:
357+
output_file.write(result + '\n')
358+
359+
360+
if __name__ == '__main__':
361+
main()
362+
363+
364+
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#find_binding_conformation.py
2+
3+
We know that most pharmaceutically relevant compounds bind to their targets in a relaxed conformation. The challenge in discovery is to figure out rapidly which conformations are readily accessible for the molecules we are considering. There is now a new solution to address this based on statistical, rather than just energetic approaches.
4+
5+
Driven by the wealth and diversity of bond, angle and torsion information in the Cambridge Structural Database, the CSD Conformer Generator produces realistic ensembles of low energy ligand structures. These are ready to be exploited for drug design in the presence and also in the absence of detailed knowledge about the three-dimensional structure of the protein active site.
6+
7+
Starting from a list of PDB-codes, this script generates idealized conformers
8+
for ligands and evaluates their RMSD to the conformation in the PDB.
9+
10+
#Author
11+
'Brandl, Giangreco, Higueruelo, Schaerfer and Sykes'
12+
13+
#Requirements
14+
This script uses the CSD Python API and needs full installation of CSD.
15+
This script also uses PDBe's and RCSB's API to obtain PDB related information.
16+
17+
#Usage
18+
19+
```Python
20+
python find_binding_conformation.py pdb_example.txt
21+
```
22+
23+
#Output
24+
25+
Subdirectories for each PDB entry with the conformers generated for each ligand, and a spreadsheet (.csv) with the results of the comparison.
26+
27+
28+
For more details about conformer generator see this blog https://www.ccdc.cam.ac.uk/Community/blog/post-61/

0 commit comments

Comments
 (0)