Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from collections import namedtuple
2import warnings
3import urllib.request
4from urllib.error import URLError, HTTPError
5import json
6from io import StringIO, BytesIO
7from ase.io import read
# Root of the PubChem REST interface; every request URL in this module is
# built by appending path components to it.
base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# Pair of (search term, field name) describing a single PubChem query.
PubchemSearch = namedtuple('PubchemSearch', ['search', 'field'])
class PubchemData:
    """
    Container for a single entry retrieved from the PubChem database.

    It keeps the structure and the accompanying PubChem information
    side by side.

    Parameters:
        atoms:
            the structure belonging to the entry
        data:
            the additional information that came with the entry
    """

    def __init__(self, atoms, data):
        self.atoms, self.data = atoms, data

    def get_atoms(self):
        """Return the structure stored in this entry."""
        return self.atoms

    def get_pubchem_data(self):
        """Return the PubChem information stored in this entry."""
        return self.data
def search_pubchem_raw(search, field, silent=False, mock_test=False):
    """
    A helper function for searching pubchem.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, smiles string or conformer ID depending
            on the `field` you are searching
        field (str):
            the particular field you are searching with. 'name' searches
            common names, 'cid' searches the PubChem Chemical
            Identification Numbers, 'smiles' searches for compounds with
            the entered smiles string and 'conformers' looks up a single
            conformer by its conformer ID.
        silent (bool):
            if True, skip the check for additional conformers and the
            warning that goes with it
        mock_test (bool):
            for testing only. If True the stored sample response is used
            in place of a request to PubChem

    returns:
        data (str):
            a string containing the raw response from pubchem.

    raises:
        ValueError:
            if PubChem reports an HTTP error for the search or the
            servers cannot be reached. The underlying urllib error is
            attached as the cause.
    """
    suffix = 'sdf?record_type=3d'

    if field == 'conformers':
        # we don't use the "compound" flag when looking for conformers
        url = '{}/{}/{}/{}'.format(base_url, field, str(search),
                                   suffix)
    else:
        url = '{}/compound/{}/{}/{}'.format(base_url, field,
                                            str(search), suffix)

    if mock_test:  # for testing only
        r = BytesIO(test_output)
    else:
        try:
            r = urllib.request.urlopen(url)
        # HTTPError is a subclass of URLError, so it has to come first
        except HTTPError as e:
            err = ValueError('the search term {} could not be found'
                             ' for the field {}'.format(search, field))
            raise err from e
        except URLError as e:
            err = ValueError('Couldn\'t reach the pubchem servers, check'
                             ' your internet connection')
            raise err from e

    # read right away and close the response so that the connection is
    # not left open while the conformer lookup below is running
    with r:
        data = r.read().decode('utf-8')

    # check if there are conformers and warn the user if there are
    if field != 'conformers' and not silent:
        conformer_ids = available_conformer_search(search, field,
                                                   mock_test=mock_test)
        if len(conformer_ids) > 1:
            warnings.warn('The structure "{}" has more than one '
                          'conformer in PubChem. By default, the '
                          'first conformer is returned, please ensure'
                          ' you are using the structure you intend to'
                          ' or use the '
                          '`ase.data.pubchem.pubchem_conformer_search`'
                          ' function'.format(search))

    return data
def parse_pubchem_raw(data):
    """
    a helper function for parsing the returned pubchem entries

    Parameters:
        data (str):
            the raw output from pubchem in string form

    returns:
        atoms (ASE Atoms Object):
            An ASE atoms object containing the information from
            pubchem
        pubchem_data (dict):
            a dictionary containing the non-structural information
            from pubchem. Fields holding a single line are stored as a
            string, all other fields as a list of their non-empty lines

    raises:
        ValueError:
            if `data` does not contain a 'PUBCHEM_COMPOUND_CID' field
    """
    if 'PUBCHEM_COMPOUND_CID' not in data:
        raise ValueError('There was a problem with the data returned by '
                         'PubChem')
    f_like = StringIO(data)
    atoms = read(f_like, format='sdf')

    # further analyze the text returned from pubchem
    pubchem_data = {}
    # the data fields follow the first 'END\n'; splitting only once keeps
    # a later 'END\n' inside a field value from cutting off the rest
    other_info = data.split('END\n', 1)[1]
    # drop the '$$$$' terminator (and anything after it) without
    # truncating at a lone '$' inside a field value
    other_info = other_info.split('$$$$', 1)[0]
    # the structure of this string is > <field>\nentry_info\n
    for data_field in other_info.split('> <'):
        # only the first '>\n' closes the field name, any later one
        # belongs to the value
        field_name, separator, entry_value = data_field.partition('>\n')
        if not separator:
            # empty piece in front of the first field, nothing to store
            continue
        # split it into lines and remove the empty lines
        entry_value = [a for a in entry_value.splitlines() if a != '']
        if len(entry_value) == 1:
            entry_value = entry_value[0]
        pubchem_data[field_name] = entry_value

    # recover partial charges
    charge_lines = pubchem_data.get('PUBCHEM_MMFF94_PARTIAL_CHARGES')
    if charge_lines is not None:
        if isinstance(charge_lines, str):
            # a single-line field was collapsed to a string above; wrap
            # it so the slice below drops a line and not a character
            charge_lines = [charge_lines]
        atom_charges = [0.] * len(atoms)
        # the first entry just contains the number of atoms with charges,
        # each subsequent entry contains the index and charge of an atom
        for charge_line in charge_lines[1:]:
            i, charge = charge_line.split()
            # indices start at 1
            atom_charges[int(i) - 1] = float(charge)
        atoms.set_initial_charges(atom_charges)
    return atoms, pubchem_data
def analyze_input(name=None, cid=None, smiles=None, conformer=None,
                  silent=False):
    """
    Work out which search the caller asked for.

    Exactly one of `name`, `cid`, `smiles` and `conformer` has to be
    given (i.e. be something other than None). `silent` is accepted but
    not used here.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        search:
            the search term the user has entered
        field:
            the name of the field being asked for ('name', 'cid',
            'smiles' or 'conformers')

    raises:
        ValueError:
            if none or more than one search term was passed in
    """
    candidates = (('name', name), ('cid', cid), ('smiles', smiles),
                  ('conformers', conformer))
    # keep only the terms the caller actually supplied
    supplied = [(label, term) for label, term in candidates
                if term is not None]

    if len(supplied) > 1:
        raise ValueError('Only one search term my be entered a time.'
                         ' Please pass in only one of the following: '
                         'name, cid, smiles, confomer')
    if not supplied:
        raise ValueError('No search was entered.'
                         ' Please pass in only one of the following: '
                         'name, cid, smiles, confomer')

    field, search = supplied[0]
    return PubchemSearch(search, field)
def available_conformer_search(search, field, mock_test=False):
    """
    Helper function to get the conformer IDs. This searches pubchem for
    the conformers of a given structure and returns all the conformer ids
    of a structure.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, or smiles string depending on the
            `field` you are searching
        field (str):
            the particular field you are searching with. 'name' searches
            common names, 'cid' searches the PubChem Chemical
            Identification Numbers and 'smiles' searches for compounds
            with the entered smiles string.
        mock_test (bool):
            for testing only. If True the stored sample response is used
            in place of a request to PubChem

    returns:
        conformers_ids (list):
            a list of the conformer IDs from PubChem, this is different
            than the CID numbers

    raises:
        ValueError:
            if PubChem reports an HTTP error for the search or the
            servers cannot be reached. The underlying urllib error is
            attached as the cause.
    """
    suffix = 'conformers/JSON'
    url = '{}/compound/{}/{}/{}'.format(base_url, field, str(search),
                                        suffix)
    if mock_test:
        r = BytesIO(test_conformer_output)
    else:
        try:
            r = urllib.request.urlopen(url)
        # HTTPError is a subclass of URLError, so it has to come first
        except HTTPError as e:
            err = ValueError('the search term {} could not be found'
                             ' for the field {}'.format(search, field))
            raise err from e
        except URLError as e:
            err = ValueError('Couldn\'t reach the pubchem servers, check'
                             ' your internet connection')
            raise err from e

    # make sure the response is closed once its content has been read
    with r:
        record = r.read().decode('utf-8')
    record = json.loads(record)
    # note: cid = compound id != conformer id
    conformer_ids = record['InformationList']['Information'][0]['ConformerID']
    return conformer_ids
def pubchem_search(*args, mock_test=False, **kwargs):
    """
    Search PubChem for the field and search input on the argument passed in
    returning a PubchemData object. Note that only one search argument may
    be passed in at a time.

    Parameters:
        name (str):
            the common name of the compound you're searching for
        cid (str or int):
            the cid of the compound you're searching for
        smiles (str):
            the smiles string of the compound you're searching for
        conformer (str or int):
            the conformer id of the compound you're searching for
        silent (bool):
            if True, the check for additional conformers (and the warning
            it may issue) is skipped
        mock_test (bool):
            for testing only, passed on to `search_pubchem_raw`

    returns:
        result (PubchemData):
            a pubchem data object containing the information on the
            requested entry
    """
    search, field = analyze_input(*args, **kwargs)

    # `silent` is accepted by analyze_input (keyword or fifth positional
    # argument) but not returned by it, so pick it up here; otherwise it
    # would never reach search_pubchem_raw and have no effect
    if 'silent' in kwargs:
        silent = kwargs['silent']
    elif len(args) > 4:
        silent = args[4]
    else:
        silent = False

    raw_pubchem = search_pubchem_raw(search, field, silent=silent,
                                     mock_test=mock_test)
    atoms, data = parse_pubchem_raw(raw_pubchem)
    result = PubchemData(atoms, data)
    return result
def pubchem_conformer_search(*args, mock_test=False, **kwargs):
    """
    Look up every conformer PubChem lists for a compound.

    The conformer IDs of the compound are retrieved first, then each ID
    is fetched with `pubchem_search`. Note that only one search argument
    may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            a list containing the PubchemData objects of all the conformers
            for your search
    """
    term, kind = analyze_input(*args, **kwargs)
    ids = available_conformer_search(term, kind, mock_test=mock_test)
    # one lookup per conformer ID, in the order the IDs were returned
    return [pubchem_search(mock_test=mock_test, conformer=conformer_id)
            for conformer_id in ids]
def pubchem_atoms_search(*args, **kwargs):
    """
    Run `pubchem_search` with the given arguments and hand back only the
    atoms object of the entry that was found. Note that only one search
    argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        atoms (ASE Atoms Object):
            an ASE Atoms object containing the information on the
            requested entry
    """
    entry = pubchem_search(*args, **kwargs)
    return entry.get_atoms()
def pubchem_atoms_conformer_search(*args, **kwargs):
    """
    Run `pubchem_conformer_search` with the given arguments and hand back
    only the atoms objects of the conformers that were found. Note that
    only one search argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            a list containing the atoms objects of all the conformers
            for your search
    """
    return [entry.get_atoms()
            for entry in pubchem_conformer_search(*args, **kwargs)]
# Stored sample responses that stand in for PubChem when `mock_test` is set.

# sample sdf record (CID 222), one literal per line of the record
test_output = (
    b'222\n'
    b' -OEChem-10071914343D\n'
    b'\n'
    b' 4 3 0 0 0 0 0 0 0999 V2000\n'
    b' 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n'
    b' -0.4417 0.2906 0.8711 H 0 0 0 0 0 0 0 0 0 0 0 0\n'
    b' 0.7256 0.6896 -0.1907 H 0 0 0 0 0 0 0 0 0 0 0 0\n'
    b' 0.4875 -0.8701 0.2089 H 0 0 0 0 0 0 0 0 0 0 0 0\n'
    b' 1 2 1 0 0 0 0\n'
    b' 1 3 1 0 0 0 0\n'
    b' 1 4 1 0 0 0 0\n'
    b'M END\n'
    b'> <PUBCHEM_COMPOUND_CID>\n'
    b'222\n'
    b'\n'
    b'> <PUBCHEM_CONFORMER_RMSD>\n'
    b'0.4\n'
    b'\n'
    b'> <PUBCHEM_CONFORMER_DIVERSEORDER>\n'
    b'1\n'
    b'\n'
    b'> <PUBCHEM_MMFF94_PARTIAL_CHARGES>\n'
    b'4\n'
    b'1 -1.08\n'
    b'2 0.36\n'
    b'3 0.36\n'
    b'4 0.36\n'
    b'\n'
    b'> <PUBCHEM_EFFECTIVE_ROTOR_COUNT>\n'
    b'0\n'
    b'\n'
    b'> <PUBCHEM_PHARMACOPHORE_FEATURES>\n'
    b'1\n'
    b'1 1 cation\n'
    b'\n'
    b'> <PUBCHEM_HEAVY_ATOM_COUNT>\n'
    b'1\n'
    b'\n'
    b'> <PUBCHEM_ATOM_DEF_STEREO_COUNT>\n'
    b'0\n'
    b'\n'
    b'> <PUBCHEM_ATOM_UDEF_STEREO_COUNT>\n'
    b'0\n'
    b'\n'
    b'> <PUBCHEM_BOND_DEF_STEREO_COUNT>\n'
    b'0\n'
    b'\n'
    b'> <PUBCHEM_BOND_UDEF_STEREO_COUNT>\n'
    b'0\n'
    b'\n'
    b'> <PUBCHEM_ISOTOPIC_ATOM_COUNT>\n'
    b'0\n'
    b'\n'
    b'> <PUBCHEM_COMPONENT_COUNT>\n'
    b'1\n'
    b'\n'
    b'> <PUBCHEM_CACTVS_TAUTO_COUNT>\n'
    b'1\n'
    b'\n'
    b'> <PUBCHEM_CONFORMER_ID>\n'
    b'000000DE00000001\n'
    b'\n'
    b'> <PUBCHEM_MMFF94_ENERGY>\n'
    b'0\n'
    b'\n'
    b'> <PUBCHEM_FEATURE_SELFOVERLAP>\n'
    b'5.074\n'
    b'\n'
    b'> <PUBCHEM_SHAPE_FINGERPRINT>\n'
    b'260 1 18410856563934756871\n'
    b'\n'
    b'> <PUBCHEM_SHAPE_MULTIPOLES>\n'
    b'15.6\n'
    b'0.51\n'
    b'0.51\n'
    b'0.51\n'
    b'0\n'
    b'0\n'
    b'0\n'
    b'0\n'
    b'0\n'
    b'0\n'
    b'0\n'
    b'0\n'
    b'0\n'
    b'0\n'
    b'\n'
    b'> <PUBCHEM_SHAPE_SELFOVERLAP>\n'
    b'14.89\n'
    b'\n'
    b'> <PUBCHEM_SHAPE_VOLUME>\n'
    b'15.6\n'
    b'\n'
    b'> <PUBCHEM_COORDINATE_TYPE>\n'
    b'2\n'
    b'5\n'
    b'10\n'
    b'\n'
    b'$$$$\n'
)

# sample JSON reply to a conformer ID lookup for the same compound
test_conformer_output = (
    b'{\n'
    b' "InformationList": {\n'
    b' "Information": [\n'
    b' {\n'
    b' "CID": 222,\n'
    b' "ConformerID": [\n'
    b' "000000DE00000001"\n'
    b' ]\n'
    b' }\n'
    b' ]\n'
    b' }\n'
    b'}\n'
)