Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from collections import namedtuple 

2import warnings 

3import urllib.request 

4from urllib.error import URLError, HTTPError 

5import json 

6from io import StringIO, BytesIO 

7from ase.io import read 

8 

9 

# Root of the PubChem PUG REST service; every request URL built in this
# module starts with it.
base_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug'

# Lightweight (search, field) pair: the search term entered by the user
# and the name of the field that term should be looked up in.
PubchemSearch = namedtuple('PubchemSearch', ['search', 'field'])

13 

14 

class PubchemData:
    """
    Container pairing the structure of a PubChem entry with its metadata.

    Parameters:
        atoms:
            the structure belonging to the entry; stored unchanged on
            ``self.atoms``
        data:
            the remaining information on the entry; stored unchanged on
            ``self.data``
    """

    def __init__(self, atoms, data):
        self.atoms, self.data = atoms, data

    def get_atoms(self):
        """Return the structure stored on this entry."""
        return self.atoms

    def get_pubchem_data(self):
        """Return the non-structural data stored on this entry."""
        return self.data

28 

29 

def search_pubchem_raw(search, field, silent=False, mock_test=False):
    """
    A helper function for searching pubchem.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, smiles string or conformer ID depending
            on the `field` you are searching
        field (str):
            the particular field you are searching with. 'name' searches
            common names, 'cid' searches the PubChem Compound
            Identification numbers which can be found on their website,
            'smiles' searches for compounds with the entered smiles
            string, and 'conformers' requests a conformer by its
            conformer ID.
        silent (bool):
            if True, the additional lookup of the available conformers
            (and the warning issued when there is more than one) is
            skipped
        mock_test (bool):
            for testing only: read the canned `test_output` record instead
            of contacting PubChem

    returns:
        data (str):
            a string containing the raw response from pubchem.

    raises:
        ValueError:
            if PubChem answers with an HTTP error for this search, or if
            the servers cannot be reached. The underlying urllib error is
            attached as ``__cause__``.
    """
    suffix = 'sdf?record_type=3d'

    if field == 'conformers':
        # we don't use the "compound" flag when looking for conformers
        url = '{}/{}/{}/{}'.format(base_url, field, str(search), suffix)
    else:
        url = '{}/compound/{}/{}/{}'.format(base_url, field, str(search),
                                            suffix)

    if mock_test:  # for testing only
        r = BytesIO(test_output)
    else:
        try:
            r = urllib.request.urlopen(url)
        except HTTPError as e:
            # chain the original error instead of printing it, so callers
            # can still inspect the reason via __cause__
            raise ValueError('the search term {} could not be found'
                             ' for the field {}'.format(search, field)) from e
        except URLError as e:
            raise ValueError('Couldn\'t reach the pubchem servers, check'
                             ' your internet connection') from e

    # consume the response right away so the connection is always closed,
    # even if the conformer lookup below fails
    with r:
        data = r.read().decode('utf-8')

    # check if there are conformers and warn the user if there are
    if field != 'conformers' and not silent:
        conformer_ids = available_conformer_search(search, field,
                                                   mock_test=mock_test)
        if len(conformer_ids) > 1:
            warnings.warn('The structure "{}" has more than one '
                          'conformer in PubChem. By default, the '
                          'first conformer is returned, please ensure'
                          ' you are using the structure you intend to'
                          ' or use the '
                          '`ase.data.pubchem.pubchem_conformer_search`'
                          ' function'.format(search))

    return data

89 

90 

def parse_pubchem_raw(data):
    """
    a helper function for parsing the returned pubchem entries

    Parameters:
        data (str):
            the raw output from pubchem in string form

    returns:
        atoms (ASE Atoms Object):
            An ASE atoms object containing the information from
            pubchem. If the record lists MMFF94 partial charges they are
            set as the initial charges of the atoms.
        pubchem_data (dict):
            a dictionary containing the non-structural information
            from pubchem. Each value is a string when the field holds a
            single non-empty line and a list of strings otherwise.

    raises:
        Exception:
            if `data` does not contain a 'PUBCHEM_COMPOUND_CID' field
    """
    if 'PUBCHEM_COMPOUND_CID' not in data:
        raise Exception('There was a problem with the data returned by '
                        'PubChem')
    atoms = read(StringIO(data), format='sdf')

    # The data fields sit between the first 'END' line and the '$$$$'
    # record terminator. Splitting only once / on the full terminator
    # keeps values that happen to contain 'END' or '$' intact.
    other_info = data.split('END\n', 1)[1]
    other_info = other_info.split('$$$$', 1)[0]

    pubchem_data = {}
    # the structure of this string is > <field>\nentry_info\n
    for data_field in other_info.split('> <'):
        if not data_field.strip():
            # nothing but whitespace precedes the first field marker
            continue
        # split on the first '>\n' only, so a value containing that
        # sequence cannot break the unpacking
        field_name, entry_value = data_field.split('>\n', 1)
        # split it into lines and remove the empty lines
        entry_value = [line for line in entry_value.splitlines()
                       if line != '']
        if len(entry_value) == 1:
            entry_value = entry_value[0]
        pubchem_data[field_name] = entry_value

    # recover partial charges
    if 'PUBCHEM_MMFF94_PARTIAL_CHARGES' in pubchem_data:
        charge_lines = pubchem_data['PUBCHEM_MMFF94_PARTIAL_CHARGES']
        if isinstance(charge_lines, str):
            # a single-line entry was collapsed to a string above; wrap it
            # so the slice below drops a line rather than a character
            charge_lines = [charge_lines]
        atom_charges = [0.] * len(atoms)
        # the first entry just contains the number of atoms with charges,
        # each subsequent entry contains the index and charge of an atom
        for entry in charge_lines[1:]:
            index, charge = entry.split()
            # indices start at 1
            atom_charges[int(index) - 1] = float(charge)
        atoms.set_initial_charges(atom_charges)
    return atoms, pubchem_data

144 

145 

def analyze_input(name=None, cid=None, smiles=None, conformer=None,
                  silent=False):
    """
    Resolve the search keyword arguments into a single search term and
    the field that term belongs to.

    Parameters:
        see `ase.data.pubchem.pubchem_search`; `silent` is accepted but
        not used by this function

    returns:
        search:
            the search term the user has entered
        field:
            the name of the field being asked for

    raises:
        ValueError:
            if none or more than one of name, cid, smiles and conformer
            is given
    """
    # field name paired with the value supplied for it, in lookup order
    candidates = (
        ('name', name),
        ('cid', cid),
        ('smiles', smiles),
        ('conformers', conformer),
    )
    supplied = [(label, value) for label, value in candidates
                if value is not None]

    if len(supplied) > 1:
        raise ValueError('Only one search term my be entered a time.'
                         ' Please pass in only one of the following: '
                         'name, cid, smiles, confomer')
    if not supplied:
        raise ValueError('No search was entered.'
                         ' Please pass in only one of the following: '
                         'name, cid, smiles, confomer')

    field, search = supplied[0]
    return PubchemSearch(search, field)

180 

181 

def available_conformer_search(search, field, mock_test=False):
    """
    Helper function to get the conformer IDs. This searches pubchem for
    the conformers of a given structure and returns all the conformer ids
    of a structure.

    Parameters:
        search (str or int):
            the compound you are searching for. This can be either
            a common name, CID, or smiles string depending on the
            `field` you are searching
        field (str):
            the particular field you are searching with. 'name' searches
            common names, 'cid' searches the PubChem Compound
            Identification numbers which can be found on their website
            and 'smiles' searches for compounds with the entered smiles
            string.
        mock_test (bool):
            for testing only: read the canned `test_conformer_output`
            record instead of contacting PubChem

    returns:
        conformers_ids (list):
            a list of the conformer IDs from PubChem, this is different
            than the CID numbers

    raises:
        ValueError:
            if PubChem answers with an HTTP error for this search, or if
            the servers cannot be reached. The underlying urllib error is
            attached as ``__cause__``.
    """
    suffix = 'conformers/JSON'
    url = '{}/compound/{}/{}/{}'.format(base_url, field, str(search),
                                        suffix)
    if mock_test:
        r = BytesIO(test_conformer_output)
    else:
        try:
            r = urllib.request.urlopen(url)
        except HTTPError as e:
            err = ValueError('the search term {} could not be found'
                             ' for the field {}'.format(search, field))
            raise err from e
        except URLError as e:
            err = ValueError('Couldn\'t reach the pubchem servers, check'
                             ' your internet connection')
            raise err from e

    # the context manager makes sure the response is closed once read
    with r:
        record = json.loads(r.read().decode('utf-8'))

    # note: cid = compound id != conformer id
    conformer_ids = record['InformationList']['Information'][0]['ConformerID']
    return conformer_ids

227 

228 

def pubchem_search(*args, mock_test=False, **kwargs):
    """
    Search PubChem for the field and search input on the argument passed in
    returning a PubchemData object. Note that only one argument may be passed
    in at a time.

    Parameters:
        name (str):
            the common name of the compound you're searching for
        cid (str or int):
            the cid of the compound you're searching for
        smiles (str):
            the smiles string of the compound you're searching for
        conformer (str or int):
            the conformer id of the compound you're searching for
        silent (bool):
            if True, do not look up (and warn about) additional
            conformers of the requested compound
        mock_test (bool):
            for testing only: use the canned records of this module
            instead of contacting PubChem

    returns:
        result (PubchemData):
            a pubchem data object containing the information on the
            requested entry
    """
    # analyze_input validates the arguments, so a bad call still fails
    # here exactly as before
    search, field = analyze_input(*args, **kwargs)

    # `silent` is accepted by analyze_input (as keyword or as its fifth
    # positional parameter) but only search_pubchem_raw acts on it, so it
    # has to be forwarded explicitly to take effect
    if 'silent' in kwargs:
        silent = kwargs['silent']
    elif len(args) > 4:
        silent = args[4]
    else:
        silent = False

    raw_pubchem = search_pubchem_raw(search, field, silent=silent,
                                     mock_test=mock_test)
    atoms, data = parse_pubchem_raw(raw_pubchem)
    result = PubchemData(atoms, data)
    return result

256 

257 

def pubchem_conformer_search(*args, mock_test=False, **kwargs):
    """
    Look up every conformer PubChem lists for a compound.
    Note that only one search argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            a list containing the PubchemData objects of all the conformers
            for your search
    """
    search_term, search_field = analyze_input(*args, **kwargs)
    ids = available_conformer_search(search_term, search_field,
                                     mock_test=mock_test)
    # one separate search per conformer id, in the order the ids came back
    return [pubchem_search(mock_test=mock_test, conformer=conformer_id)
            for conformer_id in ids]

282 

283 

def pubchem_atoms_search(*args, **kwargs):
    """
    Run `pubchem_search` with the given arguments and hand back only the
    atoms object of the entry found. Note that only one search argument
    may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        atoms (ASE Atoms Object):
            an ASE Atoms object containing the information on the
            requested entry
    """
    entry = pubchem_search(*args, **kwargs)
    return entry.get_atoms()

299 

300 

def pubchem_atoms_conformer_search(*args, **kwargs):
    """
    Run `pubchem_conformer_search` with the given arguments and hand back
    only the atoms objects of the conformers found.
    Note that only one search argument may be passed in at a time.

    Parameters:
        see `ase.data.pubchem.pubchem_search`

    returns:
        conformers (list):
            a list containing the atoms objects of all the conformers
            for your search
    """
    return [entry.get_atoms()
            for entry in pubchem_conformer_search(*args, **kwargs)]

317 

318 

# Canned SDF record (CID 222) that search_pubchem_raw reads instead of
# querying PubChem when it is called with mock_test=True.
test_output = b'222\n -OEChem-10071914343D\n\n 4 3 0 0 0 0 0 0 0999 V2000\n 0.0000 0.0000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n -0.4417 0.2906 0.8711 H 0 0 0 0 0 0 0 0 0 0 0 0\n 0.7256 0.6896 -0.1907 H 0 0 0 0 0 0 0 0 0 0 0 0\n 0.4875 -0.8701 0.2089 H 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 1 0 0 0 0\n 1 3 1 0 0 0 0\n 1 4 1 0 0 0 0\nM END\n> <PUBCHEM_COMPOUND_CID>\n222\n\n> <PUBCHEM_CONFORMER_RMSD>\n0.4\n\n> <PUBCHEM_CONFORMER_DIVERSEORDER>\n1\n\n> <PUBCHEM_MMFF94_PARTIAL_CHARGES>\n4\n1 -1.08\n2 0.36\n3 0.36\n4 0.36\n\n> <PUBCHEM_EFFECTIVE_ROTOR_COUNT>\n0\n\n> <PUBCHEM_PHARMACOPHORE_FEATURES>\n1\n1 1 cation\n\n> <PUBCHEM_HEAVY_ATOM_COUNT>\n1\n\n> <PUBCHEM_ATOM_DEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_ATOM_UDEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_BOND_DEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_BOND_UDEF_STEREO_COUNT>\n0\n\n> <PUBCHEM_ISOTOPIC_ATOM_COUNT>\n0\n\n> <PUBCHEM_COMPONENT_COUNT>\n1\n\n> <PUBCHEM_CACTVS_TAUTO_COUNT>\n1\n\n> <PUBCHEM_CONFORMER_ID>\n000000DE00000001\n\n> <PUBCHEM_MMFF94_ENERGY>\n0\n\n> <PUBCHEM_FEATURE_SELFOVERLAP>\n5.074\n\n> <PUBCHEM_SHAPE_FINGERPRINT>\n260 1 18410856563934756871\n\n> <PUBCHEM_SHAPE_MULTIPOLES>\n15.6\n0.51\n0.51\n0.51\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n\n> <PUBCHEM_SHAPE_SELFOVERLAP>\n14.89\n\n> <PUBCHEM_SHAPE_VOLUME>\n15.6\n\n> <PUBCHEM_COORDINATE_TYPE>\n2\n5\n10\n\n$$$$\n'

# Canned JSON conformer listing that available_conformer_search reads
# instead of querying PubChem when it is called with mock_test=True.
test_conformer_output = b'{\n "InformationList": {\n "Information": [\n {\n "CID": 222,\n "ConformerID": [\n "000000DE00000001"\n ]\n }\n ]\n }\n}\n'