Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1"""Module to read and write atoms in PDB file format. 

2 

3See:: 

4 

5 http://www.wwpdb.org/documentation/file-format 

6 

7Note: The PDB format saves cell lengths and angles; hence the absolute 

8orientation is lost when saving. Saving and loading a file will 

9conserve the scaled positions, not the absolute ones. 

10""" 

11 

12import warnings 

13 

14import numpy as np 

15 

16from ase.atoms import Atoms 

17from ase.geometry import cellpar_to_cell 

18from ase.io.espresso import label_to_symbol 

19from ase.utils import reader, writer 

20 

21 

22def read_atom_line(line_full): 

23 """ 

24 Read atom line from pdb format 

25 HETATM 1 H14 ORTE 0 6.301 0.693 1.919 1.00 0.00 H 

26 """ 

27 

28 line = line_full.rstrip('\n') 

29 type_atm = line[0:6] 

30 if type_atm == "ATOM " or type_atm == "HETATM": 

31 

32 name = line[12:16].strip() 

33 

34 altloc = line[16] 

35 resname = line[17:21] 

36 # chainid = line[21] # Not used 

37 

38 resseq = int(line[22:26].split()[0]) # sequence identifier 

39 # icode = line[26] # insertion code, not used 

40 

41 # atomic coordinates 

42 try: 

43 coord = np.array([float(line[30:38]), 

44 float(line[38:46]), 

45 float(line[46:54])], dtype=np.float64) 

46 except ValueError: 

47 raise ValueError("Invalid or missing coordinate(s)") 

48 

49 # occupancy & B factor 

50 try: 

51 occupancy = float(line[54:60]) 

52 except ValueError: 

53 occupancy = None # Rather than arbitrary zero or one 

54 

55 if occupancy is not None and occupancy < 0: 

56 warnings.warn("Negative occupancy in one or more atoms") 

57 

58 try: 

59 bfactor = float(line[60:66]) 

60 except ValueError: 

61 # The PDB use a default of zero if the data is missing 

62 bfactor = 0.0 

63 

64 # segid = line[72:76] # not used 

65 symbol = line[76:78].strip().upper() 

66 

67 else: 

68 raise ValueError("Only ATOM and HETATM supported") 

69 

70 return symbol, name, altloc, resname, coord, occupancy, bfactor, resseq 

71 

72 

73@reader 

74def read_proteindatabank(fileobj, index=-1, read_arrays=True): 

75 """Read PDB files.""" 

76 images = [] 

77 orig = np.identity(3) 

78 trans = np.zeros(3) 

79 occ = [] 

80 bfactor = [] 

81 residuenames = [] 

82 residuenumbers = [] 

83 atomtypes = [] 

84 

85 symbols = [] 

86 positions = [] 

87 cell = None 

88 pbc = None 

89 

90 def build_atoms(): 

91 atoms = Atoms(symbols=symbols, 

92 cell=cell, pbc=pbc, 

93 positions=positions) 

94 

95 if not read_arrays: 

96 return atoms 

97 

98 info = {'occupancy': occ, 

99 'bfactor': bfactor, 

100 'residuenames': residuenames, 

101 'atomtypes': atomtypes, 

102 'residuenumbers': residuenumbers} 

103 for name, array in info.items(): 

104 if len(array) == 0: 

105 pass 

106 elif len(array) != len(atoms): 

107 warnings.warn('Length of {} array, {}, ' 

108 'different from number of atoms {}'. 

109 format(name, len(array), len(atoms))) 

110 else: 

111 atoms.set_array(name, np.array(array)) 

112 return atoms 

113 

114 for line in fileobj.readlines(): 

115 if line.startswith('CRYST1'): 

116 cellpar = [float(line[6:15]), # a 

117 float(line[15:24]), # b 

118 float(line[24:33]), # c 

119 float(line[33:40]), # alpha 

120 float(line[40:47]), # beta 

121 float(line[47:54])] # gamma 

122 cell = cellpar_to_cell(cellpar) 

123 pbc = True 

124 for c in range(3): 

125 if line.startswith('ORIGX' + '123'[c]): 

126 orig[c] = [float(line[10:20]), 

127 float(line[20:30]), 

128 float(line[30:40])] 

129 trans[c] = float(line[45:55]) 

130 

131 if line.startswith('ATOM') or line.startswith('HETATM'): 

132 # Atom name is arbitrary and does not necessarily 

133 # contain the element symbol. The specification 

134 # requires the element symbol to be in columns 77+78. 

135 # Fall back to Atom name for files that do not follow 

136 # the spec, e.g. packmol. 

137 

138 # line_info = symbol, name, altloc, resname, coord, occupancy, 

139 # bfactor, resseq 

140 line_info = read_atom_line(line) 

141 

142 try: 

143 symbol = label_to_symbol(line_info[0]) 

144 except (KeyError, IndexError): 

145 symbol = label_to_symbol(line_info[1]) 

146 

147 position = np.dot(orig, line_info[4]) + trans 

148 atomtypes.append(line_info[1]) 

149 residuenames.append(line_info[3]) 

150 if line_info[5] is not None: 

151 occ.append(line_info[5]) 

152 bfactor.append(line_info[6]) 

153 residuenumbers.append(line_info[7]) 

154 

155 symbols.append(symbol) 

156 positions.append(position) 

157 

158 if line.startswith('END'): 

159 # End of configuration reached 

160 # According to the latest PDB file format (v3.30), 

161 # this line should start with 'ENDMDL' (not 'END'), 

162 # but in this way PDB trajectories from e.g. CP2K 

163 # are supported (also VMD supports this format). 

164 atoms = build_atoms() 

165 images.append(atoms) 

166 occ = [] 

167 bfactor = [] 

168 residuenames = [] 

169 atomtypes = [] 

170 symbols = [] 

171 positions = [] 

172 cell = None 

173 pbc = None 

174 

175 if len(images) == 0: 

176 atoms = build_atoms() 

177 images.append(atoms) 

178 return images[index] 

179 

180 

181@writer 

182def write_proteindatabank(fileobj, images, write_arrays=True): 

183 """Write images to PDB-file.""" 

184 if hasattr(images, 'get_positions'): 

185 images = [images] 

186 

187 rotation = None 

188 if images[0].get_pbc().any(): 

189 from ase.geometry import cell_to_cellpar, cellpar_to_cell 

190 

191 currentcell = images[0].get_cell() 

192 cellpar = cell_to_cellpar(currentcell) 

193 exportedcell = cellpar_to_cell(cellpar) 

194 rotation = np.linalg.solve(currentcell, exportedcell) 

195 # ignoring Z-value, using P1 since we have all atoms defined explicitly 

196 format = 'CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1\n' 

197 fileobj.write(format % (cellpar[0], cellpar[1], cellpar[2], 

198 cellpar[3], cellpar[4], cellpar[5])) 

199 

200 # 1234567 123 6789012345678901 89 67 456789012345678901234567 890 

201 format = ('ATOM %5d %4s MOL 1 %8.3f%8.3f%8.3f%6.2f%6.2f' 

202 ' %2s \n') 

203 

204 # RasMol complains if the atom index exceeds 100000. There might 

205 # be a limit of 5 digit numbers in this field. 

206 MAXNUM = 100000 

207 

208 symbols = images[0].get_chemical_symbols() 

209 natoms = len(symbols) 

210 

211 for n, atoms in enumerate(images): 

212 fileobj.write('MODEL ' + str(n + 1) + '\n') 

213 p = atoms.get_positions() 

214 occupancy = np.ones(len(atoms)) 

215 bfactor = np.zeros(len(atoms)) 

216 if write_arrays: 

217 if 'occupancy' in atoms.arrays: 

218 occupancy = atoms.get_array('occupancy') 

219 if 'bfactor' in atoms.arrays: 

220 bfactor = atoms.get_array('bfactor') 

221 if rotation is not None: 

222 p = p.dot(rotation) 

223 for a in range(natoms): 

224 x, y, z = p[a] 

225 occ = occupancy[a] 

226 bf = bfactor[a] 

227 fileobj.write(format % ((a+1) % MAXNUM, symbols[a], 

228 x, y, z, occ, bf, symbols[a].upper())) 

229 fileobj.write('ENDMDL\n')