Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2ULM files 

3========= 

4 

5*Simple and efficient pythonic file-format* 

6 

7Stores ndarrays as binary data and Python's built-in datatypes 

8(bool, int, float, complex, str, dict, list, tuple, None) as json. 

9 

10.. autofunction:: open 

11.. autoexception:: InvalidULMFileError 

12 

13 

14File layout 

15----------- 

16 

17When there is only a single item:: 

18 

19 0: "- of Ulm" (magic prefix, ascii) 

20 8: " " (tag, ascii) 

21 24: version (int64) 

22 32: nitems (int64) 

23 40: 48 (position of offsets, int64) 

24 48: p0 (offset to json data, int64) 

25 56: array1, array2, ... (8-byte aligned ndarrays) 

26 p0: n (length of json data, int64) 

27 p0+8: json data 

28 p0+8+n: EOF 

29 

30 

31Examples 

32-------- 

33 

34Writing: 

35 

36>>> import numpy as np 

37>>> import ase.io.ulm as ulm 

38>>> with ulm.open('x.ulm', 'w') as w: 

39... w.write(a=np.ones(7), b=42, c='abc') 

40... w.write(d=3.14) 

41 

42 

43Reading: 

44 

45>>> r = ulm.open('x.ulm') 

46>>> print(r.c) 

47abc 

48>>> r.close() 

49 

50To see what's inside 'x.ulm' do this:: 

51 

52 $ ase ulm x.ulm 

53 x.ulm (tag: "", 1 item) 

54 item #0: 

55 { 

56 a: <ndarray shape=(7,) dtype=float64>, 

57 b: 42, 

58 c: abc, 

59 d: 3.14} 

60 

61 

62.. autoclass:: Writer 

63 :members: 

64 

65.. autoclass:: Reader 

66 :members: 

67 

68 

69More examples 

70------------- 

71 

72In the following we append to the ulm-file from above and demonstrate 

73how to write a big array in chunks: 

74 

75>>> w = ulm.open('x.ulm', 'a') 

76>>> w.add_array('bigarray', (10, 1000), float) 

77>>> for i in range(10): 

78... w.fill(np.ones(1000)) 

79... 

80>>> w.close() 

81 

82Now read first and second items: 

83 

84>>> with ulm.open('x.ulm') as r: 

85... print(r.keys()) 

86dict_keys(['a', 'b', 'c', 'd']) 

87>>> with ulm.open('x.ulm', index=1) as r: 

88... print(r.keys()) 

89dict_keys(['bigarray']) 

90 

91To get all the data, it is possible to iterate over the items in the file. 

92 

93>>> for i, r in enumerate(ulm.Reader('x.ulm')): 

94... for k in r.keys(): 

95... print(i, k) 

960 a 

970 b 

980 c 

990 d 

1001 bigarray 

101>>> r.close() 

102 

103The different parts (items) of the file are numbered by the index 

104argument: 

105 

106>>> r = ulm.Reader('x.ulm') 

107>>> r[1].bigarray.shape 

108(10, 1000) 

109>>> r.close() 

110 

111 

112Versions 

113-------- 

114 

1151) Initial version. 

116 

1172) Added support for big endian machines. Json data may now have 

118 _little_endian=False item. 

119 

1203) Changed magic string from "AFFormat" to "- of Ulm". 

121""" 

122 

123import os 

124import numbers 

125from pathlib import Path 

126from typing import Union, Set 

127 

128import numpy as np 

129 

130from ase.io.jsonio import encode, decode 

131from ase.utils import plural 

132 

133 

# Version number of the file format.  Writer stores it in the header of
# new files and requires it when appending to an existing file.
VERSION = 3
N1 = 42  # block size - max number of items: 1, N1, N1*N1, N1*N1*N1, ...

136 

137 

def open(filename, mode='r', index=None, tag=None):
    """Open ulm-file.

    filename: str
        Filename.
    mode: str
        Mode.  Must be 'r' for reading, 'w' for writing to a new file
        (overwriting an existing one) or 'a' for appending to an existing file.
    index: int
        Index of item to read.  Defaults to 0.  Only allowed for mode='r'.
    tag: str
        Magic ID string.  Only allowed for mode='w' and mode='a'.

    Returns a :class:`Reader` or a :class:`Writer` object.  May raise
    :class:`InvalidULMFileError`.  Raises :class:`ValueError` if *mode*
    is not one of 'r', 'w' and 'a'.
    """
    if mode == 'r':
        assert tag is None
        return Reader(filename, index or 0)
    # Compare against a tuple: a substring test like "mode in 'wa'" would
    # also accept '' and 'wa'.
    if mode not in ('w', 'a'):
        raise ValueError(
            "Invalid mode: {!r}.  Must be 'r', 'w' or 'a'.".format(mode))
    assert index is None
    return Writer(filename, mode, tag or '')

161 

162 

# Second name for open(); print_ulm_info() opens files through it.
ulmopen = open

164 

165 

def align(fd):
    """Pad file with '#' bytes up to the next 8 byte boundary.

    Nothing is written if the current position is already a multiple
    of 8.  Returns the (aligned) position.
    """
    position = fd.tell()
    padding = -position % 8
    if padding:
        fd.write(b'#' * padding)
    return position + padding

174 

175 

def writeint(fd, n, pos=None):
    """Write n as a little endian 64 bit integer.

    The integer is written at position pos if given, otherwise at the
    current position of fd.
    """
    if pos is not None:
        fd.seek(pos)
    number = np.array(n, np.int64)
    if np.little_endian:
        payload = number.tobytes()
    else:
        # Files are always little endian, so swap on big endian machines:
        payload = number.byteswap().tobytes()
    fd.write(payload)

184 

185 

def readints(fd, n):
    """Read n little endian 64 bit integers from current position of fd.

    Returns an int64 ndarray with n elements.
    """
    raw = fd.read(int(n * 8))
    ints = np.frombuffer(raw, dtype=np.int64, count=n)
    if np.little_endian:
        return ints
    # frombuffer() returns a readonly view, so an in-place byteswap
    # is not possible - make a swapped copy instead:
    return ints.byteswap()

193 

194 

def file_has_fileno(fd):
    """Tell whether file implements a working fileno() or not.

    array.tofile(fd) works only on files with fileno().
    numpy may write faster to physical files using fileno().

    For files without fileno() we use instead fd.write(array.tobytes()).
    Either way we need to distinguish."""

    try:
        # AttributeError: there is no fileno attribute.
        # IOError/OSError: there is one, but it refuses to work.
        fd.fileno()
    except (AttributeError, IOError):
        return False
    else:
        return True

210 

211 

class Writer:
    """Writer for ulm-files.

    Simple Python data is collected in a dictionary that sync() stores
    in encoded form as one item of the file; ndarrays are written
    directly to the file as binary data (see add_array() and fill()).
    """

    def __init__(self, fd, mode='w', tag='', data=None):
        """Create writer object.

        fd: str, Path or file-like object
            Filename or an already open binary file.
        mode: str
            Mode.  Must be 'w' for writing to a new file (overwriting an
            existing one) and 'a' for appending to an existing file.
            Appending to a filename that does not exist, or that is an
            empty file, starts a new file.
        tag: str
            Magic ID string.
        data: dict
            Dictionary to collect the data in.  Used by child() for
            child-writers.  Defaults to a new dictionary.
        """

        # Use a tuple: the substring test "mode in 'aw'" would also
        # accept '' and 'aw'.
        assert mode in ('a', 'w')

        # Header to be written later:
        self.header = b''

        if data is None:
            if np.little_endian:
                data = {}
            else:
                data = {'_little_endian': False}

        if isinstance(fd, str):
            fd = Path(fd)

        if mode == 'w' or (isinstance(fd, Path) and
                           not (fd.is_file() and
                                fd.stat().st_size > 0)):
            self.nitems = 0
            self.pos0 = 48
            self.offsets = np.array([-1], np.int64)

            if isinstance(fd, Path):
                fd = fd.open('wb')

            # File format identifier and other stuff:
            a = np.array([VERSION, self.nitems, self.pos0], np.int64)
            if not np.little_endian:
                a.byteswap(True)
            self.header = ('- of Ulm{0:16}'.format(tag).encode('ascii') +
                           a.tobytes() +
                           self.offsets.tobytes())
        else:
            opened_here = isinstance(fd, Path)
            if opened_here:
                fd = fd.open('r+b')

            try:
                version, self.nitems, self.pos0, offsets = (
                    read_header(fd)[1:])
                assert version == VERSION
            except BaseException:
                # Don't leak a file that we opened ourselves:
                if opened_here:
                    fd.close()
                raise

            # The table of offsets has room for 1, N1, N1*N1, ... items.
            # Pad what was read up to the capacity that fits nitems:
            n = 1
            while self.nitems > n:
                n *= N1
            padding = np.zeros(n - self.nitems, np.int64)
            self.offsets = np.concatenate((offsets, padding))
            fd.seek(0, 2)

        self.fd = fd
        self.hasfileno = file_has_fileno(fd)

        self.data = data

        # data for array being filled:
        self.nmissing = 0  # number of missing numbers
        self.shape = None
        self.dtype = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Add ndarray object.

        Set name, shape and dtype for array and fill in the data in chunks
        later with the fill() method.
        """

        # Check this first, so that a failure leaves both the file and
        # self.data untouched:
        assert self.nmissing == 0, 'last array not done'

        self._write_header()

        if isinstance(shape, int):
            shape = (shape,)

        shape = tuple(int(s) for s in shape)  # Convert np.int64 to int

        i = align(self.fd)

        # A name ending in '.' marks an entry that is not plain data.
        # Here: shape, dtype name and position in file of the array.
        self.data[name + '.'] = {
            'ndarray': (shape, np.dtype(dtype).name, i)}

        self.dtype = dtype
        self.shape = shape
        self.nmissing = np.prod(shape)

    def _write_header(self):
        """Write the pending header (if any) and forget it."""
        # We want to delay writing until there is any real data written.
        # Some people rely on zero file size.
        if self.header:
            self.fd.write(self.header)
            self.header = b''

    def fill(self, a):
        """Fill in ndarray chunks for array currently being written.

        The dtype of a must be the one given to add_array(), and its
        trailing dimensions must match those of the full array.
        """
        assert a.dtype == self.dtype
        assert a.shape[1:] == self.shape[len(self.shape) - a.ndim + 1:]
        self.nmissing -= a.size
        assert self.nmissing >= 0

        if self.hasfileno:
            a.tofile(self.fd)
        else:
            self.fd.write(a.tobytes())

    def sync(self):
        """Write data dictionary.

        Write bool, int, float, complex and str data, shapes and
        dtypes for ndarrays."""

        self._write_header()

        assert self.nmissing == 0
        i = self.fd.tell()
        s = encode(self.data).encode()
        writeint(self.fd, len(s))
        self.fd.write(s)

        n = len(self.offsets)
        if self.nitems >= n:
            # Table of offsets is full.  Write a new one that is N1 times
            # bigger at the end of the file and point the header to it:
            offsets = np.zeros(n * N1, np.int64)
            offsets[:n] = self.offsets
            self.pos0 = align(self.fd)

            buf = offsets if np.little_endian else offsets.byteswap()

            if self.hasfileno:
                buf.tofile(self.fd)
            else:
                self.fd.write(buf.tobytes())
            writeint(self.fd, self.pos0, 40)
            self.offsets = offsets

        # Register the new item in the table and update the item count
        # stored in the header (at byte 32):
        self.offsets[self.nitems] = i
        writeint(self.fd, i, self.pos0 + self.nitems * 8)
        self.nitems += 1
        writeint(self.fd, self.nitems, 32)
        self.fd.flush()
        self.fd.seek(0, 2)  # end of file
        # Start collecting data for the next item:
        if np.little_endian:
            self.data = {}
        else:
            self.data = {'_little_endian': False}

    def write(self, *args, **kwargs):
        """Write data.

        Examples::

            writer.write('n', 7)
            writer.write(n=7)
            writer.write(n=7, s='abc', a=np.zeros(3), abc=obj)

        If obj is not one of the supported data types (bool, int, float,
        complex, str, tuple, list, dict, None or ndarray) then it must
        have a obj.write(childwriter) method.
        """

        if args:
            name, value = args
            kwargs[name] = value

        self._write_header()

        for name, value in kwargs.items():
            if isinstance(value, (bool, int, float, complex,
                                  dict, list, tuple, str,
                                  type(None))):
                self.data[name] = value
            elif hasattr(value, '__array__'):
                value = np.asarray(value)
                if value.ndim == 0:
                    # Zero-dimensional array: store as a plain number.
                    self.data[name] = value.item()
                else:
                    self.add_array(name, value.shape, value.dtype)
                    self.fill(value)
            else:
                value.write(self.child(name))

    def child(self, name):
        """Create child-writer object.

        The child writes to the same file and collects its data in a
        dictionary stored under name + '.' in the data of this writer.
        """
        self._write_header()
        dct = self.data[name + '.'] = {}
        child = Writer(self.fd, data=dct)
        # The file already has its header (written just above).  Clear
        # the one prepared by the child so that it does not end up in
        # the middle of the file:
        child.header = b''
        return child

    def close(self):
        """Close file."""
        n = int('_little_endian' in self.data)
        if len(self.data) > n:
            # There is more than the "_little_endian" key.
            # Write that stuff before closing:
            self.sync()
        else:
            # Make sure header has been written (empty ulm-file):
            self._write_header()
        self.fd.close()

    def __len__(self):
        """Return number of items written so far."""
        return int(self.nitems)

424 

425 

class DummyWriter:
    """Stand-in for :class:`Writer` that throws everything away.

    All methods accept the same arguments as those of the real writer,
    but do nothing.
    """

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Do nothing."""

    def fill(self, a):
        """Do nothing."""

    def sync(self):
        """Do nothing."""

    def write(self, *args, **kwargs):
        """Do nothing."""

    def child(self, name):
        """Return the dummy writer itself."""
        return self

    def close(self):
        """Do nothing."""

    def __len__(self):
        # Nothing is ever written.
        return 0

453 

454 

def read_header(fd):
    """Read header of ulm-file.

    Returns the tuple (tag, version, nitems, pos0, offsets), where
    offsets is the table of offsets read from position pos0.  Raises
    InvalidULMFileError if the file does not begin with one of the two
    known magic strings.
    """
    fd.seek(0)
    magic = fd.read(8)
    if magic != b'- of Ulm' and magic != b'AFFormat':
        raise InvalidULMFileError('This is not an ULM formatted file.')
    tag = fd.read(16).decode('ascii').rstrip()
    version, nitems, pos0 = readints(fd, 3)
    # The table of offsets may be anywhere in the file:
    fd.seek(pos0)
    return tag, version, nitems, pos0, readints(fd, nitems)

464 

465 

class InvalidULMFileError(IOError):
    """Raised when a file does not start like an ULM formatted file."""

468 

469 

class Reader:
    """Reader for one item of an ulm-file.

    The data of the item can be accessed as attributes.  ndarrays are
    not read from the file before they are asked for.
    """

    def __init__(self, fd, index=0, data=None, _little_endian=None):
        """Create reader.

        fd: str, Path or file-like object
            Filename or an already open binary file.
        index: int
            Index of item to read.
        data: dict
            Already decoded data.  Used for the readers created by this
            class itself; when given, nothing is read from the header.
        """

        self._little_endian = _little_endian

        opened_here = not hasattr(fd, 'read')
        if opened_here:
            fd = Path(fd).open('rb')

        self._fd = fd
        self._index = index

        if data is None:
            try:
                (self._tag, self._version, self._nitems, self._pos0,
                 self._offsets) = read_header(fd)
                if self._nitems > 0:
                    data = self._read_data(index)
                else:
                    data = {}
            except BaseException:
                # Don't leak a file that we opened ourselves:
                if opened_here:
                    fd.close()
                raise

        self._parse_data(data)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def _parse_data(self, data):
        """Set up self._data from decoded data.

        Names ending in '.' are stripped of the dot and their values are
        replaced by a NDArrayReader (for 'ndarray' entries) or by a
        sub-reader (everything else).
        """
        self._data = {}
        for name, value in data.items():
            if name.endswith('.'):
                if 'ndarray' in value:
                    shape, dtype, offset = value['ndarray']
                    dtype = dtype.encode()  # compatibility with Numpy 1.4
                    value = NDArrayReader(self._fd,
                                          shape,
                                          np.dtype(dtype),
                                          offset,
                                          self._little_endian)
                else:
                    value = Reader(self._fd, data=value,
                                   _little_endian=self._little_endian)
                name = name[:-1]

            self._data[name] = value

    def get_tag(self):
        """Return special tag string."""
        return self._tag

    def keys(self):
        """Return list of keys."""
        return self._data.keys()

    def asdict(self):
        """Read everything now and convert to dict."""
        dct = {}
        for key, value in self._data.items():
            if isinstance(value, NDArrayReader):
                value = value.read()
            elif isinstance(value, Reader):
                value = value.asdict()
            dct[key] = value
        return dct

    __dir__ = keys  # needed for tab-completion

    def __getattr__(self, attr):
        """Return the data stored under the name attr.

        ndarrays are read from the file.  Raises AttributeError if there
        is no such name.
        """
        try:
            # Look in __dict__ instead of using self._data: this method
            # would call itself forever if _data has not been set yet.
            value = self.__dict__['_data'][attr]
        except KeyError:
            raise AttributeError(attr) from None
        if isinstance(value, NDArrayReader):
            return value.read()
        return value

    def __contains__(self, key):
        return key in self._data

    def __iter__(self):
        """Iterate over the items from the current one to the last.

        It is this reader itself that is yielded every time - with the
        data of the next item loaded.
        """
        yield self
        for i in range(self._index + 1, self._nitems):
            self._index = i
            data = self._read_data(i)
            self._parse_data(data)
            yield self

    def get(self, attr, value=None):
        """Get attr or value if no such attr."""
        try:
            return self.__getattr__(attr)
        except AttributeError:
            return value

    def proxy(self, name, *indices):
        """Return the NDArrayReader for the array called name.

        With indices, return a proxy for that part of the array instead.
        Nothing is read from the file.
        """
        value = self._data[name]
        assert isinstance(value, NDArrayReader)
        if indices:
            return value.proxy(*indices)
        return value

    def __len__(self):
        """Return number of items in the file."""
        return int(self._nitems)

    def _read_data(self, index):
        """Read and decode the data dictionary of item number index.

        Also updates self._little_endian.
        """
        self._fd.seek(self._offsets[index])
        size = int(readints(self._fd, 1)[0])
        data = decode(self._fd.read(size).decode(), False)
        self._little_endian = data.pop('_little_endian', True)
        return data

    def __getitem__(self, index):
        """Return Reader for item *index*."""
        data = self._read_data(index)
        return Reader(self._fd, index, data, self._little_endian)

    def tostr(self, verbose=False, indent=' '):
        """Return multi-line string showing the data.

        ndarrays are shown by shape and dtype only, unless verbose is
        true.
        """
        # NOTE(review): runs of spaces inside the string literals of this
        # method may have been collapsed where this source was taken
        # from - confirm against the upstream file.
        keys = sorted(self._data)
        strings = []
        for key in keys:
            value = self._data[key]
            if verbose and isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, NDArrayReader):
                s = '<ndarray shape={} dtype={}>'.format(value.shape,
                                                         value.dtype)
            elif isinstance(value, Reader):
                s = value.tostr(verbose, indent + ' ')
            else:
                # Line up continuation lines of multi-line values:
                s = str(value).replace('\n', '\n ' + ' ' * len(key) + indent)
            strings.append('{}{}: {}'.format(indent, key, s))
        return '{\n' + ',\n'.join(strings) + '}'

    def __str__(self):
        return self.tostr(False, '').replace('\n', ' ')

    def close(self):
        """Close the file."""
        self._fd.close()

609 

610 

class NDArrayReader:
    """Lazy reader for one ndarray stored in an ulm-file.

    Data is read from the file when the object is indexed or read()
    is called.
    """

    def __init__(self, fd, shape, dtype, offset, little_endian):
        """Create array reader.

        fd: file-like object
            Open binary file.
        shape: sequence of int
            Shape of the array.
        dtype: numpy dtype
            Data type of the array.
        offset: int
            Position in the file of the first byte of the array.
        little_endian: bool
            Whether the data in the file is little endian.
        """
        self.fd = fd
        self.hasfileno = file_has_fileno(fd)
        self.shape = tuple(shape)
        self.dtype = dtype
        self.offset = offset
        self.little_endian = little_endian

        self.ndim = len(self.shape)
        self.itemsize = dtype.itemsize
        self.size = np.prod(self.shape)
        self.nbytes = self.size * self.itemsize

        # Applied by __getitem__() to everything that is read:
        self.scale = 1.0
        self.length_of_last_dimension = None

    def __len__(self):
        return int(self.shape[0])  # Python-2.6 needs int

    def read(self):
        """Read and return the whole array."""
        return self[:]

    def __getitem__(self, i):
        """Read part of the array.

        i must be an integer or a slice and refers to the first
        dimension.  Slices may be empty and may have negative steps.
        """
        if isinstance(i, numbers.Integral):
            if i < 0:
                i += len(self)
            return self[i:i + 1][0]

        rows = range(*i.indices(len(self)))
        step = rows.step
        if len(rows) == 0:
            # Nothing selected: read zero rows instead of asking for a
            # negative number of items.
            first = 0
            nrows = 0
        else:
            # Smallest contiguous block of rows that covers the slice
            # (the first selected row is the highest one if step < 0):
            first = min(rows[0], rows[-1])
            nrows = max(rows[0], rows[-1]) - first + 1

        stride = np.prod(self.shape[1:], dtype=int)
        offset = self.offset + first * self.itemsize * stride
        self.fd.seek(offset)
        count = nrows * stride
        if self.hasfileno:
            a = np.fromfile(self.fd, self.dtype, count)
        else:
            # Not as fast, but works for reading from tar-files:
            a = np.frombuffer(self.fd.read(int(count * self.itemsize)),
                              self.dtype)
        a = a.reshape((nrows,) + self.shape[1:])
        if step != 1:
            # For step < 0 this starts from the last row of the block,
            # which is the first row of the slice.
            a = a[::step].copy()
        if self.little_endian != np.little_endian:
            # frombuffer() returns readonly array
            a = a.byteswap(inplace=a.flags.writeable)
        if self.length_of_last_dimension is not None:
            a = a[..., :self.length_of_last_dimension]
        if self.scale != 1.0:
            if not a.flags.writeable:
                # Arrays from frombuffer() can't be scaled in-place.
                a = a.copy()
            a *= self.scale
        return a

    def proxy(self, *indices):
        """Return reader for the sub-array at the given leading indices.

        Nothing is read from the file.  Without indices the proxy covers
        the whole array.
        """
        stride = self.size // len(self)
        start = 0
        for i, index in enumerate(indices):
            start += stride * index
            stride //= self.shape[i + 1]
        offset = self.offset + start * self.itemsize
        # Use len(indices) and not the loop variable, which is undefined
        # when there are no indices.
        p = NDArrayReader(self.fd, self.shape[len(indices):], self.dtype,
                          offset, self.little_endian)
        p.scale = self.scale
        return p

673 

674 

def print_ulm_info(filename, index=None, verbose=False):
    """Print overview of the contents of an ulm-file.

    filename: str
        Name of ulm-file.
    index: int
        Show only this item.  Default is to show all items.
    verbose: bool
        Also print the contents of the ndarrays.
    """
    # The context manager makes sure that the file is closed again -
    # also if something goes wrong while printing.
    with ulmopen(filename, 'r') as b:
        if index is None:
            indices = range(len(b))
        else:
            indices = [index]
        print('{0}  (tag: "{1}", {2})'.format(filename, b.get_tag(),
                                              plural(len(b), 'item')))
        for i in indices:
            print('item #{0}:'.format(i))
            print(b[i].tostr(verbose))

686 

687 

def copy(reader: Union[str, Path, Reader],
         writer: Union[str, Path, Writer],
         exclude: Set[str] = set(),
         name: str = '') -> None:
    """Copy from reader to writer except for keys in exclude.

    reader: str, Path or Reader
        Where to copy from.  Filenames are opened here and closed again
        when done.
    writer: str, Path or Writer
        Where to copy to.  Filenames are opened here and closed again
        when done.
    exclude: set of str
        Keys to skip.  Every key starts with a dot and the keys of
        sub-readers are appended to that of their parent:
        '.key' or '.key.subkey'.  The set is only read, never modified.
    name: str
        Key of the reader being copied ('' at the top level).  Used
        for the recursion into sub-readers.
    """
    close_reader = not isinstance(reader, Reader)
    if close_reader:
        reader = Reader(reader)
    try:
        close_writer = not isinstance(writer, Writer)
        if close_writer:
            writer = Writer(writer)
        try:
            for key, value in reader._data.items():
                if name + '.' + key in exclude:
                    continue
                if isinstance(value, NDArrayReader):
                    value = value.read()
                if isinstance(value, Reader):
                    copy(value, writer.child(key), exclude, name + '.' + key)
                else:
                    writer.write(key, value)
        except BaseException:
            if close_writer:
                # Release the file without storing half-copied data:
                writer.fd.close()
            raise
        if close_writer:
            writer.close()
    finally:
        # Also reached when something failed: don't leak the file.
        if close_reader:
            reader.close()

714 

715 

class CLICommand:
    """Manipulate/show content of ulm-file.

    The ULM file format is used for ASE's trajectory files,
    for GPAW's gpw-files and other things.

    Example (show first image of a trajectory file):

        ase ulm abc.traj -n 0 -v
    """

    @staticmethod
    def add_arguments(parser):
        """Add the arguments of the command to parser."""
        add = parser.add_argument
        add('filename', help='Name of ULM-file.')
        add('-n', '--index', type=int,
            help='Show only one index.  Default is to show all.')
        add('-d', '--delete', metavar='key1,key2,...',
            help='Remove key(s) from ULM-file.')
        add('-v', '--verbose', action='store_true', help='More output.')

    @staticmethod
    def run(args):
        """Remove keys from the file (--delete) or print its contents."""
        if args.delete:
            # copy() wants keys with a leading dot:
            exclude = {'.' + key for key in args.delete.split(',')}
            copy(args.filename, args.filename + '.temp', exclude)
            # os.replace() also overwrites an existing file on Windows,
            # where os.rename() fails:
            os.replace(args.filename + '.temp', args.filename)
        else:
            print_ulm_info(args.filename, args.index, verbose=args.verbose)