Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2ULM files
3=========
5*Simple and efficient pythonic file-format*
7Stores ndarrays as binary data and Python's built-in datatypes
8(bool, int, float, complex, str, dict, list, tuple, None) as json.
10.. autofunction:: open
11.. autoexception:: InvalidULMFileError
14File layout
15-----------
17When there is only a single item::
19 0: "- of Ulm" (magic prefix, ascii)
20 8: " " (tag, ascii)
21 24: version (int64)
22 32: nitems (int64)
23 40: 48 (position of offsets, int64)
24 48: p0 (offset to json data, int64)
25 56: array1, array2, ... (8-byte aligned ndarrays)
26 p0: n (length of json data, int64)
27 p0+8: json data
28 p0+8+n: EOF
31Examples
32--------
34Writing:
36>>> import numpy as np
37>>> import ase.io.ulm as ulm
38>>> with ulm.open('x.ulm', 'w') as w:
39... w.write(a=np.ones(7), b=42, c='abc')
40... w.write(d=3.14)
43Reading:
45>>> r = ulm.open('x.ulm')
46>>> print(r.c)
47abc
48>>> r.close()
50To see what's inside 'x.ulm' do this::
52 $ ase ulm x.ulm
53 x.ulm (tag: "", 1 item)
54 item #0:
55 {
56 a: <ndarray shape=(7,) dtype=float64>,
57 b: 42,
58 c: abc,
59 d: 3.14}
62.. autoclass:: Writer
63 :members:
65.. autoclass:: Reader
66 :members:
69More examples
70-------------
72In the following we append to the ulm-file from above and demonstrate
73how to write a big array in chunks:
75>>> w = ulm.open('x.ulm', 'a')
76>>> w.add_array('bigarray', (10, 1000), float)
77>>> for i in range(10):
78... w.fill(np.ones(1000))
79...
80>>> w.close()
82Now read first and second items:
84>>> with ulm.open('x.ulm') as r:
85... print(r.keys())
86dict_keys(['a', 'b', 'c', 'd'])
87>>> with ulm.open('x.ulm', index=1) as r:
88... print(r.keys())
89dict_keys(['bigarray'])
91To get all the data, it is possible to iterate over the items in the file.
93>>> for i, r in enumerate(ulm.Reader('x.ulm')):
94... for k in r.keys():
95... print(i, k)
960 a
970 b
980 c
990 d
1001 bigarray
101>>> r.close()
103The different parts (items) of the file are numbered by the index
104argument:
106>>> r = ulm.Reader('x.ulm')
107>>> r[1].bigarray.shape
108(10, 1000)
109>>> r.close()
112Versions
113--------
1151) Initial version.
1172) Added support for big endian machines. Json data may now have
118 _little_endian=False item.
1203) Changed magic string from "AFFormat" to "- of Ulm".
121"""
123import os
124import numbers
125from pathlib import Path
126from typing import Union, Set
128import numpy as np
130from ase.io.jsonio import encode, decode
131from ase.utils import plural
# File-format version: stored in the header of newly created files, and
# required to match when an existing file is opened for appending.
VERSION = 3
N1 = 42  # block size - max number of items: 1, N1, N1*N1, N1*N1*N1, ...
def open(filename, mode='r', index=None, tag=None):
    """Open ulm-file.

    filename: str
        Filename.
    mode: str
        Mode.  Must be 'r' for reading, 'w' for writing to a new file
        (overwriting an existing one) or 'a' for appending to an existing file.
    index: int
        Index of item to read.  Defaults to 0.  Only allowed when reading.
    tag: str
        Magic ID string.  Only allowed when writing or appending.

    Returns a :class:`Reader` or a :class:`Writer` object.  May raise
    :class:`InvalidULMFileError`.  Raises ValueError for an unknown mode.
    """
    if mode == 'r':
        assert tag is None
        return Reader(filename, index or 0)
    # Compare against the exact mode strings: a substring test such as
    # "mode not in 'wa'" would let '' and 'wa' slip through.
    if mode not in ('w', 'a'):
        raise ValueError(
            "Unknown mode {!r}: must be 'r', 'w' or 'a'".format(mode))
    assert index is None
    return Writer(filename, mode, tag or '')


# Alias that stays usable where the name "open" refers to the builtin.
ulmopen = open
def align(fd):
    """Pad the file with b'#' up to the next multiple of 8 bytes.

    Nothing is written when the current position is already aligned.
    Returns the (aligned) position.
    """
    position = fd.tell()
    padding = -position % 8  # bytes missing to reach the next boundary
    if padding:
        fd.write(b'#' * padding)
    return position + padding
def writeint(fd, n, pos=None):
    """Write n as a little-endian 64 bit integer.

    The value goes to position pos if given, otherwise to the current
    position of fd.
    """
    if pos is not None:
        fd.seek(pos)
    value = np.array(n, np.int64)
    if np.little_endian:
        payload = value.tobytes()
    else:
        # Native order is big endian: swap so the file is little endian.
        payload = value.byteswap().tobytes()
    fd.write(payload)
def readints(fd, n):
    """Read n little-endian 64 bit integers from the current position.

    Returns an int64 ndarray of length n.
    """
    raw = fd.read(int(n * 8))
    values = np.frombuffer(raw, dtype=np.int64, count=n)
    if np.little_endian:
        return values
    # frombuffer() returns a readonly view, so swap into a new array
    # instead of swapping in place.
    return values.byteswap()
def file_has_fileno(fd):
    """Return True if fd has a working fileno() method, else False.

    array.tofile(fd) works only on files with fileno(), and numpy may
    write faster to physical files using fileno().  For files without
    fileno() we use fd.write(array.tobytes()) instead, so callers need
    to know which kind of file they have.
    """
    try:
        # A missing method raises AttributeError; a method that exists
        # but is unsupported raises IOError/OSError when called.
        fd.fileno()
    except (AttributeError, IOError):
        return False
    else:
        return True
class Writer:
    """Write items (dictionaries of data and ndarrays) to an ulm-file.

    Plain Python values are collected in ``self.data`` and written as json
    by sync(); ndarrays are written directly to the file and only their
    shape, dtype and file offset are stored in the json data.
    """

    def __init__(self, fd, mode='w', tag='', data=None):
        """Create writer object.

        fd: str, Path or file object
            Filename or an already open binary file.
        mode: str
            Mode.  Must be 'w' for writing to a new file (overwriting an
            existing one) and 'a' for appending to an existing file.
        tag: str
            Magic ID string.
        data: dict
            Dictionary to collect the data in.  Used for child-writers;
            a fresh dictionary is created by default.
        """

        # Exact comparison: a substring test ("mode in 'aw'") would also
        # accept '' and 'aw'.
        assert mode in ('a', 'w')

        # Header to be written later:
        self.header = b''

        if data is None:
            if np.little_endian:
                data = {}
            else:
                data = {'_little_endian': False}

        if isinstance(fd, str):
            fd = Path(fd)

        if mode == 'w' or (isinstance(fd, Path) and
                           not (fd.is_file() and
                                fd.stat().st_size > 0)):
            # New file (or appending to a missing/empty one).
            self.nitems = 0
            self.pos0 = 48
            self.offsets = np.array([-1], np.int64)

            if isinstance(fd, Path):
                fd = fd.open('wb')

            # File format identifier and other stuff:
            a = np.array([VERSION, self.nitems, self.pos0], np.int64)
            if not np.little_endian:
                a.byteswap(True)
            self.header = ('- of Ulm{0:16}'.format(tag).encode('ascii') +
                           a.tobytes() +
                           self.offsets.tobytes())
        else:
            # Append to existing non-empty file.
            opened_here = isinstance(fd, Path)
            if opened_here:
                fd = fd.open('r+b')

            try:
                version, self.nitems, self.pos0, offsets = read_header(fd)[1:]
                assert version == VERSION
            except BaseException:
                # Do not leak a file that we opened ourselves.
                if opened_here:
                    fd.close()
                raise

            # Size of the offset table: smallest power of N1 that can hold
            # the items already present.
            n = 1
            while self.nitems > n:
                n *= N1
            padding = np.zeros(n - self.nitems, np.int64)
            self.offsets = np.concatenate((offsets, padding))
            fd.seek(0, 2)

        self.fd = fd
        self.hasfileno = file_has_fileno(fd)

        self.data = data

        # data for array being filled:
        self.nmissing = 0  # number of missing numbers
        self.shape = None
        self.dtype = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Add ndarray object.

        Set name, shape and dtype for array and fill in the data in chunks
        later with the fill() method.
        """

        # Check first, so that a failure leaves file and data untouched.
        assert self.nmissing == 0, 'last array not done'

        self._write_header()

        # Accept a single (Python or numpy) integer as a 1-d shape.
        if isinstance(shape, numbers.Integral):
            shape = (shape,)

        shape = tuple(int(s) for s in shape)  # Convert np.int64 to int

        i = align(self.fd)

        # The trailing '.' marks the entry as special (not plain json data).
        self.data[name + '.'] = {
            'ndarray': (shape, np.dtype(dtype).name, i)}

        self.dtype = dtype
        self.shape = shape
        self.nmissing = np.prod(shape)

    def _write_header(self):
        # We want to delay writing until there is any real data written.
        # Some people rely on zero file size.
        if self.header:
            self.fd.write(self.header)
            self.header = b''

    def fill(self, a):
        """Fill in ndarray chunks for array currently being written."""
        assert a.dtype == self.dtype
        assert a.shape[1:] == self.shape[len(self.shape) - a.ndim + 1:]
        self.nmissing -= a.size
        assert self.nmissing >= 0

        if self.hasfileno:
            a.tofile(self.fd)
        else:
            self.fd.write(a.tobytes())

    def sync(self):
        """Write data dictionary.

        Write bool, int, float, complex and str data, shapes and
        dtypes for ndarrays."""

        self._write_header()

        assert self.nmissing == 0
        i = self.fd.tell()
        s = encode(self.data).encode()
        writeint(self.fd, len(s))
        self.fd.write(s)

        n = len(self.offsets)
        if self.nitems >= n:
            # Offset table is full: write a table that is N1 times larger
            # at the end of the file and point the header to it.
            offsets = np.zeros(n * N1, np.int64)
            offsets[:n] = self.offsets
            self.pos0 = align(self.fd)

            buf = offsets if np.little_endian else offsets.byteswap()

            if self.hasfileno:
                buf.tofile(self.fd)
            else:
                self.fd.write(buf.tobytes())
            writeint(self.fd, self.pos0, 40)
            self.offsets = offsets

        self.offsets[self.nitems] = i
        writeint(self.fd, i, self.pos0 + self.nitems * 8)
        self.nitems += 1
        writeint(self.fd, self.nitems, 32)
        self.fd.flush()
        self.fd.seek(0, 2)  # end of file
        if np.little_endian:
            self.data = {}
        else:
            self.data = {'_little_endian': False}

    def write(self, *args, **kwargs):
        """Write data.

        Examples::

            writer.write('n', 7)
            writer.write(n=7)
            writer.write(n=7, s='abc', a=np.zeros(3), abc=obj)

        If obj is not one of the supported data types (bool, int, float,
        complex, tuple, list, dict, None or ndarray) then it must have a
        obj.write(childwriter) method.
        """

        if args:
            name, value = args
            kwargs[name] = value

        self._write_header()

        for name, value in kwargs.items():
            if isinstance(value, (bool, int, float, complex,
                                  dict, list, tuple, str,
                                  type(None))):
                self.data[name] = value
            elif hasattr(value, '__array__'):
                value = np.asarray(value)
                if value.ndim == 0:
                    # Scalars are stored as plain json values.
                    self.data[name] = value.item()
                else:
                    self.add_array(name, value.shape, value.dtype)
                    self.fill(value)
            else:
                value.write(self.child(name))

    def child(self, name):
        """Create child-writer object."""
        self._write_header()
        dct = self.data[name + '.'] = {}
        child = Writer(self.fd, data=dct)
        # The child shares our file and our header is already written.
        # Drop the header prepared by the child's constructor; otherwise
        # its first write would dump a second header into the file.
        child.header = b''
        return child

    def close(self):
        """Close file."""
        n = int('_little_endian' in self.data)
        if len(self.data) > n:
            # There is more than the "_little_endian" key.
            # Write that stuff before closing:
            self.sync()
        else:
            # Make sure header has been written (empty ulm-file):
            self._write_header()
        self.fd.close()

    def __len__(self):
        return int(self.nitems)
class DummyWriter:
    """Stand-in with the same methods as Writer that writes nothing."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def add_array(self, name, shape, dtype=float):
        """Do nothing."""

    def fill(self, a):
        """Do nothing."""

    def sync(self):
        """Do nothing."""

    def write(self, *args, **kwargs):
        """Do nothing."""

    def child(self, name):
        """Return the dummy writer itself."""
        return self

    def close(self):
        """Do nothing."""

    def __len__(self):
        return 0
def read_header(fd):
    """Read the header at the start of an ulm-file.

    Returns the tuple (tag, version, nitems, pos0, offsets).  Raises
    InvalidULMFileError if the file does not start with a known magic
    prefix.
    """
    fd.seek(0)
    magic = fd.read(8)
    if magic != b'- of Ulm' and magic != b'AFFormat':
        raise InvalidULMFileError('This is not an ULM formatted file.')
    raw_tag = fd.read(16)
    tag = raw_tag.decode('ascii').rstrip()
    version, nitems, pos0 = readints(fd, 3)
    # The offset table lives at pos0 and has one entry per item.
    fd.seek(pos0)
    return tag, version, nitems, pos0, readints(fd, nitems)
class InvalidULMFileError(IOError):
    """Raised when a file does not start with an ULM magic prefix."""
class Reader:
    """Read one item of an ulm-file.

    The keys of the item are available as attributes.  ndarrays are
    read from the file when the attribute is accessed.
    """

    def __init__(self, fd, index=0, data=None, _little_endian=None):
        """Create reader.

        fd: str, Path or file object
            Filename or an already open binary file.
        index: int
            Index of item to read.
        data: dict
            Already decoded data.  When given, nothing is read from the
            header of the file (used for child-readers and __getitem__).
        """

        self._little_endian = _little_endian

        opened_here = not hasattr(fd, 'read')
        if opened_here:
            fd = Path(fd).open('rb')

        self._fd = fd
        self._index = index

        if data is None:
            try:
                (self._tag, self._version, self._nitems, self._pos0,
                 self._offsets) = read_header(fd)
                if self._nitems > 0:
                    data = self._read_data(index)
                else:
                    data = {}
            except BaseException:
                # Do not leak a file that we opened ourselves.
                if opened_here:
                    fd.close()
                raise

        self._parse_data(data)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()

    def _parse_data(self, data):
        """Build self._data from decoded json data.

        Keys ending in '.' are special: they describe either an ndarray
        stored in the file or a nested dictionary (child-reader).
        """
        self._data = {}
        for name, value in data.items():
            if name.endswith('.'):
                if 'ndarray' in value:
                    shape, dtype, offset = value['ndarray']
                    # The dtype name is passed to numpy as the str it
                    # already is; no conversion to bytes is needed.
                    value = NDArrayReader(self._fd,
                                          shape,
                                          np.dtype(dtype),
                                          offset,
                                          self._little_endian)
                else:
                    value = Reader(self._fd, data=value,
                                   _little_endian=self._little_endian)
                name = name[:-1]

            self._data[name] = value

    def get_tag(self):
        """Return special tag string."""
        return self._tag

    def keys(self):
        """Return list of keys."""
        return self._data.keys()

    def asdict(self):
        """Read everything now and convert to dict."""
        dct = {}
        for key, value in self._data.items():
            if isinstance(value, NDArrayReader):
                value = value.read()
            elif isinstance(value, Reader):
                value = value.asdict()
            dct[key] = value
        return dct

    __dir__ = keys  # needed for tab-completion

    def __getattr__(self, attr):
        # Look _data up in the instance dict: going through self._data
        # would call __getattr__ again and recurse without end when _data
        # has not been set yet (object created without __init__).
        try:
            data = self.__dict__['_data']
        except KeyError:
            raise AttributeError(attr) from None
        try:
            value = data[attr]
        except KeyError:
            raise AttributeError(attr) from None
        if isinstance(value, NDArrayReader):
            return value.read()
        return value

    def __contains__(self, key):
        return key in self._data

    def __iter__(self):
        """Iterate over the items from the current index and onwards.

        The same Reader object is yielded every time; it is updated in
        place to hold the data of the next item.
        """
        yield self
        for i in range(self._index + 1, self._nitems):
            self._index = i
            data = self._read_data(i)
            self._parse_data(data)
            yield self

    def get(self, attr, value=None):
        """Get attr or value if no such attr."""
        try:
            return self.__getattr__(attr)
        except AttributeError:
            return value

    def proxy(self, name, *indices):
        """Return NDArrayReader for array *name* without reading it.

        With indices, a reader for the indexed sub-array is returned."""
        value = self._data[name]
        assert isinstance(value, NDArrayReader)
        if indices:
            return value.proxy(*indices)
        return value

    def __len__(self):
        return int(self._nitems)

    def _read_data(self, index):
        """Read and decode the json data of item *index*."""
        self._fd.seek(self._offsets[index])
        size = int(readints(self._fd, 1)[0])
        data = decode(self._fd.read(size).decode(), False)
        # Files without the key were written on little endian machines.
        self._little_endian = data.pop('_little_endian', True)
        return data

    def __getitem__(self, index):
        """Return Reader for item *index*."""
        data = self._read_data(index)
        return Reader(self._fd, index, data, self._little_endian)

    def tostr(self, verbose=False, indent=' '):
        """Return string representation of the data.

        With verbose=True, the contents of ndarrays are included instead
        of just shape and dtype."""
        keys = sorted(self._data)
        strings = []
        for key in keys:
            value = self._data[key]
            if verbose and isinstance(value, NDArrayReader):
                value = value.read()
            if isinstance(value, NDArrayReader):
                s = '<ndarray shape={} dtype={}>'.format(value.shape,
                                                         value.dtype)
            elif isinstance(value, Reader):
                s = value.tostr(verbose, indent + ' ')
            else:
                s = str(value).replace('\n', '\n ' + ' ' * len(key) + indent)
            strings.append('{}{}: {}'.format(indent, key, s))
        return '{\n' + ',\n'.join(strings) + '}'

    def __str__(self):
        return self.tostr(False, '').replace('\n', ' ')

    def close(self):
        """Close the underlying file."""
        self._fd.close()
class NDArrayReader:
    """Lazy reader for an ndarray stored in an open file.

    Indexing with an integer or a slice along the first axis reads only
    the rows that are needed.
    """

    def __init__(self, fd, shape, dtype, offset, little_endian):
        """Create reader for array of given shape and dtype at *offset*.

        little_endian tells the byte order of the data in the file."""
        self.fd = fd
        self.hasfileno = file_has_fileno(fd)
        self.shape = tuple(shape)
        self.dtype = dtype
        self.offset = offset
        self.little_endian = little_endian

        self.ndim = len(self.shape)
        self.itemsize = dtype.itemsize
        self.size = np.prod(self.shape)
        self.nbytes = self.size * self.itemsize

        # Optional post-processing applied by __getitem__:
        self.scale = 1.0
        self.length_of_last_dimension = None

    def __len__(self):
        return int(self.shape[0])  # Python-2.6 needs int

    def read(self):
        """Read and return the whole array."""
        return self[:]

    def __getitem__(self, i):
        """Read rows selected by integer or slice *i* (first axis only)."""
        if isinstance(i, numbers.Integral):
            n = len(self)
            if i < 0:
                i += n
            # Without this check an index below -n would be turned into a
            # slice that wraps around and silently returns the wrong row.
            if not 0 <= i < n:
                raise IndexError('index out of range')
            return self[i:i + 1][0]

        start, stop, step = i.indices(len(self))
        # Contiguous block of rows [first, last) covering the selection.
        if step > 0:
            first = start
            last = max(start, stop)
        else:
            # Rows start, start + step, ... down to (not including) stop.
            first = stop + 1
            last = max(start + 1, first)
        nrows = last - first

        stride = np.prod(self.shape[1:], dtype=int)
        count = int(nrows * stride)
        if count == 0:
            a = np.empty(0, self.dtype)
        else:
            self.fd.seek(self.offset + first * self.itemsize * stride)
            if self.hasfileno:
                a = np.fromfile(self.fd, self.dtype, count)
            else:
                # Not as fast, but works for reading from tar-files:
                a = np.frombuffer(self.fd.read(int(count * self.itemsize)),
                                  self.dtype)
        a = a.reshape((nrows,) + self.shape[1:])
        if step != 1:
            # For a negative step this starts at the last row of the
            # block, which is row *start*.
            a = a[::step].copy()
        if self.little_endian != np.little_endian:
            # frombuffer() returns readonly array
            a = a.byteswap(inplace=a.flags.writeable)
        if self.length_of_last_dimension is not None:
            a = a[..., :self.length_of_last_dimension]
        if self.scale != 1.0:
            if not a.flags.writeable:
                # frombuffer() data cannot be scaled in place.
                a = a.copy()
            a *= self.scale
        return a

    def proxy(self, *indices):
        """Return reader for the sub-array selected by leading *indices*."""
        stride = self.size // len(self)
        start = 0
        for i, index in enumerate(indices):
            start += stride * index
            stride //= self.shape[i + 1]
        offset = self.offset + start * self.itemsize
        # One leading axis is consumed per index given.
        p = NDArrayReader(self.fd, self.shape[len(indices):], self.dtype,
                          offset, self.little_endian)
        p.scale = self.scale
        return p
def print_ulm_info(filename, index=None, verbose=False):
    """Print tag and contents of ulm-file.

    filename: str
        Name of ulm-file.
    index: int
        Show only this item.  Default is to show all items.
    verbose: bool
        Also print the contents of ndarrays.
    """
    # Use the reader as a context manager so the file is always closed.
    with ulmopen(filename, 'r') as b:
        if index is None:
            indices = range(len(b))
        else:
            indices = [index]
        print('{0} (tag: "{1}", {2})'.format(filename, b.get_tag(),
                                             plural(len(b), 'item')))
        for i in indices:
            print('item #{0}:'.format(i))
            print(b[i].tostr(verbose))
def copy(reader: Union[str, Path, Reader],
         writer: Union[str, Path, Writer],
         exclude: Set[str] = set(),
         name: str = '') -> None:
    """Copy from reader to writer except for keys in exclude.

    reader and writer may be filenames; then the files are opened here
    and closed again before returning.  Keys in exclude are written with
    a leading dot and dots between levels (like '.a' or '.a.b').  name
    is the dotted path of the level being copied and is used by the
    recursive calls.
    """
    # Note: the default for exclude is shared between calls, but it is
    # only read, never modified.
    close_reader = not isinstance(reader, Reader)
    if close_reader:
        reader = Reader(reader)
    try:
        close_writer = not isinstance(writer, Writer)
        if close_writer:
            writer = Writer(writer)
        try:
            for key, value in reader._data.items():
                if name + '.' + key in exclude:
                    continue
                if isinstance(value, NDArrayReader):
                    value = value.read()
                if isinstance(value, Reader):
                    copy(value, writer.child(key), exclude, name + '.' + key)
                else:
                    writer.write(key, value)
        except BaseException:
            # Release the file we opened, without trying to write out
            # the incomplete data.
            if close_writer:
                writer.fd.close()
            raise
        if close_writer:
            writer.close()
    finally:
        if close_reader:
            reader.close()
class CLICommand:
    """Manipulate/show content of ulm-file.

    The ULM file format is used for ASE's trajectory files,
    for GPAW's gpw-files and other things.

    Example (show first image of a trajectory file):

        ase ulm abc.traj -n 0 -v
    """

    @staticmethod
    def add_arguments(parser):
        """Add the command line arguments of this command to parser."""
        add = parser.add_argument
        add('filename', help='Name of ULM-file.')
        add('-n', '--index', type=int,
            help='Show only one index. Default is to show all.')
        add('-d', '--delete', metavar='key1,key2,...',
            help='Remove key(s) from ULM-file.')
        add('-v', '--verbose', action='store_true', help='More output.')

    @staticmethod
    def run(args):
        """Delete keys from the file or print its contents."""
        if args.delete:
            exclude = {'.' + key for key in args.delete.split(',')}
            temp = args.filename + '.temp'
            copy(args.filename, temp, exclude)
            # os.replace() overwrites an existing file on all platforms;
            # os.rename() fails on Windows when the target exists.
            os.replace(temp, args.filename)
        else:
            print_ulm_info(args.filename, args.index, verbose=args.verbose)