""" caching.py ----------- Functions and classes that help with tracking changes in ndarrays and clearing cached values based on those changes. """ import numpy as np import zlib import hashlib from functools import wraps from .constants import log from .util import is_sequence, now try: from collections.abc import Mapping except ImportError: from collections import Mapping try: # xxhash is roughly 5x faster than zlib.adler32 but is only # packaged in easy wheels on linux (`pip install xxhash`) # so we keep it a soft dependency import xxhash except ImportError: xxhash = None def tracked_array(array, dtype=None): """ Properly subclass a numpy ndarray to track changes. Avoids some pitfalls of subclassing by forcing contiguous arrays and does a view into a TrackedArray. Parameters ------------ array : array- like object To be turned into a TrackedArray dtype : np.dtype Which dtype to use for the array Returns ------------ tracked : TrackedArray Contains input array data """ # if someone passed us None, just create an empty array if array is None: array = [] # make sure it is contiguous then view it as our subclass tracked = np.ascontiguousarray( array, dtype=dtype).view(TrackedArray) # should always be contiguous here assert tracked.flags['C_CONTIGUOUS'] return tracked def cache_decorator(function): """ A decorator for class methods, replaces @property but will store and retrieve function return values in object cache. Parameters ------------ function : method This is used as a decorator: ``` @cache_decorator def foo(self, things): return 'happy days' ``` """ # use wraps to preserve docstring @wraps(function) def get_cached(*args, **kwargs): """ Only execute the function if its value isn't stored in cache already. """ self = args[0] # use function name as key in cache name = function.__name__ # store execution times tic = [now(), 0.0, 0.0] # do the dump logic ourselves to avoid # verifying cache twice per call self._cache.verify() tic[1] = now() # access cache dict to avoid automatic validation # since we already called cache.verify manually if name in self._cache.cache: # already stored so return value return self._cache.cache[name] # value not in cache so execute the function value = function(*args, **kwargs) tic[2] = now() # store the value if self._cache.force_immutable and hasattr( value, 'flags') and len(value.shape) > 0: value.flags.writeable = False self._cache.cache[name] = value # log both the function execution time and how long # it took to validate the state of the cache log.debug('`%s`: %.2Es, `cache.verify`: %.2Es', name, tic[2] - tic[1], tic[1] - tic[0]) return value # all cached values are also properties # so they can be accessed like value attributes # rather than functions return property(get_cached) class TrackedArray(np.ndarray): """ Subclass of numpy.ndarray that provides hash methods to track changes. General method is to aggressively set 'modified' flags on operations which might (but don't necessarily) alter the array, ideally we sometimes compute hashes when we don't need to, but we don't return wrong hashes ever. We store boolean modified flag for each hash type to make checks fast even for queries of different hashes. Methods ---------- md5 : str, hexadecimal MD5 of array crc : int, zlib crc32/adler32 checksum fast_hash : int, CRC or xxhash.xx64 """ def __array_finalize__(self, obj): """ Sets a modified flag on every TrackedArray This flag will be set on every change as well as during copies and certain types of slicing. """ self._modified_c = True self._modified_m = True self._modified_x = True if isinstance(obj, type(self)): obj._modified_c = True obj._modified_m = True obj._modified_x = True @property def mutable(self): return self.flags['WRITEABLE'] @mutable.setter def mutable(self, value): self.flags.writeable = value def md5(self): """ Return an MD5 hash of the current array. Returns ----------- md5 : str Hexadecimal MD5 of the array """ if self._modified_m or not hasattr(self, '_hashed_md5'): if self.flags['C_CONTIGUOUS']: hasher = hashlib.md5(self) self._hashed_md5 = hasher.hexdigest() else: # the case where we have sliced our nice # contiguous array into a non- contiguous block # for example (note slice *after* track operation): # t = util.tracked_array(np.random.random(10))[::-1] contiguous = np.ascontiguousarray(self) hasher = hashlib.md5(contiguous) self._hashed_md5 = hasher.hexdigest() self._modified_m = False return self._hashed_md5 def crc(self): """ A zlib.crc32 or zlib.adler32 checksum of the current data. Returns ----------- crc : int Checksum from zlib.crc32 or zlib.adler32 """ if self._modified_c or not hasattr(self, '_hashed_crc'): if self.flags['C_CONTIGUOUS']: self._hashed_crc = crc32(self) else: # the case where we have sliced our nice # contiguous array into a non- contiguous block # for example (note slice *after* track operation): # t = util.tracked_array(np.random.random(10))[::-1] contiguous = np.ascontiguousarray(self) self._hashed_crc = crc32(contiguous) self._modified_c = False return self._hashed_crc def _xxhash(self): """ An xxhash.b64 hash of the array. Returns ------------- xx : int xxhash.xxh64 hash of array. """ # repeat the bookkeeping to get a contiguous array inside # the function to avoid additional function calls # these functions are called millions of times so everything helps if self._modified_x or not hasattr(self, '_hashed_xx'): if self.flags['C_CONTIGUOUS']: self._hashed_xx = xxhash.xxh64(self).intdigest() else: # the case where we have sliced our nice # contiguous array into a non- contiguous block # for example (note slice *after* track operation): # t = util.tracked_array(np.random.random(10))[::-1] self._hashed_xx = xxhash.xxh64(np.ascontiguousarray(self)).intdigest() self._modified_x = False return self._hashed_xx def __hash__(self): """ Hash is required to return an int. Returns ----------- hash : int Result of fast_hash """ return self.fast_hash() def __iadd__(self, *args, **kwargs): """ In- place addition. The i* operations are in- place and modify the array, so we better catch all of them. """ self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__iadd__(*args, **kwargs) def __isub__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__isub__(*args, **kwargs) def __imul__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__imul__(*args, **kwargs) def __idiv__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__idiv__(*args, **kwargs) def __itruediv__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__itruediv__(*args, **kwargs) def __imatmul__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__imatmul__(*args, **kwargs) def __ipow__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__ipow__(*args, **kwargs) def __imod__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__imod__(*args, **kwargs) def __ifloordiv__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__ifloordiv__(*args, **kwargs) def __ilshift__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__ilshift__(*args, **kwargs) def __irshift__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__irshift__(*args, **kwargs) def __iand__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__iand__(*args, **kwargs) def __ixor__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__ixor__(*args, **kwargs) def __ior__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True return super(self.__class__, self).__ior__(*args, **kwargs) def __setitem__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True super(self.__class__, self).__setitem__(*args, **kwargs) def __setslice__(self, *args, **kwargs): self._modified_c = True self._modified_m = True self._modified_x = True super(self.__class__, self).__setslice__(*args, **kwargs) if xxhash is None: # otherwise use our fastest CRC fast_hash = crc else: # if xxhash is installed use it fast_hash = _xxhash class Cache(object): """ Class to cache values which will be stored until the result of an ID function changes. """ def __init__(self, id_function, force_immutable=False): """ Create a cache object. Parameters ------------ id_function : function Returns hashable value force_immutable : bool If set will make all numpy arrays read-only """ self._id_function = id_function # force stored numpy arrays to have flags.writable=False self.force_immutable = bool(force_immutable) # call the id function for initial value self.id_current = self._id_function() # a counter for locks self._lock = 0 # actual store for data self.cache = {} def delete(self, key): """ Remove a key from the cache. """ if key in self.cache: self.cache.pop(key, None) def verify(self): """ Verify that the cached values are still for the same value of id_function and delete all stored items if the value of id_function has changed. """ # if we are in a lock don't check anything if self._lock != 0: return # check the hash of our data id_new = self._id_function() # things changed if id_new != self.id_current: if len(self.cache) > 0: log.debug('%d items cleared from cache: %s', len(self.cache), str(list(self.cache.keys()))) # hash changed, so dump the cache # do it manually rather than calling clear() # as we are internal logic and can avoid function calls self.cache = {} # set the id to the new data hash self.id_current = id_new def clear(self, exclude=None): """ Remove all elements in the cache. """ if exclude is None: self.cache = {} else: self.cache = {k: v for k, v in self.cache.items() if k in exclude} def update(self, items): """ Update the cache with a set of key, value pairs without checking id_function. """ self.cache.update(items) if self.force_immutable: for k, v in self.cache.items(): if hasattr(v, 'flags') and len(v.shape) > 0: v.flags.writeable = False self.id_set() def id_set(self): """ Set the current ID to the value of the ID function. """ self.id_current = self._id_function() def __getitem__(self, key): """ Get an item from the cache. If the item is not in the cache, it will return None Parameters ------------- key : hashable Key in dict Returns ------------- cached : object, or None Object that was stored """ self.verify() if key in self.cache: return self.cache[key] return None def __setitem__(self, key, value): """ Add an item to the cache. Parameters ------------ key : hashable Key to reference value value : any Value to store in cache """ # dumpy cache if ID function has changed self.verify() # make numpy arrays read-only if asked to if self.force_immutable and hasattr(value, 'flags') and len(value.shape) > 0: value.flags.writeable = False # assign data to dict self.cache[key] = value return value def __contains__(self, key): self.verify() return key in self.cache def __len__(self): self.verify() return len(self.cache) def __enter__(self): self._lock += 1 def __exit__(self, *args): self._lock -= 1 self.id_current = self._id_function() class DataStore(Mapping): """ A class to store multiple numpy arrays and track them all for changes. Operates like a dict that only stores numpy.ndarray """ def __init__(self): self.data = {} def __iter__(self): return iter(self.data) def __delitem__(self, key): del self.data[key] @property def mutable(self): """ Is data allowed to be altered or not. Returns ----------- is_mutable : bool Can data be altered in the DataStore """ if not hasattr(self, '_mutable'): return True return self._mutable @mutable.setter def mutable(self, value): """ Is data allowed to be altered or not. Parameters ------------ is_mutable : bool Should data be allowed to be altered """ # make sure passed value is a bool is_mutable = bool(value) # apply the flag to any data stored for n, i in self.data.items(): i.mutable = value # save the mutable setting self._mutable = is_mutable def is_empty(self): """ Is the current DataStore empty or not. Returns ---------- empty : bool False if there are items in the DataStore """ if len(self.data) == 0: return True for v in self.data.values(): if is_sequence(v): if len(v) == 0: return True else: return False elif bool(np.isreal(v)): return False return True def clear(self): """ Remove all data from the DataStore. """ self.data = {} def __getitem__(self, key): return self.data[key] def __setitem__(self, key, data): """ Store an item in the DataStore """ # we shouldn't allow setting on immutable datastores if not self.mutable: raise ValueError('DataStore is configured immutable!') if hasattr(data, 'md5'): # don't bother to re-track TrackedArray tracked = data else: # otherwise wrap data tracked = tracked_array(data) # apply our mutability setting if hasattr(self, '_mutable'): # apply our mutability setting only if it was explicitly set tracked.mutable = self.mutable # store data self.data[key] = tracked def __contains__(self, key): return key in self.data def __len__(self): return len(self.data) def update(self, values): if not isinstance(values, dict): raise ValueError('Update only implemented for dicts') for key, value in values.items(): self[key] = value def md5(self): """ Get an MD5 reflecting everything in the DataStore. Returns ---------- md5 : str MD5 of data in hexadecimal """ hasher = hashlib.md5() for key in sorted(self.data.keys()): hasher.update(self.data[key].md5().encode('utf-8')) md5 = hasher.hexdigest() return md5 def crc(self): """ Get a CRC reflecting everything in the DataStore. Returns ---------- crc : int CRC of data """ # combine with a sum of every hash crc = sum(i.crc() for i in self.data.values()) return crc def fast_hash(self): """ Get a CRC32 or xxhash.xxh64 reflecting the DataStore. Returns ------------ hashed : int Checksum of data """ # combine every hash fast = sum(i.fast_hash() for i in self.data.values()) return fast def _fast_crc(count=25): """ On certain platforms/builds zlib.adler32 is substantially faster than zlib.crc32, but it is not consistent across Windows/Linux/OSX. This function runs a quick check (2ms on my machines) to determine the fastest hashing function available in zlib. Parameters ------------ count : int Number of repetitions to do on the speed trial Returns ---------- crc32 : function Either `zlib.adler32` or `zlib.crc32` """ import timeit # create an array of random numbers setup = 'import numpy, zlib;' setup += 'd = numpy.random.random((500,3));' # time crc32 crc32 = timeit.timeit(setup=setup, stmt='zlib.crc32(d)', number=count) # time adler32 adler32 = timeit.timeit(setup=setup, stmt='zlib.adler32(d)', number=count) if adler32 < crc32: return zlib.adler32 else: return zlib.crc32 # get the fastest CRC32 available on the # current platform when trimesh is imported crc32 = _fast_crc()