Source code for ezmsg.sigproc.util.buffer

import collections
import math
import typing
import warnings

# Placeholder type variable for an array object from whichever array library is in use.
Array = typing.TypeVar("Array")
# The array library module itself (passed to HybridBuffer as `array_namespace`); typed loosely.
ArrayNamespace = typing.Any
# A dtype object belonging to that array library; typed loosely.
DType = typing.Any
# Accepted values for HybridBuffer's `update_strategy` argument.
UpdateStrategy = typing.Literal["immediate", "threshold", "on_demand"]
# Accepted values for HybridBuffer's `overflow_strategy` argument.
OverflowStrategy = typing.Literal["grow", "raise", "drop", "warn-overwrite"]


[docs] class HybridBuffer: """A stateful, FIFO buffer that combines a deque for fast appends with a contiguous circular buffer for efficient, advancing reads. This buffer is designed to be agnostic to the array library used (e.g., NumPy, CuPy, PyTorch) via the Python Array API standard. Args: array_namespace: The array library (e.g., numpy, cupy) that conforms to the Array API. capacity: The current maximum number of samples to store in the circular buffer. other_shape: A tuple defining the shape of the non-sample dimensions. dtype: The data type of the samples, belonging to the provided array_namespace. update_strategy: The strategy for synchronizing the deque to the circular buffer (flushing). threshold: The number of samples to accumulate in the deque before flushing. Ignored if update_strategy is "immediate" or "on_demand". overflow_strategy: The strategy for handling overflow when the buffer is full. Options are "grow", "raise", "drop", or "warn-overwrite". If "grow" (default), the buffer will increase its capacity to accommodate new samples up to max_size. If "raise", an error will be raised when the buffer is full. If "drop", the overflowing samples will be ignored. If "warn-overwrite", a warning will be logged then the overflowing samples will overwrite previously-unread samples. max_size: The maximum size of the buffer in bytes. If the buffer exceeds this size, it will raise an error. warn_once: If True, will only warn once on overflow when using "warn-overwrite" strategy. """
[docs] def __init__( self, array_namespace: ArrayNamespace, capacity: int, other_shape: tuple[int, ...], dtype: DType, update_strategy: UpdateStrategy = "on_demand", threshold: int = 0, overflow_strategy: OverflowStrategy = "grow", max_size: int = 1024**3, # 1 GB default max size warn_once: bool = True, ): self.xp = array_namespace self._capacity = capacity self._deque = collections.deque() self._update_strategy = update_strategy self._threshold = threshold self._overflow_strategy = overflow_strategy self._max_size = max_size self._warn_once = warn_once self._buffer = self.xp.empty((capacity, *other_shape), dtype=dtype) self._head = 0 # Write pointer self._tail = 0 # Read pointer self._buff_unread = 0 # Number of unread samples in the circular buffer self._buff_read = 0 # Tracks samples read and still in buffer self._deque_len = 0 # Number of unread samples in the deque self._last_overflow = ( 0 # Tracks the last overflow count, overwritten or skipped ) self._warned = False # Tracks if we've warned already (for warn_once)
@property def capacity(self) -> int: """The maximum number of samples that can be stored in the buffer.""" return self._capacity
[docs] def available(self) -> int: """The total number of unread samples available (in buffer and deque).""" return self._buff_unread + self._deque_len
[docs] def is_empty(self) -> bool: """Returns True if there are no unread samples in the buffer or deque.""" return self.available() == 0
[docs] def is_full(self) -> bool: """Returns True if the buffer is full and cannot _flush_ more samples without overwriting.""" return self._buff_unread == self._capacity
[docs] def tell(self) -> int: """Returns the number of samples that have been read and are still in the buffer.""" return self._buff_read
[docs] def write(self, block: Array): """Appends a new block (an array of samples) to the internal deque.""" other_shape = self._buffer.shape[1:] if other_shape == (1,) and block.ndim == 1: block = block[:, self.xp.newaxis] if block.shape[1:] != other_shape: raise ValueError( f"Block shape {block.shape[1:]} does not match buffer's other_shape {other_shape}" ) # Most overflow strategies are handled during flush, but there are a couple # scenarios that can be evaluated on write to give immediate feedback. new_len = self._deque_len + block.shape[0] if new_len > self._capacity and self._overflow_strategy == "raise": raise OverflowError( f"Buffer overflow: {new_len} samples awaiting in deque exceeds buffer capacity {self._capacity}." ) elif new_len * block.dtype.itemsize > self._max_size: raise OverflowError( f"deque contents would exceed max_size ({self._max_size}) on subsequent flush." "Are you reading samples frequently enough?" ) self._deque.append(block) self._deque_len += block.shape[0] if self._update_strategy == "immediate" or ( self._update_strategy == "threshold" and (0 < self._threshold <= self._deque_len) ): self.flush()
def _estimate_overflow(self, n_samples: int) -> int: """ Estimates the number of samples that would overflow we requested n_samples from the buffer. """ if n_samples > self.available(): raise ValueError( f"Requested {n_samples} samples, but only {self.available()} are available." ) n_overflow = 0 if self._deque and (n_samples > self._buff_unread): # We would cause a flush, but would that cause an overflow? n_free = self._capacity - self._buff_unread n_overflow = max(0, self._deque_len - n_free) return n_overflow
[docs] def read( self, n_samples: int | None = None, ) -> Array: """ Retrieves the oldest unread samples from the buffer with padding and advances the read head. Args: n_samples: The number of samples to retrieve. If None, returns all unread samples. Returns: An array containing the requested samples. This may be a view or a copy. Note: The result may have more samples than the buffer.capacity as it may include samples from the deque in the output. """ n_samples = n_samples if n_samples is not None else self.available() data = None offset = 0 n_overflow = self._estimate_overflow(n_samples) if n_overflow > 0: first_read = self._buff_unread if (n_overflow - first_read) < self.capacity or ( self._overflow_strategy == "drop" ): # We can prevent the overflow (or at least *some* if using "drop" # strategy) by reading the samples in the buffer first to make room. data = self.xp.empty( (n_samples, *self._buffer.shape[1:]), dtype=self._buffer.dtype ) self.peek(first_read, out=data[:first_read]) offset += first_read self.seek(first_read) n_samples -= first_read if data is None: data = self.peek(n_samples) self.seek(data.shape[0]) else: d2 = self.peek(n_samples, out=data[offset:]) self.seek(d2.shape[0]) return data
[docs] def peek(self, n_samples: int | None = None, out: Array | None = None) -> Array: """ Retrieves the oldest unread samples from the buffer with padding without advancing the read head. Args: n_samples: The number of samples to retrieve. If None, returns all unread samples. out: Optionally, a destination array to store the samples. If provided, must have shape (n_samples, *other_shape) where other_shape matches the shape of the samples in the buffer. If `out` is provided then the data will always be copied into it, even if they are contiguous in the buffer. Returns: An array containing the requested samples. This may be a view or a copy. Note: The result may have more samples than the buffer.capacity as it may include samples from the deque in the output. """ if n_samples is None: n_samples = self.available() elif n_samples > self.available(): raise ValueError( f"Requested to peek {n_samples} samples, but only {self.available()} are available." ) if out is not None and out.shape[0] < n_samples: raise ValueError( f"Output array shape {out.shape} is smaller than requested {n_samples} samples." ) if n_samples == 0: return self._buffer[:0] self._flush_if_needed(n_samples=n_samples) if self._tail + n_samples > self._capacity: # discontiguous read (wraps around) part1_len = self._capacity - self._tail part2_len = n_samples - part1_len out = ( out if out is not None else self.xp.empty( (n_samples, *self._buffer.shape[1:]), dtype=self._buffer.dtype ) ) out[:part1_len] = self._buffer[self._tail :] out[part1_len:] = self._buffer[:part2_len] else: if out is not None: out[:] = self._buffer[self._tail : self._tail + n_samples] else: # No output array provided, just return a view out = self._buffer[self._tail : self._tail + n_samples] return out
[docs] def peek_at(self, idx: int, allow_flush: bool = False) -> Array: """ Retrieves a specific sample from the buffer without advancing the read head. Args: idx: The index of the sample to retrieve, relative to the read head. allow_flush: If True, allows flushing the deque to the buffer if the requested sample is not in the buffer. If False and the sample is in the deque, the sample will be retrieved from the deque (slow!). Returns: An array containing the requested sample. This may be a view or a copy. """ if idx < 0 or idx >= self.available(): raise IndexError(f"Index {idx} out of bounds for unread samples.") if not allow_flush and idx >= self._buff_unread: # The requested sample is in the deque. idx -= self._buff_unread deq_splits = self.xp.cumsum( [0] + [_.shape[0] for _ in self._deque], dtype=int ) arr_idx = self.xp.searchsorted(deq_splits, idx, side="right") - 1 idx -= deq_splits[arr_idx] return self._deque[arr_idx][idx : idx + 1] self._flush_if_needed(n_samples=idx + 1) # The requested sample is within the unread samples in the buffer. idx = (self._tail + idx) % self._capacity return self._buffer[idx : idx + 1]
[docs] def peek_last(self) -> Array: """ Retrieves the last sample in the buffer without advancing the read head. """ if self._deque: return self._deque[-1][-1:] elif self._buff_unread > 0: idx = (self._head - 1 + self._capacity) % self._capacity return self._buffer[idx : idx + 1] else: raise IndexError("Cannot peek last from an empty buffer.")
[docs] def seek(self, n_samples: int) -> int: """ Advances the read head by n_samples. Args: n_samples: The number of samples to seek. Will seek forward if positive or backward if negative. Returns: The number of samples actually skipped. """ self._flush_if_needed(n_samples=n_samples) n_to_seek = max(min(n_samples, self._buff_unread), -self._buff_read) if n_to_seek == 0: return 0 self._tail = (self._tail + n_to_seek) % self._capacity self._buff_unread -= n_to_seek self._buff_read += n_to_seek return n_to_seek
def _flush_if_needed(self, n_samples: int | None = None): if ( self._update_strategy == "on_demand" and self._deque and (n_samples is None or n_samples > self._buff_unread) ): self.flush()
[docs] def flush(self): """ Transfers all data from the deque to the circular buffer. Note: This may overwrite data depending on the overflow strategy, which will invalidate previous state variables. """ if not self._deque: return n_new = self._deque_len n_free = self._capacity - self._buff_unread n_overflow = max(0, n_new - n_free) # If new data is larger than buffer and overflow strategy is "warn-overwrite", # then we can take a shortcut and replace the entire buffer. if n_new >= self._capacity and self._overflow_strategy == "warn-overwrite": if n_overflow > 0 and (not self._warn_once or not self._warned): self._warned = True warnings.warn( f"Buffer overflow: {n_new} samples received, but only {self._capacity - self._buff_unread} available. " f"Overwriting {n_overflow} previous samples.", RuntimeWarning, ) # We need to grab the last `self._capacity` samples from the deque samples_to_copy = self._capacity copied_samples = 0 for block in reversed(self._deque): if copied_samples >= samples_to_copy: break n_to_copy = min(block.shape[0], samples_to_copy - copied_samples) start_idx = block.shape[0] - n_to_copy self._buffer[ samples_to_copy - copied_samples - n_to_copy : samples_to_copy - copied_samples ] = block[start_idx:] copied_samples += n_to_copy self._head = 0 self._tail = 0 self._buff_unread = self._capacity self._buff_read = 0 self._last_overflow = n_overflow else: if n_overflow > 0: if self._overflow_strategy == "raise": raise OverflowError( f"Buffer overflow: {n_new} samples received, but only {n_free} available." ) elif self._overflow_strategy == "warn-overwrite": if not self._warn_once or not self._warned: self._warned = True warnings.warn( f"Buffer overflow: {n_new} samples received, but only {n_free} available. " f"Overwriting {n_overflow} previous samples.", RuntimeWarning, ) # Move the tail forward to make room for the new data. self.seek(n_overflow) # Adjust the read pointer to account for the overflow. Should always be 0. 
self._buff_read = max(0, self._buff_read - n_overflow) self._last_overflow = n_overflow elif self._overflow_strategy == "drop": # Drop the overflow samples from the deque samples_to_drop = n_overflow while samples_to_drop > 0 and self._deque: block = self._deque[-1] if samples_to_drop >= block.shape[0]: samples_to_drop -= block.shape[0] self._deque.pop() else: block = self._deque.pop() self._deque.append(block[:-samples_to_drop]) samples_to_drop = 0 n_new -= n_overflow self._last_overflow = n_overflow elif self._overflow_strategy == "grow": self._grow_buffer(self._capacity + n_new) self._last_overflow = 0 # Copy data to buffer by iterating over the deque for block in self._deque: n_block = block.shape[0] space_til_end = self._capacity - self._head if n_block > space_til_end: # Two-part copy (wraps around) part1_len = space_til_end part2_len = n_block - part1_len self._buffer[self._head :] = block[:part1_len] self._buffer[:part2_len] = block[part1_len:] else: # Single-part copy self._buffer[self._head : self._head + n_block] = block self._head = (self._head + n_block) % self._capacity self._buff_unread += n_new if (self._buff_read > self._tail) or (self._tail > self._head): # We have wrapped around the buffer; our count of read samples # is simply the buffer capacity minus the count of unread samples. self._buff_read = self._capacity - self._buff_unread self._deque.clear() self._deque_len = 0
def _grow_buffer(self, min_capacity: int): """ Grows the buffer to at least min_capacity. This is a helper method for the overflow strategy "grow". """ if self._capacity >= min_capacity: return other_shape = self._buffer.shape[1:] max_capacity = self._max_size / ( self._buffer.dtype.itemsize * math.prod(other_shape) ) if min_capacity > max_capacity: raise OverflowError( f"Cannot grow buffer to {min_capacity} samples, " f"maximum capacity is {max_capacity} samples ({self._max_size} bytes)." ) new_capacity = min(max_capacity, max(self._capacity * 2, min_capacity)) new_buffer = self.xp.empty( (new_capacity, *other_shape), dtype=self._buffer.dtype ) # Copy existing data to new buffer total_samples = self._buff_read + self._buff_unread if total_samples > 0: start_idx = (self._tail - self._buff_read) % self._capacity stop_idx = (self._tail + self._buff_unread) % self._capacity if stop_idx > start_idx: # Data is contiguous new_buffer[:total_samples] = self._buffer[start_idx:stop_idx] else: # Data wraps around. We write it in 2 parts. part1_len = self._capacity - start_idx part2_len = stop_idx new_buffer[:part1_len] = self._buffer[start_idx:] new_buffer[part1_len : part1_len + part2_len] = self._buffer[:stop_idx] # self._buff_read stays the same self._tail = self._buff_read # self._buff_unread stays the same self._head = self._tail + self._buff_unread else: self._tail = 0 self._head = 0 self._buffer = new_buffer self._capacity = new_capacity