# Source code for ezmsg.sigproc.util.buffer

import collections
import math
import typing
import warnings

# Placeholder for whichever array type the chosen array namespace produces.
Array = typing.TypeVar("Array")
# The array library module and its dtype objects are intentionally untyped.
ArrayNamespace = typing.Any
DType = typing.Any
# When queued blocks are moved from the deque into the circular buffer.
UpdateStrategy = typing.Literal[
    "immediate",
    "threshold",
    "on_demand",
]
# What happens when a flush needs more room than the circular buffer has free.
OverflowStrategy = typing.Literal[
    "grow",
    "raise",
    "drop",
    "warn-overwrite",
]



[docs]
class HybridBuffer:
    """A stateful, FIFO buffer that combines a deque for fast appends with a
    contiguous circular buffer for efficient, advancing reads.

    This buffer is designed to be agnostic to the array library used (e.g., NumPy,
    CuPy, PyTorch) via the Python Array API standard.

    Args:
        array_namespace: The array library (e.g., numpy, cupy) that conforms to the Array API.
        capacity: The current maximum number of samples to store in the circular buffer.
        other_shape: A tuple defining the shape of the non-sample dimensions.
        dtype: The data type of the samples, belonging to the provided array_namespace.
        update_strategy: The strategy for synchronizing the deque to the circular buffer (flushing).
        threshold: The number of samples to accumulate in the deque before flushing.
            Only used when update_strategy is "threshold"; it is ignored for "immediate"
            and "on_demand". A value of 0 disables threshold-triggered flushing.
        overflow_strategy: The strategy for handling overflow when the buffer is full.
            Options are "grow", "raise", "drop", or "warn-overwrite". If "grow" (default), the buffer will
            increase its capacity to accommodate new samples up to max_size. If "raise", an error will be
            raised when the buffer is full. If "drop", the overflowing samples will be ignored.
            If "warn-overwrite", a warning will be logged then the overflowing samples will
            overwrite previously-unread samples.
        max_size: The maximum size of the buffer in bytes.
            An OverflowError is raised if growing the buffer, or the data queued in the
            deque, would exceed this size.
        warn_once: If True, will only warn once on overflow when using "warn-overwrite" strategy.
    """


[docs]
    def __init__(
        self,
        array_namespace: ArrayNamespace,
        capacity: int,
        other_shape: tuple[int, ...],
        dtype: DType,
        update_strategy: UpdateStrategy = "on_demand",
        threshold: int = 0,
        overflow_strategy: OverflowStrategy = "grow",
        max_size: int = 1024**3,  # 1 GB default max size
        warn_once: bool = True,
    ):
        self.xp = array_namespace
        self._capacity = capacity
        self._deque = collections.deque()
        self._update_strategy = update_strategy
        self._threshold = threshold
        self._overflow_strategy = overflow_strategy
        self._max_size = max_size
        self._warn_once = warn_once

        self._buffer = self.xp.empty((capacity, *other_shape), dtype=dtype)
        self._head = 0  # Write pointer
        self._tail = 0  # Read pointer
        self._buff_unread = 0  # Number of unread samples in the circular buffer
        self._buff_read = 0  # Tracks samples read and still in buffer
        self._deque_len = 0  # Number of unread samples in the deque
        self._last_overflow = (
            0  # Tracks the last overflow count, overwritten or skipped
        )
        self._warned = False  # Tracks if we've warned already (for warn_once)


    @property
    def capacity(self) -> int:
        """The maximum number of samples that can be stored in the buffer."""
        return self._capacity


[docs]
    def available(self) -> int:
        """The total number of unread samples available (in buffer and deque)."""
        return self._buff_unread + self._deque_len



[docs]
    def is_empty(self) -> bool:
        """Returns True if there are no unread samples in the buffer or deque."""
        return self.available() == 0



[docs]
    def is_full(self) -> bool:
        """Returns True if the buffer is full and cannot _flush_ more samples without overwriting."""
        return self._buff_unread == self._capacity



[docs]
    def tell(self) -> int:
        """Returns the number of samples that have been read and are still in the buffer."""
        return self._buff_read



[docs]
    def write(self, block: Array):
        """Appends a new block (an array of samples) to the internal deque."""
        other_shape = self._buffer.shape[1:]
        if other_shape == (1,) and block.ndim == 1:
            block = block[:, self.xp.newaxis]

        if block.shape[1:] != other_shape:
            raise ValueError(
                f"Block shape {block.shape[1:]} does not match buffer's other_shape {other_shape}"
            )

        # Most overflow strategies are handled during flush, but there are a couple
        # scenarios that can be evaluated on write to give immediate feedback.
        new_len = self._deque_len + block.shape[0]
        if new_len > self._capacity and self._overflow_strategy == "raise":
            raise OverflowError(
                f"Buffer overflow: {new_len} samples awaiting in deque exceeds buffer capacity {self._capacity}."
            )
        elif new_len * block.dtype.itemsize > self._max_size:
            raise OverflowError(
                f"deque contents would exceed max_size ({self._max_size}) on subsequent flush."
                "Are you reading samples frequently enough?"
            )

        self._deque.append(block)
        self._deque_len += block.shape[0]

        if self._update_strategy == "immediate" or (
            self._update_strategy == "threshold"
            and (0 < self._threshold <= self._deque_len)
        ):
            self.flush()


    def _estimate_overflow(self, n_samples: int) -> int:
        """
        Estimates the number of samples that would overflow we requested n_samples
        from the buffer.
        """
        if n_samples > self.available():
            raise ValueError(
                f"Requested {n_samples} samples, but only {self.available()} are available."
            )
        n_overflow = 0
        if self._deque and (n_samples > self._buff_unread):
            # We would cause a flush, but would that cause an overflow?
            n_free = self._capacity - self._buff_unread
            n_overflow = max(0, self._deque_len - n_free)
        return n_overflow


[docs]
    def read(
        self,
        n_samples: int | None = None,
    ) -> Array:
        """
        Retrieves the oldest unread samples from the buffer with padding
        and advances the read head.

        Args:
            n_samples: The number of samples to retrieve. If None, returns all
                unread samples.

        Returns:
            An array containing the requested samples. This may be a view or a copy.
            Note: The result may have more samples than the buffer.capacity as it
            may include samples from the deque in the output.
        """
        n_samples = n_samples if n_samples is not None else self.available()
        data = None
        offset = 0
        n_overflow = self._estimate_overflow(n_samples)
        if n_overflow > 0:
            first_read = self._buff_unread
            if (n_overflow - first_read) < self.capacity or (
                self._overflow_strategy == "drop"
            ):
                # We can prevent the overflow (or at least *some* if using "drop"
                # strategy) by reading the samples in the buffer first to make room.
                data = self.xp.empty(
                    (n_samples, *self._buffer.shape[1:]), dtype=self._buffer.dtype
                )
                self.peek(first_read, out=data[:first_read])
                offset += first_read
                self.seek(first_read)
                n_samples -= first_read
        if data is None:
            data = self.peek(n_samples)
            self.seek(data.shape[0])
        else:
            d2 = self.peek(n_samples, out=data[offset:])
            self.seek(d2.shape[0])

        return data



[docs]
    def peek(self, n_samples: int | None = None, out: Array | None = None) -> Array:
        """
        Retrieves the oldest unread samples from the buffer with padding without
        advancing the read head.

        Args:
            n_samples: The number of samples to retrieve. If None, returns all
                unread samples.
            out: Optionally, a destination array to store the samples.
                If provided, must have shape (n_samples, *other_shape) where
                other_shape matches the shape of the samples in the buffer.
                If `out` is provided then the data will always be copied into it,
                even if they are contiguous in the buffer.

        Returns:
            An array containing the requested samples. This may be a view or a copy.
            Note: The result may have more samples than the buffer.capacity as it
            may include samples from the deque in the output.
        """
        if n_samples is None:
            n_samples = self.available()
        elif n_samples > self.available():
            raise ValueError(
                f"Requested to peek {n_samples} samples, but only {self.available()} are available."
            )
        if out is not None and out.shape[0] < n_samples:
            raise ValueError(
                f"Output array shape {out.shape} is smaller than requested {n_samples} samples."
            )

        if n_samples == 0:
            return self._buffer[:0]

        self._flush_if_needed(n_samples=n_samples)

        if self._tail + n_samples > self._capacity:
            # discontiguous read (wraps around)
            part1_len = self._capacity - self._tail
            part2_len = n_samples - part1_len
            out = (
                out
                if out is not None
                else self.xp.empty(
                    (n_samples, *self._buffer.shape[1:]), dtype=self._buffer.dtype
                )
            )
            out[:part1_len] = self._buffer[self._tail :]
            out[part1_len:] = self._buffer[:part2_len]
        else:
            if out is not None:
                out[:] = self._buffer[self._tail : self._tail + n_samples]
            else:
                # No output array provided, just return a view
                out = self._buffer[self._tail : self._tail + n_samples]

        return out



[docs]
    def peek_at(self, idx: int, allow_flush: bool = False) -> Array:
        """
        Retrieves a specific sample from the buffer without advancing the read head.

        Args:
            idx: The index of the sample to retrieve, relative to the read head.
            allow_flush: If True, allows flushing the deque to the buffer if the
                requested sample is not in the buffer. If False and the sample is
                in the deque, the sample will be retrieved from the deque (slow!).

        Returns:
            An array containing the requested sample. This may be a view or a copy.
        """
        if idx < 0 or idx >= self.available():
            raise IndexError(f"Index {idx} out of bounds for unread samples.")

        if not allow_flush and idx >= self._buff_unread:
            # The requested sample is in the deque.
            idx -= self._buff_unread
            deq_splits = self.xp.cumsum(
                [0] + [_.shape[0] for _ in self._deque], dtype=int
            )
            arr_idx = self.xp.searchsorted(deq_splits, idx, side="right") - 1
            idx -= deq_splits[arr_idx]
            return self._deque[arr_idx][idx : idx + 1]

        self._flush_if_needed(n_samples=idx + 1)

        # The requested sample is within the unread samples in the buffer.
        idx = (self._tail + idx) % self._capacity
        return self._buffer[idx : idx + 1]



[docs]
    def peek_last(self) -> Array:
        """
        Retrieves the last sample in the buffer without advancing the read head.
        """
        if self._deque:
            return self._deque[-1][-1:]
        elif self._buff_unread > 0:
            idx = (self._head - 1 + self._capacity) % self._capacity
            return self._buffer[idx : idx + 1]
        else:
            raise IndexError("Cannot peek last from an empty buffer.")



[docs]
    def seek(self, n_samples: int) -> int:
        """
        Advances the read head by n_samples.

        Args:
            n_samples: The number of samples to seek.
            Will seek forward if positive or backward if negative.

        Returns:
            The number of samples actually skipped.
        """
        self._flush_if_needed(n_samples=n_samples)

        n_to_seek = max(min(n_samples, self._buff_unread), -self._buff_read)

        if n_to_seek == 0:
            return 0

        self._tail = (self._tail + n_to_seek) % self._capacity
        self._buff_unread -= n_to_seek
        self._buff_read += n_to_seek

        return n_to_seek


    def _flush_if_needed(self, n_samples: int | None = None):
        if (
            self._update_strategy == "on_demand"
            and self._deque
            and (n_samples is None or n_samples > self._buff_unread)
        ):
            self.flush()


[docs]
    def flush(self):
        """
        Transfers all data from the deque to the circular buffer.
        Note: This may overwrite data depending on the overflow strategy,
            which will invalidate previous state variables.
        """
        if not self._deque:
            return

        n_new = self._deque_len
        n_free = self._capacity - self._buff_unread
        n_overflow = max(0, n_new - n_free)

        # If new data is larger than buffer and overflow strategy is "warn-overwrite",
        #  then we can take a shortcut and replace the entire buffer.
        if n_new >= self._capacity and self._overflow_strategy == "warn-overwrite":
            if n_overflow > 0 and (not self._warn_once or not self._warned):
                self._warned = True
                warnings.warn(
                    f"Buffer overflow: {n_new} samples received, but only {self._capacity - self._buff_unread} available. "
                    f"Overwriting {n_overflow} previous samples.",
                    RuntimeWarning,
                )

            # We need to grab the last `self._capacity` samples from the deque
            samples_to_copy = self._capacity
            copied_samples = 0
            for block in reversed(self._deque):
                if copied_samples >= samples_to_copy:
                    break
                n_to_copy = min(block.shape[0], samples_to_copy - copied_samples)
                start_idx = block.shape[0] - n_to_copy
                self._buffer[
                    samples_to_copy - copied_samples - n_to_copy : samples_to_copy
                    - copied_samples
                ] = block[start_idx:]
                copied_samples += n_to_copy

            self._head = 0
            self._tail = 0
            self._buff_unread = self._capacity
            self._buff_read = 0
            self._last_overflow = n_overflow

        else:
            if n_overflow > 0:
                if self._overflow_strategy == "raise":
                    raise OverflowError(
                        f"Buffer overflow: {n_new} samples received, but only {n_free} available."
                    )
                elif self._overflow_strategy == "warn-overwrite":
                    if not self._warn_once or not self._warned:
                        self._warned = True
                        warnings.warn(
                            f"Buffer overflow: {n_new} samples received, but only {n_free} available. "
                            f"Overwriting {n_overflow} previous samples.",
                            RuntimeWarning,
                        )
                    # Move the tail forward to make room for the new data.
                    self.seek(n_overflow)
                    # Adjust the read pointer to account for the overflow. Should always be 0.
                    self._buff_read = max(0, self._buff_read - n_overflow)
                    self._last_overflow = n_overflow
                elif self._overflow_strategy == "drop":
                    # Drop the overflow samples from the deque
                    samples_to_drop = n_overflow
                    while samples_to_drop > 0 and self._deque:
                        block = self._deque[-1]
                        if samples_to_drop >= block.shape[0]:
                            samples_to_drop -= block.shape[0]
                            self._deque.pop()
                        else:
                            block = self._deque.pop()
                            self._deque.append(block[:-samples_to_drop])
                            samples_to_drop = 0
                    n_new -= n_overflow
                    self._last_overflow = n_overflow

                elif self._overflow_strategy == "grow":
                    self._grow_buffer(self._capacity + n_new)
                    self._last_overflow = 0

            # Copy data to buffer by iterating over the deque
            for block in self._deque:
                n_block = block.shape[0]
                space_til_end = self._capacity - self._head
                if n_block > space_til_end:
                    # Two-part copy (wraps around)
                    part1_len = space_til_end
                    part2_len = n_block - part1_len
                    self._buffer[self._head :] = block[:part1_len]
                    self._buffer[:part2_len] = block[part1_len:]
                else:
                    # Single-part copy
                    self._buffer[self._head : self._head + n_block] = block
                self._head = (self._head + n_block) % self._capacity

            self._buff_unread += n_new
            if (self._buff_read > self._tail) or (self._tail > self._head):
                # We have wrapped around the buffer; our count of read samples
                #  is simply the buffer capacity minus the count of unread samples.
                self._buff_read = self._capacity - self._buff_unread

        self._deque.clear()
        self._deque_len = 0


    def _grow_buffer(self, min_capacity: int):
        """
        Grows the buffer to at least min_capacity.
        This is a helper method for the overflow strategy "grow".
        """
        if self._capacity >= min_capacity:
            return

        other_shape = self._buffer.shape[1:]
        max_capacity = self._max_size / (
            self._buffer.dtype.itemsize * math.prod(other_shape)
        )
        if min_capacity > max_capacity:
            raise OverflowError(
                f"Cannot grow buffer to {min_capacity} samples, "
                f"maximum capacity is {max_capacity} samples ({self._max_size} bytes)."
            )

        new_capacity = min(max_capacity, max(self._capacity * 2, min_capacity))
        new_buffer = self.xp.empty(
            (new_capacity, *other_shape), dtype=self._buffer.dtype
        )

        # Copy existing data to new buffer
        total_samples = self._buff_read + self._buff_unread
        if total_samples > 0:
            start_idx = (self._tail - self._buff_read) % self._capacity
            stop_idx = (self._tail + self._buff_unread) % self._capacity
            if stop_idx > start_idx:
                # Data is contiguous
                new_buffer[:total_samples] = self._buffer[start_idx:stop_idx]
            else:
                # Data wraps around. We write it in 2 parts.
                part1_len = self._capacity - start_idx
                part2_len = stop_idx
                new_buffer[:part1_len] = self._buffer[start_idx:]
                new_buffer[part1_len : part1_len + part2_len] = self._buffer[:stop_idx]
            # self._buff_read stays the same
            self._tail = self._buff_read
            # self._buff_unread stays the same
            self._head = self._tail + self._buff_unread
        else:
            self._tail = 0
            self._head = 0

        self._buffer = new_buffer
        self._capacity = new_capacity