import torch
import torch.nn


class MLP(torch.nn.Module):
"""
A simple Multi-Layer Perceptron (MLP) model. Adapted from Ezmsg MLP.
Attributes:
feature_extractor (torch.nn.Sequential): The sequential feature extractor part of the MLP.
heads (torch.nn.ModuleDict): A dictionary of output linear layers for each output head.
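
    Example:
        A minimal sketch of typical usage; the head names and sizes below are
        illustrative assumptions, not defaults::

            >>> model = MLP(input_size=16, hidden_size=[32, 32], output_heads={"a": 3, "b": 5})
            >>> out = model(torch.randn(4, 16))
            >>> sorted(out)
            ['a', 'b']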
"""
    def __init__(
        self,
        input_size: int,
        hidden_size: int | list[int],
        num_layers: int | None = None,
        output_heads: int | dict[str, int] = 2,
        norm_layer: str | None = None,
        activation_layer: str | None = "ReLU",
        inplace: bool | None = None,
        bias: bool = True,
        dropout: float = 0.0,
    ):
"""
Initialize the MLP model.
Args:
input_size (int): The size of the input features.
hidden_size (int | list[int]): The sizes of the hidden layers. If a list, num_layers must be None or the length
of the list. If a single integer, num_layers must be specified and determines the number of hidden layers.
num_layers (int, optional): The number of hidden layers. Length of hidden_size if None. Default is None.
output_heads (int | dict[str, int], optional): Number of output features or classes if single head output or a
dictionary mapping head names to output sizes if multi-head output. Default is 2 (single head).
norm_layer (str, optional): A normalization layer to be applied after each linear layer. Default is None.
Common choices are "BatchNorm1d" or "LayerNorm".
activation_layer (str, optional): An activation function to be applied after each normalization
layer. Default is "ReLU".
inplace (bool, optional): Whether the activation function is performed in-place. Default is None.
bias (bool, optional): Whether to use bias in the linear layers. Default is True.
dropout (float, optional): The dropout rate to be applied after each linear layer. Default is 0.0.
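
        Example:
            The two ways of specifying the hidden layers are equivalent; the sizes
            here are arbitrary, for illustration only::

                >>> a = MLP(input_size=8, hidden_size=64, num_layers=3)
                >>> b = MLP(input_size=8, hidden_size=[64, 64, 64])
                >>> len(a.feature_extractor) == len(b.feature_extractor)
                True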
"""
        super().__init__()
        if isinstance(hidden_size, int):
            if num_layers is None:
                raise ValueError(
                    "If hidden_size is an integer, num_layers must be specified."
                )
            hidden_size = [hidden_size] * num_layers
        if len(hidden_size) == 0:
            raise ValueError("hidden_size must have at least one element")
        if any(not isinstance(x, int) for x in hidden_size):
            raise ValueError("hidden_size must contain only integers")
        if num_layers is not None and len(hidden_size) != num_layers:
            raise ValueError(
                "Length of hidden_size must match num_layers if num_layers is specified."
            )
        # Only forward `inplace` when it is explicitly set, so each layer's
        # own default applies otherwise.
        params = {} if inplace is None else {"inplace": inplace}
        layers = []
        in_dim = input_size

        def _get_layer_class(layer_name: str | None):
            # Resolve a layer class by name from torch.nn; accept both bare names
            # ("ReLU", "LayerNorm") and qualified ones ("torch.nn.ReLU"), so the
            # documented defaults actually resolve to a class.
            if layer_name is None:
                return None
            return getattr(torch.nn, layer_name.rsplit(".", 1)[-1])

        norm_layer_class = _get_layer_class(norm_layer)
        activation_layer_class = _get_layer_class(activation_layer)
        for hidden_dim in hidden_size[:-1]:
            layers.append(torch.nn.Linear(in_dim, hidden_dim, bias=bias))
            if norm_layer_class is not None:
                layers.append(norm_layer_class(hidden_dim))
            if activation_layer_class is not None:
                layers.append(activation_layer_class(**params))
            layers.append(torch.nn.Dropout(dropout, **params))
            in_dim = hidden_dim
        # The last hidden layer is a plain linear projection with no norm,
        # activation, or dropout; the heads consume its output directly.
        layers.append(torch.nn.Linear(in_dim, hidden_size[-1], bias=bias))
        self.feature_extractor = torch.nn.Sequential(*layers)
        if isinstance(output_heads, int):
            # A bare integer means a single head named "output".
            output_heads = {"output": output_heads}
        self.heads = torch.nn.ModuleDict(
            {
                name: torch.nn.Linear(hidden_size[-1], output_size)
                for name, output_size in output_heads.items()
            }
        )

    @classmethod
    def infer_config_from_state_dict(
        cls, state_dict: dict
    ) -> dict[str, int | list[int] | dict[str, int]]:
"""
Infer the configuration from the state dict.
Args:
state_dict: The state dict of the model.
Returns:
dict[str, int | float]: A dictionary containing the inferred configuration.
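
        Example:
            A round-trip sketch; the sizes and the head name "y" are illustrative::

                >>> model = MLP(input_size=8, hidden_size=[16, 4], output_heads={"y": 2})
                >>> cfg = MLP.infer_config_from_state_dict(model.state_dict())
                >>> cfg["input_size"], cfg["hidden_size"], cfg["output_heads"]
                (8, [16, 4], {'y': 2})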
"""
        input_size = state_dict["feature_extractor.0.weight"].shape[1]
        # Linear weights are 2-D (out_features, in_features); checking ndim skips
        # the 1-D weights of any norm layers so they are not counted as layers.
        hidden_size = [
            param.shape[0]
            for key, param in state_dict.items()
            if key.startswith("feature_extractor.")
            and key.endswith(".weight")
            and param.ndim == 2
        ]
        output_heads = {
            key.split(".")[1]: param.shape[0]
            for key, param in state_dict.items()
            if key.startswith("heads.") and key.endswith(".bias")
        }
        return {
            "input_size": input_size,
            "hidden_size": hidden_size,
            "output_heads": output_heads,
        }

    def forward(self, x: torch.Tensor) -> dict[str, torch.Tensor]:
        """
        Forward pass through the MLP.

        Args:
            x (torch.Tensor): Input tensor of shape (batch, seq_len, input_size). The
                linear layers operate on the last dimension, so any leading batch
                dimensions are supported.

        Returns:
            dict[str, torch.Tensor]: A dictionary mapping head names to output tensors.
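
        Example:
            A shape sketch with assumed sizes and a hypothetical head name "cls"::

                >>> model = MLP(input_size=4, hidden_size=[8], output_heads={"cls": 3})
                >>> y = model(torch.randn(2, 10, 4))
                >>> y["cls"].shape
                torch.Size([2, 10, 3])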
"""
x = self.feature_extractor(x)
return {name: head(x) for name, head in self.heads.items()}