Source code for tsl.nn.layers.base.attention

import math
from typing import Optional, Union

import torch
from einops import rearrange
from torch import Tensor, nn
from torch_geometric.nn.dense import Linear
from torch_geometric.typing import OptTensor

from tsl.nn.utils import get_functional_activation


class PositionalEncoding(nn.Module):
    """The positional encoding from the paper `"Attention Is All You Need"
    <https://arxiv.org/abs/1706.03762>`_ (Vaswani et al., NeurIPS 2017)."""

    def __init__(self,
                 d_model: int,
                 dropout: float = 0.,
                 max_len: int = 5000,
                 affinity: bool = False,
                 batch_first=True):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        if affinity:
            self.affinity = nn.Linear(d_model, d_model)
        else:
            self.affinity = None
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() *
            (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
        self.batch_first = batch_first

    def forward(self, x: Tensor):
        """"""
        if self.affinity is not None:
            x = self.affinity(x)
        if self.batch_first:
            pe = self.pe[:x.size(1), :]
        else:
            pe = self.pe[:x.size(0), :]
        x = x + pe
        return self.dropout(x)
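A minimal usage sketch (the tensor shapes below are illustrative assumptions, not from the module): with ``batch_first=True`` the encoding is indexed by the time axis and broadcast over batches and nodes when the input is ``[batch, time, nodes, features]``.

# Hypothetical usage: sinusoidal encoding added over the time dimension.
pos = PositionalEncoding(d_model=32, dropout=0.1)
x = torch.randn(8, 24, 10, 32)  # [batch, time, nodes, features]
out = pos(x)                    # same shape, encoding broadcast over nodes
assert out.shape == (8, 24, 10, 32)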

@torch.jit.script
def _get_causal_mask(seq_len: int,
                     diagonal: int = 0,
                     device: Optional[torch.device] = None):
    # mask keeping only previous steps
    ones = torch.ones((seq_len, seq_len), dtype=torch.bool, device=device)
    causal_mask = torch.triu(ones, diagonal)
    return causal_mask


class AttentionEncoder(nn.Module):

    def __init__(self,
                 embed_dim: int,
                 qdim: Optional[int] = None,
                 kdim: Optional[int] = None,
                 vdim: Optional[int] = None,
                 add_positional_encoding: bool = False,
                 bias: bool = True,
                 activation: Optional[str] = None) -> None:
        super(AttentionEncoder, self).__init__()
        self.embed_dim = embed_dim
        self.qdim = qdim
        self.kdim = kdim
        self.vdim = vdim

        self.lin_query = Linear(qdim, self.embed_dim, bias) \
            if qdim is not None else nn.Identity()
        self.lin_key = Linear(kdim, self.embed_dim, bias) \
            if kdim is not None else nn.Identity()
        self.lin_value = Linear(vdim, self.embed_dim, bias) \
            if vdim is not None else nn.Identity()

        self.activation = get_functional_activation(activation)

        self.pe = PositionalEncoding(self.embed_dim) \
            if add_positional_encoding else nn.Identity()

    def forward(self,
                query: Tensor,
                key: OptTensor = None,
                value: OptTensor = None):
        """"""
        # inputs: [batches, time, nodes, channels]
        if key is None:
            key = query
        if value is None:
            value = key
        query = self.pe(self.activation(self.lin_query(query)))
        key = self.pe(self.activation(self.lin_key(key)))
        value = self.activation(self.lin_value(value))
        return query, key, value
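A hedged sketch of how the encoder projects queries, keys, and values of different widths onto a shared ``embed_dim`` (the dimensions below are illustrative assumptions):

# Hypothetical usage: project q/k/v of different widths to embed_dim=32.
enc = AttentionEncoder(embed_dim=32, qdim=16, kdim=8, vdim=8)
q = torch.randn(4, 12, 5, 16)   # [batch, time, nodes, qdim]
k = torch.randn(4, 12, 5, 8)    # [batch, time, nodes, kdim]
q, k, v = enc(q, k)             # value defaults to the key tensor
assert q.size(-1) == k.size(-1) == v.size(-1) == 32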

class MultiHeadAttention(nn.MultiheadAttention):
    """The multi-head attention from the paper `"Attention Is All You Need"
    <https://arxiv.org/abs/1706.03762>`_ (Vaswani et al., NeurIPS 2017) for
    spatiotemporal data.

    Args:
        embed_dim (int): Size of the hidden dimension associated with each
            node at each time step.
        heads (int): Number of parallel attention heads.
        qdim (int, optional): Size of the query dimension. If :obj:`None`,
            then defaults to :attr:`embed_dim`.
            (default: :obj:`None`)
        kdim (int, optional): Size of the key dimension. If :obj:`None`,
            then defaults to :attr:`embed_dim`.
            (default: :obj:`None`)
        vdim (int, optional): Size of the value dimension. If :obj:`None`,
            then defaults to :attr:`embed_dim`.
            (default: :obj:`None`)
        axis (str): Dimension on which to apply attention to update the
            representations. Can be either ``'time'`` or ``'nodes'``.
            (default: :obj:`'time'`)
        add_bias_kv (bool): If :obj:`True`, then adds bias to the key and
            value sequences.
            (default: :obj:`False`)
        add_zero_attn (bool): If :obj:`True`, then adds a new batch of zeros
            to the key and value sequences.
            (default: :obj:`False`)
        causal (bool): If :obj:`True`, then causally mask attention scores in
            temporal attention (has an effect only if :attr:`axis` is
            :obj:`'time'`).
            (default: :obj:`False`)
        dropout (float): Dropout probability.
            (default: :obj:`0.`)
        bias (bool): Whether to add a learnable bias.
            (default: :obj:`True`)
        device (optional): Device on which to store the model.
            (default: :obj:`None`)
        dtype (optional): Data type of the parameters.
            (default: :obj:`None`)
    """

    def __init__(self,
                 embed_dim: int,
                 heads: int,
                 qdim: Optional[int] = None,
                 kdim: Optional[int] = None,
                 vdim: Optional[int] = None,
                 axis: Union[str, int] = 'time',
                 add_bias_kv: bool = False,
                 add_zero_attn: bool = False,
                 causal: bool = False,
                 dropout: float = 0.,
                 bias: bool = True,
                 device=None,
                 dtype=None) -> None:
        if axis in ['time', 0]:
            shape = 't (b n) f'
        elif axis in ['nodes', 1]:
            if causal:
                raise ValueError(
                    'Cannot use causal attention for axis "nodes"')
            shape = 'n (b t) f'
        else:
            raise ValueError("Axis can either be 'time' (0) or 'nodes' (1), "
                             f"not '{axis}'.")
        self._in_pattern = f'b t n f -> {shape}'
        self._out_pattern = f'{shape} -> b t n f'
        self.causal = causal
        # Impose batch dimension as the second one
        super(MultiHeadAttention, self).__init__(embed_dim,
                                                 heads,
                                                 dropout=dropout,
                                                 bias=bias,
                                                 add_bias_kv=add_bias_kv,
                                                 add_zero_attn=add_zero_attn,
                                                 kdim=kdim,
                                                 vdim=vdim,
                                                 batch_first=False,
                                                 device=device,
                                                 dtype=dtype)
        # change projections
        if qdim is not None and qdim != embed_dim:
            self.qdim = qdim
            self.q_proj = Linear(self.qdim, embed_dim)
        else:
            self.qdim = embed_dim
            self.q_proj = nn.Identity()

    def forward(self,
                query: Tensor,
                key: OptTensor = None,
                value: OptTensor = None,
                key_padding_mask: OptTensor = None,
                need_weights: bool = True,
                attn_mask: OptTensor = None):
        """"""
        # inputs: [batches, time, nodes, features] -> [t (b n) f]
        if key is None:
            key = query
        if value is None:
            value = key
        batch = value.shape[0]
        query, key, value = [
            rearrange(x, self._in_pattern) for x in (query, key, value)
        ]
        if self.causal:
            causal_mask = _get_causal_mask(key.size(0),
                                           diagonal=1,
                                           device=query.device)
            if attn_mask is None:
                attn_mask = causal_mask
            else:
                attn_mask = torch.logical_and(attn_mask, causal_mask)
        attn_output, attn_weights = super(MultiHeadAttention, self).forward(
            self.q_proj(query), key, value, key_padding_mask, need_weights,
            attn_mask)
        attn_output = rearrange(attn_output, self._out_pattern, b=batch) \
            .contiguous()
        if attn_weights is not None:
            attn_weights = rearrange(attn_weights, '(b d) l m -> b d l m',
                                     b=batch).contiguous()
        return attn_output, attn_weights
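A minimal self-attention sketch along the time axis with causal masking (the shapes below are illustrative assumptions):

# Hypothetical usage: causal self-attention over the time dimension.
mha = MultiHeadAttention(embed_dim=32, heads=4, axis='time', causal=True)
x = torch.randn(8, 24, 10, 32)   # [batch, time, nodes, features]
out, weights = mha(x)            # key and value default to the query
assert out.shape == (8, 24, 10, 32)
# With the default head averaging, weights are rearranged back per batch
# element, here roughly [batch, nodes, time, time].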

class TemporalSelfAttention(nn.Module):
    """Temporal Self Attention layer.

    Args:
        embed_dim (int): Size of the hidden dimension associated with each
            node at each time step.
        num_heads (int): Number of parallel attention heads.
        in_channels (int, optional): Size of the input features. If not
            :obj:`None`, the input is first projected to :attr:`embed_dim`
            features with a linear layer.
            (default: :obj:`None`)
        dropout (float): Dropout probability.
        bias (bool, optional): Whether to add a learnable bias.
        device (optional): Device on which to store the model.
        dtype (optional): Data type of the parameters.

    Examples::

        >>> import torch
        >>> m = TemporalSelfAttention(32, 4, -1)
        >>> input = torch.randn(128, 24, 10, 20)
        >>> output, _ = m(input)
        >>> print(output.size())
        torch.Size([128, 24, 10, 32])
    """

    def __init__(self,
                 embed_dim,
                 num_heads,
                 in_channels=None,
                 dropout=0.,
                 bias=True,
                 device=None,
                 dtype=None) -> None:
        super(TemporalSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        if in_channels is not None:
            self.input_encoder = Linear(in_channels, self.embed_dim)
        else:
            self.input_encoder = nn.Identity()
        self.attention = MultiHeadAttention(embed_dim,
                                            num_heads,
                                            axis='time',
                                            dropout=dropout,
                                            bias=bias,
                                            device=device,
                                            dtype=dtype)

    def forward(self,
                x,
                attn_mask: Optional[Tensor] = None,
                key_padding_mask: Optional[Tensor] = None,
                need_weights: bool = True):
        """"""
        # x: [batch, time, nodes, in_channels]
        x = self.input_encoder(x)  # -> [batch, time, nodes, embed_dim]
        return self.attention(x,
                              attn_mask=attn_mask,
                              key_padding_mask=key_padding_mask,
                              need_weights=need_weights)

class SpatialSelfAttention(nn.Module):
    """Spatial Self Attention layer.

    Args:
        embed_dim (int): Size of the hidden dimension associated with each
            node at each time step.
        num_heads (int): Number of parallel attention heads.
        in_channels (int, optional): Size of the input features. If not
            :obj:`None`, the input is first projected to :attr:`embed_dim`
            features with a linear layer.
            (default: :obj:`None`)
        dropout (float): Dropout probability.
        bias (bool, optional): Whether to add a learnable bias.
        device (optional): Device on which to store the model.
        dtype (optional): Data type of the parameters.

    Examples::

        >>> import torch
        >>> m = SpatialSelfAttention(32, 4, -1)
        >>> input = torch.randn(128, 24, 10, 20)
        >>> output, _ = m(input)
        >>> print(output.size())
        torch.Size([128, 24, 10, 32])
    """

    def __init__(self,
                 embed_dim,
                 num_heads,
                 in_channels=None,
                 dropout=0.,
                 bias=True,
                 device=None,
                 dtype=None) -> None:
        super(SpatialSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        if in_channels is not None:
            self.input_encoder = Linear(in_channels, self.embed_dim)
        else:
            self.input_encoder = nn.Identity()
        self.attention = MultiHeadAttention(embed_dim,
                                            num_heads,
                                            axis='nodes',
                                            dropout=dropout,
                                            bias=bias,
                                            device=device,
                                            dtype=dtype)

    def forward(self,
                x,
                attn_mask: Optional[Tensor] = None,
                key_padding_mask: Optional[Tensor] = None,
                need_weights: bool = True):
        """"""
        # x: [batch, time, nodes, in_channels]
        x = self.input_encoder(x)  # -> [batch, time, nodes, embed_dim]
        return self.attention(x,
                              attn_mask=attn_mask,
                              key_padding_mask=key_padding_mask,
                              need_weights=need_weights)
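Not part of the module, but a common pattern is to interleave the two layers so that each node first attends over its own history and then over the other nodes at each step; a hedged sketch, with shapes and hyperparameters chosen only for illustration:

# Hypothetical composition of temporal and spatial self-attention.
temporal = TemporalSelfAttention(32, 4, in_channels=20)  # encode 20 -> 32
spatial = SpatialSelfAttention(32, 4)                    # already 32 features
x = torch.randn(128, 24, 10, 20)    # [batch, time, nodes, channels]
h, _ = temporal(x)                  # attention over the time axis
h, _ = spatial(h)                   # attention over the node axis
assert h.shape == (128, 24, 10, 32)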