Skip to content

src.jormungandr.embedder

Spatial and temporal positional embeddings for DETR-style models.

All embedders implement the Embedder protocol, exposing a forward(shape, device, dtype, mask) interface so they can be swapped without changing the calling code.

Classes:

DetrLearnedPositionEmbedding

DetrLearnedPositionEmbedding(embedding_dim=256)

Bases: Module, Embedder

This module learns positional embeddings up to a fixed maximum size.

Source code in src/jormungandr/embedder.py
110
111
112
113
def __init__(self, embedding_dim=256):
    """Create the learned row and column embedding tables.

    Args:
        embedding_dim: Width of every learned embedding vector.
    """
    super().__init__()
    # Each table has a fixed number of entries; this is the "fixed maximum
    # size" the embedding can cover along each axis.
    table_size = 50
    # The row table is built before the column table so that parameter
    # initialisation consumes the random generator in the same order.
    self.row_embeddings = nn.Embedding(
        num_embeddings=table_size, embedding_dim=embedding_dim
    )
    self.column_embeddings = nn.Embedding(
        num_embeddings=table_size, embedding_dim=embedding_dim
    )

DetrSinePositionEmbedding

DetrSinePositionEmbedding(num_position_features: int = 128, temperature: int = 10000, normalize: bool = True, scale: float | None = None)

Bases: Module, Embedder

This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images.

Methods:

Source code in src/jormungandr/embedder.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def __init__(
    self,
    num_position_features: int = 128,
    temperature: int = 10000,
    normalize: bool = True,
    scale: float | None = None,
):
    """Store the configuration of the sine position embedding.

    Args:
        num_position_features: Number of sine/cosine features produced per
            spatial axis.
        temperature: Base of the exponential used to derive the per-feature
            divisors.
        normalize: Whether coordinates are rescaled by ``scale`` before the
            sine/cosine features are computed.
        scale: Multiplier applied to normalized coordinates. Falls back to
            ``2 * math.pi`` when ``None``.

    Raises:
        ValueError: If ``scale`` is given while ``normalize`` is ``False``.
    """
    super().__init__()

    scale_given = scale is not None
    # A custom scale only has an effect on normalized coordinates, so the
    # combination "scale given, normalize off" is rejected up front.
    if scale_given and normalize is False:
        raise ValueError("normalize should be True if scale is passed")

    self.num_position_features = num_position_features
    self.temperature = temperature
    self.normalize = normalize
    if scale_given:
        self.scale = scale
    else:
        self.scale = 2 * math.pi

forward

forward(shape: Size, device: device | str, dtype: dtype, mask: Tensor | None = None) -> torch.Tensor

Parameters:

  • shape

    (Size) –

    The shape of the feature maps for which to compute the position embedding, expected to be (batch_size, channels, height, width)

  • device

    (device | str) –

    The device on which to create the position embedding

  • dtype

    (dtype) –

    The dtype of the position embedding

  • mask

    (Tensor | None, default: None ) –

    An optional mask tensor of shape (batch_size, height, width) where True values indicate masked positions. If None, no positions are masked.

Returns: A position embedding tensor of shape (batch_size, sequence_length, hidden_size) where sequence_length is height * width and hidden_size is num_position_features * 2 (for sine and cosine components)

Source code in src/jormungandr/embedder.py
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def forward(
    self,
    shape: torch.Size,
    device: torch.device | str,
    dtype: torch.dtype,
    mask: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Args:
        shape: The shape of the feature maps for which to compute the position embedding, expected to be (batch_size, channels, height, width)
        device: The device on which to create the position embedding
        dtype: The dtype of the position embedding
        mask: An optional mask tensor of shape (batch_size, height, width) where True values indicate masked positions. If None, no positions are masked.
    Returns:
        A position embedding tensor of shape (batch_size, sequence_length, hidden_size) where sequence_length is height * width and hidden_size is num_position_features * 2 (for sine and cosine components)
    """
    if mask is None:
        mask = torch.zeros(
            (shape[0], shape[2], shape[3]), device=device, dtype=torch.bool
        )
    y_embed = mask.cumsum(1, dtype=dtype)
    x_embed = mask.cumsum(2, dtype=dtype)
    if self.normalize:
        eps = 1e-6
        y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
        x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

    dim_t = torch.arange(
        self.num_position_features, dtype=torch.int64, device=device
    ).to(dtype)
    dim_t = self.temperature ** (
        2 * torch.div(dim_t, 2, rounding_mode="floor") / self.num_position_features
    )

    pos_x = x_embed[:, :, :, None] / dim_t
    pos_y = y_embed[:, :, :, None] / dim_t
    pos_x = torch.stack(
        (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
    ).flatten(3)
    pos_y = torch.stack(
        (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
    ).flatten(3)
    pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
    # Flatten spatial dimensions and permute to (batch_size, sequence_length, hidden_size) format
    # expected by the encoder
    pos = pos.flatten(2).permute(0, 2, 1)
    return pos

TemporalSinePositionEmbedding

TemporalSinePositionEmbedding(num_position_features: int = 128, temperature: int = 10000, normalize: bool = True, scale: float | None = None)

Bases: Module, Embedder

Methods:

  • forward

    Generate temporal sine position embeddings.

Source code in src/jormungandr/embedder.py
145
146
147
148
149
150
151
152
153
154
155
156
def __init__(
    self,
    num_position_features: int = 128,
    temperature: int = 10000,
    normalize: bool = True,
    scale: float | None = None,
):
    """Store the configuration of the temporal sine position embedding.

    Args:
        num_position_features: Number of frequencies; the embedding holds
            one sine and one cosine value per frequency.
        temperature: Base of the exponential used to derive the
            per-frequency divisors.
        normalize: Stored on the instance as given.
        scale: Stored on the instance as given; no default is substituted
            when it is ``None``.
    """
    super().__init__()

    # Frequency schedule settings.
    self.num_position_features = num_position_features
    self.temperature = temperature

    # Kept unchanged so the constructor mirrors the spatial sine embedder's
    # signature.
    self.normalize = normalize
    self.scale = scale

forward

forward(shape: Size, device: device | str, dtype: dtype, delta_t: float = 1.0) -> torch.Tensor

Generate temporal sine position embeddings. Args: shape: The shape of the input tensor for which to compute the position embedding, expected to be (n_frames, sequence_length, model_dimension); the number of frames in the temporal sequence is read from shape[0]. device: The device on which to create the position embedding dtype: The dtype of the position embedding delta_t: The time interval between frames, multiplied with the frame index before the sine and cosine values are computed. Returns: A position embedding tensor of shape (n_frames * sequence_length, num_position_features * 2), with rows ordered frame by frame, where num_position_features is the number of sine/cosine pairs for each temporal position. Sine and cosine values are interleaved along the feature axis: even feature indices hold sine values and odd feature indices hold the matching cosine values, as in the formulas below.

PE(n_f, 2i) = sin(n_f * delta_t / (10000^(2i/d_model))) PE(n_f, 2i+1) = cos(n_f * delta_t / (10000^(2i/d_model)))

Source code in src/jormungandr/embedder.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
def forward(
    self,
    shape: torch.Size,
    device: torch.device | str,
    dtype: torch.dtype,
    delta_t: float = 1.0,
) -> torch.Tensor:
    """
    Generate temporal sine position embeddings.
    Args:
        shape: The shape of the input tensor for which to compute the position embedding, expected to be (n_frames, sequence_length, model_dimension)
        device: The device on which to create the position embedding
        dtype: The dtype of the position embedding
        delta_t: The time interval between frames, used to compute the sine and cosine values.
        n_frames: The number of frames in the temporal sequence for which to compute the position embeddings.
    Returns:
    A position embedding tensor of shape (sequence_length * n_frames, num_position_features * 2) where num_position_features is the number of sine and cosine features for each temporal position. The first half of the features correspond to sine values and the second half correspond to cosine values.

    PE(n_f, 2i) = sin(n_f * delta_t / (10000^(2i/d_model)))
    PE(n_f, 2i+1) = cos(n_f * delta_t / (10000^(2i/d_model)))
    """

    n_frames, sequence_length, model_dimension = shape

    # Create frame indices tensor of shape (n_frames,)
    frame_indices = torch.arange(n_frames, device=device, dtype=dtype)
    dim_t = torch.arange(
        self.num_position_features, dtype=torch.int64, device=device
    ).to(dtype)

    dim_t = self.temperature ** (
        2
        * dim_t
        / (
            self.num_position_features * 2
        )  # torch.div(dim_t, 2, rounding_mode="floor")
    )

    frame_indices = frame_indices[:, None]  # Shape (n_frames, 1)
    dim_t = dim_t[None, :]  # Shape (1, num_position_features

    angles = (
        frame_indices * delta_t / dim_t
    )  # Shape (n_frames, num_position_features)

    pos = torch.stack(
        (angles.sin(), angles.cos()), dim=-1
    )  # Shape (n_frames, num_position_features, 2)
    pos = pos.flatten(-2)  # Shape (n_frames, num_position_features * 2)

    pos = pos.unsqueeze(1)  # Add sequence length dimension
    pos = pos.repeat(
        1, sequence_length, 1
    )  # Repeat for each position in the sequence
    pos = pos.flatten(
        0, 1
    )  # Flatten to (n_frames * sequence_length, num_position_features * 2)

    return pos