Skip to content

mlx3d.cameras

mlx3d.cameras

Camera dataclass

A single pinhole camera.

Attributes:

Name Type Description
R array

(3, 3) world-to-camera rotation.

t array

(3,) world-to-camera translation.

fx, fy

focal lengths in pixels.

cx, cy

principal point in pixels.

width, height

image size in pixels.

znear, zfar

clipping range used by renderers.

orthographic bool

if True, use an orthographic projection (parallel rays, no perspective divide). fx/fy then act as pixels-per-world-unit instead of focal lengths.

distortion tuple[float, ...] | None

optional lens distortion coefficients. Brown-Conrady (k1, k2, p1, p2[, k3]) by default, or OpenCV fisheye (k1, k2, k3, k4) when fisheye=True. Applied in project_points() and inverted in generate_rays() / unproject_points() (perspective cameras only; the orthographic branches skip distortion).

fisheye bool

select the equidistant fisheye distortion model.

Source code in src/mlx3d/cameras/cameras.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
@dataclass
class Camera:
    """One camera: pose, intrinsics, image size and optional lens distortion.

    The projection is a pinhole (perspective) model unless ``orthographic``
    is set.

    Attributes:
        R: (3, 3) world-to-camera rotation.
        t: (3,) world-to-camera translation.
        fx, fy: focal lengths in pixels.
        cx, cy: principal point in pixels.
        width, height: image size in pixels.
        znear, zfar: clipping range used by renderers.
        orthographic: if ``True``, use an orthographic projection (parallel
            rays, no perspective divide). ``fx``/``fy`` then act as
            pixels-per-world-unit instead of focal lengths.
        distortion: optional lens distortion coefficients. Brown-Conrady
            ``(k1, k2, p1, p2[, k3])`` by default, or OpenCV fisheye
            ``(k1, k2, k3, k4)`` when ``fisheye=True``. Applied in
            :meth:`project_points` and inverted in :meth:`generate_rays` /
            :meth:`unproject_points`; the orthographic code paths skip it.
        fisheye: select the equidistant fisheye distortion model.
    """

    R: mx.array
    t: mx.array
    fx: float
    fy: float
    cx: float
    cy: float
    width: int
    height: int
    znear: float = 0.01
    zfar: float = 100.0
    orthographic: bool = False
    distortion: tuple[float, ...] | None = None
    fisheye: bool = False

    @classmethod
    def orthographic_camera(
        cls,
        scale: float,
        width: int,
        height: int,
        R: mx.array | None = None,
        t: mx.array | None = None,
        **kwargs,
    ) -> "Camera":
        """Build an orthographic camera from a view-volume half-height.

        ``scale`` is given in world units: camera-space ``y`` in
        ``[-scale, scale]`` fills the image height, and the horizontal extent
        follows from the aspect ratio. ``R``/``t`` default to the identity
        pose; remaining keyword arguments are forwarded to the constructor.
        """
        half_height_px = height / 2.0
        pixels_per_unit = half_height_px / float(scale)
        rotation = mx.eye(3) if R is None else R
        translation = mx.zeros((3,)) if t is None else t
        return cls(
            R=rotation,
            t=translation,
            fx=pixels_per_unit,
            fy=pixels_per_unit,
            cx=width / 2.0,
            cy=half_height_px,
            width=width,
            height=height,
            orthographic=True,
            **kwargs,
        )

    @classmethod
    def from_fov(
        cls,
        fov: float,
        width: int,
        height: int,
        R: mx.array | None = None,
        t: mx.array | None = None,
        degrees: bool = True,
        **kwargs,
    ) -> "Camera":
        """Build a camera from its vertical field of view.

        ``fov`` is interpreted in degrees unless ``degrees=False``. The same
        focal length is used for both axes, so the horizontal FoV follows from
        the aspect ratio. The principal point is placed at the image center and
        ``R``/``t`` default to the identity pose.
        """
        fov_rad = math.radians(fov) if degrees else fov
        focal = fov_to_focal(fov_rad, height)
        rotation = mx.eye(3) if R is None else R
        translation = mx.zeros((3,)) if t is None else t
        return cls(
            R=rotation,
            t=translation,
            fx=focal,
            fy=focal,
            cx=width / 2.0,
            cy=height / 2.0,
            width=width,
            height=height,
            **kwargs,
        )

    @classmethod
    def look_at(
        cls,
        eye,
        at=(0.0, 0.0, 0.0),
        up=(0.0, 1.0, 0.0),
        fov: float = 60.0,
        width: int = 512,
        height: int = 512,
        degrees: bool = True,
        **kwargs,
    ) -> "Camera":
        """Build a camera positioned at ``eye`` and aimed at ``at``.

        The pose comes from the module-level ``look_at`` helper and the
        intrinsics from :meth:`from_fov`.
        """
        rotation, translation = look_at(mx.array(eye), mx.array(at), mx.array(up))
        return cls.from_fov(
            fov, width, height, R=rotation, t=translation, degrees=degrees, **kwargs
        )

    @property
    def K(self) -> mx.array:
        """(3, 3) intrinsic matrix."""
        rows = [
            [self.fx, 0.0, self.cx],
            [0.0, self.fy, self.cy],
            [0.0, 0.0, 1.0],
        ]
        return mx.array(rows)

    @property
    def fov_x(self) -> float:
        """Horizontal field of view in radians, derived from ``fx`` and ``width``."""
        return focal_to_fov(self.fx, self.width)

    @property
    def fov_y(self) -> float:
        """Vertical field of view in radians, derived from ``fy`` and ``height``."""
        return focal_to_fov(self.fy, self.height)

    @property
    def camera_center(self) -> mx.array:
        """(3,) camera position in world coordinates (``-R^T t``)."""
        rotated = self.R.T @ self.t
        return -rotated

    @property
    def world_to_camera_matrix(self) -> mx.array:
        """(4, 4) homogeneous world-to-camera matrix ``[[R, t], [0, 0, 0, 1]]``."""
        upper = mx.concatenate([self.R, self.t[:, None]], axis=1)
        last_row = mx.array([[0.0, 0.0, 0.0, 1.0]])
        return mx.concatenate([upper, last_row], axis=0)

    def world_to_camera(self, points: mx.array) -> mx.array:
        """Map world points ``(..., 3)`` into the camera frame."""
        # Row-vector form of R @ p + t.
        rotated = points @ self.R.T
        return rotated + self.t

    def camera_to_world(self, points: mx.array) -> mx.array:
        """Map camera-frame points ``(..., 3)`` back to world coordinates."""
        # Row-vector form of R^T @ (p - t).
        centered = points - self.t
        return centered @ self.R

    def project_points(self, points: mx.array, eps: float = 1e-8) -> tuple[mx.array, mx.array]:
        """Project world points ``(..., 3)`` to pixel coordinates.

        Args:
            points: world-space points.
            eps: depths with ``|z| < eps`` are replaced by ``eps`` before the
                perspective divide (perspective cameras only).

        Returns:
            ``(xy, depth)`` where ``xy`` is ``(..., 2)`` pixel coordinates and
            ``depth`` is ``(...,)`` z-depth in the camera frame. Points behind
            the camera have negative depth; callers should mask on it.
        """
        cam_pts = self.world_to_camera(points)
        px, py, depth = cam_pts[..., 0], cam_pts[..., 1], cam_pts[..., 2]
        if not self.orthographic:
            # Perspective divide with a guarded denominator, then lens distortion
            # on the normalized coordinates.
            safe_depth = mx.where(mx.abs(depth) < eps, mx.full(depth.shape, eps), depth)
            recip = 1.0 / safe_depth
            px, py = self._distort(px * recip, py * recip)
        u = self.fx * px + self.cx
        v = self.fy * py + self.cy
        return mx.stack([u, v], axis=-1), depth

    def _distort(self, x: mx.array, y: mx.array) -> tuple[mx.array, mx.array]:
        """Apply lens distortion to normalized image coords (identity if none)."""
        coeffs = self.distortion
        if coeffs is None:
            return x, y
        model = _fisheye_distort if self.fisheye else _brown_distort
        return model(x, y, coeffs)

    def _undistort(self, x: mx.array, y: mx.array) -> tuple[mx.array, mx.array]:
        """Invert lens distortion on normalized image coords (identity if none)."""
        coeffs = self.distortion
        if coeffs is None:
            return x, y
        inverse_model = _fisheye_undistort if self.fisheye else _brown_undistort
        return inverse_model(x, y, coeffs)

    def unproject_points(self, xy: mx.array, depth: mx.array) -> mx.array:
        """Lift pixel coordinates ``(..., 2)`` with z-depths ``(...,)`` back to world points."""
        # Pixel -> normalized image coordinates (shared by both projections).
        nx = (xy[..., 0] - self.cx) / self.fx
        ny = (xy[..., 1] - self.cy) / self.fy
        if self.orthographic:
            cam_pts = mx.stack([nx, ny, depth], axis=-1)
        else:
            nx, ny = self._undistort(nx, ny)
            cam_pts = mx.stack([nx * depth, ny * depth, depth], axis=-1)
        return self.camera_to_world(cam_pts)

    def generate_rays(self) -> tuple[mx.array, mx.array]:
        """Generate one ray per pixel (at pixel centers).

        Returns:
            ``(origins, directions)``, both ``(height, width, 3)`` in world
            coordinates. Directions are normalized.
        """
        grid = (self.height, self.width)
        cols = mx.arange(self.width, dtype=mx.float32) + 0.5
        rows = mx.arange(self.height, dtype=mx.float32) + 0.5
        uu = mx.broadcast_to(cols[None, :], grid)
        vv = mx.broadcast_to(rows[:, None], grid)
        xn = (uu - self.cx) / self.fx
        yn = (vv - self.cy) / self.fy

        if not self.orthographic:
            # Pixels carry lens distortion; the rays through them must not.
            xn, yn = self._undistort(xn, yn)
            cam_dirs = mx.stack([xn, yn, mx.ones_like(uu)], axis=-1)
            # Row vectors times R apply R^T to each direction (camera -> world).
            world_dirs = cam_dirs @ self.R
            world_dirs = world_dirs / mx.linalg.norm(world_dirs, axis=-1, keepdims=True)
            origins = mx.broadcast_to(self.camera_center, world_dirs.shape)
            return origins, world_dirs

        # Orthographic: every ray shares the forward axis; origins are spread
        # over the image plane (z = 0 in camera space).
        plane_pts = mx.stack([xn, yn, mx.zeros_like(uu)], axis=-1)
        origins = self.camera_to_world(plane_pts)
        forward = mx.array([0.0, 0.0, 1.0]) @ self.R
        forward = forward / mx.linalg.norm(forward)
        return origins, mx.broadcast_to(forward, origins.shape)

K property

(3, 3) intrinsic matrix.

camera_center property

(3,) camera position in world coordinates.

world_to_camera_matrix property

(4, 4) homogeneous world-to-camera matrix.

orthographic_camera(scale, width, height, R=None, t=None, **kwargs) classmethod

Create an orthographic camera.

scale is the world-units half-height of the view volume: the visible region spans [-scale, scale] vertically in camera space, mapped to the image height (the width follows from the aspect ratio).

Source code in src/mlx3d/cameras/cameras.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
@classmethod
def orthographic_camera(
    cls,
    scale: float,
    width: int,
    height: int,
    R: mx.array | None = None,
    t: mx.array | None = None,
    **kwargs,
) -> "Camera":
    """Build an orthographic camera from a view-volume half-height.

    ``scale`` is given in world units: camera-space ``y`` in
    ``[-scale, scale]`` fills the image height, and the horizontal extent
    follows from the aspect ratio. ``R``/``t`` default to the identity pose;
    remaining keyword arguments are forwarded to the constructor.
    """
    half_height_px = height / 2.0
    pixels_per_unit = half_height_px / float(scale)
    rotation = mx.eye(3) if R is None else R
    translation = mx.zeros((3,)) if t is None else t
    return cls(
        R=rotation,
        t=translation,
        fx=pixels_per_unit,
        fy=pixels_per_unit,
        cx=width / 2.0,
        cy=half_height_px,
        width=width,
        height=height,
        orthographic=True,
        **kwargs,
    )

from_fov(fov, width, height, R=None, t=None, degrees=True, **kwargs) classmethod

Create a camera from a vertical field of view (the horizontal FoV follows from the aspect ratio).

Source code in src/mlx3d/cameras/cameras.py
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
@classmethod
def from_fov(
    cls,
    fov: float,
    width: int,
    height: int,
    R: mx.array | None = None,
    t: mx.array | None = None,
    degrees: bool = True,
    **kwargs,
) -> "Camera":
    """Build a camera from its vertical field of view.

    ``fov`` is interpreted in degrees unless ``degrees=False``. The same focal
    length is used for both axes, so the horizontal FoV follows from the
    aspect ratio. The principal point is placed at the image center and
    ``R``/``t`` default to the identity pose.
    """
    fov_rad = math.radians(fov) if degrees else fov
    focal = fov_to_focal(fov_rad, height)
    rotation = mx.eye(3) if R is None else R
    translation = mx.zeros((3,)) if t is None else t
    return cls(
        R=rotation,
        t=translation,
        fx=focal,
        fy=focal,
        cx=width / 2.0,
        cy=height / 2.0,
        width=width,
        height=height,
        **kwargs,
    )

look_at(eye, at=(0.0, 0.0, 0.0), up=(0.0, 1.0, 0.0), fov=60.0, width=512, height=512, degrees=True, **kwargs) classmethod

Create a camera at eye looking at at.

Source code in src/mlx3d/cameras/cameras.py
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
@classmethod
def look_at(
    cls,
    eye,
    at=(0.0, 0.0, 0.0),
    up=(0.0, 1.0, 0.0),
    fov: float = 60.0,
    width: int = 512,
    height: int = 512,
    degrees: bool = True,
    **kwargs,
) -> "Camera":
    """Build a camera positioned at ``eye`` and aimed at ``at``.

    The pose comes from the module-level ``look_at`` helper and the intrinsics
    from ``from_fov``.
    """
    rotation, translation = look_at(mx.array(eye), mx.array(at), mx.array(up))
    return cls.from_fov(
        fov, width, height, R=rotation, t=translation, degrees=degrees, **kwargs
    )

world_to_camera(points)

Transform world points (..., 3) into the camera frame.

Source code in src/mlx3d/cameras/cameras.py
282
283
284
def world_to_camera(self, points: mx.array) -> mx.array:
    """Map world points ``(..., 3)`` into the camera frame."""
    # Row-vector form of R @ p + t.
    rotated = points @ self.R.T
    return rotated + self.t

camera_to_world(points)

Transform camera-frame points (..., 3) back to world coordinates.

Source code in src/mlx3d/cameras/cameras.py
286
287
288
def camera_to_world(self, points: mx.array) -> mx.array:
    """Map camera-frame points ``(..., 3)`` back to world coordinates."""
    # Row-vector form of R^T @ (p - t).
    centered = points - self.t
    return centered @ self.R

project_points(points, eps=1e-08)

Project world points (..., 3) to pixel coordinates.

Returns:

Type Description
tuple[array, array]

(xy, depth) where xy is (..., 2) pixel coordinates and depth is (...,) z-depth in the camera frame. Points behind the camera have negative depth; callers should mask on it.

Source code in src/mlx3d/cameras/cameras.py
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
def project_points(self, points: mx.array, eps: float = 1e-8) -> tuple[mx.array, mx.array]:
    """Project world points ``(..., 3)`` to pixel coordinates.

    Args:
        points: world-space points.
        eps: depths with ``|z| < eps`` are replaced by ``eps`` before the
            perspective divide (perspective cameras only).

    Returns:
        ``(xy, depth)`` where ``xy`` is ``(..., 2)`` pixel coordinates and
        ``depth`` is ``(...,)`` z-depth in the camera frame. Points behind
        the camera have negative depth; callers should mask on it.
    """
    cam_pts = self.world_to_camera(points)
    px, py, depth = cam_pts[..., 0], cam_pts[..., 1], cam_pts[..., 2]
    if not self.orthographic:
        # Perspective divide with a guarded denominator, then lens distortion
        # on the normalized coordinates.
        safe_depth = mx.where(mx.abs(depth) < eps, mx.full(depth.shape, eps), depth)
        recip = 1.0 / safe_depth
        px, py = self._distort(px * recip, py * recip)
    u = self.fx * px + self.cx
    v = self.fy * py + self.cy
    return mx.stack([u, v], axis=-1), depth

unproject_points(xy, depth)

Lift pixel coordinates (..., 2) with z-depths (...,) back to world points.

Source code in src/mlx3d/cameras/cameras.py
327
328
329
330
331
332
333
334
335
336
def unproject_points(self, xy: mx.array, depth: mx.array) -> mx.array:
    """Lift pixel coordinates ``(..., 2)`` with z-depths ``(...,)`` back to world points."""
    # Pixel -> normalized image coordinates (shared by both projections).
    nx = (xy[..., 0] - self.cx) / self.fx
    ny = (xy[..., 1] - self.cy) / self.fy
    if self.orthographic:
        cam_pts = mx.stack([nx, ny, depth], axis=-1)
    else:
        nx, ny = self._undistort(nx, ny)
        cam_pts = mx.stack([nx * depth, ny * depth, depth], axis=-1)
    return self.camera_to_world(cam_pts)

generate_rays()

Generate one ray per pixel (at pixel centers).

Returns:

Type Description
tuple[array, array]

(origins, directions), both (height, width, 3) in world coordinates. Directions are normalized.

Source code in src/mlx3d/cameras/cameras.py
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
def generate_rays(self) -> tuple[mx.array, mx.array]:
    """Generate one ray per pixel (at pixel centers).

    Returns:
        ``(origins, directions)``, both ``(height, width, 3)`` in world
        coordinates. Directions are normalized.
    """
    grid = (self.height, self.width)
    cols = mx.arange(self.width, dtype=mx.float32) + 0.5
    rows = mx.arange(self.height, dtype=mx.float32) + 0.5
    uu = mx.broadcast_to(cols[None, :], grid)
    vv = mx.broadcast_to(rows[:, None], grid)
    xn = (uu - self.cx) / self.fx
    yn = (vv - self.cy) / self.fy

    if not self.orthographic:
        # Pixels carry lens distortion; the rays through them must not.
        xn, yn = self._undistort(xn, yn)
        cam_dirs = mx.stack([xn, yn, mx.ones_like(uu)], axis=-1)
        # Row vectors times R apply R^T to each direction (camera -> world).
        world_dirs = cam_dirs @ self.R
        world_dirs = world_dirs / mx.linalg.norm(world_dirs, axis=-1, keepdims=True)
        origins = mx.broadcast_to(self.camera_center, world_dirs.shape)
        return origins, world_dirs

    # Orthographic: every ray shares the forward axis; origins are spread over
    # the image plane (z = 0 in camera space).
    plane_pts = mx.stack([xn, yn, mx.zeros_like(uu)], axis=-1)
    origins = self.camera_to_world(plane_pts)
    forward = mx.array([0.0, 0.0, 1.0]) @ self.R
    forward = forward / mx.linalg.norm(forward)
    return origins, mx.broadcast_to(forward, origins.shape)

CameraBatch dataclass

A batch of N pinhole cameras with vectorized projection and rays.

Stores stacked extrinsics/intrinsics (R (N, 3, 3), t (N, 3), fx/fy/cx/cy (N,)) sharing one image size. Indexing returns a single Camera, so it interoperates with the per-camera renderers; the batched methods avoid Python loops for multi-view projection and ray generation (e.g. projecting one point set into every view at once).

Source code in src/mlx3d/cameras/cameras.py
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
@dataclass
class CameraBatch:
    """A batch of ``N`` pinhole cameras with vectorized projection and rays.

    Stores stacked extrinsics/intrinsics (``R`` ``(N, 3, 3)``, ``t`` ``(N, 3)``,
    ``fx``/``fy``/``cx``/``cy`` ``(N,)``) sharing one image size. Indexing returns
    a single :class:`Camera`, so it interoperates with the per-camera renderers;
    the batched methods avoid Python loops for multi-view projection and ray
    generation (e.g. projecting one point set into every view at once).

    Only the perspective pinhole model is represented: the batch has no
    ``orthographic``, ``distortion`` or ``fisheye`` fields, and the batched
    methods always perform a plain perspective projection.
    """

    R: mx.array  # (N, 3, 3)
    t: mx.array  # (N, 3)
    fx: mx.array  # (N,)
    fy: mx.array  # (N,)
    cx: mx.array  # (N,)
    cy: mx.array  # (N,)
    width: int
    height: int
    znear: float = 0.01
    zfar: float = 100.0

    @classmethod
    def from_cameras(cls, cameras: list[Camera]) -> "CameraBatch":
        """Stack a list of single :class:`Camera` objects into a batch.

        Only pose, focal lengths and principal point are stacked per camera.
        ``znear``/``zfar`` are taken from the first camera, and each camera's
        ``orthographic``/``distortion``/``fisheye`` settings are not carried
        into the batch.

        Raises:
            ValueError: if ``cameras`` is empty or the image sizes differ.
        """
        if not cameras:
            raise ValueError("from_cameras needs at least one camera.")
        w, h = cameras[0].width, cameras[0].height
        if any((c.width, c.height) != (w, h) for c in cameras):
            raise ValueError("CameraBatch requires all cameras to share an image size.")
        return cls(
            R=mx.stack([mx.array(c.R) for c in cameras]),
            t=mx.stack([mx.array(c.t) for c in cameras]),
            fx=mx.array([float(c.fx) for c in cameras]),
            fy=mx.array([float(c.fy) for c in cameras]),
            cx=mx.array([float(c.cx) for c in cameras]),
            cy=mx.array([float(c.cy) for c in cameras]),
            width=w,
            height=h,
            znear=float(cameras[0].znear),
            zfar=float(cameras[0].zfar),
        )

    def __len__(self) -> int:
        """Number of cameras ``N`` (leading dimension of ``R``)."""
        return int(self.R.shape[0])

    def __getitem__(self, i: int) -> Camera:
        """Return camera ``i`` as a standalone :class:`Camera`.

        The returned camera uses the :class:`Camera` defaults for
        ``orthographic``, ``distortion`` and ``fisheye``.

        Raises:
            IndexError: if ``i`` is an integer outside ``[-N, N)``.
        """
        # MLX array indexing is not bounds-checked, and Python's fallback
        # iteration protocol (``for cam in batch`` / ``list(batch)``) relies on
        # ``__getitem__`` raising IndexError to detect the end of the sequence.
        if isinstance(i, int):
            n = len(self)
            if not -n <= i < n:
                raise IndexError(f"CameraBatch index {i} out of range for {n} cameras.")
        return Camera(
            R=self.R[i],
            t=self.t[i],
            fx=float(self.fx[i]),
            fy=float(self.fy[i]),
            cx=float(self.cx[i]),
            cy=float(self.cy[i]),
            width=self.width,
            height=self.height,
            znear=self.znear,
            zfar=self.zfar,
        )

    @property
    def camera_centers(self) -> mx.array:
        """``(N, 3)`` camera positions in world coordinates (``-R^T t`` per camera)."""
        return -(mx.swapaxes(self.R, -1, -2) @ self.t[..., None])[..., 0]

    def world_to_camera(self, points: mx.array) -> mx.array:
        """Transform world points ``(P, 3)`` into each camera frame -> ``(N, P, 3)``."""
        # Row-vector form of R @ p + t, broadcast over the N cameras.
        return points[None] @ mx.swapaxes(self.R, -1, -2) + self.t[:, None, :]

    def project_points(self, points: mx.array, eps: float = 1e-8) -> tuple[mx.array, mx.array]:
        """Project world points ``(P, 3)`` into all ``N`` views.

        Depths with ``|z| < eps`` are replaced by ``eps`` before the
        perspective divide. No lens distortion is applied.

        Returns ``(xy, depth)`` of shapes ``(N, P, 2)`` and ``(N, P)``.
        """
        pc = self.world_to_camera(points)  # (N, P, 3)
        z = pc[..., 2]
        inv_z = 1.0 / mx.where(mx.abs(z) < eps, mx.full(z.shape, eps), z)
        u = self.fx[:, None] * pc[..., 0] * inv_z + self.cx[:, None]
        v = self.fy[:, None] * pc[..., 1] * inv_z + self.cy[:, None]
        return mx.stack([u, v], axis=-1), z

    def generate_rays(self) -> tuple[mx.array, mx.array]:
        """Per-pixel rays (through pixel centers) for every camera.

        Returns ``(origins, directions)`` both ``(N, height, width, 3)`` in world
        coordinates, with normalized directions.
        """
        n, h, w = len(self), self.height, self.width
        uu = mx.broadcast_to((mx.arange(w, dtype=mx.float32) + 0.5)[None, :], (h, w))
        vv = mx.broadcast_to((mx.arange(h, dtype=mx.float32) + 0.5)[:, None], (h, w))
        # Per-camera intrinsics -> direction in camera space, then to world.
        xcam = (uu[None] - self.cx[:, None, None]) / self.fx[:, None, None]  # (N, H, W)
        ycam = (vv[None] - self.cy[:, None, None]) / self.fy[:, None, None]
        dirs_cam = mx.stack([xcam, ycam, mx.ones_like(xcam)], axis=-1)  # (N, H, W, 3)
        # Row vectors times R apply R^T per direction (camera -> world).
        dirs_world = dirs_cam.reshape(n, h * w, 3) @ self.R  # (N, HW, 3)
        dirs_world = dirs_world / mx.linalg.norm(dirs_world, axis=-1, keepdims=True)
        dirs_world = dirs_world.reshape(n, h, w, 3)
        origins = mx.broadcast_to(self.camera_centers[:, None, None, :], dirs_world.shape)
        return origins, dirs_world

camera_centers property

(N, 3) camera positions in world coordinates.

from_cameras(cameras) classmethod

Stack a list of single Camera objects into a batch.

Source code in src/mlx3d/cameras/cameras.py
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
@classmethod
def from_cameras(cls, cameras: list[Camera]) -> "CameraBatch":
    """Stack a list of single :class:`Camera` objects into a batch.

    Pose, focal lengths and principal point are stacked per camera; the image
    size must be shared, and ``znear``/``zfar`` come from the first camera.

    Raises:
        ValueError: if ``cameras`` is empty or the image sizes differ.
    """
    if not cameras:
        raise ValueError("from_cameras needs at least one camera.")
    first = cameras[0]
    size = (first.width, first.height)
    for cam in cameras:
        if (cam.width, cam.height) != size:
            raise ValueError("CameraBatch requires all cameras to share an image size.")

    def stacked_scalars(attr: str) -> mx.array:
        # One float per camera -> (N,) array.
        return mx.array([float(getattr(cam, attr)) for cam in cameras])

    return cls(
        R=mx.stack([mx.array(cam.R) for cam in cameras]),
        t=mx.stack([mx.array(cam.t) for cam in cameras]),
        fx=stacked_scalars("fx"),
        fy=stacked_scalars("fy"),
        cx=stacked_scalars("cx"),
        cy=stacked_scalars("cy"),
        width=size[0],
        height=size[1],
        znear=float(first.znear),
        zfar=float(first.zfar),
    )

world_to_camera(points)

Transform world points (P, 3) into each camera frame -> (N, P, 3).

Source code in src/mlx3d/cameras/cameras.py
433
434
435
def world_to_camera(self, points: mx.array) -> mx.array:
    """Map world points ``(P, 3)`` into every camera frame, giving ``(N, P, 3)``."""
    # Row-vector form of R @ p + t, broadcast over the N cameras.
    rotations_t = mx.swapaxes(self.R, -1, -2)
    rotated = points[None] @ rotations_t
    return rotated + self.t[:, None, :]

project_points(points, eps=1e-08)

Project world points (P, 3) into all N views.

Returns (xy, depth) of shapes (N, P, 2) and (N, P).

Source code in src/mlx3d/cameras/cameras.py
437
438
439
440
441
442
443
444
445
446
447
def project_points(self, points: mx.array, eps: float = 1e-8) -> tuple[mx.array, mx.array]:
    """Project world points ``(P, 3)`` into all ``N`` views.

    Depths with ``|z| < eps`` are replaced by ``eps`` before the perspective
    divide.

    Returns ``(xy, depth)`` of shapes ``(N, P, 2)`` and ``(N, P)``.
    """
    cam_pts = self.world_to_camera(points)  # (N, P, 3)
    depth = cam_pts[..., 2]
    safe_depth = mx.where(mx.abs(depth) < eps, mx.full(depth.shape, eps), depth)
    recip = 1.0 / safe_depth
    # Per-camera intrinsics broadcast across the P points.
    focal_x, focal_y = self.fx[:, None], self.fy[:, None]
    center_x, center_y = self.cx[:, None], self.cy[:, None]
    u = focal_x * cam_pts[..., 0] * recip + center_x
    v = focal_y * cam_pts[..., 1] * recip + center_y
    return mx.stack([u, v], axis=-1), depth

generate_rays()

Per-pixel rays for every camera.

Returns (origins, directions) both (N, height, width, 3) in world coordinates, with normalized directions.

Source code in src/mlx3d/cameras/cameras.py
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
def generate_rays(self) -> tuple[mx.array, mx.array]:
    """Per-pixel rays (through pixel centers) for every camera.

    Returns ``(origins, directions)`` both ``(N, height, width, 3)`` in world
    coordinates, with normalized directions.
    """
    count = len(self)
    rows, cols = self.height, self.width
    grid = (rows, cols)
    col_centers = mx.arange(cols, dtype=mx.float32) + 0.5
    row_centers = mx.arange(rows, dtype=mx.float32) + 0.5
    uu = mx.broadcast_to(col_centers[None, :], grid)
    vv = mx.broadcast_to(row_centers[:, None], grid)
    # Normalized camera-space coordinates per camera: (N, H, W).
    xn = (uu[None] - self.cx[:, None, None]) / self.fx[:, None, None]
    yn = (vv[None] - self.cy[:, None, None]) / self.fy[:, None, None]
    cam_dirs = mx.stack([xn, yn, mx.ones_like(xn)], axis=-1)  # (N, H, W, 3)
    # Flatten pixels so one batched matmul rotates all directions to world space.
    flat_dirs = cam_dirs.reshape(count, rows * cols, 3)
    world_dirs = flat_dirs @ self.R  # (N, HW, 3)
    world_dirs = world_dirs / mx.linalg.norm(world_dirs, axis=-1, keepdims=True)
    world_dirs = world_dirs.reshape(count, rows, cols, 3)
    centers = self.camera_centers[:, None, None, :]
    origins = mx.broadcast_to(centers, world_dirs.shape)
    return origins, world_dirs

focal_to_fov(focal, pixels)

Field of view in radians from a focal length in pixels.

Source code in src/mlx3d/cameras/cameras.py
37
38
39
def focal_to_fov(focal: float, pixels: int) -> float:
    """Return the field of view, in radians, that ``pixels`` span at focal length ``focal`` (pixels)."""
    half_angle = math.atan(pixels / (2.0 * focal))
    return 2.0 * half_angle

fov_to_focal(fov, pixels)

Focal length in pixels from a field of view in radians.

Source code in src/mlx3d/cameras/cameras.py
32
33
34
def fov_to_focal(fov: float, pixels: int) -> float:
    """Return the focal length, in pixels, for which ``pixels`` span a field of view of ``fov`` radians."""
    half_tan = math.tan(fov / 2.0)
    return pixels / (2.0 * half_tan)

look_at(eye, at, up)

Build OpenCV-convention extrinsics (R, t) for a camera at eye looking at at.

Parameters:

Name Type Description Default
eye array

(3,) camera position in world coordinates.

required
at array

(3,) target point in world coordinates.

required
up array

(3,) approximate world up vector.

required

Returns:

Type Description
tuple[array, array]

R (3, 3) and t (3,) such that X_cam = R @ X_world + t.

Source code in src/mlx3d/cameras/cameras.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def look_at(eye: mx.array, at: mx.array, up: mx.array) -> tuple[mx.array, mx.array]:
    """Compute OpenCV-convention extrinsics ``(R, t)`` for a camera at ``eye`` aimed at ``at``.

    The rows of ``R`` are the camera axes expressed in world coordinates: the
    first is ``forward x up``, the second ``forward x first``, and the third
    the normalized viewing direction ``at - eye``.

    Args:
        eye: (3,) camera position in world coordinates.
        at: (3,) target point in world coordinates.
        up: (3,) approximate world up vector.

    Returns:
        ``R`` (3, 3) and ``t`` (3,) such that ``X_cam = R @ X_world + t``.
    """
    # NOTE(review): if ``up`` is parallel to ``at - eye`` the first cross
    # product is zero and its normalization divides by zero -- confirm callers
    # avoid that configuration.
    eye, at, up = mx.array(eye), mx.array(at), mx.array(up)
    forward = at - eye
    forward = forward / mx.linalg.norm(forward)
    right = mx.linalg.cross(forward, up)
    right = right / mx.linalg.norm(right)
    down = mx.linalg.cross(forward, right)
    rotation = mx.stack([right, down, forward], axis=0)
    # t = -R @ eye places the camera center at the origin of the camera frame.
    translation = -(rotation @ eye)
    return rotation, translation

look_at_view_transform(dist=1.0, elev=0.0, azim=0.0, at=(0.0, 0.0, 0.0), up=(0.0, 1.0, 0.0), degrees=True)

Extrinsics for a camera on a sphere around at.

elev is the angle above the xz-plane, azim the angle around +y measured from +z. Returns (R, t) in the OpenCV convention.

Source code in src/mlx3d/cameras/cameras.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def look_at_view_transform(
    dist: float = 1.0,
    elev: float = 0.0,
    azim: float = 0.0,
    at: tuple[float, float, float] = (0.0, 0.0, 0.0),
    up: tuple[float, float, float] = (0.0, 1.0, 0.0),
    degrees: bool = True,
) -> tuple[mx.array, mx.array]:
    """Extrinsics for a camera orbiting ``at`` at distance ``dist``.

    ``elev`` is the angle above the xz-plane and ``azim`` the angle around
    ``+y`` measured from ``+z``; both are in degrees unless ``degrees=False``.
    Returns ``(R, t)`` in the OpenCV convention, as produced by ``look_at``.
    """
    if degrees:
        elev, azim = math.radians(elev), math.radians(azim)
    # Spherical -> Cartesian offset from the target.
    ring_radius = dist * math.cos(elev)
    offset_x = ring_radius * math.sin(azim)
    offset_y = dist * math.sin(elev)
    offset_z = ring_radius * math.cos(azim)
    eye = mx.array([at[0] + offset_x, at[1] + offset_y, at[2] + offset_z])
    return look_at(eye, mx.array(at), mx.array(up))

refine_camera(camera, twist)

Return a copy of camera whose pose is perturbed by an SE(3) twist.

The world-to-camera extrinsics are left-multiplied by exp(twist) (a 6D Lie-algebra vector [v, omega]), so at twist = 0 the camera is unchanged. The result is differentiable w.r.t. twist, which makes camera poses optimizable jointly with a scene (BARF / pose-free NeRF & 3DGS): parameterize each view by a learnable twist, refine the camera, render, and backpropagate.

Intrinsics, image size and distortion are carried over unchanged.

Source code in src/mlx3d/cameras/cameras.py
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
def refine_camera(camera: Camera, twist: mx.array) -> Camera:
    """Return a copy of ``camera`` whose pose is perturbed by an SE(3) ``twist``.

    The world-to-camera extrinsics are left-multiplied by ``exp(twist)`` (a 6D
    Lie-algebra vector ``[v, omega]``), so at ``twist = 0`` the camera is
    unchanged. The result is differentiable w.r.t. ``twist``, which makes camera
    poses optimizable jointly with a scene (BARF / pose-free NeRF & 3DGS):
    parameterize each view by a learnable twist, refine the camera, render, and
    backpropagate.

    Args:
        camera: the camera to perturb; it is not modified.
        twist: 6D twist passed to ``se3_exp_map``.

    Returns:
        A new camera with updated ``R``/``t``. Every other field (intrinsics,
        image size, clipping range, projection model and distortion) is
        carried over unchanged.
    """
    from dataclasses import replace

    from ..transforms.se3 import Transform3d, se3_exp_map

    delta = se3_exp_map(twist)
    refined = Transform3d(camera.R, camera.t).compose(delta)
    # replace() copies all remaining dataclass fields, so fields added to
    # Camera later are preserved without having to extend a hand-written list.
    return replace(camera, R=refined.rot, t=refined.trans)