Skip to content

mlx3d.datasets

mlx3d.datasets

ColmapDataset dataclass

Source code in src/mlx3d/datasets/colmap.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
@dataclass
class ColmapDataset:
    """Cameras, images and sparse points of one COLMAP reconstruction.

    ``len()`` reports the number of cameras, and indexing pairs camera ``i``
    with image ``i``.
    """

    cameras: list[Camera]
    images: ImageCollection
    image_names: list[str]
    points: mx.array  # (P, 3) SfM points
    point_colors: mx.array  # (P, 3) in [0, 1]

    def __len__(self) -> int:
        """Number of cameras in the dataset."""
        return len(self.cameras)

    def __getitem__(self, i: int) -> tuple[Camera, mx.array]:
        """Return the ``(camera, image)`` pair stored at index ``i``."""
        camera = self.cameras[i]
        image = self.images[i]
        return camera, image

    @property
    def scene_extent(self) -> float:
        """Radius of the camera-centers bounding sphere (used to scale
        densification thresholds, as in 3DGS).

        Computed as the largest distance from any camera center to the mean
        of all camera centers, enlarged by 10%.
        """
        positions = np.stack([np.array(cam.camera_center) for cam in self.cameras])
        centroid = positions.mean(axis=0)
        distances = np.linalg.norm(positions - centroid, axis=1)
        return float(distances.max()) * 1.1

scene_extent property

Radius of the camera-centers bounding sphere (used to scale densification thresholds, as in 3DGS).

ImageCollection

A list-like view over training images with configurable storage.

Always yields (H, W, 3) float32 MLX arrays in [0, 1], regardless of the underlying storage mode.

Source code in src/mlx3d/datasets/images.py
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
class ImageCollection:
    """A list-like view over training images with configurable storage.

    Indexing and iteration always yield (H, W, 3) float32 MLX arrays in
    [0, 1], regardless of the underlying storage mode:

    * ``"ram"``: the decoded image is kept as an MLX array and returned as-is.
    * ``"uint8"``: the decoded image is quantised to 8 bits and converted
      back to float32 on every access.
    * ``"disk"``: only the file path is kept; the file is decoded on every
      access.
    """

    def __init__(
        self, cache: str = "ram", downscale: int = 1, white_background: bool | None = None
    ):
        """Create an empty collection.

        Args:
            cache: storage mode, one of ``"ram"``, ``"uint8"`` or ``"disk"``.
            downscale: forwarded to ``decode_image_file`` for every image.
            white_background: forwarded to ``decode_image_file`` for every image.

        Raises:
            ValueError: if ``cache`` is not one of the three storage modes.
        """
        supported = ("ram", "uint8", "disk")
        if cache not in supported:
            raise ValueError(f"cache must be 'ram', 'uint8' or 'disk', got {cache!r}.")
        self._items: list = []
        self.cache = cache
        self.downscale = downscale
        self.white_background = white_background

    def append_file(self, path: str) -> None:
        """Register an image file, decoding it now or later per the cache mode."""
        if self.cache == "disk":
            # Disk mode keeps only the path; decoding happens on access.
            self._items.append(path)
            return

        decoded = decode_image_file(path, self.downscale, self.white_background)
        if self.cache == "uint8":
            # Adding 0.5 before the truncating cast rounds to the nearest level.
            stored = (decoded * 255.0 + 0.5).astype(np.uint8)
        else:
            stored = mx.array(decoded)
        self._items.append(stored)

    def shape_of(self, i: int) -> tuple[int, int]:
        """(height, width) of image ``i`` (decodes it in disk mode)."""
        shape = self[i].shape
        return shape[0], shape[1]

    def __len__(self) -> int:
        """Number of registered images."""
        return len(self._items)

    def __getitem__(self, i: int) -> mx.array:
        """Return image ``i`` as an MLX array, converting from storage as needed."""
        stored = self._items[i]
        mode = self.cache
        if mode == "ram":
            return stored
        if mode == "uint8":
            as_float = stored.astype(np.float32) / 255.0
            return mx.array(as_float)
        # Remaining mode: the stored entry is a file path.
        decoded = decode_image_file(stored, self.downscale, self.white_background)
        return mx.array(decoded)

    def __iter__(self):
        """Yield every image in registration order, going through ``__getitem__``."""
        yield from (self[k] for k in range(len(self)))

    @property
    def nbytes_resident(self) -> int:
        """Approximate resident memory of the stored images, in bytes."""
        if self.cache == "disk":
            # Only paths are held; image data is not counted as resident.
            return 0
        if self.cache == "uint8":
            sizes = [quantised.nbytes for quantised in self._items]
        else:
            # Four bytes per element for the float32 arrays kept in "ram" mode.
            sizes = [image.size * 4 for image in self._items]
        return sum(sizes)

nbytes_resident property

Approximate resident memory of the stored images, in bytes.

append_file(path)

Register an image file, decoding it now or later per the cache mode.

Source code in src/mlx3d/datasets/images.py
65
66
67
68
69
70
71
72
73
74
def append_file(self, path: str) -> None:
    """Register an image file, decoding it now or later per the cache mode."""
    if self.cache == "disk":
        # Disk mode keeps only the path; decoding happens on access.
        self._items.append(path)
        return

    decoded = decode_image_file(path, self.downscale, self.white_background)
    if self.cache == "uint8":
        # Adding 0.5 before the truncating cast rounds to the nearest level.
        stored = (decoded * 255.0 + 0.5).astype(np.uint8)
    else:
        stored = mx.array(decoded)
    self._items.append(stored)

shape_of(i)

(height, width) of image i (decodes it in disk mode).

Source code in src/mlx3d/datasets/images.py
76
77
78
79
def shape_of(self, i: int) -> tuple[int, int]:
    """(height, width) of image ``i`` (decodes it in disk mode)."""
    shape = self[i].shape
    return shape[0], shape[1]

load_blender(root, split='train', downscale=1, white_background=True, cache='ram')

Load a Blender-synthetic scene (lego, chair, ...).

Parameters:

Name Type Description Default
root str

scene directory containing transforms_<split>.json.

required
split str

"train", "val" or "test".

'train'
downscale int

integer image downscaling factor.

1
white_background bool

composite the RGBA renders onto white (else black).

True
cache str

image storage policy ("ram", "uint8" or "disk"); see mlx3d.datasets.images.ImageCollection.

'ram'
Source code in src/mlx3d/datasets/blender.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def load_blender(
    root: str,
    split: str = "train",
    downscale: int = 1,
    white_background: bool = True,
    cache: str = "ram",
) -> BlenderDataset:
    """Load a Blender-synthetic scene (lego, chair, ...).

    Every frame listed in ``transforms_<split>.json`` contributes one image
    and one pinhole camera. The focal length is derived from the file-level
    ``camera_angle_x`` and the width of the image as returned by the image
    collection; the principal point is placed at the image center.

    Args:
        root: scene directory containing ``transforms_<split>.json``.
        split: ``"train"``, ``"val"`` or ``"test"``.
        downscale: integer image downscaling factor.
        white_background: composite the RGBA renders onto white (else black).
        cache: image storage policy (``"ram"``, ``"uint8"`` or ``"disk"``);
            see :class:`~mlx3d.datasets.images.ImageCollection`.

    Returns:
        A ``BlenderDataset`` holding the per-frame cameras and the image
        collection, in the order the frames appear in the JSON file.

    Raises:
        OSError: if the transforms file cannot be opened.
        KeyError: if the JSON lacks ``frames``, ``camera_angle_x`` or a
            per-frame ``file_path`` / ``transform_matrix``.
    """
    # JSON text is UTF-8 by specification; without an explicit encoding,
    # ``open`` falls back to the platform locale and can mis-decode
    # non-ASCII content (e.g. file paths) on some systems.
    with open(os.path.join(root, f"transforms_{split}.json"), encoding="utf-8") as f:
        meta = json.load(f)

    cameras: list[Camera] = []
    images = ImageCollection(cache=cache, downscale=downscale, white_background=white_background)
    for frame in meta["frames"]:
        # ``file_path`` is tried with a ".png" suffix first; when that file
        # does not exist the path is used verbatim.
        img_path = os.path.join(root, frame["file_path"] + ".png")
        if not os.path.exists(img_path):
            img_path = os.path.join(root, frame["file_path"])
        images.append_file(img_path)
        # Query the collection for the size of the image just added so the
        # intrinsics match what the collection will actually return.
        H, W = images.shape_of(len(images) - 1)

        # Pinhole relation between horizontal field of view and focal length.
        focal = 0.5 * W / math.tan(0.5 * meta["camera_angle_x"])
        R, t = _c2w_opengl_to_opencv_extrinsics(
            np.asarray(frame["transform_matrix"], dtype=np.float64)
        )
        cameras.append(
            Camera(
                R=R,
                t=t,
                fx=focal,
                fy=focal,
                cx=W / 2.0,
                cy=H / 2.0,
                width=W,
                height=H,
                znear=0.01,
                zfar=100.0,
            )
        )

    return BlenderDataset(cameras=cameras, images=images)

load_colmap(root, images_dir='images', downscale=1, load_images=True, cache='ram')

Load a COLMAP scene laid out as root/sparse/0 + root/<images_dir>.

COLMAP already uses the OpenCV camera convention, so extrinsics map directly onto mlx3d.cameras.Camera.

Parameters:

Name Type Description Default
cache str

image storage policy — "ram" (float32, fastest), "uint8" (4x less memory, negligible per-access cost), or "disk" (paths only, decode on access; near-zero resident memory). See mlx3d.datasets.images.ImageCollection.

'ram'
Source code in src/mlx3d/datasets/colmap.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
def load_colmap(
    root: str,
    images_dir: str = "images",
    downscale: int = 1,
    load_images: bool = True,
    cache: str = "ram",
) -> ColmapDataset:
    """Load a COLMAP scene laid out as ``root/sparse/0`` + ``root/<images_dir>``.

    The binary model (``cameras.bin``, ``images.bin``, ``points3D.bin``) is
    read from ``root/sparse/0``, or from ``root/sparse`` when that directory
    does not exist. Images are processed in ascending order of file name.

    COLMAP already uses the OpenCV camera convention, so extrinsics map
    directly onto :class:`~mlx3d.cameras.Camera`.

    Args:
        root: scene directory.
        images_dir: sub-directory of ``root`` that holds the image files.
        downscale: integer divisor applied to the image size; intrinsics are
            rescaled to the resulting resolution.
        load_images: when false, no image is registered in the collection;
            cameras, names and points are still returned.
        cache: image storage policy — ``"ram"`` (float32, fastest),
            ``"uint8"`` (4x less memory, negligible per-access cost), or
            ``"disk"`` (paths only, decode on access; near-zero resident
            memory). See :class:`~mlx3d.datasets.images.ImageCollection`.

    Returns:
        A ``ColmapDataset`` with cameras, images, image names and the sparse
        point cloud (colors scaled to [0, 1]).

    Raises:
        ValueError: if a camera uses a model other than ``SIMPLE_PINHOLE``,
            ``PINHOLE``, ``SIMPLE_RADIAL``, ``RADIAL``, ``OPENCV`` or
            ``OPENCV_FISHEYE``.
    """
    model_dir = os.path.join(root, "sparse", "0")
    if not os.path.isdir(model_dir):
        model_dir = os.path.join(root, "sparse")
    cams_meta = _read_cameras_bin(os.path.join(model_dir, "cameras.bin"))
    imgs_meta = _read_images_bin(os.path.join(model_dir, "images.bin"))
    xyz, rgb = _read_points3d_bin(os.path.join(model_dir, "points3D.bin"))

    cameras: list[Camera] = []
    images = ImageCollection(cache=cache, downscale=downscale, white_background=None)
    names: list[str] = []
    for meta in sorted(imgs_meta.values(), key=lambda entry: entry["name"]):
        cam = cams_meta[meta["camera_id"]]
        params = cam["params"]
        model = cam["model"]

        # Focal length / principal point: the models split into a
        # single-focal layout (f, cx, cy, ...) and a two-focal layout
        # (fx, fy, cx, cy, ...).
        if model in ("SIMPLE_PINHOLE", "SIMPLE_RADIAL", "RADIAL"):
            fx = fy = params[0]
            cx, cy = params[1], params[2]
        elif model in ("PINHOLE", "OPENCV", "OPENCV_FISHEYE"):
            fx, fy, cx, cy = params[:4]
        else:  # pragma: no cover
            raise ValueError(f"Unsupported camera model {model}")

        # Distortion is always handed over as four coefficients; models with
        # fewer radial terms are zero-padded.
        if model == "SIMPLE_RADIAL":
            distortion = (params[3], 0.0, 0.0, 0.0)
        elif model == "RADIAL":
            distortion = (params[3], params[4], 0.0, 0.0)
        elif model in ("OPENCV", "OPENCV_FISHEYE"):
            # OPENCV: k1, k2, p1, p2 -- OPENCV_FISHEYE: k1, k2, k3, k4
            distortion = tuple(params[4:8])
        else:
            distortion = None
        fisheye = model == "OPENCV_FISHEYE"

        image_path = os.path.join(root, images_dir, meta["name"])
        file_size = _image_size(image_path)
        # Use the actual image files as the training-resolution source of
        # truth. Some public COLMAP scenes ship pre-resized images while
        # retaining full-resolution camera metadata.
        base_w, base_h = file_size or (cam["width"], cam["height"])
        W = max(1, base_w // downscale)
        H = max(1, base_h // downscale)
        # Ratio between the training resolution and the resolution the
        # intrinsics were calibrated at.
        sx = W / cam["width"]
        sy = H / cam["height"]
        rotation = _qvec_to_rotmat(meta["qvec"])
        translation = meta["tvec"]

        camera = Camera(
            R=mx.array(rotation.astype(np.float32)),
            t=mx.array(translation.astype(np.float32)),
            fx=fx * sx,
            fy=fy * sy,
            cx=cx * sx,
            cy=cy * sy,
            width=int(W),
            height=int(H),
            distortion=distortion,
            fisheye=fisheye,
        )
        cameras.append(camera)
        if load_images:
            images.append_file(image_path)
        names.append(meta["name"])

    points = mx.array(xyz.astype(np.float32))
    point_colors = mx.array(rgb.astype(np.float32) / 255.0)
    return ColmapDataset(
        cameras=cameras,
        images=images,
        image_names=names,
        points=points,
        point_colors=point_colors,
    )

load_instant_ngp(root, transforms_file='transforms.json', downscale=1, white_background=False, cache='ram')

Load an Instant-NGP / nerfstudio scene.

Parameters:

Name Type Description Default
root str

scene directory containing transforms_file.

required
transforms_file str

name of the transforms JSON (default transforms.json).

'transforms.json'
downscale int

integer image downscaling factor (intrinsics are scaled to match).

1
white_background bool

composite RGBA images onto white (else black).

False
cache str

image storage policy; see mlx3d.datasets.images.ImageCollection.

'ram'

Returns:

Name Type Description
BlenderDataset

An mlx3d.datasets.BlenderDataset (cameras + images).

Source code in src/mlx3d/datasets/instant_ngp.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def load_instant_ngp(
    root: str,
    transforms_file: str = "transforms.json",
    downscale: int = 1,
    white_background: bool = False,
    cache: str = "ram",
) -> BlenderDataset:
    """Load an Instant-NGP / nerfstudio scene.

    Intrinsics keys (``w``, ``h``, ``fl_x``, ``fl_y``, ``cx``, ``cy``,
    ``camera_angle_x``) are looked up on the frame first and fall back to the
    top level of the JSON. When ``fl_x`` is unavailable the focal length is
    derived from ``camera_angle_x`` and the principal point is placed at the
    image center.

    Args:
        root: scene directory containing ``transforms_file``.
        transforms_file: name of the transforms JSON (default ``transforms.json``).
        downscale: integer image downscaling factor (intrinsics are scaled to match).
        white_background: composite RGBA images onto white (else black).
        cache: image storage policy; see :class:`~mlx3d.datasets.images.ImageCollection`.

    Returns:
        A :class:`~mlx3d.datasets.BlenderDataset` (``cameras`` + ``images``).

    Raises:
        OSError: if the transforms file cannot be opened.
        KeyError: if the JSON lacks ``frames`` or a per-frame ``file_path`` /
            ``transform_matrix``.
        ValueError: if neither ``fl_x`` nor ``camera_angle_x`` is available
            for a frame.
    """
    # JSON text is UTF-8 by specification; without an explicit encoding,
    # ``open`` falls back to the platform locale and can mis-decode
    # non-ASCII content (e.g. file paths) on some systems.
    with open(os.path.join(root, transforms_file), encoding="utf-8") as f:
        meta = json.load(f)

    cameras: list[Camera] = []
    images = ImageCollection(cache=cache, downscale=downscale, white_background=white_background)

    for frame in meta["frames"]:
        images.append_file(_resolve_path(root, frame["file_path"]))
        # Size of the image as the collection returns it, so the intrinsics
        # below match the pixels that will actually be used.
        h, w = images.shape_of(len(images) - 1)

        # Per-frame intrinsics override globals when present.
        def _get(key, default=None):
            return frame.get(key, meta.get(key, default))

        # Resolution the stored intrinsics refer to; when the file does not
        # say, assume the image was shrunk by exactly ``downscale``.
        orig_w = float(_get("w", w * downscale))
        orig_h = float(_get("h", h * downscale))
        sx, sy = w / orig_w, h / orig_h  # account for downscaling

        fl_x = _get("fl_x")
        if fl_x is not None:
            fx = float(fl_x) * sx
            fy = float(_get("fl_y", fl_x)) * sy
            cx = float(_get("cx", orig_w / 2.0)) * sx
            cy = float(_get("cy", orig_h / 2.0)) * sy
        else:
            angle = _get("camera_angle_x")
            if angle is None:
                # Name the file that was actually read rather than the default.
                raise ValueError(f"{transforms_file} must provide fl_x or camera_angle_x.")
            # Pinhole relation between horizontal field of view and focal length.
            fx = fy = 0.5 * w / math.tan(0.5 * float(angle))
            cx, cy = w / 2.0, h / 2.0

        R, t = _c2w_opengl_to_opencv_extrinsics(
            np.asarray(frame["transform_matrix"], dtype=np.float64)
        )
        cameras.append(
            Camera(R=R, t=t, fx=fx, fy=fy, cx=cx, cy=cy, width=w, height=h, znear=0.01, zfar=100.0)
        )

    return BlenderDataset(cameras=cameras, images=images)