Datasets Reference

This section provides a detailed API reference for the modules related to built-in datasets in the datarec library. Dataset entry points live in datarec.datasets and are resolved through registry metadata that records the available versions of each dataset.

Minimal usage

from datarec.datasets import Movielens

data = Movielens(version="1m").prepare_and_load()

Dataset Entry Points

list_datasets()

Return the built-in dataset names registered in DataRec.

Source code in datarec/datasets/__init__.py
def list_datasets() -> list[str]:
    """Return the built-in dataset names registered in DataRec."""
    return sorted(available_datasets())
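
A minimal usage sketch; the names returned depend on the registry files shipped with your installation:

from datarec.datasets import list_datasets

# Sorted registry names of the built-in datasets.
print(list_datasets())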

list_dataset_versions(name)

Return all available versions for a registered dataset.

Source code in datarec/datasets/__init__.py
def list_dataset_versions(name: str) -> list[str]:
    """Return all available versions for a registered dataset."""
    conf = load_dataset_config(name)
    return list(conf.get("versions", []))
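
For example, assuming "movielens" is the registry name used for the MovieLens entry point (check list_datasets() for the actual names):

from datarec.datasets import list_dataset_versions

# Versions as declared in the dataset's registry YAML.
print(list_dataset_versions("movielens"))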

latest_dataset_version(name)

Return the latest version for a registered dataset.

Source code in datarec/datasets/__init__.py
def latest_dataset_version(name: str) -> str:
    """Return the latest version for a registered dataset."""
    return load_dataset_config(name)["latest_version"]
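
A short sketch, again assuming "movielens" is a registered name:

from datarec.datasets import latest_dataset_version

# Reads the "latest_version" field from the dataset's registry config.
print(latest_dataset_version("movielens"))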

load_dataset(name, version='latest', **kwargs)

Instantiate a dataset by registry name, or load a dataset from a remote registry YAML.

If name is a URL, this delegates to load_dataset_from_url.

Source code in datarec/datasets/__init__.py
def load_dataset(name: str, version: str = "latest", **kwargs) -> RegisteredDataset:
    """
    Instantiate a dataset by registry name, or load a dataset from a remote registry YAML.

    If `name` is a URL, this delegates to `load_dataset_from_url`.
    """
    if _is_url(name):
        prepare_and_load = kwargs.pop("prepare_and_load", True)
        folder = kwargs.pop("folder", None)
        return load_dataset_from_url(name, folder=folder, prepare_and_load=prepare_and_load)

    if name not in available_datasets():
        raise ValueError(f"Dataset '{name}' is not registered. Available: {', '.join(list_datasets())}")

    if version == "latest":
        version = latest_dataset_version(name)

    if version not in list_dataset_versions(name):
        raise ValueError(
            f"Unsupported version '{version}' for dataset '{name}'. "
            f"Supported versions: {', '.join(list_dataset_versions(name))}"
        )

    for cls in DatasetEntryPoint.__subclasses__():
        if cls.dataset_name == name:
            return cls(version=version, **kwargs)

    raise ValueError(f"No entrypoint registered for dataset '{name}'.")

load_dataset_from_url(url, *, folder=None, prepare_and_load=True)

Load a dataset from a remote registry YAML.

Parameters:

- url (str, required): URL to a registry version YAML.
- folder (str | None, default None): Optional output folder override.
- prepare_and_load (bool, default True): When True, returns a loaded DataRec; otherwise returns a RegisteredDataset.

Returns:

RegisteredDataset | DataRec: The dataset entry point or the loaded dataset.

Source code in datarec/datasets/__init__.py
def load_dataset_from_url(url: str, *, folder: Optional[str] = None, prepare_and_load: bool = True):
    """
    Load a dataset from a remote registry YAML.

    Args:
        url (str): URL to a registry version YAML.
        folder (str | None): Optional output folder override.
        prepare_and_load (bool): When True, returns a loaded DataRec; otherwise returns RegisteredDataset.

    Returns:
        RegisteredDataset | DataRec: The dataset entrypoint or loaded dataset.
    """
    dataset = RegisteredDataset.from_url(url, folder=folder)
    if prepare_and_load:
        return dataset.prepare_and_load()
    return dataset
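
A sketch with a placeholder URL; substitute the address of a registry version YAML you actually host:

from datarec.datasets import load_dataset_from_url

# With prepare_and_load=True (the default) a loaded DataRec is returned;
# pass prepare_and_load=False to get the RegisteredDataset entry point instead.
data = load_dataset_from_url(
    "https://example.org/datasets/my_dataset/v1.yml",  # placeholder URL
    folder="./data",  # optional output folder override
)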

Registry Utilities

available_datasets()

Return a list of the available built-in datasets.

Returns:

List[str]: names of the built-in datasets.

Source code in datarec/registry/utils.py
def available_datasets()->List[str]:
    """
    Return a list of available built-in datasets
    Returns:
        List[str]: list of built-in datasets
    """
    return sorted([d.replace('.yml', '') for d in os.listdir(REGISTRY_DATASETS_FOLDER)])
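
A minimal sketch, assuming the module is importable as datarec.registry.utils (mirroring the file path shown above):

from datarec.registry.utils import available_datasets

# Registry names derived from the .yml files in the registry datasets folder.
print(available_datasets())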

print_available_datasets()

Print the list of available built-in datasets; returns None.

Source code in datarec/registry/utils.py
def print_available_datasets()->None:
    """
    Prints the list of available built-in datasets
    Returns:
        None
    """
    print("""
DataRec built-in datasets:
- """+'\n - '.join(available_datasets()))

compute_dataset_characteristics(dataset_name, version, *, output_dir=REGISTRY_METRICS_FOLDER, use_cache=True, overwrite=False)

Compute and persist characteristics for a specific dataset/version.

Returns:

str: Path to the written YAML file.

Source code in datarec/registry/utils.py
def compute_dataset_characteristics(dataset_name: str,
                                    version: str,
                                    *,
                                    output_dir: str = REGISTRY_METRICS_FOLDER,
                                    use_cache: bool = True,
                                    overwrite: bool = False) -> str:
    """
    Compute and persist characteristics for a specific dataset/version.

    Returns:
        Path to the written YAML file.
    """
    os.makedirs(output_dir, exist_ok=True)
    out_path = registry_metrics_filepath(dataset_name, version)

    if os.path.exists(out_path) and not overwrite:
        print(f"Skip {dataset_name} {version}: file exists ({out_path})")
        return out_path

    dset = RegisteredDataset(dataset_name=dataset_name, version=version)
    dset.prepare(use_cache=use_cache)
    dr = dset.load(use_cache=use_cache, to_cache=use_cache, only_required=True)

    characteristics = {}
    for name, func in CHARACTERISTICS.items():
        try:
            value = func(dr)
            # Make YAML-friendly: unwrap numpy scalars if possible
            if hasattr(value, "item"):
                try:
                    value = value.item()
                except Exception:
                    pass
            characteristics[name] = value
        except Exception as exc:
            characteristics[name] = None
            characteristics[f"{name}_error"] = str(exc)

    payload = {
        "dataset": dataset_name,
        "version": version,
        "computed_at": datetime.now(timezone.utc).isoformat(),
        "characteristics": characteristics,
    }

    with open(out_path, "w") as f:
        yaml.safe_dump(payload, f, sort_keys=False)

    print(f"Wrote {out_path}")
    return out_path
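
A hedged sketch; the dataset name and version are assumptions, so pick values reported by list_datasets() and list_dataset_versions():

from datarec.registry.utils import compute_dataset_characteristics

# Prepares and loads the dataset, computes each registered characteristic,
# and writes a YAML file whose path is returned.
path = compute_dataset_characteristics("movielens", "1m", overwrite=False)
print(path)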

get_metrics_filepath(dataset_name, version)

Return the expected registry metrics filepath for a dataset/version.

Source code in datarec/registry/utils.py
def get_metrics_filepath(dataset_name: str, version: str) -> str:
    """
    Return the expected registry metrics filepath for a dataset/version.
    """
    return registry_metrics_filepath(dataset_name, version)
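
A sketch with assumed name/version values; the call only returns the expected path:

from datarec.registry.utils import get_metrics_filepath

print(get_metrics_filepath("movielens", "1m"))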

compute_all_characteristics(output_dir=REGISTRY_METRICS_FOLDER, use_cache=True, overwrite=False)

Compute characteristics for every dataset/version and write YAML files.

Source code in datarec/registry/utils.py
def compute_all_characteristics(output_dir: str = REGISTRY_METRICS_FOLDER,
                                use_cache: bool = True,
                                overwrite: bool = False) -> None:
    """
    Compute characteristics for every dataset/version and write YAML files.
    """
    os.makedirs(output_dir, exist_ok=True)

    for ds_name in available_datasets():
        conf = load_dataset_config(ds_name)
        versions = conf.get("versions", [])
        if isinstance(versions, dict):  # support both list and dict configs
            versions = list(versions.keys())

        for version in versions:
            try:
                compute_dataset_characteristics(
                    dataset_name=ds_name,
                    version=version,
                    output_dir=output_dir,
                    use_cache=use_cache,
                    overwrite=overwrite,
                )
            except Exception as exc:  # noqa: BLE001
                print(f"Failed {ds_name} {version}: {exc}")