# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.

from __future__ import annotations  # only needed for Python ≤ 3.9

import os
import os.path as op
import sys
import time
from pathlib import Path
from shutil import rmtree

from .. import __version__ as mne_version
from ..fixes import _compare_version
from ..utils import _safe_input, logger, warn
from .config import (
    MISC_VERSIONED,
    RELEASES,
    TESTING_VERSIONED,
    _bst_license_text,
)
from .utils import (
    _dataset_version,
    _do_path_update,
    _downloader_params,
    _get_path,
    _log_time_size,
)

_FAKE_VERSION = None  # used for monkeypatching while testing versioning


def fetch_dataset(
    dataset_params,
    processor=None,
    path=None,
    force_update=False,
    update_path=True,
    download=True,
    check_version=False,
    return_version=False,
    accept=False,
    auth=None,
    token=None,
) -> Path | tuple[Path, str]:
    """Fetch an MNE-compatible dataset using pooch.

    Parameters
    ----------
    dataset_params : list of dict | dict
        The dataset name(s) and corresponding parameters to download the
        dataset(s). The parameters must contain the following keys:
        ``archive_name``, ``url``, ``folder_name``, ``hash``, and (optionally)
        ``config_key``. See Notes.
    processor : None | "unzip" | "untar" | instance of pooch.Unzip | instance of pooch.Untar
        What to do after downloading the file. ``"unzip"`` and ``"untar"``
        will decompress the downloaded file in place; for custom extraction
        (e.g., only extracting certain files from the archive) pass an
        instance of ``pooch.Unzip`` or ``pooch.Untar``. If ``None`` (the
        default), the files are left as-is.
    path : None | str
        Directory in which to put the dataset. If ``None``, the dataset
        location is determined by first checking whether
        ``dataset_params['config_key']`` is defined, and if so, whether that
        config key exists in the MNE-Python config file. If so, the configured
        path is used; if not, the location is set to the value of the
        ``MNE_DATA`` config key (if it exists), or ``~/mne_data`` otherwise.
    force_update : bool
        Force update of the dataset even if a local copy exists.
        Default is False.
    update_path : bool | None
        If True (default), set the mne-python config to the given path.
        If None, the user is prompted.
    download : bool
        If False and the dataset has not been downloaded yet, it will not be
        downloaded and an empty ``Path`` will be returned. This is mostly
        used for testing purposes and can be safely ignored by most users.
    check_version : bool
        Whether to check the version of the dataset or not. Each version of
        the dataset is stored in the root with a ``version.txt`` file.
    return_version : bool
        Whether or not to return the version of the dataset. Defaults to
        False.
    accept : bool
        Some MNE-supplied datasets require acceptance of an additional
        license. Default is ``False``.
    auth : tuple | None
        Optional authentication tuple containing the username and
        password/token, passed to ``pooch.HTTPDownloader`` (e.g.,
        ``auth=('foo', '012345')``).
    token : str | None
        Optional authentication token passed to ``pooch.HTTPDownloader``.

    Returns
    -------
    data_path : instance of Path
        The path to the fetched dataset.
    version : str
        Only returned if ``return_version`` is True.
    See Also
    --------
    mne.get_config
    mne.set_config
    mne.datasets.has_dataset

    Notes
    -----
    The ``dataset_params`` argument must contain the following keys:

    - ``archive_name``: The name of the (possibly compressed) file to download
    - ``url``: URL from which the file can be downloaded
    - ``folder_name``: the subfolder within the ``MNE_DATA`` folder in which
      to save and uncompress (if needed) the file(s)
    - ``hash``: the cryptographic hash type of the file followed by a colon
      and then the hash value (examples: "sha256:19uheid...",
      "md5:upodh2io...")
    - ``config_key`` (optional): key passed to :func:`mne.set_config` to
      store the on-disk location of the downloaded dataset (e.g.,
      ``"MNE_DATASETS_EEGBCI_PATH"``). This will only work for the provided
      datasets listed :ref:`here <datasets>`; do not use for user-defined
      datasets.

    An example would look like::

        {'dataset_name': 'sample',
         'archive_name': 'MNE-sample-data-processed.tar.gz',
         'hash': 'md5:12b75d1cb7df9dfb4ad73ed82f61094f',
         'url': 'https://osf.io/86qa2/download?version=5',
         'folder_name': 'MNE-sample-data',
         'config_key': 'MNE_DATASETS_SAMPLE_PATH'}

    For datasets where a single (possibly compressed) file must be
    downloaded, pass a single :class:`dict` as ``dataset_params``. For
    datasets where multiple files must be downloaded and (optionally)
    uncompressed separately, pass a list of dicts.
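    Examples
    --------
    A minimal sketch of a single-archive fetch, reusing the ``sample``
    parameters shown above; the final step is skipped in doctests because it
    performs a real (large) download:

    >>> from mne.datasets import fetch_dataset
    >>> dataset_params = {
    ...     'dataset_name': 'sample',
    ...     'archive_name': 'MNE-sample-data-processed.tar.gz',
    ...     'hash': 'md5:12b75d1cb7df9dfb4ad73ed82f61094f',
    ...     'url': 'https://osf.io/86qa2/download?version=5',
    ...     'folder_name': 'MNE-sample-data',
    ...     'config_key': 'MNE_DATASETS_SAMPLE_PATH',
    ... }
    >>> data_path = fetch_dataset(dataset_params, processor='untar')  # doctest: +SKIP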
    """  # noqa E501
    import pooch

    t0 = time.time()

    if auth is not None:
        if len(auth) != 2:
            raise RuntimeError(
                "auth should be a 2-tuple consisting "
                "of a username and password/token."
            )

    # processor to uncompress files
    if processor == "untar":
        processor = pooch.Untar(extract_dir=path)
    elif processor == "unzip":
        processor = pooch.Unzip(extract_dir=path)

    if isinstance(dataset_params, dict):
        dataset_params = [dataset_params]

    # extract configuration parameters
    names = [params["dataset_name"] for params in dataset_params]
    name = names[0]
    dataset_dict = dataset_params[0]
    config_key = dataset_dict.get("config_key", None)
    folder_name = dataset_dict["folder_name"]

    # get download path for specific dataset
    path = _get_path(path=path, key=config_key, name=name)

    # get the actual path to each dataset folder name
    final_path = op.join(path, folder_name)

    # handle BrainStorm datasets with nested folders
    if name.startswith("bst_"):
        final_path = op.join(final_path, name)
    final_path = Path(final_path)

    # additional condition: check for version.txt and parse it
    # check if testing or misc data is outdated; if so, redownload it
    want_version = RELEASES.get(name, None)
    want_version = _FAKE_VERSION if name == "fake" else want_version

    # get the version of the dataset and then check if the version is outdated
    data_version = _dataset_version(final_path, name)
    outdated = want_version is not None and _compare_version(
        want_version, ">", data_version
    )
    if outdated:
        logger.info(
            f"Dataset {name} version {data_version} out of date, "
            f"latest version is {want_version}"
        )
    empty = Path("")

    # return an empty path if the dataset is outdated and downloading is
    # disabled
    if (not force_update) and outdated and not download:
        logger.info(
            "Dataset out of date but force_update=False and download=False, "
            "returning empty data_path"
        )
        return (empty, data_version) if return_version else empty

    # reasons to bail early (hf_sef has separate code for this):
    if (not force_update) and (not outdated) and (not name.startswith("hf_sef_")):
        # ...if target folder exists (otherwise pooch downloads every
        # time because we don't save the archive files after unpacking, so
        # pooch can't check its checksum)
        if op.isdir(final_path):
            if config_key is not None:
                _do_path_update(path, update_path, config_key, name)
            return (final_path, data_version) if return_version else final_path
        # ...if download=False (useful for debugging)
        elif not download:
            return (empty, data_version) if return_version else empty
        # ...if user didn't accept the license
        elif name.startswith("bst_"):
            if accept or "--accept-brainstorm-license" in sys.argv:
                answer = "y"
            else:
                # If they don't have stdin, just accept the license
                # https://github.com/mne-tools/mne-python/issues/8513#issuecomment-726823724  # noqa: E501
                answer = _safe_input(f"{_bst_license_text}Agree (y/[n])? ", use="y")
            if answer.lower() != "y":
                raise RuntimeError("You must agree to the license to use this dataset")

    # downloader & processors
    download_params = _downloader_params(auth=auth, token=token)
    if name == "fake":
        download_params["progressbar"] = False
    downloader = pooch.HTTPDownloader(**download_params)

    # make mappings from archive names to urls and to checksums
    urls = dict()
    registry = dict()
    for idx, this_name in enumerate(names):
        this_dataset = dataset_params[idx]
        archive_name = this_dataset["archive_name"]
        dataset_url = this_dataset["url"]
        dataset_hash = this_dataset["hash"]
        urls[archive_name] = dataset_url
        registry[archive_name] = dataset_hash

    # create the download manager
    use_path = final_path if processor is None else Path(path)
    fetcher = pooch.create(
        path=str(use_path),
        base_url="",  # Full URLs are given in the `urls` dict.
        version=None,  # Data versioning is decoupled from MNE-Python version.
        urls=urls,
        registry=registry,
        retry_if_failed=2,  # 2 retries = 3 total attempts
    )

    # use our logger level for pooch's logger too
    pooch.get_logger().setLevel(logger.getEffectiveLevel())

    sz = 0
    for idx in range(len(names)):
        # fetch and unpack the data
        archive_name = dataset_params[idx]["archive_name"]
        try:
            fetcher.fetch(
                fname=archive_name, downloader=downloader, processor=processor
            )
        except ValueError as err:
            err = str(err)
            if "hash of downloaded file" in err:
                raise ValueError(
                    f"{err} Consider using force_update=True to force "
                    "the dataset to be downloaded again."
                ) from None
            else:
                raise
        fname = use_path / archive_name
        sz += fname.stat().st_size
        # after unpacking, remove the archive file
        if processor is not None:
            fname.unlink()

    # remove version number from "misc" and "testing" datasets folder names
    if name == "misc":
        rmtree(final_path, ignore_errors=True)
        os.replace(op.join(path, MISC_VERSIONED), final_path)
    elif name == "testing":
        rmtree(final_path, ignore_errors=True)
        os.replace(op.join(path, TESTING_VERSIONED), final_path)

    # maybe update the config
    if config_key is not None:
        old_name = "brainstorm" if name.startswith("bst_") else name
        _do_path_update(path, update_path, config_key, old_name)

    # compare the version of the dataset and mne
    data_version = _dataset_version(path, name)
    # 0.7 < 0.7.git should be False, therefore remove the ".git" suffix
    if check_version and (
        _compare_version(data_version, "<", mne_version.removesuffix(".git"))
    ):
        # OK to `nosec` because it's a false positive (misidentified as SQL)
        warn(
            f"The {name} dataset (version {data_version}) is older than "
            f"mne-python (version {mne_version}). If the examples fail, "
            f"you may need to update the {name} dataset by using "
            f"mne.datasets.{name}.data_path(force_update=True)"  # nosec B608
        )
    _log_time_size(t0, sz)
    return (final_path, data_version) if return_version else final_path