# Authors: The MNE-Python contributors.
|
|
# License: BSD-3-Clause
|
|
# Copyright the MNE-Python contributors.
|
|
|
|
import os
|
|
import os.path as op
|
|
|
|
import numpy as np
|
|
|
|
from ...utils import _check_pandas_installed, _on_missing, _TempDir, verbose
|
|
from ..utils import _downloader_params, _get_path
|
|
|
|
# Bundled CSV listing the records of the age (SC) cohort.
AGE_SLEEP_RECORDS = op.join(op.dirname(__file__), "age_records.csv")
# Bundled CSV listing the records of the temazepam (ST) cohort.
TEMAZEPAM_SLEEP_RECORDS = op.join(op.dirname(__file__), "temazepam_records.csv")

# PhysioNet spreadsheet of ST-cohort subjects and its SHA-1 checksum.
TEMAZEPAM_RECORDS_URL = (
    "https://physionet.org/physiobank/database/sleep-edfx/ST-subjects.xls"  # noqa: E501
)
TEMAZEPAM_RECORDS_URL_SHA1 = "f52fffe5c18826a2bd4c5d5cb375bb4a9008c885"

# PhysioNet spreadsheet of SC-cohort subjects and its SHA-1 checksum.
AGE_RECORDS_URL = "https://physionet.org/physiobank/database/sleep-edfx/SC-subjects.xls"
AGE_RECORDS_URL_SHA1 = "0ba6650892c5d33a8e2b3f62ce1cc9f30438c54f"

# Checksum file shipped with this module; parsed as "<sha> <fname>" rows below.
sha1sums_fname = op.join(op.dirname(__file__), "SHA1SUMS")
|
|
|
|
|
|
def _fetch_one(fname, hashsum, path, force_update, base_url):
    """Fetch a single remote dataset file, reusing any cached local copy.

    Parameters
    ----------
    fname : str
        Name of the remote file; appended to ``base_url`` to build the URL.
    hashsum : str
        Expected SHA-1 checksum of the downloaded file.
    path : str
        Local directory where the file is stored.
    force_update : bool
        If True, re-download even when a local copy already exists.
    base_url : str
        Base URL of the remote dataset.

    Returns
    -------
    destination : str
        Full local path of the file.
    downloaded : bool
        True when the file was (re)downloaded, False when the cached copy
        was reused.
    """
    import pooch

    destination = op.join(path, fname)
    already_there = op.isfile(destination)
    if already_there and not force_update:
        # Trust the cached copy unless the caller forces a refresh.
        return destination, False
    if already_there:
        # Forced update: drop the stale copy before downloading again.
        os.remove(destination)
    target_dir = op.dirname(destination)
    if not op.isdir(target_dir):
        os.makedirs(target_dir)
    pooch.retrieve(
        url=f"{base_url}/{fname}",
        known_hash=f"sha1:{hashsum}",
        path=path,
        downloader=pooch.HTTPDownloader(**_downloader_params()),
        fname=fname,
    )
    return destination, True
|
|
|
|
|
|
@verbose
def _data_path(path=None, verbose=None):
    """Get path to local copy of EEG Physionet age Polysomnography dataset URL.

    This is a low-level function useful for getting a local copy of a
    remote Polysomnography dataset :footcite:`KempEtAl2000` which is available
    at PhysioNet :footcite:`GoldbergerEtAl2000`.

    Parameters
    ----------
    path : None | str
        Location of where to look for the data storing location.
        If None, the environment variable or config parameter
        ``PHYSIONET_SLEEP_PATH`` is used. If it doesn't exist, the "~/mne_data"
        directory is used. If the dataset is not found under the given path,
        the data will be automatically downloaded to the specified folder.
    %(verbose)s

    Returns
    -------
    path : str
        Local path to the ``physionet-sleep-data`` directory inside the
        resolved storage location.

    References
    ----------
    .. footbibliography::
    """  # noqa: E501
    # Resolve the storage root from the explicit path, env var, or config.
    key = "PHYSIONET_SLEEP_PATH"
    name = "PHYSIONET_SLEEP"
    path = _get_path(path, key, name)
    return op.join(path, "physionet-sleep-data")
|
|
|
|
|
|
def _update_sleep_temazepam_records(fname=TEMAZEPAM_SLEEP_RECORDS):
    """Help function to download Physionet's temazepam dataset records.

    Downloads the ST-subjects spreadsheet from PhysioNet, joins it with the
    per-file SHA-1 checksums shipped in ``SHA1SUMS``, reshapes the result to
    one row per (subject, night), and writes it to ``fname`` as CSV.
    """
    import pooch

    pd = _check_pandas_installed()
    tmp = _TempDir()

    # Download subjects info.
    subjects_fname = op.join(tmp, "ST-subjects.xls")
    downloader = pooch.HTTPDownloader(**_downloader_params())
    pooch.retrieve(
        url=TEMAZEPAM_RECORDS_URL,
        known_hash=f"sha1:{TEMAZEPAM_RECORDS_URL_SHA1}",
        path=tmp,
        downloader=downloader,
        fname=op.basename(subjects_fname),
    )

    # Load and Massage the checksums.
    sha1_df = pd.read_csv(
        sha1sums_fname, sep=" ", header=None, names=["sha", "fname"], engine="python"
    )
    # Keep only the ST-cohort EDF files from the checksum list.
    select_age_records = sha1_df.fname.str.startswith(
        "ST"
    ) & sha1_df.fname.str.endswith("edf")
    sha1_df = sha1_df[select_age_records]
    # First 6 characters of the file name form the record id (matched below).
    sha1_df["id"] = [name[:6] for name in sha1_df.fname]

    # Load and massage the data.
    data = pd.read_excel(subjects_fname, header=[0, 1])
    # The spreadsheet has a two-level header; index rows by subject number.
    data = data.set_index(("Subject - age - sex", "Nr"))
    data.index.name = "subject"
    data.columns.names = [None, None]
    # Move age/sex into the index, then stack the remaining top-level column
    # groups so each (subject, condition) pair becomes its own row.
    data = (
        data.set_index(
            [("Subject - age - sex", "Age"), ("Subject - age - sex", "M1/F2")],
            append=True,
        )
        .stack(level=0)
        .reset_index()
    )

    # Flatten the tuple column names; the stacked level becomes "drug".
    data = data.rename(
        columns={
            ("Subject - age - sex", "Age"): "age",
            ("Subject - age - sex", "M1/F2"): "sex",
            "level_3": "drug",
        }
    )
    # Rebuild the PhysioNet record id (ST7<subject:02d><night:1d>).
    data["id"] = [f"ST7{s:02d}{n:1d}" for s, n in zip(data.subject, data["night nr"])]

    # Outer merge keeps rows even when one side has no match for an id.
    data = pd.merge(sha1_df, data, how="outer", on="id")
    # Record type is the token between '-' and '.' in the file name.
    data["record type"] = (
        data.fname.str.split("-", expand=True)[1]
        .str.split(".", expand=True)[0]
        .astype("category")
    )

    # Pivot so each record type contributes its own sha/fname columns per row.
    data = data.set_index(
        ["id", "subject", "age", "sex", "drug", "lights off", "night nr", "record type"]
    ).unstack()
    # Flatten the resulting two-level columns to "<field>_<record type>".
    data.columns = [l1 + "_" + l2 for l1, l2 in data.columns]
    data = data.reset_index().drop(columns=["id"])

    # In this spreadsheet sex is coded 1=male, 2=female (M1/F2 column).
    data["sex"] = data.sex.astype("category").cat.rename_categories(
        {1: "male", 2: "female"}
    )

    # Keep only the first whitespace-separated token of the drug label.
    data["drug"] = data["drug"].str.split(expand=True)[0]
    data["subject_orig"] = data["subject"]
    data["subject"] = data.index // 2  # to make sure index is from 0 to 21

    # Save the data.
    data.to_csv(fname, index=False)
|
|
|
|
|
|
def _update_sleep_age_records(fname=AGE_SLEEP_RECORDS):
    """Help function to download Physionet's age dataset records.

    Downloads the SC-subjects spreadsheet from PhysioNet, joins it with the
    per-file SHA-1 checksums shipped in ``SHA1SUMS``, and writes the result
    to ``fname`` as CSV.
    """
    import pooch

    pd = _check_pandas_installed()
    tmp = _TempDir()

    # Download subjects info.
    subjects_fname = op.join(tmp, "SC-subjects.xls")
    downloader = pooch.HTTPDownloader(**_downloader_params())
    pooch.retrieve(
        url=AGE_RECORDS_URL,
        known_hash=f"sha1:{AGE_RECORDS_URL_SHA1}",
        path=tmp,
        downloader=downloader,
        fname=op.basename(subjects_fname),
    )

    # Load and Massage the checksums.
    sha1_df = pd.read_csv(
        sha1sums_fname, sep=" ", header=None, names=["sha", "fname"], engine="python"
    )
    # Keep only the SC-cohort EDF files from the checksum list.
    select_age_records = sha1_df.fname.str.startswith(
        "SC"
    ) & sha1_df.fname.str.endswith("edf")
    sha1_df = sha1_df[select_age_records]
    # First 6 characters of the file name form the record id (joined below).
    sha1_df["id"] = [name[:6] for name in sha1_df.fname]

    # Load and massage the data.
    data = pd.read_excel(subjects_fname)
    data = data.rename(
        index=str, columns={"sex (F=1)": "sex", "LightsOff": "lights off"}
    )
    # Per the "sex (F=1)" column, 1 codes female and 2 male.
    data["sex"] = data.sex.astype("category").cat.rename_categories(
        {1: "female", 2: "male"}
    )

    # Rebuild the PhysioNet record id (SC4<subject:02d><night:1d>).
    data["id"] = [f"SC4{s:02d}{n:1d}" for s, n in zip(data.subject, data.night)]

    # Join metadata with checksums; dropna discards ids without a file.
    data = data.set_index("id").join(sha1_df.set_index("id")).dropna()

    # Record type is the token between '-' and '.' in the file name.
    data["record type"] = (
        data.fname.str.split("-", expand=True)[1]
        .str.split(".", expand=True)[0]
        .astype("category")
    )

    # Fix column order for the output CSV.
    data = data.reset_index().drop(columns=["id"])
    data = data[
        ["subject", "night", "record type", "age", "sex", "lights off", "sha", "fname"]
    ]

    # Save the data.
    data.to_csv(fname, index=False)
|
|
|
|
|
|
def _check_subjects(subjects, n_subjects, missing=None, on_missing="raise"):
|
|
"""Check whether subjects are available.
|
|
|
|
Parameters
|
|
----------
|
|
subjects : list
|
|
Subject numbers to be checked.
|
|
n_subjects : int
|
|
Number of subjects available.
|
|
missing : list | None
|
|
Subject numbers that are missing.
|
|
on_missing : 'raise' | 'warn' | 'ignore'
|
|
What to do if one or several subjects are not available. Valid keys
|
|
are 'raise' | 'warn' | 'ignore'. Default is 'error'. If on_missing
|
|
is 'warn' it will proceed but warn, if 'ignore' it will proceed
|
|
silently.
|
|
"""
|
|
valid_subjects = np.arange(n_subjects)
|
|
if missing is not None:
|
|
valid_subjects = np.setdiff1d(valid_subjects, missing)
|
|
unknown_subjects = np.setdiff1d(subjects, valid_subjects)
|
|
if unknown_subjects.size > 0:
|
|
subjects_list = ", ".join([str(s) for s in unknown_subjects])
|
|
msg = (
|
|
f"This dataset contains subjects 0 to {n_subjects - 1} with "
|
|
f"missing subjects {missing}. Unknown subjects: "
|
|
f"{subjects_list}."
|
|
)
|
|
_on_missing(on_missing, msg)
|