Source code for miblab.data

import os
import zipfile
import subprocess
import shutil

# Try importing optional dependencies
try:
    import requests
    from osfclient.api import OSF
    from tqdm import tqdm
    import_error = False
except ImportError:
    import_error = True

# Zenodo DOI of the repository
DOI = {
    'MRR': "15285017",    
    'TRISTAN': "15301607", 
}

# miblab datasets
DATASETS = {
    'KRUK.dmr.zip': {'doi': DOI['MRR']},
    'tristan_humans_healthy_controls.dmr.zip': {'doi': DOI['TRISTAN']},
    'tristan_humans_healthy_ciclosporin.dmr.zip': {'doi': DOI['TRISTAN']},
    'tristan_humans_healthy_metformin.dmr.zip': {'doi': DOI['TRISTAN']},
    'tristan_humans_healthy_rifampicin.dmr.zip': {'doi': DOI['TRISTAN']},
    'tristan_humans_patients_rifampicin.dmr.zip': {'doi': DOI['TRISTAN']},
    'tristan_rats_healthy_multiple_dosing.dmr.zip': {'doi': DOI['TRISTAN']},
    'tristan_rats_healthy_reproducibility.dmr.zip': {'doi': DOI['TRISTAN']},
    'tristan_rats_healthy_six_drugs.dmr.zip': {'doi': DOI['TRISTAN']},
}

[docs] def zenodo_fetch(dataset: str, folder: str, doi: str = None, filename: str = None, extract: bool = False, verbose: bool = False): """Download a dataset from Zenodo. Note if a dataset already exists locally it will not be downloaded again and the existing file will be returned. Args: dataset (str): Name of the dataset folder (str): Local folder where the result is to be saved doi (str, optional): Digital object identifier (DOI) of the Zenodo repository where the dataset is uploaded. If this is not provided, the function will look for the dataset in miblab's own Zenodo repositories. filename (str, optional): Filename of the downloaded dataset. If this is not provided, then *dataset* is used as filename. extract (bool): Whether to automatically extract downloaded ZIP files. verbose (bool): If True, prints logging messages. Raises: NotImplementedError: If miblab is not installed with the data option. requests.exceptions.ConnectionError: If the connection to Zenodo cannot be made. Returns: str: Full path to the downloaded datafile. """ if import_error: raise NotImplementedError( 'Please install miblab as pip install miblab[data] ' 'to use this function.' ) # Create filename if filename is None: file = os.path.join(folder, dataset) else: file = os.path.join(folder, filename) # If it is not already downloaded, download it. if os.path.exists(file): if verbose: print(f"Skipping {dataset} download, file {file} already exists.") else: # Get DOI if doi is None: if dataset in DATASETS: doi = DATASETS[dataset]['doi'] else: raise ValueError( f"{dataset} does not exist in one of the miblab " f"repositories on Zenodo. If you want to fetch " f"a dataset in an external Zenodo repository, please " f"provide the doi of the repository." ) # Dataset download link file_url = f"https://zenodo.org/records/{doi}/files/{filename or dataset}" # Make the request and check for connection error try: file_response = requests.get(file_url) except requests.exceptions.ConnectionError as err: raise requests.exceptions.ConnectionError( f"\n\n" f"A connection error occurred trying to download {dataset} " f"from Zenodo. This usually happens if you are offline. " f"The detailed error message is here: {err}" ) # Check for other errors file_response.raise_for_status() # Create the folder if needed if not os.path.exists(folder): os.makedirs(folder) # Save the file with open(file, 'wb') as f: f.write(file_response.content) # If the zip file is requested we are done if not extract: return file # If extraction requested, returned extracted if file[-4:] == '.zip': extract_to = file[:-4] else: extract_to = file + '_unzip' # Skip extraction if the folder already exists if os.path.exists(extract_to): if verbose: print(f"Skipping {file} extraction, folder {extract_to} already exists.") return extract_to # Perform extraction os.makedirs(extract_to) with zipfile.ZipFile(file, 'r') as zip_ref: bad_file = zip_ref.testzip() if bad_file: raise zipfile.BadZipFile( f"Cannot extract: corrupt file {bad_file}." ) zip_ref.extractall(extract_to) return extract_to
def clear_cache_datafiles(directory: str, verbose: bool = True): """ Delete all files and subdirectories in the specified cache directory, except for '__init__' files. Args: directory (str): Path to the directory to clear. verbose (bool): If True, prints names of deleted items. Raises: FileNotFoundError: If the directory does not exist. OSError: If a file or folder cannot be deleted. """ if not os.path.exists(directory): raise FileNotFoundError(f"Directory not found: {directory}") deleted = [] for item in os.listdir(directory): path = os.path.join(directory, item) # Skip __init__ files (e.g., __init__.py, __init__.pyc) if os.path.isfile(path) and os.path.splitext(item)[0] == '__init__': continue try: if os.path.isfile(path) or os.path.islink(path): os.remove(path) deleted.append(path) if verbose: print(f"Deleted file: {path}") elif os.path.isdir(path): shutil.rmtree(path) deleted.append(path) if verbose: print(f"Deleted folder: {path}") except Exception as e: print(f"Error deleting {path}: {e}") if verbose and not deleted: print("Directory is already clean.")
[docs] def osf_fetch(dataset: str, folder: str, project: str = "un5ct", token: str = None, extract: bool = True, verbose: bool = True): """ Download a dataset from OSF (Open Science Framework). This function downloads a specific dataset (folder or subfolder) from a public or private OSF project. Files are saved into the specified local directory. If a zip file is found, it will be extracted by default. Args: dataset (str): Subfolder path inside the OSF project. If an empty string, all files in the root will be downloaded (use with caution). folder (str): Local folder where the dataset will be saved. project (str, optional): OSF project ID (default is "un5ct"). token (str, optional): Personal OSF token for accessing private projects. Read from OSF_TOKEN environment variable if needed. extract (bool, optional): Whether to automatically unzip downloaded .zip files (default is True). verbose (bool, optional): Whether to print progress messages (default is True). Raises: FileNotFoundError: If the specified dataset path does not exist in the OSF project. NotImplementedError: If required packages are not installed. Returns: str: Path to the local folder containing the downloaded data. Example: >>> from miblab import osf_fetch >>> osf_fetch('TRISTAN/RAT/bosentan_highdose/Sanofi', 'test_download') """ if import_error: raise NotImplementedError( "Please install miblab as pip install miblab[data] to use this function." ) # Prepare local folder os.makedirs(folder, exist_ok=True) # Connect to OSF and locate project storage osf = OSF(token=token) #osf = OSF() for public projects project = osf.project(project) storage = project.storage('osfstorage') # Navigate the dataset folder if provided current = storage if dataset: parts = dataset.strip('/').split('/') for part in parts: for f in current.folders: if f.name == part: current = f break else: raise FileNotFoundError(f"Folder '{part}' not found when navigating path '{dataset}'.") # Recursive download of all files and folders def download(current_folder, local_folder): os.makedirs(local_folder, exist_ok=True) files = list(current_folder.files) iterator = tqdm(files, desc=f"Downloading to {local_folder}") if verbose and files else files for file in iterator: local_file = os.path.join(local_folder, file.name) try: with open(local_file, 'wb') as f: file.write_to(f) except Exception as e: if verbose: print(f"Warning downloading {file.name}: {e}") for subfolder in current_folder.folders: download(subfolder, os.path.join(local_folder, subfolder.name)) download(current, folder) # Extract all downloaded zip files if needed if extract: for dirpath, _, filenames in os.walk(folder): for filename in filenames: if filename.lower().endswith('.zip'): zip_path = os.path.join(dirpath, filename) extract_to = os.path.join(dirpath, filename[:-4]) os.makedirs(extract_to, exist_ok=True) try: with zipfile.ZipFile(zip_path, 'r') as zip_ref: bad_file = zip_ref.testzip() if bad_file: raise zipfile.BadZipFile(f"Corrupt file {bad_file} inside {zip_path}") zip_ref.extractall(extract_to) os.remove(zip_path) if verbose: print(f"Unzipped and deleted {zip_path}") except Exception as e: if verbose: print(f"Warning unzipping {zip_path}: {e}") return folder
[docs] def osf_upload(folder: str, dataset: str, project: str = "un5ct", token: str = None, verbose: bool = True, overwrite: bool = True): """ Upload a file to OSF (Open Science Framework) using osfclient. This function uploads a single local file to a specified path inside an OSF project. Intermediate folders must already exist in the OSF project; osfclient does not create them. If the file already exists, it can be overwritten or skipped. Args: folder (str): Path to the local file to upload. dataset (str): OSF path where the file should be placed (e.g., "Testing/filename.txt"). project (str): OSF project ID (default: "un5ct"). token (str): OSF personal token for private/write access. verbose (bool): Whether to print progress messages (default True). overwrite (bool): Whether to replace an existing file if it already exists (default True). Raises: FileNotFoundError: If the file does not exist. NotImplementedError: If osfclient is not installed. RuntimeError: If upload fails for any reason. Example: >>> from miblab import osf_upload >>> osf_upload( ... folder='data/results.csv', ... dataset='Testing/results.csv', ... project='un5ct', ... token='your-osf-token', ... verbose=True, ... overwrite=True ... ) """ import os # Check that optional dependencies are installed if import_error: raise NotImplementedError("Please install miblab[data] to use this function.") # Check that the specified local file exists if not os.path.isfile(folder): raise FileNotFoundError(f"Local file not found: {folder}") # Authenticate and connect to the OSF project from osfclient.api import OSF osf = OSF(token=token) project = osf.project(project) storage = project.storage("osfstorage") # Clean and prepare the remote dataset path full_path = dataset.strip("/") # Check if the file already exists on OSF existing = next((f for f in storage.files if f.path == "/" + full_path), None) if existing: if overwrite: if verbose: print(f"File '{full_path}' already exists. Deleting before re-upload...") try: existing.remove() except Exception as e: raise RuntimeError(f"Failed to delete existing file before overwrite: {e}") else: if verbose: print(f"File '{full_path}' already exists. Skipping (overwrite=False).") return # Upload the file size_mb = os.path.getsize(folder) / 1e6 with open(folder, "rb") as f: if verbose: print(f"Uploading '{os.path.basename(folder)}' ({size_mb:.2f} MB) to '{full_path}'...") try: storage.create_file(full_path, f) if verbose: print("Upload complete.") except Exception as e: raise RuntimeError(f"Failed to upload file: {e}")