nanopyx.data.download

View Source
  1import os
  2import shutil
  3import tempfile
  4import zipfile
  5from urllib.request import ProxyHandler, build_opener, install_opener
  6
  7import numpy as np
  8import yaml
  9from gdown import download as gdrive_download
 10from onedrivedownloader import download as onedrive_download
 11from ..core.io.downloader import download
 12
 13from ..core.io.zip_image_loader import ZipTiffIterator
 14from .examples import get_path as get_examples_path
 15
 16
 17class ExampleDataManager:
 18    _base_path = get_examples_path()
 19    _temp_dir = os.path.join(tempfile.gettempdir(), "nanopyx_data")
 20    _to_download_path = None
 21
 22    def __init__(self, to_download_path: str = None):
 23        """
 24        Helper class for downloading example test data
 25
 26        :param to_download_path: path to download the data to. If to_download_path is None, a temporary directory
 27        will be created. Note that it will not be automatically deleted.
 28        :type to_download_path: str, optional
 29        :raises ValueError: If to_download_path is not None and does not exist
 30
 31        To clear downloads use self._clear_download()
 32        """
 33
 34        # Set download path
 35        if to_download_path is None:
 36            self._to_download_path = self._temp_dir
 37        else:
 38            self._to_download_path = to_download_path
 39
 40        # Lets check on how many examples we have available
 41        self._datasets = []
 42        for path in os.listdir(self._base_path):
 43            full_path = os.path.join(self._base_path, path)
 44            info_file_path = os.path.join(full_path, "info.yaml")
 45            if os.path.isdir(full_path) and os.path.exists(info_file_path):
 46                info_data = None
 47                with open(os.path.join(info_file_path), "r") as f:
 48                    # Load the YAML contents
 49                    info_data = yaml.load(f, Loader=yaml.FullLoader)
 50
 51                info = {
 52                    "info_path": info_file_path,
 53                    "thumbnail_path": os.path.join(self._base_path, path, "thumbnail.jpg"),
 54                    "tiff_sequence_path": None,
 55                }
 56                tiff_sequence_path = os.path.join(self._to_download_path, path, "tiff_sequence.zip")
 57                if os.path.exists(tiff_sequence_path):
 58                    info["tiff_sequence_path"] = tiff_sequence_path
 59
 60                for key in info_data:
 61                    info[key] = info_data[key]
 62
 63                info["shape"] = tuple([int(v) for v in info["data_shape"].split(",")])
 64
 65                info["dtype"] = np.dtype(info["data_dtype"])
 66
 67                self._datasets.append(info)
 68
 69        # Fix agent
 70        proxy = ProxyHandler({})
 71        opener = build_opener(proxy)
 72        opener.addheaders = [
 73            (
 74                "User-Agent",
 75                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30"
 76                + " (KHTML, like Gecko) Version/10.1 Safari/603.1.30",
 77            )
 78        ]
 79        install_opener(opener)
 80
 81    def list_datasets(self) -> tuple:
 82        """
 83        :return: list of dataset labels
 84        """
 85        return sorted([dataset["label"] for dataset in self._datasets])
 86
 87    def list_datasets_nickname(self) -> tuple:
 88        """
 89        :return: list of dataset nicknames
 90        """
 91        return [(dataset["nickname"], dataset["label"]) for dataset in self._datasets]
 92
 93    def get_dataset_info(self, dataset_name: str) -> dict:
 94        """
 95        :param dataset_name: can be a dataset label or nickname
 96        :type dataset_name: str
 97        :return: dictionary with information about the dataset
 98        """
 99
100        for dataset in self._datasets:
101            if dataset_name in (dataset["label"], dataset["nickname"]):
102                return dataset
103        raise ValueError(f"{dataset_name} not found in example datasets")
104
105    def _download(self, url, file_path, download_type=None, unzip=False):
106        if os.path.exists(file_path):  # or os.path.exists(os.path.splitext(file_path)[0]):
107            # raise Warning(f"already exists, no need to download: {file_path}")
108            return
109
110        if not os.path.exists(self._temp_dir):
111            os.mkdir(self._temp_dir)
112
113        base_path = os.path.split(file_path)[0]
114        if not os.path.exists(base_path):
115            os.mkdir(base_path)
116
117        if download_type == "onedrive":
118            onedrive_download(url, file_path, unzip=unzip, clean=True)
119        elif download_type == "gdrive":
120            gdrive_download(url, file_path, fuzzy=True, quiet=False)
121        else:
122            download(url, file_path)
123
124    def _copy_auxiliary_files(self, info: dict):
125        if not os.path.exists(self._to_download_path):
126            os.mkdir(self._to_download_path)
127
128        path = os.path.join(self._to_download_path, info["label"])
129        if not os.path.exists(path):
130            os.mkdir(path)
131
132        # thumbnail_path = os.path.join(path, "thumbnail.jpg")
133        # if not os.path.exists(thumbnail_path):
134        #    shutil.copyfile(info["thumbnail_path"], thumbnail_path)
135
136        info_path = os.path.join(path, "info.yaml")
137        if not os.path.exists(info_path):
138            shutil.copyfile(info["info_path"], info_path)
139
140    def download_tiff_sequence(self, dataset_name: str) -> str:
141        """
142        Downloads the tiff sequence and returns the path to the zip file
143
144        :param dataset_name: can be a dataset label or nickname
145        :type dataset_name: str
146        :return: path to the zip file
147        """
148        info = self.get_dataset_info(dataset_name)
149        path = os.path.join(self._to_download_path, info["label"])
150
151        file_path = os.path.join(path, "tiff_sequence.zip")
152        url = info["tiff_sequence_url"]
153        download_type = info["tiff_sequence_url_type"]
154
155        self._copy_auxiliary_files(info)
156        self._download(url, file_path, download_type)
157        info["tiff_sequence_path"] = file_path
158
159        return file_path
160
161    def is_downloaded(self, dataset_name: str) -> bool:
162        """
163        :param dataset_name: can be a dataset label or nickname
164        :type dataset_name: str
165        :return: True if the dataset is downloaded
166        """
167        info = self.get_dataset_info(dataset_name)
168        return info["tiff_sequence_path"] is not None
169
170    def get_ZipTiffIterator(self, dataset_name: str, as_ndarray: bool = False) -> ZipTiffIterator:
171        """
172        Downloads the tiff sequence and returns the ZipTiffIterator
173
174        :param dataset_name: can be a dataset label or nickname
175        :type dataset_name: str
176        :param as_ndarray: if True, returns a numpy array instead of a ZipTiffIterator
177        :type as_ndarray: bool
178        :return: ZipTiffIterator or numpy array
179        """
180        self._show_citation_notice(dataset_name)
181        file_path = self.download_tiff_sequence(dataset_name)
182        try:
183            zti = ZipTiffIterator(file_path)
184        except zipfile.BadZipFile:
185            self.clear_downloads()
186            # try once more
187            file_path = self.download_tiff_sequence(dataset_name)
188            zti = ZipTiffIterator(file_path)
189        if not as_ndarray:
190            return zti
191        else:
192            arr = np.asarray(zti)
193            zti.close()
194            return arr
195
196    def get_thumbnail(self, dataset_name: str) -> str:
197        """
198        Returns the path to the thumbnail
199
200        :param dataset_name: can be a dataset label or nickname
201        :type dataset_name: str
202        :return: path to the thumbnail
203        """
204        info = self.get_dataset_info(dataset_name)
205        return info["thumbnail_path"]
206
207    def clear_downloads(self):
208        """
209        Deletes all downloaded datasets
210        """
211        if os.path.exists(self._temp_dir):
212            shutil.rmtree(self._temp_dir)
213
214    def _show_citation_notice(self, dataset_name: str):
215        info = self.get_dataset_info(dataset_name)
216        if info["reference"] not in [None, ""]:
217            print(
218                f"If you find the '{dataset_name}' dataset useful, please cite: "
219                + f"{info['reference']} - {info['reference_doi']}"
220            )
nanopyx.data.download

Parameters

Raises

Returns

Returns

Parameters

Returns

Parameters

Returns

Parameters

Returns

Parameters

Returns

Parameters

Returns