# Source code for fetchez.modules.base

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
fetchez.modules.base
~~~~~~~~~~~~~~~~~~~~~~

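This module holds the FetchModule superclass, the base class for all fetchez
data modules, along with a couple of generic core/test modules.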

:copyright: (c) 2010-2026 Regents of the University of Colorado
:license: MIT, see LICENSE for more details.
"""

import os
import logging
import urllib.parse
from typing import List, Dict, Any

from fetchez import spatial
from fetchez import utils
from fetchez.core import Fetch

logger = logging.getLogger(__name__)


class FetchModule:
    """Base class for all Fetchez data modules.

    Subclasses override the registry metadata below and implement `run`,
    which is expected to populate `self.results` (normally through
    `add_entry_to_results`). Each result entry is a dictionary carrying at
    least the keys ``url``, ``dst_fn`` and ``data_type``.
    """

    # --- Registry Metadata (Override these in subclasses) ---
    name = "base_module"
    meta_category = "Generic"
    meta_desc = "Base module class."
    meta_agency = "Unknown"
    meta_tags: List[Any] = []
    meta_aliases: List[Any] = []
    meta_urls: Dict[Any, Any] = {}

    def __init__(
        self,
        src_region=None,
        hook=None,
        outdir=None,
        min_year=None,
        max_year=None,
        weight=1.0,
        uncertainty=0.0,
        params=None,
        **kwargs,
    ):
        """Initialise the state shared by every module.

        Args:
            src_region: Region of interest. When it is None or fails
                `spatial.region_valid_p`, the whole world
                ``(-180, 180, -90, 90)`` is used instead.
            hook: Optional list of hook objects, stored as the external
                hooks. Falsy values become an empty list.
            outdir: Base output directory. The module's `name` is appended
                to it; the current working directory is the base when
                this is None.
            min_year: Lower year bound, passed through `utils.int_or`.
            max_year: Upper year bound, passed through `utils.int_or`.
            weight: Coerced to float.
            uncertainty: Coerced to float.
            params: Optional parameter dictionary; defaults to ``{}``.
            **kwargs: Accepted but not used by this base class.
        """
        self.region = src_region
        self.outdir = outdir
        self.params = params or {}
        self.status = 0
        self.results = []

        # Determine base output directory using the module's registered name
        if self.outdir is None:
            self._outdir = os.path.join(os.getcwd(), self.name)
        else:
            self._outdir = os.path.join(self.outdir, self.name)

        self.min_year = utils.int_or(min_year)
        self.max_year = utils.int_or(max_year)
        self.weight = float(weight)
        self.uncertainty = float(uncertainty)

        # Default Headers (Can be overridden in subclass)
        self.headers = {"User-Agent": "fetchez/0.5.0"}

        self.internal_hooks = []
        # NOTE(review): a caller-supplied list is stored by reference, so
        # `add_hook` appends to the caller's own list -- confirm that this
        # sharing is intended.
        self.external_hooks = hook if hook else []

        # Default to the whole world if the region is invalid or missing.
        # Note: This will result in massive downloads for global datasets!
        if self.region is None or not spatial.region_valid_p(self.region):
            self.region = (-180, 180, -90, 90)

    @property
    def hooks(self):
        """Combine internal and external hooks in the correct execution order.

        Internal hooks come first, followed by the external ones. A new list
        is built on every access.
        """
        return self.internal_hooks + self.external_hooks

    def add_hook(self, hook_obj):
        """Add a hook instance at runtime.

        The object is appended to the external hooks when it exposes a
        `run` attribute; otherwise it is ignored and a warning is logged.
        """
        if hasattr(hook_obj, "run"):
            self.external_hooks.append(hook_obj)
        else:
            logger.warning(
                f"Hook {hook_obj} does not appear to be a valid FetchHook class."
            )

    def run(self):
        """Override this method in a subclass to populate `self.results`.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Subclasses must implement the `run` method.")

    def fetch_entry(self, entry, check_size=True, retries=5, verbose=True):
        """Standardized method for fetching a single result entry.

        Args:
            entry: Result entry; its ``url`` and ``dst_fn`` keys are read.
            check_size: Forwarded to `Fetch.fetch_file` (non-FTP URLs only).
            retries: Forwarded to `Fetch.fetch_file` as ``tries``
                (non-FTP URLs only).
            verbose: Forwarded to `Fetch.fetch_file` (non-FTP URLs only).

        Returns:
            The status returned by the underlying `Fetch` call, or ``-1``
            when any exception is raised along the way (including a
            malformed `entry`).
        """
        try:
            url = entry["url"]
            parsed_url = urllib.parse.urlparse(url)
            fetcher = Fetch(url=url, headers=self.headers)
            if parsed_url.scheme == "ftp":
                status = fetcher.fetch_ftp_file(entry["dst_fn"])
            else:
                status = fetcher.fetch_file(
                    entry["dst_fn"],
                    check_size=check_size,
                    tries=retries,
                    verbose=verbose,
                )
        except Exception as e:
            # Read the URL defensively: the failure being handled may itself
            # be a missing "url" key (or an entry that is not a dict), and
            # indexing again here would raise out of the handler instead of
            # reporting the failure through the -1 status.
            failed_url = entry.get("url") if isinstance(entry, dict) else None
            logger.debug("Fetch failed for %s: %s", failed_url, e)
            status = -1

        return status

    def add_entry_to_results(self, url, dst_fn, data_type, **kwargs):
        """Add fetch entries to `results`.

        At minimum, `url`, `dst_fn` and `data_type` are required. Any
        additional keyword arguments will be added to the entry dictionary.

        Nothing is appended when `utils.str_or(dst_fn)` returns None.
        Relative `dst_fn` values are placed under the module's output
        directory; absolute paths are kept as given.
        """
        if utils.str_or(dst_fn) is not None:
            # Only join with outdir if dst_fn isn't already an absolute path
            if not os.path.isabs(dst_fn):
                dst_fn = os.path.join(self._outdir, dst_fn)

            entry = {"url": url, "dst_fn": dst_fn, "data_type": data_type}
            entry.update(kwargs)
            self.results.append(entry)
# =============================================================================
# Core/Test Modules
# =============================================================================
class HttpDataset(FetchModule):
    """Fetch an HTTP/HTTPS file directly from a URL."""

    name = "url_fetcher"
    meta_category = "Generic"
    meta_desc = "Fetch a file directly from a URL."
    meta_resolution = "N/A"
    meta_license = "N/A"

    def __init__(self, url=None, **kwargs):
        """Store the URL to fetch; other keyword arguments go to the base class."""
        super().__init__(**kwargs)
        self.url = url

    def run(self):
        """Queue a single entry for `self.url`; do nothing when no URL is set.

        The destination file name is the basename of the URL's path
        component, so a query string or fragment (``file.tif?token=abc``)
        does not leak into the local file name. The entry's data type is
        always ``"https"``.
        """
        if not self.url:
            return

        url_path = urllib.parse.urlsplit(self.url).path
        # Fall back to the basename of the raw URL when the path component
        # has none (e.g. the URL consists only of a host and a query).
        dst_fn = os.path.basename(url_path) or os.path.basename(self.url)
        self.add_entry_to_results(self.url, dst_fn, "https")
class Scratch(FetchModule):
    """Scratch module that populates results directly from arguments."""

    name = "scratch"
    meta_category = "Reference"
    meta_desc = "Testing module that injects direct arguments into the pipeline."
    meta_resolution = "N/A"
    meta_license = "N/A"

    def __init__(self, url=None, path=None, datatype=None, **kwargs):
        """Keep the entry fields; remaining keyword arguments go to the base class."""
        super().__init__(**kwargs)
        self.url, self.path, self.datatype = url, path, datatype

    def run(self):
        """Queue one entry built from the stored url, path and datatype.

        Nothing is queued unless all three values are truthy.
        """
        if not (self.url and self.path and self.datatype):
            return
        self.add_entry_to_results(self.url, self.path, self.datatype)