API Reference: Client

mobility_db_api.api.MobilityAPI(data_dir='data', refresh_token=None, log_level='INFO', logger_name='mobility_db_api', force_csv_mode=False)

A client for interacting with the Mobility Database API.

This class provides methods to search for GTFS providers, download datasets, and manage downloaded data. It handles authentication, caching, and metadata tracking automatically.

The client can operate in two modes:

  1. API mode (default): Uses the Mobility Database API with authentication
  2. CSV mode: Uses the CSV catalog when no API key is provided or when force_csv_mode is True

Attributes:

  • data_dir (Path): Directory where downloaded datasets are stored
  • refresh_token (str): Token used for API authentication
  • datasets (Dict): Dictionary of downloaded dataset metadata
  • force_csv_mode (bool): If True, always use the CSV catalog even if an API key is available

Example

api = MobilityAPI(data_dir="data")  # Will try API first, fall back to CSV
api_csv = MobilityAPI(force_csv_mode=True)  # Will always use CSV
providers = api.get_providers_by_country("HU")
dataset_path = api.download_latest_dataset("tld-5862")

Initialize the API client.

Parameters:

  • data_dir (str, default 'data'): Base directory for all GTFS downloads
  • refresh_token (Optional[str], default None): Optional refresh token. If not provided, the client will try to load one from a .env file
  • log_level (str, default 'INFO'): Logging level (DEBUG, INFO, WARNING, ERROR)
  • logger_name (str, default 'mobility_db_api'): Name for the logger instance
  • force_csv_mode (bool, default False): If True, always use the CSV catalog even if an API key is available
Source code in src/mobility_db_api/api.py
def __init__(
    self,
    data_dir: str = "data",
    refresh_token: Optional[str] = None,
    log_level: str = "INFO",
    logger_name: str = "mobility_db_api",
    force_csv_mode: bool = False,
):
    """
    Initialize the API client.

    Args:
        data_dir: Base directory for all GTFS downloads
        refresh_token: Optional refresh token. If not provided, will try to load from .env file
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR). Defaults to INFO.
        logger_name: Name for the logger instance. Defaults to 'mobility_db_api'.
        force_csv_mode: If True, always use CSV catalog even if API key is available.
    """
    # Set up logger with instance-specific name if needed
    self.logger = setup_logger(name=f"{logger_name}_{data_dir}", level=log_level)
    self.logger.debug("Initializing MobilityAPI client")

    self.base_url = "https://api.mobilitydatabase.org/v1"
    self.data_dir = Path(data_dir)
    self.data_dir.mkdir(parents=True, exist_ok=True)
    self.metadata_file = self.data_dir / "datasets_metadata.json"
    self.refresh_token = refresh_token
    self._last_metadata_mtime = None
    self._load_metadata()

    # CSV catalog is initialized lazily
    self._csv_catalog = None
    self.force_csv_mode = force_csv_mode
    self._use_csv = force_csv_mode

    if not force_csv_mode:
        # Try to get an access token, fallback to CSV if it fails
        if not self.get_access_token():
            self.logger.info(
                "No valid API token found, falling back to CSV catalog"
            )
            self._use_csv = True
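
The constructor leaves self._csv_catalog as None and builds the catalog only when the csv_catalog property (listed below) is first read. A minimal sketch of that lazy-property pattern, using a stand-in CSVCatalog class (the real catalog class and its constructor are not shown in this reference):

class CSVCatalog:  # stand-in for the real catalog class (assumption)
    def __init__(self):
        self.providers = []

class LazyCatalogExample:
    def __init__(self):
        self._csv_catalog = None  # not built until first access

    @property
    def csv_catalog(self):
        # Build once on first access, then reuse the cached instance
        if self._csv_catalog is None:
            self._csv_catalog = CSVCatalog()
        return self._csv_catalog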

Attributes

  • base_url = 'https://api.mobilitydatabase.org/v1' (instance attribute)
  • csv_catalog (property): Lazy initialization of the CSV catalog
  • data_dir = Path(data_dir) (instance attribute)
  • force_csv_mode = force_csv_mode (instance attribute)
  • logger = setup_logger(name=f'{logger_name}_{data_dir}', level=log_level) (instance attribute)
  • metadata_file = self.data_dir / 'datasets_metadata.json' (instance attribute)
  • refresh_token = refresh_token (instance attribute)

Functions

delete_all_datasets()

Delete all downloaded datasets. The main data directory is preserved; only dataset directories are removed.

Returns:

  • bool: True if all datasets were deleted successfully, False if any deletion failed

Source code in src/mobility_db_api/api.py
def delete_all_datasets(self) -> bool:
    """
    Delete all downloaded datasets.
    The main data directory is preserved, only dataset directories are removed.

    Returns:
        True if all datasets were deleted successfully, False if any deletion failed
    """
    if not self.datasets:
        self.logger.info("No datasets to delete")
        return True

    success = True
    provider_dirs = set()

    for key, meta in list(self.datasets.items()):
        try:
            if meta.download_path.exists():
                shutil.rmtree(meta.download_path)
                self.logger.info(f"Deleted dataset directory: {meta.download_path}")

            # Store provider directory for later cleanup
            provider_dirs.add(meta.download_path.parent)

            # Remove from metadata
            del self.datasets[key]

        except Exception as e:
            self.logger.error(f"Error deleting dataset {key}: {str(e)}")
            success = False

    # Save metadata after all deletions
    if success:
        self._save_metadata()

        # Clean up empty provider directories
        for provider_dir in provider_dirs:
            self._cleanup_empty_provider_dir(provider_dir)

    return success

delete_dataset(provider_id, dataset_id=None)

Delete a downloaded dataset.

Parameters:

  • provider_id (str, required): The ID of the provider
  • dataset_id (Optional[str], default None): Optional specific dataset ID. If not provided, deletes the latest dataset

Returns:

  • bool: True if the dataset was deleted, False if it wasn't found or couldn't be deleted

Source code in src/mobility_db_api/api.py
def delete_dataset(
    self, provider_id: str, dataset_id: Optional[str] = None
) -> bool:
    """
    Delete a downloaded dataset.

    Args:
        provider_id: The ID of the provider
        dataset_id: Optional specific dataset ID. If not provided, deletes the latest dataset

    Returns:
        True if the dataset was deleted, False if it wasn't found or couldn't be deleted
    """
    # Find matching datasets
    matches = [
        (key, meta)
        for key, meta in self.datasets.items()
        if meta.provider_id == provider_id
        and (dataset_id is None or meta.dataset_id == dataset_id)
    ]

    if not matches:
        self.logger.error(f"No matching dataset found for provider {provider_id}")
        return False

    # If dataset_id not specified, take the latest one
    if dataset_id is None and len(matches) > 1:
        matches.sort(key=lambda x: x[1].download_date, reverse=True)

    key, meta = matches[0]
    provider_dir = meta.download_path.parent

    try:
        if meta.download_path.exists():
            shutil.rmtree(meta.download_path)
            self.logger.info(f"Deleted dataset directory: {meta.download_path}")

        # Remove from metadata
        del self.datasets[key]
        self._save_metadata()

        # Clean up provider directory if empty
        self._cleanup_empty_provider_dir(provider_dir)

        return True

    except Exception as e:
        self.logger.error(f"Error deleting dataset: {str(e)}")
        return False

delete_provider_datasets(provider_id)

Delete all downloaded datasets for a specific provider.

Parameters:

  • provider_id (str, required): The ID of the provider whose datasets should be deleted

Returns:

  • bool: True if all datasets were deleted successfully, False if any deletion failed

Source code in src/mobility_db_api/api.py
def delete_provider_datasets(self, provider_id: str) -> bool:
    """
    Delete all downloaded datasets for a specific provider.

    Args:
        provider_id: The ID of the provider whose datasets should be deleted

    Returns:
        True if all datasets were deleted successfully, False if any deletion failed
    """
    # Find all datasets for this provider
    matches = [
        (key, meta)
        for key, meta in self.datasets.items()
        if meta.provider_id == provider_id
    ]

    if not matches:
        self.logger.error(f"No datasets found for provider {provider_id}")
        return False

    success = True
    provider_dir = None

    for key, meta in matches:
        try:
            if meta.download_path.exists():
                shutil.rmtree(meta.download_path)
                self.logger.info(f"Deleted dataset directory: {meta.download_path}")

            # Store provider directory for later cleanup
            provider_dir = meta.download_path.parent

            # Remove from metadata
            del self.datasets[key]

        except Exception as e:
            self.logger.error(f"Error deleting dataset {key}: {str(e)}")
            success = False

    # Save metadata after all deletions
    if success:
        self._save_metadata()

        # Clean up provider directory if empty
        if provider_dir:
            self._cleanup_empty_provider_dir(provider_dir)

    return success

download_latest_dataset(provider_id, download_dir=None, use_direct_source=False, force_bounding_box_calculation=False)

Download the latest dataset for a provider.

Parameters:

  • provider_id (str, required): The ID of the provider to download the dataset for
  • download_dir (Optional[str], default None): Optional directory to download the dataset to
  • use_direct_source (bool, default False): Whether to use the direct download URL instead of the hosted dataset
  • force_bounding_box_calculation (bool, default False): Whether to force recalculation of the bounding box from stops.txt

Returns:

  • Optional[Path]: The path to the extracted dataset directory, or None if the download failed

Source code in src/mobility_db_api/api.py
def download_latest_dataset(
    self,
    provider_id: str,
    download_dir: Optional[str] = None,
    use_direct_source: bool = False,
    force_bounding_box_calculation: bool = False,
) -> Optional[Path]:
    """
    Download the latest dataset for a provider.

    Args:
        provider_id: The ID of the provider to download the dataset for.
        download_dir: Optional directory to download the dataset to.
        use_direct_source: Whether to use direct download URL instead of hosted dataset.
        force_bounding_box_calculation: Whether to force recalculation of the bounding box from stops.txt.

    Returns:
        The path to the extracted dataset directory, or None if the download failed.
    """
    try:
        # Get provider info based on mode
        if self._use_csv:
            # In CSV mode, get provider info from catalog
            provider_data = self.csv_catalog.get_provider_info(provider_id)
            if not provider_data:
                self.logger.error(
                    f"Provider {provider_id} not found in CSV catalog"
                )
                return None
        else:
            # In API mode, get provider info from the API
            self.logger.info(f"Fetching provider info for {provider_id}")
            url = f"{self.base_url}/gtfs_feeds/{provider_id}"
            response = requests.get(url, headers=self._get_headers())
            if response.status_code != 200:
                self.logger.error(
                    f"Failed to get provider info: {response.status_code}"
                )
                if response.status_code in (
                    401,
                    403,
                    413,
                ):  # Auth errors or request too large
                    self.logger.info("Falling back to CSV catalog")
                    self._use_csv = True
                    return self.download_latest_dataset(
                        provider_id,
                        download_dir,
                        use_direct_source,
                        force_bounding_box_calculation,
                    )
                return None
            provider_data = response.json()

        provider_name = provider_data.get("provider", "Unknown Provider")
        latest_dataset = provider_data.get("latest_dataset")

        # For direct source, we don't need latest_dataset
        if use_direct_source:
            if not provider_data.get("source_info", {}).get("producer_url"):
                self.logger.error(
                    "No direct download URL available for this provider"
                )
                return None
            download_url = provider_data["source_info"]["producer_url"]
            api_hash = None
            is_direct = True
            # Create a pseudo dataset ID for direct downloads
            latest_dataset = {
                "id": f"direct_{datetime.now().strftime('%Y%m%d%H%M%S')}"
            }
        else:
            if not latest_dataset:
                self.logger.error(
                    f"No latest dataset available for provider {provider_id}"
                )
                return None
            download_url = latest_dataset["hosted_url"]
            api_hash = latest_dataset.get("hash")
            is_direct = False

        # Create provider directory with sanitized name
        safe_name = self._sanitize_provider_name(provider_name)
        base_dir = Path(download_dir) if download_dir else self.data_dir
        base_dir.mkdir(parents=True, exist_ok=True)
        provider_dir = base_dir / f"{provider_id}_{safe_name}"
        provider_dir.mkdir(exist_ok=True)

        # Check if we already have this dataset
        dataset_key = f"{provider_id}_{latest_dataset['id']}"
        old_dataset_id = None
        old_dataset_path = None

        # Find any existing dataset for this provider
        for key, meta in list(self.datasets.items()):
            if meta.provider_id == provider_id:
                if dataset_key == key and meta.is_direct_source == is_direct:
                    if api_hash and api_hash == meta.api_provided_hash:
                        self.logger.info(
                            f"Dataset {dataset_key} already exists and hash matches"
                        )
                        return meta.download_path
                    elif not api_hash and meta.download_path.exists():
                        # For direct source, download and compare file hash
                        self.logger.info(
                            "Checking if direct source dataset has changed..."
                        )
                        temp_file = (
                            provider_dir
                            / f"temp_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"
                        )
                        start_time = time.time()
                        response = requests.get(download_url)
                        download_time = time.time() - start_time
                        if response.status_code == 200:
                            with open(temp_file, "wb") as f:
                                f.write(response.content)
                            new_hash = self._calculate_file_hash(temp_file)
                            if new_hash == meta.file_hash:
                                temp_file.unlink()
                                self.logger.info(
                                    f"Dataset {dataset_key} already exists and content matches"
                                )
                                return meta.download_path
                            # If hash different, continue with new download
                            temp_file.unlink()
                # Store the old dataset info for later cleanup
                old_dataset_id = meta.dataset_id
                old_dataset_path = meta.download_path
                # Remove old dataset from metadata now
                del self.datasets[key]

        # Delete old datasets if they exist
        for key in list(self.datasets.keys()):
            if key.startswith(provider_id):
                del self.datasets[key]

        # Download dataset
        self.logger.info(f"Downloading dataset from {download_url}")
        start_time = time.time()
        response = requests.get(download_url)
        download_time = time.time() - start_time

        if response.status_code != 200:
            self.logger.error(f"Failed to download dataset: {response.status_code}")
            return None

        # Save and process the zip file
        zip_file = provider_dir / f"{latest_dataset['id']}.zip"
        try:
            with open(zip_file, "wb") as f:
                f.write(response.content)
        except IOError as e:
            self.logger.error(f"Failed to write zip file: {str(e)}")
            if zip_file.exists():
                zip_file.unlink()
            return None

        zip_size = zip_file.stat().st_size
        self.logger.info(f"Download completed in {download_time:.2f} seconds")
        self.logger.info(f"Downloaded file size: {zip_size / 1024 / 1024:.2f} MB")

        # Calculate file hash
        file_hash = self._calculate_file_hash(zip_file)

        # Check if dataset already exists and hash matches
        if (
            dataset_key in self.datasets
            and self.datasets[dataset_key].file_hash == file_hash
            and not force_bounding_box_calculation
        ):
            self.logger.info(
                f"Dataset {dataset_key} already exists and hash matches"
            )
            return self.datasets[dataset_key].download_path

        # Create extraction directory
        provider_name_safe = self._sanitize_provider_name(provider_name)
        extract_dir = base_dir / f"{provider_id}_{provider_name_safe}" / latest_dataset["id"]
        extract_dir.mkdir(parents=True, exist_ok=True)

        # Extract dataset
        self.logger.info("Extracting dataset...")
        start_time = time.time()
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        end_time = time.time()
        self.logger.info(
            f"Extraction completed in {end_time - start_time:.2f} seconds"
        )

        # Get extracted size
        extracted_size = sum(
            f.stat().st_size for f in extract_dir.rglob("*") if f.is_file()
        )
        self.logger.info(f"Extracted size: {extracted_size / 1024 / 1024:.2f} MB")

        # Get feed validity period
        feed_start_date = None
        feed_end_date = None
        feed_info_path = extract_dir / "feed_info.txt"
        if feed_info_path.exists():
            try:
                with open(feed_info_path, "r", encoding="utf-8") as f:
                    reader = csv.DictReader(f)
                    for row in reader:
                        feed_start_date = row.get("feed_start_date")
                        feed_end_date = row.get("feed_end_date")
                        break
            except Exception as e:
                self.logger.warning(f"Failed to read feed_info.txt: {e}")
        if feed_start_date and feed_end_date:
            self.logger.info(
                f"Feed validity period: {feed_start_date} to {feed_end_date}"
            )

        # Get bounding box information
        min_lat = None
        max_lat = None
        min_lon = None
        max_lon = None

        if not force_bounding_box_calculation:
            bounding_box = None
            if latest_dataset and isinstance(latest_dataset, dict):
                bounding_box = latest_dataset.get("bounding_box", {})
            if bounding_box:
                min_lat = bounding_box.get("minimum_latitude")
                max_lat = bounding_box.get("maximum_latitude")
                min_lon = bounding_box.get("minimum_longitude")
                max_lon = bounding_box.get("maximum_longitude")
                self.logger.info(
                    f"Using bounding box from API/CSV: ({min_lat}, {min_lon}) to ({max_lat}, {max_lon})"
                )

        # Calculate bounding box from stops.txt if needed or forced
        if force_bounding_box_calculation or (
            min_lat is None
            or max_lat is None
            or min_lon is None
            or max_lon is None
        ):
            try:
                stops_path = extract_dir / "stops.txt"
                if stops_path.exists():
                    min_lat, max_lat, min_lon, max_lon = calculate_bounding_box(extract_dir)
                    if min_lat is not None:
                        self.logger.info(
                            f"{'Recalculated' if force_bounding_box_calculation else 'Calculated'} bounding box from stops.txt: ({min_lat}, {min_lon}) to ({max_lat}, {max_lon})"
                        )
            except Exception as e:
                self.logger.warning(f"Failed to calculate bounding box: {e}")

        # Clean up zip file
        self.logger.info("Cleaning up downloaded zip file...")
        zip_file.unlink()

        # Save metadata
        metadata = DatasetMetadata(
            provider_id=provider_id,
            provider_name=provider_name,
            dataset_id=latest_dataset["id"],
            download_date=datetime.now(),
            source_url=download_url,
            is_direct_source=is_direct,
            api_provided_hash=api_hash,
            file_hash=file_hash,
            download_path=extract_dir,
            feed_start_date=feed_start_date,
            feed_end_date=feed_end_date,
            minimum_latitude=min_lat,
            maximum_latitude=max_lat,
            minimum_longitude=min_lon,
            maximum_longitude=max_lon,
        )
        self.datasets[dataset_key] = metadata
        if download_dir:
            self._save_metadata(base_dir)  # Save to custom directory metadata file
    else:
        self._save_metadata()  # Save to main data directory

        # Clean up old dataset if it exists
        if old_dataset_path and old_dataset_path.exists():
            self.logger.info(f"Cleaning up old dataset {old_dataset_id}...")
            shutil.rmtree(old_dataset_path)

        return extract_dir
    except requests.exceptions.RequestException as e:
        self.logger.error(f"Network error during download: {str(e)}")
        return None
    except (zipfile.BadZipFile, OSError) as e:
        self.logger.error(f"Error processing dataset: {str(e)}")
        return None

ensure_metadata_current()

Ensure the in-memory metadata is current with the file. This is a convenience method that should be called before any operation that reads from the metadata.

Returns:

  • bool: True if metadata was reloaded, False if no reload was needed

Source code in src/mobility_db_api/api.py
def ensure_metadata_current(self) -> bool:
    """
    Ensure the in-memory metadata is current with the file.
    This is a convenience method that should be called before
    any operation that reads from the metadata.

    Returns:
        bool: True if metadata was reloaded, False if no reload was needed
    """
    return self.reload_metadata(force=False)

get_access_token()

Get a valid access token for API authentication.

This method handles token refresh automatically when needed. It uses the refresh token to obtain a new access token from the API.

Returns:

  • Optional[str]: A valid access token string if successful, None if token refresh fails or if no refresh token is available.

Example

>>> api = MobilityAPI()
>>> token = api.get_access_token()
>>> if token:
...     print(token)
...     'eyJ0eXAiOiJKV1QiLCJhbGc...'
... else:
...     print("Using CSV fallback mode")

Source code in src/mobility_db_api/api.py
def get_access_token(self) -> Optional[str]:
    """Get a valid access token for API authentication.

    This method handles token refresh automatically when needed. It uses the
    refresh token to obtain a new access token from the API.

    Returns:
        A valid access token string if successful, None if token refresh fails
        or if no refresh token is available.

    Example:
        >>> api = MobilityAPI()
        >>> token = api.get_access_token()
        >>> if token:
        ...     print(token)
        ...     'eyJ0eXAiOiJKV1QiLCJhbGc...'
        ... else:
        ...     print("Using CSV fallback mode")
    """
    if not self.refresh_token:
        self.refresh_token = os.getenv("MOBILITY_API_REFRESH_TOKEN")
    if not self.refresh_token:
        self.logger.debug("No refresh token provided and none found in .env file")
        return None

    url = f"{self.base_url}/tokens"
    headers = {"Content-Type": "application/json"}
    data = {"refresh_token": self.refresh_token}

    try:
        response = requests.post(url, headers=headers, json=data)
        if response.status_code == 200:
            data = response.json()
            return data.get("access_token")
        return None
    except Exception as e:
        self.logger.error(f"Exception during token request: {str(e)}")
        return None

get_provider_by_id(provider_id)

Get information about a specific provider by ID.

This method is similar to get_provider_info but follows the naming convention of get_providers_by_country and get_providers_by_name. It returns information about a single provider, including any downloaded dataset.

Parameters:

  • provider_id (str, required): The unique identifier of the provider

Returns:

  • Optional[Dict]: Dictionary containing provider information and downloaded dataset details if available, None if the provider doesn't exist or is inactive/deprecated.

Example

>>> api = MobilityAPI()
>>> info = api.get_provider_by_id("mdb-123")
>>> if info:
...     print(f"Provider: {info['provider']}")
...     if 'downloaded_dataset' in info:
...         print(f"Downloaded: {info['downloaded_dataset']['download_path']}")

Source code in src/mobility_db_api/api.py
def get_provider_by_id(self, provider_id: str) -> Optional[Dict]:
    """Get information about a specific provider by ID.

    This method is similar to get_provider_info but follows the naming convention
    of get_providers_by_country and get_providers_by_name. It returns information
    about a single provider, including any downloaded dataset.

    Args:
        provider_id: The unique identifier of the provider

    Returns:
        Dictionary containing provider information and downloaded dataset details
        if available, None if the provider doesn't exist or is inactive/deprecated.

    Example:
        >>> api = MobilityAPI()
        >>> info = api.get_provider_by_id("mdb-123")
        >>> if info:
        ...     print(f"Provider: {info['provider']}")
        ...     if 'downloaded_dataset' in info:
        ...         print(f"Downloaded: {info['downloaded_dataset']['download_path']}")
    """
    return self.get_provider_info(provider_id=provider_id)

get_provider_info(provider_id=None, country_code=None, name=None)

Get information about providers based on search criteria.

This method is the central provider search functionality that powers get_provider_by_id, get_providers_by_country, and get_providers_by_name. It can search by ID, country code, or name, and returns either a single provider or a list of providers.

Parameters:

  • provider_id (Optional[str], default None): Optional provider ID for exact match
  • country_code (Optional[str], default None): Optional two-letter ISO country code for filtering
  • name (Optional[str], default None): Optional provider name for partial matching

Returns (Union[Optional[Dict], List[Dict]]):

  • If provider_id is specified: dictionary containing provider information and downloaded dataset details if available, None if the provider doesn't exist or is inactive/deprecated.
  • If country_code or name is specified: list of matching provider dictionaries.
  • If no criteria are specified: empty list.

Example

>>> api = MobilityAPI()
>>> # Get by ID
>>> info = api.get_provider_info(provider_id="mdb-123")
>>> # Get by country
>>> be_providers = api.get_provider_info(country_code="BE")
>>> # Get by name
>>> sncb = api.get_provider_info(name="SNCB")

Source code in src/mobility_db_api/api.py
def get_provider_info(
    self,
    provider_id: Optional[str] = None,
    country_code: Optional[str] = None,
    name: Optional[str] = None,
) -> Union[Optional[Dict], List[Dict]]:
    """
    Get information about providers based on search criteria.

    This method is the central provider search functionality that powers get_provider_by_id,
    get_providers_by_country, and get_providers_by_name. It can search by ID, country code,
    or name, and returns either a single provider or a list of providers.

    Args:
        provider_id: Optional provider ID for exact match
        country_code: Optional two-letter ISO country code for filtering
        name: Optional provider name for partial matching

    Returns:
        If provider_id is specified:
            Dictionary containing provider information and downloaded dataset details
            if available, None if the provider doesn't exist or is inactive/deprecated.
        If country_code or name is specified:
            List of matching provider dictionaries.
        If no criteria specified:
            Empty list.

    Example:
        >>> api = MobilityAPI()
        >>> # Get by ID
        >>> info = api.get_provider_info(provider_id="mdb-123")
        >>> # Get by country
        >>> be_providers = api.get_provider_info(country_code="BE")
        >>> # Get by name
        >>> sncb = api.get_provider_info(name="SNCB")
    """
    # If provider_id is specified, use exact match lookup
    if provider_id is not None:
        # First try to get provider info from API or CSV
        if self._use_csv:
            provider_info = self.csv_catalog.get_provider_info(provider_id)
            if not provider_info:
                return None
            # Check for redirects
            if provider_info.get("redirects"):
                return None
            return self._add_downloaded_dataset_info(provider_info)

        try:
            url = f"{self.base_url}/gtfs_feeds/{provider_id}"
            response = requests.get(url, headers=self._get_headers())
            if response.status_code == 200:
                try:
                    provider_info = response.json()
                    # Handle both single item and list responses
                    if isinstance(provider_info, list):
                        if not provider_info:  # Empty list
                            return None
                        provider_info = provider_info[0]  # Take first match
                    # Check for redirects
                    if provider_info.get("redirects"):
                        return None
                    return self._add_downloaded_dataset_info(provider_info)
                except requests.exceptions.JSONDecodeError:
                    self.logger.warning("Invalid JSON response from API")
                    return None
            elif response.status_code in (
                401,
                403,
                413,
            ):  # Auth errors or request too large
                self.logger.info("Falling back to CSV catalog")
                self._use_csv = True
                return self.get_provider_info(provider_id=provider_id)
            elif response.status_code == 404:
                return None
            else:
                self.logger.warning(
                    f"API request failed with status {response.status_code}"
                )
                self._use_csv = True  # Fall back to CSV on any other error
                return self.get_provider_info(provider_id=provider_id)
        except requests.exceptions.RequestException:
            self.logger.warning("API request failed, falling back to CSV catalog")
            self._use_csv = True
            return self.get_provider_info(provider_id=provider_id)

        return None

    # For country or name search, use the appropriate API endpoint or CSV catalog
    if self._use_csv:
        if country_code is not None:
            providers = self.csv_catalog.get_providers()
            return [
                p
                for p in providers
                if any(
                    loc["country_code"].upper() == country_code.upper()
                    for loc in p["locations"]
                )
            ]
        elif name is not None:
            providers = self.csv_catalog.get_providers()
            name_lower = name.lower()
            return [p for p in providers if name_lower in p["provider"].lower()]
        return []

    # Use API for search
    try:
        url = f"{self.base_url}/gtfs_feeds"
        params = {}
        if country_code is not None:
            params["country_code"] = country_code
        elif name is not None:
            params["provider"] = name

        if not params:
            return []

        response = requests.get(url, headers=self._get_headers(), params=params)
        if response.status_code == 200:
            return response.json()
        elif response.status_code in (
            401,
            403,
            413,
        ):  # Auth errors or request too large
            self.logger.info("Falling back to CSV catalog")
            self._use_csv = True
            return self.get_provider_info(country_code=country_code, name=name)
        else:
            self.logger.warning(
                f"API request failed with status {response.status_code}"
            )
            self._use_csv = True  # Fall back to CSV on any other error
            return self.get_provider_info(country_code=country_code, name=name)
    except requests.exceptions.RequestException:
        self.logger.warning("API request failed, falling back to CSV catalog")
        self._use_csv = True
        return self.get_provider_info(country_code=country_code, name=name)

    return []

get_providers_by_country(country_code)

Search for GTFS providers by country code.

Parameters:

  • country_code (str, required): Two-letter ISO country code (e.g., "HU" for Hungary)

Returns:

  • List[Dict]: List of provider dictionaries containing provider information. Each dictionary includes:
    - id: Provider's unique identifier
    - provider: Provider's name
    - country: Provider's country
    - source_info: Information about data sources

Example

>>> api = MobilityAPI()
>>> providers = api.get_providers_by_country("HU")
>>> for p in providers:
...     print(f"{p['provider']}: {p['id']}")
'BKK: o-u-dr_bkk'

Source code in src/mobility_db_api/api.py
def get_providers_by_country(self, country_code: str) -> List[Dict]:
    """Search for GTFS providers by country code.

    Args:
        country_code: Two-letter ISO country code (e.g., "HU" for Hungary)

    Returns:
        List of provider dictionaries containing provider information.
        Each dictionary includes:
            - id: Provider's unique identifier
            - provider: Provider's name
            - country: Provider's country
            - source_info: Information about data sources

    Example:
        >>> api = MobilityAPI()
        >>> providers = api.get_providers_by_country("HU")
        >>> for p in providers:
        ...     print(f"{p['provider']}: {p['id']}")
        'BKK: o-u-dr_bkk'
    """
    return self.get_provider_info(country_code=country_code)

get_providers_by_name(name)

Search for providers by name.

Parameters:

  • name (str, required): Provider name to search for (case-insensitive partial match)

Returns:

  • List[Dict]: List of matching provider dictionaries.

Source code in src/mobility_db_api/api.py
def get_providers_by_name(self, name: str) -> List[Dict]:
    """Search for providers by name.

    Args:
        name: Provider name to search for (case-insensitive partial match)

    Returns:
        List of matching provider dictionaries.
    """
    return self.get_provider_info(name=name)

list_downloaded_datasets()

Get a list of all downloaded datasets in the data directory.

Returns:

  • List[DatasetMetadata]: List of DatasetMetadata objects for all downloaded datasets

Source code in src/mobility_db_api/api.py
def list_downloaded_datasets(self) -> List[DatasetMetadata]:
    """
    Get a list of all downloaded datasets in the data directory.

    Returns:
        List of DatasetMetadata objects for all downloaded datasets
    """
    return [meta for meta in self.datasets.values() if meta.download_path.exists()]

reload_metadata(force=False)

Reload metadata from file if it has been modified or if forced.

Parameters:

  • force (bool, default False): If True, reload metadata regardless of modification time. If False, only reload if the file has been modified.

Returns:

  • bool: True if metadata was reloaded, False if no reload was needed

Source code in src/mobility_db_api/api.py
def reload_metadata(self, force: bool = False) -> bool:
    """
    Reload metadata from file if it has been modified or if forced.

    Args:
        force: If True, reload metadata regardless of modification time.
              If False, only reload if the file has been modified.

    Returns:
        bool: True if metadata was reloaded, False if no reload was needed
    """
    if force or self._has_metadata_changed():
        self._load_metadata()
        return True
    return False

Common Exceptions

The client can raise the following exceptions:

ValueError

Raised in cases like:

  • Missing or invalid refresh token
  • Failed token refresh
  • Invalid provider ID

requests.exceptions.RequestException

Raised for network-related issues:

  • Connection errors
  • API errors
  • Timeout issues

OSError

Raised for file system issues:

  • Permission errors
  • Disk space issues
  • File access problems

Environment Variables

The following environment variables can be used to configure the client:

  • MOBILITY_API_REFRESH_TOKEN: The API refresh token for authentication
  • MOBILITY_API_BASE_URL: The base URL of the Mobility Database API
  • MOBILITY_API_DATA_DIR: The default directory for storing downloaded datasets
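
A minimal sketch of supplying the refresh token through the environment (the os.getenv lookup of MOBILITY_API_REFRESH_TOKEN is visible in the get_access_token source above):

import os

from mobility_db_api import MobilityAPI

# Set before constructing the client; a .env file loaded into the
# environment works the same way.
os.environ["MOBILITY_API_REFRESH_TOKEN"] = "your-refresh-token"

api = MobilityAPI()  # finds the token via os.getenv during token refresh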

Type Hints

from typing import Dict, List, Optional, Union
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass

@dataclass
class DatasetMetadata:
    provider_id: str
    provider_name: str
    dataset_id: str
    download_date: datetime
    source_url: str
    is_direct_source: bool
    api_provided_hash: Optional[str]
    file_hash: str
    download_path: Path
    feed_start_date: Optional[str] = None
    feed_end_date: Optional[str] = None
    minimum_latitude: Optional[float] = None
    maximum_latitude: Optional[float] = None
    minimum_longitude: Optional[float] = None
    maximum_longitude: Optional[float] = None

# Function signatures
def get_providers_by_country(country_code: str) -> List[Dict]: ...
def get_providers_by_name(name: str) -> List[Dict]: ...
def get_provider_by_id(provider_id: str) -> Optional[Dict]: ...
def get_provider_info(
    provider_id: Optional[str] = None,
    country_code: Optional[str] = None,
    name: Optional[str] = None
) -> Union[Optional[Dict], List[Dict]]: ...
def download_latest_dataset(
    provider_id: str,
    download_dir: Optional[str] = None,
    use_direct_source: bool = False,
    force_bounding_box_calculation: bool = False
) -> Optional[Path]: ...
def list_downloaded_datasets() -> List[DatasetMetadata]: ...
def delete_dataset(provider_id: str, dataset_id: Optional[str] = None) -> bool: ...
def delete_provider_datasets(provider_id: str) -> bool: ...
def delete_all_datasets() -> bool: ...

Usage Examples

Basic Usage

from mobility_db_api import MobilityAPI

# Initialize client
api = MobilityAPI()

# Search for providers
providers = api.get_providers_by_country("BE")
for provider in providers:
    print(f"Found provider: {provider['provider']}")

# Download dataset
dataset_path = api.download_latest_dataset(providers[0]['id'])
print(f"Dataset downloaded to: {dataset_path}")

Dataset Management

from mobility_db_api import MobilityAPI

api = MobilityAPI()

# List downloaded datasets
datasets = api.list_downloaded_datasets()
for dataset in datasets:
    print(f"Dataset: {dataset.dataset_id}")
    print(f"Provider: {dataset.provider_name}")
    print(f"Downloaded: {dataset.download_date}")

# Delete specific dataset
api.delete_dataset("tld-5862", "20240315")

# Delete all datasets for a provider
api.delete_provider_datasets("tld-5862")

# Delete all datasets
api.delete_all_datasets()

Error Handling

from mobility_db_api import MobilityAPI
import requests

api = MobilityAPI()

try:
    dataset_path = api.download_latest_dataset("invalid-id")
except ValueError as e:
    print(f"Invalid input: {e}")
except requests.exceptions.RequestException as e:
    print(f"Network error: {e}")
except OSError as e:
    print(f"File system error: {e}")

Implementation Details

Authentication Flow

  1. Initialize client with refresh token
  2. Client automatically handles token refresh
  3. Access token is used for API requests
  4. Refresh token is used to obtain new access tokens
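
A sketch of steps 2-4 driven by hand. The POST to /v1/tokens and the access_token response field mirror the get_access_token source above; the Bearer header on subsequent requests is an assumption, since the client builds its request headers internally:

import requests
from typing import Optional

BASE_URL = "https://api.mobilitydatabase.org/v1"

def fetch_access_token(refresh_token: str) -> Optional[str]:
    # Mirrors the token exchange in get_access_token (see source above)
    response = requests.post(
        f"{BASE_URL}/tokens",
        headers={"Content-Type": "application/json"},
        json={"refresh_token": refresh_token},
    )
    if response.status_code == 200:
        return response.json().get("access_token")
    return None

token = fetch_access_token("your-refresh-token")
if token:
    # Assumption: API requests carry the access token as a Bearer header
    headers = {"Authorization": f"Bearer {token}"}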

Download Process

  1. Get provider information
  2. Choose download source (hosted or direct)
  3. Download dataset to specified directory
  4. Update metadata with download information
  5. Return path to downloaded dataset
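
The same flow from the caller's side, a minimal sketch using only methods documented above:

from mobility_db_api import MobilityAPI

api = MobilityAPI()

# 1. Get provider information
providers = api.get_providers_by_country("BE")
if providers:
    provider_id = providers[0]["id"]
    # 2-3. Download (hosted source by default; use_direct_source=True for direct)
    dataset_path = api.download_latest_dataset(provider_id, download_dir="downloads")
    # 4-5. Metadata is updated automatically; the extraction path is returned
    if dataset_path is not None:
        print(f"Dataset extracted to {dataset_path}")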

Metadata Management

  1. Each download directory has its own metadata file
  2. Metadata is locked during updates
  3. Changes are detected using checksums
  4. Failed downloads are cleaned up
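
When several processes share a data directory, the reload helpers documented above keep the in-memory view current. A short sketch:

from mobility_db_api import MobilityAPI

api = MobilityAPI(data_dir="data")

# Pick up changes another process may have written to datasets_metadata.json
if api.ensure_metadata_current():
    print("Metadata was reloaded from disk")

for meta in api.list_downloaded_datasets():
    print(meta.provider_id, meta.dataset_id, meta.download_path)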

Client API Reference

MobilityAPI

The main client class for interacting with the Mobility Database API.

Constructor

MobilityAPI(data_dir: str = "data",
            refresh_token: Optional[str] = None,
            log_level: str = "INFO",
            logger_name: str = "mobility_db_api",
            force_csv_mode: bool = False)

Parameters:

  • data_dir: Base directory for all GTFS downloads (default: "data")
  • refresh_token: Optional refresh token. If not provided, will try to load from .env file
  • log_level: Logging level (DEBUG, INFO, WARNING, ERROR). Defaults to INFO
  • logger_name: Name for the logger instance. Defaults to 'mobility_db_api'
  • force_csv_mode: If True, always use CSV catalog even if API key is available

The client can operate in two modes:

  1. API mode (default): Uses the Mobility Database API with authentication
  2. CSV mode: Uses the CSV catalog when no API key is provided or when force_csv_mode is True

Operating Modes

API Mode

When a valid refresh token is available (either passed directly or through environment variables), the client operates in API mode. This mode provides:

  • Full access to all API features
  • Real-time dataset information
  • Provider search capabilities
  • Dataset downloads with hash verification

CSV Mode

The client automatically falls back to CSV mode when:

  • No API key is available
  • Authentication fails
  • API requests return errors (e.g., 413 Request Entity Too Large)
  • force_csv_mode=True is set

CSV mode provides:

  • Basic provider information from a local CSV catalog
  • Dataset download URLs
  • Provider search by country and name
  • ID normalization for consistent provider lookup

ID Normalization

The CSV catalog supports the following ID formats:

  • Direct numeric IDs (e.g., "123")
  • MDB-prefixed IDs (e.g., "mdb-123")
  • Other prefixed IDs are not resolvable (e.g., "tld-123")
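
A short sketch of what these rules imply for lookups in CSV mode (whether "123" resolves to an actual feed depends on the catalog contents; the tld-prefixed lookup is expected to return None per the rules above):

from mobility_db_api import MobilityAPI

api = MobilityAPI(force_csv_mode=True)

# "123" and "mdb-123" normalize to the same provider in the CSV catalog
info_numeric = api.get_provider_by_id("123")
info_prefixed = api.get_provider_by_id("mdb-123")

# Non-MDB prefixes cannot be resolved against the CSV catalog
info_other = api.get_provider_by_id("tld-123")  # expected: None in CSV mode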

Methods

get_providers_by_country

get_providers_by_country(country_code: str) -> List[Dict]

Search for GTFS providers by country code.

Parameters:

  • country_code: Two-letter ISO country code (e.g., "HU" for Hungary)

Returns:

  • List of provider dictionaries containing provider information

Example:

api = MobilityAPI()
providers = api.get_providers_by_country("HU")
for p in providers:
    print(f"{p['provider']}: {p['id']}")

get_providers_by_name

get_providers_by_name(name: str) -> List[Dict]

Search for providers by name.

Parameters:

  • name: Provider name to search for (case-insensitive partial match)

Returns:

  • List of matching provider dictionaries

Example:

api = MobilityAPI()
providers = api.get_providers_by_name("BKK")

get_provider_by_id

get_provider_by_id(provider_id: str) -> Optional[Dict]

Get information about a specific provider by ID.

Parameters:

  • provider_id: The unique identifier of the provider

Returns:

  • Dictionary containing provider information and downloaded dataset details if available
  • None if the provider doesn't exist or is inactive/deprecated

Example:

api = MobilityAPI()
info = api.get_provider_by_id("mdb-123")
if info:
    print(f"Provider: {info['provider']}")
    if 'downloaded_dataset' in info:
        print(f"Downloaded: {info['downloaded_dataset']['download_path']}")

get_provider_info

get_provider_info(
    provider_id: Optional[str] = None,
    country_code: Optional[str] = None,
    name: Optional[str] = None
) -> Union[Optional[Dict], List[Dict]]

Get information about providers based on search criteria. This method combines the functionality of get_provider_by_id, get_providers_by_country, and get_providers_by_name into a single method.

Parameters:

  • provider_id: Optional provider ID for exact match
  • country_code: Optional two-letter ISO country code for filtering
  • name: Optional provider name for partial matching

Returns:

  • If provider_id is specified:
    - Dictionary containing provider information and downloaded dataset details if available
    - None if the provider doesn't exist or is inactive/deprecated
  • If country_code or name is specified:
    - List of matching provider dictionaries
  • If no criteria specified:
    - Empty list

Example:

api = MobilityAPI()
# Get by ID
info = api.get_provider_info(provider_id="mdb-123")
# Get by country
be_providers = api.get_provider_info(country_code="BE")
# Get by name
sncb = api.get_provider_info(name="SNCB")

download_latest_dataset

download_latest_dataset(provider_id: str,
                        download_dir: Optional[str] = None,
                        use_direct_source: bool = False,
                        force_bounding_box_calculation: bool = False) -> Optional[Path]

Download the latest GTFS dataset from a provider.

Parameters:

  • provider_id: The unique identifier of the provider
  • download_dir: Optional custom directory to store the dataset
  • use_direct_source: Whether to use direct download URL instead of hosted dataset
  • force_bounding_box_calculation: Whether to force recalculation of the bounding box from stops.txt

Returns:

  • Path to the extracted dataset directory if successful, None if download fails

Example:

api = MobilityAPI()
dataset_path = api.download_latest_dataset("mdb-123")

Error Handling

The client includes robust error handling:

  • Graceful fallback to CSV mode on API errors
  • Automatic retry with CSV catalog on authentication failures
  • Clear error messages and logging
  • Safe handling of network issues and invalid responses

Best Practices

  1. Mode Selection:
     • Use API mode when real-time data is critical
     • Use CSV mode for basic provider information or when API access is not available
     • Consider force_csv_mode=True for better performance when only basic features are needed

  2. Error Handling:
     • Always check return values for None/empty lists
     • Use try/except blocks for network operations
     • Monitor logs for important messages

  3. Resource Management:
     • Use custom data directories for better organization
     • Clean up downloaded datasets when no longer needed
     • Monitor disk space usage

Example:

# API mode with fallback
api = MobilityAPI()
providers = api.get_providers_by_country("HU")

# Force CSV mode for better performance
api_csv = MobilityAPI(force_csv_mode=True)
providers = api_csv.get_providers_by_country("HU")

# Custom data directory
api = MobilityAPI(data_dir="custom/path")
dataset = api.download_latest_dataset("mdb-123")

Features

  • Search for GTFS providers by country code or name
  • Download and extract GTFS datasets
  • Track dataset metadata, including:
    - Feed validity dates from feed_info.txt
    - Geographical bounding box from stops.txt
    - Dataset hashes for version control
  • Automatic fallback to CSV catalog when API is unavailable
  • Support for direct source downloads

Metadata

The client tracks various metadata for each downloaded dataset:

  • provider_id: Unique identifier of the provider
  • provider_name: Human-readable name of the provider
  • dataset_id: Unique identifier of the dataset
  • download_date: When the dataset was downloaded
  • source_url: URL or path where the dataset was downloaded from
  • is_direct_source: Whether the dataset was downloaded directly from the provider
  • api_provided_hash: Hash provided by the Mobility Database API (if available)
  • file_hash: SHA-256 hash of the downloaded file
  • download_path: Path where the dataset is stored
  • feed_start_date: Start date from feed_info.txt (YYYYMMDD format)
  • feed_end_date: End date from feed_info.txt (YYYYMMDD format)
  • minimum_latitude: Southern boundary of the dataset's coverage area
  • maximum_latitude: Northern boundary of the dataset's coverage area
  • minimum_longitude: Western boundary of the dataset's coverage area
  • maximum_longitude: Eastern boundary of the dataset's coverage area

Bounding Box Calculation

The client automatically calculates geographical bounding boxes for datasets:

  • For datasets from the Mobility Database API, it uses the bounding box provided by the API
  • For datasets from the CSV catalog, it uses the bounding box information from the catalog
  • For direct source downloads and external GTFS files, it calculates the bounding box from stops.txt
  • The calculation handles missing or invalid coordinates gracefully
  • Coordinates are validated to be within valid ranges (-90/90 for latitude, -180/180 for longitude)
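
A hedged sketch of the stops.txt calculation described above. The library's own calculate_bounding_box helper is referenced in the source but not shown here; this stand-in illustrates the documented behavior, including skipping missing or invalid coordinates and validating ranges:

import csv
from pathlib import Path
from typing import Optional, Tuple

def bounding_box_from_stops(
    extract_dir: Path,
) -> Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]:
    """Return (min_lat, max_lat, min_lon, max_lon) from stops.txt, or Nones."""
    min_lat = max_lat = min_lon = max_lon = None
    stops_path = extract_dir / "stops.txt"
    if not stops_path.exists():
        return min_lat, max_lat, min_lon, max_lon
    with open(stops_path, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            try:
                lat = float(row["stop_lat"])
                lon = float(row["stop_lon"])
            except (KeyError, TypeError, ValueError):
                continue  # skip rows with missing or non-numeric coordinates
            if not (-90 <= lat <= 90 and -180 <= lon <= 180):
                continue  # skip out-of-range coordinates
            min_lat = lat if min_lat is None else min(min_lat, lat)
            max_lat = lat if max_lat is None else max(max_lat, lat)
            min_lon = lon if min_lon is None else min(min_lon, lon)
            max_lon = lon if max_lon is None else max(max_lon, lon)
    return min_lat, max_lat, min_lon, max_lon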

Example Usage

from mobility_db_api import MobilityAPI

# Initialize the client
api = MobilityAPI()

# Download a dataset
dataset_path = api.download_latest_dataset("mdb-123")

# Get dataset metadata including bounding box
datasets = api.list_downloaded_datasets()
for dataset in datasets:
    print(f"Dataset: {dataset.provider_name}")
    if dataset.minimum_latitude is not None:
        print(f"Coverage area: ({dataset.minimum_latitude}, {dataset.minimum_longitude}) to "
              f"({dataset.maximum_latitude}, {dataset.maximum_longitude})")

ExternalGTFSAPI

Extension of MobilityAPI for handling external GTFS files not in the Mobility Database.

Features

  • Extract and process external GTFS ZIP files
  • Generate unique provider IDs for external sources
  • Extract agency names from GTFS files
  • Handle versioning of datasets
  • Match files to existing providers
  • Calculate bounding boxes from stops.txt

Example Usage

from mobility_db_api import ExternalGTFSAPI
from pathlib import Path

# Initialize the client
api = ExternalGTFSAPI()

# Extract a GTFS file
dataset_path = api.extract_gtfs(Path("gtfs.zip"))

# Get dataset metadata including bounding box
datasets = api.list_downloaded_datasets()
for dataset in datasets:
    print(f"Dataset: {dataset.provider_name}")
    if dataset.minimum_latitude is not None:
        print(f"Coverage area: ({dataset.minimum_latitude}, {dataset.minimum_longitude}) to "
              f"({dataset.maximum_latitude}, {dataset.maximum_longitude})")
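
extract_gtfs is shown above only in its single-argument form; a small sketch that batch-processes a folder of ZIP files using just that confirmed call:

from pathlib import Path

from mobility_db_api import ExternalGTFSAPI

api = ExternalGTFSAPI()

# Extract every GTFS ZIP in a local folder; provider IDs and agency names
# are generated/extracted by the client as described above.
for zip_path in Path("feeds").glob("*.zip"):
    dataset_path = api.extract_gtfs(zip_path)
    print(f"{zip_path.name} -> {dataset_path}")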