Source code for tsl.datasets.large_st

import os
from typing import Dict, Literal, Optional, Sequence, Union

import numpy as np
import pandas as pd

from ..utils import download_url, extract_zip
from .prototypes import DatetimeDataset
from .prototypes.casting import to_pandas_freq

__base_url__ = "https://drive.switch.ch/index.php/s/nJgK7ca28hk7AMU/download"
__subsets__ = ["CA", "GBA", "GLA", "SD"]
SubsetType = Literal["CA", "GBA", "GLA", "SD"]


class LargeST(DatetimeDataset):
    r"""LargeST is a large-scale traffic forecasting dataset containing 5 years
    of traffic readings from 01/01/2017 to 12/31/2021, collected every 5 minutes
    by 8600 traffic sensors in California.

    Given the large number of sensors in the dataset, there are 3 subsets of
    sensors that can be selected:

    + :obj:`GLA` (Greater Los Angeles)
        + Nodes: 3834
        + Edges: 98703
        + District: 7, 8, 12
    + :obj:`GBA` (Greater Bay Area)
        + Nodes: 2352
        + Edges: 61246
        + District: 4
    + :obj:`SD` (San Diego)
        + Nodes: 716
        + Edges: 17319
        + District: 11

    By default, the full dataset :obj:`CA` is loaded, corresponding to the
    whole of California.

    The measurements are provided by the California Transportation Agencies
    (CalTrans) Performance Measurement System (PeMS). Introduced in the paper
    `"LargeST: A Benchmark Dataset for Large-Scale Traffic Forecasting"
    <https://arxiv.org/abs/2306.08259>`_ (Liu et al., 2023), where only readings
    from 2019 are considered, aggregated into 15-minute intervals.

    Dataset information:
        + Time steps: 525888
        + Nodes: 8600
        + Edges: 201363
        + Channels: 1
        + Sampling rate: 5 minutes
        + Missing values: 1.51%

    Static attributes:
        + :obj:`metadata`: storing for each node:
            + ``lat``: latitude of the sensor;
            + ``lon``: longitude of the sensor;
            + ``district``: California's district where the sensor is located
              (one of ``3``, ``4``, ``5``, ``6``, ``7``, ``8``, ``10``, ``11``,
              ``12``);
            + ``county``: California's county where the sensor is located;
            + ``fwy_id``: id of the highway where the sensor is located;
            + ``n_lanes``: the number of lanes at the sensor location (max 8);
            + ``direction``: direction of the highway measured by the sensor
              (one of ``N``, ``S``, ``E``, ``W``).
        + :obj:`adj`: weighted adjacency matrix
          :math:`\mathbf{A} \in \mathbb{R}^{N \times N}` built using road
          distances.

    Args:
        root (str, optional): The root directory where data will be downloaded
            and stored. If :obj:`None`, then defaults to the :obj:`.storage`
            folder inside :tsl:`null` tsl's root directory.
            (default: :obj:`None`)
        subset (str): The subset to be loaded. Must be one of :obj:`"CA"`,
            :obj:`"GLA"`, :obj:`"GBA"`, :obj:`"SD"`.
            (default: :obj:`"CA"`)
        year (int or list): The year(s) to be loaded. Must be (a list of
            values) in the range :obj:`[2017, 2021]`. Note that raw data are
            divided by year and only the requested years are downloaded.
            (default: :obj:`2019`)
        imputation_mode (str, optional): How to impute missing values. If
            :obj:`"nearest"`, then use the nearest observation; if
            :obj:`"zero"`, fill missing values with :obj:`0`; if :obj:`None`,
            do not impute (leave :obj:`nan`).
            (default: :obj:`"zero"`)
        freq (str): The sampling rate used for resampling (e.g., :obj:`"15T"`
            for 15-minute intervals).
            (default: :obj:`"15T"`)
        precision (int or str): The float precision of the dataset.
            (default: :obj:`32`)
    """
    base_url = __base_url__
    url = {
        "2017": __base_url__ + "?path=%2F2017&files=data.h5",
        "2018": __base_url__ + "?path=%2F2018&files=data.h5",
        "2019": __base_url__ + "?path=%2F2019&files=data.h5",
        "2020": __base_url__ + "?path=%2F2020&files=data.h5",
        "2021": __base_url__ + "?path=%2F2021&files=data.h5",
        "sensors": __base_url__ + "?files=sensors.zip",
    }

    similarity_options = {"precomputed"}

    def __init__(self,
                 root: str = None,
                 subset: SubsetType = "CA",
                 year: Optional[Union[int, Sequence[int]]] = 2019,
                 imputation_mode: Literal["nearest", "zero", None] = "zero",
                 freq: str = "15T",
                 precision: Union[int, str] = 32):
        # set root path
        self.root = root

        subset = subset.upper()
        if subset not in __subsets__:
            raise ValueError(
                f"Incorrect choice for 'subset' ({subset}). "
                f"Available options are {', '.join(__subsets__)}.")
        self.subset = subset

        view_years = years_set = set(range(2017, 2022))  # between 2017 and 2021
        if year is not None:
            year = {year} if isinstance(year, int) else set(year)
            view_years = view_years.intersection(year)
            if not len(view_years):
                raise ValueError(f"Incorrect choice for 'year' ({year}). "
                                 f"Must be a subset of {years_set}.")
        self.years = sorted(view_years)

        self.imputation_mode = imputation_mode
        assert imputation_mode in ["nearest", "zero", None]

        # set dataset frequency here to resample when loading
        if freq is not None:
            freq = to_pandas_freq(freq)
        self.freq = freq

        # load dataset
        readings, mask, metadata, adj = self.load()
        covariates = {"metadata": (metadata, 'n f'), "adj": (adj, 'n n')}
        super().__init__(target=readings,
                         freq=freq,
                         mask=mask,
                         covariates=covariates,
                         similarity_score="precomputed",
                         temporal_aggregation="mean",
                         spatial_aggregation="mean",
                         name=f"LargeST-{subset}",
                         precision=precision)

    @property
    def raw_file_names(self) -> Dict[str, str]:
        out = {
            str(year): os.path.join(str(year), "data.h5")
            for year in self.years
        }
        out["metadata"] = os.path.join("sensors", "metadata.csv")
        out["adj"] = os.path.join("sensors", "adj.npz")
        return out
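
    # Note (illustrative sketch, not part of the original code): with the
    # default `year=2019` and POSIX-style paths, `raw_file_names` would
    # resolve to something like
    #   {"2019": "2019/data.h5",
    #    "metadata": "sensors/metadata.csv",
    #    "adj": "sensors/adj.npz"}
    # i.e., readings are stored per year, while sensor metadata and the
    # adjacency live together under the "sensors" folder.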
    def download(self) -> None:
        for key, filepath in self.raw_files_paths.items():
            # download only required data that are missing
            if not os.path.exists(filepath):
                # "metadata" and "adj" are inside a single .zip file
                if key in ["metadata", "adj"]:
                    sub_dir = os.path.dirname(filepath)
                    os.makedirs(sub_dir, exist_ok=True)
                    # download, extract, and remove the .zip file
                    in_dir = download_url(self.url["sensors"],
                                          sub_dir,
                                          filename="sensors.zip")
                    extract_zip(in_dir, sub_dir)
                    os.unlink(in_dir)
                else:
                    # directly download the .h5 file containing the readings
                    # for the requested year
                    sub_dir, filename = os.path.split(filepath)
                    os.makedirs(sub_dir, exist_ok=True)
                    download_url(self.url[key], sub_dir, filename)
    def load_raw(self):
        self.maybe_download()
        filenames = self.required_files_paths
        # load sensors information
        metadata = pd.read_csv(filenames["metadata"], index_col=0)
        max_nodes = len(metadata)
        # possibly select subset, "CA" stands for no subset (whole California)
        node_mask = slice(None)
        if self.subset == "GLA":  # Greater Los Angeles
            node_mask = ((metadata.District == 7) | (metadata.District == 8) |
                         (metadata.District == 12)).values
        elif self.subset == "GBA":  # Greater Bay Area
            node_mask = (metadata.District == 4).values
        elif self.subset == "SD":  # San Diego
            node_mask = (metadata.District == 11).values
        metadata = metadata.loc[node_mask]
        # load traffic data only for the requested years
        readings = []
        for year in self.years:
            data_path = filenames[str(year)]
            data_df = pd.read_hdf(data_path, key="readings")
            data_df = data_df.loc[:, node_mask]  # filter subset
            # resample here to aggregate only valid observations and
            # align to authors' preprocessing
            if self.freq is not None:
                data_df = data_df.resample(self.freq).mean()
                # in authors' code: data_df.resample('15T').mean().round(0)
            readings.append(data_df)
        readings = (readings[0] if len(readings) == 1  # avoid useless
                    else pd.concat(readings, axis=0))  # computations
        # load adjacency
        edge_index, edge_weight = np.load(filenames["adj"]).values()
        # build square adj from COO to add adj as covariate
        adj = np.eye(max_nodes, dtype=np.float32)
        adj[tuple(edge_index)] = edge_weight
        adj = adj[node_mask][:, node_mask]
        return readings, metadata, adj
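
    # Note (illustrative sketch, not part of the original code): the
    # COO-to-dense step in `load_raw` works as follows, with made-up values:
    #   edge_index = np.array([[0, 1], [1, 2]])  # row 0: sources, row 1: targets
    #   edge_weight = np.array([0.5, 0.8])
    #   adj = np.eye(3, dtype=np.float32)
    #   adj[tuple(edge_index)] = edge_weight     # sets adj[0, 1]=0.5, adj[1, 2]=0.8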
    def load(self):
        readings, metadata, adj = self.load_raw()
        mask = ~readings.isna().values
        # impute missing observations
        if self.imputation_mode == "nearest":
            # use the last observed value, backfilling any leading NaNs
            readings = readings.ffill().bfill()
        elif self.imputation_mode == "zero":
            # in authors' code: readings = readings.fillna(0)
            readings = readings.fillna(0)
        return readings, mask, metadata, adj
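
    # Imputation on a toy series, to illustrate the two modes above
    # (hypothetical values, not part of the original code):
    #   s = pd.Series([1.0, np.nan, 3.0])
    #   s.ffill().bfill()  # -> [1.0, 1.0, 3.0]  ("nearest")
    #   s.fillna(0)        # -> [1.0, 0.0, 3.0]  ("zero")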
    def compute_similarity(self, method: str, **kwargs):
        if method == "precomputed":
            # load precomputed adjacency matrix based on road distance
            return self.adj
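

# Usage sketch (illustration only, not part of the library code): it assumes
# that the raw data can be downloaded to the default storage folder, which may
# take a while and require considerable disk space.
if __name__ == "__main__":
    # load only the San Diego subset for 2019, filling missing values with 0
    dataset = LargeST(subset="SD", year=2019, imputation_mode="zero")
    # the "precomputed" similarity is the road-distance adjacency matrix
    adj = dataset.compute_similarity("precomputed")
    print(adj.shape)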