# Source code for tsl.datasets.pems_benchmarks

import os
from pathlib import Path

import numpy as np
import pandas as pd

from tsl import logger
from tsl.datasets.prototypes import DatetimeDataset
from tsl.ops.similarities import gaussian_kernel
from tsl.utils import download_url, extract_zip


class _PeMS(DatetimeDataset):
    r"""Abstract class for PeMSD datasets.

    Subclasses are expected to define the class attributes :attr:`url`,
    :attr:`start_date`, :attr:`num_sensors` and :attr:`name`, together with
    the ``raw_file_names`` / ``required_file_names`` properties. The raw
    archive is expected to provide an ``.npz`` file with a ``data`` array as
    first raw file and a CSV with pairwise distances as second raw file.

    Args:
        mask_zeros (bool): If :obj:`True`, build a mask that is :obj:`False`
            wherever the flow reading equals zero.
            (default: :obj:`False`)
        root: Root folder of the dataset.
            (default: :obj:`None`)
        freq: Frequency forwarded to the parent constructor.
            (default: :obj:`None`)
    """

    # Download URL of the zipped raw data.
    url: str
    # Timestamp of the first reading, used to build the datetime index.
    start_date: str
    similarity_options = {'distance', 'stcn', 'binary'}
    # Number of sensors, i.e. side of the square distance matrix.
    num_sensors: int
    # Dataset name forwarded to the parent constructor.
    name: str

    def __init__(self, mask_zeros: bool = False, root=None, freq=None):
        # Set root path before loading anything.
        # NOTE(review): presumably ``root_dir`` (defined in a parent class)
        # is derived from ``root`` -- confirm.
        self.root = root
        self.mask_zeros = mask_zeros
        # load dataset
        flow, occupancy, speed, dist, mask = self.load(mask_zeros)
        super().__init__(target=flow,
                         mask=mask,
                         freq=freq,
                         similarity_score="distance",
                         temporal_aggregation="nearest",
                         name=self.name)
        # todo : remove this hack
        # Optional covariates get the same columns as the target before
        # being registered; they are skipped when not available.
        for cov_name, cov in (('occupancy', occupancy), ('speed', speed)):
            if cov is not None:
                cov.columns = self.target.columns
                self.add_covariate(cov_name, cov, pattern='t n f')
        self.add_covariate('dist', dist, pattern='n n')

    def download(self) -> None:
        """Download the archive at :attr:`url`, extract it into the root
        directory and delete the archive afterwards."""
        path = download_url(self.url, self.root_dir)
        extract_zip(path, self.root_dir)
        os.unlink(path)

    def build(self) -> None:
        """Download the raw files if needed, build and store the distance
        matrix, then clean the downloaded files."""
        self.maybe_download()
        self.build_distance_matrix(self.num_sensors)
        self.clean_downloads()

    def load_raw(self):
        """Load readings and distance matrix from disk.

        Returns:
            tuple: ``(df_flow, df_occ, df_speed, dist)``. The data frames are
            ``float32`` and indexed by a 5-minute datetime range starting at
            :attr:`start_date`. ``df_occ`` is :obj:`None` when the raw data
            has a single channel, ``df_speed`` is :obj:`None` when it has
            fewer than three. ``dist`` is the array stored in
            ``distance_matrix.npy``.
        """
        self.maybe_build()
        # Context manager guarantees the archive is closed even if reading
        # the 'data' entry fails.
        with np.load(self.raw_files_paths[0]) as fp:
            data = fp['data']
        # '5min' is equivalent to the deprecated '5T' alias.
        index = pd.date_range(start=self.start_date,
                              periods=len(data),
                              freq='5min')

        def channel_frame(channel: int):
            # One data frame per channel of the last axis.
            return pd.DataFrame(data=data[..., channel],
                                index=index).astype('float32')

        n_channels = data.shape[-1]
        df_flow = channel_frame(0)
        # Guard each optional channel on its own, so that data with exactly
        # two channels does not raise an IndexError.
        df_occ = channel_frame(1) if n_channels > 1 else None
        df_speed = channel_frame(2) if n_channels > 2 else None

        # load distance matrix
        path = os.path.join(self.root_dir, 'distance_matrix.npy')
        dist = np.load(path)
        return df_flow, df_occ, df_speed, dist

    def load(self, mask_zeros: bool = True):
        """Load the dataset and optionally compute the mask.

        Args:
            mask_zeros (bool): If :obj:`True`, the mask is :obj:`True` where
                the flow is different from zero; otherwise the mask is
                :obj:`None`.
                (default: :obj:`True`)

        Returns:
            tuple: ``(df_flow, df_occ, df_speed, dist, mask)``.
        """
        *dfs, dist = self.load_raw()
        mask = (dfs[0].values != 0) if mask_zeros else None
        return (*dfs, dist, mask)

    def build_distance_matrix(self, num_sensors):
        """Build the distance matrix from the distances CSV and save it as
        ``distance_matrix.npy`` in the root directory.

        The first two columns of each CSV row are used as (integer) row and
        column indices, the third one as the distance. Pairs that are not
        listed are left to infinity.

        Args:
            num_sensors (int): Side of the square matrix.

        Returns:
            numpy.ndarray: The ``float32`` distance matrix.
        """
        logger.info('Building distance matrix...')
        distances = pd.read_csv(self.raw_files_paths[1])
        dist = np.full((num_sensors, num_sensors), np.inf, dtype=np.float32)
        # Fills cells in the matrix with distances.
        for row in distances.values:
            dist[int(row[0]), int(row[1])] = row[2]
        # Save to built directory
        path = os.path.join(self.root_dir, 'distance_matrix.npy')
        np.save(path, dist)
        return dist

    def compute_similarity(self, method: str, **kwargs):
        """Compute a similarity matrix from :obj:`dist`.

        Args:
            method (str): One of:

                + ``'distance'``: gaussian kernel on the distances, with
                  sigma equal to the standard deviation of the non-infinite
                  distances.
                + ``'stcn'``: gaussian kernel on the distances with sigma
                  fixed to 10.
                + ``'binary'``: ``float32`` matrix with 1 where the distance
                  is not infinite and 0 elsewhere.

        Returns:
            The similarity matrix, or :obj:`None` if :obj:`method` is none of
            the above.
        """
        if method == 'distance':
            finite_dist = self.dist.reshape(-1)
            finite_dist = finite_dist[~np.isinf(finite_dist)]
            sigma = finite_dist.std()
            return gaussian_kernel(self.dist, sigma)
        if method == 'stcn':
            sigma = 10
            return gaussian_kernel(self.dist, sigma)
        if method == 'binary':
            return (~np.isinf(self.dist)).astype('float32')
        # NOTE(review): unknown methods fall through and yield None;
        # presumably the caller validates against `similarity_options`.
        return None


class PeMS03(_PeMS):
    r"""The dataset contains 3 months of traffic readings from 09/01/2018 to
    11/30/2018 collected every 5 minutes by 358 traffic sensors.

    The measurements are provided by California Transportation Agencies
    (CalTrans) Performance Measurement System (PeMS). A benchmark dataset for
    traffic forecasting as described in the paper
    `"Learning Dynamics and Heterogeneity of Spatial-Temporal Graph Data for
    Traffic Forecasting" <https://ieeexplore.ieee.org/document/9346058>`_
    (Guo et al., 2021).

    Dataset information:
        + Time steps: 26208
        + Nodes: 358
        + Channels: 1
        + Sampling rate: 5 minutes
        + Missing values: 0% (already imputed in the dataset)

    Static attributes:
        + :obj:`dist`: :math:`N \times N` matrix of node pairwise distances.
    """
    name = 'PeMS03'
    start_date = '09-01-2018 00:00'
    num_sensors = 358
    url = 'https://drive.switch.ch/index.php/s/B5xDMtNs4M7pzsn/download'

    @property
    def raw_file_names(self):
        """Names of the files expected in the downloaded archive."""
        return ['pems03.npz', 'distances.csv', 'index.txt']

    @property
    def required_file_names(self):
        """Names of the files needed to load the dataset."""
        return ['pems03.npz', 'distance_matrix.npy', 'index.txt']

    def build_distance_matrix(self, num_sensors):
        """Build the distance matrix and save it as ``distance_matrix.npy``.

        Sensor ids found in the distances CSV are mapped to matrix positions
        through ``index.txt``, which lists one sensor id per line: the
        position of an id in that file is its row/column in the matrix.
        CSV rows referring to an id that is not listed are skipped, and pairs
        that never appear are left to infinity.

        Args:
            num_sensors (int): Side of the square matrix.

        Returns:
            numpy.ndarray: The ``float32`` distance matrix.
        """
        logger.info('Building distance matrix...')
        # Use the raw file path as is, consistently with `_PeMS`; joining it
        # again with `root_dir` would duplicate the prefix if the path
        # already contains a relative root.
        distances = pd.read_csv(self.raw_files_paths[1])
        index_path = Path(self.root_dir) / 'index.txt'
        # Blank lines cannot be parsed as ids, ignore them.
        sensor_ids = [line for line in index_path.read_text().splitlines()
                      if line.strip()]
        sensor_to_idx = {
            int(sensor_id): i for i, sensor_id in enumerate(sensor_ids)
        }
        dist = np.full((num_sensors, num_sensors), np.inf, dtype=np.float32)
        for row in distances.values:
            src = sensor_to_idx.get(row[0])
            dst = sensor_to_idx.get(row[1])
            if src is None or dst is None:
                continue
            dist[src, dst] = row[2]
        path = os.path.join(self.root_dir, 'distance_matrix.npy')
        np.save(path, dist)
        return dist
class PeMS04(_PeMS):
    r"""The dataset contains 2 months of traffic readings from 01/01/2018 to
    02/28/2018 collected every 5 minutes by 307 traffic sensors in San
    Francisco Bay Area.

    The measurements are provided by California Transportation Agencies
    (CalTrans) Performance Measurement System (PeMS). A benchmark dataset for
    traffic forecasting as described in the paper
    `"Learning Dynamics and Heterogeneity of Spatial-Temporal Graph Data for
    Traffic Forecasting" <https://ieeexplore.ieee.org/document/9346058>`_
    (Guo et al., 2021).

    The target variable is the total flow (number of detected vehicles).

    Dataset information:
        + Time steps: 16992
        + Nodes: 307
        + Channels: 1
        + Sampling rate: 5 minutes
        + Missing values: 0% (already imputed in the dataset)

    Covariates:
        + :obj:`occupancy`: :math:`T \times N \times 1` Time series
          associated to the occupancy of the lanes.
        + :obj:`speed`: :math:`T \times N \times 1` Time series associated to
          average speed of the detected vehicles.

    Static attributes:
        + :obj:`dist`: :math:`N \times N` matrix of node pairwise distances.
    """
    name = 'PeMS04'
    start_date = '01-01-2018 00:00'
    num_sensors = 307
    url = 'https://drive.switch.ch/index.php/s/swNbaB5rPrBmAZQ/download'

    @property
    def raw_file_names(self):
        """Names of the files expected in the downloaded archive."""
        return ['pems04.npz', 'distance.csv']

    @property
    def required_file_names(self):
        """Names of the files needed to load the dataset."""
        return ['pems04.npz', 'distance_matrix.npy']
class PeMS07(_PeMS):
    r"""The dataset contains 4 months of traffic readings from 05/01/2017 to
    08/31/2017 collected every 5 minutes by 883 traffic sensors.

    The measurements are provided by California Transportation Agencies
    (CalTrans) Performance Measurement System (PeMS). A benchmark dataset for
    traffic forecasting as described in the paper
    `"Learning Dynamics and Heterogeneity of Spatial-Temporal Graph Data for
    Traffic Forecasting" <https://ieeexplore.ieee.org/document/9346058>`_
    (Guo et al., 2021).

    Dataset information:
        + Time steps: 28224
        + Nodes: 883
        + Channels: 1
        + Sampling rate: 5 minutes
        + Missing values: 0% (already imputed in the dataset)

    Static attributes:
        + :obj:`dist`: :math:`N \times N` matrix of node pairwise distances.
    """
    name = 'PeMS07'
    start_date = '05-01-2017 00:00'
    num_sensors = 883
    url = 'https://drive.switch.ch/index.php/s/VcyirewUufrN57h/download'

    @property
    def raw_file_names(self):
        """Names of the files expected in the downloaded archive."""
        return ['pems07.npz', 'distance.csv']

    @property
    def required_file_names(self):
        """Names of the files needed to load the dataset."""
        return ['pems07.npz', 'distance_matrix.npy']
class PeMS08(_PeMS):
    r"""The dataset contains 2 months of traffic readings from 07/01/2016 to
    08/31/2016 collected every 5 minutes by 170 traffic sensors in San
    Bernardino.

    The measurements are provided by California Transportation Agencies
    (CalTrans) Performance Measurement System (PeMS). A benchmark dataset for
    traffic forecasting as described in the paper
    `"Learning Dynamics and Heterogeneity of Spatial-Temporal Graph Data for
    Traffic Forecasting" <https://ieeexplore.ieee.org/document/9346058>`_
    (Guo et al., 2021).

    Dataset information:
        + Time steps: 17856
        + Nodes: 170
        + Channels: 1
        + Sampling rate: 5 minutes
        + Missing values: 0% (already imputed in the dataset)

    Covariates:
        + :obj:`occupancy`: :math:`T \times N \times 1` Time series
          associated to the occupancy of the lanes.
        + :obj:`speed`: :math:`T \times N \times 1` Time series associated to
          average speed of the detected vehicles.

    Static attributes:
        + :obj:`dist`: :math:`N \times N` matrix of node pairwise distances.
    """
    name = 'PeMS08'
    start_date = '07-01-2016 00:00'
    num_sensors = 170
    url = 'https://drive.switch.ch/index.php/s/AUGNn9Rx9zMz3vg/download'

    @property
    def raw_file_names(self):
        """Names of the files expected in the downloaded archive."""
        return ['pems08.npz', 'distance.csv']

    @property
    def required_file_names(self):
        """Names of the files needed to load the dataset."""
        return ['pems08.npz', 'distance_matrix.npy']