Source code for tsl.datasets.metr_la

import os

import numpy as np
import pandas as pd

from tsl import logger

from ..ops.similarities import gaussian_kernel
from ..utils import download_url, extract_zip
from .prototypes import DatetimeDataset


[docs]class MetrLA(DatetimeDataset):
    r"""Traffic readings collected from 207 loop detectors on
    highways in Los Angeles County, aggregated in 5 minutes intervals over four
    months between March 2012 and June 2012.

    A benchmark dataset for traffic forecasting as described in
    `"Diffusion Convolutional Recurrent Neural Network: Data-Driven Traffic
    Forecasting" <https://arxiv.org/abs/1707.01926>`_.

    Dataset information:
        + Time steps: 34272
        + Nodes: 207
        + Channels: 1
        + Sampling rate: 5 minutes
        + Missing values: 8.11%

    Static attributes:
        + :obj:`dist`: :math:`N \times N` matrix of node pairwise distances.
    """
    url = "https://drive.switch.ch/index.php/s/Z8cKHAVyiDqkzaG/download"

    similarity_options = {'distance'}

    def __init__(self, root=None, impute_zeros=True, freq=None):
        # set root path
        self.root = root
        # load dataset
        df, dist, mask = self.load(impute_zeros=impute_zeros)
        super().__init__(target=df,
                         mask=mask,
                         freq=freq,
                         similarity_score="distance",
                         temporal_aggregation="nearest",
                         name="MetrLA")
        self.add_covariate('dist', dist, pattern='n n')

    @property
    def raw_file_names(self):
        return [
            'metr_la.h5', 'distances_la.csv', 'sensor_locations_la.csv',
            'sensor_ids_la.txt'
        ]

    @property
    def required_file_names(self):
        return ['metr_la.h5', 'metr_la_dist.npy']

[docs]    def download(self) -> None:
        path = download_url(self.url, self.root_dir)
        extract_zip(path, self.root_dir)
        os.unlink(path)

[docs]    def build(self) -> None:
        self.maybe_download()
        # Build distance matrix
        logger.info('Building distance matrix...')
        raw_dist_path = os.path.join(self.root_dir, 'distances_la.csv')
        distances = pd.read_csv(raw_dist_path)
        ids_path = os.path.join(self.root_dir, 'sensor_ids_la.txt')
        with open(ids_path) as f:
            ids = f.read().strip().split(',')
        num_sensors = len(ids)
        dist = np.ones((num_sensors, num_sensors), dtype=np.float32) * np.inf
        # Builds sensor id to index map.
        sensor_to_ind = {int(sensor_id): i for i, sensor_id in enumerate(ids)}
        # Fills cells in the matrix with distances.
        for row in distances.values:
            if row[0] not in sensor_to_ind or row[1] not in sensor_to_ind:
                continue
            dist[sensor_to_ind[row[0]], sensor_to_ind[row[1]]] = row[2]
        # Save to built directory
        path = os.path.join(self.root_dir, 'metr_la_dist.npy')
        np.save(path, dist)
        # Remove raw data
        self.clean_downloads()

[docs]    def load_raw(self):
        self.maybe_build()
        # load traffic data
        traffic_path = os.path.join(self.root_dir, 'metr_la.h5')
        df = pd.read_hdf(traffic_path)
        # add missing values
        datetime_idx = sorted(df.index)
        date_range = pd.date_range(datetime_idx[0],
                                   datetime_idx[-1],
                                   freq='5T')
        df = df.reindex(index=date_range)
        # load distance matrix
        path = os.path.join(self.root_dir, 'metr_la_dist.npy')
        dist = np.load(path)
        return df, dist

[docs]    def load(self, impute_zeros=True):
        df, dist = self.load_raw()
        mask = (df.values != 0.).astype('uint8')
        if impute_zeros:
            df = df.replace(to_replace=0., method='ffill')
        return df, dist, mask

[docs]    def compute_similarity(self, method: str, **kwargs):
        if method == "distance":
            finite_dist = self.dist.reshape(-1)
            finite_dist = finite_dist[~np.isinf(finite_dist)]
            sigma = finite_dist.std()
            return gaussian_kernel(self.dist, sigma)