import os
import pandas as pd
import tsl
from tsl.datasets.prototypes import DatetimeDataset
from tsl.utils import download_url
class _MTSBenchmarkDataset(DatetimeDataset):
    """Common loader for the benchmark datasets hosted at
    https://github.com/laiguokun/multivariate-time-series-data.

    Subclasses are expected to fill in the class attributes below and to
    expose a ``raw_file_names`` property. The raw file is a header-less,
    comma-separated, gzip-compressed table; it is given a regular datetime
    index and stored as ``<ClassName>.h5`` under ``root_dir``.

    Args:
        root: Root folder for data download.
        freq: Resampling frequency.
    """

    # Placeholders overridden by the concrete datasets.
    url = None
    start_date = None  # first timestamp of the generated index
    default_freq = None  # step of the generated index
    default_similarity_score = None
    default_temporal_aggregation = None
    default_spatial_aggregation = None

    def __init__(self, root=None, freq=None):
        # ``root`` must be stored before ``load()`` runs, since loading goes
        # through ``root_dir`` (presumably derived from it by the parent
        # class -- confirm there).
        self.root = root
        target, mask = self.load()
        super().__init__(
            name=self.__class__.__name__,
            target=target,
            mask=mask,
            freq=freq,
            similarity_score=self.default_similarity_score,
            spatial_aggregation=self.default_spatial_aggregation,
            temporal_aggregation=self.default_temporal_aggregation,
        )

    @property
    def required_file_names(self):
        """Single-item list with the name of the built HDF5 file."""
        built_file = f'{self.__class__.__name__}.h5'
        return [built_file]

    def download(self) -> None:
        """Fetch ``self.url`` into ``self.root_dir``."""
        download_url(self.url, self.root_dir)

    def build(self):
        """Convert the raw compressed CSV into a datetime-indexed HDF5 file."""
        self.maybe_download()
        dataset_name = self.__class__.__name__
        tsl.logger.info(f"Building the {dataset_name} dataset...")

        raw_path = self.raw_files_paths[0]
        frame = pd.read_csv(raw_path,
                            sep=',',
                            header=None,
                            index_col=False,
                            compression='gzip')

        # The raw file carries no timestamps: generate one per row, starting
        # at ``start_date`` and stepping by ``default_freq``.
        timestamps = pd.date_range(start=self.start_date,
                                   periods=len(frame),
                                   freq=self.default_freq)
        frame = frame.set_index(timestamps)

        target_path = os.path.join(self.root_dir, f'{dataset_name}.h5')
        frame.to_hdf(target_path, key='raw')
        self.clean_downloads()

    def load_raw(self) -> pd.DataFrame:
        """Return the built dataframe, building it first via ``maybe_build``."""
        self.maybe_build()
        built_path = self.required_files_paths[0]
        return pd.read_hdf(built_path)

    def load(self):
        """Return the raw dataframe together with its validity mask.

        Returns:
            A ``(df, mask)`` pair, where ``mask`` is a ``uint8`` array shaped
            like ``df.values`` holding 1 where the value is nonzero and 0
            where it is exactly zero.
        """
        frame = self.load_raw()
        tsl.logger.info('Loaded raw dataset.')
        is_nonzero = frame.values != 0.
        return frame, is_nonzero.astype('uint8')
class ElectricityBenchmark(_MTSBenchmarkDataset):
    """Electricity consumption (in kWh) measured hourly by 321 sensors from
    2012 to 2014.

    Imported from https://github.com/laiguokun/multivariate-time-series-data.

    The `original dataset
    <https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014>`_
    records values in kW for 370 nodes starting from 2011, with part of the
    nodes with missing values before 2012. For the original dataset refer to
    :class:`~tsl.datasets.Elergone`.

    Dataset information:
        + Time steps: 26304
        + Nodes: 321
        + Channels: 1
        + Sampling rate: 1 hour
        + Missing values: 1.09%
    """
    url = 'https://github.com/TorchSpatiotemporal/multivariate-time-series-data/blob/master/electricity/electricity.txt.gz?raw=true'  # noqa

    similarity_options = None
    default_similarity_score = None
    default_temporal_aggregation = 'sum'
    default_spatial_aggregation = 'sum'
    # NOTE(review): the 'H' alias is deprecated since pandas 2.2 in favour of
    # 'h'; left unchanged because other code may read this attribute.
    default_freq = '1H'
    start_date = '01-01-2012 00:00'

    @property
    def raw_file_names(self):
        """Single-item list with the name of the raw compressed file."""
        return ['electricity.txt.gz']
class TrafficBenchmark(_MTSBenchmarkDataset):
    """A collection of hourly road occupancy rates (between 0 and 1) measured
    by 862 sensors for 24 months (2015-2016) on San Francisco Bay Area
    freeways.

    Imported from https://github.com/laiguokun/multivariate-time-series-data,
    raw data at `California Department of Transportation
    <https://pems.dot.ca.gov>`_.

    Dataset information:
        + Time steps: 17544
        + Nodes: 862
        + Channels: 1
        + Sampling rate: 1 hour
        + Missing values: 0.90%
    """
    url = 'https://github.com/TorchSpatiotemporal/multivariate-time-series-data/blob/master/traffic/traffic.txt.gz?raw=true'  # noqa

    similarity_options = None
    default_similarity_score = None
    default_temporal_aggregation = 'mean'
    default_spatial_aggregation = 'mean'
    # NOTE(review): the 'H' alias is deprecated since pandas 2.2 in favour of
    # 'h'; left unchanged because other code may read this attribute.
    default_freq = '1H'
    # 17544 hourly steps from this date span exactly 731 days (2015 + 2016).
    start_date = '01-01-2015 00:00'

    @property
    def raw_file_names(self):
        """Single-item list with the name of the raw compressed file."""
        return ['traffic.txt.gz']
class SolarBenchmark(_MTSBenchmarkDataset):
    """Solar power production records for the year 2006, sampled every 10
    minutes from 137 synthetic PV farms in the state of Alabama.

    The mask denotes 55.10% of data corresponding to daily hours with nonzero
    power production.

    Imported from https://github.com/laiguokun/multivariate-time-series-data,
    raw data at https://www.nrel.gov/grid/solar-power-data.html.

    Dataset information:
        + Time steps: 52560
        + Nodes: 137
        + Channels: 1
        + Sampling rate: 10 minutes
        + Missing values: 0.00%
    """
    url = 'https://github.com/TorchSpatiotemporal/multivariate-time-series-data/blob/master/solar-energy/solar_AL.txt.gz?raw=true'  # noqa

    similarity_options = None
    default_similarity_score = None
    default_temporal_aggregation = 'mean'
    default_spatial_aggregation = 'sum'
    # NOTE(review): the 'T' alias is deprecated since pandas 2.2 in favour of
    # 'min'; left unchanged because other code may read this attribute.
    default_freq = '10T'
    start_date = '01-01-2006 00:00'

    @property
    def raw_file_names(self):
        """Single-item list with the name of the raw compressed file."""
        return ['solar_AL.txt.gz']
class ExchangeBenchmark(_MTSBenchmarkDataset):
    """The collection of the daily exchange rates of eight foreign countries
    including Australia, Great Britain, Canada, Switzerland, China, Japan,
    New Zealand and Singapore ranging from 1990 to 2016.

    Imported from https://github.com/laiguokun/multivariate-time-series-data.

    Dataset information:
        + Time steps: 7588
        + Nodes: 8
        + Channels: 1
        + Sampling rate: 1 day
        + Missing values: 0.00%
    """
    url = 'https://github.com/TorchSpatiotemporal/multivariate-time-series-data/blob/master/exchange_rate/exchange_rate.txt.gz?raw=true'  # noqa

    similarity_options = None
    default_similarity_score = None
    default_temporal_aggregation = 'mean'
    default_spatial_aggregation = None
    # NOTE(review): a calendar-daily index of 7588 steps starting in 1990 ends
    # in October 2010, not 2016; presumably the raw series skips non-trading
    # days, so generated timestamps may drift from the true dates -- confirm.
    default_freq = '1D'
    start_date = '01-01-1990'

    @property
    def raw_file_names(self):
        """Single-item list with the name of the raw compressed file."""
        return ['exchange_rate.txt.gz']