Source code for tsl.datasets.pv_us

import os
from typing import List, Union

import pandas as pd

from tsl.utils.python_utils import ensure_list

from ..utils import download_url
from .prototypes import DatetimeDataset


class PvUS(DatetimeDataset):
    r"""Simulated solar power production from more than 5,000 photovoltaic
    plants in the US.

    Data are provided by `National Renewable Energy Laboratory (NREL)
    <https://www.nrel.gov/>`_'s `Solar Power Data for Integration Studies
    <https://www.nrel.gov/grid/solar-power-data.html>`_. Original raw data
    consist of 1 year (2006) of 5-minute solar power (in MW) for approximately
    5,000 synthetic PV plants in the United States. Preprocessed data are
    resampled into 10-minute intervals by taking the average.

    The entire dataset contains 5016 plants, divided into two macro zones
    (east and west). The "east" zone contains 4084 plants, the "west" zone
    has 1082 plants. Some states appear in both zones, with plants at the
    same geographical position. When loading the entire dataset, duplicated
    plants in the "east" zone are dropped.

    Dataset size:
        + Time steps: 52560
        + Nodes:

          + Full graph: 5016
          + East only: 4084
          + West only: 1082

        + Channels: 1
        + Sampling rate: 10 minutes
        + Missing values: 0.00%

    Args:
        zones (Union[str, List], optional): The US zones to include in the
            dataset. Can be ``"east"``, ``"west"``, or a list of both.
            If :obj:`None`, then the full dataset is loaded.
            (default: :obj:`None`)
        mask_zeros (bool, optional): If :obj:`True`, then zero values
            (corresponding to night hours) are masked out.
            (default: :obj:`False`)
        root (str, optional): The root directory for the data.
            (default: :obj:`None`)
        freq (str, optional): The data sampling rate for resampling.
            (default: :obj:`None`)
    """
    available_zones = ['east', 'west']

    urls = {
        'east': "https://drive.switch.ch/index.php/s/ZUORMr4uzBSr04b/download",
        'west': "https://drive.switch.ch/index.php/s/HRPNJdeAzeQLA1f/download"
    }

    similarity_options = {'distance', 'correntropy'}

    def __init__(self, zones: Union[str, List] = None,
                 mask_zeros: bool = False,
                 root: str = None,
                 freq: str = None):
        # allow to download a single zone
        if zones is None:
            zones = self.available_zones
        else:
            zones = ensure_list(zones)
            if not set(zones).issubset(self.available_zones):
                invalid_zones = set(zones).difference(self.available_zones)
                raise ValueError(f"Invalid zones {invalid_zones}. "
                                 f"Allowed zones are {self.available_zones}.")
        self.zones = zones
        self.mask_zeros = mask_zeros
        self.root = root
        # set name
        name = "PvUS" if len(zones) == 2 else f"PvUS-{zones[0]}"
        # load dataset
        actual, mask, metadata = self.load(mask_zeros)
        super().__init__(target=actual,
                         mask=mask,
                         freq=freq,
                         similarity_score="distance",
                         spatial_aggregation="sum",
                         temporal_aggregation="mean",
                         name=name)
        self.add_covariate('metadata', metadata, pattern='n f')

    @property
    def raw_file_names(self):
        return [f'{zone}.h5' for zone in self.zones]

    @property
    def required_file_names(self):
        return self.raw_file_names
    def download(self) -> None:
        for zone in self.zones:
            download_url(self.urls[zone], self.root_dir,
                         filename=f'{zone}.h5')
    def load_raw(self):
        self.maybe_download()
        actual, metadata = [], []
        for zone in self.zones:
            # load zone data
            zone_path = os.path.join(self.root_dir, f'{zone}.h5')
            actual.append(pd.read_hdf(zone_path, key='actual'))
            metadata.append(pd.read_hdf(zone_path, key='metadata'))
        # concat zones and sort by plant id
        actual = pd.concat(actual, axis=1).sort_index(axis=1, level=0)
        metadata = pd.concat(metadata, axis=0).sort_index()
        # drop duplicated farms when loading whole dataset
        if len(self.zones) == 2:
            duplicated_farms = metadata.index[[
                s_id.endswith('-east') for s_id in metadata.state_id
            ]]
            metadata = metadata.drop(duplicated_farms, axis=0)
            actual = actual.drop(duplicated_farms, axis=1, level=0)
        return actual, metadata
    def load(self, mask_zeros):
        actual, metadata = self.load_raw()
        mask = (actual > 0) if mask_zeros else None
        return actual, mask, metadata
    def compute_similarity(self, method: str, theta: float = 150, **kwargs):
        if method == "distance":
            from tsl.ops.similarities import (gaussian_kernel,
                                              geographical_distance)
            # compute distances from latitude and longitude degrees
            loc_coord = self.metadata.loc[:, ['lat', 'lon']].values
            dist = geographical_distance(loc_coord, to_rad=True)
            return gaussian_kernel(dist, theta=theta)
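
A minimal usage sketch (not part of the module) showing how the class above can be instantiated and queried. It assumes that PvUS is re-exported from tsl.datasets and that the base DatetimeDataset interface exposes the length and n_nodes attributes and a get_similarity dispatcher that forwards to compute_similarity; downloading the data requires network access.

# Illustrative usage sketch; attribute/method names below are assumed from
# the tsl dataset interface, not defined in this module.
from tsl.datasets import PvUS

# Load only the "west" zone (1082 plants) and mask out night-time zeros.
dataset = PvUS(zones='west', mask_zeros=True)
print(dataset.length, dataset.n_nodes)  # expected: 52560 1082

# Pairwise node similarity from plant coordinates, as computed by the
# `compute_similarity` method above (Gaussian kernel over geographical
# distances).
sim = dataset.get_similarity('distance')
print(sim.shape)  # (n_nodes, n_nodes)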