Module hela.datasets.pandas_parquet_dataset
Expand source code
import pandas as pd
from hela import BaseDataset
from hela._column_classes import _ColumnType
from typing import Optional, Sequence, Set, Union, Dict, Any
from datetime import date
from pathlib import Path
class PandasParquetDataset(BaseDataset):
    """Parquet-backed dataset that is read and written through pandas.

    The dataset location is exposed by the base class as ``self.path``; it is
    ``None`` when no folder was given, and every method below handles that
    case explicitly.

    Args:
        name: The name of the dataset.
        folder: Folder in which the dataset is stored, or ``None``.
        description: A short description of the dataset.
        rich_description_path: Path to a markdown file with a longer
            description.
        partition_cols: Column names used to partition the parquet output.
        columns: Column definitions of the dataset.
    """

    def __init__(
        self,
        name: str,
        folder: Optional[Union[str, Path]] = None,
        description: Optional[str] = None,
        rich_description_path: Optional[str] = None,
        partition_cols: Optional[Sequence[str]] = None,
        columns: Optional[Sequence[_ColumnType]] = None
    ) -> None:
        # The data type is fixed to 'parquet'; everything else is forwarded.
        super().__init__(
            name=name,
            data_type='parquet',
            folder=folder,
            description=description,
            rich_description_path=rich_description_path,
            columns=columns,
            partition_cols=partition_cols
        )

    def write(self, df: pd.DataFrame) -> int:
        """Write ``df`` as parquet to the dataset path.

        Args:
            df: The frame to persist.

        Returns:
            The number of rows in ``df``.

        Raises:
            FileNotFoundError: If no path has been specified for this dataset.
        """
        # Same guard (and same error) as ``load``: without a path nothing
        # would be written to disk, yet a row count would still be reported.
        if self.path is None:
            raise FileNotFoundError('No path has been specified for this dataset.')
        df.to_parquet(self.path, partition_cols=self.partition_cols)
        return len(df)

    def load(self) -> pd.DataFrame:
        """Read the whole dataset into a DataFrame.

        Raises:
            FileNotFoundError: If no path has been specified for this dataset.
        """
        if self.path is None:
            raise FileNotFoundError('No path has been specified for this dataset.')
        return pd.read_parquet(self.path)

    def get_dates(self) -> Optional[Set[date]]:
        """Collect the dates encoded in the names of the dataset's entries.

        For every entry directly under the dataset path, the text after the
        last ``=`` of its stem is parsed as an ISO date (so an entry named
        like ``some_col=2021-01-31`` yields ``date(2021, 1, 31)``).

        Returns:
            The set of dates found, or ``None`` when there is no path or the
            path is not an existing directory.
        """
        # ``is_dir`` is False for a missing path and for a single-file
        # dataset, where ``iterdir`` would raise NotADirectoryError.
        if self.path is None or not self.path.is_dir():
            return None

        dates: Set[date] = set()
        for entry in self.path.iterdir():
            try:
                dates.add(date.fromisoformat(entry.stem.split('=')[-1]))
            except ValueError:
                # Not a date-named entry (e.g. a metadata or marker file):
                # skip it instead of failing the whole lookup.
                continue
        return dates

    def get_samples(self) -> Optional[Dict[str, Any]]:
        """Return one sample value per column, keyed by column name.

        The sample is the first non-null value of each column; a column
        without any non-null value gets a null sample.

        Returns:
            A mapping of column name to sample value, or ``None`` when the
            dataset could not be loaded (``FileNotFoundError``).
        """
        try:
            df = self.load()
        except FileNotFoundError:
            return None

        def _first_non_null(column: pd.Series) -> Any:
            # Positional access avoids a KeyError on all-null columns and
            # ambiguous lookups when the index has duplicate labels.
            non_null = column.dropna()
            return non_null.iloc[0] if len(non_null) else None

        return df.apply(_first_non_null, axis=0).to_dict()
Classes
class PandasParquetDataset (name: str, folder: Union[str, pathlib.Path, ForwardRef(None)] = None, description: Optional[str] = None, rich_description_path: Optional[str] = None, partition_cols: Optional[Sequence[str]] = None, columns: Optional[Sequence[hela._column_classes._ColumnType]] = None)-
Abstract dataset class to be used when building your own datasets.
If you choose to build data interactivity through the data catalog, your own dataset classes are where you would build authentication and connection logic.
For full usage of the available catalog features, implement the functions
BaseDataset.get_samples and BaseDataset.get_dates.
Attributes
name- The name of the dataset
data_type- The data type of the dataset, e.g. "parquet" or "bigquery"
description- A description of the dataset as a string
partition_cols- A list of column names to be used for partitioning as strings
rich_description_path- A path to a markdown file with possibilities for longer, more detailed descriptions. Primarily used for generated catalog web page.
columns- A list of class ColumnType objects defining the columns of the dataset
path- The path to the dataset (combination of folder and name)
Expand source code
class PandasParquetDataset(BaseDataset):
    def __init__(
        self,
        name: str,
        folder: Optional[Union[str, Path]] = None,
        description: Optional[str] = None,
        rich_description_path: Optional[str] = None,
        partition_cols: Optional[Sequence[str]] = None,
        columns: Optional[Sequence[_ColumnType]] = None
    ) -> None:
        super().__init__(
            name=name,
            data_type='parquet',
            folder=folder,
            description=description,
            rich_description_path=rich_description_path,
            columns=columns,
            partition_cols=partition_cols
        )

    def write(self, df: pd.DataFrame) -> int:
        df.to_parquet(self.path, partition_cols=self.partition_cols)
        return len(df)

    def load(self) -> pd.DataFrame:
        if self.path is None:
            raise FileNotFoundError('No path has been specified for this dataset.')
        return pd.read_parquet(self.path)

    def get_dates(self) -> Optional[Set[date]]:
        if self.path is None:
            return None
        if not self.path.exists():
            return None
        return set([
            date.fromisoformat(p.stem.split('=')[-1])
            for p in self.path.iterdir()
        ])

    def get_samples(self) -> Optional[Dict[str, Any]]:
        try:
            return self.load().apply(lambda x: x[x.first_valid_index()], axis=0).to_dict()
        except FileNotFoundError:
            return None

Ancestors
- hela._base_dataset.BaseDataset
- abc.ABC
Methods
def get_dates(self) ‑> Optional[Set[datetime.date]]-
Implement this function for date inspection functionality such as
BaseDataset.show_dates.
Should return a set of dates when called, or None if the dates could not be fetched for some reason.
Expand source code
def get_dates(self) -> Optional[Set[date]]:
    if self.path is None:
        return None
    if not self.path.exists():
        return None
    return set([
        date.fromisoformat(p.stem.split('=')[-1])
        for p in self.path.iterdir()
    ])

def get_samples(self) ‑> Optional[Dict[str, Any]]-
Implement this function for sample inspection functionality used in e.g.
BaseDataset.show_columns.
Should return a dictionary of string keys for column names with samples:
>>> {'my_column': 123}
Nested columns should return names with dot-notation:
>>> {'parent_column.my_column': 123}
Or None if samples could not be fetched:
>>> None
Expand source code
def get_samples(self) -> Optional[Dict[str, Any]]:
    try:
        return self.load().apply(lambda x: x[x.first_valid_index()], axis=0).to_dict()
    except FileNotFoundError:
        return None

def load(self) ‑> pandas.core.frame.DataFrame-
Expand source code
def load(self) -> pd.DataFrame:
    if self.path is None:
        raise FileNotFoundError('No path has been specified for this dataset.')
    return pd.read_parquet(self.path)

def write(self, df: pandas.core.frame.DataFrame) ‑> int-
Expand source code
def write(self, df: pd.DataFrame) -> int:
    """Write ``df`` as parquet to the dataset path.

    Args:
        df: The frame to persist.

    Returns:
        The number of rows in ``df``.

    Raises:
        FileNotFoundError: If no path has been specified for this dataset.
    """
    # Mirror the guard used by ``load``: without a path nothing would be
    # written to disk, yet a row count would still be reported.
    if self.path is None:
        raise FileNotFoundError('No path has been specified for this dataset.')
    df.to_parquet(self.path, partition_cols=self.partition_cols)
    return len(df)