Module `hela.datasets.pandas_parquet_dataset`

Expand source code

import pandas as pd
from hela import BaseDataset
from hela._column_classes import _ColumnType
from typing import Optional, Sequence, Set, Union, Dict, Any
from datetime import date
from pathlib import Path


class PandasParquetDataset(BaseDataset):
    """Parquet-backed dataset that is read and written through pandas.

    The dataset is registered with ``data_type='parquet'``; every other
    constructor argument is forwarded unchanged to ``BaseDataset``.
    """

    def __init__(
        self,
        name: str,
        folder: Optional[Union[str, Path]] = None,
        description: Optional[str] = None,
        rich_description_path: Optional[str] = None,
        partition_cols: Optional[Sequence[str]] = None,
        columns: Optional[Sequence[_ColumnType]] = None
    ) -> None:
        """Create the dataset definition.

        Args:
            name: The name of the dataset.
            folder: Optional folder of the dataset, forwarded to
                ``BaseDataset``.
            description: A description of the dataset.
            rich_description_path: Path to a longer description file.
            partition_cols: Column names used for partitioning on write.
            columns: Column definitions of the dataset.
        """
        super().__init__(
            name=name,
            data_type='parquet',
            folder=folder,
            description=description,
            rich_description_path=rich_description_path,
            columns=columns,
            partition_cols=partition_cols
        )

    def write(self, df: pd.DataFrame) -> int:
        """Write ``df`` as parquet to the dataset path.

        Args:
            df: The frame to persist.

        Returns:
            The number of rows in ``df``.

        Raises:
            FileNotFoundError: If the dataset has no path.
        """
        # With path=None pandas returns the serialized bytes instead of
        # writing anything, so the data would be silently discarded.
        # Fail the same way ``load`` does.
        if self.path is None:
            raise FileNotFoundError('No path has been specified for this dataset.')
        df.to_parquet(self.path, partition_cols=self.partition_cols)
        return len(df)

    def load(self) -> pd.DataFrame:
        """Read the whole dataset into a DataFrame.

        Raises:
            FileNotFoundError: If the dataset has no path.
        """
        if self.path is None:
            raise FileNotFoundError('No path has been specified for this dataset.')
        return pd.read_parquet(self.path)

    def get_dates(self) -> Optional[Set[date]]:
        """Collect the dates encoded in the names of the dataset's entries.

        For every entry directly under the dataset path, the text after the
        last ``=`` of the entry's stem is parsed as an ISO date, e.g.
        ``date=2021-01-01`` or ``2021-01-01.parquet``.

        Returns:
            The set of parsed dates, or None when the path is unset or is
            not an existing directory.
        """
        # is_dir() is False for missing paths and for a dataset stored as a
        # single file (written without partition columns).
        if self.path is None or not self.path.is_dir():
            return None
        dates = set()
        for entry in self.path.iterdir():
            try:
                dates.add(date.fromisoformat(entry.stem.split('=')[-1]))
            except ValueError:
                # Not a date entry: marker/metadata files, null partitions
                # or partitions of a non-date column.
                continue
        return dates

    def get_samples(self) -> Optional[Dict[str, Any]]:
        """Return one sample value per column.

        Returns:
            A dict mapping each column label to its first non-null value
            (None when the column has no non-null value), or None when
            loading raises FileNotFoundError.
        """
        try:
            df = self.load()
        except FileNotFoundError:
            return None
        samples = {}
        for column in df.columns:
            values = df[column]
            # first_valid_index() is None for all-null or empty columns;
            # indexing with it would raise KeyError.
            first = values.first_valid_index()
            samples[column] = None if first is None else values[first]
        return samples

Classes

class PandasParquetDataset (name: str, folder: Union[str, pathlib.Path, ForwardRef(None)] = None, description: Optional[str] = None, rich_description_path: Optional[str] = None, partition_cols: Optional[Sequence[str]] = None, columns: Optional[Sequence[hela._column_classes._ColumnType]] = None)

Abstract Dataset class to be used when building your own datasets.

If you choose to build data interactivity through the data catalog, it is within your own dataset classes that you would build authentication and connection logic.

For full usage of the available catalog features implement the functions BaseDataset.get_samples and BaseDataset.get_dates.

Attributes

name: The name of the dataset
data_type: The data type of the dataset e.g. "parquet" or "bigquery"
description: A description of the dataset as a string
partition_cols: A list of column names to be used for partitioning as strings
rich_description_path: A path to a markdown file with possibilities for longer, more detailed descriptions. Primarily used for generated catalog web page.
columns: A list of class ColumnType objects defining the columns of the dataset
path: The path to the dataset (combination of folder and name)

Expand source code

class PandasParquetDataset(BaseDataset):
    """Parquet-backed dataset that is read and written through pandas.

    The dataset is registered with ``data_type='parquet'``; every other
    constructor argument is forwarded unchanged to ``BaseDataset``.
    """

    def __init__(
        self,
        name: str,
        folder: Optional[Union[str, Path]] = None,
        description: Optional[str] = None,
        rich_description_path: Optional[str] = None,
        partition_cols: Optional[Sequence[str]] = None,
        columns: Optional[Sequence[_ColumnType]] = None
    ) -> None:
        """Create the dataset definition.

        Args:
            name: The name of the dataset.
            folder: Optional folder of the dataset, forwarded to
                ``BaseDataset``.
            description: A description of the dataset.
            rich_description_path: Path to a longer description file.
            partition_cols: Column names used for partitioning on write.
            columns: Column definitions of the dataset.
        """
        super().__init__(
            name=name,
            data_type='parquet',
            folder=folder,
            description=description,
            rich_description_path=rich_description_path,
            columns=columns,
            partition_cols=partition_cols
        )

    def write(self, df: pd.DataFrame) -> int:
        """Write ``df`` as parquet to the dataset path.

        Args:
            df: The frame to persist.

        Returns:
            The number of rows in ``df``.

        Raises:
            FileNotFoundError: If the dataset has no path.
        """
        # With path=None pandas returns the serialized bytes instead of
        # writing anything, so the data would be silently discarded.
        # Fail the same way ``load`` does.
        if self.path is None:
            raise FileNotFoundError('No path has been specified for this dataset.')
        df.to_parquet(self.path, partition_cols=self.partition_cols)
        return len(df)

    def load(self) -> pd.DataFrame:
        """Read the whole dataset into a DataFrame.

        Raises:
            FileNotFoundError: If the dataset has no path.
        """
        if self.path is None:
            raise FileNotFoundError('No path has been specified for this dataset.')
        return pd.read_parquet(self.path)

    def get_dates(self) -> Optional[Set[date]]:
        """Collect the dates encoded in the names of the dataset's entries.

        For every entry directly under the dataset path, the text after the
        last ``=`` of the entry's stem is parsed as an ISO date, e.g.
        ``date=2021-01-01`` or ``2021-01-01.parquet``.

        Returns:
            The set of parsed dates, or None when the path is unset or is
            not an existing directory.
        """
        # is_dir() is False for missing paths and for a dataset stored as a
        # single file (written without partition columns).
        if self.path is None or not self.path.is_dir():
            return None
        dates = set()
        for entry in self.path.iterdir():
            try:
                dates.add(date.fromisoformat(entry.stem.split('=')[-1]))
            except ValueError:
                # Not a date entry: marker/metadata files, null partitions
                # or partitions of a non-date column.
                continue
        return dates

    def get_samples(self) -> Optional[Dict[str, Any]]:
        """Return one sample value per column.

        Returns:
            A dict mapping each column label to its first non-null value
            (None when the column has no non-null value), or None when
            loading raises FileNotFoundError.
        """
        try:
            df = self.load()
        except FileNotFoundError:
            return None
        samples = {}
        for column in df.columns:
            values = df[column]
            # first_valid_index() is None for all-null or empty columns;
            # indexing with it would raise KeyError.
            first = values.first_valid_index()
            samples[column] = None if first is None else values[first]
        return samples

Ancestors

hela._base_dataset.BaseDataset
abc.ABC

Methods

def get_dates(self) ‑> Optional[Set[datetime.date]]

Implement this function for date inspection functionality such as BaseDataset.show_dates.

Should return a set of dates when called or None if dates for some reason could not be fetched.

Expand source code

def get_dates(self) -> Optional[Set[date]]:
    """Collect the dates encoded in the names of the dataset's entries.

    For every entry directly under ``self.path``, the text after the last
    ``=`` of the entry's stem is parsed as an ISO date, e.g.
    ``date=2021-01-01`` or ``2021-01-01.parquet``.

    Returns:
        The set of parsed dates, or None when the path is unset or is not
        an existing directory.
    """
    # is_dir() is False for missing paths and for a dataset stored as a
    # single file, where iterdir() would raise NotADirectoryError.
    if self.path is None or not self.path.is_dir():
        return None
    dates = set()
    for entry in self.path.iterdir():
        try:
            dates.add(date.fromisoformat(entry.stem.split('=')[-1]))
        except ValueError:
            # Not a date entry: marker/metadata files, null partitions or
            # partitions of a non-date column.
            continue
    return dates

def get_samples(self) ‑> Optional[Dict[str, Any]]

Implement this function for sample inspection functionality used in e.g. BaseDataset.show_columns.

Should return a dictionary of string keys for column names with samples:

>>> {'my_column': 123}

Nested columns should return names with dot-notation:

>>> {'parent_column.my_column': 123}

Or None if samples could not be fetched:

>>> None

Expand source code

def get_samples(self) -> Optional[Dict[str, Any]]:
    """Return one sample value per column.

    Returns:
        A dict mapping each column label to its first non-null value (None
        when the column has no non-null value), or None when loading raises
        FileNotFoundError.
    """
    # Only the load can legitimately raise FileNotFoundError; keep the try
    # body limited to it.
    try:
        df = self.load()
    except FileNotFoundError:
        return None
    samples = {}
    for column in df.columns:
        values = df[column]
        # first_valid_index() is None for all-null or empty columns;
        # indexing with it would raise KeyError.
        first = values.first_valid_index()
        samples[column] = None if first is None else values[first]
    return samples

def load(self) ‑> pandas.core.frame.DataFrame

Expand source code

def load(self) -> pd.DataFrame:
    """Read the dataset at ``self.path`` into a DataFrame.

    Raises:
        FileNotFoundError: If ``self.path`` is None.
    """
    location = self.path
    if location is not None:
        return pd.read_parquet(location)
    raise FileNotFoundError('No path has been specified for this dataset.')

def write(self, df: pandas.core.frame.DataFrame) ‑> int

Expand source code

def write(self, df: pd.DataFrame) -> int:
    """Write ``df`` as parquet to ``self.path``.

    Args:
        df: The frame to persist; partitioned by ``self.partition_cols``.

    Returns:
        The number of rows in ``df``.

    Raises:
        FileNotFoundError: If ``self.path`` is None.
    """
    # With path=None pandas returns the serialized bytes instead of writing
    # anything, so the data would be silently discarded while a row count is
    # still reported. Fail the same way ``load`` does.
    if self.path is None:
        raise FileNotFoundError('No path has been specified for this dataset.')
    df.to_parquet(self.path, partition_cols=self.partition_cols)
    return len(df)