Source code for caikit.interfaces.ts.data_model.timeseries_evaluation

# Copyright The Caikit Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The core data model object for a TimeSeries Evaluator.
"""
# Standard
from typing import List, Union

# Third Party
import pandas as pd

# First Party
from py_to_proto.dataclass_to_proto import (  # Annotated imported from here for compatibility
    Annotated,
    FieldNumber,
    OneofField,
)
import alog

# Local
from ....core import DataObjectBase
from ....core.data_model import ProducerId, dataobject
from ....core.exceptions import error_handler
from .package import TS_PACKAGE

log = alog.use_channel("TSEDM")
error = error_handler.get(log)

## TimeSeries Evaluator ##################################################################


@dataobject(package=TS_PACKAGE)
class Id(DataObjectBase):
    """An identifier that is either a text label or an integer index.

    The two alternatives are declared as a oneof ("text" / "index") on a
    dedicated data object, so that an identifier of either kind can be used
    as the element type of a repeated (list) field.
    """

    # Exactly one of the two alternatives is carried by an instance.
    value: Union[
        Annotated[str, OneofField("text"), FieldNumber(1)],
        Annotated[int, OneofField("index"), FieldNumber(2)],
    ]
@dataobject(package=TS_PACKAGE)
class EvaluationRecord(DataObjectBase):
    """One row of an EvaluationResult.

    A record keeps the id values, the metric values and the (optional)
    offset of a single dataframe row, for example:

    EvaluationRecord{id_values=["A", "B"], metric_values=[0.234, 0.568, 0.417], offset="overall"}
    """

    id_values: Annotated[List[Id], FieldNumber(1)]
    metric_values: Annotated[List[float], FieldNumber(2)]
    offset: Annotated[Id, FieldNumber(3)]

    def __init__(self, id_values=None, metric_values=None, offset=None):
        """Build a record, wrapping raw str/int identifiers in Id objects.

        Args:
            id_values: list(str | int | Id) | None
                Identifier values for the row. Entries that are not already
                Id instances are wrapped in Id. None is stored as an empty
                list.
            metric_values: list(float)
                Metric results for the row. Validated as a collection of
                floats without the allow_none flag, so callers are expected
                to supply it even though the parameter defaults to None.
            offset: str | int | Id | None
                (optional) Offset associated with the row. A raw str/int is
                wrapped in Id; None is kept as None.
        """
        error.type_check_all(
            "<COR26895394E>", str, int, Id, allow_none=True, id_values=id_values
        )
        error.type_check_all("<COR25875394E>", float, metric_values=metric_values)
        error.type_check("<COR25873484E>", str, int, Id, allow_none=True, offset=offset)

        super().__init__()

        def as_id(raw):
            # Leave existing Id objects untouched; wrap anything else.
            return raw if isinstance(raw, Id) else Id(raw)

        if id_values is None:
            self.id_values = []
        else:
            self.id_values = [as_id(raw) for raw in id_values]

        self.metric_values = metric_values

        if offset is None:
            self.offset = None
        else:
            self.offset = as_id(offset)
@dataobject(package=TS_PACKAGE)
class EvaluationResult(DataObjectBase):
    """Evaluation results stored as a list of per-row records.

    Every row of the evaluation dataframe is kept as one EvaluationRecord.
    The names of the id, metric and offset columns are stored alongside the
    records so that the dataframe can be rebuilt with as_pandas().
    """

    records: Annotated[List[EvaluationRecord], FieldNumber(1)]
    id_cols: Annotated[List[str], FieldNumber(2)]
    metric_cols: Annotated[List[str], FieldNumber(3)]
    offset_col: Annotated[str, FieldNumber(4)]
    producer_id: Annotated[ProducerId, FieldNumber(5)]

    def __init__(
        self,
        records=None,
        id_cols=None,
        metric_cols=None,
        offset_col=None,
        df=None,
        producer_id=None,
    ):
        """Construct a new EvaluationResult from records or from a dataframe.

        When df is given, the records are built from its rows and the
        records argument is ignored; otherwise records is validated and
        stored as-is.

        Args:
            records: list(EvaluationRecord)
                Evaluation records; only used when df is None
            id_cols: list(str) | None
                (optional) Names of the id columns. None is stored as an
                empty list.
            metric_cols: list(str)
                Names of the metric value columns
            offset_col: str | None
                (optional) Name of the offset column in the dataframe
            df: pandas.DataFrame | None
                (optional) Dataframe whose rows are converted to records.
                Rows are read in positional order, whatever the index is.
            producer_id: ProducerId | tuple | None
                The module that produced this evaluation result.
        """
        error.type_check_all("<COR25782594E>", str, allow_none=True, id_cols=id_cols)
        error.type_check_all("<COR28634484E>", str, metric_cols=metric_cols)
        error.type_check("<COR28485384E>", str, allow_none=True, offset_col=offset_col)
        error.type_check(
            "<COR28485385E>",
            tuple,
            ProducerId,
            allow_none=True,
            producer_id=producer_id,
        )

        super().__init__()
        self.id_cols = [] if id_cols is None else id_cols
        self.metric_cols = metric_cols
        self.offset_col = offset_col
        self.producer_id = producer_id

        if df is not None:
            if self.offset_col is not None:
                error.value_check(
                    "<COR28484474E>",
                    self.offset_col in df.columns,
                    f"Specified '{self.offset_col}' offset column not in dataframe",
                )

            # The rows are addressed by position (0..n-1) through the
            # label-based .loc accessor, so the frame needs a default integer
            # index. reset_index returns a new frame, leaving the caller's
            # dataframe untouched, and is a no-op for frames that already
            # have the default index.
            rows = df.reset_index(drop=True)
            has_id_cols = len(self.id_cols) != 0
            has_offset_col = self.offset_col is not None

            self.records = [
                EvaluationRecord(
                    id_values=(
                        rows.loc[i, self.id_cols].values.tolist()
                        if has_id_cols
                        else None
                    ),
                    metric_values=rows.loc[i, self.metric_cols].values.tolist(),
                    offset=rows.loc[i, self.offset_col] if has_offset_col else None,
                )
                for i in range(len(rows))
            ]
        else:
            error.type_check_all("<COR32696407E>", EvaluationRecord, records=records)
            self.records = records

    def as_pandas(self) -> "pd.DataFrame":
        """Generate and return a pandas DataFrame.

        The columns are the id columns, then the metric columns, then the
        offset column. The offset column is only present when at least one
        record carries an offset.

        Returns:
            pd.DataFrame
                One row per record.
        """
        rows = []
        has_offset = False
        for record in self.records:
            offset = None
            if record.offset is not None:
                offset = record.offset.value
                has_offset = True
            # Unpacking (rather than list + list) also accepts non-list
            # sequences of metric values.
            rows.append(
                [*(v.value for v in record.id_values), *record.metric_values, offset]
            )

        columns = [*self.id_cols, *self.metric_cols]
        if has_offset:
            columns.append(self.offset_col)
        else:
            # No record has an offset: strip the trailing placeholder from
            # every row instead of creating the column and dropping it again.
            rows = [row[:-1] for row in rows]

        return pd.DataFrame(rows, columns=columns)