import pandas as pd
import pandera.pandas as pa
from pandera.typing import Series
class TrainingDataSchema(pa.DataFrameModel):
    """Column-level validation rules for the training dataframe.

    Each annotated attribute declares one column: the ``Series[...]``
    annotation gives the expected dtype and ``pa.Field`` attaches the
    value checks for that column.
    """

    # Must be strictly greater than zero; missing values are rejected.
    feature1: Series[float] = pa.Field(
        gt=0,
        nullable=False,
    )

    # Must lie between 0 and 100 (pandera's in_range includes both
    # endpoints unless told otherwise).
    feature2: Series[int] = pa.Field(
        in_range={"min_value": 0, "max_value": 100},
    )

    # Binary target: only the values 0 and 1 are accepted.
    label: Series[int] = pa.Field(isin=[0, 1])
# Load the data and validate it against TrainingDataSchema.
df = pd.read_csv("some_data.csv")
try:
    # lazy=True asks pandera to run every check and report all failures
    # together in one SchemaErrors, rather than stopping at the first.
    TrainingDataSchema.validate(df, lazy=True)
except pa.errors.SchemaErrors as err:
    print(err.failure_cases)  # full table of failing checks/values
    print(err.failure_cases.shape)  # (rows, columns): how many failures
else:
    # Reached only when validate() raised nothing.
    print("validation passed")
# Keywords: validation, schema definition