New Research: Supply Chain Attack on Axios Pulls Malicious Dependency from npm. Details →
Socket
Book a DemoSign in
Socket

pointblank

Package Overview
Dependencies
Maintainers
1
Versions
54
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

pointblank - pypi Package Compare versions

Comparing version
0.17.0
to
0.18.0
+120
pointblank/_agg.py
from __future__ import annotations
import itertools
from collections.abc import Callable
from typing import Any
import narwhals as nw
# TODO: Should take any frame type
# Type aliases for registry entries: an Aggregator reduces a (single-column)
# narwhals DataFrame to one scalar; a Comparator checks a realized aggregate
# against lower/upper bounds -> passed.
Aggregator = Callable[[nw.DataFrame], float | int]
Comparator = Callable[[Any, Any, Any], bool]
# Populated at import time by the @register decorator; keys are function
# names with their "agg_"/"comp_" prefixes stripped.
AGGREGATOR_REGISTRY: dict[str, Aggregator] = {}
COMPARATOR_REGISTRY: dict[str, Comparator] = {}


def register(fn):
    """Register *fn* as an aggregator or comparator.

    The target registry is chosen from the function-name prefix
    (``agg_`` or ``comp_``) and the entry is keyed by the name with that
    prefix removed.  *fn* is returned unchanged so this can be used as a
    decorator.
    """
    fn_name: str = fn.__name__
    for prefix, registry in (
        ("comp_", COMPARATOR_REGISTRY),
        ("agg_", AGGREGATOR_REGISTRY),
    ):
        if fn_name.startswith(prefix):
            registry[fn_name.removeprefix(prefix)] = fn
            return fn
    raise NotImplementedError  # pragma: no cover
## Aggregator Functions
@register
def agg_sum(column: nw.DataFrame) -> float:
    """Sum of the single selected column, returned as a Python scalar."""
    summed = column.select(nw.all().sum())
    return summed.item()
@register
def agg_avg(column: nw.DataFrame) -> float:
    """Arithmetic mean of the single selected column, as a Python scalar."""
    averaged = column.select(nw.all().mean())
    return averaged.item()
@register
def agg_sd(column: nw.DataFrame) -> float:
    """Standard deviation of the single selected column, as a Python scalar."""
    spread = column.select(nw.all().std())
    return spread.item()
## Comparator functions:
@register
def comp_eq(real: float, lower: float, upper: float) -> bool:
    """Equality check against the bounds.

    When *lower* and *upper* coincide this is an exact equality test;
    otherwise *real* only needs to fall inside the closed interval
    (tolerance widened the bounds).
    """
    if lower != upper:
        return _generic_between(real, lower, upper)
    return bool(real == lower)
@register
def comp_gt(real: float, lower: float, upper: float) -> bool:
    """Strictly-greater-than check against *lower*; *upper* is unused."""
    exceeds = real > lower
    return bool(exceeds)
@register
def comp_ge(real: float, lower: float, upper: float) -> bool:
    """At-least check against *lower*; *upper* is unused.

    The ``real`` parameter was annotated ``Any`` while every sibling
    comparator uses ``float``; normalized to ``float`` for consistency.
    """
    return bool(real >= lower)
@register
def comp_lt(real: float, lower: float, upper: float) -> bool:
    """Strictly-less-than check against *upper*; *lower* is unused."""
    below = real < upper
    return bool(below)
@register
def comp_le(real: float, lower: float, upper: float) -> bool:
    """At-most check against *upper*; *lower* is unused."""
    within = real <= upper
    return bool(within)
def _generic_between(real: Any, lower: Any, upper: Any) -> bool:
    """Return True when *real* lies in the closed interval [lower, upper]."""
    in_range = (lower <= real) and (real <= upper)
    return bool(in_range)
def resolve_agg_registries(name: str) -> tuple[Aggregator, Comparator]:
    """Resolve the assertion name to a valid aggregator

    Args:
        name (str): The name of the assertion.

    Returns:
        tuple[Aggregator, Comparator]: The aggregator and comparator functions.
    """
    # The last two underscore-separated parts encode <agg>_<comp>,
    # e.g. "col_sum_eq" -> ("sum", "eq").
    stem = name.removeprefix("col_")
    agg_name, comp_name = stem.split("_")[-2:]
    resolved = []
    for registry, key, label in (
        (AGGREGATOR_REGISTRY, agg_name, "Aggregator"),
        (COMPARATOR_REGISTRY, comp_name, "Comparator"),
    ):
        fn = registry.get(key)
        if fn is None:  # pragma: no cover
            raise ValueError(f"{label} '{key}' not found in registry.")
        resolved.append(fn)
    aggregator, comparator = resolved
    return aggregator, comparator
def is_valid_agg(name: str) -> bool:
    """Report whether *name* resolves to registered aggregator/comparator functions."""
    try:
        resolve_agg_registries(name)
    except ValueError:
        return False
    return True
def load_validation_method_grid() -> tuple[str, ...]:
    """Generate all possible validation methods.

    Returns:
        tuple[str, ...]: Every ``col_<agg>_<comp>`` name formed from the
        cross product of registered aggregators and comparators.
    """
    # A generator expression replaces the manual append loop (PERF401);
    # iterating the dicts directly yields their keys, so .keys() is redundant.
    return tuple(
        f"col_{agg_name}_{comp_name}"
        for agg_name, comp_name in itertools.product(AGGREGATOR_REGISTRY, COMPARATOR_REGISTRY)
    )
from pointblank import Actions, Thresholds
from pointblank._utils import _PBUnresolvedColumn
from pointblank.column import Column, ReferenceColumn
from pointblank._typing import Tolerance
from collections.abc import Collection
from dataclasses import dataclass
from great_tables import GT
from narwhals.typing import FrameT, IntoFrame
from pathlib import Path
from pointblank._typing import SegmentSpec, Tolerance
from pointblank._utils import _PBUnresolvedColumn
from pointblank.column import Column, ColumnSelector, ColumnSelectorNarwhals, ReferenceColumn
from pointblank.schema import Schema
from pointblank.thresholds import Actions, FinalActions, Thresholds
from typing import Any, Callable, Literal, ParamSpec, TypeVar
# Names re-exported as the package's public API.
__all__ = [
    "Validate",
    "load_dataset",
    "read_file",
    "write_file",
    "config",
    "connect_to_table",
    "print_database_tables",
    "preview",
    "missing_vals_tbl",
    "get_action_metadata",
    "get_column_count",
    "get_data_path",
    "get_row_count",
    "get_validation_summary",
]
# Generic typing helpers (parameter spec + return type) for wrapper signatures.
P = ParamSpec("P")
R = TypeVar("R")
# Stub: returns a metadata dict, or None.
# NOTE(review): exact payload not visible here — confirm against implementation.
def get_action_metadata() -> dict | None: ...
# Stub: returns a validation-summary dict, or None.
def get_validation_summary() -> dict | None: ...
@dataclass
class PointblankConfig:
    """Stub of the global configuration object produced by ``config()``.

    Each flag toggles a section of rendered report/preview output; the
    fields mirror the parameters of ``config()``.
    """

    # Report rendering toggles.
    report_incl_header: bool = ...
    report_incl_footer: bool = ...
    report_incl_footer_timings: bool = ...
    report_incl_footer_notes: bool = ...
    # Preview rendering toggle.
    preview_incl_header: bool = ...
    def __repr__(self) -> str: ...
# Stub: build a PointblankConfig from the display flags; every header/footer
# section is enabled by default.
def config(
    report_incl_header: bool = True,
    report_incl_footer: bool = True,
    report_incl_footer_timings: bool = True,
    report_incl_footer_notes: bool = True,
    preview_incl_header: bool = True,
) -> PointblankConfig: ...
# Stub: load one of the bundled example datasets as the requested table type.
def load_dataset(
    dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
    tbl_type: Literal["polars", "pandas", "duckdb"] = "polars",
) -> FrameT | Any: ...
# Stub: read a previously written Validate object from disk.
def read_file(filepath: str | Path) -> Validate: ...
# Stub: serialize a Validate object to `path`/`filename`; optionally keep
# the attached table and row extracts.
def write_file(
    validation: Validate,
    filename: str,
    path: str | None = None,
    keep_tbl: bool = False,
    keep_extracts: bool = False,
    quiet: bool = False,
) -> None: ...
# Stub: filesystem path of a bundled dataset in the requested file format.
def get_data_path(
    dataset: Literal["small_table", "game_revenue", "nycflights", "global_sales"] = "small_table",
    file_type: Literal["csv", "parquet", "duckdb"] = "csv",
) -> str: ...
# Stub: render a head/tail preview of `data` as a great-tables GT object.
def preview(
    data: FrameT | Any,
    columns_subset: str | list[str] | Column | None = None,
    n_head: int = 5,
    n_tail: int = 5,
    limit: int = 50,
    show_row_numbers: bool = True,
    max_col_width: int = 250,
    min_tbl_width: int = 500,
    incl_header: bool | None = None,  # annotation widened: the default is None
) -> GT: ...
# Stub: GT table summarizing missing values in `data`.
def missing_vals_tbl(data: FrameT | Any) -> GT: ...
# Stub: number of columns in `data`.
def get_column_count(data: FrameT | Any) -> int: ...
# Stub: number of rows in `data`.
def get_row_count(data: FrameT | Any) -> int: ...
@dataclass
class _ValidationInfo:
    """Stub describing the per-step record kept for each validation step."""

    @classmethod
    def from_agg_validator(
        cls,
        assertion_type: str,
        columns: _PBUnresolvedColumn,
        value: float | Column | ReferenceColumn,
        tol: Tolerance = 0,
        thresholds: float | bool | tuple | dict | Thresholds | None = None,
        brief: str | bool = False,
        actions: Actions | None = None,
        active: bool = True,
    ) -> _ValidationInfo: ...
    # --- step identity ---
    i: int | None = ...
    i_o: int | None = ...
    step_id: str | None = ...
    sha1: str | None = ...
    assertion_type: str | None = ...
    # --- validation inputs ---
    column: Any | None = ...
    # Fixed: the builtin `any` was used where typing.Any was meant.
    values: Any | list[Any] | tuple | None = ...
    inclusive: tuple[bool, bool] | None = ...
    na_pass: bool | None = ...
    pre: Callable | None = ...
    segments: Any | None = ...
    thresholds: Thresholds | None = ...
    actions: Actions | None = ...
    label: str | None = ...
    brief: str | None = ...
    autobrief: str | None = ...
    active: bool | None = ...
    # --- interrogation results ---
    eval_error: bool | None = ...
    all_passed: bool | None = ...
    n: int | None = ...
    n_passed: int | None = ...
    n_failed: int | None = ...
    # Fractions of passing/failing units; `int` looked like a typo —
    # Validate.f_passed / Validate.f_failed report dict[int, float].
    f_passed: float | None = ...
    f_failed: float | None = ...
    warning: bool | None = ...
    error: bool | None = ...
    critical: bool | None = ...
    failure_text: str | None = ...
    tbl_checked: FrameT | None = ...
    extract: FrameT | None = ...
    val_info: dict[str, Any] | None = ...  # fixed: `any` -> `Any`
    time_processed: str | None = ...
    proc_duration_s: float | None = ...
    notes: dict[str, dict[str, str]] | None = ...
    def get_val_info(self) -> dict[str, Any]: ...  # fixed: `any` -> `Any`
    def _add_note(self, key: str, markdown: str, text: str | None = None) -> None: ...
    def _get_notes(self, format: str = "dict") -> dict[str, dict[str, str]] | list[str] | None: ...
    def _get_note(self, key: str, format: str = "dict") -> dict[str, str] | str | None: ...
    def _has_notes(self) -> bool: ...
# Stub: open a database table via a connection string.
def connect_to_table(connection_string: str) -> Any: ...
# Stub: list the table names reachable through a connection string.
def print_database_tables(connection_string: str) -> list[str]: ...
@dataclass
class Validate:
data: FrameT | Any
reference: IntoFrame | None = ...
tbl_name: str | None = ...
label: str | None = ...
thresholds: int | float | bool | tuple | dict | Thresholds | None = ...
actions: Actions | None = ...
final_actions: FinalActions | None = ...
brief: str | bool | None = ...
lang: str | None = ...
locale: str | None = ...
col_names = ...
col_types = ...
time_start = ...
time_end = ...
validation_info = ...
def __post_init__(self) -> None: ...
def _add_agg_validation(
self,
*,
assertion_type: str,
columns: str | Collection[str],
value,
tol: int = 0,
thresholds=None,
brief: bool = False,
actions=None,
active: bool = True,
): ...
def set_tbl(
self, tbl: FrameT | Any, tbl_name: str | None = None, label: str | None = None
) -> Validate: ...
def _repr_html_(self) -> str: ...
def col_vals_gt(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
value: float | int | Column,
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_lt(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
value: float | int | Column,
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_eq(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
value: float | int | Column,
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_ne(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
value: float | int | Column,
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_ge(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
value: float | int | Column,
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_le(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
value: float | int | Column,
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_between(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
left: float | int | Column,
right: float | int | Column,
inclusive: tuple[bool, bool] = (True, True),
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_outside(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
left: float | int | Column,
right: float | int | Column,
inclusive: tuple[bool, bool] = (True, True),
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_in_set(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
set: Collection[Any],
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_not_in_set(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
set: Collection[Any],
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_increasing(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
allow_stationary: bool = False,
decreasing_tol: float | None = None,
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_decreasing(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
allow_stationary: bool = False,
increasing_tol: float | None = None,
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_null(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_not_null(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_regex(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
pattern: str,
na_pass: bool = False,
inverse: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_within_spec(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
spec: str,
na_pass: bool = False,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_vals_expr(
self,
expr: Any,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_exists(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_pct_null(
self,
columns: str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals,
p: float,
tol: Tolerance = 0,
thresholds: int | float | None | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def rows_distinct(
self,
columns_subset: str | list[str] | None = None,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def rows_complete(
self,
columns_subset: str | list[str] | None = None,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def prompt(
self,
prompt: str,
model: str,
columns_subset: str | list[str] | None = None,
batch_size: int = 1000,
max_concurrent: int = 3,
pre: Callable | None = None,
segments: SegmentSpec | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_schema_match(
self,
schema: Schema,
complete: bool = True,
in_order: bool = True,
case_sensitive_colnames: bool = True,
case_sensitive_dtypes: bool = True,
full_match_dtypes: bool = True,
pre: Callable | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def row_count_match(
self,
count: int | FrameT | Any,
tol: Tolerance = 0,
inverse: bool = False,
pre: Callable | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def col_count_match(
self,
count: int | FrameT | Any,
inverse: bool = False,
pre: Callable | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def tbl_match(
self,
tbl_compare: FrameT | Any,
pre: Callable | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def conjointly(
self,
*exprs: Callable,
pre: Callable | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def specially(
self,
expr: Callable,
pre: Callable | None = None,
thresholds: int | float | bool | tuple | dict | Thresholds = None,
actions: Actions | None = None,
brief: str | bool | None = None,
active: bool = True,
) -> Validate: ...
def interrogate(
self,
collect_extracts: bool = True,
collect_tbl_checked: bool = True,
get_first_n: int | None = None,
sample_n: int | None = None,
sample_frac: int | float | None = None,
extract_limit: int = 500,
) -> Validate: ...
def all_passed(self) -> bool: ...
def assert_passing(self) -> None: ...
def assert_below_threshold(
self, level: str = "warning", i: int | None = None, message: str | None = None
) -> None: ...
def above_threshold(self, level: str = "warning", i: int | None = None) -> bool: ...
def n(self, i: int | list[int] | None = None, scalar: bool = False) -> dict[int, int] | int: ...
def n_passed(
self, i: int | list[int] | None = None, scalar: bool = False
) -> dict[int, int] | int: ...
def n_failed(
self, i: int | list[int] | None = None, scalar: bool = False
) -> dict[int, int] | int: ...
def f_passed(
self, i: int | list[int] | None = None, scalar: bool = False
) -> dict[int, float] | float: ...
def f_failed(
self, i: int | list[int] | None = None, scalar: bool = False
) -> dict[int, float] | float: ...
def warning(
self, i: int | list[int] | None = None, scalar: bool = False
) -> dict[int, bool] | bool: ...
def error(
self, i: int | list[int] | None = None, scalar: bool = False
) -> dict[int, bool] | bool: ...
def critical(
self, i: int | list[int] | None = None, scalar: bool = False
) -> dict[int, bool] | bool: ...
def get_data_extracts(
self, i: int | list[int] | None = None, frame: bool = False
) -> dict[int, FrameT | None] | FrameT | None: ...
def get_json_report(
self, use_fields: list[str] | None = None, exclude_fields: list[str] | None = None
) -> str: ...
def get_sundered_data(self, type: str = "pass") -> FrameT: ...
def get_notes(
self, i: int, format: str = "dict"
) -> dict[str, dict[str, str]] | list[str] | None: ...
def get_note(self, i: int, key: str, format: str = "dict") -> dict[str, str] | str | None: ...
def get_tabular_report(
self,
title: str | None = ":default:",
incl_header: bool | None = None,
incl_footer: bool | None = None,
incl_footer_timings: bool | None = None,
incl_footer_notes: bool | None = None,
) -> GT: ...
def get_step_report(
self,
i: int,
columns_subset: str | list[str] | Column | None = None,
header: str = ":default:",
limit: int | None = 10,
) -> GT: ...
def _add_validation(self, validation_info): ...
def _evaluate_column_exprs(self, validation_info): ...
def _evaluate_segments(self, validation_info): ...
def _get_validation_dict(self, i: int | list[int] | None, attr: str) -> dict[int, int]: ...
def _execute_final_actions(self) -> None: ...
def _get_highest_severity_level(self): ...
# === GENERATED START ===
def col_sum_eq(
self,
columns: _PBUnresolvedColumn,
value: float | Column | ReferenceColumn | None = None,
tol: Tolerance = 0,
thresholds: float | bool | tuple | dict | Thresholds | None = None,
brief: str | bool = False,
actions: Actions | None = None,
active: bool = True,
) -> Validate:
"""Assert the values in a column sum to a value eq some `value`.
Args:
columns (_PBUnresolvedColumn): Column or collection of columns to validate.
value (float | Column | ReferenceColumn | None): Target value to validate against.
If None and reference data is set on the Validate object, defaults to
ref(column) to compare against the same column in the reference data.
tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
the bounds. See examples for usage. Defaults to None.
brief (str | bool, optional): Explanation of validation operation. Defaults to False.
actions (Actions | None, optional): Actions to take after validation. Defaults to None.
active (bool, optional): Whether to activate the validation. Defaults to True.
Returns:
Validate: A `Validate` instance with the new validation method added.
Examples:
>>> import polars as pl
>>>
>>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
>>> v = Validate(data).col_sum_eq("a", 15)
>>> v.assert_passing()
"""
...
def col_sum_gt(
self,
columns: _PBUnresolvedColumn,
value: float | Column | ReferenceColumn | None = None,
tol: Tolerance = 0,
thresholds: float | bool | tuple | dict | Thresholds | None = None,
brief: str | bool = False,
actions: Actions | None = None,
active: bool = True,
) -> Validate:
"""Assert the values in a column sum to a value gt some `value`.
Args:
columns (_PBUnresolvedColumn): Column or collection of columns to validate.
value (float | Column | ReferenceColumn | None): Target value to validate against.
If None and reference data is set on the Validate object, defaults to
ref(column) to compare against the same column in the reference data.
tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
the bounds. See examples for usage. Defaults to None.
brief (str | bool, optional): Explanation of validation operation. Defaults to False.
actions (Actions | None, optional): Actions to take after validation. Defaults to None.
active (bool, optional): Whether to activate the validation. Defaults to True.
Returns:
Validate: A `Validate` instance with the new validation method added.
Examples:
>>> import polars as pl
>>>
>>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
>>> v = Validate(data).col_sum_gt("a", 10)
>>> v.assert_passing()
"""
...
def col_sum_ge(
self,
columns: _PBUnresolvedColumn,
value: float | Column | ReferenceColumn | None = None,
tol: Tolerance = 0,
thresholds: float | bool | tuple | dict | Thresholds | None = None,
brief: str | bool = False,
actions: Actions | None = None,
active: bool = True,
) -> Validate:
"""Assert the values in a column sum to a value ge some `value`.
Args:
columns (_PBUnresolvedColumn): Column or collection of columns to validate.
value (float | Column | ReferenceColumn | None): Target value to validate against.
If None and reference data is set on the Validate object, defaults to
ref(column) to compare against the same column in the reference data.
tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
the bounds. See examples for usage. Defaults to None.
brief (str | bool, optional): Explanation of validation operation. Defaults to False.
actions (Actions | None, optional): Actions to take after validation. Defaults to None.
active (bool, optional): Whether to activate the validation. Defaults to True.
Returns:
Validate: A `Validate` instance with the new validation method added.
Examples:
>>> import polars as pl
>>>
>>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
>>> v = Validate(data).col_sum_ge("a", 15)
>>> v.assert_passing()
"""
...
def col_sum_lt(
self,
columns: _PBUnresolvedColumn,
value: float | Column | ReferenceColumn | None = None,
tol: Tolerance = 0,
thresholds: float | bool | tuple | dict | Thresholds | None = None,
brief: str | bool = False,
actions: Actions | None = None,
active: bool = True,
) -> Validate:
"""Assert the values in a column sum to a value lt some `value`.
Args:
columns (_PBUnresolvedColumn): Column or collection of columns to validate.
value (float | Column | ReferenceColumn | None): Target value to validate against.
If None and reference data is set on the Validate object, defaults to
ref(column) to compare against the same column in the reference data.
tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
the bounds. See examples for usage. Defaults to None.
brief (str | bool, optional): Explanation of validation operation. Defaults to False.
actions (Actions | None, optional): Actions to take after validation. Defaults to None.
active (bool, optional): Whether to activate the validation. Defaults to True.
Returns:
Validate: A `Validate` instance with the new validation method added.
Examples:
>>> import polars as pl
>>>
>>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
>>> v = Validate(data).col_sum_lt("a", 20)
>>> v.assert_passing()
"""
...
def col_sum_le(
self,
columns: _PBUnresolvedColumn,
value: float | Column | ReferenceColumn | None = None,
tol: Tolerance = 0,
thresholds: float | bool | tuple | dict | Thresholds | None = None,
brief: str | bool = False,
actions: Actions | None = None,
active: bool = True,
) -> Validate:
"""Assert the values in a column sum to a value le some `value`.
Args:
columns (_PBUnresolvedColumn): Column or collection of columns to validate.
value (float | Column | ReferenceColumn | None): Target value to validate against.
If None and reference data is set on the Validate object, defaults to
ref(column) to compare against the same column in the reference data.
tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
the bounds. See examples for usage. Defaults to None.
brief (str | bool, optional): Explanation of validation operation. Defaults to False.
actions (Actions | None, optional): Actions to take after validation. Defaults to None.
active (bool, optional): Whether to activate the validation. Defaults to True.
Returns:
Validate: A `Validate` instance with the new validation method added.
Examples:
>>> import polars as pl
>>>
>>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
>>> v = Validate(data).col_sum_le("a", 15)
>>> v.assert_passing()
"""
...
def col_avg_eq(
self,
columns: _PBUnresolvedColumn,
value: float | Column | ReferenceColumn | None = None,
tol: Tolerance = 0,
thresholds: float | bool | tuple | dict | Thresholds | None = None,
brief: str | bool = False,
actions: Actions | None = None,
active: bool = True,
) -> Validate:
"""Assert the values in a column avg to a value eq some `value`.
Args:
columns (_PBUnresolvedColumn): Column or collection of columns to validate.
value (float | Column | ReferenceColumn | None): Target value to validate against.
If None and reference data is set on the Validate object, defaults to
ref(column) to compare against the same column in the reference data.
tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
the bounds. See examples for usage. Defaults to None.
brief (str | bool, optional): Explanation of validation operation. Defaults to False.
actions (Actions | None, optional): Actions to take after validation. Defaults to None.
active (bool, optional): Whether to activate the validation. Defaults to True.
Returns:
Validate: A `Validate` instance with the new validation method added.
Examples:
>>> import polars as pl
>>>
>>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
>>> v = Validate(data).col_avg_eq("a", 3)
>>> v.assert_passing()
"""
...
def col_avg_gt(
self,
columns: _PBUnresolvedColumn,
value: float | Column | ReferenceColumn | None = None,
tol: Tolerance = 0,
thresholds: float | bool | tuple | dict | Thresholds | None = None,
brief: str | bool = False,
actions: Actions | None = None,
active: bool = True,
) -> Validate:
"""Assert the values in a column avg to a value gt some `value`.
Args:
columns (_PBUnresolvedColumn): Column or collection of columns to validate.
value (float | Column | ReferenceColumn | None): Target value to validate against.
If None and reference data is set on the Validate object, defaults to
ref(column) to compare against the same column in the reference data.
tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
the bounds. See examples for usage. Defaults to None.
brief (str | bool, optional): Explanation of validation operation. Defaults to False.
actions (Actions | None, optional): Actions to take after validation. Defaults to None.
active (bool, optional): Whether to activate the validation. Defaults to True.
Returns:
Validate: A `Validate` instance with the new validation method added.
Examples:
>>> import polars as pl
>>>
>>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
>>> v = Validate(data).col_avg_gt("a", 2)
>>> v.assert_passing()
"""
...
def col_avg_ge(
self,
columns: _PBUnresolvedColumn,
value: float | Column | ReferenceColumn | None = None,
tol: Tolerance = 0,
thresholds: float | bool | tuple | dict | Thresholds | None = None,
brief: str | bool = False,
actions: Actions | None = None,
active: bool = True,
) -> Validate:
"""Assert the values in a column avg to a value ge some `value`.
Args:
columns (_PBUnresolvedColumn): Column or collection of columns to validate.
value (float | Column | ReferenceColumn | None): Target value to validate against.
If None and reference data is set on the Validate object, defaults to
ref(column) to compare against the same column in the reference data.
tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
the bounds. See examples for usage. Defaults to None.
brief (str | bool, optional): Explanation of validation operation. Defaults to False.
actions (Actions | None, optional): Actions to take after validation. Defaults to None.
active (bool, optional): Whether to activate the validation. Defaults to True.
Returns:
Validate: A `Validate` instance with the new validation method added.
Examples:
>>> import polars as pl
>>>
>>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
>>> v = Validate(data).col_avg_ge("a", 3)
>>> v.assert_passing()
"""
...
def col_avg_lt(
    self,
    columns: _PBUnresolvedColumn,
    value: float | Column | ReferenceColumn | None = None,
    tol: Tolerance = 0,
    thresholds: float | bool | tuple | dict | Thresholds | None = None,
    brief: str | bool = False,
    actions: Actions | None = None,
    active: bool = True,
) -> Validate:
    """Assert that the mean of a column is less than `value`.

    Args:
        columns (_PBUnresolvedColumn): Column or collection of columns to validate.
        value (float | Column | ReferenceColumn | None): Target value to validate against.
            If None and reference data is set on the Validate object, defaults to
            ref(column) to compare against the same column in the reference data.
        tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
        thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
            the bounds. See examples for usage. Defaults to None.
        brief (str | bool, optional): Explanation of validation operation. Defaults to False.
        actions (Actions | None, optional): Actions to take after validation. Defaults to None.
        active (bool, optional): Whether to activate the validation. Defaults to True.

    Returns:
        Validate: A `Validate` instance with the new validation method added.

    Examples:
        >>> import polars as pl
        >>>
        >>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
        >>> v = Validate(data).col_avg_lt("a", 5)
        >>> v.assert_passing()
    """
    ...
def col_avg_le(
    self,
    columns: _PBUnresolvedColumn,
    value: float | Column | ReferenceColumn | None = None,
    tol: Tolerance = 0,
    thresholds: float | bool | tuple | dict | Thresholds | None = None,
    brief: str | bool = False,
    actions: Actions | None = None,
    active: bool = True,
) -> Validate:
    """Assert that the mean of a column is less than or equal to `value`.

    Args:
        columns (_PBUnresolvedColumn): Column or collection of columns to validate.
        value (float | Column | ReferenceColumn | None): Target value to validate against.
            If None and reference data is set on the Validate object, defaults to
            ref(column) to compare against the same column in the reference data.
        tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
        thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
            the bounds. See examples for usage. Defaults to None.
        brief (str | bool, optional): Explanation of validation operation. Defaults to False.
        actions (Actions | None, optional): Actions to take after validation. Defaults to None.
        active (bool, optional): Whether to activate the validation. Defaults to True.

    Returns:
        Validate: A `Validate` instance with the new validation method added.

    Examples:
        >>> import polars as pl
        >>>
        >>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
        >>> v = Validate(data).col_avg_le("a", 3)
        >>> v.assert_passing()
    """
    ...
def col_sd_eq(
    self,
    columns: _PBUnresolvedColumn,
    value: float | Column | ReferenceColumn | None = None,
    tol: Tolerance = 0,
    thresholds: float | bool | tuple | dict | Thresholds | None = None,
    brief: str | bool = False,
    actions: Actions | None = None,
    active: bool = True,
) -> Validate:
    """Assert that the standard deviation of a column equals `value`.

    Args:
        columns (_PBUnresolvedColumn): Column or collection of columns to validate.
        value (float | Column | ReferenceColumn | None): Target value to validate against.
            If None and reference data is set on the Validate object, defaults to
            ref(column) to compare against the same column in the reference data.
        tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
        thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
            the bounds. See examples for usage. Defaults to None.
        brief (str | bool, optional): Explanation of validation operation. Defaults to False.
        actions (Actions | None, optional): Actions to take after validation. Defaults to None.
        active (bool, optional): Whether to activate the validation. Defaults to True.

    Returns:
        Validate: A `Validate` instance with the new validation method added.

    Examples:
        >>> import polars as pl
        >>>
        >>> data = pl.DataFrame({"a": [2, 4, 6, 8, 10]})
        >>> v = Validate(data).col_sd_eq("a", 3.1622776601683795)
        >>> v.assert_passing()
    """
    ...
def col_sd_gt(
    self,
    columns: _PBUnresolvedColumn,
    value: float | Column | ReferenceColumn | None = None,
    tol: Tolerance = 0,
    thresholds: float | bool | tuple | dict | Thresholds | None = None,
    brief: str | bool = False,
    actions: Actions | None = None,
    active: bool = True,
) -> Validate:
    """Assert that the standard deviation of a column is greater than `value`.

    Args:
        columns (_PBUnresolvedColumn): Column or collection of columns to validate.
        value (float | Column | ReferenceColumn | None): Target value to validate against.
            If None and reference data is set on the Validate object, defaults to
            ref(column) to compare against the same column in the reference data.
        tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
        thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
            the bounds. See examples for usage. Defaults to None.
        brief (str | bool, optional): Explanation of validation operation. Defaults to False.
        actions (Actions | None, optional): Actions to take after validation. Defaults to None.
        active (bool, optional): Whether to activate the validation. Defaults to True.

    Returns:
        Validate: A `Validate` instance with the new validation method added.

    Examples:
        >>> import polars as pl
        >>>
        >>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
        >>> v = Validate(data).col_sd_gt("a", 1)
        >>> v.assert_passing()
    """
    ...
def col_sd_ge(
    self,
    columns: _PBUnresolvedColumn,
    value: float | Column | ReferenceColumn | None = None,
    tol: Tolerance = 0,
    thresholds: float | bool | tuple | dict | Thresholds | None = None,
    brief: str | bool = False,
    actions: Actions | None = None,
    active: bool = True,
) -> Validate:
    """Assert that the standard deviation of a column is greater than or equal to `value`.

    Args:
        columns (_PBUnresolvedColumn): Column or collection of columns to validate.
        value (float | Column | ReferenceColumn | None): Target value to validate against.
            If None and reference data is set on the Validate object, defaults to
            ref(column) to compare against the same column in the reference data.
        tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
        thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
            the bounds. See examples for usage. Defaults to None.
        brief (str | bool, optional): Explanation of validation operation. Defaults to False.
        actions (Actions | None, optional): Actions to take after validation. Defaults to None.
        active (bool, optional): Whether to activate the validation. Defaults to True.

    Returns:
        Validate: A `Validate` instance with the new validation method added.

    Examples:
        >>> import polars as pl
        >>>
        >>> data = pl.DataFrame({"a": [2, 4, 4, 4, 6]})
        >>> v = Validate(data).col_sd_ge("a", 1.4142135623730951)
        >>> v.assert_passing()
    """
    ...
def col_sd_lt(
    self,
    columns: _PBUnresolvedColumn,
    value: float | Column | ReferenceColumn | None = None,
    tol: Tolerance = 0,
    thresholds: float | bool | tuple | dict | Thresholds | None = None,
    brief: str | bool = False,
    actions: Actions | None = None,
    active: bool = True,
) -> Validate:
    """Assert that the standard deviation of a column is less than `value`.

    Args:
        columns (_PBUnresolvedColumn): Column or collection of columns to validate.
        value (float | Column | ReferenceColumn | None): Target value to validate against.
            If None and reference data is set on the Validate object, defaults to
            ref(column) to compare against the same column in the reference data.
        tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
        thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
            the bounds. See examples for usage. Defaults to None.
        brief (str | bool, optional): Explanation of validation operation. Defaults to False.
        actions (Actions | None, optional): Actions to take after validation. Defaults to None.
        active (bool, optional): Whether to activate the validation. Defaults to True.

    Returns:
        Validate: A `Validate` instance with the new validation method added.

    Examples:
        >>> import polars as pl
        >>>
        >>> data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
        >>> v = Validate(data).col_sd_lt("a", 2)
        >>> v.assert_passing()
    """
    ...
def col_sd_le(
    self,
    columns: _PBUnresolvedColumn,
    value: float | Column | ReferenceColumn | None = None,
    tol: Tolerance = 0,
    thresholds: float | bool | tuple | dict | Thresholds | None = None,
    brief: str | bool = False,
    actions: Actions | None = None,
    active: bool = True,
) -> Validate:
    """Assert that the standard deviation of a column is less than or equal to `value`.

    Args:
        columns (_PBUnresolvedColumn): Column or collection of columns to validate.
        value (float | Column | ReferenceColumn | None): Target value to validate against.
            If None and reference data is set on the Validate object, defaults to
            ref(column) to compare against the same column in the reference data.
        tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
        thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
            the bounds. See examples for usage. Defaults to None.
        brief (str | bool, optional): Explanation of validation operation. Defaults to False.
        actions (Actions | None, optional): Actions to take after validation. Defaults to None.
        active (bool, optional): Whether to activate the validation. Defaults to True.

    Returns:
        Validate: A `Validate` instance with the new validation method added.

    Examples:
        >>> import polars as pl
        >>>
        >>> data = pl.DataFrame({"a": [2, 4, 4, 4, 6]})
        >>> v = Validate(data).col_sd_le("a", 1.4142135623730951)
        >>> v.assert_passing()
    """
    ...
# === GENERATED END ===
import inspect
import itertools
import subprocess
import sys
from pathlib import Path
from pointblank._agg import AGGREGATOR_REGISTRY, COMPARATOR_REGISTRY, is_valid_agg
# Go from `.scripts/__file__.py` to `.`, allowing us to import `tests` which lives
# at the root.
project_root = Path(__file__).parent.parent
sys.path.append(str(project_root))
from tests.test_agg_doctests import _TEST_FUNCTION_REGISTRY
VALIDATE_PYI_PATH = Path("pointblank/validate.pyi")

# Shared parameter list for every generated `col_<agg>_<comp>` stub method.
SIGNATURE = """
self,
columns: _PBUnresolvedColumn,
value: float | Column | ReferenceColumn | None = None,
tol: Tolerance = 0,
thresholds: float | bool | tuple | dict | Thresholds | None = None,
brief: str | bool = False,
actions: Actions | None = None,
active: bool = True,
"""
# Shared Args/Returns docstring section; per-method Examples are appended below.
DOCSTRING = """
Args:
columns (_PBUnresolvedColumn): Column or collection of columns to validate.
value (float | Column | ReferenceColumn | None): Target value to validate against.
If None and reference data is set on the Validate object, defaults to
ref(column) to compare against the same column in the reference data.
tol (Tolerance, optional): Tolerance for validation distance to target. Defaults to 0.
thresholds (float | bool | tuple | dict | Thresholds | None, optional): Custom thresholds for
the bounds. See examples for usage. Defaults to None.
brief (str | bool, optional): Explanation of validation operation. Defaults to False.
actions (Actions | None, optional): Actions to take after validation. Defaults to None.
active (bool, optional): Whether to activate the validation. Defaults to True.
Returns:
Validate: A `Validate` instance with the new validation method added.
Examples:
"""
CLS = "Validate"
IMPORT_HEADER = """
from pointblank import Actions, Thresholds
from pointblank._utils import _PBUnresolvedColumn
from pointblank.column import Column, ReferenceColumn
from pointblank._typing import Tolerance
"""

# Write the headers to the end. Ruff will take care of sorting imports.
with VALIDATE_PYI_PATH.open() as f:
    content = f.read()
with VALIDATE_PYI_PATH.open("w") as f:
    f.write(IMPORT_HEADER + "\n\n" + content)

## Create grid of aggs and comparators
with VALIDATE_PYI_PATH.open("a") as f:
    f.write(" # === GENERATED START ===\n")
    for agg_name, comp_name in itertools.product(
        AGGREGATOR_REGISTRY.keys(), COMPARATOR_REGISTRY.keys()
    ):
        method = f"col_{agg_name}_{comp_name}"
        assert is_valid_agg(method)  # internal sanity check
        # Extract examples from the doctest registry; a missing entry here means a
        # test was never written for this agg/comp pair (KeyError is intentional).
        doctest_fn = _TEST_FUNCTION_REGISTRY[method]
        try:
            # Number of source lines the docstring occupies (assumes the closing
            # quotes sit on the docstring's last content line).
            lines_to_skip = len(doctest_fn.__doc__.split("\n"))
        except AttributeError:
            # No docstring on the test function: nothing extra to skip.
            lines_to_skip = 0
        lines: list[str] = inspect.getsourcelines(doctest_fn)[0]
        cleaned_lines: list[str] = [line.strip() for line in lines]
        # The +2 additionally skips the `@_test` decorator and the `def` line.
        body: str = "\n".join(cleaned_lines[lines_to_skip + 2 :])
        # Add >>> to each line in the body so doctest can run it
        body_with_arrows: str = "\n".join(f"\t>>> {line}" for line in body.split("\n"))
        # Build docstring
        meth_body = (
            f'"""Assert the values in a column '
            f"{agg_name.replace('_', ' ')} to a value "
            f"{comp_name.replace('_', ' ')} some `value`.\n"
            f"{DOCSTRING}"
            f"{body_with_arrows}\n"
            f'"""\n'
        )
        # Build the .pyi stub method
        temp = f" def {method}({SIGNATURE}\t) -> {CLS}:\n {meth_body} ...\n\n"
        f.write(temp)
    f.write(" # === GENERATED END ===\n")

## Run formatter and linter on the generated file.
# check=True so a formatter/type-checker failure fails this generation step;
# subprocess.run otherwise ignores non-zero exit codes silently.
subprocess.run(["uv", "run", "ruff", "format", str(VALIDATE_PYI_PATH)], check=True)
subprocess.run(["uv", "run", "ty", "check", str(VALIDATE_PYI_PATH)], check=True)
from pointblank import Validate
from collections.abc import Callable
## IMPORTANT: READ THIS
# This test file is unique, it's designed to create the doctests for the `col_*` aggregate functions.
# Since we generate the docs dynamically using the `make pyi` command, we need a store of doctests to
# inform the examples in the docstring. The `scripts/generate_agg_validate_pyi.py` script will use these
# to create the examples.
## How to add a new test:
# 1. Create a test titled `test_<agg_function>` OR add to the existing.
# 2. Mark the function with the `@_test` decorator to ensure the pyi gen can find it.
# 3. Run `uv run pytest tests/test_agg_doctests.py` to run these tests and ensure they pass.
# 4. Run `make pyi` to update the `pyi` files to reflect the new examples.
# Maps "<agg>_<comp>" (the test name minus its "test_" prefix) to the test
# function whose source body becomes the generated docstring example.
# NOTE(review): later registrations silently overwrite earlier ones that share
# a name — see the duplicated test_col_sum_eq definitions below.
_TEST_FUNCTION_REGISTRY: dict[str, Callable] = {}


def _test(fn):
    """Register *fn* in _TEST_FUNCTION_REGISTRY, keyed by its name minus "test_"."""
    nm: str = fn.__name__
    name_no_test = nm.removeprefix("test_")
    _TEST_FUNCTION_REGISTRY[name_no_test] = fn
    # Return the function unchanged so it is still collected by pytest.
    return fn
@_test
def test_col_sum_eq():
    """Doctest source for col_sum_eq (body is harvested into validate.pyi examples)."""
    import polars as pl
    # We check this column sums to 15
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_sum_eq("a", 15)
    v.assert_passing()
# NOTE(review): an exact duplicate of test_col_sum_eq was defined here. It
# silently shadowed the first definition both in pytest collection and in
# _TEST_FUNCTION_REGISTRY (the decorator overwrites on name collision), so
# the duplicate has been removed; the definition above is kept.
@_test
def test_col_sum_gt():
    """Doctest source for col_sum_gt (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_sum_gt("a", 10)
    v.assert_passing()
@_test
def test_col_sum_ge():
    """Doctest source for col_sum_ge (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_sum_ge("a", 15)
    v.assert_passing()
@_test
def test_col_sum_lt():
    """Doctest source for col_sum_lt (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_sum_lt("a", 20)
    v.assert_passing()
@_test
def test_col_sum_le():
    """Doctest source for col_sum_le (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_sum_le("a", 15)
    v.assert_passing()
@_test
def test_col_avg_eq():
    """Doctest source for col_avg_eq (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_avg_eq("a", 3)
    v.assert_passing()
@_test
def test_col_avg_gt():
    """Doctest source for col_avg_gt (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_avg_gt("a", 2)
    v.assert_passing()
@_test
def test_col_avg_ge():
    """Doctest source for col_avg_ge (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_avg_ge("a", 3)
    v.assert_passing()
@_test
def test_col_avg_lt():
    """Doctest source for col_avg_lt (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_avg_lt("a", 5)
    v.assert_passing()
@_test
def test_col_avg_le():
    """Doctest source for col_avg_le (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_avg_le("a", 3)
    v.assert_passing()
@_test
def test_col_sd_eq():
    """Doctest source for col_sd_eq (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [2, 4, 6, 8, 10]})
    v = Validate(data).col_sd_eq("a", 3.1622776601683795)
    v.assert_passing()
@_test
def test_col_sd_gt():
    """Doctest source for col_sd_gt (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_sd_gt("a", 1)
    v.assert_passing()
@_test
def test_col_sd_ge():
    """Doctest source for col_sd_ge (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [2, 4, 4, 4, 6]})
    v = Validate(data).col_sd_ge("a", 1.4142135623730951)
    v.assert_passing()
@_test
def test_col_sd_lt():
    """Doctest source for col_sd_lt (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    v = Validate(data).col_sd_lt("a", 2)
    v.assert_passing()
@_test
def test_col_sd_le():
    """Doctest source for col_sd_le (body is harvested into validate.pyi examples)."""
    import polars as pl
    data = pl.DataFrame({"a": [2, 4, 4, 4, 6]})
    v = Validate(data).col_sd_le("a", 1.4142135623730951)
    v.assert_passing()
import pytest
from pointblank import Validate, ref
import polars as pl
from pointblank._agg import load_validation_method_grid, is_valid_agg
@pytest.fixture
def simple_pl() -> pl.DataFrame:
    """Constant columns with one trailing null each.

    Expected aggregates (nulls excluded): a -> sum 3 / avg 1 / sd 0,
    b -> 6 / 2 / 0, c -> 9 / 3 / 0.
    """
    return pl.DataFrame(
        {
            "a": [1, 1, 1, None],
            "b": [2, 2, 2, None],
            "c": [3, 3, 3, None],
        }
    )
@pytest.mark.parametrize(
    "tol",
    [
        (0, 0),  # exact: no slack on either side
        (1, 1),  # symmetric absolute slack
        (100, 100),  # very loose bounds
        0,  # scalar form of zero tolerance
    ],
)
def test_sums_old(tol, simple_pl) -> None:
    """col_sum_eq passes for the true sum under several tolerance shapes."""
    v = Validate(simple_pl).col_sum_eq("a", 3, tol=tol).interrogate()
    v.assert_passing()
    # Smoke check: building the tabular report must not raise.
    v.get_tabular_report()
# TODO: Expand expression types
# TODO: Expand table types
# TODO: Expand expression types
# TODO: Expand table types
@pytest.mark.parametrize(
    ("method", "vals"),
    [
        # Sum -> 3, 6, 9
        ("col_sum_eq", (3, 6, 9)),
        ("col_sum_gt", (2, 5, 8)),
        ("col_sum_ge", (3, 6, 9)),
        ("col_sum_lt", (4, 7, 10)),
        ("col_sum_le", (3, 6, 9)),
        # Average -> 1, 2, 3
        ("col_avg_eq", (1, 2, 3)),
        ("col_avg_gt", (0, 1, 2)),
        ("col_avg_ge", (1, 2, 3)),
        ("col_avg_lt", (2, 3, 4)),
        ("col_avg_le", (1, 2, 3)),
        # Standard Deviation -> 0, 0, 0
        ("col_sd_eq", (0, 0, 0)),
        ("col_sd_gt", (-1, -1, -1)),
        ("col_sd_ge", (0, 0, 0)),
        ("col_sd_lt", (1, 1, 1)),
        ("col_sd_le", (0, 0, 0)),
    ],
)
def test_aggs(simple_pl: pl.DataFrame, method: str, vals: tuple[int, int, int]):
    """Every agg/comparator method passes against its known column aggregates.

    NOTE(review): a second `test_aggs` was defined later in this module and
    shadowed this one at collection time — keep only one definition.
    """
    v = Validate(simple_pl)
    for col, val in zip(["a", "b", "c"], vals):
        # Reassign: works whether the step returns self or a new Validate.
        v = getattr(v, method)(col, val)
    v = v.interrogate()
    v.assert_passing()
# NOTE(review): an identical second `simple_pl` fixture was defined here and
# silently shadowed the one declared earlier in this module. The duplicate has
# been removed; the earlier fixture definition is authoritative.
@pytest.fixture
def varied_pl() -> pl.DataFrame:
    """DataFrame with varied values for testing standard deviation"""
    return pl.DataFrame(
        {
            "low_variance": [5, 5, 5, 5, 5],  # constant -> sd 0
            "high_variance": [1, 5, 10, 15, 20],  # widely spread values
            "mixed": [1, 2, 3, 4, 5],  # moderate spread
        }
    )
@pytest.fixture
def edge_case_pl() -> pl.DataFrame:
    """DataFrame with edge cases: single value, all nulls, mixed nulls"""
    return pl.DataFrame(
        {
            "single_value": [42, None, None, None],  # one real value
            "all_nulls": [None, None, None, None],  # aggregate of nothing
            "mostly_nulls": [1, None, None, None],
            "no_nulls": [1, 2, 3, 4],  # control column
        }
    )
@pytest.fixture
def negative_pl() -> pl.DataFrame:
    """DataFrame with negative numbers"""
    return pl.DataFrame(
        {
            "all_negative": [-1, -2, -3, -4],  # sum -10, avg -2.5
            "mixed_signs": [-2, -1, 1, 2],  # sums to zero
            "zeros": [0, 0, 0, 0],  # sum/avg/sd all zero
        }
    )
@pytest.fixture
def large_values_pl() -> pl.DataFrame:
    """DataFrame with large values to test numerical stability"""
    return pl.DataFrame(
        {
            "large": [1_000_000, 1_000_000, 1_000_000],
            "very_large": [1e10, 1e10, 1e10],
            "small_decimals": [0.001, 0.002, 0.003],  # tiny magnitudes
        }
    )
# Original test
@pytest.mark.parametrize(
("method", "vals"),
[
("col_sum_eq", (3, 6, 9)),
("col_sum_gt", (2, 5, 8)),
("col_sum_ge", (3, 6, 9)),
("col_sum_lt", (4, 7, 10)),
("col_sum_le", (3, 6, 9)),
("col_avg_eq", (1, 2, 3)),
("col_avg_gt", (0, 1, 2)),
("col_avg_ge", (1, 2, 3)),
("col_avg_lt", (2, 3, 4)),
("col_avg_le", (1, 2, 3)),
("col_sd_eq", (0, 0, 0)),
("col_sd_gt", (-1, -1, -1)),
("col_sd_ge", (0, 0, 0)),
("col_sd_lt", (1, 1, 1)),
("col_sd_le", (0, 0, 0)),
],
)
def test_aggs(simple_pl: pl.DataFrame, method: str, vals: tuple[int, int, int]):
v = Validate(simple_pl)
for col, val in zip(["a", "b", "c"], vals):
getattr(v, method)(col, val)
v = v.interrogate()
v.assert_passing()
# Test with varied standard deviations
def test_aggs_with_variance(varied_pl: pl.DataFrame):
    """SD validations pass for zero-, high-, and moderate-variance columns.

    Each step's return value is reassigned to ``v``: the original discarded
    it, which silently drops the validation steps if the Validate API ever
    returns a new object instead of mutating in place.
    """
    v = Validate(varied_pl)
    # Low variance column should have SD close to 0
    v = v.col_sd_lt("low_variance", 0.1)
    v = v.col_sd_eq("low_variance", 0)
    # High variance column
    v = v.col_sd_gt("high_variance", 5)
    # Mixed values
    v = v.col_sd_ge("mixed", 1)
    v = v.interrogate()
    v.assert_passing()
# Test negative numbers
@pytest.mark.parametrize(
    ("method", "col", "val", "should_pass"),
    [
        # Negative sums
        ("col_sum_eq", "all_negative", -10, True),
        ("col_sum_lt", "all_negative", -9, True),
        ("col_sum_gt", "all_negative", -11, True),
        # Mixed signs sum to zero
        ("col_sum_eq", "mixed_signs", 0, True),
        # Zeros
        ("col_sum_eq", "zeros", 0, True),
        ("col_avg_eq", "zeros", 0, True),
        ("col_sd_eq", "zeros", 0, True),
        # Negative averages
        ("col_avg_eq", "all_negative", -2.5, True),
        ("col_avg_lt", "all_negative", -2, True),
    ],
)
def test_negative_values(
    negative_pl: pl.DataFrame, method: str, col: str, val: float, should_pass: bool
):
    """Aggregate validations behave correctly for negative and zero values.

    NOTE(review): every current case has should_pass=True; the False branch
    below is exercised only if failing cases are added to the grid.
    """
    v = Validate(negative_pl)
    v = getattr(v, method)(col, val).interrogate()
    if should_pass:
        v.assert_passing()
    else:
        with pytest.raises(AssertionError):
            v.assert_passing()
# Test edge cases with nulls
@pytest.mark.parametrize(
    ("method", "col", "val", "should_handle"),
    [
        # Single non-null value
        ("col_sum_eq", "single_value", 42, True),
        ("col_avg_eq", "single_value", 42, True),
        ("col_sd_eq", "single_value", 0, True),  # SD of single value is 0
        # Mostly nulls
        ("col_sum_eq", "mostly_nulls", 1, True),
        ("col_avg_eq", "mostly_nulls", 1, True),
        # No nulls
        ("col_sum_eq", "no_nulls", 10, True),
        ("col_avg_eq", "no_nulls", 2.5, True),
    ],
)
@pytest.mark.xfail(reason="Have some work to do here")
def test_edge_cases_with_nulls(
    edge_case_pl: pl.DataFrame, method: str, col: str, val: float, should_handle: bool
):
    """Aggregates over columns containing nulls (currently expected to fail).

    NOTE(review): the `should_handle` parameter is never read in the body —
    either use it to branch or drop it from the parametrize grid.
    """
    v = Validate(edge_case_pl)
    v = getattr(v, method)(col, val)
    v = v.interrogate()
    v.assert_passing()
# Test boundary conditions
@pytest.mark.parametrize(
    ("method", "col", "exact_val", "just_below", "just_above"),
    [
        ("col_sum", "a", 3, 2.99, 3.01),
        ("col_avg", "b", 2, 1.99, 2.01),
        ("col_sd", "c", 0, -0.01, 0.01),
    ],
)
def test_boundary_conditions(
    simple_pl: pl.DataFrame,
    method: str,
    col: str,
    exact_val: float,
    just_below: float,
    just_above: float,
):
    """Each comparator behaves correctly at and just around the true aggregate.

    Step results are reassigned to ``v``: the original discarded them and
    relied on in-place mutation, which silently drops every step if the
    Validate API returns a new object instead.
    """
    # Exact equality at the true aggregate.
    v = Validate(simple_pl)
    v = getattr(v, f"{method}_eq")(col, exact_val)
    v.interrogate().assert_passing()
    # Strictly greater than a value just below the true aggregate.
    v = Validate(simple_pl)
    v = getattr(v, f"{method}_gt")(col, just_below)
    v.interrogate().assert_passing()
    # Strictly less than a value just above the true aggregate.
    v = Validate(simple_pl)
    v = getattr(v, f"{method}_lt")(col, just_above)
    v.interrogate().assert_passing()
    # Greater than or equal at the exact value.
    v = Validate(simple_pl)
    v = getattr(v, f"{method}_ge")(col, exact_val)
    v.interrogate().assert_passing()
    # Less than or equal at the exact value.
    v = Validate(simple_pl)
    v = getattr(v, f"{method}_le")(col, exact_val)
    v.interrogate().assert_passing()
# Test large values
def test_large_values(large_values_pl: pl.DataFrame):
    """Aggregates stay exact for large magnitudes and small decimals."""
    v = Validate(large_values_pl)
    # Large values
    v = v.col_sum_eq("large", 3_000_000)
    v = v.col_avg_eq("large", 1_000_000)
    # Very large values
    v = v.col_sum_eq("very_large", 3e10)
    v = v.col_avg_eq("very_large", 1e10)
    # Small decimals
    # NOTE(review): exact float equality on 0.001+0.002+0.003 is sensitive to
    # binary rounding; if this ever flakes, add a tol instead of exact eq.
    v = v.col_sum_eq("small_decimals", 0.006)
    v = v.col_avg_eq("small_decimals", 0.002)
    v = v.interrogate()
    v.assert_passing()
# Test multiple assertions on same column
def test_multiple_assertions_same_column(simple_pl: pl.DataFrame):
    """Several aggregate checks stacked on one column all pass together."""
    validation = (
        Validate(simple_pl)
        .col_sum_eq("a", 3)
        .col_sum_ge("a", 3)
        .col_sum_le("a", 3)
        .col_avg_eq("a", 1)
        .col_sd_eq("a", 0)
        .interrogate()
    )
    validation.assert_passing()
# Test chaining all comparison operators
def test_all_operators_chained(simple_pl: pl.DataFrame):
    """All comparison operators can be combined in one validation plan."""
    validation = (
        Validate(simple_pl)
        .col_sum_gt("a", 2)
        .col_sum_lt("a", 4)
        .col_avg_ge("b", 2)
        .col_avg_le("b", 2)
        .col_sd_eq("c", 0)
        .interrogate()
    )
    validation.assert_passing()
# Test failure cases
@pytest.mark.parametrize(
    ("method", "col", "val"),
    [
        ("col_sum_eq", "a", 999),  # sum is actually 3
        ("col_sum_gt", "a", 10),  # sum not greater than 10
        ("col_avg_lt", "b", 1),  # avg is 2, not below 1
        ("col_sd_gt", "c", 5),  # sd is 0, not above 5
    ],
)
def test_expected_failures(simple_pl: pl.DataFrame, method: str, col: str, val: float):
    """Each deliberately wrong target makes assert_passing raise."""
    step = getattr(Validate(simple_pl), method)
    result = step(col, val).interrogate()
    with pytest.raises(AssertionError):
        result.assert_passing()
# Test with floating point precision
def test_floating_point_precision():
    """Range bounds (not exact equality) absorb binary floating-point error."""
    df = pl.DataFrame(
        {
            "precise": [1.1, 2.2, 3.3],
            "imprecise": [0.1 + 0.2, 0.2 + 0.3, 0.3 + 0.4],  # Classic floating point issues
        }
    )
    v: Validate = Validate(df)
    # Sum might not be exactly 6.6 due to floating point
    v = v.col_sum_ge("precise", 6.5)
    v = v.col_sum_le("precise", 6.7)
    # The "imprecise" column (total ~1.5) was built but never validated before;
    # bracket its sum with generous bounds as well.
    v = v.col_sum_ge("imprecise", 1.4)
    v = v.col_sum_le("imprecise", 1.6)
    v = v.interrogate()
    v.assert_passing()
# Test with extreme standard deviations
def test_extreme_standard_deviations():
    """SD is zero for a constant column and large for an oscillating one."""
    frame = pl.DataFrame(
        {
            "uniform": [5, 5, 5, 5, 5],
            "extreme_range": [1, 1000, 1, 1000, 1],
        }
    )
    checks = Validate(frame).col_sd_eq("uniform", 0)
    checks = checks.col_sd_gt("extreme_range", 400)
    checks.interrogate().assert_passing()
def test_all_methods_can_be_accessed():
    """Every generated col_<agg>_<comp> method exists on Validate."""
    validator = Validate(pl.DataFrame())
    missing = [name for name in load_validation_method_grid() if not hasattr(validator, name)]
    assert not missing
def test_invalid_agg():
    """is_valid_agg accepts generated method names and rejects anything else."""
    assert is_valid_agg("col_sum_eq")
    assert not is_valid_agg("not_a_real_method")
# =====================
# Reference Data Tests
# =====================
@pytest.fixture
def reference_data() -> pl.DataFrame:
    """Reference data for comparison tests."""
    # Constant columns: a -> sum 3 / avg 1 / sd 0, b -> 6/2/0, c -> 9/3/0.
    columns = {"a": [1, 1, 1], "b": [2, 2, 2], "c": [3, 3, 3]}
    return pl.DataFrame(columns)
@pytest.fixture
def matching_data() -> pl.DataFrame:
    """Data that matches the reference data."""
    # Same aggregates as reference_data: a -> 3/1/0, b -> 6/2/0, c -> 9/3/0.
    columns = {"a": [1, 1, 1], "b": [2, 2, 2], "c": [3, 3, 3]}
    return pl.DataFrame(columns)
@pytest.fixture
def different_data() -> pl.DataFrame:
    """Data with different values than reference."""
    # One step higher everywhere: a -> sum 6 / avg 2, b -> 9/3, c -> 12/4 (sd 0).
    columns = {"a": [2, 2, 2], "b": [3, 3, 3], "c": [4, 4, 4]}
    return pl.DataFrame(columns)
def test_ref_sum_eq_matching(matching_data, reference_data):
    """Column sums agree between identical data and reference."""
    validation = Validate(data=matching_data, reference=reference_data)
    for column in ("a", "b", "c"):
        validation = validation.col_sum_eq(column, ref(column))
    validation.interrogate().assert_passing()
def test_ref_avg_eq_matching(matching_data, reference_data):
    """Column averages agree between identical data and reference."""
    validation = Validate(data=matching_data, reference=reference_data)
    for column in ("a", "b", "c"):
        validation = validation.col_avg_eq(column, ref(column))
    validation.interrogate().assert_passing()
def test_ref_sd_eq_matching(matching_data, reference_data):
    """Column standard deviations agree between identical data and reference."""
    validation = Validate(data=matching_data, reference=reference_data)
    for column in ("a", "b", "c"):
        validation = validation.col_sd_eq(column, ref(column))
    validation.interrogate().assert_passing()
def test_ref_sum_gt(different_data, reference_data):
    """Sum of the data column exceeds the reference column's sum."""
    # different_data.a sums to 6, reference_data.a sums to 3.
    validation = Validate(data=different_data, reference=reference_data)
    validation.col_sum_gt("a", ref("a")).interrogate().assert_passing()
def test_ref_sum_lt(reference_data, different_data):
    """Sum of the data column is below the reference column's sum."""
    # reference_data.a sums to 3, different_data.a (used as reference) to 6.
    validation = Validate(data=reference_data, reference=different_data)
    validation.col_sum_lt("a", ref("a")).interrogate().assert_passing()
def test_ref_different_columns():
    """A data column can be compared against a differently named reference column."""
    frame = pl.DataFrame({"x": [1, 2, 3]})  # sums to 6
    baseline = pl.DataFrame({"y": [2, 2, 2]})  # also sums to 6
    outcome = Validate(data=frame, reference=baseline).col_sum_eq("x", ref("y"))
    outcome.interrogate().assert_passing()
def test_ref_with_tolerance():
    """Reference comparisons honor a relative tolerance on the target."""
    frame = pl.DataFrame({"a": [10, 11, 12]})  # sums to 33
    baseline = pl.DataFrame({"a": [10, 10, 10]})  # sums to 30
    # Strict equality must fail: 33 != 30.
    strict = Validate(data=frame, reference=baseline).col_sum_eq("a", ref("a")).interrogate()
    with pytest.raises(AssertionError):
        strict.assert_passing()
    # A 10% tolerance widens the band to 30 +/- 3, which covers 33.
    lenient = (
        Validate(data=frame, reference=baseline).col_sum_eq("a", ref("a"), tol=0.1).interrogate()
    )
    lenient.assert_passing()
def test_ref_without_reference_data_raises():
    """Using ref() with no reference data errors out at interrogate time."""
    validation = Validate(data=pl.DataFrame({"a": [1, 2, 3]})).col_sum_eq("a", ref("a"))
    with pytest.raises(ValueError, match="Cannot use ref"):
        validation.interrogate()
def test_ref_avg_comparisons():
    """Every avg comparator works against a reference column."""
    frame = pl.DataFrame({"value": [5, 5, 5]})  # avg 5
    ref_equal = pl.DataFrame({"value": [5, 5, 5]})  # avg 5
    ref_lower = pl.DataFrame({"value": [3, 3, 3]})  # avg 3
    ref_higher = pl.DataFrame({"value": [7, 7, 7]})  # avg 7
    cases = [
        ("col_avg_eq", ref_equal),  # 5 == 5
        ("col_avg_gt", ref_lower),  # 5 > 3
        ("col_avg_ge", ref_equal),  # 5 >= 5
        ("col_avg_lt", ref_higher),  # 5 < 7
        ("col_avg_le", ref_equal),  # 5 <= 5
    ]
    for method, reference in cases:
        validation = Validate(data=frame, reference=reference)
        getattr(validation, method)("value", ref("value")).interrogate().assert_passing()
def test_ref_multiple_columns_single_reference():
    """Several data columns can all be compared to one reference column."""
    frame = pl.DataFrame(
        {
            "col_a": [10, 10, 10],  # sums to 30
            "col_b": [10, 10, 10],  # sums to 30
            "col_c": [10, 10, 10],  # sums to 30
        }
    )
    baseline = pl.DataFrame({"baseline": [10, 10, 10]})  # sums to 30
    validation = Validate(data=frame, reference=baseline)
    for column in ("col_a", "col_b", "col_c"):
        validation = validation.col_sum_eq(column, ref("baseline"))
    validation.interrogate().assert_passing()
def test_ref_mixed_validation():
    """Reference-based and literal-value steps can be mixed in one plan."""
    frame = pl.DataFrame({"a": [1, 2, 3]})  # sum 6, avg 2
    baseline = pl.DataFrame({"a": [1, 2, 3]})  # sum 6
    validation = Validate(data=frame, reference=baseline)
    validation = validation.col_sum_eq("a", ref("a"))  # reference-based
    validation = validation.col_sum_eq("a", 6)  # literal value
    validation = validation.col_avg_eq("a", 2)  # literal value
    validation.interrogate().assert_passing()
# Tests for automatic reference column inference (when value is None)
def test_auto_ref_sum_eq():
    """Omitting `value` falls back to ref() on the same column."""
    frame = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})  # sums: a=6, b=15
    baseline = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})  # sums: a=6, b=15
    validation = Validate(data=frame, reference=baseline)
    for column in ("a", "b"):
        # No explicit target: should behave like ref(column).
        validation = validation.col_sum_eq(column)
    validation.interrogate().assert_passing()
def test_auto_ref_avg_eq():
    """Automatic reference inference also works for the avg aggregate."""
    frame = pl.DataFrame({"x": [10, 20, 30]})  # avg 20
    baseline = pl.DataFrame({"x": [10, 20, 30]})  # avg 20
    Validate(data=frame, reference=baseline).col_avg_eq("x").interrogate().assert_passing()
def test_auto_ref_sd_eq():
    """Automatic reference inference also works for the sd aggregate."""
    frame = pl.DataFrame({"val": [2, 4, 4, 4, 6]})
    baseline = pl.DataFrame({"val": [2, 4, 4, 4, 6]})
    Validate(data=frame, reference=baseline).col_sd_eq("val").interrogate().assert_passing()
def test_auto_ref_gt():
    """Greater-than comparison works against an auto-inferred reference column."""
    frame = pl.DataFrame({"a": [10, 20, 30]})  # sum=60
    baseline = pl.DataFrame({"a": [1, 2, 3]})  # sum=6
    checked = Validate(data=frame, reference=baseline).col_sum_gt("a").interrogate()
    checked.assert_passing()  # 60 > 6
def test_auto_ref_lt():
    """Less-than comparison works against an auto-inferred reference column."""
    frame = pl.DataFrame({"a": [1, 2, 3]})  # sum=6
    baseline = pl.DataFrame({"a": [10, 20, 30]})  # sum=60
    checked = Validate(data=frame, reference=baseline).col_sum_lt("a").interrogate()
    checked.assert_passing()  # 6 < 60
def test_auto_ref_with_tolerance():
    """A relative tolerance lets a near-miss equality pass against the reference."""
    frame = pl.DataFrame({"a": [11, 22, 33]})  # sum=66
    baseline = pl.DataFrame({"a": [10, 20, 30]})  # sum=60; sums are 10% apart
    checked = Validate(data=frame, reference=baseline).col_sum_eq("a", tol=0.11).interrogate()
    checked.assert_passing()  # 11% tolerance covers the 10% drift
def test_auto_ref_multiple_columns():
    """Auto-inference resolves each column independently across many steps."""
    payload = {"a": [1, 2], "b": [3, 4], "c": [5, 6]}
    frame = pl.DataFrame(payload)
    baseline = pl.DataFrame(payload)
    plan = Validate(data=frame, reference=baseline)
    # Same step order as before: all sums first, then all averages.
    for column in ("a", "b", "c"):
        plan = plan.col_sum_eq(column)
    for column in ("a", "b", "c"):
        plan = plan.col_avg_eq(column)
    plan.interrogate().assert_passing()
def test_auto_ref_no_reference_data_raises():
    """Without reference data, omitting `value` must fail at method-call time."""
    frame = pl.DataFrame({"a": [1, 2, 3]})
    # The error surfaces when the step is declared, not at interrogation.
    with pytest.raises(ValueError, match="value.*required"):
        Validate(data=frame).col_sum_eq("a")
def test_auto_ref_mixed_explicit_and_auto():
    """Implicit refs, explicit ref(), and literal values can coexist in one plan."""
    payload = {"a": [1, 2, 3], "b": [4, 5, 6]}
    frame = pl.DataFrame(payload)
    baseline = pl.DataFrame(payload)
    plan = Validate(data=frame, reference=baseline)
    plan = plan.col_sum_eq("a")  # implicit: falls back to ref("a")
    plan = plan.col_sum_eq("b", ref("b"))  # explicit reference
    plan = plan.col_sum_eq("a", 6)  # plain literal
    plan.interrogate().assert_passing()
def test_auto_ref_column_list():
    """A list of columns expands to one implicit ref(col) per entry."""
    payload = {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]}
    frame = pl.DataFrame(payload)
    baseline = pl.DataFrame(payload)
    checked = Validate(data=frame, reference=baseline).col_sum_eq(["a", "b", "c"]).interrogate()
    checked.assert_passing()
# =====================================================
# Parameterized Auto-Reference Tests (All Agg Methods)
# =====================================================
@pytest.fixture
def auto_ref_equal_data() -> tuple[pl.DataFrame, pl.DataFrame]:
    """Identical data/reference pair for equality-style tests."""
    values = [1.0, 2.0, 3.0, 4.0]  # sum=10, avg=2.5, sd≈1.29
    return pl.DataFrame({"val": values}), pl.DataFrame({"val": values})
@pytest.fixture
def auto_ref_greater_data() -> tuple[pl.DataFrame, pl.DataFrame]:
    """Data strictly larger than reference, for gt/ge tests."""
    big = [10.0, 20.0, 30.0, 40.0]  # sum=100, avg=25, sd≈12.91
    small = [1.0, 2.0, 3.0, 4.0]  # sum=10, avg=2.5, sd≈1.29
    return pl.DataFrame({"val": big}), pl.DataFrame({"val": small})
@pytest.fixture
def auto_ref_lesser_data() -> tuple[pl.DataFrame, pl.DataFrame]:
    """Data strictly smaller than reference, for lt/le tests."""
    small = [1.0, 2.0, 3.0, 4.0]  # sum=10, avg=2.5, sd≈1.29
    big = [10.0, 20.0, 30.0, 40.0]  # sum=100, avg=25, sd≈12.91
    return pl.DataFrame({"val": small}), pl.DataFrame({"val": big})
# Test all equality methods (_eq) with automatic reference inference
@pytest.mark.parametrize("method", ["col_sum_eq", "col_avg_eq", "col_sd_eq"])
def test_auto_ref_eq_methods_with_equal_data(method: str, auto_ref_equal_data):
    """Each _eq method passes via auto-inference when data equals reference."""
    frame, baseline = auto_ref_equal_data
    plan = getattr(Validate(data=frame, reference=baseline), method)("val")
    plan.interrogate().assert_passing()  # no value → implicit ref("val")
# Test all greater-than methods (_gt) with auto-reference
@pytest.mark.parametrize("method", ["col_sum_gt", "col_avg_gt", "col_sd_gt"])
def test_auto_ref_gt_methods(method: str, auto_ref_greater_data):
    """Each _gt method passes via auto-inference when data exceeds reference."""
    frame, baseline = auto_ref_greater_data
    plan = getattr(Validate(data=frame, reference=baseline), method)("val")
    plan.interrogate().assert_passing()  # no value → implicit ref("val")
# Test all greater-or-equal methods (_ge) with auto-reference
@pytest.mark.parametrize("method", ["col_sum_ge", "col_avg_ge", "col_sd_ge"])
def test_auto_ref_ge_methods_with_equal_data(method: str, auto_ref_equal_data):
    """Each _ge method passes when the data aggregate equals the reference."""
    frame, baseline = auto_ref_equal_data
    plan = getattr(Validate(data=frame, reference=baseline), method)("val")
    plan.interrogate().assert_passing()
@pytest.mark.parametrize("method", ["col_sum_ge", "col_avg_ge", "col_sd_ge"])
def test_auto_ref_ge_methods_with_greater_data(method: str, auto_ref_greater_data):
    """Each _ge method passes when the data aggregate exceeds the reference."""
    frame, baseline = auto_ref_greater_data
    plan = getattr(Validate(data=frame, reference=baseline), method)("val")
    plan.interrogate().assert_passing()
# Test all less-than methods (_lt) with auto-reference
@pytest.mark.parametrize("method", ["col_sum_lt", "col_avg_lt", "col_sd_lt"])
def test_auto_ref_lt_methods(method: str, auto_ref_lesser_data):
    """Each _lt method passes via auto-inference when data is below reference."""
    frame, baseline = auto_ref_lesser_data
    plan = getattr(Validate(data=frame, reference=baseline), method)("val")
    plan.interrogate().assert_passing()
# Test all less-or-equal methods (_le) with auto-reference
@pytest.mark.parametrize("method", ["col_sum_le", "col_avg_le", "col_sd_le"])
def test_auto_ref_le_methods_with_equal_data(method: str, auto_ref_equal_data):
    """Each _le method passes when the data aggregate equals the reference."""
    frame, baseline = auto_ref_equal_data
    plan = getattr(Validate(data=frame, reference=baseline), method)("val")
    plan.interrogate().assert_passing()
@pytest.mark.parametrize("method", ["col_sum_le", "col_avg_le", "col_sd_le"])
def test_auto_ref_le_methods_with_lesser_data(method: str, auto_ref_lesser_data):
    """Each _le method passes when the data aggregate is below the reference."""
    frame, baseline = auto_ref_lesser_data
    plan = getattr(Validate(data=frame, reference=baseline), method)("val")
    plan.interrogate().assert_passing()
# Test all methods raise error when no reference data
@pytest.mark.parametrize("method", load_validation_method_grid())
def test_auto_ref_all_methods_raise_without_reference(method: str):
    """Every agg method rejects value=None when no reference table exists."""
    frame = pl.DataFrame({"val": [1, 2, 3]})
    plan = Validate(data=frame)
    with pytest.raises(ValueError, match="value.*required"):
        getattr(plan, method)("val")  # neither value nor reference supplied
# Test all methods work with explicit ref() even without auto-inference
@pytest.mark.parametrize("method", ["col_sum_eq", "col_avg_eq", "col_sd_eq"])
def test_auto_ref_explicit_ref_still_works(method: str, auto_ref_equal_data):
    """Passing ref('val') explicitly behaves like omitting the value entirely."""
    frame, baseline = auto_ref_equal_data
    plan = getattr(Validate(data=frame, reference=baseline), method)("val", ref("val"))
    plan.interrogate().assert_passing()
# Test auto-reference with tolerance for all eq methods
@pytest.mark.parametrize("method", ["col_sum_eq", "col_avg_eq", "col_sd_eq"])
def test_auto_ref_eq_methods_with_tolerance(method: str):
    """Each _eq method honors `tol` together with auto-inferred references.

    Note: tolerance is calculated as int(tol * ref), so the reference
    aggregates must be large enough that the product does not truncate to
    zero (e.g. for ref=100 and tol=0.1, int(0.1 * 100) = 10 units).
    """
    # Data sum=440, avg=110; reference sum=400, avg=100; sd≈12.91 on both sides.
    frame = pl.DataFrame({"val": [100.0, 105.0, 115.0, 120.0]})
    baseline = pl.DataFrame({"val": [90.0, 95.0, 105.0, 110.0]})
    plan = getattr(Validate(data=frame, reference=baseline), method)("val", tol=0.15)
    plan.interrogate().assert_passing()  # 15% headroom covers the 10% drift
# Test auto-reference with multiple columns for all eq methods
@pytest.mark.parametrize("method", ["col_sum_eq", "col_avg_eq", "col_sd_eq"])
def test_auto_ref_eq_methods_multiple_columns(method: str):
    """A column list auto-infers a same-named reference for every entry."""
    payload = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
    frame, baseline = pl.DataFrame(payload), pl.DataFrame(payload)
    plan = getattr(Validate(data=frame, reference=baseline), method)(["a", "b", "c"])
    plan.interrogate().assert_passing()
# Test expected failures with auto-reference
@pytest.mark.parametrize(
    ("method", "data_vals", "ref_vals"),
    [
        ("col_sum_eq", [1, 2, 3], [10, 20, 30]),  # sums differ → eq fails
        ("col_avg_eq", [1, 2, 3], [10, 20, 30]),  # averages differ → eq fails
        ("col_sd_eq", [1, 2, 3], [1, 1, 1]),  # variances differ → eq fails
        ("col_sum_gt", [1, 2, 3], [10, 20, 30]),  # data sum not greater
        ("col_avg_gt", [1, 2, 3], [10, 20, 30]),  # data avg not greater
        ("col_sum_lt", [10, 20, 30], [1, 2, 3]),  # data sum not smaller
        ("col_avg_lt", [10, 20, 30], [1, 2, 3]),  # data avg not smaller
    ],
)
def test_auto_ref_expected_failures(method: str, data_vals: list, ref_vals: list):
    """Auto-referenced checks must fail when the comparison does not hold."""
    frame = pl.DataFrame({"val": data_vals})
    baseline = pl.DataFrame({"val": ref_vals})
    outcome = getattr(Validate(data=frame, reference=baseline), method)("val").interrogate()
    with pytest.raises(AssertionError):
        outcome.assert_passing()
# Test mixing auto-reference and explicit values in same validation
@pytest.mark.parametrize("method", ["col_sum_eq", "col_avg_eq", "col_sd_eq"])
def test_auto_ref_mixed_with_explicit_values(method: str, auto_ref_equal_data):
    """An implicit-ref step and an explicit ref() step coexist in one plan."""
    frame, baseline = auto_ref_equal_data
    plan = Validate(data=frame, reference=baseline)
    plan = getattr(plan, method)("val")  # implicit: falls back to ref("val")
    plan = getattr(plan, method)("val", ref("val"))  # explicit reference
    plan.interrogate().assert_passing()
# =============================================================================
# Tests for validation report display formatting
# =============================================================================
def test_agg_report_columns_display():
    """The COLUMNS cell should render bare column names, not a Python list."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    checked = Validate(frame).col_sum_gt(columns="a", value=10).interrogate()
    html = checked.get_tabular_report().as_raw_html()
    assert "['a']" not in html  # no list-repr brackets
    assert ">a<" in html  # bare column name is present
def test_agg_report_values_no_tolerance():
    """With zero tolerance the VALUES cell shows only the target value."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    checked = Validate(frame).col_sum_gt(columns="a", value=10).interrogate()
    html = checked.get_tabular_report().as_raw_html()
    assert ">10<" in html  # bare value rendered
    assert "tol=0" not in html  # zero tolerance is suppressed
def test_agg_report_values_with_symmetric_tolerance():
    """A scalar tolerance is rendered beneath the value on its own line."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    checked = Validate(frame).col_avg_eq(columns="a", value=3, tol=0.5).interrogate()
    html = checked.get_tabular_report().as_raw_html()
    assert "3<br/>tol=0.5" in html  # value and tolerance split by <br/>
def test_agg_report_values_with_asymmetric_tolerance():
    """A (lower, upper) tolerance tuple is rendered beneath the value."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    checked = Validate(frame).col_sd_le(columns="a", value=2.0, tol=(0.1, 0.2)).interrogate()
    html = checked.get_tabular_report().as_raw_html()
    assert "2.0<br/>tol=(0.1, 0.2)" in html  # value and tuple split by <br/>
def test_agg_report_values_with_reference_column():
    """An explicit ref() value is rendered as ref('column') in the report."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    baseline = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    checked = (
        Validate(frame, reference=baseline).col_sum_eq(columns="a", value=ref("a")).interrogate()
    )
    html = checked.get_tabular_report().as_raw_html()
    # Quotes may arrive HTML-escaped depending on the renderer.
    assert "ref('a')" in html or "ref(&#x27;a&#x27;)" in html
def test_agg_report_values_implicit_reference():
    """An implicitly inferred reference is also rendered as ref('column')."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    baseline = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
    # No value supplied: with reference data present this becomes ref('a').
    checked = Validate(frame, reference=baseline).col_sum_eq(columns="a").interrogate()
    html = checked.get_tabular_report().as_raw_html()
    # Quotes may arrive HTML-escaped depending on the renderer.
    assert "ref('a')" in html or "ref(&#x27;a&#x27;)" in html
def test_agg_report_multiple_steps_formatting():
    """Every step's VALUES cell formats correctly in a multi-step report."""
    frame = pl.DataFrame({"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]})
    checked = (
        Validate(frame)
        .col_sum_gt(columns="a", value=10)  # no tolerance
        .col_avg_eq(columns="b", value=30, tol=0.1)  # symmetric tolerance
        .col_sd_le(columns="a", value=2.0, tol=(0.1, 0.2))  # asymmetric tolerance
        .interrogate()
    )
    html = checked.get_tabular_report().as_raw_html()
    assert ">10<" in html  # step 1: bare value only
    assert "30<br/>tol=0.1" in html  # step 2: value plus scalar tolerance
    assert "2.0<br/>tol=(0.1, 0.2)" in html  # step 3: value plus tolerance tuple
+3
-0

@@ -144,1 +144,4 @@ # Byte-compiled / optimized / DLL files

.vscode/mcp.json
# While typing is experimental, don't mark the entire package as typed
pointblank/py.typed

@@ -28,7 +28,8 @@ # Contributing Guidelines

To set up your development environment, you can follow these steps:
To set up your development environment, first clone the posit-dev/pointblank repository.
- Clone the posit-dev/pointblank repository
- Create a virtual environment for the folder
- Install the package in editable mode with `pip install -e .` from the root of the project folder
If you're using UV, you may run `uv sync` and your environment is set up! If using pip or another package manager, keep following these steps:
- Create a virtual environment for the folder.
- Install the package in editable mode with `pip install -e .` from the root of the project folder.
- Install the development dependencies with `pip install '.[dev]'` (have a look at the `pyproject.toml` file for the list of development dependencies)

@@ -47,1 +48,27 @@

If you create new tests involving snapshots, please ensure that the resulting snapshots are relatively small. After adding snapshots, use `make test-update` (this runs `pytest --snapshot-update`). A subsequent use of `make test` should pass without any issues.
### Creating Aggregation Methods
Aggregation methods are generated dynamically! This is done because they all have the same signature and they're registered on the `Validate` class in the same way. So, to add a new method, go to `pointblank/_agg.py` and add either a comparison or statistical aggregation function.
Comparison functions are defined by `comp_*`, for example `comp_gt` for "greater than". Statistical functions are defined by `agg_*`, for example `agg_sum` for "sum". At build time, these are registered and a grid of all combinations is created:
```{python}
Aggregator = Callable[[nw.DataFrame], Any]
Comparator = Callable[[Any, Any], bool]
AGGREGATOR_REGISTRY: dict[str, Aggregator] = {}
COMPARATOR_REGISTRY: dict[str, Comparator] = {}
```
Once you've added a new method(s), run `make pyi` to generate the updated type stubs in `pointblank/validate.pyi` which contains the new signatures for the aggregation methods. At runtime, or import time to be precise, the methods are added to the `Validate` class and resolved internally through the registry.
```{python}
# pointblank/validate.py
for method in load_validation_method_grid(): # -> `col_sum_*`, `col_mean_*`, etc.
setattr(Validate, method, make_agg_validator(method))
```
At this point, the methods will exist AND the docs/signature are loaded properly in the type checker and IDE/LSPs, which is very important for usability.
### Linting and Type Checking
We use `ruff` for linting, the settings used are fairly loose and objective. Linting is run in pre-commit in CI. You can run it locally with `make lint`. Type checking is currently not enforced, but we intend on gradually typing the codebase. You can run `make type` to run Astral's new experimental type checker `ty`. Feel free to leverage type hints and occasionally type checking but it's not obligatory at this time.

@@ -117,3 +117,3 @@ project:

left: 'Proudly supported by <a href="https://www.posit.co/" class="no-icon"><img src="/assets/posit-logo-black.svg" alt="Posit" width="80" style="padding-left: 3px;vertical-align:text-top;"></a>'
right: "&copy; 2024&ndash;2025 Posit Software, PBC."
right: "&copy; 2024&ndash;2026 Posit Software, PBC."

@@ -174,2 +174,17 @@ html-table-processing: none

- name: Validate.col_vals_expr
- name: Validate.col_sum_gt
- name: Validate.col_sum_lt
- name: Validate.col_sum_ge
- name: Validate.col_sum_le
- name: Validate.col_sum_eq
- name: Validate.col_avg_gt
- name: Validate.col_avg_lt
- name: Validate.col_avg_ge
- name: Validate.col_avg_le
- name: Validate.col_avg_eq
- name: Validate.col_sd_gt
- name: Validate.col_sd_lt
- name: Validate.col_sd_ge
- name: Validate.col_sd_le
- name: Validate.col_sd_eq
- name: Validate.rows_distinct

@@ -176,0 +191,0 @@ - name: Validate.rows_complete

@@ -40,2 +40,3 @@ # Pointblank

- [Actions](https://posit-dev.github.io/pointblank/reference/Actions.html): Definition of action values.
- [FinalActions](https://posit-dev.github.io/pointblank/reference/FinalActions.html): Define actions to be taken after validation is complete.
- [Schema](https://posit-dev.github.io/pointblank/reference/Schema.html): Definition of a schema object.

@@ -53,12 +54,35 @@ - [DraftValidation](https://posit-dev.github.io/pointblank/reference/DraftValidation.html): Draft a validation plan for a given table using an LLM.

- [Validate.col_vals_not_in_set](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_not_in_set.html): Validate whether column values are not in a set of values.
- [Validate.col_vals_null](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_null.html): Validate whether values in a column are NULL.
- [Validate.col_vals_not_null](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_not_null.html): Validate whether values in a column are not NULL.
- [Validate.col_vals_increasing](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_increasing.html): Are column data increasing by row?
- [Validate.col_vals_decreasing](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_decreasing.html): Are column data decreasing by row?
- [Validate.col_vals_null](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_null.html): Validate whether values in a column are Null.
- [Validate.col_vals_not_null](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_not_null.html): Validate whether values in a column are not Null.
- [Validate.col_vals_regex](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_regex.html): Validate whether column values match a regular expression pattern.
- [Validate.col_vals_within_spec](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_within_spec.html): Validate whether column values fit within a specification.
- [Validate.col_vals_expr](https://posit-dev.github.io/pointblank/reference/Validate.col_vals_expr.html): Validate column values using a custom expression.
- [Validate.rows_distinct](https://posit-dev.github.io/pointblank/reference/Validate.rows_distinct.html): Validate whether rows in the table are distinct.
- [Validate.rows_complete](https://posit-dev.github.io/pointblank/reference/Validate.rows_complete.html): Validate whether row data are complete by having no missing values.
- [Validate.col_exists](https://posit-dev.github.io/pointblank/reference/Validate.col_exists.html): Validate whether one or more columns exist in the table.
- [Validate.col_pct_null](https://posit-dev.github.io/pointblank/reference/Validate.col_pct_null.html): Validate whether a column has a specific percentage of Null values.
- [Validate.rows_distinct](https://posit-dev.github.io/pointblank/reference/Validate.rows_distinct.html): Validate whether rows in the table are distinct.
- [Validate.col_schema_match](https://posit-dev.github.io/pointblank/reference/Validate.col_schema_match.html): Do columns in the table (and their types) match a predefined schema?
- [Validate.row_count_match](https://posit-dev.github.io/pointblank/reference/Validate.row_count_match.html): Validate whether the row count of the table matches a specified count.
- [Validate.col_count_match](https://posit-dev.github.io/pointblank/reference/Validate.col_count_match.html): Validate whether the column count of the table matches a specified count.
- [Validate.tbl_match](https://posit-dev.github.io/pointblank/reference/Validate.tbl_match.html): Validate whether the target table matches a comparison table.
- [Validate.conjointly](https://posit-dev.github.io/pointblank/reference/Validate.conjointly.html): Perform multiple row-wise validations for joint validity.
- [Validate.specially](https://posit-dev.github.io/pointblank/reference/Validate.specially.html): Perform a specialized validation with customized logic.
- [Validate.prompt](https://posit-dev.github.io/pointblank/reference/Validate.prompt.html): Validate rows using AI/LLM-powered analysis.
- [Validate.col_sum_eq](https://posit-dev.github.io/pointblank/reference/Validate.col_sum_eq.html): Does the column sum satisfy an equal to comparison?
- [Validate.col_sum_gt](https://posit-dev.github.io/pointblank/reference/Validate.col_sum_gt.html): Does the column sum satisfy a greater than comparison?
- [Validate.col_sum_ge](https://posit-dev.github.io/pointblank/reference/Validate.col_sum_ge.html): Does the column sum satisfy a greater than or equal to comparison?
- [Validate.col_sum_lt](https://posit-dev.github.io/pointblank/reference/Validate.col_sum_lt.html): Does the column sum satisfy a less than comparison?
- [Validate.col_sum_le](https://posit-dev.github.io/pointblank/reference/Validate.col_sum_le.html): Does the column sum satisfy a less than or equal to comparison?
- [Validate.col_avg_eq](https://posit-dev.github.io/pointblank/reference/Validate.col_avg_eq.html): Does the column average satisfy an equal to comparison?
- [Validate.col_avg_gt](https://posit-dev.github.io/pointblank/reference/Validate.col_avg_gt.html): Does the column average satisfy a greater than comparison?
- [Validate.col_avg_ge](https://posit-dev.github.io/pointblank/reference/Validate.col_avg_ge.html): Does the column average satisfy a greater than or equal to comparison?
- [Validate.col_avg_lt](https://posit-dev.github.io/pointblank/reference/Validate.col_avg_lt.html): Does the column average satisfy a less than comparison?
- [Validate.col_avg_le](https://posit-dev.github.io/pointblank/reference/Validate.col_avg_le.html): Does the column average satisfy a less than or equal to comparison?
- [Validate.col_sd_eq](https://posit-dev.github.io/pointblank/reference/Validate.col_sd_eq.html): Does the column standard deviation satisfy an equal to comparison?
- [Validate.col_sd_gt](https://posit-dev.github.io/pointblank/reference/Validate.col_sd_gt.html): Does the column standard deviation satisfy a greater than comparison?
- [Validate.col_sd_ge](https://posit-dev.github.io/pointblank/reference/Validate.col_sd_ge.html): Does the column standard deviation satisfy a greater than or equal to comparison?
- [Validate.col_sd_lt](https://posit-dev.github.io/pointblank/reference/Validate.col_sd_lt.html): Does the column standard deviation satisfy a less than comparison?
- [Validate.col_sd_le](https://posit-dev.github.io/pointblank/reference/Validate.col_sd_le.html): Does the column standard deviation satisfy a less than or equal to comparison?
- [col](https://posit-dev.github.io/pointblank/reference/col.html): Helper function for referencing a column in the input table.

@@ -72,3 +96,6 @@ - [starts_with](https://posit-dev.github.io/pointblank/reference/starts_with.html): Select columns that start with specified text.

- [last_n](https://posit-dev.github.io/pointblank/reference/last_n.html): Select the last `n` columns in the column list.
- [expr_col](https://posit-dev.github.io/pointblank/reference/expr_col.html): Create a column expression for use in `conjointly()` validation.
- [seg_group](https://posit-dev.github.io/pointblank/reference/seg_group.html): Group together values for segmentation.
- [Validate.interrogate](https://posit-dev.github.io/pointblank/reference/Validate.interrogate.html): Execute each validation step against the table and store the results.
- [Validate.set_tbl](https://posit-dev.github.io/pointblank/reference/Validate.set_tbl.html): Set or replace the table associated with the Validate object.
- [Validate.get_tabular_report](https://posit-dev.github.io/pointblank/reference/Validate.get_tabular_report.html): Validation report as a GT table.

@@ -81,2 +108,4 @@ - [Validate.get_step_report](https://posit-dev.github.io/pointblank/reference/Validate.get_step_report.html): Get a detailed report for a single validation step.

- [Validate.assert_passing](https://posit-dev.github.io/pointblank/reference/Validate.assert_passing.html): Raise an `AssertionError` if all tests are not passing.
- [Validate.assert_below_threshold](https://posit-dev.github.io/pointblank/reference/Validate.assert_below_threshold.html): Raise an `AssertionError` if validation steps exceed a specified threshold level.
- [Validate.above_threshold](https://posit-dev.github.io/pointblank/reference/Validate.above_threshold.html): Check if any validation steps exceed a specified threshold level.
- [Validate.n](https://posit-dev.github.io/pointblank/reference/Validate.n.html): Provides a dictionary of the number of test units for each validation step.

@@ -92,7 +121,20 @@ - [Validate.n_passed](https://posit-dev.github.io/pointblank/reference/Validate.n_passed.html): Provides a dictionary of the number of test units that passed for each validation step.

- [preview](https://posit-dev.github.io/pointblank/reference/preview.html): Display a table preview that shows some rows from the top, some from the bottom.
- [col_summary_tbl](https://posit-dev.github.io/pointblank/reference/col_summary_tbl.html): Generate a column-level summary table of a dataset.
- [missing_vals_tbl](https://posit-dev.github.io/pointblank/reference/missing_vals_tbl.html): Display a table that shows the missing values in the input table.
- [assistant](https://posit-dev.github.io/pointblank/reference/assistant.html): Chat with the PbA (Pointblank Assistant) about your data validation needs.
- [load_dataset](https://posit-dev.github.io/pointblank/reference/load_dataset.html): Load a dataset hosted in the library as specified table type.
- [get_data_path](https://posit-dev.github.io/pointblank/reference/get_data_path.html): Get the file path to a dataset included with the Pointblank package.
- [connect_to_table](https://posit-dev.github.io/pointblank/reference/connect_to_table.html): Connect to a database table using a connection string.
- [print_database_tables](https://posit-dev.github.io/pointblank/reference/print_database_tables.html): List all tables in a database from a connection string.
- [yaml_interrogate](https://posit-dev.github.io/pointblank/reference/yaml_interrogate.html): Execute a YAML-based validation workflow.
- [validate_yaml](https://posit-dev.github.io/pointblank/reference/validate_yaml.html): Validate YAML configuration against the expected structure.
- [yaml_to_python](https://posit-dev.github.io/pointblank/reference/yaml_to_python.html): Convert YAML validation configuration to equivalent Python code.
- [get_column_count](https://posit-dev.github.io/pointblank/reference/get_column_count.html): Get the number of columns in a table.
- [get_row_count](https://posit-dev.github.io/pointblank/reference/get_row_count.html): Get the number of rows in a table.
- [load_dataset](https://posit-dev.github.io/pointblank/reference/load_dataset.html): Load a dataset hosted in the library as specified table type.
- [config](https://posit-dev.github.io/pointblank/reference/config.html): Configuration settings for the pointblank library.
- [get_action_metadata](https://posit-dev.github.io/pointblank/reference/get_action_metadata.html): Access step-level metadata when authoring custom actions.
- [get_validation_summary](https://posit-dev.github.io/pointblank/reference/get_validation_summary.html): Access validation summary information when authoring final actions.
- [write_file](https://posit-dev.github.io/pointblank/reference/write_file.html): Write a Validate object to disk as a serialized file.
- [read_file](https://posit-dev.github.io/pointblank/reference/read_file.html): Read a Validate object from disk that was previously saved with `write_file()`.
- [config](https://posit-dev.github.io/pointblank/reference/config.html): Configuration settings for the Pointblank library.
- [send_slack_notification](https://posit-dev.github.io/pointblank/reference/send_slack_notification.html): Create a Slack notification function using a webhook URL.

@@ -99,0 +141,0 @@ ### User Guide

+1
-1
MIT License
Copyright (c) 2024-2025 Posit Software, PBC
Copyright (c) 2024-2026 Posit Software, PBC

@@ -5,0 +5,0 @@ Permission is hereby granted, free of charge, to any person obtaining a copy

.PHONY: check
.PHONY: pyi
pyi: ## Generate .pyi stub files
@uv run stubgen ./pointblank/validate.py \
--include-private \
-o .
@uv run scripts/generate_agg_validate_pyi.py
.PHONY: test

@@ -48,2 +55,7 @@ test:

type: ## Run experimental type checking
@uv run ty check pointblank
check:

@@ -50,0 +62,0 @@ pyright --pythonversion 3.8 pointblank

Metadata-Version: 2.4
Name: pointblank
Version: 0.17.0
Version: 0.18.0
Summary: Find out if your data is what you think it is.

@@ -8,3 +8,3 @@ Author-email: Richard Iannone <riannone@me.com>

Copyright (c) 2024-2025 Posit Software, PBC
Copyright (c) 2024-2026 Posit Software, PBC

@@ -11,0 +11,0 @@ Permission is hereby granted, free of charge, to any person obtaining a copy

Metadata-Version: 2.4
Name: pointblank
Version: 0.17.0
Version: 0.18.0
Summary: Find out if your data is what you think it is.

@@ -8,3 +8,3 @@ Author-email: Richard Iannone <riannone@me.com>

Copyright (c) 2024-2025 Posit Software, PBC
Copyright (c) 2024-2026 Posit Software, PBC

@@ -11,0 +11,0 @@ Permission is hereby granted, free of charge, to any person obtaining a copy

@@ -184,2 +184,3 @@ .gitignore

pointblank/__init__.py
pointblank/_agg.py
pointblank/_constants.py

@@ -209,2 +210,3 @@ pointblank/_constants_translations.py

pointblank/validate.py
pointblank/validate.pyi
pointblank/yaml.py

@@ -246,2 +248,3 @@ pointblank.egg-info/PKG-INFO

scripts/create_toc_pdf.py
scripts/generate_agg_validate_pyi.py
scripts/generate_llms_txt.py

@@ -256,2 +259,4 @@ scripts/html_to_pdf.py

tests/test_actions.py
tests/test_agg.py
tests/test_agg_doctests.py
tests/test_assistant.py

@@ -258,0 +263,0 @@ tests/test_cli.py

@@ -23,2 +23,3 @@ try:

matches,
ref,
starts_with,

@@ -63,2 +64,3 @@ )

"col",
"ref",
"expr_col",

@@ -65,0 +67,0 @@ "col_summary_tbl",

@@ -20,7 +20,11 @@ from __future__ import annotations

if value == 0:
formatted = "0"
formatted: str = "0"
elif abs(value) >= 1 and abs(value) < 10_000:
formatted = fmt_integer(value, use_seps=False)[0]
formatted = fmt_integer(value, use_seps=False)
assert isinstance(formatted, list)
formatted: str = formatted[0]
else:
formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
formatted = fmt_scientific(value, decimals=1, exp_style="E1")
assert isinstance(formatted, list)
formatted: str = formatted[0]

@@ -34,11 +38,21 @@ return formatted

elif abs(value) < 1 and abs(value) >= 0.01:
formatted = fmt_number(value, decimals=2)[0]
formatted = fmt_number(value, decimals=2)
assert isinstance(formatted, list)
formatted: str = formatted[0]
elif abs(value) < 0.01:
formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
formatted = fmt_scientific(value, decimals=1, exp_style="E1")
assert isinstance(formatted, list)
formatted: str = formatted[0]
elif abs(value) >= 1 and abs(value) < 1000:
formatted = fmt_number(value, n_sigfig=3)[0]
formatted = fmt_number(value, n_sigfig=3)
assert isinstance(formatted, list)
formatted: str = formatted[0]
elif abs(value) >= 1000 and abs(value) < 10_000:
formatted = fmt_number(value, decimals=0, use_seps=False)[0]
formatted = fmt_number(value, decimals=0, use_seps=False)
assert isinstance(formatted, list)
formatted: str = formatted[0]
else:
formatted = fmt_scientific(value, decimals=1, exp_style="E1")[0]
formatted = fmt_scientific(value, decimals=1, exp_style="E1")
assert isinstance(formatted, list)
formatted: str = formatted[0]

@@ -59,3 +73,5 @@ return formatted

if abs(value) < 1 and abs(value) >= 0.01:
return " " + fmt_number(value, decimals=2)[0]
formatted = fmt_number(value, decimals=2)
assert isinstance(formatted, list)
return " " + formatted[0]

@@ -68,2 +84,4 @@ if abs(value) < 0.01:

return fmt_number(value, n_sigfig=3)[0]
formatted = fmt_number(value, n_sigfig=3)
assert isinstance(formatted, list)
return formatted[0]
from __future__ import annotations
import datetime
import sys
from collections.abc import Container
from typing import List, Tuple, Union

@@ -18,2 +20,8 @@

SegmentSpec: TypeAlias = Union[str, SegmentTuple, List[SegmentItem]]
_CompliantValue: TypeAlias = Union[str, int, float, datetime.datetime, datetime.date]
"""A compliant value that pointblank can use in a validation step"""
_CompliantValues: TypeAlias = Container[_CompliantValue]
"""A collection of compliant values that pointblank can use in a validation step"""
else:

@@ -28,2 +36,6 @@ # Python 3.8 and 3.9 compatible type aliases

SegmentSpec = Union[str, SegmentTuple, List[SegmentItem]]
_CompliantValue = Union[str, int, float, datetime.datetime, datetime.date]
"""A compliant value that pointblank can use in a validation step"""
_CompliantValues = Container[_CompliantValue]
"""A collection of compliant values that pointblank can use in a validation step"""

@@ -30,0 +42,0 @@ # Add docstrings for better IDE support

@@ -10,3 +10,2 @@ from __future__ import annotations

import narwhals as nw
from narwhals.typing import FrameT

@@ -115,3 +114,3 @@ from pointblank._constants import MODEL_PROVIDERS

try:
import anthropic # noqa
import anthropic # noqa # type: ignore[import-not-found]
except ImportError:

@@ -210,3 +209,3 @@ raise ImportError(

self,
data: FrameT,
data: Any,
columns: Optional[List[str]] = None,

@@ -271,3 +270,3 @@ config: Optional[_BatchConfig] = None,

def _build_unique_rows_table(self) -> Tuple[FrameT, Dict[str, List[int]]]:
def _build_unique_rows_table(self) -> Tuple[Any, Dict[str, List[int]]]:
"""

@@ -278,3 +277,3 @@ Build unique rows table and mapping back to original indices.

-------
Tuple[FrameT, Dict[str, List[int]]]
Tuple[Any, Dict[str, List[int]]]
Unique rows table and signature-to-indices mapping.

@@ -281,0 +280,0 @@ """

from __future__ import annotations
from typing import Callable
from typing import Any, Callable

@@ -31,3 +31,3 @@ import narwhals as nw

def _check_column(column: str | list[str]):
def _check_column(column: str | list[str] | Column | ColumnSelector | nw.selectors.Selector):
"""

@@ -63,3 +63,3 @@ Check the input value of the `column=` parameter.

# TODO: allow for checking of dates/datetimes
def _check_value_float_int(value: float | int | any):
def _check_value_float_int(value: float | int | Any):
"""

@@ -66,0 +66,0 @@ Check that input value of the `value=` parameter is a float or integer.

@@ -49,2 +49,21 @@ import inspect

# Fallback for dynamically generated aggregation methods that might not have
# their docstrings properly attached yet
if not doc and obj_name.startswith("col_") and "_" in obj_name:
# Check if this looks like a dynamically generated aggregation method
# (e.g., col_sum_gt, col_avg_eq, col_sd_le)
parts_name = obj_name.split("_")
if (
len(parts_name) == 3
and parts_name[1] in ["sum", "avg", "sd"]
and parts_name[2] in ["gt", "ge", "lt", "le", "eq"]
):
try:
from pointblank.validate import _generate_agg_docstring
doc = _generate_agg_docstring(obj_name)
except Exception:
# If we can't generate the docstring, just use what we have
pass
# Combine the class name, signature, and docstring

@@ -105,2 +124,17 @@ api_text += f"{obj_name}{sig}\n{doc}\n\n"

"Validate.col_vals_expr",
"Validate.col_sum_gt",
"Validate.col_sum_lt",
"Validate.col_sum_ge",
"Validate.col_sum_le",
"Validate.col_sum_eq",
"Validate.col_avg_gt",
"Validate.col_avg_lt",
"Validate.col_avg_ge",
"Validate.col_avg_le",
"Validate.col_avg_eq",
"Validate.col_sd_gt",
"Validate.col_sd_lt",
"Validate.col_sd_ge",
"Validate.col_sd_le",
"Validate.col_sd_eq",
"Validate.rows_distinct",

@@ -337,6 +371,10 @@ "Validate.rows_complete",

# Extract the title of the example (the line beginning with `###`)
title = re.search(r"### (.*)", example_text).group(1)
title_match = re.search(r"### (.*)", example_text)
assert title_match is not None
title = title_match.group(1)
# The next line with text is the short description of the example
desc = re.search(r"(.*)\.", example_text).group(1)
desc_match = re.search(r"(.*)\.", example_text)
assert desc_match is not None
desc = desc_match.group(1)

@@ -343,0 +381,0 @@ # Get all of the Python code blocks in the example

@@ -10,6 +10,7 @@ from __future__ import annotations

from great_tables import GT
from narwhals.dependencies import is_narwhals_dataframe, is_narwhals_lazyframe
from great_tables.gt import _get_column_of_values
from narwhals.typing import FrameT
from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES, IBIS_BACKENDS
from pointblank.column import Column, ColumnLiteral, ColumnSelector, ColumnSelectorNarwhals, col

@@ -19,2 +20,4 @@ if TYPE_CHECKING:

from narwhals.typing import IntoFrame, IntoFrameT
from pointblank._typing import AbsoluteBounds, Tolerance

@@ -40,2 +43,3 @@

# TODO: doctest
def _derive_single_bound(ref: int, tol: int | float) -> int:

@@ -50,6 +54,7 @@ """Derive a single bound using the reference."""

# TODO: doctest
def _derive_bounds(ref: int, tol: Tolerance) -> AbsoluteBounds:
"""Validate and extract the absolute bounds of the tolerance."""
if isinstance(tol, tuple):
return tuple(_derive_single_bound(ref, t) for t in tol)
return (_derive_single_bound(ref, tol[0]), _derive_single_bound(ref, tol[1]))

@@ -60,3 +65,3 @@ bound = _derive_single_bound(ref, tol)

def _get_tbl_type(data: FrameT | Any) -> str:
def _get_tbl_type(data: Any) -> str:
type_str = str(type(data))

@@ -118,3 +123,3 @@

def _process_ibis_through_narwhals(data: FrameT | Any, tbl_type: str) -> tuple[FrameT | Any, str]:
def _process_ibis_through_narwhals(data: Any, tbl_type: str) -> tuple[Any, str]:
"""

@@ -129,5 +134,5 @@ Process Ibis tables through Narwhals to unify the processing pathway.

----------
data : FrameT | Any
data
The data table, potentially an Ibis table
tbl_type : str
tbl_type
The detected table type

@@ -137,3 +142,3 @@

-------
tuple[FrameT | Any, str]
tuple[Any, str]
A tuple of (processed_data, updated_tbl_type) where:

@@ -156,3 +161,3 @@ - processed_data is the Narwhals-wrapped table if it was Ibis, otherwise original data

def _is_narwhals_table(data: any) -> bool:
def _is_narwhals_table(data: Any) -> bool:
# Check if the data is a Narwhals DataFrame

@@ -168,3 +173,3 @@ type_str = str(type(data)).lower()

def _is_lazy_frame(data: any) -> bool:
def _is_lazy_frame(data: Any) -> bool:
# Check if the data is a Polars or Narwhals DataFrame

@@ -193,11 +198,13 @@ type_str = str(type(data)).lower()

# Determine whether Pandas or Polars is available
pd = None
try:
import pandas as pd
except ImportError:
pd = None
pass
pl = None
try:
import polars as pl
except ImportError:
pl = None
pass

@@ -225,12 +232,14 @@ # If neither Pandas nor Polars is available, raise an ImportError

# Determine whether Pandas is available
pd = None
try:
import pandas as pd
except ImportError:
pd = None
pass
# Determine whether Pandas is available
# Determine whether Polars is available
pl = None
try:
import polars as pl
except ImportError:
pl = None
pass

@@ -255,3 +264,4 @@ # TODO: replace this with the `_check_any_df_lib()` function, introduce `method_used=` param

def _copy_dataframe(df):
# TODO: Good argument exceptions should be handled by caller
def _copy_dataframe(df: IntoFrameT) -> IntoFrameT:
"""

@@ -296,8 +306,11 @@ Create a copy of a DataFrame, handling different DataFrame types.

def _convert_to_narwhals(df: FrameT) -> nw.DataFrame:
# TODO: Should straight up remove this
def _convert_to_narwhals(df: IntoFrame) -> nw.DataFrame[Any] | nw.LazyFrame[Any]:
# Convert the DataFrame to a format that narwhals can work with
return nw.from_native(df)
result = nw.from_native(df)
assert is_narwhals_dataframe(result) or is_narwhals_lazyframe(result)
return result
def _check_column_exists(dfn: nw.DataFrame, column: str) -> None:
def _check_column_exists(dfn: nw.DataFrame[Any] | nw.LazyFrame[Any], column: str) -> None:
"""

@@ -309,3 +322,3 @@ Check if a column exists in a DataFrame.

dfn
A Narwhals DataFrame.
A Narwhals DataFrame or LazyFrame.
column

@@ -325,3 +338,3 @@ The column to check for existence.

def _count_true_values_in_column(
tbl: FrameT,
tbl: IntoFrame,
column: str,

@@ -356,3 +369,3 @@ inverse: bool = False,

# Always collect table if it is a LazyFrame; this is required to get the row count
if _is_lazy_frame(tbl_filtered):
if is_narwhals_lazyframe(tbl_filtered):
tbl_filtered = tbl_filtered.collect()

@@ -364,3 +377,3 @@

def _count_null_values_in_column(
tbl: FrameT,
tbl: IntoFrame,
column: str,

@@ -392,3 +405,3 @@ ) -> int:

# Always collect table if it is a LazyFrame; this is required to get the row count
if _is_lazy_frame(tbl_filtered):
if is_narwhals_lazyframe(tbl_filtered):
tbl_filtered = tbl_filtered.collect()

@@ -457,4 +470,7 @@

def _get_column_dtype(
dfn: nw.DataFrame, column: str, raw: bool = False, lowercased: bool = True
) -> str:
dfn: nw.DataFrame[Any] | nw.LazyFrame[Any],
column: str,
raw: bool = False,
lowercased: bool = True,
) -> str | nw.dtypes.DType | None:
"""

@@ -470,3 +486,3 @@ Get the data type of a column in a DataFrame.

raw
If `True`, return the raw data type string.
If `True`, return the raw DType object (or None if column not found).
lowercased

@@ -477,4 +493,4 @@ If `True`, return the data type string in lowercase.

-------
str
The data type of the column.
str | nw.dtypes.DType | None
The data type of the column as a string, or the raw DType object if `raw=True`.
"""

@@ -493,3 +509,5 @@

def _check_column_type(dfn: nw.DataFrame, column: str, allowed_types: list[str]) -> None:
def _check_column_type(
dfn: nw.DataFrame[Any] | nw.LazyFrame[Any], column: str, allowed_types: list[str]
) -> None:
"""

@@ -546,4 +564,4 @@ Check if a column is of a certain data type.

def _column_test_prep(
df: FrameT, column: str, allowed_types: list[str] | None, check_exists: bool = True
) -> nw.DataFrame:
df: IntoFrame, column: str, allowed_types: list[str] | None, check_exists: bool = True
) -> nw.DataFrame[Any] | nw.LazyFrame[Any]:
# Convert the DataFrame to a format that narwhals can work with.

@@ -564,4 +582,4 @@ dfn = _convert_to_narwhals(df=df)

def _column_subset_test_prep(
df: FrameT, columns_subset: list[str] | None, check_exists: bool = True
) -> nw.DataFrame:
df: IntoFrame, columns_subset: list[str] | None, check_exists: bool = True
) -> nw.DataFrame[Any] | nw.LazyFrame[Any]:
# Convert the DataFrame to a format that narwhals can work with.

@@ -578,19 +596,38 @@ dfn = _convert_to_narwhals(df=df)

def _get_fn_name() -> str:
# Get the current function name
fn_name = inspect.currentframe().f_back.f_code.co_name
_PBUnresolvedColumn = str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals
_PBResolvedColumn = Column | ColumnLiteral | ColumnSelectorNarwhals | list[Column] | list[str]
return fn_name
def _resolve_columns(columns: _PBUnresolvedColumn) -> _PBResolvedColumn:
# If `columns` is a ColumnSelector or Narwhals selector, call `col()` on it to later
# resolve the columns
if isinstance(columns, (ColumnSelector, nw.selectors.Selector)):
columns = col(columns)
def _get_assertion_from_fname() -> str:
# If `columns` is Column value or a string, place it in a list for iteration
if isinstance(columns, (Column, str)):
columns = [columns]
return columns
def _get_fn_name() -> str | None:
# Get the current function name
func_name = inspect.currentframe().f_back.f_code.co_name
frame = inspect.currentframe()
if frame is None or frame.f_back is None:
return None
return frame.f_back.f_code.co_name
def _get_assertion_from_fname() -> str | None:
# Get the current function name
frame = inspect.currentframe()
if frame is None or frame.f_back is None:
return None
func_name = frame.f_back.f_code.co_name
# Use the `ASSERTION_TYPE_METHOD_MAP` dictionary to get the assertion type
assertion = ASSERTION_TYPE_METHOD_MAP.get(func_name)
return ASSERTION_TYPE_METHOD_MAP.get(func_name)
return assertion
def _check_invalid_fields(fields: list[str], valid_fields: list[str]):

@@ -689,3 +726,3 @@ """

result_dict = {}
for col, sub_dict in col_dict.items():
for _col, sub_dict in col_dict.items():
for key, value in sub_dict.items():

@@ -695,3 +732,3 @@ # add columns fields not present

result_dict[key] = [None] * len(col_dict)
result_dict[key][list(col_dict.keys()).index(col)] = value
result_dict[key][list(col_dict.keys()).index(_col)] = value
return result_dict

@@ -18,3 +18,3 @@ from __future__ import annotations

debug: bool = False,
) -> Callable:
) -> Callable | None:
"""

@@ -21,0 +21,0 @@ Create a Slack notification function using a webhook URL.

@@ -6,3 +6,2 @@ from __future__ import annotations

from importlib_resources import files
from narwhals.typing import FrameT

@@ -19,3 +18,3 @@ from pointblank._constants import MODEL_PROVIDERS

model: str,
data: FrameT | Any | None = None,
data: Any = None,
tbl_name: str | None = None,

@@ -300,3 +299,3 @@ api_key: str | None = None,

try:
import anthropic # noqa
import anthropic # noqa # type: ignore[import-not-found]
except ImportError: # pragma: no cover

@@ -303,0 +302,0 @@ raise ImportError( # pragma: no cover

@@ -6,3 +6,3 @@ from __future__ import annotations

from importlib.metadata import version
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, cast

@@ -12,3 +12,2 @@ import narwhals as nw

from narwhals.dataframe import LazyFrame
from narwhals.typing import FrameT

@@ -23,3 +22,3 @@ from pointblank._utils_html import _create_table_dims_html, _create_table_type_html, _fmt_frac

from narwhals.dataframe import DataFrame
from narwhals.typing import Frame, IntoFrameT
from narwhals.typing import Frame

@@ -129,3 +128,3 @@ from pointblank.scan_profile_stats import StatGroup

# TODO: This needs to be generically typed at the class level, ie. DataScan[T]
def __init__(self, data: IntoFrameT, tbl_name: str | None = None) -> None:
def __init__(self, data: Any, tbl_name: str | None = None) -> None:
# Import processing functions from validate module

@@ -179,3 +178,3 @@ from pointblank.validate import (

for column in columns:
col_data: DataFrame = self.nw_data.select(column)
col_data: Frame = self.nw_data.select(column)

@@ -191,3 +190,3 @@ ## Handle dtyping:

col_profile = ColumnProfile(colname=column, coltype=native_dtype)
col_profile = ColumnProfile(colname=column, coltype=str(native_dtype))

@@ -214,3 +213,3 @@ ## Collect Sample Data:

@property
def summary_data(self) -> IntoFrameT:
def summary_data(self) -> Any:
return self.profile.as_dataframe(strict=False).to_native()

@@ -328,7 +327,6 @@

# this is an anti-pattern but there's no serious alternative
_backend = cast(Any, self.profile.implementation)
for _fmt_col in ("__frac_n_unique", "__frac_n_missing"):
_formatted: list[str | None] = _fmt_frac(formatted_data[_fmt_col])
formatted: nw.Series = nw.new_series(
_fmt_col, values=_formatted, backend=self.profile.implementation
)
formatted: nw.Series = nw.new_series(_fmt_col, values=_formatted, backend=_backend)
formatted_data = formatted_data.drop(_fmt_col)

@@ -376,6 +374,6 @@ formatted_data = formatted_data.with_columns(formatted.alias(_fmt_col))

true_ser: nw.Series = nw.new_series(
name="__freq_true", values=trues, backend=self.profile.implementation
name="__freq_true", values=trues, backend=_backend
)
false_ser: nw.Series = nw.new_series(
name="__freq_false", values=falses, backend=self.profile.implementation
name="__freq_false", values=falses, backend=_backend
)

@@ -394,5 +392,3 @@ formatted_data = formatted_data.with_columns(

_formatted: list[str | None] = _fmt_frac(formatted_data[_fmt_col])
formatted = nw.new_series(
name=_fmt_col, values=_formatted, backend=self.profile.implementation
)
formatted = nw.new_series(name=_fmt_col, values=_formatted, backend=_backend)
formatted_data = formatted_data.drop(_fmt_col)

@@ -472,3 +468,7 @@ formatted_data = formatted_data.with_columns(formatted.alias(_fmt_col))

.cols_width(
icon="35px", colname="200px", **{stat_col: "60px" for stat_col in present_stat_cols}
cases={
"icon": "35px",
"colname": "200px",
**{stat_col: "60px" for stat_col in present_stat_cols},
}
)

@@ -512,3 +512,3 @@ )

def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
def col_summary_tbl(data: Any, tbl_name: str | None = None) -> GT:
"""

@@ -515,0 +515,0 @@ Generate a column-level summary table of a dataset.

@@ -7,3 +7,2 @@ from __future__ import annotations

from importlib_resources import files
from narwhals.typing import FrameT

@@ -227,3 +226,3 @@ from pointblank._constants import MODEL_PROVIDERS

data: FrameT | Any
data: Any
model: str

@@ -333,3 +332,3 @@ api_key: str | None = None

try:
import anthropic # noqa
import anthropic # noqa # type: ignore[import-not-found]
except ImportError: # pragma: no cover

@@ -336,0 +335,0 @@ raise ImportError( # pragma: no cover

@@ -8,3 +8,3 @@ from __future__ import annotations

from enum import Enum
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, ClassVar

@@ -100,2 +100,3 @@ import narwhals as nw

statistics: MutableSequence[Stat] = field(default_factory=lambda: [])
_type: ClassVar[_TypeMap] # Defined by subclasses

@@ -102,0 +103,0 @@ @property

@@ -5,2 +5,3 @@ from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

@@ -12,2 +13,5 @@ import narwhals as nw

if TYPE_CHECKING:
from typing import Any
__all__ = ["Schema", "_check_schema_match"]

@@ -274,6 +278,4 @@

columns: str | list[str] | list[tuple[str, str]] | list[tuple[str]] | dict[str, str] | None = (
None
)
tbl: any | None = None
columns: list[tuple[str, ...]] | None = None
tbl: Any | None = None

@@ -285,3 +287,3 @@ def __init__(

) = None,
tbl: any | None = None,
tbl: Any | None = None,
**kwargs,

@@ -394,2 +396,4 @@ ):

"""
if self.columns is None or other.columns is None:
return self.columns is None and other.columns is None

@@ -471,2 +475,4 @@ if not case_sensitive_colnames:

"""
if self.columns is None or other.columns is None:
return self.columns is None and other.columns is None

@@ -556,2 +562,4 @@ if not case_sensitive_colnames:

"""
if self.columns is None or other.columns is None:
return self.columns is None and other.columns is None

@@ -643,2 +651,4 @@ if not case_sensitive_colnames:

"""
if self.columns is None or other.columns is None:
return self.columns is None and other.columns is None

@@ -713,2 +723,4 @@ if not case_sensitive_colnames:

"""
if self.columns is None:
return []
return [col[0] for col in self.columns]

@@ -725,5 +737,7 @@

"""
if self.columns is None:
return []
return [col[1] for col in self.columns]
def get_schema_coerced(self, to: str | None = None) -> dict[str, str]:
def get_schema_coerced(self, to: str | None = None) -> Schema:
# If a table isn't provided, we cannot use this method

@@ -768,4 +782,11 @@ if self.tbl is None:

raise ValueError(
f"Cannot coerce schema from '{self.tbl_type}' to '{to}'. "
"Supported conversions: pandas->polars, polars->pandas."
)
def __str__(self):
formatted_columns = []
if self.columns is None:
return "Pointblank Schema (empty)"
for col in self.columns:

@@ -784,4 +805,11 @@ if len(col) == 1: # Only column name provided (no data type)

def _process_columns(
*, columns: str | list[str] | list[tuple[str, str]] | dict[str, str] | None = None, **kwargs
) -> list[tuple[str, str]]:
*,
columns: str
| list[str]
| list[tuple[str, str]]
| list[tuple[str]]
| dict[str, str]
| None = None,
**kwargs,
) -> list[tuple[str, ...]]:
"""

@@ -800,4 +828,4 @@ Process column information provided as individual arguments or as a list of

-------
list[tuple[str, str]]
A list of tuples containing column information.
list[tuple[str, ...]]
A list of tuples containing column information (name only or name and dtype).
"""

@@ -807,5 +835,8 @@ if columns is not None:

if all(isinstance(col, str) for col in columns):
return [(col,) for col in columns]
# Type narrowing: after the all() check, columns contains only strings
str_columns: list[str] = columns # type: ignore[assignment]
return [(col,) for col in str_columns]
else:
return columns
# Type narrowing: columns contains tuples
return columns # type: ignore[return-value]

@@ -827,7 +858,7 @@ if isinstance(columns, str):

dtype_present: bool,
dtype_input: str | list[str],
dtype_input: str | list[str] | None,
dtype_matched: bool,
dtype_multiple: bool,
dtype_matched_pos: int,
) -> dict[str, any]:
dtype_matched_pos: int | None,
) -> dict[str, Any]:
return {

@@ -847,4 +878,4 @@ "colname_matched": colname_matched,

colnames: list[str] | None,
colname_dict: list[dict[str, any]] | None,
) -> dict[str, dict[str, any]]:
colname_dict: list[dict[str, Any]] | None,
) -> dict[str, dict[str, Any]]:
"""

@@ -866,2 +897,3 @@ Generate the columns dictionary for the schema information dictionary.

"""
assert colnames is not None and colname_dict is not None
return {colnames[i]: colname_dict[i] for i in range(len(colnames))}

@@ -876,3 +908,3 @@

full_match_dtypes: bool,
) -> dict[str, any]:
) -> dict[str, Any]:
"""

@@ -910,3 +942,3 @@ Generate the parameters dictionary for the schema information dictionary.

def _get_schema_validation_info(
data_tbl: any,
data_tbl: Any,
schema: Schema,

@@ -919,3 +951,3 @@ passed: bool,

full_match_dtypes: bool,
) -> dict[str, any]:
) -> dict[str, Any]:
"""

@@ -972,2 +1004,6 @@ Get the schema validation information dictionary.

# Both schemas must have columns for validation
assert schema_exp.columns is not None, "Expected schema must have columns"
assert schema_tgt.columns is not None, "Target schema must have columns"
# Initialize the schema information dictionary

@@ -1146,2 +1182,7 @@ schema_info = {

if colname_matched and dtype_present:
# Type narrowing: matched_to is not None when colname_matched is True
# and dtype_input is not None when dtype_present is True
assert matched_to is not None
assert dtype_input is not None
# Get the dtype of the column in the target table

@@ -1148,0 +1189,0 @@ dtype_tgt = schema_tgt.columns[tgt_colnames.index(matched_to)][1]

from __future__ import annotations
from dataclasses import dataclass, field
from typing import Callable
from typing import Callable, cast

@@ -183,5 +183,6 @@ __all__ = ["Thresholds", "Actions", "FinalActions"]

if isinstance(threshold_value, int):
threshold_value = _convert_abs_count_to_fraction(
value=threshold_value, test_units=test_units
)
converted = _convert_abs_count_to_fraction(value=threshold_value, test_units=test_units)
if converted is None:
return None
threshold_value = converted

@@ -191,3 +192,3 @@ return fraction_failing >= threshold_value

def _convert_abs_count_to_fraction(value: int | None, test_units: int) -> float:
def _convert_abs_count_to_fraction(value: int | None, test_units: int) -> float | None:
# Using a integer value signifying the total number of 'test units' (in the

@@ -256,3 +257,3 @@ # context of a validation), we convert an integer count (absolute) threshold

# Check keys for invalid entries and raise a ValueError if any are found
invalid_keys = set(thresholds.keys()) - {"warning", "error", "critical"}
invalid_keys: set = set(thresholds.keys()) - {"warning", "error", "critical"}

@@ -262,3 +263,3 @@ if invalid_keys:

thresholds = Thresholds(**thresholds)
thresholds = Thresholds(**cast(dict[str, int | float | None], thresholds))

@@ -490,8 +491,8 @@ elif isinstance(thresholds, Thresholds):

self, value: str | Callable | list[str | Callable] | None
) -> list[str | Callable]:
) -> list[str | Callable] | None:
if value is None:
return None
if not isinstance(value, list):
return [value]
return value
if isinstance(value, list):
return cast(list[str | Callable], value)
return [value]

@@ -635,3 +636,4 @@ def __repr__(self) -> str:

action_reprs = ", ".join(
f"'{a}'" if isinstance(a, str) else a.__name__ for a in self.actions
f"'{a}'" if isinstance(a, str) else getattr(a, "__name__", repr(a))
for a in self.actions
)

@@ -642,3 +644,3 @@ return f"FinalActions([{action_reprs}])"

elif callable(self.actions):
return f"FinalActions({self.actions.__name__})"
return f"FinalActions({getattr(self.actions, '__name__', repr(self.actions))})"
else:

@@ -645,0 +647,0 @@ return f"FinalActions({self.actions})" # pragma: no cover

@@ -5,6 +5,5 @@ from __future__ import annotations

from pathlib import Path
from typing import Any, Iterable, Mapping, Optional, Union
from typing import TYPE_CHECKING, Any, Iterable, Mapping, Optional, Union
import yaml
from narwhals.typing import FrameT

@@ -15,3 +14,6 @@ from pointblank._utils import _is_lib_present

if TYPE_CHECKING:
from typing import Literal
class YAMLValidationError(Exception):

@@ -102,4 +104,4 @@ """Exception raised for YAML validation errors."""

try:
safe_namespace[alias] = import_module(module_name)
except ImportError as e:
safe_namespace[alias] = import_module(str(module_name))
except ImportError as e: # TODO: This is basically redundant, remove?
raise ImportError(

@@ -382,3 +384,5 @@ f"Could not import requested namespace '{module_name}': {e}"

def _load_data_source(self, tbl_spec: str, df_library: str = "polars") -> Any:
def _load_data_source(
self, tbl_spec: str, df_library: Literal["polars", "pandas", "duckdb"]
) -> Any:
"""Load data source based on table specification.

@@ -800,3 +804,3 @@

yaml: Union[str, Path],
set_tbl: Union[FrameT, Any, None] = None,
set_tbl: Any = None,
namespaces: Optional[Union[Iterable[str], Mapping[str, str]]] = None,

@@ -803,0 +807,0 @@ ) -> Validate:

@@ -61,7 +61,3 @@ [build-system]

]
mcp = [
"mcp[cli]>=1.10.1",
"fastmcp>=2.11.3",
"pytest-asyncio>=1.0.0",
]
mcp = ["mcp[cli]>=1.10.1", "fastmcp>=2.11.3", "pytest-asyncio>=1.0.0"]

@@ -86,3 +82,3 @@ excel = ["openpyxl>=3.0.0"]

"openpyxl>=3.0.0",
"duckdb>=1.2.0,<1.3.3", # Pin to stable versions avoiding 1.4.0+ RecordBatchReader issues
"duckdb>=1.2.0,<1.3.3", # Pin to stable versions avoiding 1.4.0+ RecordBatchReader issues
]

@@ -93,3 +89,3 @@

"chatlas>=0.6.1",
"duckdb>=1.2.0,<1.3.3", # Pin to stable versions avoiding 1.4.0+ RecordBatchReader issues
"duckdb>=1.2.0,<1.3.3", # Pin to stable versions avoiding 1.4.0+ RecordBatchReader issues
"griffe==0.38.1",

@@ -114,3 +110,2 @@ "hypothesis>=6.129.2",

"pytest-xdist>=3.6.1",
"pytest-xdist>=3.6.1",
"pytz>=2025.2",

@@ -123,2 +118,4 @@ "quartodoc>=0.8.1; python_version >= '3.9'",

"fastmcp>=2.10.5",
"ty>=0.0.1a31",
"mypy>=1.19.0",
]

@@ -125,0 +122,0 @@

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display