Latest Threat Research:SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains.Details
Socket
Book a DemoInstallSign in
Socket

arkas

Package Overview
Dependencies
Maintainers
1
Versions
16
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

arkas - npm Package Compare versions

Comparing version
0.0.1a10
to
0.0.1a11
+108
src/arkas/analyzer/column_correlation.py
r"""Implement an analyzer that analyzes the correlation between numeric
columns."""
from __future__ import annotations
__all__ = ["ColumnCorrelationAnalyzer"]
import logging
from typing import TYPE_CHECKING
from grizz.utils.format import str_shape_diff
from polars import selectors as cs
from arkas.analyzer.lazy import BaseInNLazyAnalyzer
from arkas.output import EmptyOutput
from arkas.output.column_correlation import ColumnCorrelationOutput
from arkas.state.target_dataframe import TargetDataFrameState
if TYPE_CHECKING:
from collections.abc import Sequence
import polars as pl
logger = logging.getLogger(__name__)
class ColumnCorrelationAnalyzer(BaseInNLazyAnalyzer):
    r"""Analyze the correlation between a target column and the other
    numeric columns of a DataFrame.

    Args:
        target_column: The name of the target column. If this column
            is not in the analyzed DataFrame, the analysis is skipped
            and an ``EmptyOutput`` is returned.
        columns: The columns to analyze. If ``None``, it analyzes all
            the columns.
        exclude_columns: The columns to exclude from the input
            ``columns``. If any column is not found, it will be ignored
            during the filtering process.
        missing_policy: The policy on how to handle missing columns.
            The following options are available: ``'ignore'``,
            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
            is raised if at least one column is missing.
            If ``'warn'``, a warning is raised if at least one column
            is missing and the missing columns are ignored.
            If ``'ignore'``, the missing columns are ignored and
            no warning message appears.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import ColumnCorrelationAnalyzer
    >>> analyzer = ColumnCorrelationAnalyzer(target_column="col3")
    >>> analyzer
    ColumnCorrelationAnalyzer(target_column='col3', columns=None, exclude_columns=(), missing_policy='raise')
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    ColumnCorrelationOutput(
      (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(
        self,
        target_column: str,
        columns: Sequence[str] | None = None,
        exclude_columns: Sequence[str] = (),
        missing_policy: str = "raise",
    ) -> None:
        super().__init__(
            columns=columns,
            exclude_columns=exclude_columns,
            missing_policy=missing_policy,
        )
        self._target_column = target_column

    def find_columns(self, frame: pl.DataFrame) -> tuple[str, ...]:
        # The target column is always part of the result, appended last
        # when the parent lookup did not already return it.
        found = tuple(super().find_columns(frame))
        if self._target_column in found:
            return found
        return (*found, self._target_column)

    def get_args(self) -> dict:
        # Parent arguments are merged last, so they win on a key clash.
        args = {"target_column": self._target_column}
        return args | super().get_args()

    def _analyze(self, frame: pl.DataFrame) -> ColumnCorrelationOutput | EmptyOutput:
        # Nothing can be computed without the target column.
        if self._target_column not in frame:
            logger.info(
                f"Skipping '{self.__class__.__qualname__}.analyze' "
                f"because the target column {self._target_column!r} is missing"
            )
            return EmptyOutput()

        logger.info(
            f"Analyzing the correlation between {self._target_column} and {self._columns}..."
        )
        # Keep only the requested columns that are also numeric.
        selector = cs.by_name(list(self.find_common_columns(frame))) & cs.numeric()
        numeric_frame = frame.select(selector)
        logger.info(str_shape_diff(orig=frame.shape, final=numeric_frame.shape))
        state = TargetDataFrameState(dataframe=numeric_frame, target_column=self._target_column)
        return ColumnCorrelationOutput(state=state)
r"""Implement an analyzer that analyzes the correlation between two
columns."""
from __future__ import annotations
__all__ = ["CorrelationAnalyzer"]
import logging
from typing import TYPE_CHECKING, Any
from coola import objects_are_equal
from coola.utils.format import repr_mapping_line
from grizz.utils.column import check_column_missing_policy, check_missing_column
from grizz.utils.format import str_shape_diff
from arkas.analyzer.lazy import BaseLazyAnalyzer
from arkas.metric.utils import check_nan_policy
from arkas.output import EmptyOutput
from arkas.output.correlation import CorrelationOutput
from arkas.state.dataframe import DataFrameState
if TYPE_CHECKING:
import polars as pl
from arkas.figure import BaseFigureConfig
logger = logging.getLogger(__name__)
class CorrelationAnalyzer(BaseLazyAnalyzer):
    r"""Analyze the correlation between two columns of a DataFrame.

    Args:
        x: The first column.
        y: The second column.
        drop_nulls: If ``True``, the rows with null values in
            ``x`` or ``y`` columns are dropped.
        missing_policy: The policy on how to handle missing columns.
            The following options are available: ``'ignore'``,
            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
            is raised if at least one column is missing.
            If ``'warn'``, a warning is raised if at least one column
            is missing and the missing columns are ignored.
            If ``'ignore'``, the missing columns are ignored and
            no warning message appears.
        nan_policy: The policy on how to handle NaN values in the input
            arrays. The following options are available: ``'omit'``,
            ``'propagate'``, and ``'raise'``.
        figure_config: The figure configuration.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import CorrelationAnalyzer
    >>> analyzer = CorrelationAnalyzer(x="col1", y="col2")
    >>> analyzer
    CorrelationAnalyzer(x='col1', y='col2', drop_nulls=True, missing_policy='raise', nan_policy='propagate', figure_config=None)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ...     schema={"col1": pl.Float64, "col2": pl.Float64, "col3": pl.Float64},
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    CorrelationOutput(
      (state): DataFrameState(dataframe=(7, 2), figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(
        self,
        x: str,
        y: str,
        drop_nulls: bool = True,
        missing_policy: str = "raise",
        nan_policy: str = "propagate",
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        # Validate both policies before storing anything.
        check_column_missing_policy(missing_policy)
        check_nan_policy(nan_policy)

        self._x = x
        self._y = y
        self._drop_nulls = bool(drop_nulls)
        self._missing_policy = missing_policy
        self._nan_policy = nan_policy
        self._figure_config = figure_config

    def __repr__(self) -> str:
        args = repr_mapping_line(self.get_args())
        return f"{self.__class__.__qualname__}({args})"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two analyzers are equal when they have the same class and the
        # same constructor arguments.
        if isinstance(other, self.__class__):
            return objects_are_equal(self.get_args(), other.get_args(), equal_nan=equal_nan)
        return False

    def get_args(self) -> dict:
        return {
            "x": self._x,
            "y": self._y,
            "drop_nulls": self._drop_nulls,
            "missing_policy": self._missing_policy,
            "nan_policy": self._nan_policy,
            "figure_config": self._figure_config,
        }

    def _analyze(self, frame: pl.DataFrame) -> CorrelationOutput | EmptyOutput:
        # Apply the missing-column policy first; if it did not raise,
        # a missing column simply skips the analysis.
        self._check_input_column(frame)
        for col in (self._x, self._y):
            if col in frame:
                continue
            logger.info(
                f"Skipping '{self.__class__.__qualname__}.analyze' "
                f"because the input column {col!r} is missing"
            )
            return EmptyOutput()

        logger.info(f"Analyzing the correlation between {self._x!r} and {self._y!r}...")
        prepared = self._prepare_data(frame)
        logger.info(str_shape_diff(orig=frame.shape, final=prepared.shape))
        state = DataFrameState(
            dataframe=prepared,
            nan_policy=self._nan_policy,
            figure_config=self._figure_config,
        )
        return CorrelationOutput(state)

    def _prepare_data(self, data: pl.DataFrame) -> pl.DataFrame:
        # Keep only the two analyzed columns, optionally without the
        # rows that contain a null value.
        cols = [self._x, self._y]
        selected = data.select(cols)
        if not self._drop_nulls:
            return selected
        logger.info(f"Dropping rows that have at least one null value in the columns: {cols}")
        return selected.drop_nulls()

    def _check_input_column(self, frame: pl.DataFrame) -> None:
        r"""Apply the missing-column policy to both input columns.

        Args:
            frame: The input DataFrame to check.
        """
        for column in (self._x, self._y):
            check_missing_column(frame, column=column, missing_policy=self._missing_policy)
r"""Implement an analyzer that generates a summary of the numeric
columns of a DataFrame."""
from __future__ import annotations
__all__ = ["NumericSummaryAnalyzer"]
import logging
from typing import TYPE_CHECKING
from grizz.utils.format import str_shape_diff
from polars import selectors as cs
from arkas.analyzer.lazy import BaseInNLazyAnalyzer
from arkas.output.numeric_summary import NumericSummaryOutput
from arkas.state.dataframe import DataFrameState
if TYPE_CHECKING:
import polars as pl
logger = logging.getLogger(__name__)
class NumericSummaryAnalyzer(BaseInNLazyAnalyzer):
    r"""Summarize the numeric columns of a DataFrame.

    Args:
        columns: The columns to analyze. If ``None``, it analyzes all
            the columns.
        exclude_columns: The columns to exclude from the input
            ``columns``. If any column is not found, it will be ignored
            during the filtering process.
        missing_policy: The policy on how to handle missing columns.
            The following options are available: ``'ignore'``,
            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
            is raised if at least one column is missing.
            If ``'warn'``, a warning is raised if at least one column
            is missing and the missing columns are ignored.
            If ``'ignore'``, the missing columns are ignored and
            no warning message appears.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import NumericSummaryAnalyzer
    >>> analyzer = NumericSummaryAnalyzer()
    >>> analyzer
    NumericSummaryAnalyzer(columns=None, exclude_columns=(), missing_policy='raise')
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int32, "col3": pl.Float64},
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    NumericSummaryOutput(
      (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def _analyze(self, frame: pl.DataFrame) -> NumericSummaryOutput:
        logger.info("Analyzing the numeric columns...")
        # Keep only the requested columns that are also numeric.
        selector = cs.by_name(self.find_common_columns(frame)) & cs.numeric()
        numeric_frame = frame.select(selector)
        logger.info(str_shape_diff(orig=frame.shape, final=numeric_frame.shape))
        state = DataFrameState(numeric_frame)
        return NumericSummaryOutput(state=state)
r"""Implement an analyzer that generates a summary of the DataFrame."""
from __future__ import annotations
__all__ = ["SummaryAnalyzer"]
import logging
from typing import TYPE_CHECKING
from arkas.analyzer.lazy import BaseLazyAnalyzer
from arkas.output.summary import SummaryOutput
from arkas.utils.validation import check_positive
if TYPE_CHECKING:
import polars as pl
logger = logging.getLogger(__name__)
class SummaryAnalyzer(BaseLazyAnalyzer):
    r"""Build a summary of a DataFrame.

    Args:
        top: The number of most frequent values to show.
        sort: If ``True``, sort the columns by alphabetical order.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import SummaryAnalyzer
    >>> analyzer = SummaryAnalyzer()
    >>> analyzer
    SummaryAnalyzer(top=5, sort=False)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 0, 1],
    ...         "col2": [1, 0, 1, 0],
    ...         "col3": [1, 1, 1, 1],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int64, "col3": pl.Int64},
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    SummaryOutput(shape=(4, 3), top=5)

    ```
    """

    def __init__(self, top: int = 5, sort: bool = False) -> None:
        # ``top`` is validated before any attribute is stored.
        check_positive(name="top", value=top)
        self._top = top
        self._sort = bool(sort)

    def __repr__(self) -> str:
        return f"{self.__class__.__qualname__}(top={self._top:,}, sort={self._sort})"

    def _analyze(self, frame: pl.DataFrame) -> SummaryOutput:
        logger.info("Analyzing the DataFrame...")
        # Reorder the columns alphabetically only when requested.
        ordered = frame.select(sorted(frame.columns)) if self._sort else frame
        return SummaryOutput(frame=ordered, top=self._top)
r"""Contain the implementation of a HTML content generator that analyzes
the correlation between 1 target column and other columns."""
from __future__ import annotations
__all__ = [
"ColumnCorrelationContentGenerator",
"create_table",
"create_table_row",
"create_template",
]
import logging
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from jinja2 import Template
from arkas.content.section import BaseSectionContentGenerator
from arkas.evaluator2.column_correlation import ColumnCorrelationEvaluator
if TYPE_CHECKING:
from collections.abc import Sequence
from arkas.state.target_dataframe import TargetDataFrameState
logger = logging.getLogger(__name__)
class ColumnCorrelationContentGenerator(BaseSectionContentGenerator):
    r"""Generate the HTML content describing the correlation between
    one target column and the other columns.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import ColumnCorrelationContentGenerator
    >>> from arkas.state import TargetDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ... )
    >>> content = ColumnCorrelationContentGenerator(
    ...     TargetDataFrameState(frame, target_column="col3")
    ... )
    >>> content
    ColumnCorrelationContentGenerator(
      (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: TargetDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        args = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {args}\n)"

    def __str__(self) -> str:
        args = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {args}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Equality is delegated to the states once the classes match.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def generate_content(self) -> str:
        logger.info(
            f"Generating the correlation analysis between {self._state.target_column} "
            f"and {list(self._state.dataframe.columns)}..."
        )
        metrics = ColumnCorrelationEvaluator(self._state).evaluate()

        # The table has one row per column except the target column.
        other_columns = list(self._state.dataframe.columns)
        other_columns.remove(self._state.target_column)

        nrows, ncols = self._state.dataframe.shape
        template = Template(create_template())
        return template.render(
            nrows=f"{nrows:,}",
            ncols=f"{ncols:,}",
            columns=", ".join(self._state.dataframe.columns),
            table=create_table(metrics, columns=other_columns),
            target_column=f"{self._state.target_column}",
        )
def create_template() -> str:
    r"""Return the Jinja template of the column correlation section.

    The template uses the ``target_column``, ``nrows``, ``ncols`` and
    ``table`` placeholders.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.column_correlation import create_template
    >>> template = create_template()

    ```
    """
    template = """<p style="margin-top: 1rem;">
This section analyzes the correlation between <em>{{target_column}}</em> and other columns.
The correlation coefficient is a statistical measure of the strength of a
relationship between two variables. Its values can range from -1 to 1.
<ul>
<li> A correlation coefficient of -1 describes a perfect negative, or inverse,
correlation, with values in one series rising as those in the other decline,
and vice versa. </li>
<li> A coefficient of 1 shows a perfect positive correlation, or a direct relationship. </li>
<li> A correlation coefficient of 0 means there is no direct relationship. </li>
</ul>
The DataFrame has {{nrows}} rows and {{ncols}} columns.
</p>
{{table}}
"""
    return template
def create_table(metrics: dict[str, dict], columns: Sequence[str]) -> str:
    r"""Return the HTML representation of a table with the correlation
    statistics of each column.

    Args:
        metrics: The dictionary of metrics. The metrics of column
            ``col`` are read from the key ``correlation_{col}``.
        columns: The columns to show in the table.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.column_correlation import create_table
    >>> row = create_table(
    ...     metrics={
    ...         "correlation_col1": {
    ...             "count": 7,
    ...             "pearson_coeff": 1.0,
    ...             "pearson_pvalue": 0.0,
    ...             "spearman_coeff": 1.0,
    ...             "spearman_pvalue": 0.0,
    ...         },
    ...         "correlation_col2": {
    ...             "count": 7,
    ...             "pearson_coeff": -1.0,
    ...             "pearson_pvalue": 0.0,
    ...             "spearman_coeff": -1.0,
    ...             "spearman_pvalue": 0.0,
    ...         },
    ...     },
    ...     columns=["col1", "col2"],
    ... )

    ```
    """
    # One HTML row per column, in the order given by ``columns``.
    html_rows = []
    for col in columns:
        html_rows.append(create_table_row(column=col, metrics=metrics[f"correlation_{col}"]))
    table = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>num samples</th>
<th>pearson coefficient</th>
<th>pearson p-value</th>
<th>spearman coefficient</th>
<th>spearman p-value</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    return table.render(rows="\n".join(html_rows))
def create_table_row(column: str, metrics: dict) -> str:
    r"""Create the HTML code of a table row with the correlation scores
    of one column.

    A missing ``count`` is shown as ``0`` and a missing coefficient or
    p-value is shown as ``nan``.

    Args:
        column: The column name
        metrics: The dictionary of metrics with the correlation scores.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.column_correlation import create_table_row
    >>> row = create_table_row(
    ...     column="col1",
    ...     metrics={
    ...         "count": 7,
    ...         "pearson_coeff": 1.0,
    ...         "pearson_pvalue": 0.0,
    ...         "spearman_coeff": 1.0,
    ...         "spearman_pvalue": 0.0,
    ...     },
    ... )

    ```
    """
    context = {
        "num_style": 'style="text-align: right;"',
        "column": column,
        "count": f'{metrics.get("count", 0):,}',
    }
    # All the scores share the same 4-decimal formatting and NaN default.
    for key in ("pearson_coeff", "pearson_pvalue", "spearman_coeff", "spearman_pvalue"):
        context[key] = f'{metrics.get(key, float("nan")):.4f}'
    row = Template(
        """<tr>
<th>{{column}}</th>
<td {{num_style}}>{{count}}</td>
<td {{num_style}}>{{pearson_coeff}}</td>
<td {{num_style}}>{{pearson_pvalue}}</td>
<td {{num_style}}>{{spearman_coeff}}</td>
<td {{num_style}}>{{spearman_pvalue}}</td>
</tr>"""
    )
    return row.render(context)
r"""Contain the implementation of a HTML content generator that analyzes
the correlation between two columns."""
from __future__ import annotations
__all__ = [
"CorrelationContentGenerator",
"create_template",
]
import logging
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from jinja2 import Template
from arkas.content.section import BaseSectionContentGenerator
from arkas.evaluator2.correlation import CorrelationEvaluator
from arkas.figure.utils import figure2html
from arkas.plotter.correlation import CorrelationPlotter
from arkas.utils.dataframe import check_num_columns
if TYPE_CHECKING:
from arkas.state.target_dataframe import DataFrameState
logger = logging.getLogger(__name__)
class CorrelationContentGenerator(BaseSectionContentGenerator):
    r"""Generate the HTML content describing the correlation between
    two columns.

    Args:
        state: The state containing the DataFrame to analyze.
            The DataFrame must have only 2 columns, which are the two
            columns to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import CorrelationContentGenerator
    >>> from arkas.state import DataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...     },
    ... )
    >>> content = CorrelationContentGenerator(DataFrameState(frame))
    >>> content
    CorrelationContentGenerator(
      (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: DataFrameState) -> None:
        # The number of columns is checked before the state is stored.
        check_num_columns(state.dataframe, num_columns=2)
        self._state = state

    def __repr__(self) -> str:
        args = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {args}\n)"

    def __str__(self) -> str:
        args = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {args}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Equality is delegated to the states once the classes match.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def generate_content(self) -> str:
        xcol, ycol = self._state.dataframe.columns
        logger.info(f"Generating the correlation analysis between {xcol} and {ycol}...")
        metrics = CorrelationEvaluator(self._state).evaluate()
        figures = CorrelationPlotter(self._state).plot()

        context = {
            "xcol": str(xcol),
            "ycol": str(ycol),
            "columns": ", ".join(self._state.dataframe.columns),
            "count": f"{metrics['count']:,}",
        }
        # All the scores share the same 4-decimal formatting.
        for key in ("pearson_coeff", "pearson_pvalue", "spearman_coeff", "spearman_pvalue"):
            context[key] = f"{metrics[key]:.4f}"
        context["figure"] = figure2html(figures["correlation"], close_fig=True)
        return Template(create_template()).render(context)
def create_template() -> str:
    r"""Return the Jinja template of the correlation section.

    The template uses the ``xcol``, ``ycol``, ``pearson_coeff``,
    ``pearson_pvalue``, ``spearman_coeff``, ``spearman_pvalue``,
    ``count`` and ``figure`` placeholders.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.correlation import create_template
    >>> template = create_template()

    ```
    """
    template = """<p style="margin-top: 1rem;">
This section analyzes the correlation between <em>{{xcol}}</em> and <em>{{ycol}}</em>.
The correlation coefficient is a statistical measure of the strength of a
relationship between two variables. Its values can range from -1 to 1.
<ul>
<li> <b>pearson coefficient</b>: {{pearson_coeff}} </li>
<li> <b>pearson p-value</b>: {{pearson_pvalue}} </li>
<li> <b>spearman coefficient</b>: {{spearman_coeff}} </li>
<li> <b>spearman p-value</b>: {{spearman_pvalue}} </li>
<li> <b>num samples</b>: {{count}} </li>
</ul>
<p style="margin-top: 1rem;">
The following figure shows the scatter plot between <em>{{xcol}}</em> and <em>{{ycol}}</em>.
</p>
{{figure}}
"""
    return template
r"""Contain the implementation of a HTML content generator that
summarizes the numeric columns of a DataFrame."""
from __future__ import annotations
__all__ = [
"NumericSummaryContentGenerator",
"create_table",
"create_table_quantiles",
"create_table_quantiles_row",
"create_table_row",
"create_template",
]
import logging
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from jinja2 import Template
from arkas.content.section import BaseSectionContentGenerator
from arkas.content.utils import float_to_str
from arkas.utils.stats import compute_statistics_continuous
if TYPE_CHECKING:
import polars as pl
from arkas.state.dataframe import DataFrameState
logger = logging.getLogger(__name__)
class NumericSummaryContentGenerator(BaseSectionContentGenerator):
    r"""Generate the HTML content summarizing the numeric columns of a
    DataFrame.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import NumericSummaryContentGenerator
    >>> from arkas.state import DataFrameState
    >>> dataframe = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     }
    ... )
    >>> content = NumericSummaryContentGenerator(DataFrameState(dataframe))
    >>> content
    NumericSummaryContentGenerator(
      (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: DataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        args = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {args}\n)"

    def __str__(self) -> str:
        args = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {args}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Equality is delegated to the states once the classes match.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def generate_content(self) -> str:
        nrows, ncols = self._state.dataframe.shape
        logger.info(f"Generating the summary of {ncols:,} numeric columns...")
        template = Template(create_template())
        return template.render(
            nrows=f"{nrows:,}",
            ncols=f"{ncols:,}",
            columns=", ".join(self._state.dataframe.columns),
            table=create_table(self._state.dataframe),
            table_quantiles=create_table_quantiles(self._state.dataframe),
        )
def create_template() -> str:
    r"""Return the Jinja template of the numeric summary section.

    The template uses the ``table`` and ``table_quantiles``
    placeholders.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.numeric_summary import create_template
    >>> template = create_template()

    ```
    """
    template = """This section shows a short summary of each column.
<ul>
<li> <b>column</b>: is the column name</li>
<li> <b>dtype</b>: is the column data type </li>
<li> <b>null</b>: is the number (and percentage) of null values in the column </li>
<li> <b>nan</b>: is the number (and percentage) of not a number (NaN) values in the column </li>
<li> <b>unique</b>: is the number (and percentage) of unique values in the column </li>
<li> <b>negative</b>: is the number (and percentage) of strictly negative values (<span>&#60;</span>0) in the column </li>
<li> <b>zero</b>: is the number (and percentage) of zero values (=0) in the column </li>
<li> <b>positive</b>: is the number (and percentage) of strictly positive values (<span>&#62;</span>0) in the column </li>
</ul>
<p style="margin-top: 1rem;">
<b>General statistics about the DataFrame</b>
{{table}}
<details>
<summary>[show additional statistics]</summary>
<p style="margin-top: 1rem;">
The following table shows some quantiles for each column. </p>
{{table_quantiles}}
</details>
"""
    return template
def create_table(
    frame: pl.DataFrame,
) -> str:
    r"""Return the HTML representation of a table with some statistics
    about each column.

    Args:
        frame: The DataFrame to analyze.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.numeric_summary import create_table
    >>> dataframe = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     }
    ... )
    >>> row = create_table(dataframe)

    ```
    """
    # One HTML row per column of the DataFrame.
    html_rows = []
    for series in frame:
        html_rows.append(create_table_row(series=series))
    table = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>dtype</th>
<th>null</th>
<th>nan</th>
<th>unique</th>
<th>negative</th>
<th>zero</th>
<th>positive</th>
<th>mean</th>
<th>std</th>
<th>skewness</th>
<th>kurtosis</th>
<th>min</th>
<th>median</th>
<th>max</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    return table.render(rows="\n".join(html_rows))
def create_table_row(series: pl.Series) -> str:
    r"""Create the HTML code of a table row with the statistics of one
    column.

    Counts are shown with their percentage of ``stats["count"]``; the
    percentage is ``nan`` when that count is zero.

    Args:
        series: The series to analyze.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.numeric_summary import create_table_row
    >>> row = create_table_row(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7]))

    ```
    """
    stats = compute_statistics_continuous(series)
    total = stats["count"]

    def count_with_percentage(value: int) -> str:
        # Guard the division so that an empty series shows "nan%".
        percentage = 100 * value / total if total else float("nan")
        return f"{value:,} ({percentage:.2f}%)"

    context = {
        "num_style": 'style="text-align: right;"',
        "column": series.name,
        "dtype": series.dtype,
        "null": count_with_percentage(stats["num_nulls"]),
        "nan": count_with_percentage(int(series.is_nan().sum())),
        "nunique": count_with_percentage(stats["nunique"]),
        "negative": count_with_percentage(stats["<0"]),
        "zero": count_with_percentage(stats["=0"]),
        "positive": count_with_percentage(stats[">0"]),
    }
    # The remaining statistics are all formatted the same way.
    for key in ("mean", "std", "skewness", "kurtosis", "min", "median", "max"):
        context[key] = float_to_str(stats[key])

    row = Template(
        """<tr>
<th>{{column}}</th>
<td>{{dtype}}</td>
<td {{num_style}}>{{null}}</td>
<td {{num_style}}>{{nan}}</td>
<td {{num_style}}>{{nunique}}</td>
<td {{num_style}}>{{negative}}</td>
<td {{num_style}}>{{zero}}</td>
<td {{num_style}}>{{positive}}</td>
<td {{num_style}}>{{mean}}</td>
<td {{num_style}}>{{std}}</td>
<td {{num_style}}>{{skewness}}</td>
<td {{num_style}}>{{kurtosis}}</td>
<td {{num_style}}>{{min}}</td>
<td {{num_style}}>{{median}}</td>
<td {{num_style}}>{{max}}</td>
</tr>"""
    )
    return row.render(context)
def create_table_quantiles(
    frame: pl.DataFrame,
) -> str:
    r"""Return the HTML representation of a table with quantile
    statistics for each column.

    Args:
        frame: The DataFrame to analyze.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.numeric_summary import create_table_quantiles
    >>> dataframe = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     }
    ... )
    >>> table = create_table_quantiles(dataframe)

    ```
    """
    # One HTML row per column of the DataFrame.
    html_rows = []
    for series in frame:
        html_rows.append(create_table_quantiles_row(series=series))
    table = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>min</th>
<th>q0.001</th>
<th>q0.01</th>
<th>q0.05</th>
<th>q0.10</th>
<th>q0.25</th>
<th>median</th>
<th>q0.75</th>
<th>q0.90</th>
<th>q0.95</th>
<th>q0.99</th>
<th>q0.999</th>
<th>max</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    return table.render(rows="\n".join(html_rows))
def create_table_quantiles_row(series: pl.Series) -> str:
    r"""Create the HTML code of a table row with the quantiles of one
    column.

    Args:
        series: The series to analyze.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.numeric_summary import create_table_quantiles_row
    >>> row = create_table_quantiles_row(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7]))

    ```
    """
    stats = compute_statistics_continuous(series)
    # Each template placeholder has the same name as its statistics key.
    quantile_keys = (
        "min",
        "q001",
        "q01",
        "q05",
        "q10",
        "q25",
        "median",
        "q75",
        "q90",
        "q95",
        "q99",
        "q999",
        "max",
    )
    context = {
        "num_style": 'style="text-align: right;"',
        "column": series.name,
    }
    for key in quantile_keys:
        context[key] = float_to_str(stats[key])

    row = Template(
        """<tr>
<th>{{column}}</th>
<td {{num_style}}>{{min}}</td>
<td {{num_style}}>{{q001}}</td>
<td {{num_style}}>{{q01}}</td>
<td {{num_style}}>{{q05}}</td>
<td {{num_style}}>{{q10}}</td>
<td {{num_style}}>{{q25}}</td>
<td {{num_style}}>{{median}}</td>
<td {{num_style}}>{{q75}}</td>
<td {{num_style}}>{{q90}}</td>
<td {{num_style}}>{{q95}}</td>
<td {{num_style}}>{{q99}}</td>
<td {{num_style}}>{{q999}}</td>
<td {{num_style}}>{{max}}</td>
</tr>"""
    )
    return row.render(context)
r"""Contain the implementation of a HTML content generator that returns
a summary of a DataFrame."""
from __future__ import annotations
__all__ = [
"SummaryContentGenerator",
"create_table",
"create_table_row",
"create_template",
]
import logging
from collections import Counter
from typing import TYPE_CHECKING, Any
from coola import objects_are_equal
from grizz.utils.count import compute_nunique
from grizz.utils.null import compute_null_count
from jinja2 import Template
from arkas.content.section import BaseSectionContentGenerator
from arkas.utils.validation import check_positive
if TYPE_CHECKING:
from collections.abc import Sequence
import polars as pl
logger = logging.getLogger(__name__)
class SummaryContentGenerator(BaseSectionContentGenerator):
    r"""Generate a HTML section that summarizes the columns of a
    DataFrame.

    The section contains one table row per column, built from the
    column name, its data type, its null count, its number of unique
    values and its most frequent values.

    Args:
        frame: The DataFrame to analyze.
        top: The number of most frequent values to show. The value
            is validated with ``check_positive``.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import SummaryContentGenerator
    >>> content = SummaryContentGenerator(
    ...     frame=pl.DataFrame(
    ...         {
    ...             "col1": [1.2, 4.2, 4.2, 2.2],
    ...             "col2": [1, 1, 1, 1],
    ...             "col3": [1, 2, 2, 2],
    ...         },
    ...         schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
    ...     )
    ... )
    >>> content
    SummaryContentGenerator(shape=(4, 3), top=5)

    ```
    """

    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
        self._frame = frame
        check_positive(name="top", value=top)
        self._top = top

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        return f"{name}(shape={self._frame.shape}, top={self._top})"

    @property
    def frame(self) -> pl.DataFrame:
        r"""The DataFrame to analyze."""
        return self._frame

    @property
    def top(self) -> int:
        r"""The number of most frequent values to show."""
        return self._top

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` is of the same class and has the same
        ``top`` value and DataFrame."""
        if not isinstance(other, self.__class__):
            return False
        if self.top != other.top:
            return False
        return objects_are_equal(self.frame, other.frame, equal_nan=equal_nan)

    def get_columns(self) -> tuple[str, ...]:
        r"""Return the column names of the DataFrame."""
        names = self._frame.columns
        return tuple(names)

    def get_null_count(self) -> tuple[int, ...]:
        r"""Return the output of ``compute_null_count`` as a tuple, with
        one value per column."""
        counts = compute_null_count(self._frame)
        return tuple(counts.tolist())

    def get_nunique(self) -> tuple[int, ...]:
        r"""Return the output of ``compute_nunique`` as a tuple, with
        one value per column."""
        counts = compute_nunique(self._frame)
        return tuple(counts.tolist())

    def get_dtypes(self) -> tuple[pl.DataType, ...]:
        r"""Return the data types found in the DataFrame schema."""
        schema = self._frame.schema
        return tuple(schema.dtypes())

    def get_most_frequent_values(self, top: int = 5) -> tuple[tuple[tuple[Any, int], ...], ...]:
        r"""Return, for each column, the ``top`` most common values
        with their number of occurrences."""
        per_column = []
        for series in self.frame:
            counter = Counter(series.to_list())
            per_column.append(tuple(counter.most_common(top)))
        return tuple(per_column)

    def generate_content(self) -> str:
        r"""Render the content template with the summary table and the
        DataFrame shape."""
        logger.info("Generating the DataFrame summary content...")
        template = Template(create_template())
        context = {
            "table": self._create_table(),
            "nrows": f"{self._frame.shape[0]:,}",
            "ncols": f"{self._frame.shape[1]:,}",
        }
        return template.render(context)

    def _create_table(self) -> str:
        r"""Build the HTML table from the per-column statistics."""
        nrows = self._frame.shape[0]
        return create_table(
            columns=self.get_columns(),
            null_count=self.get_null_count(),
            nunique=self.get_nunique(),
            dtypes=self.get_dtypes(),
            most_frequent_values=self.get_most_frequent_values(top=self._top),
            total=nrows,
        )
def create_template() -> str:
    r"""Return the Jinja template used to render the summary section.

    The template expects three variables: ``ncols``, ``nrows`` and
    ``table``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.summary import create_template
    >>> template = create_template()

    ```
    """
    template = """This section shows a short summary of each column.
<ul>
<li> <b>column</b>: are the column names</li>
<li> <b>types</b>: are the object types for the objects in the column </li>
<li> <b>null</b>: are the number (and percentage) of null values in the column </li>
<li> <b>unique</b>: are the number (and percentage) of unique values in the column </li>
</ul>
<p style="margin-top: 1rem;">
<b>General statistics about the DataFrame</b>
<ul>
<li> number of columns: {{ncols}} </li>
<li> number of rows: {{nrows}}</li>
</ul>
{{table}}
"""
    return template
def create_table(
    columns: Sequence[str],
    null_count: Sequence[int],
    nunique: Sequence[int],
    dtypes: Sequence[pl.DataType],
    most_frequent_values: Sequence[Sequence[tuple[Any, int]]],
    total: int,
) -> str:
    r"""Return a HTML representation of a table with one summary row
    per column.

    The input sequences are iterated in lockstep, so the i-th item of
    each sequence describes the same column.

    Args:
        columns: The column names.
        null_count: The number of null values for each column.
        nunique: The number of unique values for each column.
        dtypes: The data type for each column.
        most_frequent_values: The most frequent values for each column.
        total: The total number of rows.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.summary import create_table
    >>> row = create_table(
    ...     columns=["float", "int", "str"],
    ...     null_count=(1, 0, 2),
    ...     nunique=(5, 2, 4),
    ...     dtypes=(pl.Float64(), pl.Int64(), pl.String()),
    ...     most_frequent_values=(
    ...         ((2.2, 2), (1.2, 1), (4.2, 1), (None, 1), (1.0, 1)),
    ...         ((1, 5), (0, 1)),
    ...         (("B", 2), (None, 2), ("A", 1), ("C", 1)),
    ...     ),
    ...     total=42,
    ... )

    ```
    """
    stats = zip(columns, null_count, nunique, dtypes, most_frequent_values)
    html_rows = "\n".join(
        create_table_row(
            column=name,
            null=n_null,
            dtype=dtype,
            nunique=n_unique,
            most_frequent_values=values,
            total=total,
        )
        for name, n_null, n_unique, dtype, values in stats
    )
    table_template = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>types</th>
<th>null</th>
<th>unique</th>
<th>most frequent values</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    return table_template.render({"rows": html_rows})
def create_table_row(
    column: str,
    null: int,
    nunique: int,
    dtype: pl.DataType,
    most_frequent_values: Sequence[tuple[Any, int]],
    total: int,
) -> str:
    r"""Create the HTML code of a new table row.

    Every count is shown with its percentage of ``total``. When
    ``total`` is ``0``, all the percentages are reported as NaN.

    Args:
        column: The column name.
        null: The number of null values.
        nunique: The number of unique values.
        dtype: The data type of the column.
        most_frequent_values: The most frequent values, as
            ``(value, count)`` pairs.
        total: The total number of rows.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.summary import create_table_row
    >>> row = create_table_row(
    ...     column="col",
    ...     null=5,
    ...     nunique=42,
    ...     dtype=pl.Float64(),
    ...     most_frequent_values=[("C", 12), ("A", 5), ("B", 4)],
    ...     total=100,
    ... )

    ```
    """

    def percentage(count: int) -> float:
        # Single guarded computation shared by the three percentages, so a
        # zero ``total`` yields NaN everywhere instead of a ZeroDivisionError
        # for the most frequent values only.
        return 100 * count / total if total else float("nan")

    null_text = f"{null:,} ({percentage(null):.2f}%)"
    nunique_text = f"{nunique:,} ({percentage(nunique):.2f}%)"
    frequent_text = ", ".join(
        f"{value} ({percentage(count):.2f}%)" for value, count in most_frequent_values
    )
    return Template(
        """<tr>
<th>{{column}}</th>
<td>{{dtype}}</td>
<td {{num_style}}>{{null}}</td>
<td {{num_style}}>{{nunique}}</td>
<td>{{most_frequent_values}}</td>
</tr>"""
    ).render(
        {
            "num_style": 'style="text-align: right;"',
            "column": column,
            "null": null_text,
            "dtype": dtype,
            "nunique": nunique_text,
            "most_frequent_values": frequent_text,
        }
    )
r"""Contain utility functions."""
from __future__ import annotations
__all__ = ["float_to_str", "to_str"]
from typing import Any
def to_str(value: Any) -> str:
    r"""Return a string representation of the input value.

    Integers and floats are formatted with ``float_to_str``. Booleans
    and all the other values are converted with ``str``.

    Args:
        value: The value to encode.

    Returns:
        The string representation of the input value.

    Example usage:

    ```pycon

    >>> from arkas.content.utils import to_str
    >>> to_str(42)
    '42'

    ```
    """
    # ``bool`` is a subclass of ``int``: without this guard ``True`` would be
    # formatted as a number and rendered as ``'1'``.
    if isinstance(value, bool):
        return str(value)
    if isinstance(value, (int, float)):
        return float_to_str(value)
    return str(value)
def float_to_str(value: float) -> str:
    r"""Format a number with at most 4 significant digits.

    The value is formatted with the ``.4g`` format specification.

    Args:
        value: The value to encode.

    Returns:
        The string representation of the input value.

    Example usage:

    ```pycon

    >>> from arkas.content.utils import float_to_str
    >>> float_to_str(42)
    '42'

    ```
    """
    return format(value, ".4g")
r"""Implement the pairwise column correlation evaluator."""
from __future__ import annotations
__all__ = ["ColumnCorrelationEvaluator"]
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.evaluator2.base import BaseEvaluator
from arkas.evaluator2.vanilla import Evaluator
from arkas.metric import pearsonr, spearmanr
if TYPE_CHECKING:
from arkas.state.target_dataframe import TargetDataFrameState
class ColumnCorrelationEvaluator(BaseEvaluator):
    r"""Evaluate the correlation between the target column and each
    of the other columns of a DataFrame.

    For each non-target column, the Pearson and Spearman metrics are
    computed on the rows that remain after dropping null and NaN
    values in the two columns.

    Args:
        state: The state with the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.evaluator2 import ColumnCorrelationEvaluator
    >>> from arkas.state import TargetDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ... )
    >>> evaluator = ColumnCorrelationEvaluator(
    ...     TargetDataFrameState(frame, target_column="col3")
    ... )
    >>> evaluator
    ColumnCorrelationEvaluator(
    (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> evaluator.evaluate()
    {'correlation_col1': {'count': 7, 'pearson_coeff': 1.0, 'pearson_pvalue': 0.0, 'spearman_coeff': 1.0, 'spearman_pvalue': 0.0},
    'correlation_col2': {'count': 7, 'pearson_coeff': -1.0, 'pearson_pvalue': 0.0, 'spearman_coeff': -1.0, 'spearman_pvalue': 0.0}}

    ```
    """

    def __init__(self, state: TargetDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def compute(self) -> Evaluator:
        r"""Return an ``Evaluator`` holding the metrics returned by
        ``evaluate``."""
        metrics = self.evaluate()
        return Evaluator(metrics=metrics)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` is of the same class and has an equal
        state."""
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def evaluate(self, prefix: str = "", suffix: str = "") -> dict[str, dict]:
        r"""Compute the correlation metrics of each non-target column.

        Args:
            prefix: The prefix added to each key of the output.
            suffix: The suffix added to each key of the output.

        Returns:
            A dictionary with one ``correlation_<column>`` entry per
            non-target column.
        """
        target = self._state.target_column
        others = list(self._state.dataframe.columns)
        others.remove(target)
        metrics = {}
        for name in others:
            # Only keep the rows where both columns have a valid value.
            pair = self._state.dataframe.select([name, target])
            pair = pair.drop_nulls().drop_nans()
            x = pair[target].to_numpy()
            y = pair[name].to_numpy()
            values = pearsonr(x, y) | spearmanr(x, y)
            metrics[f"{prefix}correlation_{name}{suffix}"] = values
        return metrics
r"""Implement the pairwise column correlation evaluator."""
from __future__ import annotations
__all__ = ["CorrelationEvaluator"]
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.evaluator2.base import BaseEvaluator
from arkas.evaluator2.vanilla import Evaluator
from arkas.metric import pearsonr, spearmanr
from arkas.utils.dataframe import check_num_columns
if TYPE_CHECKING:
from arkas.state.target_dataframe import DataFrameState
class CorrelationEvaluator(BaseEvaluator):
    r"""Evaluate the correlation between the two columns of a
    DataFrame.

    The number of columns is checked at construction time with
    ``check_num_columns``.

    Args:
        state: The state with the DataFrame to analyze.
            The DataFrame must have only 2 columns, which are the two
            columns to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.evaluator2 import CorrelationEvaluator
    >>> from arkas.state import DataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
    ...     },
    ... )
    >>> evaluator = CorrelationEvaluator(DataFrameState(frame))
    >>> evaluator
    CorrelationEvaluator(
    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> evaluator.evaluate()
    {'count': 7, 'pearson_coeff': 1.0, 'pearson_pvalue': 0.0, 'spearman_coeff': 1.0, 'spearman_pvalue': 0.0}

    ```
    """

    def __init__(self, state: DataFrameState) -> None:
        check_num_columns(state.dataframe, num_columns=2)
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def compute(self) -> Evaluator:
        r"""Return an ``Evaluator`` holding the metrics returned by
        ``evaluate``."""
        metrics = self.evaluate()
        return Evaluator(metrics=metrics)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` is of the same class and has an equal
        state."""
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def evaluate(self, prefix: str = "", suffix: str = "") -> dict[str, float]:
        r"""Compute the Pearson and Spearman metrics between the first
        and second columns.

        Rows with a null or NaN value are dropped before computing
        the metrics.

        Args:
            prefix: The prefix forwarded to the metric functions.
            suffix: The suffix forwarded to the metric functions.

        Returns:
            The merged output of ``pearsonr`` and ``spearmanr``.
        """
        frame = self._state.dataframe.drop_nulls().drop_nans()
        first, second = frame.columns[0], frame.columns[1]
        x = frame[first].to_numpy()
        y = frame[second].to_numpy()
        pearson = pearsonr(x=x, y=y, prefix=prefix, suffix=suffix)
        spearman = spearmanr(x=x, y=y, prefix=prefix, suffix=suffix)
        return pearson | spearman
r"""Implement an output to analyze the correlation between columns."""
from __future__ import annotations
__all__ = ["ColumnCorrelationOutput"]
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.content.column_correlation import ColumnCorrelationContentGenerator
from arkas.evaluator2.column_correlation import ColumnCorrelationEvaluator
from arkas.output.lazy import BaseLazyOutput
from arkas.plotter.vanilla import Plotter
if TYPE_CHECKING:
from arkas.state.target_dataframe import TargetDataFrameState
class ColumnCorrelationOutput(BaseLazyOutput):
    r"""Implement an output to analyze the correlation between the
    target column and the other columns of a DataFrame.

    The content generator and the evaluator are built from the state,
    whereas the plotter is an empty ``Plotter``.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import ColumnCorrelationOutput
    >>> from arkas.state import TargetDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ... )
    >>> output = ColumnCorrelationOutput(TargetDataFrameState(frame, target_column="col3"))
    >>> output
    ColumnCorrelationOutput(
    (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    ColumnCorrelationContentGenerator(
    (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    ColumnCorrelationEvaluator(
    (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_plotter()
    Plotter(count=0)

    ```
    """

    def __init__(self, state: TargetDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` is of the same class and has an equal
        state."""
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def _get_content_generator(self) -> ColumnCorrelationContentGenerator:
        generator = ColumnCorrelationContentGenerator(self._state)
        return generator

    def _get_evaluator(self) -> ColumnCorrelationEvaluator:
        evaluator = ColumnCorrelationEvaluator(self._state)
        return evaluator

    def _get_plotter(self) -> Plotter:
        # This output has no figure: return a plotter without any input.
        return Plotter()
r"""Implement an output to analyze the correlation between columns."""
from __future__ import annotations
__all__ = ["CorrelationOutput"]
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.content.correlation import CorrelationContentGenerator
from arkas.evaluator2.correlation import CorrelationEvaluator
from arkas.output.lazy import BaseLazyOutput
from arkas.plotter.correlation import CorrelationPlotter
from arkas.utils.dataframe import check_num_columns
if TYPE_CHECKING:
from arkas.state.dataframe import DataFrameState
class CorrelationOutput(BaseLazyOutput):
    r"""Implement an output to analyze the correlation between the two
    columns of a DataFrame.

    The number of columns is checked at construction time with
    ``check_num_columns``.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import CorrelationOutput
    >>> from arkas.state import DataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...     },
    ... )
    >>> output = CorrelationOutput(DataFrameState(frame))
    >>> output
    CorrelationOutput(
    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    CorrelationContentGenerator(
    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    CorrelationEvaluator(
    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_plotter()
    CorrelationPlotter(
    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: DataFrameState) -> None:
        check_num_columns(state.dataframe, num_columns=2)
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` is of the same class and has an equal
        state."""
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def _get_content_generator(self) -> CorrelationContentGenerator:
        generator = CorrelationContentGenerator(self._state)
        return generator

    def _get_evaluator(self) -> CorrelationEvaluator:
        evaluator = CorrelationEvaluator(self._state)
        return evaluator

    def _get_plotter(self) -> CorrelationPlotter:
        plotter = CorrelationPlotter(self._state)
        return plotter
r"""Implement an output to summarize the numeric columns of a
DataFrame."""
from __future__ import annotations
__all__ = ["NumericSummaryOutput"]
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.content.numeric_summary import NumericSummaryContentGenerator
from arkas.evaluator2.vanilla import Evaluator
from arkas.output.lazy import BaseLazyOutput
from arkas.plotter.vanilla import Plotter
if TYPE_CHECKING:
from arkas.state.dataframe import DataFrameState
class NumericSummaryOutput(BaseLazyOutput):
    r"""Implement an output to summarize the numeric columns of a
    DataFrame.

    Only the content generator is built from the state: the evaluator
    and the plotter are created without any input.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import NumericSummaryOutput
    >>> from arkas.state import DataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int32, "col3": pl.Float64},
    ... )
    >>> output = NumericSummaryOutput(DataFrameState(frame))
    >>> output
    NumericSummaryOutput(
    (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    NumericSummaryContentGenerator(
    (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    Plotter(count=0)

    ```
    """

    def __init__(self, state: DataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` is of the same class and has an equal
        state."""
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def _get_content_generator(self) -> NumericSummaryContentGenerator:
        generator = NumericSummaryContentGenerator(self._state)
        return generator

    def _get_evaluator(self) -> Evaluator:
        # This output has no metric: return an evaluator without any input.
        return Evaluator()

    def _get_plotter(self) -> Plotter:
        # This output has no figure: return a plotter without any input.
        return Plotter()
r"""Implement the DataFrame summary output."""
from __future__ import annotations
__all__ = ["SummaryOutput"]
from typing import TYPE_CHECKING, Any
from coola import objects_are_equal
from arkas.content.summary import SummaryContentGenerator
from arkas.evaluator2.vanilla import Evaluator
from arkas.output.lazy import BaseLazyOutput
from arkas.plotter.vanilla import Plotter
from arkas.utils.validation import check_positive
if TYPE_CHECKING:
import polars as pl
class SummaryOutput(BaseLazyOutput):
    r"""Implement the output that summarizes a DataFrame.

    Only the content generator is built from the DataFrame: the
    evaluator and the plotter are created without any input.

    Args:
        frame: The DataFrame to analyze.
        top: The number of most frequent values to show. The value
            is validated with ``check_positive``.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import SummaryOutput
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.2, 4.2, 4.2, 2.2],
    ...         "col2": [1, 1, 1, 1],
    ...         "col3": [1, 2, 2, 2],
    ...     },
    ...     schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
    ... )
    >>> output = SummaryOutput(frame)
    >>> output
    SummaryOutput(shape=(4, 3), top=5)
    >>> output.get_content_generator()
    SummaryContentGenerator(shape=(4, 3), top=5)
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    Plotter(count=0)

    ```
    """

    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
        self._frame = frame
        check_positive(name="top", value=top)
        self._top = top

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        return f"{name}(shape={self._frame.shape}, top={self._top})"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` is of the same class and has the same
        ``top`` value and DataFrame."""
        if not isinstance(other, self.__class__):
            return False
        if self._top != other._top:
            return False
        return objects_are_equal(self._frame, other._frame, equal_nan=equal_nan)

    def _get_content_generator(self) -> SummaryContentGenerator:
        generator = SummaryContentGenerator(frame=self._frame, top=self._top)
        return generator

    def _get_evaluator(self) -> Evaluator:
        # This output has no metric: return an evaluator without any input.
        return Evaluator()

    def _get_plotter(self) -> Plotter:
        # This output has no figure: return a plotter without any input.
        return Plotter()
r"""Contain the implementation of a correlation plotter."""
from __future__ import annotations
__all__ = ["BaseFigureCreator", "CorrelationPlotter", "MatplotlibFigureCreator"]
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any
import matplotlib.pyplot as plt
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.figure.creator import FigureCreatorRegistry
from arkas.figure.html import HtmlFigure
from arkas.figure.matplotlib import MatplotlibFigure, MatplotlibFigureConfig
from arkas.figure.utils import MISSING_FIGURE_MESSAGE
from arkas.plotter.base import BasePlotter
from arkas.plotter.vanilla import Plotter
from arkas.utils.dataframe import check_num_columns
from arkas.utils.range import find_range
if TYPE_CHECKING:
from arkas.figure.base import BaseFigure
from arkas.state.dataframe import DataFrameState
class BaseFigureCreator(ABC):
    r"""Define the interface of the objects that create a figure from
    a DataFrame with two columns."""

    @abstractmethod
    def create(self, state: DataFrameState) -> BaseFigure:
        r"""Create a figure from the two columns of the DataFrame.

        Args:
            state: The state containing the DataFrame to analyze.
                The DataFrame must have only 2 columns, which are the
                two columns to analyze.

        Returns:
            The generated figure.

        Example usage:

        ```pycon

        >>> import polars as pl
        >>> from arkas.figure import MatplotlibFigureConfig
        >>> from arkas.state import DataFrameState
        >>> from arkas.plotter.correlation import MatplotlibFigureCreator
        >>> creator = MatplotlibFigureCreator()
        >>> frame = pl.DataFrame(
        ...     {
        ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
        ...         "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
        ...     },
        ... )
        >>> fig = creator.create(DataFrameState(frame))

        ```
        """
class MatplotlibFigureCreator(BaseFigureCreator):
    r"""Create a matplotlib scatter plot of the two columns of a
    DataFrame.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.figure import MatplotlibFigureConfig
    >>> from arkas.state import DataFrameState
    >>> from arkas.plotter.correlation import MatplotlibFigureCreator
    >>> creator = MatplotlibFigureCreator()
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
    ...     },
    ... )
    >>> fig = creator.create(DataFrameState(frame))

    ```
    """

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        return f"{name}()"

    def create(self, state: DataFrameState) -> BaseFigure:
        r"""Create the scatter plot, or a HTML placeholder if the
        DataFrame has no row.

        The first column is used for the x-axis and the second column
        for the y-axis. The axis ranges and scales are read from the
        figure configuration of the state.

        Args:
            state: The state containing the DataFrame to analyze.

        Returns:
            The generated figure.
        """
        frame = state.dataframe
        # The emptiness check comes first, so an empty DataFrame gives
        # the placeholder message whatever its number of columns.
        if frame.shape[0] == 0:
            return HtmlFigure(MISSING_FIGURE_MESSAGE)
        check_num_columns(frame, num_columns=2)

        config = state.figure_config
        xcol, ycol = frame.columns
        fig, ax = plt.subplots(**config.get_arg("init", {}))
        x = frame[xcol].to_numpy()
        y = frame[ycol].to_numpy()
        ax.scatter(x=x, y=y)

        # The limits are only set when the range is not empty.
        xmin, xmax = find_range(x, xmin=config.get_arg("xmin"), xmax=config.get_arg("xmax"))
        if xmin < xmax:
            ax.set_xlim(xmin, xmax)
        ymin, ymax = find_range(y, xmin=config.get_arg("ymin"), xmax=config.get_arg("ymax"))
        if ymin < ymax:
            ax.set_ylim(ymin, ymax)

        ax.set_xlabel(xcol)
        ax.set_ylabel(ycol)

        xscale = config.get_arg("xscale")
        if xscale:
            ax.set_xscale(xscale)
        yscale = config.get_arg("yscale")
        if yscale:
            ax.set_yscale(yscale)

        fig.tight_layout()
        return MatplotlibFigure(fig)
class CorrelationPlotter(BasePlotter):
    r"""Implement a plotter that shows the relation between the two
    columns of a DataFrame.

    The figure is created by the figure creator registered for the
    backend of the figure configuration.

    Args:
        state: The state containing the DataFrame to analyze.
            The DataFrame must have only 2 columns, which are the two
            columns to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.plotter import CorrelationPlotter
    >>> from arkas.state import DataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
    ...     },
    ... )
    >>> plotter = CorrelationPlotter(DataFrameState(frame))
    >>> plotter
    CorrelationPlotter(
    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    # Mapping from a figure backend to the creator used for this backend.
    registry = FigureCreatorRegistry[BaseFigureCreator](
        {MatplotlibFigureConfig.backend(): MatplotlibFigureCreator()}
    )

    def __init__(self, state: DataFrameState) -> None:
        check_num_columns(state.dataframe, num_columns=2)
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        name = self.__class__.__qualname__
        return f"{name}(\n {body}\n)"

    def compute(self) -> Plotter:
        r"""Return a ``Plotter`` holding the figures returned by
        ``plot``."""
        figures = self.plot()
        return Plotter(figures)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` is of the same class and has an equal
        state."""
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def plot(self, prefix: str = "", suffix: str = "") -> dict:
        r"""Create the correlation figure.

        Args:
            prefix: The prefix added to the key of the output.
            suffix: The suffix added to the key of the output.

        Returns:
            A dictionary with a single ``correlation`` entry.
        """
        backend = self._state.figure_config.backend()
        creator = self.registry.find_creator(backend)
        figure = creator.create(self._state)
        return {f"{prefix}correlation{suffix}": figure}
r"""Implement DataFrame state with a target column."""
from __future__ import annotations
__all__ = ["TargetDataFrameState"]
import sys
from typing import TYPE_CHECKING
from coola.utils.format import repr_mapping_line, str_indent, str_mapping
from arkas.state.dataframe import DataFrameState
from arkas.utils.dataframe import check_column_exist
if sys.version_info >= (3, 11):
from typing import Self
else: # pragma: no cover
from typing_extensions import (
Self, # use backport because it was added in python 3.11
)
if TYPE_CHECKING:
import polars as pl
from arkas.figure.base import BaseFigureConfig
class TargetDataFrameState(DataFrameState):
    r"""Implement a DataFrame state that also stores the name of a
    target column.

    The existence of the target column in the DataFrame is checked at
    construction time with ``check_column_exist``.

    Args:
        dataframe: The DataFrame.
        target_column: The target column in the DataFrame.
        nan_policy: The policy on how to handle NaN values in the input
            arrays. The following options are available: ``'omit'``,
            ``'propagate'``, and ``'raise'``.
        figure_config: An optional figure configuration.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.state import TargetDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int32, "col3": pl.Float64},
    ... )
    >>> state = TargetDataFrameState(frame, target_column="col3")
    >>> state
    TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())

    ```
    """

    def __init__(
        self,
        dataframe: pl.DataFrame,
        target_column: str,
        nan_policy: str = "propagate",
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        super().__init__(dataframe=dataframe, nan_policy=nan_policy, figure_config=figure_config)
        check_column_exist(dataframe, target_column)
        self._target_column = target_column

    def __repr__(self) -> str:
        fields = {
            "dataframe": self._dataframe.shape,
            "target_column": self._target_column,
            "nan_policy": self._nan_policy,
            "figure_config": self._figure_config,
        }
        name = self.__class__.__qualname__
        return f"{name}({repr_mapping_line(fields)})"

    def __str__(self) -> str:
        fields = {
            "dataframe": self._dataframe.shape,
            "target_column": self._target_column,
            "nan_policy": self._nan_policy,
            "figure_config": self._figure_config,
        }
        body = str_indent(str_mapping(fields))
        name = self.__class__.__qualname__
        return f"{name}({body})"

    @property
    def target_column(self) -> str:
        r"""The target column in the DataFrame."""
        return self._target_column

    def clone(self, deep: bool = True) -> Self:
        r"""Return a copy of the state.

        Args:
            deep: If ``True``, the DataFrame and the figure
                configuration are cloned, otherwise the new state
                shares them with the current state.

        Returns:
            The new state.
        """
        if deep:
            dataframe = self._dataframe.clone()
            figure_config = self._figure_config.clone()
        else:
            dataframe = self._dataframe
            figure_config = self._figure_config
        return self.__class__(
            dataframe=dataframe,
            target_column=self._target_column,
            nan_policy=self._nan_policy,
            figure_config=figure_config,
        )

    def get_args(self) -> dict:
        r"""Return the arguments of the parent state extended with the
        target column."""
        args = super().get_args()
        return args | {"target_column": self._target_column}
+1
-1
Metadata-Version: 2.1
Name: arkas
Version: 0.0.1a10
Version: 0.0.1a11
Summary: Library to evaluate ML model performances

@@ -5,0 +5,0 @@ Home-page: https://github.com/durandtibo/arkas

[tool.poetry]
name = "arkas"
version = "0.0.1a10"
version = "0.0.1a11"
description = "Library to evaluate ML model performances"

@@ -5,0 +5,0 @@ readme = "README.md"

@@ -13,9 +13,12 @@ r"""Contain DataFrame analyzers."""

"ColumnCooccurrenceAnalyzer",
"ColumnCorrelationAnalyzer",
"ContentAnalyzer",
"ContinuousColumnAnalyzer",
"DataFrameSummaryAnalyzer",
"CorrelationAnalyzer",
"MappingAnalyzer",
"NullValueAnalyzer",
"NumericSummaryAnalyzer",
"PlotColumnAnalyzer",
"ScatterColumnAnalyzer",
"SummaryAnalyzer",
"TemporalNullValueAnalyzer",

@@ -32,13 +35,16 @@ "TemporalPlotColumnAnalyzer",

from arkas.analyzer.column_cooccurrence import ColumnCooccurrenceAnalyzer
from arkas.analyzer.column_correlation import ColumnCorrelationAnalyzer
from arkas.analyzer.columns import BaseTruePredAnalyzer
from arkas.analyzer.content import ContentAnalyzer
from arkas.analyzer.continuous_column import ContinuousColumnAnalyzer
from arkas.analyzer.frame_summary import DataFrameSummaryAnalyzer
from arkas.analyzer.correlation import CorrelationAnalyzer
from arkas.analyzer.lazy import BaseInNLazyAnalyzer, BaseLazyAnalyzer
from arkas.analyzer.mapping import MappingAnalyzer
from arkas.analyzer.null_value import NullValueAnalyzer
from arkas.analyzer.numeric_summary import NumericSummaryAnalyzer
from arkas.analyzer.plot_column import PlotColumnAnalyzer
from arkas.analyzer.scatter_column import ScatterColumnAnalyzer
from arkas.analyzer.summary import SummaryAnalyzer
from arkas.analyzer.temporal_null_value import TemporalNullValueAnalyzer
from arkas.analyzer.temporal_plot_column import TemporalPlotColumnAnalyzer
from arkas.analyzer.transform import TransformAnalyzer

@@ -40,6 +40,6 @@ r"""Define a base class to implement lazy analyzers."""

>>> import polars as pl
>>> from arkas.analyzer import DataFrameSummaryAnalyzer
>>> analyzer = DataFrameSummaryAnalyzer()
>>> from arkas.analyzer import SummaryAnalyzer
>>> analyzer = SummaryAnalyzer()
>>> analyzer
DataFrameSummaryAnalyzer(top=5, sort=False)
SummaryAnalyzer(top=5, sort=False)
>>> frame = pl.DataFrame(

@@ -55,3 +55,3 @@ ... {

>>> output
DataFrameSummaryOutput(shape=(4, 3), top=5)
SummaryOutput(shape=(4, 3), top=5)

@@ -58,0 +58,0 @@ ```

@@ -65,3 +65,3 @@ r"""Implement an analyzer that plots the content of each column."""

PlotColumnOutput(
(state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig())
(state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -68,0 +68,0 @@

@@ -56,3 +56,3 @@ r"""Implement an analyzer that plots the content of each column."""

ScatterColumnOutput(
(state): ScatterDataFrameState(dataframe=(4, 2), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
(state): ScatterDataFrameState(dataframe=(4, 2), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -59,0 +59,0 @@

@@ -80,3 +80,3 @@ r"""Implement an analyzer that plots the content of each column."""

TemporalNullValueOutput(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -83,0 +83,0 @@

@@ -79,3 +79,3 @@ r"""Implement an analyzer that plots the content of each column."""

TemporalPlotColumnOutput(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -82,0 +82,0 @@

@@ -10,9 +10,12 @@ r"""Contain HTML content generators."""

"ColumnCooccurrenceContentGenerator",
"ColumnCorrelationContentGenerator",
"ContentGenerator",
"ContentGeneratorDict",
"ContinuousSeriesContentGenerator",
"DataFrameSummaryContentGenerator",
"CorrelationContentGenerator",
"NullValueContentGenerator",
"NumericSummaryContentGenerator",
"PlotColumnContentGenerator",
"ScatterColumnContentGenerator",
"SummaryContentGenerator",
"TemporalNullValueContentGenerator",

@@ -26,10 +29,13 @@ "TemporalPlotColumnContentGenerator",

from arkas.content.column_cooccurrence import ColumnCooccurrenceContentGenerator
from arkas.content.column_correlation import ColumnCorrelationContentGenerator
from arkas.content.continuous_series import ContinuousSeriesContentGenerator
from arkas.content.frame_summary import DataFrameSummaryContentGenerator
from arkas.content.correlation import CorrelationContentGenerator
from arkas.content.mapping import ContentGeneratorDict
from arkas.content.null_value import NullValueContentGenerator
from arkas.content.numeric_summary import NumericSummaryContentGenerator
from arkas.content.plot_column import PlotColumnContentGenerator
from arkas.content.scatter_column import ScatterColumnContentGenerator
from arkas.content.summary import SummaryContentGenerator
from arkas.content.temporal_null_value import TemporalNullValueContentGenerator
from arkas.content.temporal_plot_column import TemporalPlotColumnContentGenerator
from arkas.content.vanilla import ContentGenerator

@@ -102,3 +102,3 @@ r"""Contain the implementation of a HTML content generator that returns

>>> from arkas.content.frame_summary import create_template
>>> from arkas.content.summary import create_template
>>> template = create_template()

@@ -105,0 +105,0 @@

@@ -78,3 +78,3 @@ r"""Contain the implementation of a HTML content generator that analyzes

xmin=self._state.figure_config.get_arg("xmin"),
xmax=self._state.figure_config.get_arg("xmin"),
xmax=self._state.figure_config.get_arg("xmax"),
)

@@ -81,0 +81,0 @@ return Template(create_template()).render(

@@ -49,3 +49,3 @@ r"""Contain the implementation of a HTML content generator that plots

PlotColumnContentGenerator(
(state): DataFrameState(dataframe=(7, 3), figure_config=MatplotlibFigureConfig())
(state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -52,0 +52,0 @@

@@ -51,3 +51,3 @@ r"""Contain the implementation of a HTML content generator that plots

ScatterColumnContentGenerator(
(state): ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
(state): ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -54,0 +54,0 @@

@@ -67,3 +67,3 @@ r"""Contain the implementation of a HTML content generator that analyzes

TemporalNullValueContentGenerator(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -70,0 +70,0 @@

@@ -64,3 +64,3 @@ r"""Contain the implementation of a HTML content generator that plots

TemporalPlotColumnContentGenerator(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -67,0 +67,0 @@

@@ -10,2 +10,4 @@ r"""Contain data evaluators."""

"ColumnCooccurrenceEvaluator",
"ColumnCorrelationEvaluator",
"CorrelationEvaluator",
"Evaluator",

@@ -19,3 +21,5 @@ "EvaluatorDict",

from arkas.evaluator2.column_cooccurrence import ColumnCooccurrenceEvaluator
from arkas.evaluator2.column_correlation import ColumnCorrelationEvaluator
from arkas.evaluator2.correlation import CorrelationEvaluator
from arkas.evaluator2.mapping import EvaluatorDict
from arkas.evaluator2.vanilla import Evaluator

@@ -11,7 +11,9 @@ r"""Contain data outputs."""

"ColumnCooccurrenceOutput",
"ColumnCorrelationOutput",
"ContentOutput",
"ContinuousSeriesOutput",
"DataFrameSummaryOutput",
"CorrelationOutput",
"EmptyOutput",
"NullValueOutput",
"NumericSummaryOutput",
"Output",

@@ -21,2 +23,3 @@ "OutputDict",

"ScatterColumnOutput",
"SummaryOutput",
"TemporalNullValueOutput",

@@ -30,13 +33,16 @@ "TemporalPlotColumnOutput",

from arkas.output.column_cooccurrence import ColumnCooccurrenceOutput
from arkas.output.column_correlation import ColumnCorrelationOutput
from arkas.output.content import ContentOutput
from arkas.output.continuous_series import ContinuousSeriesOutput
from arkas.output.correlation import CorrelationOutput
from arkas.output.empty import EmptyOutput
from arkas.output.frame_summary import DataFrameSummaryOutput
from arkas.output.lazy import BaseLazyOutput
from arkas.output.mapping import OutputDict
from arkas.output.null_value import NullValueOutput
from arkas.output.numeric_summary import NumericSummaryOutput
from arkas.output.plot_column import PlotColumnOutput
from arkas.output.scatter_column import ScatterColumnOutput
from arkas.output.summary import SummaryOutput
from arkas.output.temporal_null_value import TemporalNullValueOutput
from arkas.output.temporal_plot_column import TemporalPlotColumnOutput
from arkas.output.vanilla import Output

@@ -44,7 +44,7 @@ r"""Implement an output to plot each column of a DataFrame."""

PlotColumnOutput(
(state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig())
(state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)
>>> output.get_content_generator()
PlotColumnContentGenerator(
(state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig())
(state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -55,3 +55,3 @@ >>> output.get_evaluator()

PlotColumnPlotter(
(state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig())
(state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -58,0 +58,0 @@

@@ -17,3 +17,3 @@ r"""Implement an output to scatter plot some columns."""

if TYPE_CHECKING:
from arkas.state.temporal_dataframe import ScatterDataFrameState
from arkas.state.scatter_dataframe import ScatterDataFrameState

@@ -45,7 +45,7 @@

ScatterColumnOutput(
(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)
>>> output.get_content_generator()
ScatterColumnContentGenerator(
(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -56,3 +56,3 @@ >>> output.get_evaluator()

ScatterColumnPlotter(
(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -59,0 +59,0 @@

@@ -60,7 +60,7 @@ r"""Implement an output to analyze the number of null values in a

TemporalNullValueOutput(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)
>>> output.get_content_generator()
TemporalNullValueContentGenerator(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -71,3 +71,3 @@ >>> output.get_evaluator()

TemporalNullValuePlotter(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -74,0 +74,0 @@

@@ -60,7 +60,7 @@ r"""Implement an output to plot each column of a DataFrame along a

TemporalPlotColumnOutput(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)
>>> output.get_content_generator()
TemporalPlotColumnContentGenerator(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -71,3 +71,3 @@ >>> output.get_evaluator()

TemporalPlotColumnPlotter(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -74,0 +74,0 @@

@@ -9,2 +9,3 @@ r"""Contain data plotters."""

"ContinuousSeriesPlotter",
"CorrelationPlotter",
"NullValuePlotter",

@@ -22,2 +23,3 @@ "PlotColumnPlotter",

from arkas.plotter.continuous_series import ContinuousSeriesPlotter
from arkas.plotter.correlation import CorrelationPlotter
from arkas.plotter.mapping import PlotterDict

@@ -24,0 +26,0 @@ from arkas.plotter.null_value import NullValuePlotter

@@ -131,3 +131,3 @@ r"""Contain the implementation of a DataFrame column plotter."""

PlotColumnPlotter(
(state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig())
(state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -134,0 +134,0 @@

@@ -35,3 +35,3 @@ r"""Contain the implementation of a DataFrame column plotter."""

Args:
state: The state containing the DataFrame to analyze.
state: The state containing the DataFrame to analyze.

@@ -153,3 +153,3 @@ Returns:

ScatterColumnPlotter(
(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color='col3', figure_config=MatplotlibFigureConfig())
(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -156,0 +156,0 @@

@@ -178,3 +178,3 @@ r"""Contain the implementation of a DataFrame column plotter."""

TemporalNullValuePlotter(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -181,0 +181,0 @@

@@ -186,3 +186,3 @@ r"""Contain the implementation of a DataFrame column plotter."""

TemporalPlotColumnPlotter(
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
)

@@ -189,0 +189,0 @@

@@ -14,2 +14,3 @@ r"""Contain states."""

"SeriesState",
"TargetDataFrameState",
"TemporalDataFrameState",

@@ -26,2 +27,3 @@ ]

from arkas.state.series import SeriesState
from arkas.state.target_dataframe import TargetDataFrameState
from arkas.state.temporal_dataframe import TemporalDataFrameState

@@ -14,2 +14,3 @@ r"""Implement the DataFrame state."""

from arkas.figure.utils import get_default_config
from arkas.metric.utils import check_nan_policy
from arkas.state.base import BaseState

@@ -35,2 +36,5 @@

dataframe: The DataFrame.
nan_policy: The policy on how to handle NaN values in the input
arrays. The following options are available: ``'omit'``,
``'propagate'``, and ``'raise'``.
figure_config: An optional figure configuration.

@@ -53,3 +57,3 @@

>>> state
DataFrameState(dataframe=(7, 3), figure_config=MatplotlibFigureConfig())
DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())

@@ -62,5 +66,8 @@ ```

dataframe: pl.DataFrame,
nan_policy: str = "propagate",
figure_config: BaseFigureConfig | None = None,
) -> None:
self._dataframe = dataframe
check_nan_policy(nan_policy)
self._nan_policy = nan_policy
self._figure_config = figure_config or get_default_config()

@@ -72,2 +79,3 @@

"dataframe": self._dataframe.shape,
"nan_policy": self._nan_policy,
"figure_config": self._figure_config,

@@ -83,2 +91,3 @@ }

"dataframe": self._dataframe.shape,
"nan_policy": self._nan_policy,
"figure_config": self._figure_config,

@@ -95,2 +104,6 @@ }

@property
def nan_policy(self) -> str:
return self._nan_policy
@property
def figure_config(self) -> BaseFigureConfig | None:

@@ -102,2 +115,3 @@ return self._figure_config

dataframe=self._dataframe.clone() if deep else self._dataframe,
nan_policy=self._nan_policy,
figure_config=self._figure_config.clone() if deep else self._figure_config,

@@ -114,3 +128,4 @@ )

"dataframe": self._dataframe,
"nan_policy": self._nan_policy,
"figure_config": self._figure_config,
}

@@ -36,2 +36,5 @@ r"""Implement the DataFrame state for scatter plots."""

color: An optional color axis data column.
nan_policy: The policy on how to handle NaN values in the input
arrays. The following options are available: ``'omit'``,
``'propagate'``, and ``'raise'``.
figure_config: An optional figure configuration.

@@ -54,3 +57,3 @@

>>> state
ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())

@@ -66,5 +69,6 @@ ```

color: str | None = None,
nan_policy: str = "propagate",
figure_config: BaseFigureConfig | None = None,
) -> None:
super().__init__(dataframe=dataframe, figure_config=figure_config)
super().__init__(dataframe=dataframe, nan_policy=nan_policy, figure_config=figure_config)

@@ -86,2 +90,3 @@ check_column_exist(dataframe, x)

"color": self._color,
"nan_policy": self._nan_policy,
"figure_config": self._figure_config,

@@ -100,2 +105,3 @@ }

"color": self._color,
"nan_policy": self._nan_policy,
"figure_config": self._figure_config,

@@ -125,2 +131,3 @@ }

color=self._color,
nan_policy=self._nan_policy,
figure_config=self._figure_config.clone() if deep else self._figure_config,

@@ -127,0 +134,0 @@ )

@@ -35,2 +35,5 @@ r"""Implement the temporal DataFrame state."""

period: An optional temporal period e.g. monthly or daily.
nan_policy: The policy on how to handle NaN values in the input
arrays. The following options are available: ``'omit'``,
``'propagate'``, and ``'raise'``.
figure_config: An optional figure configuration.

@@ -66,3 +69,3 @@

>>> state
TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())

@@ -77,5 +80,6 @@ ```

period: str | None = None,
nan_policy: str = "propagate",
figure_config: BaseFigureConfig | None = None,
) -> None:
super().__init__(dataframe=dataframe, figure_config=figure_config)
super().__init__(dataframe=dataframe, nan_policy=nan_policy, figure_config=figure_config)

@@ -92,2 +96,3 @@ check_column_exist(dataframe, temporal_column)

"period": self._period,
"nan_policy": self._nan_policy,
"figure_config": self._figure_config,

@@ -105,2 +110,3 @@ }

"period": self._period,
"nan_policy": self._nan_policy,
"figure_config": self._figure_config,

@@ -125,2 +131,3 @@ }

period=self._period,
nan_policy=self._nan_policy,
figure_config=self._figure_config.clone() if deep else self._figure_config,

@@ -127,0 +134,0 @@ )

@@ -5,3 +5,3 @@ r"""Contain DataFrame utility functions."""

__all__ = ["to_arrays"]
__all__ = ["check_column_exist", "check_num_columns", "to_arrays"]

@@ -16,10 +16,11 @@

def to_arrays(frame: pl.DataFrame) -> dict[str, np.ndarray]:
r"""Convert a ``polars.DataFrame`` to a dictionary of NumPy arrays.
def check_column_exist(frame: pl.DataFrame, col: str) -> None:
r"""Check if a column exists in the DataFrame.
Args:
frame: The DataFrame to convert.
frame: The DataFrame.
col: The column to check.
Returns:
A dictionary of NumPy arrays.
Raises:
ValueError: if the column is missing.

@@ -31,3 +32,3 @@ Example usage:

>>> import polars as pl
>>> from arkas.utils.dataframe import to_arrays
>>> from arkas.utils.dataframe import check_column_exist
>>> frame = pl.DataFrame(

@@ -41,22 +42,21 @@ ... {

... )
>>> data = to_arrays(frame)
>>> data
{'int': array([1, 2, 3, 4, 5]),
'float': array([5., 4., 3., 2., 1.]),
'str': array(['a', 'b', 'c', 'd', 'e'], dtype=object)}
>>> check_column_exist(frame, "int")
```
"""
return {s.name: s.to_numpy() for s in frame.iter_columns()}
if col not in frame:
msg = f"The column {col!r} is not in the DataFrame: {sorted(frame.columns)}"
raise ValueError(msg)
def check_column_exist(frame: pl.DataFrame, col: str) -> None:
r"""Check if a column exists in the DataFrame.
def check_num_columns(frame: pl.DataFrame, num_columns: int) -> None:
r"""Check if the DataFrame has the expected number of columns.
Args:
frame: The DataFrame.
col: The column to check.
num_columns: The expected number of columns.
Raises:
ValueError: if the column is missing.
ValueError: if the DataFrame has not the expected number of
columns.

@@ -68,3 +68,3 @@ Example usage:

>>> import polars as pl
>>> from arkas.utils.dataframe import check_column_exist
>>> from arkas.utils.dataframe import check_num_columns
>>> frame = pl.DataFrame(

@@ -78,8 +78,45 @@ ... {

... )
>>> check_column_exist(frame, "int")
>>> check_num_columns(frame, num_columns=3)
```
"""
if col not in frame:
msg = f"The column {col!r} is not in the DataFrame: {sorted(frame.columns)}"
if frame.shape[1] != num_columns:
msg = (
f"The DataFrame must have {num_columns:,} columns but received a DataFrame of "
f"shape {frame.shape}"
)
raise ValueError(msg)
def to_arrays(frame: pl.DataFrame) -> dict[str, np.ndarray]:
    r"""Build a dictionary of NumPy arrays from a ``polars.DataFrame``.

    Each column of the DataFrame becomes one entry of the dictionary:
    the key is the column name and the value is the result of
    ``to_numpy()`` on that column.

    Args:
        frame: The DataFrame to convert.

    Returns:
        A dictionary that maps each column name to its NumPy array.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.utils.dataframe import to_arrays
    >>> frame = pl.DataFrame(
    ...     {
    ...         "int": [1, 2, 3, 4, 5],
    ...         "float": [5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "str": ["a", "b", "c", "d", "e"],
    ...     },
    ...     schema={"int": pl.Int64, "float": pl.Float64, "str": pl.String},
    ... )
    >>> data = to_arrays(frame)
    >>> data
    {'int': array([1, 2, 3, 4, 5]),
     'float': array([5., 4., 3., 2., 1.]),
     'str': array(['a', 'b', 'c', 'd', 'e'], dtype=object)}

    ```
    """
    arrays = {}
    for series in frame.iter_columns():
        arrays[series.name] = series.to_numpy()
    return arrays
r"""Implement an analyzer that generates a summary of the DataFrame."""
from __future__ import annotations
__all__ = ["DataFrameSummaryAnalyzer"]
import logging
from typing import TYPE_CHECKING
from arkas.analyzer.lazy import BaseLazyAnalyzer
from arkas.output.frame_summary import DataFrameSummaryOutput
from arkas.utils.validation import check_positive
if TYPE_CHECKING:
import polars as pl
logger = logging.getLogger(__name__)
class DataFrameSummaryAnalyzer(BaseLazyAnalyzer):
    r"""Implement an analyzer that builds a summary of a DataFrame.

    The analyzer optionally reorders the columns alphabetically and
    wraps the DataFrame in a ``DataFrameSummaryOutput``.

    Args:
        top: The number of most frequent values to show. The value is
            validated with ``check_positive``.
        sort: If ``True``, sort the columns by alphabetical order.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import DataFrameSummaryAnalyzer
    >>> analyzer = DataFrameSummaryAnalyzer()
    >>> analyzer
    DataFrameSummaryAnalyzer(top=5, sort=False)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 0, 1],
    ...         "col2": [1, 0, 1, 0],
    ...         "col3": [1, 1, 1, 1],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int64, "col3": pl.Int64},
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    DataFrameSummaryOutput(shape=(4, 3), top=5)

    ```
    """

    def __init__(self, top: int = 5, sort: bool = False) -> None:
        # Validate before storing so an invalid ``top`` leaves no state behind.
        check_positive(name="top", value=top)
        self._top = top
        self._sort = bool(sort)

    def __repr__(self) -> str:
        cls_name = self.__class__.__qualname__
        return f"{cls_name}(top={self._top:,}, sort={self._sort})"

    def _analyze(self, frame: pl.DataFrame) -> DataFrameSummaryOutput:
        logger.info("Analyzing the DataFrame...")
        # Only reorder the columns when sorting was requested.
        data = frame.select(sorted(frame.columns)) if self._sort else frame
        return DataFrameSummaryOutput(frame=data, top=self._top)
r"""Contain the implementation of a HTML content generator that returns
a summary of a DataFrame."""
from __future__ import annotations
__all__ = [
"DataFrameSummaryContentGenerator",
"create_table",
"create_table_row",
"create_template",
]
import logging
from collections import Counter
from typing import TYPE_CHECKING, Any
from coola import objects_are_equal
from grizz.utils.count import compute_nunique
from grizz.utils.null import compute_null_count
from jinja2 import Template
from arkas.content.section import BaseSectionContentGenerator
from arkas.utils.validation import check_positive
if TYPE_CHECKING:
from collections.abc import Sequence
import polars as pl
logger = logging.getLogger(__name__)
class DataFrameSummaryContentGenerator(BaseSectionContentGenerator):
    r"""Implement a content generator that summarizes a DataFrame in
    HTML.

    The generated section contains the number of rows and columns,
    followed by a table with one row per column of the DataFrame.

    Args:
        frame: The DataFrame to analyze.
        top: The number of most frequent values to show. The value is
            validated with ``check_positive``.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import DataFrameSummaryContentGenerator
    >>> content = DataFrameSummaryContentGenerator(
    ...     frame=pl.DataFrame(
    ...         {
    ...             "col1": [1.2, 4.2, 4.2, 2.2],
    ...             "col2": [1, 1, 1, 1],
    ...             "col3": [1, 2, 2, 2],
    ...         },
    ...         schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
    ...     )
    ... )
    >>> content
    DataFrameSummaryContentGenerator(shape=(4, 3), top=5)

    ```
    """

    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
        self._frame = frame
        check_positive(name="top", value=top)
        self._top = top

    def __repr__(self) -> str:
        cls_name = self.__class__.__qualname__
        return f"{cls_name}(shape={self._frame.shape}, top={self._top})"

    @property
    def frame(self) -> pl.DataFrame:
        r"""The DataFrame to analyze."""
        return self._frame

    @property
    def top(self) -> int:
        r"""The number of most frequent values to show."""
        return self._top

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` has the same type, ``top`` value and
        DataFrame as this object."""
        if not isinstance(other, self.__class__):
            return False
        if self.top != other.top:
            return False
        return objects_are_equal(self.frame, other.frame, equal_nan=equal_nan)

    def get_columns(self) -> tuple[str, ...]:
        r"""Return the column names of the DataFrame."""
        return tuple(self._frame.columns)

    def get_null_count(self) -> tuple[int, ...]:
        r"""Return the output of ``compute_null_count`` as a tuple."""
        null_count = compute_null_count(self._frame)
        return tuple(null_count.tolist())

    def get_nunique(self) -> tuple[int, ...]:
        r"""Return the output of ``compute_nunique`` as a tuple."""
        nunique = compute_nunique(self._frame)
        return tuple(nunique.tolist())

    def get_dtypes(self) -> tuple[pl.DataType, ...]:
        r"""Return the data types from the DataFrame schema."""
        return tuple(self._frame.schema.dtypes())

    def get_most_frequent_values(self, top: int = 5) -> tuple[tuple[tuple[Any, int], ...], ...]:
        r"""Return the ``top`` most common ``(value, count)`` pairs of
        each column."""
        per_column = []
        for series in self.frame:
            counter = Counter(series.to_list())
            per_column.append(tuple(counter.most_common(top)))
        return tuple(per_column)

    def generate_content(self) -> str:
        logger.info("Generating the DataFrame summary content...")
        template = Template(create_template())
        nrows, ncols = self._frame.shape
        context = {
            "table": self._create_table(),
            "nrows": f"{nrows:,}",
            "ncols": f"{ncols:,}",
        }
        return template.render(context)

    def _create_table(self) -> str:
        r"""Build the HTML table from the per-column statistics."""
        columns = self.get_columns()
        null_count = self.get_null_count()
        nunique = self.get_nunique()
        dtypes = self.get_dtypes()
        most_frequent_values = self.get_most_frequent_values(top=self._top)
        return create_table(
            columns=columns,
            null_count=null_count,
            nunique=nunique,
            dtypes=dtypes,
            most_frequent_values=most_frequent_values,
            total=self._frame.shape[0],
        )
def create_template() -> str:
    r"""Return the template of the content.

    The template is a Jinja-style string with three placeholders:
    ``{{ncols}}``, ``{{nrows}}`` and ``{{table}}``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.frame_summary import create_template
    >>> template = create_template()

    ```
    """
    # The paragraph is closed explicitly before the list: a ``<ul>`` is not
    # valid inside a ``<p>``, and the original markup left the ``<p>`` open.
    return """This section shows a short summary of each column.
<ul>
<li> <b>column</b>: are the column names</li>
<li> <b>types</b>: are the object types for the objects in the column </li>
<li> <b>null</b>: are the number (and percentage) of null values in the column </li>
<li> <b>unique</b>: are the number (and percentage) of unique values in the column </li>
</ul>
<p style="margin-top: 1rem;">
<b>General statistics about the DataFrame</b>
</p>
<ul>
<li> number of columns: {{ncols}} </li>
<li> number of rows: {{nrows}}</li>
</ul>
{{table}}
"""
def create_table(
    columns: Sequence[str],
    null_count: Sequence[int],
    nunique: Sequence[int],
    dtypes: Sequence[pl.DataType],
    most_frequent_values: Sequence[Sequence[tuple[Any, int]]],
    total: int,
) -> str:
    r"""Return a HTML representation of a table that summarizes each
    column of a DataFrame.

    The table has one row per column, built with ``create_table_row``.

    Args:
        columns: The column names.
        null_count: The number of null values for each column.
        nunique: The number of unique values for each column.
        dtypes: The data type for each column.
        most_frequent_values: The most frequent values for each column.
        total: The total number of rows.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.frame_summary import create_table
    >>> table = create_table(
    ...     columns=["float", "int", "str"],
    ...     null_count=(1, 0, 2),
    ...     nunique=(5, 2, 4),
    ...     dtypes=(pl.Float64(), pl.Int64(), pl.String()),
    ...     most_frequent_values=(
    ...         ((2.2, 2), (1.2, 1), (4.2, 1), (None, 1), (1.0, 1)),
    ...         ((1, 5), (0, 1)),
    ...         (("B", 2), (None, 2), ("A", 1), ("C", 1)),
    ...     ),
    ...     total=42,
    ... )

    ```
    """
    # ``zip`` stops at the shortest input, so extra entries in a longer
    # sequence are ignored (same behavior as before).
    rows = "\n".join(
        create_table_row(
            column=column,
            null=null,
            dtype=dtype,
            nunique=nuniq,
            most_frequent_values=mf_values,
            total=total,
        )
        for column, null, nuniq, dtype, mf_values in zip(
            columns, null_count, nunique, dtypes, most_frequent_values
        )
    )
    return Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>types</th>
<th>null</th>
<th>unique</th>
<th>most frequent values</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    ).render({"rows": rows})
def create_table_row(
    column: str,
    null: int,
    nunique: int,
    dtype: pl.DataType,
    most_frequent_values: Sequence[tuple[Any, int]],
    total: int,
) -> str:
    r"""Create the HTML code of a new table row.

    Counts are shown with their percentage of ``total``. When ``total``
    is zero, every percentage is rendered as ``nan`` instead of
    raising ``ZeroDivisionError``.

    Args:
        column: The column name.
        null: The number of null values.
        nunique: The number of unique values.
        dtype: The data type of the column.
        most_frequent_values: The most frequent values, as
            ``(value, count)`` pairs.
        total: The total number of rows.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.frame_summary import create_table_row
    >>> row = create_table_row(
    ...     column="col",
    ...     null=5,
    ...     nunique=42,
    ...     dtype=pl.Float64(),
    ...     most_frequent_values=[("C", 12), ("A", 5), ("B", 4)],
    ...     total=100,
    ... )

    ```
    """

    def percentage(count: int) -> float:
        # Single guard shared by all three cells; previously the most
        # frequent values divided by ``total`` without checking for zero.
        return 100 * count / total if total else float("nan")

    null_text = f"{null:,} ({percentage(null):.2f}%)"
    nunique_text = f"{nunique:,} ({percentage(nunique):.2f}%)"
    frequent_text = ", ".join(
        f"{val} ({percentage(count):.2f}%)" for val, count in most_frequent_values
    )
    # NOTE(review): values are inserted as-is; ``Template`` presumably does not
    # autoescape here, so column names/values containing HTML markup would be
    # rendered verbatim -- confirm whether escaping is needed.
    return Template(
        """<tr>
<th>{{column}}</th>
<td>{{dtype}}</td>
<td {{num_style}}>{{null}}</td>
<td {{num_style}}>{{nunique}}</td>
<td>{{most_frequent_values}}</td>
</tr>"""
    ).render(
        {
            "num_style": 'style="text-align: right;"',
            "column": column,
            "null": null_text,
            "dtype": dtype,
            "nunique": nunique_text,
            "most_frequent_values": frequent_text,
        }
    )
r"""Implement the DataFrame summary output."""
from __future__ import annotations
__all__ = ["DataFrameSummaryOutput"]
from typing import TYPE_CHECKING, Any
from coola import objects_are_equal
from arkas.content.frame_summary import DataFrameSummaryContentGenerator
from arkas.evaluator2.vanilla import Evaluator
from arkas.output.lazy import BaseLazyOutput
from arkas.plotter.vanilla import Plotter
from arkas.utils.validation import check_positive
if TYPE_CHECKING:
import polars as pl
class DataFrameSummaryOutput(BaseLazyOutput):
    r"""Implement the output that holds a DataFrame to summarize.

    The content generator is a ``DataFrameSummaryContentGenerator``
    built from the stored DataFrame and ``top`` value; the evaluator
    and the plotter are created without any argument.

    Args:
        frame: The DataFrame to analyze.
        top: The number of most frequent values to show. The value is
            validated with ``check_positive``.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import DataFrameSummaryOutput
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.2, 4.2, 4.2, 2.2],
    ...         "col2": [1, 1, 1, 1],
    ...         "col3": [1, 2, 2, 2],
    ...     },
    ...     schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
    ... )
    >>> output = DataFrameSummaryOutput(frame)
    >>> output
    DataFrameSummaryOutput(shape=(4, 3), top=5)
    >>> output.get_content_generator()
    DataFrameSummaryContentGenerator(shape=(4, 3), top=5)
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    Plotter(count=0)

    ```
    """

    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
        self._frame = frame
        check_positive(name="top", value=top)
        self._top = top

    def __repr__(self) -> str:
        cls_name = self.__class__.__qualname__
        return f"{cls_name}(shape={self._frame.shape}, top={self._top})"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` has the same type, ``top`` value and
        DataFrame as this object."""
        if not isinstance(other, self.__class__):
            return False
        if self._top != other._top:
            return False
        return objects_are_equal(self._frame, other._frame, equal_nan=equal_nan)

    def _get_content_generator(self) -> DataFrameSummaryContentGenerator:
        generator = DataFrameSummaryContentGenerator(frame=self._frame, top=self._top)
        return generator

    def _get_evaluator(self) -> Evaluator:
        # No metric is computed for this output.
        return Evaluator()

    def _get_plotter(self) -> Plotter:
        # No figure is generated for this output.
        return Plotter()