arkas - npm Package Compare versions

+108

src/arkas/analyzer/column_correlation.py

		r"""Implement an analyzer that analyzes the correlation between numeric
		columns."""

		from __future__ import annotations

		__all__ = ["ColumnCorrelationAnalyzer"]

		import logging
		from typing import TYPE_CHECKING

		from grizz.utils.format import str_shape_diff
		from polars import selectors as cs

		from arkas.analyzer.lazy import BaseInNLazyAnalyzer
		from arkas.output import EmptyOutput
		from arkas.output.column_correlation import ColumnCorrelationOutput
		from arkas.state.target_dataframe import TargetDataFrameState

		if TYPE_CHECKING:
		from collections.abc import Sequence

		import polars as pl

		logger = logging.getLogger(__name__)


		class ColumnCorrelationAnalyzer(BaseInNLazyAnalyzer):
		    r"""Implement an analyzer to analyze the correlation between a
		    target column and the other numeric columns.

		    An empty output is returned when the target column is not in
		    the input DataFrame. Otherwise, only the numeric columns among
		    the selected columns are kept in the output state.

		    Args:
		        target_column: The target column. It is added to the
		            selected columns if it is not already part of them.
		        columns: The columns to analyze. If ``None``, it analyzes all
		            the columns.
		        exclude_columns: The columns to exclude from the input
		            ``columns``. If any column is not found, it will be ignored
		            during the filtering process.
		        missing_policy: The policy on how to handle missing columns.
		            The following options are available: ``'ignore'``,
		            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
		            is raised if at least one column is missing.
		            If ``'warn'``, a warning is raised if at least one column
		            is missing and the missing columns are ignored.
		            If ``'ignore'``, the missing columns are ignored and
		            no warning message appears.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.analyzer import ColumnCorrelationAnalyzer
		    >>> analyzer = ColumnCorrelationAnalyzer(target_column="col3")
		    >>> analyzer
		    ColumnCorrelationAnalyzer(target_column='col3', columns=None, exclude_columns=(), missing_policy='raise')
		    >>> frame = pl.DataFrame(
		    ...     {
		    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
		    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...     },
		    ... )
		    >>> output = analyzer.analyze(frame)
		    >>> output
		    ColumnCorrelationOutput(
		      (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )

		    ```
		    """

		    def __init__(
		        self,
		        target_column: str,
		        columns: Sequence[str] | None = None,
		        exclude_columns: Sequence[str] = (),
		        missing_policy: str = "raise",
		    ) -> None:
		        super().__init__(
		            columns=columns, exclude_columns=exclude_columns, missing_policy=missing_policy
		        )
		        self._target_column = target_column

		    def find_columns(self, frame: pl.DataFrame) -> tuple[str, ...]:
		        r"""Find the columns to analyze, always including the target
		        column.

		        Args:
		            frame: The input DataFrame.

		        Returns:
		            The columns found by the parent class, followed by the
		            target column if it was not already among them.
		        """
		        columns = list(super().find_columns(frame))
		        if self._target_column not in columns:
		            columns.append(self._target_column)
		        return tuple(columns)

		    def get_args(self) -> dict:
		        r"""Return the constructor arguments, with ``target_column``
		        first."""
		        return {"target_column": self._target_column} | super().get_args()

		    def _analyze(self, frame: pl.DataFrame) -> ColumnCorrelationOutput | EmptyOutput:
		        # Without the target column there is nothing to correlate against.
		        if self._target_column not in frame:
		            logger.info(
		                f"Skipping '{self.__class__.__qualname__}.analyze' "
		                f"because the target column {self._target_column!r} is missing"
		            )
		            return EmptyOutput()

		        logger.info(
		            f"Analyzing the correlation between {self._target_column} and {self._columns}..."
		        )
		        columns = list(self.find_common_columns(frame))
		        # Keep only the selected columns that are also numeric.
		        out = frame.select(cs.by_name(columns) & cs.numeric())
		        logger.info(str_shape_diff(orig=frame.shape, final=out.shape))
		        return ColumnCorrelationOutput(
		            state=TargetDataFrameState(dataframe=out, target_column=self._target_column)
		        )

+152

src/arkas/analyzer/correlation.py

		r"""Implement an analyzer that analyzes the correlation between two
		columns."""

		from __future__ import annotations

		__all__ = ["CorrelationAnalyzer"]

		import logging
		from typing import TYPE_CHECKING, Any

		from coola import objects_are_equal
		from coola.utils.format import repr_mapping_line
		from grizz.utils.column import check_column_missing_policy, check_missing_column
		from grizz.utils.format import str_shape_diff

		from arkas.analyzer.lazy import BaseLazyAnalyzer
		from arkas.metric.utils import check_nan_policy
		from arkas.output import EmptyOutput
		from arkas.output.correlation import CorrelationOutput
		from arkas.state.dataframe import DataFrameState

		if TYPE_CHECKING:
		import polars as pl

		from arkas.figure import BaseFigureConfig

		logger = logging.getLogger(__name__)


		class CorrelationAnalyzer(BaseLazyAnalyzer):
		    r"""Implement an analyzer that analyzes the correlation between two
		    columns.

		    Args:
		        x: The first column.
		        y: The second column.
		        drop_nulls: If ``True``, the rows with null values in
		            ``x`` or ``y`` columns are dropped.
		        missing_policy: The policy on how to handle missing columns.
		            The following options are available: ``'ignore'``,
		            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
		            is raised if at least one column is missing.
		            If ``'warn'``, a warning is raised if at least one column
		            is missing and the missing columns are ignored.
		            If ``'ignore'``, the missing columns are ignored and
		            no warning message appears.
		        nan_policy: The policy on how to handle NaN values in the input
		            arrays. The following options are available: ``'omit'``,
		            ``'propagate'``, and ``'raise'``.
		        figure_config: The figure configuration.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.analyzer import CorrelationAnalyzer
		    >>> analyzer = CorrelationAnalyzer(x="col1", y="col2")
		    >>> analyzer
		    CorrelationAnalyzer(x='col1', y='col2', drop_nulls=True, missing_policy='raise', nan_policy='propagate', figure_config=None)
		    >>> frame = pl.DataFrame(
		    ...     {
		    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
		    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...     },
		    ...     schema={"col1": pl.Float64, "col2": pl.Float64, "col3": pl.Float64},
		    ... )
		    >>> output = analyzer.analyze(frame)
		    >>> output
		    CorrelationOutput(
		      (state): DataFrameState(dataframe=(7, 2), figure_config=MatplotlibFigureConfig())
		    )

		    ```
		    """

		    def __init__(
		        self,
		        x: str,
		        y: str,
		        drop_nulls: bool = True,
		        missing_policy: str = "raise",
		        nan_policy: str = "propagate",
		        figure_config: BaseFigureConfig | None = None,
		    ) -> None:
		        self._x = x
		        self._y = y
		        self._drop_nulls = bool(drop_nulls)

		        # Validate the policies before storing them.
		        check_column_missing_policy(missing_policy)
		        self._missing_policy = missing_policy

		        check_nan_policy(nan_policy)
		        self._nan_policy = nan_policy

		        self._figure_config = figure_config

		    def __repr__(self) -> str:
		        args = repr_mapping_line(self.get_args())
		        return f"{self.__class__.__qualname__}({args})"

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if ``other`` is an analyzer of the same class with
		        equal arguments.

		        Args:
		            other: The object to compare with.
		            equal_nan: Forwarded to ``objects_are_equal`` when
		                comparing the arguments.

		        Returns:
		            ``True`` if the two analyzers are equal, otherwise ``False``.
		        """
		        if not isinstance(other, self.__class__):
		            return False
		        return objects_are_equal(self.get_args(), other.get_args(), equal_nan=equal_nan)

		    def get_args(self) -> dict:
		        r"""Return the constructor arguments of the analyzer."""
		        return {
		            "x": self._x,
		            "y": self._y,
		            "drop_nulls": self._drop_nulls,
		            "missing_policy": self._missing_policy,
		            "nan_policy": self._nan_policy,
		            "figure_config": self._figure_config,
		        }

		    def _analyze(self, frame: pl.DataFrame) -> CorrelationOutput | EmptyOutput:
		        # The missing policy is applied first; the loop below only runs
		        # to completion when both columns exist in the frame.
		        self._check_input_column(frame)
		        for col in [self._x, self._y]:
		            if col not in frame:
		                logger.info(
		                    f"Skipping '{self.__class__.__qualname__}.analyze' "
		                    f"because the input column {col!r} is missing"
		                )
		                return EmptyOutput()

		        logger.info(f"Analyzing the correlation between {self._x!r} and {self._y!r}...")
		        dataframe = self._prepare_data(frame)
		        logger.info(str_shape_diff(orig=frame.shape, final=dataframe.shape))
		        return CorrelationOutput(
		            DataFrameState(
		                dataframe=dataframe, nan_policy=self._nan_policy, figure_config=self._figure_config
		            )
		        )

		    def _prepare_data(self, data: pl.DataFrame) -> pl.DataFrame:
		        r"""Select the two columns and optionally drop the rows with
		        null values.

		        Args:
		            data: The input DataFrame.

		        Returns:
		            The DataFrame restricted to the ``x`` and ``y`` columns.
		        """
		        cols = [self._x, self._y]
		        data = data.select(cols)
		        if self._drop_nulls:
		            logger.info(f"Dropping rows that have at least one null value in the columns: {cols}")
		            data = data.drop_nulls()
		        return data

		    def _check_input_column(self, frame: pl.DataFrame) -> None:
		        r"""Check if the input column is missing.

		        Args:
		            frame: The input DataFrame to check.
		        """
		        check_missing_column(frame, column=self._x, missing_policy=self._missing_policy)
		        check_missing_column(frame, column=self._y, missing_policy=self._missing_policy)

+74

src/arkas/analyzer/numeric_summary.py

		r"""Implement an analyzer that generates a summary of the numeric
		columns of a DataFrame."""

		from __future__ import annotations

		__all__ = ["NumericSummaryAnalyzer"]

		import logging
		from typing import TYPE_CHECKING

		from grizz.utils.format import str_shape_diff
		from polars import selectors as cs

		from arkas.analyzer.lazy import BaseInNLazyAnalyzer
		from arkas.output.numeric_summary import NumericSummaryOutput
		from arkas.state.dataframe import DataFrameState

		if TYPE_CHECKING:
		import polars as pl

		logger = logging.getLogger(__name__)


		class NumericSummaryAnalyzer(BaseInNLazyAnalyzer):
		    r"""Implement an analyzer that summarizes the numeric columns of a
		    DataFrame.

		    Args:
		        columns: The columns to analyze. If ``None``, it analyzes all
		            the columns.
		        exclude_columns: The columns to exclude from the input
		            ``columns``. If any column is not found, it will be ignored
		            during the filtering process.
		        missing_policy: The policy on how to handle missing columns.
		            The following options are available: ``'ignore'``,
		            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
		            is raised if at least one column is missing.
		            If ``'warn'``, a warning is raised if at least one column
		            is missing and the missing columns are ignored.
		            If ``'ignore'``, the missing columns are ignored and
		            no warning message appears.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.analyzer import NumericSummaryAnalyzer
		    >>> analyzer = NumericSummaryAnalyzer()
		    >>> analyzer
		    NumericSummaryAnalyzer(columns=None, exclude_columns=(), missing_policy='raise')
		    >>> frame = pl.DataFrame(
		    ...     {
		    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
		    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
		    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...     },
		    ...     schema={"col1": pl.Int64, "col2": pl.Int32, "col3": pl.Float64},
		    ... )
		    >>> output = analyzer.analyze(frame)
		    >>> output
		    NumericSummaryOutput(
		      (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )

		    ```
		    """

		    def _analyze(self, frame: pl.DataFrame) -> NumericSummaryOutput:
		        logger.info("Analyzing the numeric columns...")
		        # Restrict the frame to the selected columns that are numeric.
		        selector = cs.by_name(self.find_common_columns(frame)) & cs.numeric()
		        numeric_frame = frame.select(selector)
		        logger.info(str_shape_diff(orig=frame.shape, final=numeric_frame.shape))
		        state = DataFrameState(numeric_frame)
		        return NumericSummaryOutput(state=state)

+63

src/arkas/analyzer/summary.py

		r"""Implement an analyzer that generates a summary of the DataFrame."""

		from __future__ import annotations

		__all__ = ["SummaryAnalyzer"]

		import logging
		from typing import TYPE_CHECKING

		from arkas.analyzer.lazy import BaseLazyAnalyzer
		from arkas.output.summary import SummaryOutput
		from arkas.utils.validation import check_positive

		if TYPE_CHECKING:
		import polars as pl

		logger = logging.getLogger(__name__)


		class SummaryAnalyzer(BaseLazyAnalyzer):
		    r"""Implement an analyzer that produces a summary of the DataFrame.

		    Args:
		        top: The number of most frequent values to show.
		        sort: If ``True``, sort the columns by alphabetical order.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.analyzer import SummaryAnalyzer
		    >>> analyzer = SummaryAnalyzer()
		    >>> analyzer
		    SummaryAnalyzer(top=5, sort=False)
		    >>> frame = pl.DataFrame(
		    ...     {
		    ...         "col1": [0, 1, 0, 1],
		    ...         "col2": [1, 0, 1, 0],
		    ...         "col3": [1, 1, 1, 1],
		    ...     },
		    ...     schema={"col1": pl.Int64, "col2": pl.Int64, "col3": pl.Int64},
		    ... )
		    >>> output = analyzer.analyze(frame)
		    >>> output
		    SummaryOutput(shape=(4, 3), top=5)

		    ```
		    """

		    def __init__(self, top: int = 5, sort: bool = False) -> None:
		        check_positive(name="top", value=top)
		        self._top = top
		        self._sort = bool(sort)

		    def __repr__(self) -> str:
		        name = self.__class__.__qualname__
		        return f"{name}(top={self._top:,}, sort={self._sort})"

		    def _analyze(self, frame: pl.DataFrame) -> SummaryOutput:
		        logger.info("Analyzing the DataFrame...")
		        # Reorder the columns alphabetically only when requested.
		        ordered = frame.select(sorted(frame.columns)) if self._sort else frame
		        return SummaryOutput(frame=ordered, top=self._top)

+242

src/arkas/content/column_correlation.py

		r"""Contain the implementation of a HTML content generator that analyzes
		the correlation between 1 target column and other columns."""

		from __future__ import annotations

		__all__ = [
		"ColumnCorrelationContentGenerator",
		"create_table",
		"create_table_row",
		"create_template",
		]

		import logging
		from typing import TYPE_CHECKING, Any

		from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
		from jinja2 import Template

		from arkas.content.section import BaseSectionContentGenerator
		from arkas.evaluator2.column_correlation import ColumnCorrelationEvaluator

		if TYPE_CHECKING:
		from collections.abc import Sequence

		from arkas.state.target_dataframe import TargetDataFrameState

		logger = logging.getLogger(__name__)


		class ColumnCorrelationContentGenerator(BaseSectionContentGenerator):
		    r"""Implement a content generator that reports the correlation
		    between one target column and the other columns.

		    Args:
		        state: The state containing the DataFrame to analyze.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content import ColumnCorrelationContentGenerator
		    >>> from arkas.state import TargetDataFrameState
		    >>> frame = pl.DataFrame(
		    ...     {
		    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
		    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...     },
		    ... )
		    >>> content = ColumnCorrelationContentGenerator(
		    ...     TargetDataFrameState(frame, target_column="col3")
		    ... )
		    >>> content
		    ColumnCorrelationContentGenerator(
		      (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )

		    ```
		    """

		    def __init__(self, state: TargetDataFrameState) -> None:
		        self._state = state

		    def __repr__(self) -> str:
		        args = repr_indent(repr_mapping({"state": self._state}))
		        return f"{self.__class__.__qualname__}(\n {args}\n)"

		    def __str__(self) -> str:
		        args = str_indent(str_mapping({"state": self._state}))
		        return f"{self.__class__.__qualname__}(\n {args}\n)"

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        # Only generators of the same class can be equal; the comparison
		        # is then delegated to the states.
		        if isinstance(other, self.__class__):
		            return self._state.equal(other._state, equal_nan=equal_nan)
		        return False

		    def generate_content(self) -> str:
		        logger.info(
		            f"Generating the correlation analysis between {self._state.target_column} "
		            f"and {list(self._state.dataframe.columns)}..."
		        )
		        metrics = ColumnCorrelationEvaluator(self._state).evaluate()
		        # The table lists every column except the target itself.
		        others = list(self._state.dataframe.columns)
		        others.remove(self._state.target_column)
		        nrows, ncols = self._state.dataframe.shape
		        template = Template(create_template())
		        context = {
		            "nrows": f"{nrows:,}",
		            "ncols": f"{ncols:,}",
		            "columns": ", ".join(self._state.dataframe.columns),
		            "table": create_table(metrics, columns=others),
		            "target_column": f"{self._state.target_column}",
		        }
		        return template.render(context)


		def create_template() -> str:
		    r"""Return the template used to render the column correlation
		    section.

		    The template expects the variables ``target_column``, ``nrows``,
		    ``ncols`` and ``table``.

		    Returns:
		        The content template.

		    Example usage:

		    ```pycon

		    >>> from arkas.content.column_correlation import create_template
		    >>> template = create_template()

		    ```
		    """
		    template = """<p style="margin-top: 1rem;">
		This section analyzes the correlation between <em>{{target_column}}</em> and other columns.
		The correlation coefficient is a statistical measure of the strength of a
		relationship between two variables. Its values can range from -1 to 1.
		<ul>
		<li> A correlation coefficient of -1 describes a perfect negative, or inverse,
		correlation, with values in one series rising as those in the other decline,
		and vice versa. </li>
		<li> A coefficient of 1 shows a perfect positive correlation, or a direct relationship. </li>
		<li> A correlation coefficient of 0 means there is no direct relationship. </li>
		</ul>
		The DataFrame has {{nrows}} rows and {{ncols}} columns.
		</p>

		{{table}}
		"""
		    return template


		def create_table(metrics: dict[str, dict], columns: Sequence[str]) -> str:
		    r"""Return a HTML representation of a table with the correlation
		    statistics of each column.

		    Args:
		        metrics: The dictionary of metrics. The metrics of a column
		            ``col`` are read from the key ``correlation_{col}``.
		        columns: The columns to show in the table.

		    Returns:
		        The HTML representation of the table.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content.column_correlation import create_table
		    >>> row = create_table(
		    ...     metrics={
		    ...         "correlation_col1": {
		    ...             "count": 7,
		    ...             "pearson_coeff": 1.0,
		    ...             "pearson_pvalue": 0.0,
		    ...             "spearman_coeff": 1.0,
		    ...             "spearman_pvalue": 0.0,
		    ...         },
		    ...         "correlation_col2": {
		    ...             "count": 7,
		    ...             "pearson_coeff": -1.0,
		    ...             "pearson_pvalue": 0.0,
		    ...             "spearman_coeff": -1.0,
		    ...             "spearman_pvalue": 0.0,
		    ...         },
		    ...     },
		    ...     columns=["col1", "col2"],
		    ... )

		    ```
		    """
		    # One HTML row per requested column, in the given order.
		    body = "\n".join(
		        create_table_row(column=name, metrics=metrics[f"correlation_{name}"])
		        for name in columns
		    )
		    table = Template(
		        """<table class="table table-hover table-responsive w-auto" >
		<thead class="thead table-group-divider">
		<tr>
		<th>column</th>
		<th>num samples</th>
		<th>pearson coefficient</th>
		<th>pearson p-value</th>
		<th>spearman coefficient</th>
		<th>spearman p-value</th>
		</tr>
		</thead>
		<tbody class="tbody table-group-divider">
		{{rows}}
		<tr class="table-group-divider"></tr>
		</tbody>
		</table>
		"""
		    )
		    return table.render({"rows": body})


		def create_table_row(column: str, metrics: dict) -> str:
		    r"""Create the HTML code of a table row for one column.

		    A missing ``count`` is shown as ``0`` and a missing coefficient
		    or p-value is shown as ``nan``.

		    Args:
		        column: The column name.
		        metrics: The dictionary of metrics with the correlation scores.

		    Returns:
		        The HTML code of a row.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content.column_correlation import create_table_row
		    >>> row = create_table_row(
		    ...     column="col1",
		    ...     metrics={
		    ...         "count": 7,
		    ...         "pearson_coeff": 1.0,
		    ...         "pearson_pvalue": 0.0,
		    ...         "spearman_coeff": 1.0,
		    ...         "spearman_pvalue": 0.0,
		    ...     },
		    ... )

		    ```
		    """
		    row = Template(
		        """<tr>
		<th>{{column}}</th>
		<td {{num_style}}>{{count}}</td>
		<td {{num_style}}>{{pearson_coeff}}</td>
		<td {{num_style}}>{{pearson_pvalue}}</td>
		<td {{num_style}}>{{spearman_coeff}}</td>
		<td {{num_style}}>{{spearman_pvalue}}</td>
		</tr>"""
		    )
		    context = {
		        "num_style": 'style="text-align: right;"',
		        "column": column,
		        "count": f'{metrics.get("count", 0):,}',
		    }
		    # The four scores share the same 4-decimal format and NaN fallback.
		    for key in ("pearson_coeff", "pearson_pvalue", "spearman_coeff", "spearman_pvalue"):
		        context[key] = f'{metrics.get(key, float("nan")):.4f}'
		    return row.render(context)

+129

src/arkas/content/correlation.py

		r"""Contain the implementation of a HTML content generator that analyzes
		the correlation between two columns."""

		from __future__ import annotations

		__all__ = [
		"CorrelationContentGenerator",
		"create_template",
		]

		import logging
		from typing import TYPE_CHECKING, Any

		from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
		from jinja2 import Template

		from arkas.content.section import BaseSectionContentGenerator
		from arkas.evaluator2.correlation import CorrelationEvaluator
		from arkas.figure.utils import figure2html
		from arkas.plotter.correlation import CorrelationPlotter
		from arkas.utils.dataframe import check_num_columns

		if TYPE_CHECKING:
		from arkas.state.target_dataframe import DataFrameState

		logger = logging.getLogger(__name__)


		class CorrelationContentGenerator(BaseSectionContentGenerator):
		    r"""Implement a content generator that reports the correlation
		    between two columns.

		    Args:
		        state: The state containing the DataFrame to analyze.
		            The DataFrame must have only 2 columns, which are the two
		            columns to analyze.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content import CorrelationContentGenerator
		    >>> from arkas.state import DataFrameState
		    >>> frame = pl.DataFrame(
		    ...     {
		    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
		    ...     },
		    ... )
		    >>> content = CorrelationContentGenerator(DataFrameState(frame))
		    >>> content
		    CorrelationContentGenerator(
		      (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )

		    ```
		    """

		    def __init__(self, state: DataFrameState) -> None:
		        # Validate the number of columns before keeping the state.
		        check_num_columns(state.dataframe, num_columns=2)
		        self._state = state

		    def __repr__(self) -> str:
		        args = repr_indent(repr_mapping({"state": self._state}))
		        return f"{self.__class__.__qualname__}(\n {args}\n)"

		    def __str__(self) -> str:
		        args = str_indent(str_mapping({"state": self._state}))
		        return f"{self.__class__.__qualname__}(\n {args}\n)"

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        # Only generators of the same class can be equal; the comparison
		        # is then delegated to the states.
		        if isinstance(other, self.__class__):
		            return self._state.equal(other._state, equal_nan=equal_nan)
		        return False

		    def generate_content(self) -> str:
		        xcol, ycol = self._state.dataframe.columns
		        logger.info(f"Generating the correlation analysis between {xcol} and {ycol}...")
		        metrics = CorrelationEvaluator(self._state).evaluate()
		        figures = CorrelationPlotter(self._state).plot()
		        template = Template(create_template())
		        context = {
		            "xcol": str(xcol),
		            "ycol": str(ycol),
		            "columns": ", ".join(self._state.dataframe.columns),
		            "count": f"{metrics['count']:,}",
		        }
		        # The four scores share the same 4-decimal format.
		        for key in ("pearson_coeff", "pearson_pvalue", "spearman_coeff", "spearman_pvalue"):
		            context[key] = f"{metrics[key]:.4f}"
		        context["figure"] = figure2html(figures["correlation"], close_fig=True)
		        return template.render(context)


		def create_template() -> str:
		    r"""Return the template used to render the correlation section.

		    The template expects the variables ``xcol``, ``ycol``,
		    ``pearson_coeff``, ``pearson_pvalue``, ``spearman_coeff``,
		    ``spearman_pvalue``, ``count`` and ``figure``.

		    Returns:
		        The content template.

		    Example usage:

		    ```pycon

		    >>> from arkas.content.correlation import create_template
		    >>> template = create_template()

		    ```
		    """
		    template = """<p style="margin-top: 1rem;">
		This section analyzes the correlation between <em>{{xcol}}</em> and <em>{{ycol}}</em>.
		The correlation coefficient is a statistical measure of the strength of a
		relationship between two variables. Its values can range from -1 to 1.

		<ul>
		<li> <b>pearson coefficient</b>: {{pearson_coeff}} </li>
		<li> <b>pearson p-value</b>: {{pearson_pvalue}} </li>
		<li> <b>spearman coefficient</b>: {{spearman_coeff}} </li>
		<li> <b>spearman p-value</b>: {{spearman_pvalue}} </li>
		<li> <b>num samples</b>: {{count}} </li>
		</ul>

		<p style="margin-top: 1rem;">
		The following figure shows the scatter plot between <em>{{xcol}}</em> and <em>{{ycol}}</em>.
		</p>
		{{figure}}
		"""
		    return template

+377

src/arkas/content/numeric_summary.py

		r"""Contain the implementation of a HTML content generator that
		summarizes the numeric columns of a DataFrame."""

		from __future__ import annotations

		__all__ = [
		"NumericSummaryContentGenerator",
		"create_table",
		"create_table_quantiles",
		"create_table_quantiles_row",
		"create_table_row",
		"create_template",
		]

		import logging
		from typing import TYPE_CHECKING, Any

		from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
		from jinja2 import Template

		from arkas.content.section import BaseSectionContentGenerator
		from arkas.content.utils import float_to_str
		from arkas.utils.stats import compute_statistics_continuous

		if TYPE_CHECKING:
		import polars as pl

		from arkas.state.dataframe import DataFrameState

		logger = logging.getLogger(__name__)


		class NumericSummaryContentGenerator(BaseSectionContentGenerator):
		    r"""Implement a content generator that summarizes the numeric
		    columns of a DataFrame.

		    Args:
		        state: The state containing the DataFrame to analyze.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content import NumericSummaryContentGenerator
		    >>> from arkas.state import DataFrameState
		    >>> dataframe = pl.DataFrame(
		    ...     {
		    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
		    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
		    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...     }
		    ... )
		    >>> content = NumericSummaryContentGenerator(DataFrameState(dataframe))
		    >>> content
		    NumericSummaryContentGenerator(
		      (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )

		    ```
		    """

		    def __init__(self, state: DataFrameState) -> None:
		        self._state = state

		    def __repr__(self) -> str:
		        args = repr_indent(repr_mapping({"state": self._state}))
		        return f"{self.__class__.__qualname__}(\n {args}\n)"

		    def __str__(self) -> str:
		        args = str_indent(str_mapping({"state": self._state}))
		        return f"{self.__class__.__qualname__}(\n {args}\n)"

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        # Only generators of the same class can be equal; the comparison
		        # is then delegated to the states.
		        if isinstance(other, self.__class__):
		            return self._state.equal(other._state, equal_nan=equal_nan)
		        return False

		    def generate_content(self) -> str:
		        nrows, ncols = self._state.dataframe.shape
		        logger.info(f"Generating the summary of {ncols:,} numeric columns...")
		        template = Template(create_template())
		        context = {
		            "nrows": f"{nrows:,}",
		            "ncols": f"{ncols:,}",
		            "columns": ", ".join(self._state.dataframe.columns),
		            "table": create_table(self._state.dataframe),
		            "table_quantiles": create_table_quantiles(self._state.dataframe),
		        }
		        return template.render(context)


		def create_template() -> str:
		    r"""Return the template used to render the numeric summary
		    section.

		    The template expects the variables ``table`` and
		    ``table_quantiles``. The comparison signs in the legend are
		    written as HTML character references (``&lt;`` and ``&gt;``)
		    so that they cannot be parsed as markup.

		    Returns:
		        The content template.

		    Example usage:

		    ```pycon

		    >>> from arkas.content.numeric_summary import create_template
		    >>> template = create_template()

		    ```
		    """
		    return """This section shows a short summary of each column.

		<ul>
		<li> <b>column</b>: is the column name</li>
		<li> <b>dtype</b>: is the column data type </li>
		<li> <b>null</b>: is the number (and percentage) of null values in the column </li>
		<li> <b>nan</b>: is the number (and percentage) of not a number (NaN) values in the column </li>
		<li> <b>unique</b>: is the number (and percentage) of unique values in the column </li>
		<li> <b>negative</b>: is the number (and percentage) of strictly negative values (<span>&lt;</span>0) in the column </li>
		<li> <b>zero</b>: is the number (and percentage) of zero values (=0) in the column </li>
		<li> <b>positive</b>: is the number (and percentage) of strictly positive values (<span>&gt;</span>0) in the column </li>
		</ul>

		<p style="margin-top: 1rem;">
		<b>General statistics about the DataFrame</b>
		{{table}}

		<details>
		<summary>[show additional statistics]</summary>

		<p style="margin-top: 1rem;">
		The following table shows some quantiles for each column. </p>

		{{table_quantiles}}
		</details>
		"""


		def create_table(
		    frame: pl.DataFrame,
		) -> str:
		    r"""Return a HTML representation of a table with some statistics
		    about each column.

		    Args:
		        frame: The DataFrame to analyze.

		    Returns:
		        The HTML representation of the table.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content.numeric_summary import create_table
		    >>> dataframe = pl.DataFrame(
		    ...     {
		    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
		    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
		    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...     }
		    ... )
		    >>> row = create_table(dataframe)

		    ```
		    """
		    # One HTML row per series of the frame.
		    body = "\n".join(create_table_row(series=column) for column in frame)
		    table = Template(
		        """<table class="table table-hover table-responsive w-auto" >
		<thead class="thead table-group-divider">
		<tr>
		<th>column</th>
		<th>dtype</th>
		<th>null</th>
		<th>nan</th>
		<th>unique</th>
		<th>negative</th>
		<th>zero</th>
		<th>positive</th>
		<th>mean</th>
		<th>std</th>
		<th>skewness</th>
		<th>kurtosis</th>
		<th>min</th>
		<th>median</th>
		<th>max</th>
		</tr>
		</thead>
		<tbody class="tbody table-group-divider">
		{{rows}}
		<tr class="table-group-divider"></tr>
		</tbody>
		</table>
		"""
		    )
		    return table.render({"rows": body})


		def create_table_row(series: pl.Series) -> str:
		    r"""Create the HTML code of a table row that summarizes a series.

		    Each count is shown with its percentage of ``stats["count"]``;
		    the percentage is ``nan`` when that total is zero.

		    Args:
		        series: The series to analyze.

		    Returns:
		        The HTML code of a row.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content.numeric_summary import create_table_row
		    >>> row = create_table_row(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7]))

		    ```
		    """
		    stats = compute_statistics_continuous(series)
		    num_nan = int(series.is_nan().sum())
		    num_null = stats["num_nulls"]
		    num_unique = stats["nunique"]
		    total = stats["count"]
		    num_negative = stats["<0"]
		    num_zero = stats["=0"]
		    num_positive = stats[">0"]

		    def count_with_pct(value: int) -> str:
		        # Format as "count (percentage%)" relative to the total.
		        return f"{value:,} ({100 * value / total if total else float('nan'):.2f}%)"

		    row = Template(
		        """<tr>
		<th>{{column}}</th>
		<td>{{dtype}}</td>
		<td {{num_style}}>{{null}}</td>
		<td {{num_style}}>{{nan}}</td>
		<td {{num_style}}>{{nunique}}</td>
		<td {{num_style}}>{{negative}}</td>
		<td {{num_style}}>{{zero}}</td>
		<td {{num_style}}>{{positive}}</td>
		<td {{num_style}}>{{mean}}</td>
		<td {{num_style}}>{{std}}</td>
		<td {{num_style}}>{{skewness}}</td>
		<td {{num_style}}>{{kurtosis}}</td>
		<td {{num_style}}>{{min}}</td>
		<td {{num_style}}>{{median}}</td>
		<td {{num_style}}>{{max}}</td>
		</tr>"""
		    )
		    context = {
		        "num_style": 'style="text-align: right;"',
		        "column": series.name,
		        "dtype": series.dtype,
		        "null": count_with_pct(num_null),
		        "nan": count_with_pct(num_nan),
		        "nunique": count_with_pct(num_unique),
		    }
		    for key in ("mean", "std", "skewness", "kurtosis", "min", "median", "max"):
		        context[key] = float_to_str(stats[key])
		    context["negative"] = count_with_pct(num_negative)
		    context["zero"] = count_with_pct(num_zero)
		    context["positive"] = count_with_pct(num_positive)
		    return row.render(context)


		def create_table_quantiles(
		    frame: pl.DataFrame,
		) -> str:
		    r"""Return a HTML representation of a table with quantile
		    statistics for each column.

		    Args:
		        frame: The DataFrame to analyze.

		    Returns:
		        The HTML representation of the table.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content.numeric_summary import create_table_quantiles
		    >>> dataframe = pl.DataFrame(
		    ...     {
		    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
		    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
		    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ...     }
		    ... )
		    >>> table = create_table_quantiles(dataframe)

		    ```
		    """
		    # One HTML row per series of the frame.
		    body = "\n".join(create_table_quantiles_row(series=column) for column in frame)
		    table = Template(
		        """<table class="table table-hover table-responsive w-auto" >
		<thead class="thead table-group-divider">
		<tr>
		<th>column</th>
		<th>min</th>
		<th>q0.001</th>
		<th>q0.01</th>
		<th>q0.05</th>
		<th>q0.10</th>
		<th>q0.25</th>
		<th>median</th>
		<th>q0.75</th>
		<th>q0.90</th>
		<th>q0.95</th>
		<th>q0.99</th>
		<th>q0.999</th>
		<th>max</th>
		</tr>
		</thead>
		<tbody class="tbody table-group-divider">
		{{rows}}
		<tr class="table-group-divider"></tr>
		</tbody>
		</table>
		"""
		    )
		    return table.render({"rows": body})


		def create_table_quantiles_row(series: pl.Series) -> str:
		    r"""Create the HTML code of a table row with the quantiles of a
		    series.

		    Args:
		        series: The series to analyze.

		    Returns:
		        The HTML code of a row.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content.numeric_summary import create_table_quantiles_row
		    >>> row = create_table_quantiles_row(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7]))

		    ```
		    """
		    stats = compute_statistics_continuous(series)
		    row = Template(
		        """<tr>
		<th>{{column}}</th>
		<td {{num_style}}>{{min}}</td>
		<td {{num_style}}>{{q001}}</td>
		<td {{num_style}}>{{q01}}</td>
		<td {{num_style}}>{{q05}}</td>
		<td {{num_style}}>{{q10}}</td>
		<td {{num_style}}>{{q25}}</td>
		<td {{num_style}}>{{median}}</td>
		<td {{num_style}}>{{q75}}</td>
		<td {{num_style}}>{{q90}}</td>
		<td {{num_style}}>{{q95}}</td>
		<td {{num_style}}>{{q99}}</td>
		<td {{num_style}}>{{q999}}</td>
		<td {{num_style}}>{{max}}</td>
		</tr>"""
		    )
		    # Each template variable has the same name as its key in ``stats``.
		    stat_keys = (
		        "min",
		        "q001",
		        "q01",
		        "q05",
		        "q10",
		        "q25",
		        "median",
		        "q75",
		        "q90",
		        "q95",
		        "q99",
		        "q999",
		        "max",
		    )
		    context = {
		        "num_style": 'style="text-align: right;"',
		        "column": series.name,
		    }
		    for key in stat_keys:
		        context[key] = float_to_str(stats[key])
		    return row.render(context)

+301

src/arkas/content/summary.py

		r"""Contain the implementation of a HTML content generator that returns
		a summary of a DataFrame."""

		from __future__ import annotations

		__all__ = [
		"SummaryContentGenerator",
		"create_table",
		"create_table_row",
		"create_template",
		]

		import logging
		from collections import Counter
		from typing import TYPE_CHECKING, Any

		from coola import objects_are_equal
		from grizz.utils.count import compute_nunique
		from grizz.utils.null import compute_null_count
		from jinja2 import Template

		from arkas.content.section import BaseSectionContentGenerator
		from arkas.utils.validation import check_positive

		if TYPE_CHECKING:
		from collections.abc import Sequence

		import polars as pl

		logger = logging.getLogger(__name__)


		class SummaryContentGenerator(BaseSectionContentGenerator):
		    r"""Generate the HTML content that summarizes each column of a
		    DataFrame.

		    For each column, the summary table is built from the column name,
		    the data type, the number of null values, the number of unique
		    values and the most frequent values.

		    Args:
		        frame: The DataFrame to analyze.
		        top: The number of most frequent values to show. The value
		            is validated with ``check_positive``.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content import SummaryContentGenerator
		    >>> content = SummaryContentGenerator(
		    ... frame=pl.DataFrame(
		    ... {
		    ... "col1": [1.2, 4.2, 4.2, 2.2],
		    ... "col2": [1, 1, 1, 1],
		    ... "col3": [1, 2, 2, 2],
		    ... },
		    ... schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
		    ... )
		    ... )
		    >>> content
		    SummaryContentGenerator(shape=(4, 3), top=5)

		    ```
		    """

		    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
		        self._frame = frame
		        check_positive(name="top", value=top)
		        self._top = top

		    def __repr__(self) -> str:
		        name = self.__class__.__qualname__
		        shape = self._frame.shape
		        return f"{name}(shape={shape}, top={self._top})"

		    @property
		    def frame(self) -> pl.DataFrame:
		        r"""The DataFrame to analyze."""
		        return self._frame

		    @property
		    def top(self) -> int:
		        r"""The number of most frequent values to show."""
		        return self._top

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if ``other`` is an instance of this class with the
		        same ``top`` value and an equal DataFrame."""
		        if not isinstance(other, self.__class__):
		            return False
		        if not self.top == other.top:
		            return False
		        return objects_are_equal(self.frame, other.frame, equal_nan=equal_nan)

		    def get_columns(self) -> tuple[str, ...]:
		        r"""Return the column names of the DataFrame."""
		        names = self._frame.columns
		        return tuple(names)

		    def get_null_count(self) -> tuple[int, ...]:
		        r"""Return the output of ``compute_null_count`` as a tuple with
		        one value per column."""
		        counts = compute_null_count(self._frame)
		        return tuple(counts.tolist())

		    def get_nunique(self) -> tuple[int, ...]:
		        r"""Return the output of ``compute_nunique`` as a tuple with
		        one value per column."""
		        counts = compute_nunique(self._frame)
		        return tuple(counts.tolist())

		    def get_dtypes(self) -> tuple[pl.DataType, ...]:
		        r"""Return the data type of each column."""
		        schema = self._frame.schema
		        return tuple(schema.dtypes())

		    def get_most_frequent_values(self, top: int = 5) -> tuple[tuple[tuple[Any, int], ...], ...]:
		        r"""Return, for each column, the ``top`` most common values
		        with their number of occurrences."""
		        per_column = []
		        for series in self.frame:
		            occurrences = Counter(series.to_list())
		            per_column.append(tuple(occurrences.most_common(top)))
		        return tuple(per_column)

		    def generate_content(self) -> str:
		        r"""Render the content template with the summary table and the
		        formatted numbers of rows and columns."""
		        logger.info("Generating the DataFrame summary content...")
		        nrows, ncols = self._frame.shape[0], self._frame.shape[1]
		        context = {
		            "table": self._create_table(),
		            "nrows": f"{nrows:,}",
		            "ncols": f"{ncols:,}",
		        }
		        return Template(create_template()).render(context)

		    def _create_table(self) -> str:
		        r"""Build the HTML summary table from the per-column values."""
		        return create_table(
		            columns=self.get_columns(),
		            null_count=self.get_null_count(),
		            nunique=self.get_nunique(),
		            dtypes=self.get_dtypes(),
		            most_frequent_values=self.get_most_frequent_values(top=self._top),
		            total=self._frame.shape[0],
		        )


		def create_template() -> str:
		    r"""Return the template of the content.

		    The template is a string with three placeholders, ``{{ncols}}``,
		    ``{{nrows}}`` and ``{{table}}``, preceded by a short legend of
		    the table columns.

		    Returns:
		        The content template.

		    Example usage:

		    ```pycon

		    >>> from arkas.content.summary import create_template
		    >>> template = create_template()

		    ```
		    """
		    return """This section shows a short summary of each column.

		<ul>
		<li> <b>column</b>: are the column names</li>
		<li> <b>types</b>: are the object types for the objects in the column </li>
		<li> <b>null</b>: are the number (and percentage) of null values in the column </li>
		<li> <b>unique</b>: are the number (and percentage) of unique values in the column </li>
		</ul>

		<p style="margin-top: 1rem;">
		<b>General statistics about the DataFrame</b>

		<ul>
		<li> number of columns: {{ncols}} </li>
		<li> number of rows: {{nrows}}</li>
		</ul>

		{{table}}
		"""


		def create_table(
		    columns: Sequence[str],
		    null_count: Sequence[int],
		    nunique: Sequence[int],
		    dtypes: Sequence[pl.DataType],
		    most_frequent_values: Sequence[Sequence[tuple[Any, int]]],
		    total: int,
		) -> str:
		    r"""Return a HTML representation of a table with a summary of
		    each column.

		    The per-column sequences are iterated in lockstep with ``zip``
		    and one row is created per column with ``create_table_row``.

		    Args:
		        columns: The column names.
		        null_count: The number of null values for each column.
		        nunique: The number of unique values for each column.
		        dtypes: The data type for each column.
		        most_frequent_values: The most frequent values for each column.
		        total: The total number of rows.

		    Returns:
		        The HTML representation of the table.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content.summary import create_table
		    >>> table = create_table(
		    ... columns=["float", "int", "str"],
		    ... null_count=(1, 0, 2),
		    ... nunique=(5, 2, 4),
		    ... dtypes=(pl.Float64(), pl.Int64(), pl.String()),
		    ... most_frequent_values=(
		    ... ((2.2, 2), (1.2, 1), (4.2, 1), (None, 1), (1.0, 1)),
		    ... ((1, 5), (0, 1)),
		    ... (("B", 2), (None, 2), ("A", 1), ("C", 1)),
		    ... ),
		    ... total=42,
		    ... )

		    ```
		    """
		    per_column = zip(columns, null_count, nunique, dtypes, most_frequent_values)
		    html_rows = "\n".join(
		        create_table_row(
		            column=name,
		            null=n_null,
		            dtype=dtype,
		            nunique=n_unique,
		            most_frequent_values=values,
		            total=total,
		        )
		        for name, n_null, n_unique, dtype, values in per_column
		    )
		    table_template = Template(
		        """<table class="table table-hover table-responsive w-auto" >
		<thead class="thead table-group-divider">
		<tr>
		<th>column</th>
		<th>types</th>
		<th>null</th>
		<th>unique</th>
		<th>most frequent values</th>
		</tr>
		</thead>
		<tbody class="tbody table-group-divider">
		{{rows}}
		<tr class="table-group-divider"></tr>
		</tbody>
		</table>
		"""
		    )
		    return table_template.render({"rows": html_rows})


		def create_table_row(
		    column: str,
		    null: int,
		    nunique: int,
		    dtype: pl.DataType,
		    most_frequent_values: Sequence[tuple[Any, int]],
		    total: int,
		) -> str:
		    r"""Create the HTML code of a new table row.

		    Each count is shown with its percentage of ``total``. When
		    ``total`` is zero the percentage is ``nan`` for every cell,
		    including the most frequent values, so no division by zero
		    occurs.

		    Args:
		        column: The column name.
		        null: The number of null values.
		        nunique: The number of unique values.
		        dtype: The data type of the column.
		        most_frequent_values: The most frequent values, as pairs of
		            value and number of occurrences.
		        total: The total number of rows.

		    Returns:
		        The HTML code of a row.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content.summary import create_table_row
		    >>> row = create_table_row(
		    ... column="col",
		    ... null=5,
		    ... nunique=42,
		    ... dtype=pl.Float64(),
		    ... most_frequent_values=[("C", 12), ("A", 5), ("B", 4)],
		    ... total=100,
		    ... )

		    ```
		    """

		    def _percent(count: int) -> str:
		        # Single place for the zero guard so that all the cells agree.
		        return f"{100 * count / total if total else float('nan'):.2f}%"

		    null_cell = f"{null:,} ({_percent(null)})"
		    nunique_cell = f"{nunique:,} ({_percent(nunique)})"
		    values_cell = ", ".join(
		        f"{val} ({_percent(count)})" for val, count in most_frequent_values
		    )
		    return Template(
		        """<tr>
		<th>{{column}}</th>
		<td>{{dtype}}</td>
		<td {{num_style}}>{{null}}</td>
		<td {{num_style}}>{{nunique}}</td>
		<td>{{most_frequent_values}}</td>
		</tr>"""
		    ).render(
		        {
		            "num_style": 'style="text-align: right;"',
		            "column": column,
		            "null": null_cell,
		            "dtype": dtype,
		            "nunique": nunique_cell,
		            "most_frequent_values": values_cell,
		        }
		    )

+53

src/arkas/content/utils.py

		r"""Contain utility functions."""

		from __future__ import annotations

		__all__ = ["float_to_str", "to_str"]

		from typing import Any


		def to_str(value: Any) -> str:
		    r"""Return a string representation of the input value.

		    Integers and floats are formatted with ``float_to_str``. Booleans
		    and all the other values are converted with ``str``.

		    Args:
		        value: The value to encode.

		    Returns:
		        The string representation of the input value.

		    Example usage:

		    ```pycon

		    >>> from arkas.content.utils import to_str
		    >>> to_str(42)
		    '42'
		    >>> to_str(True)
		    'True'

		    ```
		    """
		    # ``bool`` is a subclass of ``int``: without this check ``True``
		    # would go through the numeric formatting and be shown as "1".
		    if isinstance(value, bool):
		        return str(value)
		    if isinstance(value, (int, float)):
		        return float_to_str(value)
		    return str(value)


		def float_to_str(value: float) -> str:
		    r"""Format a number with the ``.4g`` format specification.

		    The output keeps at most four significant digits and switches to
		    the scientific notation when the general format requires it.

		    Args:
		        value: The value to encode.

		    Returns:
		        The string representation of the input value.

		    Example usage:

		    ```pycon

		    >>> from arkas.content.utils import float_to_str
		    >>> float_to_str(42)
		    '42'

		    ```
		    """
		    return format(value, ".4g")

+84

src/arkas/evaluator2/column_correlation.py

		r"""Implement the pairwise column correlation evaluator."""

		from __future__ import annotations

		__all__ = ["ColumnCorrelationEvaluator"]

		from typing import TYPE_CHECKING, Any

		from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping

		from arkas.evaluator2.base import BaseEvaluator
		from arkas.evaluator2.vanilla import Evaluator
		from arkas.metric import pearsonr, spearmanr

		if TYPE_CHECKING:

		from arkas.state.target_dataframe import TargetDataFrameState


		class ColumnCorrelationEvaluator(BaseEvaluator):
		    r"""Evaluate the correlation between the target column and each of
		    the other columns.

		    For each non-target column, the rows with null or NaN values in
		    the pair (column, target column) are dropped, then the outputs of
		    ``pearsonr`` and ``spearmanr`` are merged in a single dictionary.

		    Args:
		        state: The state with the DataFrame to analyze.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.evaluator2 import ColumnCorrelationEvaluator
		    >>> from arkas.state import TargetDataFrameState
		    >>> frame = pl.DataFrame(
		    ... {
		    ... "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ... "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
		    ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ... },
		    ... )
		    >>> evaluator = ColumnCorrelationEvaluator(
		    ... TargetDataFrameState(frame, target_column="col3")
		    ... )
		    >>> evaluator
		    ColumnCorrelationEvaluator(
		    (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )
		    >>> evaluator.evaluate()
		    {'correlation_col1': {'count': 7, 'pearson_coeff': 1.0, 'pearson_pvalue': 0.0, 'spearman_coeff': 1.0, 'spearman_pvalue': 0.0},
		    'correlation_col2': {'count': 7, 'pearson_coeff': -1.0, 'pearson_pvalue': 0.0, 'spearman_coeff': -1.0, 'spearman_pvalue': 0.0}}

		    ```
		    """

		    def __init__(self, state: TargetDataFrameState) -> None:
		        self._state = state

		    def __repr__(self) -> str:
		        body = repr_indent(repr_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def __str__(self) -> str:
		        body = str_indent(str_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def compute(self) -> Evaluator:
		        r"""Return an ``Evaluator`` holding the metrics computed by
		        ``evaluate``."""
		        metrics = self.evaluate()
		        return Evaluator(metrics=metrics)

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if ``other`` is an instance of this class with an
		        equal state."""
		        if isinstance(other, self.__class__):
		            return self._state.equal(other._state, equal_nan=equal_nan)
		        return False

		    def evaluate(self, prefix: str = "", suffix: str = "") -> dict[str, dict]:
		        r"""Compute the correlation metrics of each non-target column.

		        Args:
		            prefix: The prefix added to each key of the output.
		            suffix: The suffix added to each key of the output.

		        Returns:
		            A dictionary with one entry per non-target column. The key
		            is ``{prefix}correlation_{col}{suffix}`` and the value is
		            the merged output of ``pearsonr`` and ``spearmanr``.
		        """
		        target = self._state.target_column
		        others = list(self._state.dataframe.columns)
		        others.remove(target)

		        metrics = {}
		        for name in others:
		            # Keep only the rows where both values are valid.
		            pair = self._state.dataframe.select([name, target])
		            pair = pair.drop_nulls().drop_nans()
		            x = pair[target].to_numpy()
		            y = pair[name].to_numpy()
		            key = f"{prefix}correlation_{name}{suffix}"
		            metrics[key] = pearsonr(x, y) | spearmanr(x, y)
		        return metrics

+79

src/arkas/evaluator2/correlation.py

		r"""Implement the pairwise column correlation evaluator."""

		from __future__ import annotations

		__all__ = ["CorrelationEvaluator"]

		from typing import TYPE_CHECKING, Any

		from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping

		from arkas.evaluator2.base import BaseEvaluator
		from arkas.evaluator2.vanilla import Evaluator
		from arkas.metric import pearsonr, spearmanr
		from arkas.utils.dataframe import check_num_columns

		if TYPE_CHECKING:

		from arkas.state.target_dataframe import DataFrameState


		class CorrelationEvaluator(BaseEvaluator):
		    r"""Evaluate the correlation between the two columns of a
		    DataFrame.

		    The number of columns is verified with ``check_num_columns`` when
		    the evaluator is created.

		    Args:
		        state: The state with the DataFrame to analyze.
		            The DataFrame must have only 2 columns, which are the two
		            columns to analyze.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.evaluator2 import CorrelationEvaluator
		    >>> from arkas.state import DataFrameState
		    >>> frame = pl.DataFrame(
		    ... {
		    ... "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ... "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
		    ... },
		    ... )
		    >>> evaluator = CorrelationEvaluator(DataFrameState(frame))
		    >>> evaluator
		    CorrelationEvaluator(
		    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )
		    >>> evaluator.evaluate()
		    {'count': 7, 'pearson_coeff': 1.0, 'pearson_pvalue': 0.0, 'spearman_coeff': 1.0, 'spearman_pvalue': 0.0}

		    ```
		    """

		    def __init__(self, state: DataFrameState) -> None:
		        check_num_columns(state.dataframe, num_columns=2)
		        self._state = state

		    def __repr__(self) -> str:
		        body = repr_indent(repr_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def __str__(self) -> str:
		        body = str_indent(str_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def compute(self) -> Evaluator:
		        r"""Return an ``Evaluator`` holding the metrics computed by
		        ``evaluate``."""
		        metrics = self.evaluate()
		        return Evaluator(metrics=metrics)

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if ``other`` is an instance of this class with an
		        equal state."""
		        if isinstance(other, self.__class__):
		            return self._state.equal(other._state, equal_nan=equal_nan)
		        return False

		    def evaluate(self, prefix: str = "", suffix: str = "") -> dict[str, float]:
		        r"""Compute the correlation metrics between the two columns.

		        The rows with null or NaN values are dropped before the
		        metrics are computed.

		        Args:
		            prefix: The prefix forwarded to ``pearsonr`` and
		                ``spearmanr``.
		            suffix: The suffix forwarded to ``pearsonr`` and
		                ``spearmanr``.

		        Returns:
		            The merged output of ``pearsonr`` and ``spearmanr``.
		        """
		        valid = self._state.dataframe.drop_nulls().drop_nans()
		        first, second = valid.columns[0], valid.columns[1]
		        x = valid[first].to_numpy()
		        y = valid[second].to_numpy()
		        pearson = pearsonr(x=x, y=y, prefix=prefix, suffix=suffix)
		        spearman = spearmanr(x=x, y=y, prefix=prefix, suffix=suffix)
		        return pearson | spearman

+83

src/arkas/output/column_correlation.py

		r"""Implement an output to analyze the correlation between columns."""

		from __future__ import annotations

		__all__ = ["ColumnCorrelationOutput"]

		from typing import TYPE_CHECKING, Any

		from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping

		from arkas.content.column_correlation import ColumnCorrelationContentGenerator
		from arkas.evaluator2.column_correlation import ColumnCorrelationEvaluator
		from arkas.output.lazy import BaseLazyOutput
		from arkas.plotter.vanilla import Plotter

		if TYPE_CHECKING:
		from arkas.state.target_dataframe import TargetDataFrameState


		class ColumnCorrelationOutput(BaseLazyOutput):
		    r"""Implement an output to analyze the correlation between the
		    target column and the other columns of a DataFrame.

		    The output provides a ``ColumnCorrelationContentGenerator`` and a
		    ``ColumnCorrelationEvaluator`` built from the state, and an empty
		    ``Plotter``.

		    Args:
		        state: The state containing the DataFrame to analyze.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.output import ColumnCorrelationOutput
		    >>> from arkas.state import TargetDataFrameState
		    >>> frame = pl.DataFrame(
		    ... {
		    ... "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ... "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
		    ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ... },
		    ... )
		    >>> output = ColumnCorrelationOutput(TargetDataFrameState(frame, target_column="col3"))
		    >>> output
		    ColumnCorrelationOutput(
		    (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )
		    >>> output.get_content_generator()
		    ColumnCorrelationContentGenerator(
		    (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )
		    >>> output.get_evaluator()
		    ColumnCorrelationEvaluator(
		    (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )
		    >>> output.get_plotter()
		    Plotter(count=0)

		    ```
		    """

		    def __init__(self, state: TargetDataFrameState) -> None:
		        self._state = state

		    def __repr__(self) -> str:
		        body = repr_indent(repr_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def __str__(self) -> str:
		        body = str_indent(str_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if ``other`` is an instance of this class with an
		        equal state."""
		        if isinstance(other, self.__class__):
		            return self._state.equal(other._state, equal_nan=equal_nan)
		        return False

		    def _get_content_generator(self) -> ColumnCorrelationContentGenerator:
		        # The content generator works on the same state as the output.
		        return ColumnCorrelationContentGenerator(self._state)

		    def _get_evaluator(self) -> ColumnCorrelationEvaluator:
		        # The evaluator works on the same state as the output.
		        return ColumnCorrelationEvaluator(self._state)

		    def _get_plotter(self) -> Plotter:
		        # No figure is generated for this output.
		        return Plotter()

+86

src/arkas/output/correlation.py

		r"""Implement an output to analyze the correlation between columns."""

		from __future__ import annotations

		__all__ = ["CorrelationOutput"]

		from typing import TYPE_CHECKING, Any

		from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping

		from arkas.content.correlation import CorrelationContentGenerator
		from arkas.evaluator2.correlation import CorrelationEvaluator
		from arkas.output.lazy import BaseLazyOutput
		from arkas.plotter.correlation import CorrelationPlotter
		from arkas.utils.dataframe import check_num_columns

		if TYPE_CHECKING:
		from arkas.state.dataframe import DataFrameState


		class CorrelationOutput(BaseLazyOutput):
		    r"""Implement an output to analyze the correlation between the
		    two columns of a DataFrame.

		    The number of columns is verified with ``check_num_columns`` when
		    the output is created. The content generator, the evaluator and
		    the plotter are all built from the same state.

		    Args:
		        state: The state containing the DataFrame to analyze.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.output import CorrelationOutput
		    >>> from arkas.state import DataFrameState
		    >>> frame = pl.DataFrame(
		    ... {
		    ... "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ... "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
		    ... },
		    ... )
		    >>> output = CorrelationOutput(DataFrameState(frame))
		    >>> output
		    CorrelationOutput(
		    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )
		    >>> output.get_content_generator()
		    CorrelationContentGenerator(
		    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )
		    >>> output.get_evaluator()
		    CorrelationEvaluator(
		    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )
		    >>> output.get_plotter()
		    CorrelationPlotter(
		    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )

		    ```
		    """

		    def __init__(self, state: DataFrameState) -> None:
		        check_num_columns(state.dataframe, num_columns=2)
		        self._state = state

		    def __repr__(self) -> str:
		        body = repr_indent(repr_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def __str__(self) -> str:
		        body = str_indent(str_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if ``other`` is an instance of this class with an
		        equal state."""
		        if isinstance(other, self.__class__):
		            return self._state.equal(other._state, equal_nan=equal_nan)
		        return False

		    def _get_content_generator(self) -> CorrelationContentGenerator:
		        # The content generator works on the same state as the output.
		        return CorrelationContentGenerator(self._state)

		    def _get_evaluator(self) -> CorrelationEvaluator:
		        # The evaluator works on the same state as the output.
		        return CorrelationEvaluator(self._state)

		    def _get_plotter(self) -> CorrelationPlotter:
		        # The plotter works on the same state as the output.
		        return CorrelationPlotter(self._state)

+83

src/arkas/output/numeric_summary.py

		r"""Implement an output to summarize the numeric columns of a
		DataFrame."""

		from __future__ import annotations

		__all__ = ["NumericSummaryOutput"]

		from typing import TYPE_CHECKING, Any

		from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping

		from arkas.content.numeric_summary import NumericSummaryContentGenerator
		from arkas.evaluator2.vanilla import Evaluator
		from arkas.output.lazy import BaseLazyOutput
		from arkas.plotter.vanilla import Plotter

		if TYPE_CHECKING:
		from arkas.state.dataframe import DataFrameState


		class NumericSummaryOutput(BaseLazyOutput):
		    r"""Implement an output to summarize the numeric columns of a
		    DataFrame.

		    Only the content generator uses the state: the evaluator and the
		    plotter returned by this output are empty.

		    Args:
		        state: The state containing the DataFrame to analyze.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.output import NumericSummaryOutput
		    >>> from arkas.state import DataFrameState
		    >>> frame = pl.DataFrame(
		    ... {
		    ... "col1": [0, 1, 1, 0, 0, 1, 0],
		    ... "col2": [0, 1, 0, 1, 0, 1, 0],
		    ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ... },
		    ... schema={"col1": pl.Int64, "col2": pl.Int32, "col3": pl.Float64},
		    ... )
		    >>> output = NumericSummaryOutput(DataFrameState(frame))
		    >>> output
		    NumericSummaryOutput(
		    (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )
		    >>> output.get_content_generator()
		    NumericSummaryContentGenerator(
		    (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )
		    >>> output.get_evaluator()
		    Evaluator(count=0)
		    >>> output.get_plotter()
		    Plotter(count=0)

		    ```
		    """

		    def __init__(self, state: DataFrameState) -> None:
		        self._state = state

		    def __repr__(self) -> str:
		        body = repr_indent(repr_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def __str__(self) -> str:
		        body = str_indent(str_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if ``other`` is an instance of this class with an
		        equal state."""
		        if isinstance(other, self.__class__):
		            return self._state.equal(other._state, equal_nan=equal_nan)
		        return False

		    def _get_content_generator(self) -> NumericSummaryContentGenerator:
		        # The content generator works on the same state as the output.
		        return NumericSummaryContentGenerator(self._state)

		    def _get_evaluator(self) -> Evaluator:
		        # No metric is computed for this output.
		        return Evaluator()

		    def _get_plotter(self) -> Plotter:
		        # No figure is generated for this output.
		        return Plotter()

+77

src/arkas/output/summary.py

		r"""Implement the DataFrame summary output."""

		from __future__ import annotations

		__all__ = ["SummaryOutput"]

		from typing import TYPE_CHECKING, Any

		from coola import objects_are_equal

		from arkas.content.summary import SummaryContentGenerator
		from arkas.evaluator2.vanilla import Evaluator
		from arkas.output.lazy import BaseLazyOutput
		from arkas.plotter.vanilla import Plotter
		from arkas.utils.validation import check_positive

		if TYPE_CHECKING:
		import polars as pl


		class SummaryOutput(BaseLazyOutput):
		    r"""Implement the DataFrame summary output.

		    The content is generated by a ``SummaryContentGenerator`` created
		    with the same DataFrame and ``top`` value. The evaluator and the
		    plotter returned by this output are empty.

		    Args:
		        frame: The DataFrame to analyze.
		        top: The number of most frequent values to show. The value
		            is validated with ``check_positive``.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.output import SummaryOutput
		    >>> frame = pl.DataFrame(
		    ... {
		    ... "col1": [1.2, 4.2, 4.2, 2.2],
		    ... "col2": [1, 1, 1, 1],
		    ... "col3": [1, 2, 2, 2],
		    ... },
		    ... schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
		    ... )
		    >>> output = SummaryOutput(frame)
		    >>> output
		    SummaryOutput(shape=(4, 3), top=5)
		    >>> output.get_content_generator()
		    SummaryContentGenerator(shape=(4, 3), top=5)
		    >>> output.get_evaluator()
		    Evaluator(count=0)
		    >>> output.get_plotter()
		    Plotter(count=0)

		    ```
		    """

		    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
		        self._frame = frame
		        check_positive(name="top", value=top)
		        self._top = top

		    def __repr__(self) -> str:
		        name = self.__class__.__qualname__
		        shape = self._frame.shape
		        return f"{name}(shape={shape}, top={self._top})"

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if ``other`` is an instance of this class with the
		        same ``top`` value and an equal DataFrame."""
		        if not isinstance(other, self.__class__):
		            return False
		        if not self._top == other._top:
		            return False
		        return objects_are_equal(self._frame, other._frame, equal_nan=equal_nan)

		    def _get_content_generator(self) -> SummaryContentGenerator:
		        # The content generator receives the same inputs as the output.
		        return SummaryContentGenerator(frame=self._frame, top=self._top)

		    def _get_evaluator(self) -> Evaluator:
		        # No metric is computed for this output.
		        return Evaluator()

		    def _get_plotter(self) -> Plotter:
		        # No figure is generated for this output.
		        return Plotter()

+182

src/arkas/plotter/correlation.py

		r"""Contain the implementation of a correlation plotter."""

		from __future__ import annotations

		__all__ = ["BaseFigureCreator", "CorrelationPlotter", "MatplotlibFigureCreator"]

		from abc import ABC, abstractmethod
		from typing import TYPE_CHECKING, Any

		import matplotlib.pyplot as plt
		from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping

		from arkas.figure.creator import FigureCreatorRegistry
		from arkas.figure.html import HtmlFigure
		from arkas.figure.matplotlib import MatplotlibFigure, MatplotlibFigureConfig
		from arkas.figure.utils import MISSING_FIGURE_MESSAGE
		from arkas.plotter.base import BasePlotter
		from arkas.plotter.vanilla import Plotter
		from arkas.utils.dataframe import check_num_columns
		from arkas.utils.range import find_range

		if TYPE_CHECKING:
		from arkas.figure.base import BaseFigure
		from arkas.state.dataframe import DataFrameState


		class BaseFigureCreator(ABC):
		    r"""Define the base class to create a figure from the two columns
		    of a DataFrame.

		    Child classes implement ``create`` for a specific figure backend.
		    """

		    @abstractmethod
		    def create(self, state: DataFrameState) -> BaseFigure:
		        r"""Create a figure from the two columns of the DataFrame.

		        Args:
		            state: The state containing the DataFrame to analyze.
		                The DataFrame must have only 2 columns, which are the
		                two columns to analyze.

		        Returns:
		            The generated figure.

		        Example usage:

		        ```pycon

		        >>> import polars as pl
		        >>> from arkas.figure import MatplotlibFigureConfig
		        >>> from arkas.state import DataFrameState
		        >>> from arkas.plotter.correlation import MatplotlibFigureCreator
		        >>> creator = MatplotlibFigureCreator()
		        >>> frame = pl.DataFrame(
		        ... {
		        ... "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		        ... "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
		        ... },
		        ... )
		        >>> fig = creator.create(DataFrameState(frame))

		        ```
		        """


		class MatplotlibFigureCreator(BaseFigureCreator):
		    r"""Create a matplotlib scatter figure from the two columns of a
		    DataFrame.

		    The first column is used for the x-axis and the second column for
		    the y-axis. If the DataFrame has no row, a ``HtmlFigure`` with the
		    missing figure message is returned instead.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.figure import MatplotlibFigureConfig
		    >>> from arkas.state import DataFrameState
		    >>> from arkas.plotter.correlation import MatplotlibFigureCreator
		    >>> creator = MatplotlibFigureCreator()
		    >>> frame = pl.DataFrame(
		    ... {
		    ... "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ... "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
		    ... },
		    ... )
		    >>> fig = creator.create(DataFrameState(frame))

		    ```
		    """

		    def __repr__(self) -> str:
		        name = self.__class__.__qualname__
		        return f"{name}()"

		    def create(self, state: DataFrameState) -> BaseFigure:
		        frame = state.dataframe
		        # The emptiness check comes before the column count check.
		        if frame.shape[0] == 0:
		            return HtmlFigure(MISSING_FIGURE_MESSAGE)

		        check_num_columns(frame, num_columns=2)
		        xcol, ycol = frame.columns
		        config = state.figure_config

		        fig, ax = plt.subplots(**config.get_arg("init", {}))
		        x = frame[xcol].to_numpy()
		        y = frame[ycol].to_numpy()
		        ax.scatter(x=x, y=y)

		        # The limits are set only if they define a non-empty range.
		        x_low, x_high = find_range(
		            x, xmin=config.get_arg("xmin"), xmax=config.get_arg("xmax")
		        )
		        if x_low < x_high:
		            ax.set_xlim(x_low, x_high)
		        y_low, y_high = find_range(
		            y, xmin=config.get_arg("ymin"), xmax=config.get_arg("ymax")
		        )
		        if y_low < y_high:
		            ax.set_ylim(y_low, y_high)

		        ax.set_xlabel(xcol)
		        ax.set_ylabel(ycol)

		        # The axis scales are optional configuration values.
		        xscale = config.get_arg("xscale")
		        if xscale:
		            ax.set_xscale(xscale)
		        yscale = config.get_arg("yscale")
		        if yscale:
		            ax.set_yscale(yscale)

		        fig.tight_layout()
		        return MatplotlibFigure(fig)


		class CorrelationPlotter(BasePlotter):
		    r"""Implement a plotter that generates the correlation figure of
		    the two columns of a DataFrame.

		    The figure creator is looked up in the class registry with the
		    backend of the figure configuration of the state.

		    Args:
		        state: The state containing the DataFrame to analyze.
		            The DataFrame must have only 2 columns, which are the two
		            columns to analyze.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.plotter import CorrelationPlotter
		    >>> from arkas.state import DataFrameState
		    >>> frame = pl.DataFrame(
		    ... {
		    ... "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ... "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
		    ... },
		    ... )
		    >>> plotter = CorrelationPlotter(DataFrameState(frame))
		    >>> plotter
		    CorrelationPlotter(
		    (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		    )

		    ```
		    """

		    # Figure creators indexed by figure backend.
		    registry = FigureCreatorRegistry[BaseFigureCreator](
		        {MatplotlibFigureConfig.backend(): MatplotlibFigureCreator()}
		    )

		    def __init__(self, state: DataFrameState) -> None:
		        check_num_columns(state.dataframe, num_columns=2)
		        self._state = state

		    def __repr__(self) -> str:
		        body = repr_indent(repr_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def __str__(self) -> str:
		        body = str_indent(str_mapping({"state": self._state}))
		        name = self.__class__.__qualname__
		        return f"{name}(\n {body}\n)"

		    def compute(self) -> Plotter:
		        r"""Return a ``Plotter`` holding the figures generated by
		        ``plot``."""
		        figures = self.plot()
		        return Plotter(figures)

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if ``other`` is an instance of this class with an
		        equal state."""
		        if isinstance(other, self.__class__):
		            return self._state.equal(other._state, equal_nan=equal_nan)
		        return False

		    def plot(self, prefix: str = "", suffix: str = "") -> dict:
		        r"""Generate the correlation figure.

		        Args:
		            prefix: The prefix added to the key of the figure.
		            suffix: The suffix added to the key of the figure.

		        Returns:
		            A dictionary with a single entry whose key is
		            ``{prefix}correlation{suffix}`` and whose value is the
		            generated figure.
		        """
		        backend = self._state.figure_config.backend()
		        creator = self.registry.find_creator(backend)
		        return {f"{prefix}correlation{suffix}": creator.create(self._state)}

+110

src/arkas/state/target_dataframe.py

		r"""Implement DataFrame state with a target column."""

		from __future__ import annotations

		__all__ = ["TargetDataFrameState"]

		import sys
		from typing import TYPE_CHECKING

		from coola.utils.format import repr_mapping_line, str_indent, str_mapping

		from arkas.state.dataframe import DataFrameState
		from arkas.utils.dataframe import check_column_exist

		if sys.version_info >= (3, 11):
		from typing import Self
		else: # pragma: no cover
		from typing_extensions import (
		Self, # use backport because it was added in python 3.11
		)

		if TYPE_CHECKING:
		import polars as pl

		from arkas.figure.base import BaseFigureConfig


		class TargetDataFrameState(DataFrameState):
		    r"""Implement a DataFrame state with a target column.

		    The existence of the target column in the DataFrame is verified
		    with ``check_column_exist`` when the state is created.

		    Args:
		        dataframe: The DataFrame.
		        target_column: The target column in the DataFrame.
		        nan_policy: The policy on how to handle NaN values in the input
		            arrays. The following options are available: ``'omit'``,
		            ``'propagate'``, and ``'raise'``.
		        figure_config: An optional figure configuration.

		    Example usage:

		    ```pycon

		    >>> from datetime import datetime, timezone
		    >>> import polars as pl
		    >>> from arkas.state import TargetDataFrameState
		    >>> frame = pl.DataFrame(
		    ... {
		    ... "col1": [0, 1, 1, 0, 0, 1, 0],
		    ... "col2": [0, 1, 0, 1, 0, 1, 0],
		    ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
		    ... },
		    ... schema={"col1": pl.Int64, "col2": pl.Int32, "col3": pl.Float64},
		    ... )
		    >>> state = TargetDataFrameState(frame, target_column="col3")
		    >>> state
		    TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())

		    ```
		    """

		    def __init__(
		        self,
		        dataframe: pl.DataFrame,
		        target_column: str,
		        nan_policy: str = "propagate",
		        figure_config: BaseFigureConfig | None = None,
		    ) -> None:
		        super().__init__(dataframe=dataframe, nan_policy=nan_policy, figure_config=figure_config)

		        check_column_exist(dataframe, target_column)
		        self._target_column = target_column

		    def __repr__(self) -> str:
		        # Only the shape of the DataFrame is shown, not its content.
		        fields = {
		            "dataframe": self._dataframe.shape,
		            "target_column": self._target_column,
		            "nan_policy": self._nan_policy,
		            "figure_config": self._figure_config,
		        }
		        name = self.__class__.__qualname__
		        return f"{name}({repr_mapping_line(fields)})"

		    def __str__(self) -> str:
		        # Only the shape of the DataFrame is shown, not its content.
		        fields = {
		            "dataframe": self._dataframe.shape,
		            "target_column": self._target_column,
		            "nan_policy": self._nan_policy,
		            "figure_config": self._figure_config,
		        }
		        name = self.__class__.__qualname__
		        return f"{name}({str_indent(str_mapping(fields))})"

		    @property
		    def target_column(self) -> str:
		        r"""The target column in the DataFrame."""
		        return self._target_column

		    def clone(self, deep: bool = True) -> Self:
		        r"""Return a new state with the same arguments.

		        Args:
		            deep: If ``True``, the DataFrame and the figure
		                configuration are cloned, otherwise the same objects
		                are passed to the new state.

		        Returns:
		            The new state.
		        """
		        if deep:
		            dataframe = self._dataframe.clone()
		        else:
		            dataframe = self._dataframe
		        if deep:
		            figure_config = self._figure_config.clone()
		        else:
		            figure_config = self._figure_config
		        return self.__class__(
		            dataframe=dataframe,
		            target_column=self._target_column,
		            nan_policy=self._nan_policy,
		            figure_config=figure_config,
		        )

		    def get_args(self) -> dict:
		        r"""Return the arguments of the parent state extended with the
		        target column."""
		        extra = {"target_column": self._target_column}
		        return super().get_args() | extra

+1

-1

PKG-INFO

		Metadata-Version: 2.1
		Name: arkas
		Version: 0.0.1a10
		Version: 0.0.1a11
		Summary: Library to evaluate ML model performances
		@@ -5,0 +5,0 @@ Home-page: https://github.com/durandtibo/arkas

+1

-1

pyproject.toml

		[tool.poetry]
		name = "arkas"
		version = "0.0.1a10"
		version = "0.0.1a11"
		description = "Library to evaluate ML model performances"
		@@ -5,0 +5,0 @@ readme = "README.md"

+8

-2

src/arkas/analyzer/__init__.py

		@@ -13,9 +13,12 @@ r"""Contain DataFrame analyzers."""
		"ColumnCooccurrenceAnalyzer",
		"ColumnCorrelationAnalyzer",
		"ContentAnalyzer",
		"ContinuousColumnAnalyzer",
		"DataFrameSummaryAnalyzer",
		"CorrelationAnalyzer",
		"MappingAnalyzer",
		"NullValueAnalyzer",
		"NumericSummaryAnalyzer",
		"PlotColumnAnalyzer",
		"ScatterColumnAnalyzer",
		"SummaryAnalyzer",
		"TemporalNullValueAnalyzer",
		@@ -32,13 +35,16 @@ "TemporalPlotColumnAnalyzer",
		from arkas.analyzer.column_cooccurrence import ColumnCooccurrenceAnalyzer
		from arkas.analyzer.column_correlation import ColumnCorrelationAnalyzer
		from arkas.analyzer.columns import BaseTruePredAnalyzer
		from arkas.analyzer.content import ContentAnalyzer
		from arkas.analyzer.continuous_column import ContinuousColumnAnalyzer
		from arkas.analyzer.frame_summary import DataFrameSummaryAnalyzer
		from arkas.analyzer.correlation import CorrelationAnalyzer
		from arkas.analyzer.lazy import BaseInNLazyAnalyzer, BaseLazyAnalyzer
		from arkas.analyzer.mapping import MappingAnalyzer
		from arkas.analyzer.null_value import NullValueAnalyzer
		from arkas.analyzer.numeric_summary import NumericSummaryAnalyzer
		from arkas.analyzer.plot_column import PlotColumnAnalyzer
		from arkas.analyzer.scatter_column import ScatterColumnAnalyzer
		from arkas.analyzer.summary import SummaryAnalyzer
		from arkas.analyzer.temporal_null_value import TemporalNullValueAnalyzer
		from arkas.analyzer.temporal_plot_column import TemporalPlotColumnAnalyzer
		from arkas.analyzer.transform import TransformAnalyzer

+4

-4

src/arkas/analyzer/lazy.py

		@@ -40,6 +40,6 @@ r"""Define a base class to implement lazy analyzers."""
		>>> import polars as pl
		>>> from arkas.analyzer import DataFrameSummaryAnalyzer
		>>> analyzer = DataFrameSummaryAnalyzer()
		>>> from arkas.analyzer import SummaryAnalyzer
		>>> analyzer = SummaryAnalyzer()
		>>> analyzer
		DataFrameSummaryAnalyzer(top=5, sort=False)
		SummaryAnalyzer(top=5, sort=False)
		>>> frame = pl.DataFrame(
		@@ -55,3 +55,3 @@ ... {
		>>> output
		DataFrameSummaryOutput(shape=(4, 3), top=5)
		SummaryOutput(shape=(4, 3), top=5)

		@@ -58,0 +58,0 @@ ```

+1

-1

src/arkas/analyzer/plot_column.py

		@@ -65,3 +65,3 @@ r"""Implement an analyzer that plots the content of each column."""
		PlotColumnOutput(
		(state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig())
		(state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -68,0 +68,0 @@

+1

-1

src/arkas/analyzer/scatter_column.py

		@@ -56,3 +56,3 @@ r"""Implement an analyzer that plots the content of each column."""
		ScatterColumnOutput(
		(state): ScatterDataFrameState(dataframe=(4, 2), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
		(state): ScatterDataFrameState(dataframe=(4, 2), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -59,0 +59,0 @@

+1

-1

src/arkas/analyzer/temporal_null_value.py

		@@ -80,3 +80,3 @@ r"""Implement an analyzer that plots the content of each column."""
		TemporalNullValueOutput(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -83,0 +83,0 @@

+1

-1

src/arkas/analyzer/temporal_plot_column.py

		@@ -79,3 +79,3 @@ r"""Implement an analyzer that plots the content of each column."""
		TemporalPlotColumnOutput(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -82,0 +82,0 @@

+8

-2

src/arkas/content/__init__.py

		@@ -10,9 +10,12 @@ r"""Contain HTML content generators."""
		"ColumnCooccurrenceContentGenerator",
		"ColumnCorrelationContentGenerator",
		"ContentGenerator",
		"ContentGeneratorDict",
		"ContinuousSeriesContentGenerator",
		"DataFrameSummaryContentGenerator",
		"CorrelationContentGenerator",
		"NullValueContentGenerator",
		"NumericSummaryContentGenerator",
		"PlotColumnContentGenerator",
		"ScatterColumnContentGenerator",
		"SummaryContentGenerator",
		"TemporalNullValueContentGenerator",
		@@ -26,10 +29,13 @@ "TemporalPlotColumnContentGenerator",
		from arkas.content.column_cooccurrence import ColumnCooccurrenceContentGenerator
		from arkas.content.column_correlation import ColumnCorrelationContentGenerator
		from arkas.content.continuous_series import ContinuousSeriesContentGenerator
		from arkas.content.frame_summary import DataFrameSummaryContentGenerator
		from arkas.content.correlation import CorrelationContentGenerator
		from arkas.content.mapping import ContentGeneratorDict
		from arkas.content.null_value import NullValueContentGenerator
		from arkas.content.numeric_summary import NumericSummaryContentGenerator
		from arkas.content.plot_column import PlotColumnContentGenerator
		from arkas.content.scatter_column import ScatterColumnContentGenerator
		from arkas.content.summary import SummaryContentGenerator
		from arkas.content.temporal_null_value import TemporalNullValueContentGenerator
		from arkas.content.temporal_plot_column import TemporalPlotColumnContentGenerator
		from arkas.content.vanilla import ContentGenerator

+1

-1

src/arkas/content/column_cooccurrence.py

		@@ -102,3 +102,3 @@ r"""Contain the implementation of a HTML content generator that returns

		>>> from arkas.content.frame_summary import create_template
		>>> from arkas.content.summary import create_template
		>>> template = create_template()
		@@ -105,0 +105,0 @@

+1

-1

src/arkas/content/continuous_series.py

		@@ -78,3 +78,3 @@ r"""Contain the implementation of a HTML content generator that analyzes
		xmin=self._state.figure_config.get_arg("xmin"),
		xmax=self._state.figure_config.get_arg("xmin"),
		xmax=self._state.figure_config.get_arg("xmax"),
		)
		@@ -81,0 +81,0 @@ return Template(create_template()).render(

+1

-1

src/arkas/content/plot_column.py

		@@ -49,3 +49,3 @@ r"""Contain the implementation of a HTML content generator that plots
		PlotColumnContentGenerator(
		(state): DataFrameState(dataframe=(7, 3), figure_config=MatplotlibFigureConfig())
		(state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -52,0 +52,0 @@

+1

-1

src/arkas/content/scatter_column.py

		@@ -51,3 +51,3 @@ r"""Contain the implementation of a HTML content generator that plots
		ScatterColumnContentGenerator(
		(state): ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
		(state): ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -54,0 +54,0 @@

+1

-1

src/arkas/content/temporal_null_value.py

		@@ -67,3 +67,3 @@ r"""Contain the implementation of a HTML content generator that analyzes
		TemporalNullValueContentGenerator(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -70,0 +70,0 @@

+1

-1

src/arkas/content/temporal_plot_column.py

		@@ -64,3 +64,3 @@ r"""Contain the implementation of a HTML content generator that plots
		TemporalPlotColumnContentGenerator(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -67,0 +67,0 @@

+4

-0

src/arkas/evaluator2/__init__.py

		@@ -10,2 +10,4 @@ r"""Contain data evaluators."""
		"ColumnCooccurrenceEvaluator",
		"ColumnCorrelationEvaluator",
		"CorrelationEvaluator",
		"Evaluator",
		@@ -19,3 +21,5 @@ "EvaluatorDict",
		from arkas.evaluator2.column_cooccurrence import ColumnCooccurrenceEvaluator
		from arkas.evaluator2.column_correlation import ColumnCorrelationEvaluator
		from arkas.evaluator2.correlation import CorrelationEvaluator
		from arkas.evaluator2.mapping import EvaluatorDict
		from arkas.evaluator2.vanilla import Evaluator

+8

-2

src/arkas/output/__init__.py

		@@ -11,7 +11,9 @@ r"""Contain data outputs."""
		"ColumnCooccurrenceOutput",
		"ColumnCorrelationOutput",
		"ContentOutput",
		"ContinuousSeriesOutput",
		"DataFrameSummaryOutput",
		"CorrelationOutput",
		"EmptyOutput",
		"NullValueOutput",
		"NumericSummaryOutput",
		"Output",
		@@ -21,2 +23,3 @@ "OutputDict",
		"ScatterColumnOutput",
		"SummaryOutput",
		"TemporalNullValueOutput",
		@@ -30,13 +33,16 @@ "TemporalPlotColumnOutput",
		from arkas.output.column_cooccurrence import ColumnCooccurrenceOutput
		from arkas.output.column_correlation import ColumnCorrelationOutput
		from arkas.output.content import ContentOutput
		from arkas.output.continuous_series import ContinuousSeriesOutput
		from arkas.output.correlation import CorrelationOutput
		from arkas.output.empty import EmptyOutput
		from arkas.output.frame_summary import DataFrameSummaryOutput
		from arkas.output.lazy import BaseLazyOutput
		from arkas.output.mapping import OutputDict
		from arkas.output.null_value import NullValueOutput
		from arkas.output.numeric_summary import NumericSummaryOutput
		from arkas.output.plot_column import PlotColumnOutput
		from arkas.output.scatter_column import ScatterColumnOutput
		from arkas.output.summary import SummaryOutput
		from arkas.output.temporal_null_value import TemporalNullValueOutput
		from arkas.output.temporal_plot_column import TemporalPlotColumnOutput
		from arkas.output.vanilla import Output

+3

-3

src/arkas/output/plot_column.py

		@@ -44,7 +44,7 @@ r"""Implement an output to plot each column of a DataFrame."""
		PlotColumnOutput(
		(state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig())
		(state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		>>> output.get_content_generator()
		PlotColumnContentGenerator(
		(state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig())
		(state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -55,3 +55,3 @@ >>> output.get_evaluator()
		PlotColumnPlotter(
		(state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig())
		(state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -58,0 +58,0 @@

+4

-4

src/arkas/output/scatter_column.py

		@@ -17,3 +17,3 @@ r"""Implement an output to scatter plot some columns."""
		if TYPE_CHECKING:
		from arkas.state.temporal_dataframe import ScatterDataFrameState
		from arkas.state.scatter_dataframe import ScatterDataFrameState

		@@ -45,7 +45,7 @@
		ScatterColumnOutput(
		(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
		(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		>>> output.get_content_generator()
		ScatterColumnContentGenerator(
		(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
		(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -56,3 +56,3 @@ >>> output.get_evaluator()
		ScatterColumnPlotter(
		(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
		(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -59,0 +59,0 @@

+3

-3

src/arkas/output/temporal_null_value.py

		@@ -60,7 +60,7 @@ r"""Implement an output to analyze the number of null values in a
		TemporalNullValueOutput(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		>>> output.get_content_generator()
		TemporalNullValueContentGenerator(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -71,3 +71,3 @@ >>> output.get_evaluator()
		TemporalNullValuePlotter(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -74,0 +74,0 @@

+3

-3

src/arkas/output/temporal_plot_column.py

		@@ -60,7 +60,7 @@ r"""Implement an output to plot each column of a DataFrame along a
		TemporalPlotColumnOutput(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		>>> output.get_content_generator()
		TemporalPlotColumnContentGenerator(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -71,3 +71,3 @@ >>> output.get_evaluator()
		TemporalPlotColumnPlotter(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -74,0 +74,0 @@

+2

-0

src/arkas/plotter/__init__.py

		@@ -9,2 +9,3 @@ r"""Contain data plotters."""
		"ContinuousSeriesPlotter",
		"CorrelationPlotter",
		"NullValuePlotter",
		@@ -22,2 +23,3 @@ "PlotColumnPlotter",
		from arkas.plotter.continuous_series import ContinuousSeriesPlotter
		from arkas.plotter.correlation import CorrelationPlotter
		from arkas.plotter.mapping import PlotterDict
		@@ -24,0 +26,0 @@ from arkas.plotter.null_value import NullValuePlotter

+1

-1

src/arkas/plotter/plot_column.py

		@@ -131,3 +131,3 @@ r"""Contain the implementation of a DataFrame column plotter."""
		PlotColumnPlotter(
		(state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig())
		(state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -134,0 +134,0 @@

+2

-2

src/arkas/plotter/scatter_column.py

		@@ -35,3 +35,3 @@ r"""Contain the implementation of a DataFrame column plotter."""
		Args:
		state: The state containing the DataFrame to analyze.
		state: The state containing the DataFrame to analyze.

		@@ -153,3 +153,3 @@ Returns:
		ScatterColumnPlotter(
		(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color='col3', figure_config=MatplotlibFigureConfig())
		(state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -156,0 +156,0 @@

+1

-1

src/arkas/plotter/temporal_null_value.py

		@@ -178,3 +178,3 @@ r"""Contain the implementation of a DataFrame column plotter."""
		TemporalNullValuePlotter(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -181,0 +181,0 @@

+1

-1

src/arkas/plotter/temporal_plot_column.py

		@@ -186,3 +186,3 @@ r"""Contain the implementation of a DataFrame column plotter."""
		TemporalPlotColumnPlotter(
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		(state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())
		)
		@@ -189,0 +189,0 @@

+2

-0

src/arkas/state/__init__.py

		@@ -14,2 +14,3 @@ r"""Contain states."""
		"SeriesState",
		"TargetDataFrameState",
		"TemporalDataFrameState",
		@@ -26,2 +27,3 @@ ]
		from arkas.state.series import SeriesState
		from arkas.state.target_dataframe import TargetDataFrameState
		from arkas.state.temporal_dataframe import TemporalDataFrameState

+16

-1

src/arkas/state/dataframe.py

		@@ -14,2 +14,3 @@ r"""Implement the DataFrame state."""
		from arkas.figure.utils import get_default_config
		from arkas.metric.utils import check_nan_policy
		from arkas.state.base import BaseState
		@@ -35,2 +36,5 @@
		dataframe: The DataFrame.
		nan_policy: The policy on how to handle NaN values in the input
		arrays. The following options are available: ``'omit'``,
		``'propagate'``, and ``'raise'``.
		figure_config: An optional figure configuration.
		@@ -53,3 +57,3 @@
		>>> state
		DataFrameState(dataframe=(7, 3), figure_config=MatplotlibFigureConfig())
		DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())

		@@ -62,5 +66,8 @@ ```
		dataframe: pl.DataFrame,
		nan_policy: str = "propagate",
		figure_config: BaseFigureConfig \| None = None,
		) -> None:
		self._dataframe = dataframe
		check_nan_policy(nan_policy)
		self._nan_policy = nan_policy
		self._figure_config = figure_config or get_default_config()
		@@ -72,2 +79,3 @@
		"dataframe": self._dataframe.shape,
		"nan_policy": self._nan_policy,
		"figure_config": self._figure_config,
		@@ -83,2 +91,3 @@ }
		"dataframe": self._dataframe.shape,
		"nan_policy": self._nan_policy,
		"figure_config": self._figure_config,
		@@ -95,2 +104,6 @@ }
		@property
		def nan_policy(self) -> str:
		return self._nan_policy

		@property
		def figure_config(self) -> BaseFigureConfig \| None:
		@@ -102,2 +115,3 @@ return self._figure_config
		dataframe=self._dataframe.clone() if deep else self._dataframe,
		nan_policy=self._nan_policy,
		figure_config=self._figure_config.clone() if deep else self._figure_config,
		@@ -114,3 +128,4 @@ )
		"dataframe": self._dataframe,
		"nan_policy": self._nan_policy,
		"figure_config": self._figure_config,
		}

+9

-2

src/arkas/state/scatter_dataframe.py

		@@ -36,2 +36,5 @@ r"""Implement the DataFrame state for scatter plots."""
		color: An optional color axis data column.
		nan_policy: The policy on how to handle NaN values in the input
		arrays. The following options are available: ``'omit'``,
		``'propagate'``, and ``'raise'``.
		figure_config: An optional figure configuration.
		@@ -54,3 +57,3 @@
		>>> state
		ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig())
		ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())

		@@ -66,5 +69,6 @@ ```
		color: str \| None = None,
		nan_policy: str = "propagate",
		figure_config: BaseFigureConfig \| None = None,
		) -> None:
		super().__init__(dataframe=dataframe, figure_config=figure_config)
		super().__init__(dataframe=dataframe, nan_policy=nan_policy, figure_config=figure_config)

		@@ -86,2 +90,3 @@ check_column_exist(dataframe, x)
		"color": self._color,
		"nan_policy": self._nan_policy,
		"figure_config": self._figure_config,
		@@ -100,2 +105,3 @@ }
		"color": self._color,
		"nan_policy": self._nan_policy,
		"figure_config": self._figure_config,
		@@ -125,2 +131,3 @@ }
		color=self._color,
		nan_policy=self._nan_policy,
		figure_config=self._figure_config.clone() if deep else self._figure_config,
		@@ -127,0 +134,0 @@ )

+9

-2

src/arkas/state/temporal_dataframe.py

		@@ -35,2 +35,5 @@ r"""Implement the temporal DataFrame state."""
		period: An optional temporal period e.g. monthly or daily.
		nan_policy: The policy on how to handle NaN values in the input
		arrays. The following options are available: ``'omit'``,
		``'propagate'``, and ``'raise'``.
		figure_config: An optional figure configuration.
		@@ -66,3 +69,3 @@
		>>> state
		TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
		TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig())

		@@ -77,5 +80,6 @@ ```
		period: str \| None = None,
		nan_policy: str = "propagate",
		figure_config: BaseFigureConfig \| None = None,
		) -> None:
		super().__init__(dataframe=dataframe, figure_config=figure_config)
		super().__init__(dataframe=dataframe, nan_policy=nan_policy, figure_config=figure_config)

		@@ -92,2 +96,3 @@ check_column_exist(dataframe, temporal_column)
		"period": self._period,
		"nan_policy": self._nan_policy,
		"figure_config": self._figure_config,
		@@ -105,2 +110,3 @@ }
		"period": self._period,
		"nan_policy": self._nan_policy,
		"figure_config": self._figure_config,
		@@ -125,2 +131,3 @@ }
		period=self._period,
		nan_policy=self._nan_policy,
		figure_config=self._figure_config.clone() if deep else self._figure_config,
		@@ -127,0 +134,0 @@ )

+58

-21

src/arkas/utils/dataframe.py

		@@ -5,3 +5,3 @@ r"""Contain DataFrame utility functions."""

		__all__ = ["to_arrays"]
		__all__ = ["check_column_exist", "check_num_columns", "to_arrays"]

		@@ -16,10 +16,11 @@

		def to_arrays(frame: pl.DataFrame) -> dict[str, np.ndarray]:
		r"""Convert a ``polars.DataFrame`` to a dictionary of NumPy arrays.
		def check_column_exist(frame: pl.DataFrame, col: str) -> None:
		r"""Check if a column exists in the DataFrame.

		Args:
		frame: The DataFrame to convert.
		frame: The DataFrame.
		col: The column to check.

		Returns:
		A dictionary of NumPy arrays.
		Raises:
		ValueError: if the column is missing.

		@@ -31,3 +32,3 @@ Example usage:
		>>> import polars as pl
		>>> from arkas.utils.dataframe import to_arrays
		>>> from arkas.utils.dataframe import check_column_exist
		>>> frame = pl.DataFrame(
		@@ -41,22 +42,21 @@ ... {
		... )
		>>> data = to_arrays(frame)
		>>> data
		{'int': array([1, 2, 3, 4, 5]),
		'float': array([5., 4., 3., 2., 1.]),
		'str': array(['a', 'b', 'c', 'd', 'e'], dtype=object)}
		>>> check_column_exist(frame, "int")

		```
		"""
		return {s.name: s.to_numpy() for s in frame.iter_columns()}
		if col not in frame:
		msg = f"The column {col!r} is not in the DataFrame: {sorted(frame.columns)}"
		raise ValueError(msg)


		def check_column_exist(frame: pl.DataFrame, col: str) -> None:
		r"""Check if a column exists in the DataFrame.
		def check_num_columns(frame: pl.DataFrame, num_columns: int) -> None:
		r"""Check if the DataFrame has the expected number of columns.

		Args:
		frame: The DataFrame.
		col: The column to check.
		num_columns: The expected number of columns.

		Raises:
		ValueError: if the column is missing.
		ValueError: if the DataFrame does not have the expected number of
		columns.

		@@ -68,3 +68,3 @@ Example usage:
		>>> import polars as pl
		>>> from arkas.utils.dataframe import check_column_exist
		>>> from arkas.utils.dataframe import check_num_columns
		>>> frame = pl.DataFrame(
		@@ -78,8 +78,45 @@ ... {
		... )
		>>> check_column_exist(frame, "int")
		>>> check_num_columns(frame, num_columns=3)

		```
		"""
		if col not in frame:
		msg = f"The column {col!r} is not in the DataFrame: {sorted(frame.columns)}"
		if frame.shape[1] != num_columns:
		msg = (
		f"The DataFrame must have {num_columns:,} columns but received a DataFrame of "
		f"shape {frame.shape}"
		)
		raise ValueError(msg)


		def to_arrays(frame: pl.DataFrame) -> dict[str, np.ndarray]:
		    r"""Build a dictionary of NumPy arrays from a ``polars.DataFrame``.

		    Each column of the DataFrame becomes one item: the key is the
		    column name and the value is the result of ``to_numpy()`` on
		    that column. The items follow the order of
		    ``frame.iter_columns()``.

		    Args:
		        frame: The DataFrame to convert.

		    Returns:
		        A dictionary of NumPy arrays.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.utils.dataframe import to_arrays
		    >>> frame = pl.DataFrame(
		    ...     {
		    ...         "int": [1, 2, 3, 4, 5],
		    ...         "float": [5.0, 4.0, 3.0, 2.0, 1.0],
		    ...         "str": ["a", "b", "c", "d", "e"],
		    ...     },
		    ...     schema={"int": pl.Int64, "float": pl.Float64, "str": pl.String},
		    ... )
		    >>> data = to_arrays(frame)
		    >>> data
		    {'int': array([1, 2, 3, 4, 5]),
		     'float': array([5., 4., 3., 2., 1.]),
		     'str': array(['a', 'b', 'c', 'd', 'e'], dtype=object)}

		    ```
		    """
		    arrays = {}
		    for series in frame.iter_columns():
		        column = series.name
		        arrays[column] = series.to_numpy()
		    return arrays

-63

src/arkas/analyzer/frame_summary.py

		r"""Implement an analyzer that generates a summary of the DataFrame."""

		from __future__ import annotations

		__all__ = ["DataFrameSummaryAnalyzer"]

		import logging
		from typing import TYPE_CHECKING

		from arkas.analyzer.lazy import BaseLazyAnalyzer
		from arkas.output.frame_summary import DataFrameSummaryOutput
		from arkas.utils.validation import check_positive

		if TYPE_CHECKING:
		import polars as pl

		logger = logging.getLogger(__name__)


		class DataFrameSummaryAnalyzer(BaseLazyAnalyzer):
		    r"""Implement an analyzer that produces a summary of the DataFrame.

		    Args:
		        top: The number of most frequent values to show. The value
		            is validated with ``check_positive``.
		        sort: If ``True``, the columns are sorted by alphabetical
		            order before the output is created.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.analyzer import DataFrameSummaryAnalyzer
		    >>> analyzer = DataFrameSummaryAnalyzer()
		    >>> analyzer
		    DataFrameSummaryAnalyzer(top=5, sort=False)
		    >>> frame = pl.DataFrame(
		    ...     {
		    ...         "col1": [0, 1, 0, 1],
		    ...         "col2": [1, 0, 1, 0],
		    ...         "col3": [1, 1, 1, 1],
		    ...     },
		    ...     schema={"col1": pl.Int64, "col2": pl.Int64, "col3": pl.Int64},
		    ... )
		    >>> output = analyzer.analyze(frame)
		    >>> output
		    DataFrameSummaryOutput(shape=(4, 3), top=5)

		    ```
		    """

		    def __init__(self, top: int = 5, sort: bool = False) -> None:
		        check_positive(name="top", value=top)
		        self._top = top
		        self._sort = bool(sort)

		    def __repr__(self) -> str:
		        r"""Return the class name followed by the ``top`` and ``sort``
		        arguments."""
		        name = self.__class__.__qualname__
		        return f"{name}(top={self._top:,}, sort={self._sort})"

		    def _analyze(self, frame: pl.DataFrame) -> DataFrameSummaryOutput:
		        r"""Wrap the DataFrame in a ``DataFrameSummaryOutput``.

		        Args:
		            frame: The DataFrame to analyze.

		        Returns:
		            The output created from the DataFrame (with its columns
		            reordered alphabetically when ``sort`` is enabled) and
		            the configured ``top`` value.
		        """
		        logger.info("Analyzing the DataFrame...")
		        if not self._sort:
		            return DataFrameSummaryOutput(frame=frame, top=self._top)
		        ordered = frame.select(sorted(frame.columns))
		        return DataFrameSummaryOutput(frame=ordered, top=self._top)

-301

src/arkas/content/frame_summary.py

		r"""Contain the implementation of a HTML content generator that returns
		a summary of a DataFrame."""

		from __future__ import annotations

		__all__ = [
		"DataFrameSummaryContentGenerator",
		"create_table",
		"create_table_row",
		"create_template",
		]

		import logging
		from collections import Counter
		from typing import TYPE_CHECKING, Any

		from coola import objects_are_equal
		from grizz.utils.count import compute_nunique
		from grizz.utils.null import compute_null_count
		from jinja2 import Template

		from arkas.content.section import BaseSectionContentGenerator
		from arkas.utils.validation import check_positive

		if TYPE_CHECKING:
		from collections.abc import Sequence

		import polars as pl

		logger = logging.getLogger(__name__)


		class DataFrameSummaryContentGenerator(BaseSectionContentGenerator):
		    r"""Implement a content generator that summarizes a DataFrame.

		    Args:
		        frame: The DataFrame to analyze.
		        top: The number of most frequent values to show. The value
		            is validated with ``check_positive``.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content import DataFrameSummaryContentGenerator
		    >>> content = DataFrameSummaryContentGenerator(
		    ...     frame=pl.DataFrame(
		    ...         {
		    ...             "col1": [1.2, 4.2, 4.2, 2.2],
		    ...             "col2": [1, 1, 1, 1],
		    ...             "col3": [1, 2, 2, 2],
		    ...         },
		    ...         schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
		    ...     )
		    ... )
		    >>> content
		    DataFrameSummaryContentGenerator(shape=(4, 3), top=5)

		    ```
		    """

		    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
		        self._frame = frame
		        check_positive(name="top", value=top)
		        self._top = top

		    def __repr__(self) -> str:
		        r"""Return the class name with the DataFrame shape and ``top``."""
		        name = self.__class__.__qualname__
		        return f"{name}(shape={self._frame.shape}, top={self._top})"

		    @property
		    def frame(self) -> pl.DataFrame:
		        r"""The DataFrame to analyze."""
		        return self._frame

		    @property
		    def top(self) -> int:
		        r"""The number of most frequent values to show."""
		        return self._top

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if another object is equal to this generator.

		        Args:
		            other: The object to compare with.
		            equal_nan: Forwarded to ``objects_are_equal`` when the
		                two DataFrames are compared.

		        Returns:
		            ``False`` if ``other`` is not an instance of this class
		            or if the ``top`` values differ, otherwise the result of
		            comparing the two DataFrames.
		        """
		        if not isinstance(other, self.__class__):
		            return False
		        if not self.top == other.top:
		            return False
		        return objects_are_equal(self.frame, other.frame, equal_nan=equal_nan)

		    def get_columns(self) -> tuple[str, ...]:
		        r"""Return the column names as a tuple."""
		        names = self._frame.columns
		        return tuple(names)

		    def get_null_count(self) -> tuple[int, ...]:
		        r"""Return the output of ``compute_null_count`` as a tuple."""
		        counts = compute_null_count(self._frame)
		        return tuple(counts.tolist())

		    def get_nunique(self) -> tuple[int, ...]:
		        r"""Return the output of ``compute_nunique`` as a tuple."""
		        counts = compute_nunique(self._frame)
		        return tuple(counts.tolist())

		    def get_dtypes(self) -> tuple[pl.DataType, ...]:
		        r"""Return the data types of the schema as a tuple."""
		        schema = self._frame.schema
		        return tuple(schema.dtypes())

		    def get_most_frequent_values(self, top: int = 5) -> tuple[tuple[tuple[Any, int], ...], ...]:
		        r"""Return the most frequent values of each column.

		        Args:
		            top: The maximum number of ``(value, count)`` pairs to
		                keep for each column.

		        Returns:
		            One tuple per column, made of the ``(value, count)``
		            pairs returned by ``Counter.most_common``.
		        """
		        per_column = []
		        for series in self.frame:
		            counter = Counter(series.to_list())
		            per_column.append(tuple(counter.most_common(top)))
		        return tuple(per_column)

		    def generate_content(self) -> str:
		        r"""Render the content template with the summary table and the
		        number of rows and columns.

		        Returns:
		            The rendered content.
		        """
		        logger.info("Generating the DataFrame summary content...")
		        template = Template(create_template())
		        context = {
		            "table": self._create_table(),
		            "nrows": f"{self._frame.shape[0]:,}",
		            "ncols": f"{self._frame.shape[1]:,}",
		        }
		        return template.render(context)

		    def _create_table(self) -> str:
		        r"""Collect the per-column statistics and pass them to
		        ``create_table``."""
		        names = self.get_columns()
		        nulls = self.get_null_count()
		        uniques = self.get_nunique()
		        kinds = self.get_dtypes()
		        frequent = self.get_most_frequent_values(top=self._top)
		        nrows = self._frame.shape[0]
		        return create_table(
		            columns=names,
		            null_count=nulls,
		            nunique=uniques,
		            dtypes=kinds,
		            most_frequent_values=frequent,
		            total=nrows,
		        )


		def create_template() -> str:
		    r"""Return the template of the DataFrame summary section.

		    The template has three placeholders: ``{{ncols}}``,
		    ``{{nrows}}`` and ``{{table}}``.

		    Returns:
		        The content template.

		    Example usage:

		    ```pycon

		    >>> from arkas.content.frame_summary import create_template
		    >>> template = create_template()

		    ```
		    """
		    template = """This section shows a short summary of each column.

		<ul>
		<li> <b>column</b>: are the column names</li>
		<li> <b>types</b>: are the object types for the objects in the column </li>
		<li> <b>null</b>: are the number (and percentage) of null values in the column </li>
		<li> <b>unique</b>: are the number (and percentage) of unique values in the column </li>
		</ul>

		<p style="margin-top: 1rem;">
		<b>General statistics about the DataFrame</b>

		<ul>
		<li> number of columns: {{ncols}} </li>
		<li> number of rows: {{nrows}}</li>
		</ul>

		{{table}}
		"""
		    return template


		def create_table(
		    columns: Sequence[str],
		    null_count: Sequence[int],
		    nunique: Sequence[int],
		    dtypes: Sequence[pl.DataType],
		    most_frequent_values: Sequence[Sequence[tuple[Any, int]]],
		    total: int,
		) -> str:
		    r"""Return a HTML representation of the summary table, with
		    one row per column of the DataFrame.

		    Each row is generated by ``create_table_row``.

		    Args:
		        columns: The column names.
		        null_count: The number of null values for each column.
		        nunique: The number of unique values for each column.
		        dtypes: The data type for each column.
		        most_frequent_values: The most frequent values for each column.
		        total: The total number of rows.

		    Returns:
		        The HTML representation of the table.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content.frame_summary import create_table
		    >>> table = create_table(
		    ...     columns=["float", "int", "str"],
		    ...     null_count=(1, 0, 2),
		    ...     nunique=(5, 2, 4),
		    ...     dtypes=(pl.Float64(), pl.Int64(), pl.String()),
		    ...     most_frequent_values=(
		    ...         ((2.2, 2), (1.2, 1), (4.2, 1), (None, 1), (1.0, 1)),
		    ...         ((1, 5), (0, 1)),
		    ...         (("B", 2), (None, 2), ("A", 1), ("C", 1)),
		    ...     ),
		    ...     total=42,
		    ... )

		    ```
		    """
		    # ``zip`` stops at the shortest input, so the per-column
		    # sequences are expected to have the same length.
		    rows = "\n".join(
		        create_table_row(
		            column=column,
		            null=null,
		            dtype=dtype,
		            nunique=nuniq,
		            most_frequent_values=mf_values,
		            total=total,
		        )
		        for column, null, nuniq, dtype, mf_values in zip(
		            columns, null_count, nunique, dtypes, most_frequent_values
		        )
		    )
		    return Template(
		        """<table class="table table-hover table-responsive w-auto" >
		<thead class="thead table-group-divider">
		<tr>
		<th>column</th>
		<th>types</th>
		<th>null</th>
		<th>unique</th>
		<th>most frequent values</th>
		</tr>
		</thead>
		<tbody class="tbody table-group-divider">
		{{rows}}
		<tr class="table-group-divider"></tr>
		</tbody>
		</table>
		"""
		    ).render({"rows": rows})


		def create_table_row(
		    column: str,
		    null: int,
		    nunique: int,
		    dtype: pl.DataType,
		    most_frequent_values: Sequence[tuple[Any, int]],
		    total: int,
		) -> str:
		    r"""Create the HTML code of a new table row.

		    All the percentages are computed relative to ``total``. If
		    ``total`` is ``0``, every percentage is shown as ``nan``
		    (including those of the most frequent values) instead of
		    raising a ``ZeroDivisionError``.

		    Args:
		        column: The column name.
		        null: The number of null values.
		        nunique: The number of unique values.
		        dtype: The data type of the column.
		        most_frequent_values: The most frequent values, as
		            ``(value, count)`` pairs.
		        total: The total number of rows.

		    Returns:
		        The HTML code of a row.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.content.frame_summary import create_table_row
		    >>> row = create_table_row(
		    ...     column="col",
		    ...     null=5,
		    ...     nunique=42,
		    ...     dtype=pl.Float64(),
		    ...     most_frequent_values=[("C", 12), ("A", 5), ("B", 4)],
		    ...     total=100,
		    ... )

		    ```
		    """

		    def percentage(count: int) -> float:
		        # A single guard for the zero-row case keeps the three
		        # cells of the row consistent with each other.
		        return 100 * count / total if total else float("nan")

		    # New local names are used on purpose: the parameters keep
		    # their documented types instead of being rebound to strings.
		    null_cell = f"{null:,} ({percentage(null):.2f}%)"
		    nunique_cell = f"{nunique:,} ({percentage(nunique):.2f}%)"
		    frequent_cell = ", ".join(
		        f"{value} ({percentage(count):.2f}%)" for value, count in most_frequent_values
		    )
		    # NOTE(review): the values are handed to the template as-is;
		    # whether they get HTML-escaped depends on how ``Template``
		    # is configured - confirm if column names/values can be
		    # untrusted.
		    return Template(
		        """<tr>
		<th>{{column}}</th>
		<td>{{dtype}}</td>
		<td {{num_style}}>{{null}}</td>
		<td {{num_style}}>{{nunique}}</td>
		<td>{{most_frequent_values}}</td>
		</tr>"""
		    ).render(
		        {
		            "num_style": 'style="text-align: right;"',
		            "column": column,
		            "null": null_cell,
		            "dtype": dtype,
		            "nunique": nunique_cell,
		            "most_frequent_values": frequent_cell,
		        }
		    )

-77

src/arkas/output/frame_summary.py

		r"""Implement the DataFrame summary output."""

		from __future__ import annotations

		__all__ = ["DataFrameSummaryOutput"]

		from typing import TYPE_CHECKING, Any

		from coola import objects_are_equal

		from arkas.content.frame_summary import DataFrameSummaryContentGenerator
		from arkas.evaluator2.vanilla import Evaluator
		from arkas.output.lazy import BaseLazyOutput
		from arkas.plotter.vanilla import Plotter
		from arkas.utils.validation import check_positive

		if TYPE_CHECKING:
		import polars as pl


		class DataFrameSummaryOutput(BaseLazyOutput):
		    r"""Implement the output that summarizes a DataFrame.

		    The output keeps the DataFrame and the ``top`` setting, and
		    builds its content generator, evaluator and plotter on
		    request.

		    Args:
		        frame: The DataFrame to analyze.
		        top: The number of most frequent values to show.

		    Example usage:

		    ```pycon

		    >>> import polars as pl
		    >>> from arkas.output import DataFrameSummaryOutput
		    >>> frame = pl.DataFrame(
		    ...     {
		    ...         "col1": [1.2, 4.2, 4.2, 2.2],
		    ...         "col2": [1, 1, 1, 1],
		    ...         "col3": [1, 2, 2, 2],
		    ...     },
		    ...     schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
		    ... )
		    >>> output = DataFrameSummaryOutput(frame)
		    >>> output
		    DataFrameSummaryOutput(shape=(4, 3), top=5)
		    >>> output.get_content_generator()
		    DataFrameSummaryContentGenerator(shape=(4, 3), top=5)
		    >>> output.get_evaluator()
		    Evaluator(count=0)
		    >>> output.get_plotter()
		    Plotter(count=0)

		    ```
		    """

		    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
		        # ``top`` goes through ``check_positive`` before anything
		        # is stored on the instance.
		        check_positive(name="top", value=top)
		        self._frame, self._top = frame, top

		    def __repr__(self) -> str:
		        cls_name = self.__class__.__qualname__
		        shape = self._frame.shape
		        return f"{cls_name}(shape={shape}, top={self._top})"

		    def equal(self, other: Any, equal_nan: bool = False) -> bool:
		        r"""Indicate if ``other`` is an instance of this class with
		        the same ``top`` and an equal DataFrame (compared with
		        ``objects_are_equal``)."""
		        if isinstance(other, self.__class__):
		            # The frames are only compared once ``top`` matches.
		            if self._top != other._top:
		                return False
		            return objects_are_equal(self._frame, other._frame, equal_nan=equal_nan)
		        return False

		    def _get_content_generator(self) -> DataFrameSummaryContentGenerator:
		        # The generator receives the same frame and ``top`` setting.
		        generator = DataFrameSummaryContentGenerator(frame=self._frame, top=self._top)
		        return generator

		    def _get_evaluator(self) -> Evaluator:
		        # Built with no arguments: nothing is registered for this output.
		        evaluator = Evaluator()
		        return evaluator

		    def _get_plotter(self) -> Plotter:
		        # Built with no arguments: nothing is registered for this output.
		        plotter = Plotter()
		        return plotter

arkas - npm Package Compare versions

Improved metrics