arkas
Advanced tools
| r"""Implement an analyzer that analyzes the correlation between numeric | ||
| columns.""" | ||
| from __future__ import annotations | ||
| __all__ = ["ColumnCorrelationAnalyzer"] | ||
| import logging | ||
| from typing import TYPE_CHECKING | ||
| from grizz.utils.format import str_shape_diff | ||
| from polars import selectors as cs | ||
| from arkas.analyzer.lazy import BaseInNLazyAnalyzer | ||
| from arkas.output import EmptyOutput | ||
| from arkas.output.column_correlation import ColumnCorrelationOutput | ||
| from arkas.state.target_dataframe import TargetDataFrameState | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
| import polars as pl | ||
| logger = logging.getLogger(__name__) | ||
class ColumnCorrelationAnalyzer(BaseInNLazyAnalyzer):
    r"""Implement an analyzer to analyze the correlation between a
    target column and the other numeric columns.

    Args:
        target_column: The name of the target column. It is always
            added to the selected columns. If it is not in the input
            DataFrame, the analysis is skipped and an empty output
            is returned.
        columns: The columns to analyze. If ``None``, it analyzes all
            the columns.
        exclude_columns: The columns to exclude from the input
            ``columns``. If any column is not found, it will be ignored
            during the filtering process.
        missing_policy: The policy on how to handle missing columns.
            The following options are available: ``'ignore'``,
            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
            is raised if at least one column is missing.
            If ``'warn'``, a warning is raised if at least one column
            is missing and the missing columns are ignored.
            If ``'ignore'``, the missing columns are ignored and
            no warning message appears.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import ColumnCorrelationAnalyzer
    >>> analyzer = ColumnCorrelationAnalyzer(target_column="col3")
    >>> analyzer
    ColumnCorrelationAnalyzer(target_column='col3', columns=None, exclude_columns=(), missing_policy='raise')
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    ColumnCorrelationOutput(
      (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(
        self,
        target_column: str,
        columns: Sequence[str] | None = None,
        exclude_columns: Sequence[str] = (),
        missing_policy: str = "raise",
    ) -> None:
        # The column selection settings are handled by the parent class.
        super().__init__(
            columns=columns,
            exclude_columns=exclude_columns,
            missing_policy=missing_policy,
        )
        self._target_column = target_column

    def find_columns(self, frame: pl.DataFrame) -> tuple[str, ...]:
        # Start from the parent selection and make sure that the target
        # column is part of it (appended last when it was not selected).
        selected = tuple(super().find_columns(frame))
        if self._target_column in selected:
            return selected
        return (*selected, self._target_column)

    def get_args(self) -> dict:
        # ``target_column`` comes first, followed by the parent arguments.
        return {"target_column": self._target_column, **super().get_args()}

    def _analyze(self, frame: pl.DataFrame) -> ColumnCorrelationOutput | EmptyOutput:
        # Nothing can be analyzed without the target column.
        if self._target_column not in frame:
            logger.info(
                f"Skipping '{self.__class__.__qualname__}.analyze' "
                f"because the target column {self._target_column!r} is missing"
            )
            return EmptyOutput()

        logger.info(
            f"Analyzing the correlation between {self._target_column} and {self._columns}..."
        )
        # Keep only the numeric columns among the selected columns.
        selector = cs.by_name(list(self.find_common_columns(frame))) & cs.numeric()
        numeric_frame = frame.select(selector)
        logger.info(str_shape_diff(orig=frame.shape, final=numeric_frame.shape))
        state = TargetDataFrameState(dataframe=numeric_frame, target_column=self._target_column)
        return ColumnCorrelationOutput(state=state)
| r"""Implement an analyzer that analyzes the correlation between two | ||
| columns.""" | ||
| from __future__ import annotations | ||
| __all__ = ["CorrelationAnalyzer"] | ||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola import objects_are_equal | ||
| from coola.utils.format import repr_mapping_line | ||
| from grizz.utils.column import check_column_missing_policy, check_missing_column | ||
| from grizz.utils.format import str_shape_diff | ||
| from arkas.analyzer.lazy import BaseLazyAnalyzer | ||
| from arkas.metric.utils import check_nan_policy | ||
| from arkas.output import EmptyOutput | ||
| from arkas.output.correlation import CorrelationOutput | ||
| from arkas.state.dataframe import DataFrameState | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
| from arkas.figure import BaseFigureConfig | ||
| logger = logging.getLogger(__name__) | ||
class CorrelationAnalyzer(BaseLazyAnalyzer):
    r"""Implement an analyzer that analyzes the correlation between two
    columns.

    If one of the two columns is not in the input DataFrame (and the
    missing policy did not raise an exception), the analysis is
    skipped and an empty output is returned.

    Args:
        x: The first column.
        y: The second column.
        drop_nulls: If ``True``, the rows with null values in
            ``x`` or ``y`` columns are dropped.
        missing_policy: The policy on how to handle missing columns.
            The following options are available: ``'ignore'``,
            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
            is raised if at least one column is missing.
            If ``'warn'``, a warning is raised if at least one column
            is missing and the missing columns are ignored.
            If ``'ignore'``, the missing columns are ignored and
            no warning message appears.
        nan_policy: The policy on how to handle NaN values in the input
            arrays. The following options are available: ``'omit'``,
            ``'propagate'``, and ``'raise'``.
        figure_config: The figure configuration.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import CorrelationAnalyzer
    >>> analyzer = CorrelationAnalyzer(x="col1", y="col2")
    >>> analyzer
    CorrelationAnalyzer(x='col1', y='col2', drop_nulls=True, missing_policy='raise', nan_policy='propagate', figure_config=None)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ...     schema={"col1": pl.Float64, "col2": pl.Float64, "col3": pl.Float64},
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    CorrelationOutput(
      (state): DataFrameState(dataframe=(7, 2), figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(
        self,
        x: str,
        y: str,
        drop_nulls: bool = True,
        missing_policy: str = "raise",
        nan_policy: str = "propagate",
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        # Validate the policies first (missing policy before NaN policy).
        check_column_missing_policy(missing_policy)
        check_nan_policy(nan_policy)

        self._x = x
        self._y = y
        self._drop_nulls = bool(drop_nulls)
        self._missing_policy = missing_policy
        self._nan_policy = nan_policy
        self._figure_config = figure_config

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        return f"{name}({repr_mapping_line(self.get_args())})"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two analyzers are equal if they have the same type and arguments.
        if isinstance(other, self.__class__):
            return objects_are_equal(self.get_args(), other.get_args(), equal_nan=equal_nan)
        return False

    def get_args(self) -> dict:
        return dict(
            x=self._x,
            y=self._y,
            drop_nulls=self._drop_nulls,
            missing_policy=self._missing_policy,
            nan_policy=self._nan_policy,
            figure_config=self._figure_config,
        )

    def _analyze(self, frame: pl.DataFrame) -> CorrelationOutput | EmptyOutput:
        self._check_input_column(frame)
        # The check above may not raise (it depends on the missing
        # policy), so the analysis is skipped if a column is absent.
        absent = [col for col in (self._x, self._y) if col not in frame]
        if absent:
            logger.info(
                f"Skipping '{self.__class__.__qualname__}.analyze' "
                f"because the input column {absent[0]!r} is missing"
            )
            return EmptyOutput()

        logger.info(f"Analyzing the correlation between {self._x!r} and {self._y!r}...")
        prepared = self._prepare_data(frame)
        logger.info(str_shape_diff(orig=frame.shape, final=prepared.shape))
        state = DataFrameState(
            dataframe=prepared,
            nan_policy=self._nan_policy,
            figure_config=self._figure_config,
        )
        return CorrelationOutput(state)

    def _prepare_data(self, data: pl.DataFrame) -> pl.DataFrame:
        # Keep only the two columns of interest, in the order (x, y).
        cols = [self._x, self._y]
        selected = data.select(cols)
        if not self._drop_nulls:
            return selected
        logger.info(f"Dropping rows that have at least one null value in the columns: {cols}")
        return selected.drop_nulls()

    def _check_input_column(self, frame: pl.DataFrame) -> None:
        r"""Check if the input columns are missing.

        The ``x`` column is checked before the ``y`` column, with the
        missing policy of the analyzer.

        Args:
            frame: The input DataFrame to check.
        """
        for column in (self._x, self._y):
            check_missing_column(frame, column=column, missing_policy=self._missing_policy)
| r"""Implement an analyzer that generates a summary of the numeric | ||
| columns of a DataFrame.""" | ||
| from __future__ import annotations | ||
| __all__ = ["NumericSummaryAnalyzer"] | ||
| import logging | ||
| from typing import TYPE_CHECKING | ||
| from grizz.utils.format import str_shape_diff | ||
| from polars import selectors as cs | ||
| from arkas.analyzer.lazy import BaseInNLazyAnalyzer | ||
| from arkas.output.numeric_summary import NumericSummaryOutput | ||
| from arkas.state.dataframe import DataFrameState | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
| logger = logging.getLogger(__name__) | ||
class NumericSummaryAnalyzer(BaseInNLazyAnalyzer):
    r"""Implement an analyzer to show a summary of the numeric columns of
    a DataFrame.

    Only the numeric columns among the selected columns are kept for
    the summary.

    Args:
        columns: The columns to analyze. If ``None``, it analyzes all
            the columns.
        exclude_columns: The columns to exclude from the input
            ``columns``. If any column is not found, it will be ignored
            during the filtering process.
        missing_policy: The policy on how to handle missing columns.
            The following options are available: ``'ignore'``,
            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
            is raised if at least one column is missing.
            If ``'warn'``, a warning is raised if at least one column
            is missing and the missing columns are ignored.
            If ``'ignore'``, the missing columns are ignored and
            no warning message appears.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import NumericSummaryAnalyzer
    >>> analyzer = NumericSummaryAnalyzer()
    >>> analyzer
    NumericSummaryAnalyzer(columns=None, exclude_columns=(), missing_policy='raise')
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int32, "col3": pl.Float64},
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    NumericSummaryOutput(
      (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def _analyze(self, frame: pl.DataFrame) -> NumericSummaryOutput:
        logger.info("Analyzing the numeric columns...")
        # Intersect the selected columns with the numeric columns.
        selector = cs.by_name(self.find_common_columns(frame)) & cs.numeric()
        numeric_frame = frame.select(selector)
        logger.info(str_shape_diff(orig=frame.shape, final=numeric_frame.shape))
        state = DataFrameState(numeric_frame)
        return NumericSummaryOutput(state=state)
| r"""Implement an analyzer that generates a summary of the DataFrame.""" | ||
| from __future__ import annotations | ||
| __all__ = ["SummaryAnalyzer"] | ||
| import logging | ||
| from typing import TYPE_CHECKING | ||
| from arkas.analyzer.lazy import BaseLazyAnalyzer | ||
| from arkas.output.summary import SummaryOutput | ||
| from arkas.utils.validation import check_positive | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
| logger = logging.getLogger(__name__) | ||
class SummaryAnalyzer(BaseLazyAnalyzer):
    r"""Implement an analyzer to show a summary of the DataFrame.

    Args:
        top: The number of most frequent values to show. The value
            is validated with ``check_positive``.
        sort: If ``True``, sort the columns by alphabetical order.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import SummaryAnalyzer
    >>> analyzer = SummaryAnalyzer()
    >>> analyzer
    SummaryAnalyzer(top=5, sort=False)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 0, 1],
    ...         "col2": [1, 0, 1, 0],
    ...         "col3": [1, 1, 1, 1],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int64, "col3": pl.Int64},
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    SummaryOutput(shape=(4, 3), top=5)

    ```
    """

    def __init__(self, top: int = 5, sort: bool = False) -> None:
        check_positive(name="top", value=top)
        self._top = top
        self._sort = bool(sort)

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        # ``top`` is shown with a thousands separator.
        return f"{name}(top={self._top:,}, sort={self._sort})"

    def _analyze(self, frame: pl.DataFrame) -> SummaryOutput:
        logger.info("Analyzing the DataFrame...")
        # Reorder the columns alphabetically only when requested.
        ordered = frame.select(sorted(frame.columns)) if self._sort else frame
        return SummaryOutput(frame=ordered, top=self._top)
| r"""Contain the implementation of a HTML content generator that analyzes | ||
| the correlation between 1 target column and other columns.""" | ||
| from __future__ import annotations | ||
| __all__ = [ | ||
| "ColumnCorrelationContentGenerator", | ||
| "create_table", | ||
| "create_table_row", | ||
| "create_template", | ||
| ] | ||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from jinja2 import Template | ||
| from arkas.content.section import BaseSectionContentGenerator | ||
| from arkas.evaluator2.column_correlation import ColumnCorrelationEvaluator | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
| from arkas.state.target_dataframe import TargetDataFrameState | ||
| logger = logging.getLogger(__name__) | ||
class ColumnCorrelationContentGenerator(BaseSectionContentGenerator):
    r"""Implement a content generator that analyzes the correlation
    between 1 target column and other columns.

    Args:
        state: The state containing the DataFrame to analyze and the
            name of the target column.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import ColumnCorrelationContentGenerator
    >>> from arkas.state import TargetDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ... )
    >>> content = ColumnCorrelationContentGenerator(
    ...     TargetDataFrameState(frame, target_column="col3")
    ... )
    >>> content
    ColumnCorrelationContentGenerator(
     (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: TargetDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Only generators of the same type can be equal; the comparison
        # is then delegated to the states.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def generate_content(self) -> str:
        logger.info(
            f"Generating the correlation analysis between {self._state.target_column} "
            f"and {list(self._state.dataframe.columns)}..."
        )
        metrics = ColumnCorrelationEvaluator(self._state).evaluate()

        # The table has one row per column, except the target column.
        other_columns = list(self._state.dataframe.columns)
        other_columns.remove(self._state.target_column)

        num_rows, num_cols = self._state.dataframe.shape
        template = Template(create_template())
        return template.render(
            nrows=f"{num_rows:,}",
            ncols=f"{num_cols:,}",
            columns=", ".join(self._state.dataframe.columns),
            table=create_table(metrics, columns=other_columns),
            target_column=f"{self._state.target_column}",
        )
def create_template() -> str:
    r"""Return the template of the content.

    The template uses the following placeholders: ``target_column``,
    ``nrows``, ``ncols`` and ``table``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.column_correlation import create_template
    >>> template = create_template()

    ```
    """
    # The empty last item adds the final line break of the template.
    lines = (
        '<p style="margin-top: 1rem;">',
        "This section analyzes the correlation between <em>{{target_column}}</em> and other columns.",
        "The correlation coefficient is a statistical measure of the strength of a",
        "relationship between two variables. Its values can range from -1 to 1.",
        "<ul>",
        "<li> A correlation coefficient of -1 describes a perfect negative, or inverse,",
        "correlation, with values in one series rising as those in the other decline,",
        "and vice versa. </li>",
        "<li> A coefficient of 1 shows a perfect positive correlation, or a direct relationship. </li>",
        "<li> A correlation coefficient of 0 means there is no direct relationship. </li>",
        "</ul>",
        "The DataFrame has {{nrows}} rows and {{ncols}} columns.",
        "</p>",
        "{{table}}",
        "",
    )
    return "\n".join(lines)
def create_table(metrics: dict[str, dict], columns: Sequence[str]) -> str:
    r"""Return a HTML representation of a table with some statistics
    about each column.

    Args:
        metrics: The dictionary of metrics. The metrics of a column
            are read from the key ``'correlation_{column}'``.
        columns: The columns to show in the table. There is one row
            per column, in the same order.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.column_correlation import create_table
    >>> row = create_table(
    ...     metrics={
    ...         "correlation_col1": {
    ...             "count": 7,
    ...             "pearson_coeff": 1.0,
    ...             "pearson_pvalue": 0.0,
    ...             "spearman_coeff": 1.0,
    ...             "spearman_pvalue": 0.0,
    ...         },
    ...         "correlation_col2": {
    ...             "count": 7,
    ...             "pearson_coeff": -1.0,
    ...             "pearson_pvalue": 0.0,
    ...             "spearman_coeff": -1.0,
    ...             "spearman_pvalue": 0.0,
    ...         },
    ...     },
    ...     columns=["col1", "col2"],
    ... )

    ```
    """
    table_template = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>num samples</th>
<th>pearson coefficient</th>
<th>pearson p-value</th>
<th>spearman coefficient</th>
<th>spearman p-value</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    # Generate one HTML row per column and stack the rows.
    html_rows = "\n".join(
        create_table_row(column=name, metrics=metrics[f"correlation_{name}"]) for name in columns
    )
    return table_template.render(rows=html_rows)
def create_table_row(column: str, metrics: dict) -> str:
    r"""Create the HTML code of a new table row.

    Args:
        column: The column name
        metrics: The dictionary of metrics with the correlation
            scores. The following keys are read: ``'count'``,
            ``'pearson_coeff'``, ``'pearson_pvalue'``,
            ``'spearman_coeff'`` and ``'spearman_pvalue'``.
            A missing count is shown as 0, and the other missing
            values are shown as NaN.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.column_correlation import create_table_row
    >>> row = create_table_row(
    ...     column="col1",
    ...     metrics={
    ...         "count": 7,
    ...         "pearson_coeff": 1.0,
    ...         "pearson_pvalue": 0.0,
    ...         "spearman_coeff": 1.0,
    ...         "spearman_pvalue": 0.0,
    ...     },
    ... )

    ```
    """
    context = {
        "num_style": 'style="text-align: right;"',
        "column": column,
        "count": f"{metrics.get('count', 0):,}",
    }
    # The correlation scores are all shown with 4 decimals.
    for key in ("pearson_coeff", "pearson_pvalue", "spearman_coeff", "spearman_pvalue"):
        context[key] = f"{metrics.get(key, float('nan')):.4f}"

    row_template = Template(
        """<tr>
<th>{{column}}</th>
<td {{num_style}}>{{count}}</td>
<td {{num_style}}>{{pearson_coeff}}</td>
<td {{num_style}}>{{pearson_pvalue}}</td>
<td {{num_style}}>{{spearman_coeff}}</td>
<td {{num_style}}>{{spearman_pvalue}}</td>
</tr>"""
    )
    return row_template.render(context)
| r"""Contain the implementation of a HTML content generator that analyzes | ||
| the correlation between two columns.""" | ||
| from __future__ import annotations | ||
| __all__ = [ | ||
| "CorrelationContentGenerator", | ||
| "create_template", | ||
| ] | ||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from jinja2 import Template | ||
| from arkas.content.section import BaseSectionContentGenerator | ||
| from arkas.evaluator2.correlation import CorrelationEvaluator | ||
| from arkas.figure.utils import figure2html | ||
| from arkas.plotter.correlation import CorrelationPlotter | ||
| from arkas.utils.dataframe import check_num_columns | ||
| if TYPE_CHECKING: | ||
| from arkas.state.target_dataframe import DataFrameState | ||
| logger = logging.getLogger(__name__) | ||
class CorrelationContentGenerator(BaseSectionContentGenerator):
    r"""Implement a content generator that analyzes the correlation
    between two columns.

    Args:
        state: The state containing the DataFrame to analyze.
            The DataFrame must have only 2 columns, which are the two
            columns to analyze. The number of columns is checked
            when the generator is created.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import CorrelationContentGenerator
    >>> from arkas.state import DataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...     },
    ... )
    >>> content = CorrelationContentGenerator(DataFrameState(frame))
    >>> content
    CorrelationContentGenerator(
     (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: DataFrameState) -> None:
        check_num_columns(state.dataframe, num_columns=2)
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Only generators of the same type can be equal; the comparison
        # is then delegated to the states.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def generate_content(self) -> str:
        # The first column is used as x and the second column as y.
        xcol, ycol = self._state.dataframe.columns
        logger.info(f"Generating the correlation analysis between {xcol} and {ycol}...")
        metrics = CorrelationEvaluator(self._state).evaluate()
        figures = CorrelationPlotter(self._state).plot()
        template = Template(create_template())
        return template.render(
            xcol=str(xcol),
            ycol=str(ycol),
            columns=", ".join(self._state.dataframe.columns),
            count=f"{metrics['count']:,}",
            pearson_coeff=f"{metrics['pearson_coeff']:.4f}",
            pearson_pvalue=f"{metrics['pearson_pvalue']:.4f}",
            spearman_coeff=f"{metrics['spearman_coeff']:.4f}",
            spearman_pvalue=f"{metrics['spearman_pvalue']:.4f}",
            figure=figure2html(figures["correlation"], close_fig=True),
        )
def create_template() -> str:
    r"""Return the template of the content.

    The template uses the following placeholders: ``xcol``, ``ycol``,
    ``pearson_coeff``, ``pearson_pvalue``, ``spearman_coeff``,
    ``spearman_pvalue``, ``count`` and ``figure``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.correlation import create_template
    >>> template = create_template()

    ```
    """
    # The empty last item adds the final line break of the template.
    lines = (
        '<p style="margin-top: 1rem;">',
        "This section analyzes the correlation between <em>{{xcol}}</em> and <em>{{ycol}}</em>.",
        "The correlation coefficient is a statistical measure of the strength of a",
        "relationship between two variables. Its values can range from -1 to 1.",
        "<ul>",
        "<li> <b>pearson coefficient</b>: {{pearson_coeff}} </li>",
        "<li> <b>pearson p-value</b>: {{pearson_pvalue}} </li>",
        "<li> <b>spearman coefficient</b>: {{spearman_coeff}} </li>",
        "<li> <b>spearman p-value</b>: {{spearman_pvalue}} </li>",
        "<li> <b>num samples</b>: {{count}} </li>",
        "</ul>",
        '<p style="margin-top: 1rem;">',
        "The following figure shows the scatter plot between <em>{{xcol}}</em> and <em>{{ycol}}</em>.",
        "</p>",
        "{{figure}}",
        "",
    )
    return "\n".join(lines)
| r"""Contain the implementation of a HTML content generator that | ||
| summarizes the numeric columns of a DataFrame.""" | ||
| from __future__ import annotations | ||
| __all__ = [ | ||
| "NumericSummaryContentGenerator", | ||
| "create_table", | ||
| "create_table_quantiles", | ||
| "create_table_quantiles_row", | ||
| "create_table_row", | ||
| "create_template", | ||
| ] | ||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from jinja2 import Template | ||
| from arkas.content.section import BaseSectionContentGenerator | ||
| from arkas.content.utils import float_to_str | ||
| from arkas.utils.stats import compute_statistics_continuous | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
| from arkas.state.dataframe import DataFrameState | ||
| logger = logging.getLogger(__name__) | ||
class NumericSummaryContentGenerator(BaseSectionContentGenerator):
    r"""Implement a content generator that summarizes the numeric columns
    of a DataFrame.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import NumericSummaryContentGenerator
    >>> from arkas.state import DataFrameState
    >>> dataframe = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     }
    ... )
    >>> content = NumericSummaryContentGenerator(DataFrameState(dataframe))
    >>> content
    NumericSummaryContentGenerator(
     (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: DataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Only generators of the same type can be equal; the comparison
        # is then delegated to the states.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def generate_content(self) -> str:
        num_rows, num_cols = self._state.dataframe.shape
        logger.info(f"Generating the summary of {num_cols:,} numeric columns...")
        template = Template(create_template())
        # The main table is generated before the table of quantiles.
        return template.render(
            nrows=f"{num_rows:,}",
            ncols=f"{num_cols:,}",
            columns=", ".join(self._state.dataframe.columns),
            table=create_table(self._state.dataframe),
            table_quantiles=create_table_quantiles(self._state.dataframe),
        )
def create_template() -> str:
    r"""Return the template of the content.

    The template uses the following placeholders: ``table`` and
    ``table_quantiles``. The comparison signs shown in the
    descriptions of the ``negative`` and ``positive`` statistics are
    written as HTML character references (``&lt;`` and ``&gt;``), so
    that they are never parsed as markup.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.numeric_summary import create_template
    >>> template = create_template()

    ```
    """
    return """This section shows a short summary of each column.
<ul>
<li> <b>column</b>: is the column name</li>
<li> <b>dtype</b>: is the column data type </li>
<li> <b>null</b>: is the number (and percentage) of null values in the column </li>
<li> <b>nan</b>: is the number (and percentage) of not a number (NaN) values in the column </li>
<li> <b>unique</b>: is the number (and percentage) of unique values in the column </li>
<li> <b>negative</b>: is the number (and percentage) of strictly negative values (<span>&lt;</span>0) in the column </li>
<li> <b>zero</b>: is the number (and percentage) of zero values (=0) in the column </li>
<li> <b>positive</b>: is the number (and percentage) of strictly positive values (<span>&gt;</span>0) in the column </li>
</ul>
<p style="margin-top: 1rem;">
<b>General statistics about the DataFrame</b>
{{table}}
<details>
<summary>[show additional statistics]</summary>
<p style="margin-top: 1rem;">
The following table shows some quantiles for each column. </p>
{{table_quantiles}}
</details>
"""
def create_table(
    frame: pl.DataFrame,
) -> str:
    r"""Return a HTML representation of a table with some statistics
    about each column.

    Args:
        frame: The DataFrame to analyze. There is one row in the
            table per column of the DataFrame.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.numeric_summary import create_table
    >>> dataframe = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     }
    ... )
    >>> row = create_table(dataframe)

    ```
    """
    table_template = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>dtype</th>
<th>null</th>
<th>nan</th>
<th>unique</th>
<th>negative</th>
<th>zero</th>
<th>positive</th>
<th>mean</th>
<th>std</th>
<th>skewness</th>
<th>kurtosis</th>
<th>min</th>
<th>median</th>
<th>max</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    # Iterating over the DataFrame gives its columns one at a time.
    html_rows = "\n".join(create_table_row(series=column) for column in frame)
    return table_template.render(rows=html_rows)
def create_table_row(series: pl.Series) -> str:
    r"""Return the HTML code of the table row that describes a series.

    The statistics come from ``compute_statistics_continuous``; the
    number of NaN values is computed directly from the series.

    Args:
        series: The series to analyze.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.numeric_summary import create_table_row
    >>> row = create_table_row(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7]))

    ```
    """
    stats = compute_statistics_continuous(series)
    n_nan = int(series.is_nan().sum())
    n_null = stats["num_nulls"]
    n_unique = stats["nunique"]
    total = stats["count"]
    n_negative = stats["<0"]
    n_zero = stats["=0"]
    n_positive = stats[">0"]

    def count_with_percentage(count: int) -> str:
        # The percentage is NaN when ``total`` is zero, which avoids
        # a division by zero.
        percentage = 100 * count / total if total else float("nan")
        return f"{count:,} ({percentage:.2f}%)"

    row_template = Template(
        """<tr>
<th>{{column}}</th>
<td>{{dtype}}</td>
<td {{num_style}}>{{null}}</td>
<td {{num_style}}>{{nan}}</td>
<td {{num_style}}>{{nunique}}</td>
<td {{num_style}}>{{negative}}</td>
<td {{num_style}}>{{zero}}</td>
<td {{num_style}}>{{positive}}</td>
<td {{num_style}}>{{mean}}</td>
<td {{num_style}}>{{std}}</td>
<td {{num_style}}>{{skewness}}</td>
<td {{num_style}}>{{kurtosis}}</td>
<td {{num_style}}>{{min}}</td>
<td {{num_style}}>{{median}}</td>
<td {{num_style}}>{{max}}</td>
</tr>"""
    )
    context = {
        "num_style": 'style="text-align: right;"',
        "column": series.name,
        "dtype": series.dtype,
        "null": count_with_percentage(n_null),
        "nan": count_with_percentage(n_nan),
        "nunique": count_with_percentage(n_unique),
    }
    # Scalar statistics are all formatted with the same helper.
    for key in ("mean", "std", "skewness", "kurtosis", "min", "median", "max"):
        context[key] = float_to_str(stats[key])
    context["negative"] = count_with_percentage(n_negative)
    context["zero"] = count_with_percentage(n_zero)
    context["positive"] = count_with_percentage(n_positive)
    return row_template.render(context)
def create_table_quantiles(
    frame: pl.DataFrame,
) -> str:
    r"""Return the HTML code of a table with quantile statistics for each
    column.

    One table row is generated for each item yielded when iterating
    over the DataFrame.

    Args:
        frame: The DataFrame to analyze.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.numeric_summary import create_table_quantiles
    >>> dataframe = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     }
    ... )
    >>> table = create_table_quantiles(dataframe)

    ```
    """
    # The rows are rendered first and then injected in the table skeleton.
    html_rows = [create_table_quantiles_row(series=column) for column in frame]
    table_template = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>min</th>
<th>q0.001</th>
<th>q0.01</th>
<th>q0.05</th>
<th>q0.10</th>
<th>q0.25</th>
<th>median</th>
<th>q0.75</th>
<th>q0.90</th>
<th>q0.95</th>
<th>q0.99</th>
<th>q0.999</th>
<th>max</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    return table_template.render({"rows": "\n".join(html_rows)})
def create_table_quantiles_row(series: pl.Series) -> str:
    r"""Return the HTML code of the quantile table row for a series.

    The statistics come from ``compute_statistics_continuous`` and
    each of them is formatted with ``float_to_str``.

    Args:
        series: The series to analyze.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.numeric_summary import create_table_quantiles_row
    >>> row = create_table_quantiles_row(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7]))

    ```
    """
    stats = compute_statistics_continuous(series)
    row_template = Template(
        """<tr>
<th>{{column}}</th>
<td {{num_style}}>{{min}}</td>
<td {{num_style}}>{{q001}}</td>
<td {{num_style}}>{{q01}}</td>
<td {{num_style}}>{{q05}}</td>
<td {{num_style}}>{{q10}}</td>
<td {{num_style}}>{{q25}}</td>
<td {{num_style}}>{{median}}</td>
<td {{num_style}}>{{q75}}</td>
<td {{num_style}}>{{q90}}</td>
<td {{num_style}}>{{q95}}</td>
<td {{num_style}}>{{q99}}</td>
<td {{num_style}}>{{q999}}</td>
<td {{num_style}}>{{max}}</td>
</tr>"""
    )
    # The keys of the statistics are also the names of the template
    # variables, in the same order as the table cells.
    stat_keys = (
        "min",
        "q001",
        "q01",
        "q05",
        "q10",
        "q25",
        "median",
        "q75",
        "q90",
        "q95",
        "q99",
        "q999",
        "max",
    )
    context = {
        "num_style": 'style="text-align: right;"',
        "column": series.name,
    }
    context.update({key: float_to_str(stats[key]) for key in stat_keys})
    return row_template.render(context)
| r"""Contain the implementation of a HTML content generator that returns | ||
| a summary of a DataFrame.""" | ||
| from __future__ import annotations | ||
| __all__ = [ | ||
| "SummaryContentGenerator", | ||
| "create_table", | ||
| "create_table_row", | ||
| "create_template", | ||
| ] | ||
| import logging | ||
| from collections import Counter | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola import objects_are_equal | ||
| from grizz.utils.count import compute_nunique | ||
| from grizz.utils.null import compute_null_count | ||
| from jinja2 import Template | ||
| from arkas.content.section import BaseSectionContentGenerator | ||
| from arkas.utils.validation import check_positive | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
| import polars as pl | ||
| logger = logging.getLogger(__name__) | ||
class SummaryContentGenerator(BaseSectionContentGenerator):
    r"""Generate the HTML content that summarizes the columns of a
    DataFrame.

    Args:
        frame: The DataFrame to analyze.
        top: The number of most frequent values to show. The value is
            validated with ``check_positive``.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import SummaryContentGenerator
    >>> content = SummaryContentGenerator(
    ...     frame=pl.DataFrame(
    ...         {
    ...             "col1": [1.2, 4.2, 4.2, 2.2],
    ...             "col2": [1, 1, 1, 1],
    ...             "col3": [1, 2, 2, 2],
    ...         },
    ...         schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
    ...     )
    ... )
    >>> content
    SummaryContentGenerator(shape=(4, 3), top=5)

    ```
    """

    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
        self._frame = frame
        check_positive(name="top", value=top)
        self._top = top

    def __repr__(self) -> str:
        name = type(self).__qualname__
        return f"{name}(shape={self._frame.shape}, top={self._top})"

    @property
    def frame(self) -> pl.DataFrame:
        r"""The DataFrame to analyze."""
        return self._frame

    @property
    def top(self) -> int:
        r"""The number of most frequent values to show."""
        return self._top

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` has the same type, ``top`` value and
        DataFrame as this object."""
        if not isinstance(other, type(self)):
            return False
        if self.top != other.top:
            return False
        return objects_are_equal(self.frame, other.frame, equal_nan=equal_nan)

    def get_columns(self) -> tuple[str, ...]:
        r"""Return the column names of the DataFrame."""
        return tuple(self._frame.columns)

    def get_null_count(self) -> tuple[int, ...]:
        r"""Return the output of ``compute_null_count`` as a tuple."""
        null_count = compute_null_count(self._frame)
        return tuple(null_count.tolist())

    def get_nunique(self) -> tuple[int, ...]:
        r"""Return the output of ``compute_nunique`` as a tuple."""
        nunique = compute_nunique(self._frame)
        return tuple(nunique.tolist())

    def get_dtypes(self) -> tuple[pl.DataType, ...]:
        r"""Return the data types found in the DataFrame schema."""
        return tuple(self._frame.schema.dtypes())

    def get_most_frequent_values(self, top: int = 5) -> tuple[tuple[tuple[Any, int], ...], ...]:
        r"""Return, for each column, the ``top`` most common values with
        their number of occurrences."""

        def most_common(series: pl.Series) -> tuple[tuple[Any, int], ...]:
            return tuple(Counter(series.to_list()).most_common(top))

        return tuple(map(most_common, self.frame))

    def generate_content(self) -> str:
        r"""Render the content template with the summary table and the
        shape of the DataFrame."""
        logger.info("Generating the DataFrame summary content...")
        template = Template(create_template())
        table = self._create_table()
        nrows, ncols = self._frame.shape[0], self._frame.shape[1]
        return template.render(
            {
                "table": table,
                "nrows": f"{nrows:,}",
                "ncols": f"{ncols:,}",
            }
        )

    def _create_table(self) -> str:
        r"""Build the HTML summary table from the column statistics."""
        return create_table(
            columns=self.get_columns(),
            null_count=self.get_null_count(),
            nunique=self.get_nunique(),
            dtypes=self.get_dtypes(),
            most_frequent_values=self.get_most_frequent_values(top=self._top),
            total=self._frame.shape[0],
        )
def create_template() -> str:
    r"""Return the template of the summary section.

    The template has three placeholders: ``{{ncols}}``, ``{{nrows}}``
    and ``{{table}}``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.summary import create_template
    >>> template = create_template()

    ```
    """
    template = """This section shows a short summary of each column.
<ul>
<li> <b>column</b>: are the column names</li>
<li> <b>types</b>: are the object types for the objects in the column </li>
<li> <b>null</b>: are the number (and percentage) of null values in the column </li>
<li> <b>unique</b>: are the number (and percentage) of unique values in the column </li>
</ul>
<p style="margin-top: 1rem;">
<b>General statistics about the DataFrame</b>
<ul>
<li> number of columns: {{ncols}} </li>
<li> number of rows: {{nrows}}</li>
</ul>
{{table}}
"""
    return template
def create_table(
    columns: Sequence[str],
    null_count: Sequence[int],
    nunique: Sequence[int],
    dtypes: Sequence[pl.DataType],
    most_frequent_values: Sequence[Sequence[tuple[Any, int]]],
    total: int,
) -> str:
    r"""Return the HTML code of the table that summarizes each column.

    The input sequences are consumed in parallel with ``zip``, so the
    i-th item of each sequence describes the same column.

    Args:
        columns: The column names.
        null_count: The number of null values for each column.
        nunique: The number of unique values for each column.
        dtypes: The data type for each column.
        most_frequent_values: The most frequent values for each column.
        total: The total number of rows.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.summary import create_table
    >>> table = create_table(
    ...     columns=["float", "int", "str"],
    ...     null_count=(1, 0, 2),
    ...     nunique=(5, 2, 4),
    ...     dtypes=(pl.Float64(), pl.Int64(), pl.String()),
    ...     most_frequent_values=(
    ...         ((2.2, 2), (1.2, 1), (4.2, 1), (None, 1), (1.0, 1)),
    ...         ((1, 5), (0, 1)),
    ...         (("B", 2), (None, 2), ("A", 1), ("C", 1)),
    ...     ),
    ...     total=42,
    ... )

    ```
    """
    per_column = zip(columns, null_count, nunique, dtypes, most_frequent_values)
    html_rows = [
        create_table_row(
            column=name,
            null=n_null,
            dtype=dtype,
            nunique=n_unique,
            most_frequent_values=frequent,
            total=total,
        )
        for name, n_null, n_unique, dtype, frequent in per_column
    ]
    table_template = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>types</th>
<th>null</th>
<th>unique</th>
<th>most frequent values</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    return table_template.render({"rows": "\n".join(html_rows)})
def create_table_row(
    column: str,
    null: int,
    nunique: int,
    dtype: pl.DataType,
    most_frequent_values: Sequence[tuple[Any, int]],
    total: int,
) -> str:
    r"""Create the HTML code of a new table row.

    All the percentages are computed with respect to ``total``. When
    ``total`` is zero, the percentages are shown as ``nan`` instead of
    raising a ``ZeroDivisionError``.

    Args:
        column: The column name.
        null: The number of null values.
        nunique: The number of unique values.
        dtype: The data type of the column.
        most_frequent_values: The most frequent values, as
            ``(value, count)`` pairs.
        total: The total number of rows.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.summary import create_table_row
    >>> row = create_table_row(
    ...     column="col",
    ...     null=5,
    ...     nunique=42,
    ...     dtype=pl.Float64(),
    ...     most_frequent_values=[("C", 12), ("A", 5), ("B", 4)],
    ...     total=100,
    ... )

    ```
    """

    def percentage(count: int) -> float:
        # Single guarded implementation shared by the three fields, so
        # they all behave the same way when there is no row.
        return 100 * count / total if total else float("nan")

    null_text = f"{null:,} ({percentage(null):.2f}%)"
    nunique_text = f"{nunique:,} ({percentage(nunique):.2f}%)"
    frequent_text = ", ".join(
        [f"{val} ({percentage(c):.2f}%)" for val, c in most_frequent_values]
    )
    return Template(
        """<tr>
<th>{{column}}</th>
<td>{{dtype}}</td>
<td {{num_style}}>{{null}}</td>
<td {{num_style}}>{{nunique}}</td>
<td>{{most_frequent_values}}</td>
</tr>"""
    ).render(
        {
            "num_style": 'style="text-align: right;"',
            "column": column,
            "null": null_text,
            "dtype": dtype,
            "nunique": nunique_text,
            "most_frequent_values": frequent_text,
        }
    )
| r"""Contain utility functions.""" | ||
| from __future__ import annotations | ||
| __all__ = ["float_to_str", "to_str"] | ||
| from typing import Any | ||
def to_str(value: Any) -> str:
    r"""Return a string representation of the input value.

    Integers and floats are formatted with ``float_to_str``. Any other
    value, including booleans, is converted with ``str``.

    Args:
        value: The value to encode.

    Returns:
        The string representation of the input value.

    Example usage:

    ```pycon

    >>> from arkas.content.utils import to_str
    >>> to_str(42)
    '42'
    >>> to_str(True)
    'True'

    ```
    """
    # ``bool`` is a subclass of ``int``: without this guard, ``True`` and
    # ``False`` would be formatted as numbers and shown as '1' and '0'.
    if isinstance(value, bool):
        return str(value)
    if isinstance(value, (int, float)):
        return float_to_str(value)
    return str(value)
def float_to_str(value: float) -> str:
    r"""Format a number with the ``.4g`` format specification.

    Args:
        value: The value to encode.

    Returns:
        The string representation of the input value.

    Example usage:

    ```pycon

    >>> from arkas.content.utils import float_to_str
    >>> float_to_str(42)
    '42'

    ```
    """
    return format(value, ".4g")
| r"""Implement the pairwise column correlation evaluator.""" | ||
| from __future__ import annotations | ||
| __all__ = ["ColumnCorrelationEvaluator"] | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.evaluator2.base import BaseEvaluator | ||
| from arkas.evaluator2.vanilla import Evaluator | ||
| from arkas.metric import pearsonr, spearmanr | ||
| if TYPE_CHECKING: | ||
| from arkas.state.target_dataframe import TargetDataFrameState | ||
class ColumnCorrelationEvaluator(BaseEvaluator):
    r"""Evaluate the correlation between the target column and every
    other column of a DataFrame.

    Args:
        state: The state with the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.evaluator2 import ColumnCorrelationEvaluator
    >>> from arkas.state import TargetDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ... )
    >>> evaluator = ColumnCorrelationEvaluator(
    ...     TargetDataFrameState(frame, target_column="col3")
    ... )
    >>> evaluator
    ColumnCorrelationEvaluator(
     (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> evaluator.evaluate()
    {'correlation_col1': {'count': 7, 'pearson_coeff': 1.0, 'pearson_pvalue': 0.0, 'spearman_coeff': 1.0, 'spearman_pvalue': 0.0},
     'correlation_col2': {'count': 7, 'pearson_coeff': -1.0, 'pearson_pvalue': 0.0, 'spearman_coeff': -1.0, 'spearman_pvalue': 0.0}}

    ```
    """

    def __init__(self, state: TargetDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def compute(self) -> Evaluator:
        r"""Return an ``Evaluator`` that holds the output of
        ``evaluate``."""
        metrics = self.evaluate()
        return Evaluator(metrics=metrics)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` has the same type and an equal state."""
        if isinstance(other, type(self)):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def evaluate(self, prefix: str = "", suffix: str = "") -> dict[str, dict]:
        r"""Compute the correlation metrics between the target column
        and each other column.

        Args:
            prefix: The prefix added to each key of the output.
            suffix: The suffix added to each key of the output.

        Returns:
            A dictionary with one entry per non-target column. Each
                value merges the outputs of ``pearsonr`` and
                ``spearmanr``.
        """
        target = self._state.target_column
        others = list(self._state.dataframe.columns)
        # ``list.remove`` raises a ValueError if the target column is
        # not in the DataFrame.
        others.remove(target)
        metrics = {}
        for name in others:
            # Rows with a null or NaN value in one of the two columns
            # are dropped before computing the correlation.
            pair = self._state.dataframe.select([name, target])
            pair = pair.drop_nulls().drop_nans()
            x = pair[target].to_numpy()
            y = pair[name].to_numpy()
            key = f"{prefix}correlation_{name}{suffix}"
            metrics[key] = pearsonr(x, y) | spearmanr(x, y)
        return metrics
| r"""Implement the pairwise column correlation evaluator.""" | ||
| from __future__ import annotations | ||
| __all__ = ["CorrelationEvaluator"] | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.evaluator2.base import BaseEvaluator | ||
| from arkas.evaluator2.vanilla import Evaluator | ||
| from arkas.metric import pearsonr, spearmanr | ||
| from arkas.utils.dataframe import check_num_columns | ||
| if TYPE_CHECKING: | ||
| from arkas.state.target_dataframe import DataFrameState | ||
class CorrelationEvaluator(BaseEvaluator):
    r"""Evaluate the correlation between the two columns of a
    DataFrame.

    Args:
        state: The state with the DataFrame to analyze.
            The DataFrame must have only 2 columns, which are the two
            columns to analyze. The number of columns is validated
            with ``check_num_columns``.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.evaluator2 import CorrelationEvaluator
    >>> from arkas.state import DataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
    ...     },
    ... )
    >>> evaluator = CorrelationEvaluator(DataFrameState(frame))
    >>> evaluator
    CorrelationEvaluator(
     (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> evaluator.evaluate()
    {'count': 7, 'pearson_coeff': 1.0, 'pearson_pvalue': 0.0, 'spearman_coeff': 1.0, 'spearman_pvalue': 0.0}

    ```
    """

    def __init__(self, state: DataFrameState) -> None:
        check_num_columns(state.dataframe, num_columns=2)
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def compute(self) -> Evaluator:
        r"""Return an ``Evaluator`` that holds the output of
        ``evaluate``."""
        metrics = self.evaluate()
        return Evaluator(metrics=metrics)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` has the same type and an equal state."""
        if isinstance(other, type(self)):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def evaluate(self, prefix: str = "", suffix: str = "") -> dict[str, float]:
        r"""Compute the correlation metrics between the two columns.

        Args:
            prefix: The prefix forwarded to ``pearsonr`` and
                ``spearmanr``.
            suffix: The suffix forwarded to ``pearsonr`` and
                ``spearmanr``.

        Returns:
            The merged outputs of ``pearsonr`` and ``spearmanr``.
        """
        # Rows with a null or NaN value are dropped before computing
        # the correlation.
        clean = self._state.dataframe.drop_nulls().drop_nans()
        x = clean[clean.columns[0]].to_numpy()
        y = clean[clean.columns[1]].to_numpy()
        pearson = pearsonr(x=x, y=y, prefix=prefix, suffix=suffix)
        spearman = spearmanr(x=x, y=y, prefix=prefix, suffix=suffix)
        return pearson | spearman
| r"""Implement an output to analyze the correlation between columns.""" | ||
| from __future__ import annotations | ||
| __all__ = ["ColumnCorrelationOutput"] | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.content.column_correlation import ColumnCorrelationContentGenerator | ||
| from arkas.evaluator2.column_correlation import ColumnCorrelationEvaluator | ||
| from arkas.output.lazy import BaseLazyOutput | ||
| from arkas.plotter.vanilla import Plotter | ||
| if TYPE_CHECKING: | ||
| from arkas.state.target_dataframe import TargetDataFrameState | ||
class ColumnCorrelationOutput(BaseLazyOutput):
    r"""Implement an output to analyze the correlation between the
    target column and the other columns of a DataFrame.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import ColumnCorrelationOutput
    >>> from arkas.state import TargetDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ... )
    >>> output = ColumnCorrelationOutput(TargetDataFrameState(frame, target_column="col3"))
    >>> output
    ColumnCorrelationOutput(
     (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    ColumnCorrelationContentGenerator(
     (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    ColumnCorrelationEvaluator(
     (state): TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_plotter()
    Plotter(count=0)

    ```
    """

    def __init__(self, state: TargetDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` has the same type and an equal state."""
        if isinstance(other, type(self)):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def _get_content_generator(self) -> ColumnCorrelationContentGenerator:
        # The content generator works on the same state as the output.
        return ColumnCorrelationContentGenerator(self._state)

    def _get_evaluator(self) -> ColumnCorrelationEvaluator:
        # The evaluator works on the same state as the output.
        return ColumnCorrelationEvaluator(self._state)

    def _get_plotter(self) -> Plotter:
        # A ``Plotter`` created without argument is returned.
        return Plotter()
| r"""Implement an output to analyze the correlation between columns.""" | ||
| from __future__ import annotations | ||
| __all__ = ["CorrelationOutput"] | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.content.correlation import CorrelationContentGenerator | ||
| from arkas.evaluator2.correlation import CorrelationEvaluator | ||
| from arkas.output.lazy import BaseLazyOutput | ||
| from arkas.plotter.correlation import CorrelationPlotter | ||
| from arkas.utils.dataframe import check_num_columns | ||
| if TYPE_CHECKING: | ||
| from arkas.state.dataframe import DataFrameState | ||
class CorrelationOutput(BaseLazyOutput):
    r"""Implement an output to analyze the correlation between the two
    columns of a DataFrame.

    Args:
        state: The state containing the DataFrame to analyze.
            The number of columns is validated with
            ``check_num_columns`` and must be 2.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import CorrelationOutput
    >>> from arkas.state import DataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col2": [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0],
    ...     },
    ... )
    >>> output = CorrelationOutput(DataFrameState(frame))
    >>> output
    CorrelationOutput(
     (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    CorrelationContentGenerator(
     (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    CorrelationEvaluator(
     (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_plotter()
    CorrelationPlotter(
     (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: DataFrameState) -> None:
        check_num_columns(state.dataframe, num_columns=2)
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` has the same type and an equal state."""
        if isinstance(other, type(self)):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def _get_content_generator(self) -> CorrelationContentGenerator:
        # The content generator works on the same state as the output.
        return CorrelationContentGenerator(self._state)

    def _get_evaluator(self) -> CorrelationEvaluator:
        # The evaluator works on the same state as the output.
        return CorrelationEvaluator(self._state)

    def _get_plotter(self) -> CorrelationPlotter:
        # The plotter works on the same state as the output.
        return CorrelationPlotter(self._state)
| r"""Implement an output to summarize the numeric columns of a | ||
| DataFrame.""" | ||
| from __future__ import annotations | ||
| __all__ = ["NumericSummaryOutput"] | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.content.numeric_summary import NumericSummaryContentGenerator | ||
| from arkas.evaluator2.vanilla import Evaluator | ||
| from arkas.output.lazy import BaseLazyOutput | ||
| from arkas.plotter.vanilla import Plotter | ||
| if TYPE_CHECKING: | ||
| from arkas.state.dataframe import DataFrameState | ||
class NumericSummaryOutput(BaseLazyOutput):
    r"""Implement an output that summarizes the numeric columns of a
    DataFrame.

    Only the content generator depends on the state: the evaluator
    and the plotter are created without argument.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import NumericSummaryOutput
    >>> from arkas.state import DataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int32, "col3": pl.Float64},
    ... )
    >>> output = NumericSummaryOutput(DataFrameState(frame))
    >>> output
    NumericSummaryOutput(
     (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    NumericSummaryContentGenerator(
     (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    Plotter(count=0)

    ```
    """

    def __init__(self, state: DataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` has the same type and an equal state."""
        if isinstance(other, type(self)):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def _get_content_generator(self) -> NumericSummaryContentGenerator:
        # The content generator works on the same state as the output.
        return NumericSummaryContentGenerator(self._state)

    def _get_evaluator(self) -> Evaluator:
        # An ``Evaluator`` created without argument is returned.
        return Evaluator()

    def _get_plotter(self) -> Plotter:
        # A ``Plotter`` created without argument is returned.
        return Plotter()
| r"""Implement the DataFrame summary output.""" | ||
| from __future__ import annotations | ||
| __all__ = ["SummaryOutput"] | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola import objects_are_equal | ||
| from arkas.content.summary import SummaryContentGenerator | ||
| from arkas.evaluator2.vanilla import Evaluator | ||
| from arkas.output.lazy import BaseLazyOutput | ||
| from arkas.plotter.vanilla import Plotter | ||
| from arkas.utils.validation import check_positive | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
class SummaryOutput(BaseLazyOutput):
    r"""Implement the output that summarizes a DataFrame.

    Only the content generator depends on the DataFrame: the evaluator
    and the plotter are created without argument.

    Args:
        frame: The DataFrame to analyze.
        top: The number of most frequent values to show. The value is
            validated with ``check_positive``.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import SummaryOutput
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.2, 4.2, 4.2, 2.2],
    ...         "col2": [1, 1, 1, 1],
    ...         "col3": [1, 2, 2, 2],
    ...     },
    ...     schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
    ... )
    >>> output = SummaryOutput(frame)
    >>> output
    SummaryOutput(shape=(4, 3), top=5)
    >>> output.get_content_generator()
    SummaryContentGenerator(shape=(4, 3), top=5)
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    Plotter(count=0)

    ```
    """

    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
        self._frame = frame
        check_positive(name="top", value=top)
        self._top = top

    def __repr__(self) -> str:
        name = type(self).__qualname__
        return f"{name}(shape={self._frame.shape}, top={self._top})"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` has the same type, ``top`` value and
        DataFrame as this object."""
        if not isinstance(other, type(self)):
            return False
        if self._top != other._top:
            return False
        return objects_are_equal(self._frame, other._frame, equal_nan=equal_nan)

    def _get_content_generator(self) -> SummaryContentGenerator:
        # The content generator receives the same DataFrame and ``top``.
        return SummaryContentGenerator(frame=self._frame, top=self._top)

    def _get_evaluator(self) -> Evaluator:
        # An ``Evaluator`` created without argument is returned.
        return Evaluator()

    def _get_plotter(self) -> Plotter:
        # A ``Plotter`` created without argument is returned.
        return Plotter()
| r"""Contain the implementation of a correlation plotter.""" | ||
| from __future__ import annotations | ||
| __all__ = ["BaseFigureCreator", "CorrelationPlotter", "MatplotlibFigureCreator"] | ||
| from abc import ABC, abstractmethod | ||
| from typing import TYPE_CHECKING, Any | ||
| import matplotlib.pyplot as plt | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.figure.creator import FigureCreatorRegistry | ||
| from arkas.figure.html import HtmlFigure | ||
| from arkas.figure.matplotlib import MatplotlibFigure, MatplotlibFigureConfig | ||
| from arkas.figure.utils import MISSING_FIGURE_MESSAGE | ||
| from arkas.plotter.base import BasePlotter | ||
| from arkas.plotter.vanilla import Plotter | ||
| from arkas.utils.dataframe import check_num_columns | ||
| from arkas.utils.range import find_range | ||
| if TYPE_CHECKING: | ||
| from arkas.figure.base import BaseFigure | ||
| from arkas.state.dataframe import DataFrameState | ||
class BaseFigureCreator(ABC):
    r"""Define the interface of the objects that create a figure from
    the two columns of a DataFrame.

    Concrete creators must implement ``create``.
    """

    @abstractmethod
    def create(self, state: DataFrameState) -> BaseFigure:
        r"""Create a figure from the two columns stored in the state.

        Args:
            state: The state containing the DataFrame to analyze.
                The DataFrame must have only 2 columns, which are the
                two columns to analyze.

        Returns:
            The generated figure.

        Example usage:

        ```pycon

        >>> import polars as pl
        >>> from arkas.figure import MatplotlibFigureConfig
        >>> from arkas.state import DataFrameState
        >>> from arkas.plotter.correlation import MatplotlibFigureCreator
        >>> creator = MatplotlibFigureCreator()
        >>> frame = pl.DataFrame(
        ...     {
        ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
        ...         "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
        ...     },
        ... )
        >>> fig = creator.create(DataFrameState(frame))

        ```
        """
class MatplotlibFigureCreator(BaseFigureCreator):
    r"""Create a matplotlib scatter figure from the two columns of a
    DataFrame.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.figure import MatplotlibFigureConfig
    >>> from arkas.state import DataFrameState
    >>> from arkas.plotter.correlation import MatplotlibFigureCreator
    >>> creator = MatplotlibFigureCreator()
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
    ...     },
    ... )
    >>> fig = creator.create(DataFrameState(frame))

    ```
    """

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        return f"{name}()"

    def create(self, state: DataFrameState) -> BaseFigure:
        frame = state.dataframe
        # A DataFrame without rows yields the placeholder HTML figure;
        # this is decided before the number of columns is checked.
        if frame.shape[0] == 0:
            return HtmlFigure(MISSING_FIGURE_MESSAGE)
        check_num_columns(frame, num_columns=2)

        config = state.figure_config
        xcol, ycol = frame.columns
        fig, ax = plt.subplots(**config.get_arg("init", {}))
        x = frame[xcol].to_numpy()
        y = frame[ycol].to_numpy()
        ax.scatter(x=x, y=y)

        # The axis limits are only set when the range is not empty.
        xmin, xmax = find_range(x, xmin=config.get_arg("xmin"), xmax=config.get_arg("xmax"))
        if xmin < xmax:
            ax.set_xlim(xmin, xmax)
        # ``find_range`` names its bounds ``xmin``/``xmax`` even when it
        # is used for the y axis.
        ymin, ymax = find_range(y, xmin=config.get_arg("ymin"), xmax=config.get_arg("ymax"))
        if ymin < ymax:
            ax.set_ylim(ymin, ymax)

        ax.set_xlabel(xcol)
        ax.set_ylabel(ycol)

        # The scales are optional: a falsy value keeps the axis scale
        # untouched.
        xscale = config.get_arg("xscale")
        if xscale:
            ax.set_xscale(xscale)
        yscale = config.get_arg("yscale")
        if yscale:
            ax.set_yscale(yscale)

        fig.tight_layout()
        return MatplotlibFigure(fig)
class CorrelationPlotter(BasePlotter):
    r"""Implement a plotter that shows the relation between two
    columns of a DataFrame.

    Args:
        state: The state containing the DataFrame to analyze.
            The DataFrame must have only 2 columns, which are the two
            columns to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.plotter import CorrelationPlotter
    >>> from arkas.state import DataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...         "col3": [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
    ...     },
    ... )
    >>> plotter = CorrelationPlotter(DataFrameState(frame))
    >>> plotter
    CorrelationPlotter(
     (state): DataFrameState(dataframe=(7, 2), nan_policy='propagate', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    # Maps a figure backend to the creator used to build the figure.
    registry = FigureCreatorRegistry[BaseFigureCreator](
        {MatplotlibFigureConfig.backend(): MatplotlibFigureCreator()}
    )

    def __init__(self, state: DataFrameState) -> None:
        # The state is only stored if its DataFrame has 2 columns.
        check_num_columns(state.dataframe, num_columns=2)
        self._state = state

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        name = self.__class__.__qualname__
        body = str_indent(str_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def compute(self) -> Plotter:
        # The figures are generated first, then wrapped in a ``Plotter``.
        figures = self.plot()
        return Plotter(figures)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def plot(self, prefix: str = "", suffix: str = "") -> dict:
        # The creator is selected from the backend of the figure config.
        backend = self._state.figure_config.backend()
        creator = self.registry.find_creator(backend)
        figure = creator.create(self._state)
        return {f"{prefix}correlation{suffix}": figure}
| r"""Implement DataFrame state with a target column.""" | ||
| from __future__ import annotations | ||
| __all__ = ["TargetDataFrameState"] | ||
| import sys | ||
| from typing import TYPE_CHECKING | ||
| from coola.utils.format import repr_mapping_line, str_indent, str_mapping | ||
| from arkas.state.dataframe import DataFrameState | ||
| from arkas.utils.dataframe import check_column_exist | ||
| if sys.version_info >= (3, 11): | ||
| from typing import Self | ||
| else: # pragma: no cover | ||
| from typing_extensions import ( | ||
| Self, # use backport because it was added in python 3.11 | ||
| ) | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
| from arkas.figure.base import BaseFigureConfig | ||
class TargetDataFrameState(DataFrameState):
    r"""Implement a DataFrame state that also stores the name of a
    target column.

    Args:
        dataframe: The DataFrame.
        target_column: The target column in the DataFrame. Its
            existence is verified with ``check_column_exist``.
        nan_policy: The policy on how to handle NaN values in the input
            arrays. The following options are available: ``'omit'``,
            ``'propagate'``, and ``'raise'``.
        figure_config: An optional figure configuration.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.state import TargetDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, 0],
    ...         "col2": [0, 1, 0, 1, 0, 1, 0],
    ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int32, "col3": pl.Float64},
    ... )
    >>> state = TargetDataFrameState(frame, target_column="col3")
    >>> state
    TargetDataFrameState(dataframe=(7, 3), target_column='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig())

    ```
    """

    def __init__(
        self,
        dataframe: pl.DataFrame,
        target_column: str,
        nan_policy: str = "propagate",
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        super().__init__(dataframe=dataframe, nan_policy=nan_policy, figure_config=figure_config)
        # The target column is checked after the parent initialization.
        check_column_exist(dataframe, target_column)
        self._target_column = target_column

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        fields = {
            "dataframe": self._dataframe.shape,
            "target_column": self._target_column,
            "nan_policy": self._nan_policy,
            "figure_config": self._figure_config,
        }
        return f"{name}({repr_mapping_line(fields)})"

    def __str__(self) -> str:
        name = self.__class__.__qualname__
        fields = {
            "dataframe": self._dataframe.shape,
            "target_column": self._target_column,
            "nan_policy": self._nan_policy,
            "figure_config": self._figure_config,
        }
        body = str_indent(str_mapping(fields))
        return f"{name}({body})"

    @property
    def target_column(self) -> str:
        return self._target_column

    def clone(self, deep: bool = True) -> Self:
        # A deep clone copies the DataFrame and the figure config,
        # whereas a shallow clone shares them with this state.
        if deep:
            dataframe = self._dataframe.clone()
        else:
            dataframe = self._dataframe
        if deep:
            figure_config = self._figure_config.clone()
        else:
            figure_config = self._figure_config
        return self.__class__(
            dataframe=dataframe,
            target_column=self._target_column,
            nan_policy=self._nan_policy,
            figure_config=figure_config,
        )

    def get_args(self) -> dict:
        # Extend the arguments of the parent state with the target column.
        parent_args = super().get_args()
        return parent_args | {"target_column": self._target_column}
+1
-1
| Metadata-Version: 2.1 | ||
| Name: arkas | ||
| Version: 0.0.1a10 | ||
| Version: 0.0.1a11 | ||
| Summary: Library to evaluate ML model performances | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/durandtibo/arkas |
+1
-1
| [tool.poetry] | ||
| name = "arkas" | ||
| version = "0.0.1a10" | ||
| version = "0.0.1a11" | ||
| description = "Library to evaluate ML model performances" | ||
@@ -5,0 +5,0 @@ readme = "README.md" |
@@ -13,9 +13,12 @@ r"""Contain DataFrame analyzers.""" | ||
| "ColumnCooccurrenceAnalyzer", | ||
| "ColumnCorrelationAnalyzer", | ||
| "ContentAnalyzer", | ||
| "ContinuousColumnAnalyzer", | ||
| "DataFrameSummaryAnalyzer", | ||
| "CorrelationAnalyzer", | ||
| "MappingAnalyzer", | ||
| "NullValueAnalyzer", | ||
| "NumericSummaryAnalyzer", | ||
| "PlotColumnAnalyzer", | ||
| "ScatterColumnAnalyzer", | ||
| "SummaryAnalyzer", | ||
| "TemporalNullValueAnalyzer", | ||
@@ -32,13 +35,16 @@ "TemporalPlotColumnAnalyzer", | ||
| from arkas.analyzer.column_cooccurrence import ColumnCooccurrenceAnalyzer | ||
| from arkas.analyzer.column_correlation import ColumnCorrelationAnalyzer | ||
| from arkas.analyzer.columns import BaseTruePredAnalyzer | ||
| from arkas.analyzer.content import ContentAnalyzer | ||
| from arkas.analyzer.continuous_column import ContinuousColumnAnalyzer | ||
| from arkas.analyzer.frame_summary import DataFrameSummaryAnalyzer | ||
| from arkas.analyzer.correlation import CorrelationAnalyzer | ||
| from arkas.analyzer.lazy import BaseInNLazyAnalyzer, BaseLazyAnalyzer | ||
| from arkas.analyzer.mapping import MappingAnalyzer | ||
| from arkas.analyzer.null_value import NullValueAnalyzer | ||
| from arkas.analyzer.numeric_summary import NumericSummaryAnalyzer | ||
| from arkas.analyzer.plot_column import PlotColumnAnalyzer | ||
| from arkas.analyzer.scatter_column import ScatterColumnAnalyzer | ||
| from arkas.analyzer.summary import SummaryAnalyzer | ||
| from arkas.analyzer.temporal_null_value import TemporalNullValueAnalyzer | ||
| from arkas.analyzer.temporal_plot_column import TemporalPlotColumnAnalyzer | ||
| from arkas.analyzer.transform import TransformAnalyzer |
@@ -40,6 +40,6 @@ r"""Define a base class to implement lazy analyzers.""" | ||
| >>> import polars as pl | ||
| >>> from arkas.analyzer import DataFrameSummaryAnalyzer | ||
| >>> analyzer = DataFrameSummaryAnalyzer() | ||
| >>> from arkas.analyzer import SummaryAnalyzer | ||
| >>> analyzer = SummaryAnalyzer() | ||
| >>> analyzer | ||
| DataFrameSummaryAnalyzer(top=5, sort=False) | ||
| SummaryAnalyzer(top=5, sort=False) | ||
| >>> frame = pl.DataFrame( | ||
@@ -55,3 +55,3 @@ ... { | ||
| >>> output | ||
| DataFrameSummaryOutput(shape=(4, 3), top=5) | ||
| SummaryOutput(shape=(4, 3), top=5) | ||
@@ -58,0 +58,0 @@ ``` |
@@ -65,3 +65,3 @@ r"""Implement an analyzer that plots the content of each column.""" | ||
| PlotColumnOutput( | ||
| (state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig()) | ||
| (state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -68,0 +68,0 @@ |
@@ -56,3 +56,3 @@ r"""Implement an analyzer that plots the content of each column.""" | ||
| ScatterColumnOutput( | ||
| (state): ScatterDataFrameState(dataframe=(4, 2), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): ScatterDataFrameState(dataframe=(4, 2), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -59,0 +59,0 @@ |
@@ -80,3 +80,3 @@ r"""Implement an analyzer that plots the content of each column.""" | ||
| TemporalNullValueOutput( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -83,0 +83,0 @@ |
@@ -79,3 +79,3 @@ r"""Implement an analyzer that plots the content of each column.""" | ||
| TemporalPlotColumnOutput( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -82,0 +82,0 @@ |
@@ -10,9 +10,12 @@ r"""Contain HTML content generators.""" | ||
| "ColumnCooccurrenceContentGenerator", | ||
| "ColumnCorrelationContentGenerator", | ||
| "ContentGenerator", | ||
| "ContentGeneratorDict", | ||
| "ContinuousSeriesContentGenerator", | ||
| "DataFrameSummaryContentGenerator", | ||
| "CorrelationContentGenerator", | ||
| "NullValueContentGenerator", | ||
| "NumericSummaryContentGenerator", | ||
| "PlotColumnContentGenerator", | ||
| "ScatterColumnContentGenerator", | ||
| "SummaryContentGenerator", | ||
| "TemporalNullValueContentGenerator", | ||
@@ -26,10 +29,13 @@ "TemporalPlotColumnContentGenerator", | ||
| from arkas.content.column_cooccurrence import ColumnCooccurrenceContentGenerator | ||
| from arkas.content.column_correlation import ColumnCorrelationContentGenerator | ||
| from arkas.content.continuous_series import ContinuousSeriesContentGenerator | ||
| from arkas.content.frame_summary import DataFrameSummaryContentGenerator | ||
| from arkas.content.correlation import CorrelationContentGenerator | ||
| from arkas.content.mapping import ContentGeneratorDict | ||
| from arkas.content.null_value import NullValueContentGenerator | ||
| from arkas.content.numeric_summary import NumericSummaryContentGenerator | ||
| from arkas.content.plot_column import PlotColumnContentGenerator | ||
| from arkas.content.scatter_column import ScatterColumnContentGenerator | ||
| from arkas.content.summary import SummaryContentGenerator | ||
| from arkas.content.temporal_null_value import TemporalNullValueContentGenerator | ||
| from arkas.content.temporal_plot_column import TemporalPlotColumnContentGenerator | ||
| from arkas.content.vanilla import ContentGenerator |
@@ -102,3 +102,3 @@ r"""Contain the implementation of a HTML content generator that returns | ||
| >>> from arkas.content.frame_summary import create_template | ||
| >>> from arkas.content.summary import create_template | ||
| >>> template = create_template() | ||
@@ -105,0 +105,0 @@ |
@@ -78,3 +78,3 @@ r"""Contain the implementation of a HTML content generator that analyzes | ||
| xmin=self._state.figure_config.get_arg("xmin"), | ||
| xmax=self._state.figure_config.get_arg("xmin"), | ||
| xmax=self._state.figure_config.get_arg("xmax"), | ||
| ) | ||
@@ -81,0 +81,0 @@ return Template(create_template()).render( |
@@ -49,3 +49,3 @@ r"""Contain the implementation of a HTML content generator that plots | ||
| PlotColumnContentGenerator( | ||
| (state): DataFrameState(dataframe=(7, 3), figure_config=MatplotlibFigureConfig()) | ||
| (state): DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -52,0 +52,0 @@ |
@@ -51,3 +51,3 @@ r"""Contain the implementation of a HTML content generator that plots | ||
| ScatterColumnContentGenerator( | ||
| (state): ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -54,0 +54,0 @@ |
@@ -67,3 +67,3 @@ r"""Contain the implementation of a HTML content generator that analyzes | ||
| TemporalNullValueContentGenerator( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -70,0 +70,0 @@ |
@@ -64,3 +64,3 @@ r"""Contain the implementation of a HTML content generator that plots | ||
| TemporalPlotColumnContentGenerator( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -67,0 +67,0 @@ |
@@ -10,2 +10,4 @@ r"""Contain data evaluators.""" | ||
| "ColumnCooccurrenceEvaluator", | ||
| "ColumnCorrelationEvaluator", | ||
| "CorrelationEvaluator", | ||
| "Evaluator", | ||
@@ -19,3 +21,5 @@ "EvaluatorDict", | ||
| from arkas.evaluator2.column_cooccurrence import ColumnCooccurrenceEvaluator | ||
| from arkas.evaluator2.column_correlation import ColumnCorrelationEvaluator | ||
| from arkas.evaluator2.correlation import CorrelationEvaluator | ||
| from arkas.evaluator2.mapping import EvaluatorDict | ||
| from arkas.evaluator2.vanilla import Evaluator |
@@ -11,7 +11,9 @@ r"""Contain data outputs.""" | ||
| "ColumnCooccurrenceOutput", | ||
| "ColumnCorrelationOutput", | ||
| "ContentOutput", | ||
| "ContinuousSeriesOutput", | ||
| "DataFrameSummaryOutput", | ||
| "CorrelationOutput", | ||
| "EmptyOutput", | ||
| "NullValueOutput", | ||
| "NumericSummaryOutput", | ||
| "Output", | ||
@@ -21,2 +23,3 @@ "OutputDict", | ||
| "ScatterColumnOutput", | ||
| "SummaryOutput", | ||
| "TemporalNullValueOutput", | ||
@@ -30,13 +33,16 @@ "TemporalPlotColumnOutput", | ||
| from arkas.output.column_cooccurrence import ColumnCooccurrenceOutput | ||
| from arkas.output.column_correlation import ColumnCorrelationOutput | ||
| from arkas.output.content import ContentOutput | ||
| from arkas.output.continuous_series import ContinuousSeriesOutput | ||
| from arkas.output.correlation import CorrelationOutput | ||
| from arkas.output.empty import EmptyOutput | ||
| from arkas.output.frame_summary import DataFrameSummaryOutput | ||
| from arkas.output.lazy import BaseLazyOutput | ||
| from arkas.output.mapping import OutputDict | ||
| from arkas.output.null_value import NullValueOutput | ||
| from arkas.output.numeric_summary import NumericSummaryOutput | ||
| from arkas.output.plot_column import PlotColumnOutput | ||
| from arkas.output.scatter_column import ScatterColumnOutput | ||
| from arkas.output.summary import SummaryOutput | ||
| from arkas.output.temporal_null_value import TemporalNullValueOutput | ||
| from arkas.output.temporal_plot_column import TemporalPlotColumnOutput | ||
| from arkas.output.vanilla import Output |
@@ -44,7 +44,7 @@ r"""Implement an output to plot each column of a DataFrame.""" | ||
| PlotColumnOutput( | ||
| (state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig()) | ||
| (state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
| >>> output.get_content_generator() | ||
| PlotColumnContentGenerator( | ||
| (state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig()) | ||
| (state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -55,3 +55,3 @@ >>> output.get_evaluator() | ||
| PlotColumnPlotter( | ||
| (state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig()) | ||
| (state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -58,0 +58,0 @@ |
@@ -17,3 +17,3 @@ r"""Implement an output to scatter plot some columns.""" | ||
| if TYPE_CHECKING: | ||
| from arkas.state.temporal_dataframe import ScatterDataFrameState | ||
| from arkas.state.scatter_dataframe import ScatterDataFrameState | ||
@@ -45,7 +45,7 @@ | ||
| ScatterColumnOutput( | ||
| (state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
| >>> output.get_content_generator() | ||
| ScatterColumnContentGenerator( | ||
| (state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -56,3 +56,3 @@ >>> output.get_evaluator() | ||
| ScatterColumnPlotter( | ||
| (state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -59,0 +59,0 @@ |
@@ -60,7 +60,7 @@ r"""Implement an output to analyze the number of null values in a | ||
| TemporalNullValueOutput( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
| >>> output.get_content_generator() | ||
| TemporalNullValueContentGenerator( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -71,3 +71,3 @@ >>> output.get_evaluator() | ||
| TemporalNullValuePlotter( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -74,0 +74,0 @@ |
@@ -60,7 +60,7 @@ r"""Implement an output to plot each column of a DataFrame along a | ||
| TemporalPlotColumnOutput( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
| >>> output.get_content_generator() | ||
| TemporalPlotColumnContentGenerator( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -71,3 +71,3 @@ >>> output.get_evaluator() | ||
| TemporalPlotColumnPlotter( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -74,0 +74,0 @@ |
@@ -9,2 +9,3 @@ r"""Contain data plotters.""" | ||
| "ContinuousSeriesPlotter", | ||
| "CorrelationPlotter", | ||
| "NullValuePlotter", | ||
@@ -22,2 +23,3 @@ "PlotColumnPlotter", | ||
| from arkas.plotter.continuous_series import ContinuousSeriesPlotter | ||
| from arkas.plotter.correlation import CorrelationPlotter | ||
| from arkas.plotter.mapping import PlotterDict | ||
@@ -24,0 +26,0 @@ from arkas.plotter.null_value import NullValuePlotter |
@@ -131,3 +131,3 @@ r"""Contain the implementation of a DataFrame column plotter.""" | ||
| PlotColumnPlotter( | ||
| (state): DataFrameState(dataframe=(4, 3), figure_config=MatplotlibFigureConfig()) | ||
| (state): DataFrameState(dataframe=(4, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -134,0 +134,0 @@ |
@@ -35,3 +35,3 @@ r"""Contain the implementation of a DataFrame column plotter.""" | ||
| Args: | ||
| state: The state containing the DataFrame to analyze. | ||
| state: The state containing the DataFrame to analyze. | ||
@@ -153,3 +153,3 @@ Returns: | ||
| ScatterColumnPlotter( | ||
| (state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color='col3', figure_config=MatplotlibFigureConfig()) | ||
| (state): ScatterDataFrameState(dataframe=(4, 3), x='col1', y='col2', color='col3', nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -156,0 +156,0 @@ |
@@ -178,3 +178,3 @@ r"""Contain the implementation of a DataFrame column plotter.""" | ||
| TemporalNullValuePlotter( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -181,0 +181,0 @@ |
@@ -186,3 +186,3 @@ r"""Contain the implementation of a DataFrame column plotter.""" | ||
| TemporalPlotColumnPlotter( | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
| ) | ||
@@ -189,0 +189,0 @@ |
@@ -14,2 +14,3 @@ r"""Contain states.""" | ||
| "SeriesState", | ||
| "TargetDataFrameState", | ||
| "TemporalDataFrameState", | ||
@@ -26,2 +27,3 @@ ] | ||
| from arkas.state.series import SeriesState | ||
| from arkas.state.target_dataframe import TargetDataFrameState | ||
| from arkas.state.temporal_dataframe import TemporalDataFrameState |
@@ -14,2 +14,3 @@ r"""Implement the DataFrame state.""" | ||
| from arkas.figure.utils import get_default_config | ||
| from arkas.metric.utils import check_nan_policy | ||
| from arkas.state.base import BaseState | ||
@@ -35,2 +36,5 @@ | ||
| dataframe: The DataFrame. | ||
| nan_policy: The policy on how to handle NaN values in the input | ||
| arrays. The following options are available: ``'omit'``, | ||
| ``'propagate'``, and ``'raise'``. | ||
| figure_config: An optional figure configuration. | ||
@@ -53,3 +57,3 @@ | ||
| >>> state | ||
| DataFrameState(dataframe=(7, 3), figure_config=MatplotlibFigureConfig()) | ||
| DataFrameState(dataframe=(7, 3), nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
@@ -62,5 +66,8 @@ ``` | ||
| dataframe: pl.DataFrame, | ||
| nan_policy: str = "propagate", | ||
| figure_config: BaseFigureConfig | None = None, | ||
| ) -> None: | ||
| self._dataframe = dataframe | ||
| check_nan_policy(nan_policy) | ||
| self._nan_policy = nan_policy | ||
| self._figure_config = figure_config or get_default_config() | ||
@@ -72,2 +79,3 @@ | ||
| "dataframe": self._dataframe.shape, | ||
| "nan_policy": self._nan_policy, | ||
| "figure_config": self._figure_config, | ||
@@ -83,2 +91,3 @@ } | ||
| "dataframe": self._dataframe.shape, | ||
| "nan_policy": self._nan_policy, | ||
| "figure_config": self._figure_config, | ||
@@ -95,2 +104,6 @@ } | ||
| @property | ||
| def nan_policy(self) -> str: | ||
| return self._nan_policy | ||
| @property | ||
| def figure_config(self) -> BaseFigureConfig | None: | ||
@@ -102,2 +115,3 @@ return self._figure_config | ||
| dataframe=self._dataframe.clone() if deep else self._dataframe, | ||
| nan_policy=self._nan_policy, | ||
| figure_config=self._figure_config.clone() if deep else self._figure_config, | ||
@@ -114,3 +128,4 @@ ) | ||
| "dataframe": self._dataframe, | ||
| "nan_policy": self._nan_policy, | ||
| "figure_config": self._figure_config, | ||
| } |
@@ -36,2 +36,5 @@ r"""Implement the DataFrame state for scatter plots.""" | ||
| color: An optional color axis data column. | ||
| nan_policy: The policy on how to handle NaN values in the input | ||
| arrays. The following options are available: ``'omit'``, | ||
| ``'propagate'``, and ``'raise'``. | ||
| figure_config: An optional figure configuration. | ||
@@ -54,3 +57,3 @@ | ||
| >>> state | ||
| ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, figure_config=MatplotlibFigureConfig()) | ||
| ScatterDataFrameState(dataframe=(7, 3), x='col1', y='col2', color=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
@@ -66,5 +69,6 @@ ``` | ||
| color: str | None = None, | ||
| nan_policy: str = "propagate", | ||
| figure_config: BaseFigureConfig | None = None, | ||
| ) -> None: | ||
| super().__init__(dataframe=dataframe, figure_config=figure_config) | ||
| super().__init__(dataframe=dataframe, nan_policy=nan_policy, figure_config=figure_config) | ||
@@ -86,2 +90,3 @@ check_column_exist(dataframe, x) | ||
| "color": self._color, | ||
| "nan_policy": self._nan_policy, | ||
| "figure_config": self._figure_config, | ||
@@ -100,2 +105,3 @@ } | ||
| "color": self._color, | ||
| "nan_policy": self._nan_policy, | ||
| "figure_config": self._figure_config, | ||
@@ -125,2 +131,3 @@ } | ||
| color=self._color, | ||
| nan_policy=self._nan_policy, | ||
| figure_config=self._figure_config.clone() if deep else self._figure_config, | ||
@@ -127,0 +134,0 @@ ) |
@@ -35,2 +35,5 @@ r"""Implement the temporal DataFrame state.""" | ||
| period: An optional temporal period e.g. monthly or daily. | ||
| nan_policy: The policy on how to handle NaN values in the input | ||
| arrays. The following options are available: ``'omit'``, | ||
| ``'propagate'``, and ``'raise'``. | ||
| figure_config: An optional figure configuration. | ||
@@ -66,3 +69,3 @@ | ||
| >>> state | ||
| TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig()) | ||
| TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, nan_policy='propagate', figure_config=MatplotlibFigureConfig()) | ||
@@ -77,5 +80,6 @@ ``` | ||
| period: str | None = None, | ||
| nan_policy: str = "propagate", | ||
| figure_config: BaseFigureConfig | None = None, | ||
| ) -> None: | ||
| super().__init__(dataframe=dataframe, figure_config=figure_config) | ||
| super().__init__(dataframe=dataframe, nan_policy=nan_policy, figure_config=figure_config) | ||
@@ -92,2 +96,3 @@ check_column_exist(dataframe, temporal_column) | ||
| "period": self._period, | ||
| "nan_policy": self._nan_policy, | ||
| "figure_config": self._figure_config, | ||
@@ -105,2 +110,3 @@ } | ||
| "period": self._period, | ||
| "nan_policy": self._nan_policy, | ||
| "figure_config": self._figure_config, | ||
@@ -125,2 +131,3 @@ } | ||
| period=self._period, | ||
| nan_policy=self._nan_policy, | ||
| figure_config=self._figure_config.clone() if deep else self._figure_config, | ||
@@ -127,0 +134,0 @@ ) |
@@ -5,3 +5,3 @@ r"""Contain DataFrame utility functions.""" | ||
| __all__ = ["to_arrays"] | ||
| __all__ = ["check_column_exist", "check_num_columns", "to_arrays"] | ||
@@ -16,10 +16,11 @@ | ||
| def to_arrays(frame: pl.DataFrame) -> dict[str, np.ndarray]: | ||
| r"""Convert a ``polars.DataFrame`` to a dictionary of NumPy arrays. | ||
| def check_column_exist(frame: pl.DataFrame, col: str) -> None: | ||
| r"""Check if a column exists in the DataFrame. | ||
| Args: | ||
| frame: The DataFrame to convert. | ||
| frame: The DataFrame. | ||
| col: The column to check. | ||
| Returns: | ||
| A dictionary of NumPy arrays. | ||
| Raises: | ||
| ValueError: if the column is missing. | ||
@@ -31,3 +32,3 @@ Example usage: | ||
| >>> import polars as pl | ||
| >>> from arkas.utils.dataframe import to_arrays | ||
| >>> from arkas.utils.dataframe import check_column_exist | ||
| >>> frame = pl.DataFrame( | ||
@@ -41,22 +42,21 @@ ... { | ||
| ... ) | ||
| >>> data = to_arrays(frame) | ||
| >>> data | ||
| {'int': array([1, 2, 3, 4, 5]), | ||
| 'float': array([5., 4., 3., 2., 1.]), | ||
| 'str': array(['a', 'b', 'c', 'd', 'e'], dtype=object)} | ||
| >>> check_column_exist(frame, "int") | ||
| ``` | ||
| """ | ||
| return {s.name: s.to_numpy() for s in frame.iter_columns()} | ||
| if col not in frame: | ||
| msg = f"The column {col!r} is not in the DataFrame: {sorted(frame.columns)}" | ||
| raise ValueError(msg) | ||
| def check_column_exist(frame: pl.DataFrame, col: str) -> None: | ||
| r"""Check if a column exists in the DataFrame. | ||
| def check_num_columns(frame: pl.DataFrame, num_columns: int) -> None: | ||
| r"""Check if the DataFrame has the expected number of columns. | ||
| Args: | ||
| frame: The DataFrame. | ||
| col: The column to check. | ||
| num_columns: The expected number of columns. | ||
| Raises: | ||
| ValueError: if the column is missing. | ||
| ValueError: if the DataFrame has not the expected number of | ||
| columns. | ||
@@ -68,3 +68,3 @@ Example usage: | ||
| >>> import polars as pl | ||
| >>> from arkas.utils.dataframe import check_column_exist | ||
| >>> from arkas.utils.dataframe import check_num_columns | ||
| >>> frame = pl.DataFrame( | ||
@@ -78,8 +78,45 @@ ... { | ||
| ... ) | ||
| >>> check_column_exist(frame, "int") | ||
| >>> check_num_columns(frame, num_columns=3) | ||
| ``` | ||
| """ | ||
| if col not in frame: | ||
| msg = f"The column {col!r} is not in the DataFrame: {sorted(frame.columns)}" | ||
| if frame.shape[1] != num_columns: | ||
| msg = ( | ||
| f"The DataFrame must have {num_columns:,} columns but received a DataFrame of " | ||
| f"shape {frame.shape}" | ||
| ) | ||
| raise ValueError(msg) | ||
def to_arrays(frame: pl.DataFrame) -> dict[str, np.ndarray]:
    r"""Build a mapping from column names to NumPy arrays.

    Every column of the input DataFrame is converted with
    ``to_numpy`` and stored under the name of the column. The keys
    follow the order in which ``iter_columns`` yields the columns.

    Args:
        frame: The DataFrame whose columns are converted.

    Returns:
        A dictionary that maps each column name to the NumPy array
            with the values of that column.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.utils.dataframe import to_arrays
    >>> frame = pl.DataFrame(
    ...     {
    ...         "int": [1, 2, 3, 4, 5],
    ...         "float": [5.0, 4.0, 3.0, 2.0, 1.0],
    ...         "str": ["a", "b", "c", "d", "e"],
    ...     },
    ...     schema={"int": pl.Int64, "float": pl.Float64, "str": pl.String},
    ... )
    >>> data = to_arrays(frame)
    >>> data
    {'int': array([1, 2, 3, 4, 5]),
     'float': array([5., 4., 3., 2., 1.]),
     'str': array(['a', 'b', 'c', 'd', 'e'], dtype=object)}

    ```
    """
    arrays = {}
    for series in frame.iter_columns():
        arrays[series.name] = series.to_numpy()
    return arrays
| r"""Implement an analyzer that generates a summary of the DataFrame.""" | ||
| from __future__ import annotations | ||
| __all__ = ["DataFrameSummaryAnalyzer"] | ||
| import logging | ||
| from typing import TYPE_CHECKING | ||
| from arkas.analyzer.lazy import BaseLazyAnalyzer | ||
| from arkas.output.frame_summary import DataFrameSummaryOutput | ||
| from arkas.utils.validation import check_positive | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
| logger = logging.getLogger(__name__) | ||
class DataFrameSummaryAnalyzer(BaseLazyAnalyzer):
    r"""Analyzer that wraps a DataFrame into a summary output.

    Args:
        top: The number of most frequent values to show. The value is
            validated with ``check_positive``.
        sort: If ``True``, the columns are reordered alphabetically
            before the summary output is created.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import DataFrameSummaryAnalyzer
    >>> analyzer = DataFrameSummaryAnalyzer()
    >>> analyzer
    DataFrameSummaryAnalyzer(top=5, sort=False)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 0, 1],
    ...         "col2": [1, 0, 1, 0],
    ...         "col3": [1, 1, 1, 1],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int64, "col3": pl.Int64},
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    DataFrameSummaryOutput(shape=(4, 3), top=5)

    ```
    """

    def __init__(self, top: int = 5, sort: bool = False) -> None:
        # Validate before storing anything on the instance.
        check_positive(name="top", value=top)
        self._sort = bool(sort)
        self._top = top

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        return f"{name}(top={self._top:,}, sort={self._sort})"

    def _analyze(self, frame: pl.DataFrame) -> DataFrameSummaryOutput:
        r"""Create the summary output for ``frame``.

        Args:
            frame: The DataFrame to summarize.

        Returns:
            The summary output built from the (optionally
                column-sorted) DataFrame.
        """
        logger.info("Analyzing the DataFrame...")
        # Only the column order changes when sorting is enabled.
        summarized = frame.select(sorted(frame.columns)) if self._sort else frame
        return DataFrameSummaryOutput(frame=summarized, top=self._top)
| r"""Contain the implementation of a HTML content generator that returns | ||
| a summary of a DataFrame.""" | ||
| from __future__ import annotations | ||
| __all__ = [ | ||
| "DataFrameSummaryContentGenerator", | ||
| "create_table", | ||
| "create_table_row", | ||
| "create_template", | ||
| ] | ||
| import logging | ||
| from collections import Counter | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola import objects_are_equal | ||
| from grizz.utils.count import compute_nunique | ||
| from grizz.utils.null import compute_null_count | ||
| from jinja2 import Template | ||
| from arkas.content.section import BaseSectionContentGenerator | ||
| from arkas.utils.validation import check_positive | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
| import polars as pl | ||
| logger = logging.getLogger(__name__) | ||
class DataFrameSummaryContentGenerator(BaseSectionContentGenerator):
    r"""Content generator that renders a per-column summary of a
    DataFrame.

    Args:
        frame: The DataFrame to analyze.
        top: The number of most frequent values to show. The value is
            validated with ``check_positive``.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import DataFrameSummaryContentGenerator
    >>> content = DataFrameSummaryContentGenerator(
    ...     frame=pl.DataFrame(
    ...         {
    ...             "col1": [1.2, 4.2, 4.2, 2.2],
    ...             "col2": [1, 1, 1, 1],
    ...             "col3": [1, 2, 2, 2],
    ...         },
    ...         schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
    ...     )
    ... )
    >>> content
    DataFrameSummaryContentGenerator(shape=(4, 3), top=5)

    ```
    """

    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
        self._frame = frame
        check_positive(name="top", value=top)
        self._top = top

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        return f"{name}(shape={self._frame.shape}, top={self._top})"

    @property
    def frame(self) -> pl.DataFrame:
        r"""The DataFrame to analyze."""
        return self._frame

    @property
    def top(self) -> int:
        r"""The number of most frequent values to show."""
        return self._top

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` is a generator of the same class with
        the same ``top`` value and an equal DataFrame.

        Args:
            other: The object to compare with.
            equal_nan: Forwarded to ``objects_are_equal`` when the
                DataFrames are compared.

        Returns:
            ``True`` if the two generators are equal, otherwise
                ``False``.
        """
        if not isinstance(other, self.__class__):
            return False
        if self.top != other.top:
            return False
        return objects_are_equal(self.frame, other.frame, equal_nan=equal_nan)

    def get_columns(self) -> tuple[str, ...]:
        r"""Return the column names of the DataFrame as a tuple."""
        return tuple(self._frame.columns)

    def get_null_count(self) -> tuple[int, ...]:
        r"""Return the output of ``compute_null_count`` for the
        DataFrame as a tuple."""
        counts = compute_null_count(self._frame)
        return tuple(counts.tolist())

    def get_nunique(self) -> tuple[int, ...]:
        r"""Return the output of ``compute_nunique`` for the DataFrame
        as a tuple."""
        counts = compute_nunique(self._frame)
        return tuple(counts.tolist())

    def get_dtypes(self) -> tuple[pl.DataType, ...]:
        r"""Return the data types found in the DataFrame schema as a
        tuple."""
        return tuple(self._frame.schema.dtypes())

    def get_most_frequent_values(self, top: int = 5) -> tuple[tuple[tuple[Any, int], ...], ...]:
        r"""Return the most frequent values of each column.

        Args:
            top: The maximum number of ``(value, count)`` pairs to
                keep for each column.

        Returns:
            One tuple per column, each containing the
                ``(value, count)`` pairs returned by
                ``Counter.most_common``.
        """
        per_column = []
        for series in self.frame:
            frequencies = Counter(series.to_list())
            per_column.append(tuple(frequencies.most_common(top)))
        return tuple(per_column)

    def generate_content(self) -> str:
        r"""Render the content template with the summary table and the
        DataFrame shape.

        Returns:
            The rendered content.
        """
        logger.info("Generating the DataFrame summary content...")
        nrows, ncols = self._frame.shape[0], self._frame.shape[1]
        context = {
            "table": self._create_table(),
            "nrows": f"{nrows:,}",
            "ncols": f"{ncols:,}",
        }
        return Template(create_template()).render(context)

    def _create_table(self) -> str:
        r"""Gather the per-column statistics and build the HTML
        table."""
        return create_table(
            columns=self.get_columns(),
            null_count=self.get_null_count(),
            nunique=self.get_nunique(),
            dtypes=self.get_dtypes(),
            most_frequent_values=self.get_most_frequent_values(top=self._top),
            total=self._frame.shape[0],
        )
| def create_template() -> str: | ||
| r"""Return the template of the content. | ||
| The returned string is rendered as a Jinja2 template by | ||
| ``DataFrameSummaryContentGenerator.generate_content``. It contains | ||
| three placeholders: ``ncols``, ``nrows`` and ``table``. | ||
| Returns: | ||
| The content template. | ||
| Example usage: | ||
| ```pycon | ||
| >>> from arkas.content.frame_summary import create_template | ||
| >>> template = create_template() | ||
| ``` | ||
| """ | ||
| return """This section shows a short summary of each column. | ||
| <ul> | ||
| <li> <b>column</b>: are the column names</li> | ||
| <li> <b>types</b>: are the object types for the objects in the column </li> | ||
| <li> <b>null</b>: are the number (and percentage) of null values in the column </li> | ||
| <li> <b>unique</b>: are the number (and percentage) of unique values in the column </li> | ||
| </ul> | ||
| <p style="margin-top: 1rem;"> | ||
| <b>General statistics about the DataFrame</b> | ||
| <ul> | ||
| <li> number of columns: {{ncols}} </li> | ||
| <li> number of rows: {{nrows}}</li> | ||
| </ul> | ||
| {{table}} | ||
| """ | ||
def create_table(
    columns: Sequence[str],
    null_count: Sequence[int],
    nunique: Sequence[int],
    dtypes: Sequence[pl.DataType],
    most_frequent_values: Sequence[Sequence[tuple[Any, int]]],
    total: int,
) -> str:
    r"""Return a HTML representation of a table with the summary of
    each column.

    One row is created per column with ``create_table_row``. The
    per-column sequences are combined with ``zip``, so the number of
    rows is the length of the shortest sequence.

    Args:
        columns: The column names.
        null_count: The number of null values for each column.
        nunique: The number of unique values for each column.
        dtypes: The data type for each column.
        most_frequent_values: The most frequent values for each column.
        total: The total number of rows.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.frame_summary import create_table
    >>> table = create_table(
    ...     columns=["float", "int", "str"],
    ...     null_count=(1, 0, 2),
    ...     nunique=(5, 2, 4),
    ...     dtypes=(pl.Float64(), pl.Int64(), pl.String()),
    ...     most_frequent_values=(
    ...         ((2.2, 2), (1.2, 1), (4.2, 1), (None, 1), (1.0, 1)),
    ...         ((1, 5), (0, 1)),
    ...         (("B", 2), (None, 2), ("A", 1), ("C", 1)),
    ...     ),
    ...     total=42,
    ... )

    ```
    """
    rows = "\n".join(
        create_table_row(
            column=column,
            null=null,
            dtype=dtype,
            nunique=nuniq,
            most_frequent_values=mf_values,
            total=total,
        )
        for column, null, nuniq, dtype, mf_values in zip(
            columns, null_count, nunique, dtypes, most_frequent_values
        )
    )
    return Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>types</th>
<th>null</th>
<th>unique</th>
<th>most frequent values</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    ).render({"rows": rows})
def create_table_row(
    column: str,
    null: int,
    nunique: int,
    dtype: pl.DataType,
    most_frequent_values: Sequence[tuple[Any, int]],
    total: int,
) -> str:
    r"""Create the HTML code of a new table row.

    Every count is shown together with its percentage of ``total``.
    When ``total`` is ``0``, all the percentages (null, unique and most
    frequent values) are reported as ``nan`` instead of raising a
    ``ZeroDivisionError``.

    Args:
        column: The column name.
        null: The number of null values.
        nunique: The number of unique values.
        dtype: The data type of the column.
        most_frequent_values: The most frequent values, as
            ``(value, count)`` pairs.
        total: The total number of rows.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.frame_summary import create_table_row
    >>> row = create_table_row(
    ...     column="col",
    ...     null=5,
    ...     nunique=42,
    ...     dtype=pl.Float64(),
    ...     most_frequent_values=[("C", 12), ("A", 5), ("B", 4)],
    ...     total=100,
    ... )

    ```
    """

    def percent(count: int) -> float:
        # A single guarded helper keeps the three statistics consistent
        # when there are no rows (``total == 0``).
        return 100 * count / total if total else float("nan")

    null_text = f"{null:,} ({percent(null):.2f}%)"
    nunique_text = f"{nunique:,} ({percent(nunique):.2f}%)"
    mf_text = ", ".join(f"{val} ({percent(c):.2f}%)" for val, c in most_frequent_values)
    return Template(
        """<tr>
<th>{{column}}</th>
<td>{{dtype}}</td>
<td {{num_style}}>{{null}}</td>
<td {{num_style}}>{{nunique}}</td>
<td>{{most_frequent_values}}</td>
</tr>"""
    ).render(
        {
            "num_style": 'style="text-align: right;"',
            "column": column,
            "null": null_text,
            "dtype": dtype,
            "nunique": nunique_text,
            "most_frequent_values": mf_text,
        }
    )
| r"""Implement the DataFrame summary output.""" | ||
| from __future__ import annotations | ||
| __all__ = ["DataFrameSummaryOutput"] | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola import objects_are_equal | ||
| from arkas.content.frame_summary import DataFrameSummaryContentGenerator | ||
| from arkas.evaluator2.vanilla import Evaluator | ||
| from arkas.output.lazy import BaseLazyOutput | ||
| from arkas.plotter.vanilla import Plotter | ||
| from arkas.utils.validation import check_positive | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
class DataFrameSummaryOutput(BaseLazyOutput):
    r"""Output that holds a DataFrame and exposes its summary.

    Args:
        frame: The DataFrame to analyze.
        top: The number of most frequent values to show. The value is
            validated with ``check_positive``.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import DataFrameSummaryOutput
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [1.2, 4.2, 4.2, 2.2],
    ...         "col2": [1, 1, 1, 1],
    ...         "col3": [1, 2, 2, 2],
    ...     },
    ...     schema={"col1": pl.Float64, "col2": pl.Int64, "col3": pl.Int64},
    ... )
    >>> output = DataFrameSummaryOutput(frame)
    >>> output
    DataFrameSummaryOutput(shape=(4, 3), top=5)
    >>> output.get_content_generator()
    DataFrameSummaryContentGenerator(shape=(4, 3), top=5)
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    Plotter(count=0)

    ```
    """

    def __init__(self, frame: pl.DataFrame, top: int = 5) -> None:
        self._frame = frame
        check_positive(name="top", value=top)
        self._top = top

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        return f"{name}(shape={self._frame.shape}, top={self._top})"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        r"""Indicate if ``other`` is an output of the same class with
        the same ``top`` value and an equal DataFrame.

        Args:
            other: The object to compare with.
            equal_nan: Forwarded to ``objects_are_equal`` when the
                DataFrames are compared.

        Returns:
            ``True`` if the two outputs are equal, otherwise ``False``.
        """
        if not isinstance(other, self.__class__):
            return False
        if self._top != other._top:
            return False
        return objects_are_equal(self._frame, other._frame, equal_nan=equal_nan)

    def _get_content_generator(self) -> DataFrameSummaryContentGenerator:
        r"""Create the content generator for the stored DataFrame."""
        return DataFrameSummaryContentGenerator(frame=self._frame, top=self._top)

    def _get_evaluator(self) -> Evaluator:
        r"""Return a new ``Evaluator`` created without arguments."""
        return Evaluator()

    def _get_plotter(self) -> Plotter:
        r"""Return a new ``Plotter`` created without arguments."""
        return Plotter()
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
977606
6.95%264
5.6%24448
6.75%