arkas
Advanced tools
| r"""Implement an analyzer that analyzes a column with continuous | ||
| values.""" | ||
| from __future__ import annotations | ||
| __all__ = ["ContinuousColumnAnalyzer"] | ||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola import objects_are_equal | ||
| from coola.utils.format import repr_mapping_line | ||
| from arkas.analyzer.lazy import BaseLazyAnalyzer | ||
| from arkas.output.continuous_series import ContinuousSeriesOutput | ||
| from arkas.state.series import SeriesState | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
| from arkas.figure import BaseFigureConfig | ||
| logger = logging.getLogger(__name__) | ||
class ContinuousColumnAnalyzer(BaseLazyAnalyzer):
    r"""Analyze a single DataFrame column that holds continuous values.

    The analyzer extracts the configured column from the input
    DataFrame and wraps it, together with the figure configuration,
    in a ``SeriesState`` carried by a ``ContinuousSeriesOutput``.

    Args:
        column: The name of the column to analyze.
        figure_config: The figure configuration. It is forwarded
            unchanged to the state.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import ContinuousColumnAnalyzer
    >>> analyzer = ContinuousColumnAnalyzer(column="col1")
    >>> analyzer
    ContinuousColumnAnalyzer(column='col1', figure_config=None)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 0, 1],
    ...         "col2": [1, 0, 1, 0],
    ...         "col3": [1, 1, 1, 1],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int64, "col3": pl.Int64},
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    ContinuousSeriesOutput(
      (state): SeriesState(name='col1', values=(4,), figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, column: str, figure_config: BaseFigureConfig | None = None) -> None:
        self._column = column
        self._figure_config = figure_config

    def __repr__(self) -> str:
        return f"{self.__class__.__qualname__}({repr_mapping_line(self.get_args())})"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Only instances of the same class can be equal; the comparison
        # itself is delegated to the constructor arguments.
        if isinstance(other, self.__class__):
            return objects_are_equal(self.get_args(), other.get_args(), equal_nan=equal_nan)
        return False

    def get_args(self) -> dict:
        # The insertion order drives the order shown by ``__repr__``.
        args = {}
        args["column"] = self._column
        args["figure_config"] = self._figure_config
        return args

    def _analyze(self, frame: pl.DataFrame) -> ContinuousSeriesOutput:
        logger.info(f"Analyzing the continuous distribution of column {self._column!r}...")
        state = SeriesState(series=frame[self._column], figure_config=self._figure_config)
        return ContinuousSeriesOutput(state=state)
| r"""Implement an analyzer that analyzes the number of null values per column.""" | ||
| from __future__ import annotations | ||
| __all__ = ["NullValueAnalyzer"] | ||
| import logging | ||
| from typing import TYPE_CHECKING | ||
| from grizz.utils.format import str_shape_diff | ||
| from arkas.analyzer.lazy import BaseInNLazyAnalyzer | ||
| from arkas.output.null_value import NullValueOutput | ||
| from arkas.state.null_value import NullValueState | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
| import polars as pl | ||
| from arkas.figure import BaseFigureConfig | ||
| logger = logging.getLogger(__name__) | ||
class NullValueAnalyzer(BaseInNLazyAnalyzer):
    r"""Implement an analyzer that analyzes the number of null values
    per column.

    The analyzer selects the requested columns that are present in the
    input DataFrame and builds a ``NullValueState`` from the resulting
    DataFrame.

    Args:
        columns: The columns to analyze. If ``None``, it analyzes all
            the columns.
        exclude_columns: The columns to exclude from the input
            ``columns``. If any column is not found, it will be ignored
            during the filtering process.
        missing_policy: The policy on how to handle missing columns.
            The following options are available: ``'ignore'``,
            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
            is raised if at least one column is missing.
            If ``'warn'``, a warning is raised if at least one column
            is missing and the missing columns are ignored.
            If ``'ignore'``, the missing columns are ignored and
            no warning message appears.
        figure_config: The figure configuration.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import NullValueAnalyzer
    >>> analyzer = NullValueAnalyzer()
    >>> analyzer
    NullValueAnalyzer(columns=None, exclude_columns=(), missing_policy='raise', figure_config=None)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, None],
    ...         "col2": [0, 1, None, None, 0, 1, 0],
    ...         "col3": [None, 0, 0, 0, None, 1, None],
    ...     }
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    NullValueOutput(
      (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(
        self,
        columns: Sequence[str] | None = None,
        exclude_columns: Sequence[str] = (),
        missing_policy: str = "raise",
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        super().__init__(
            columns=columns,
            exclude_columns=exclude_columns,
            missing_policy=missing_policy,
        )
        self._figure_config = figure_config

    def get_args(self) -> dict:
        # Extend the arguments of the base class with the figure config.
        return super().get_args() | {
            "figure_config": self._figure_config,
        }

    def _analyze(self, frame: pl.DataFrame) -> NullValueOutput:
        # The previous message ("Plotting the content of ...") was copied
        # from another analyzer and did not describe this analysis.
        logger.info(
            f"Analyzing the number of null values of {len(self.find_columns(frame)):,} columns..."
        )
        columns = self.find_common_columns(frame)
        dataframe = frame.select(columns)
        logger.info(str_shape_diff(orig=frame.shape, final=dataframe.shape))
        return NullValueOutput(
            state=NullValueState.from_dataframe(
                dataframe=dataframe, figure_config=self._figure_config
            )
        )
| r"""Implement an analyzer that analyzes the temporal distribution of null values.""" | ||
| from __future__ import annotations | ||
| __all__ = ["TemporalNullValueAnalyzer"] | ||
| import logging | ||
| from typing import TYPE_CHECKING | ||
| from grizz.utils.format import str_shape_diff | ||
| from arkas.analyzer.lazy import BaseInNLazyAnalyzer | ||
| from arkas.output.temporal_null_value import TemporalNullValueOutput | ||
| from arkas.state.temporal_dataframe import TemporalDataFrameState | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
| import polars as pl | ||
| from arkas.figure import BaseFigureConfig | ||
| logger = logging.getLogger(__name__) | ||
class TemporalNullValueAnalyzer(BaseInNLazyAnalyzer):
    r"""Analyze how the null values of a DataFrame are distributed over
    time.

    The analyzer selects the requested columns that are present in the
    input DataFrame, makes sure the temporal column is part of the
    selection, and stores the result in a ``TemporalDataFrameState``.

    Args:
        temporal_column: The temporal column in the DataFrame.
        period: The temporal period e.g. monthly or daily.
        columns: The columns to analyze. If ``None``, it analyzes all
            the columns.
        exclude_columns: The columns to exclude from the input
            ``columns``. If any column is not found, it will be ignored
            during the filtering process.
        missing_policy: The policy on how to handle missing columns.
            The following options are available: ``'ignore'``,
            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
            is raised if at least one column is missing.
            If ``'warn'``, a warning is raised if at least one column
            is missing and the missing columns are ignored.
            If ``'ignore'``, the missing columns are ignored and
            no warning message appears.
        figure_config: The figure configuration.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.analyzer import TemporalNullValueAnalyzer
    >>> analyzer = TemporalNullValueAnalyzer(temporal_column="datetime", period="1d")
    >>> analyzer
    TemporalNullValueAnalyzer(columns=None, exclude_columns=(), missing_policy='raise', temporal_column='datetime', period='1d', figure_config=None)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0],
    ...         "col2": [0, 1, 0, 1],
    ...         "col3": [1, 0, 0, 0],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Int64,
    ...         "col2": pl.Int64,
    ...         "col3": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    TemporalNullValueOutput(
      (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(
        self,
        temporal_column: str,
        period: str,
        columns: Sequence[str] | None = None,
        exclude_columns: Sequence[str] = (),
        missing_policy: str = "raise",
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        super().__init__(
            columns=columns,
            exclude_columns=exclude_columns,
            missing_policy=missing_policy,
        )
        self._temporal_column = temporal_column
        self._period = period
        self._figure_config = figure_config

    def get_args(self) -> dict:
        # Arguments specific to this analyzer, appended after the
        # arguments reported by the base class.
        extra = {
            "temporal_column": self._temporal_column,
            "period": self._period,
            "figure_config": self._figure_config,
        }
        return super().get_args() | extra

    def _analyze(self, frame: pl.DataFrame) -> TemporalNullValueOutput:
        logger.info(
            f"Plotting the number of null values of {len(self.find_columns(frame)):,} columns "
            f"using the temporal column {self._temporal_column!r} and period {self._period!r}..."
        )
        selected = [*self.find_common_columns(frame)]
        # The temporal column must be in the selection even when it is
        # not one of the analyzed columns.
        if self._temporal_column not in selected:
            selected.append(self._temporal_column)
        dataframe = frame.select(selected)
        logger.info(str_shape_diff(orig=frame.shape, final=dataframe.shape))
        state = TemporalDataFrameState(
            dataframe=dataframe,
            temporal_column=self._temporal_column,
            period=self._period,
            figure_config=self._figure_config,
        )
        return TemporalNullValueOutput(state=state)
| r"""Contain the implementation of a HTML content generator that analyzes | ||
| a Series with continuous values.""" | ||
| from __future__ import annotations | ||
| __all__ = ["ContinuousSeriesContentGenerator", "create_template"] | ||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from jinja2 import Template | ||
| from arkas.content.section import BaseSectionContentGenerator | ||
| from arkas.figure.utils import figure2html | ||
| from arkas.plotter.continuous_series import ContinuousSeriesPlotter | ||
| from arkas.utils.range import find_range | ||
| from arkas.utils.stats import compute_statistics_continuous | ||
| if TYPE_CHECKING: | ||
| from arkas.state.series import SeriesState | ||
| logger = logging.getLogger(__name__) | ||
class ContinuousSeriesContentGenerator(BaseSectionContentGenerator):
    r"""Implement a content generator that analyzes a Series with
    continuous values.

    Args:
        state: The state containing the Series to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import ContinuousSeriesContentGenerator
    >>> from arkas.state import SeriesState
    >>> content = ContinuousSeriesContentGenerator(
    ...     SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7]))
    ... )
    >>> content
    ContinuousSeriesContentGenerator(
      (state): SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: SeriesState) -> None:
        self._state = state

    def __repr__(self) -> str:
        args = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {args}\n)"

    def __str__(self) -> str:
        args = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {args}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two generators are equal when they have the same class and
        # their states are equal.
        if not isinstance(other, self.__class__):
            return False
        return self._state.equal(other._state, equal_nan=equal_nan)

    def generate_content(self) -> str:
        r"""Generate the HTML content of the section.

        The content is rendered from ``create_template()`` with the
        histogram figure, the statistics table and a few summary
        values computed from the Series.

        Returns:
            The rendered HTML content.
        """
        logger.info(f"Generating the continuous distribution of {self._state.series.name}...")
        figures = ContinuousSeriesPlotter(state=self._state).plot()
        stats = compute_statistics_continuous(self._state.series)
        # Avoid a division by zero when the Series has no value.
        null_values_pct = (
            f"{100 * stats['num_nulls'] / stats['count']:.2f}" if stats["count"] > 0 else "N/A"
        )
        # The upper bound must come from the "xmax" figure argument.
        # It previously re-read "xmin", so a configured upper bound
        # was never used for the range reported in the text.
        xmin, xmax = find_range(
            self._state.series.drop_nulls().to_numpy(),
            xmin=self._state.figure_config.get_arg("xmin"),
            xmax=self._state.figure_config.get_arg("xmax"),
        )
        return Template(create_template()).render(
            {
                "column": self._state.series.name,
                "figure": figure2html(figures["continuous_histogram"], close_fig=True),
                "table": create_table(stats),
                "total_values": f"{stats['count']:,}",
                "unique_values": f"{stats['nunique']:,}",
                "null_values": f"{stats['num_nulls']:,}",
                "null_values_pct": null_values_pct,
                "min_value": f"{stats['min']:,}",
                "max_value": f"{stats['max']:,}",
                "xmin": f"{xmin:,}",
                "xmax": f"{xmax:,}",
                "dtype": str(self._state.series.dtype),
            }
        )
def create_template() -> str:
    r"""Return the template of the content.

    The template uses the following placeholders: ``column``,
    ``total_values``, ``unique_values``, ``null_values``,
    ``null_values_pct``, ``min_value``, ``max_value``, ``dtype``,
    ``xmin``, ``xmax``, ``figure`` and ``table``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.continuous_series import create_template
    >>> template = create_template()

    ```
    """
    # The <em> around the column name in the statistics paragraph is now
    # closed; it was previously followed by a second opening tag.
    return """<p>This section analyzes the distribution of continuous values for column <em>{{column}}</em>.</p>
<ul>
<li> <b>total values:</b> {{total_values}} </li>
<li> <b>number of unique values:</b> {{unique_values}} </li>
<li> <b>number of null values:</b> {{null_values}} / {{total_values}} ({{null_values_pct}}%) </li>
<li> <b>range of values:</b> [{{min_value}}, {{max_value}}] </li>
<li> <b>data type:</b> <em>{{dtype}}</em> </li>
</ul>
<p>The histogram shows the distribution of values in the range [{{xmin}}, {{xmax}}].</p>
{{figure}}
<details>
<summary>[show statistics]</summary>
<p style="margin-top: 1rem;">
The following table shows some statistics about the distribution for column <em>{{column}}</em>.
</p>
{{table}}
</details>
"""
def create_table(stats: dict) -> str:
    r"""Create the HTML code of the table with statistics.

    Args:
        stats: Specifies a dictionary with the statistics. The
            following keys are read: ``count``, ``mean``, ``std``,
            ``skewness``, ``kurtosis``, ``min``, ``q001``, ``q01``,
            ``q05``, ``q10``, ``q25``, ``median``, ``q75``, ``q90``,
            ``q95``, ``q99``, ``q999``, ``max``, ``>0``, ``<0`` and
            ``=0``.

    Returns:
        The HTML code of the table.

    Example usage:

    ```pycon

    >>> from arkas.content.continuous_series import create_table
    >>> table = create_table(
    ...     stats={
    ...         "count": 101,
    ...         "nunique": 101,
    ...         "num_non_nulls": 101,
    ...         "num_nulls": 0,
    ...         "mean": 50.0,
    ...         "std": 29.15,
    ...         "skewness": 0.0,
    ...         "kurtosis": -1.20,
    ...         "min": 0.0,
    ...         "q001": 0.1,
    ...         "q01": 1.0,
    ...         "q05": 5.0,
    ...         "q10": 10.0,
    ...         "q25": 25.0,
    ...         "median": 50.0,
    ...         "q75": 75.0,
    ...         "q90": 90.0,
    ...         "q95": 95.0,
    ...         "q99": 99.0,
    ...         "q999": 99.9,
    ...         "max": 100.0,
    ...         ">0": 100,
    ...         "<0": 0,
    ...         "=0": 1,
    ...     },
    ... )

    ```
    """
    # The 0.1% and 99.9% rows use the ``q001`` and ``q999`` placeholders.
    # They previously repeated the 1% and 99% quantiles.
    return Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr><th>stat</th><th>value</th></tr>
</thead>
<tbody class="tbody table-group-divider">
<tr><th>count</th><td {{num_style}}>{{count}}</td></tr>
<tr><th>mean</th><td {{num_style}}>{{mean}}</td></tr>
<tr><th>std</th><td {{num_style}}>{{std}}</td></tr>
<tr><th>skewness</th><td {{num_style}}>{{skewness}}</td></tr>
<tr><th>kurtosis</th><td {{num_style}}>{{kurtosis}}</td></tr>
<tr><th>min</th><td {{num_style}}>{{min}}</td></tr>
<tr><th>quantile 0.1%</th><td {{num_style}}>{{q001}}</td></tr>
<tr><th>quantile 1%</th><td {{num_style}}>{{q01}}</td></tr>
<tr><th>quantile 5%</th><td {{num_style}}>{{q05}}</td></tr>
<tr><th>quantile 10%</th><td {{num_style}}>{{q10}}</td></tr>
<tr><th>quantile 25%</th><td {{num_style}}>{{q25}}</td></tr>
<tr><th>median</th><td {{num_style}}>{{median}}</td></tr>
<tr><th>quantile 75%</th><td {{num_style}}>{{q75}}</td></tr>
<tr><th>quantile 90%</th><td {{num_style}}>{{q90}}</td></tr>
<tr><th>quantile 95%</th><td {{num_style}}>{{q95}}</td></tr>
<tr><th>quantile 99%</th><td {{num_style}}>{{q99}}</td></tr>
<tr><th>quantile 99.9%</th><td {{num_style}}>{{q999}}</td></tr>
<tr><th>max</th><td {{num_style}}>{{max}}</td></tr>
<tr><th>number of zeros</th><td {{num_style}}>{{num_zeros}}</td></tr>
<tr><th>number of positive values</th><td {{num_style}}>{{num_pos}}</td></tr>
<tr><th>number of negative values</th><td {{num_style}}>{{num_neg}}</td></tr>
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    ).render(
        {
            "num_style": 'style="text-align: right;"',
            "count": f"{stats['count']:,}",
            "mean": f"{stats['mean']:,.4f}",
            "std": f"{stats['std']:,.4f}",
            "skewness": f"{stats['skewness']:,.4f}",
            "kurtosis": f"{stats['kurtosis']:,.4f}",
            "min": f"{stats['min']:,.4f}",
            "q001": f"{stats['q001']:,.4f}",
            "q01": f"{stats['q01']:,.4f}",
            "q05": f"{stats['q05']:,.4f}",
            "q10": f"{stats['q10']:,.4f}",
            "q25": f"{stats['q25']:,.4f}",
            "median": f"{stats['median']:,.4f}",
            "q75": f"{stats['q75']:,.4f}",
            "q90": f"{stats['q90']:,.4f}",
            "q95": f"{stats['q95']:,.4f}",
            "q99": f"{stats['q99']:,.4f}",
            "q999": f"{stats['q999']:,.4f}",
            "max": f"{stats['max']:,.4f}",
            "num_pos": f"{stats['>0']:,}",
            "num_neg": f"{stats['<0']:,}",
            "num_zeros": f"{stats['=0']:,}",
        }
    )
| r"""Contain the implementation of a HTML content generator that analyzes | ||
| the number of null values per column.""" | ||
| from __future__ import annotations | ||
| __all__ = ["NullValueContentGenerator", "create_template"] | ||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from jinja2 import Template | ||
| from arkas.content.section import BaseSectionContentGenerator | ||
| from arkas.figure.utils import figure2html | ||
| from arkas.plotter.null_value import NullValuePlotter | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
| from arkas.state.null_value import NullValueState | ||
| logger = logging.getLogger(__name__) | ||
class NullValueContentGenerator(BaseSectionContentGenerator):
    r"""Generate the HTML section reporting the number of null values
    per column.

    Args:
        state: The state containing the number of null values per
            column.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.content import NullValueContentGenerator
    >>> from arkas.state import NullValueState
    >>> content = NullValueContentGenerator(
    ...     NullValueState(
    ...         null_count=np.array([0, 1, 2]),
    ...         total_count=np.array([5, 5, 5]),
    ...         columns=["col1", "col2", "col3"],
    ...     )
    ... )
    >>> content
    NullValueContentGenerator(
      (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: NullValueState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Objects of another class are never equal; otherwise the
        # comparison is delegated to the states.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def generate_content(self) -> str:
        state = self._state
        ncols = len(state.columns)
        logger.info(f"Generating the null values bar plot for {ncols:,} columns...")
        figures = NullValuePlotter(state=state).plot()
        frame = state.to_dataframe()
        # The same table is rendered twice: once ordered by column
        # name and once ordered by number of null values.
        context = {
            "ncols": f"{ncols:,}",
            "columns": ", ".join(state.columns),
            "figure": figure2html(figures["null_values"], close_fig=True),
            "table_alpha": create_table(frame.sort(by="column")),
            "table_sort": create_table(frame.sort(by="null")),
        }
        return Template(create_template()).render(context)
def create_template() -> str:
    r"""Return the template of the content.

    The template uses the following placeholders: ``ncols``,
    ``columns``, ``figure``, ``table_alpha`` and ``table_sort``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.null_value import create_template
    >>> template = create_template()

    ```
    """
    # The number of columns is exposed as ``ncols`` everywhere in the
    # template. The statistics paragraph previously used a
    # ``num_columns`` placeholder that the renderer never provides, so
    # the number was rendered as an empty string.
    return """This section analyzes the number and proportion of null values for the {{ncols}}
columns: <em>{{columns}}</em>.
<p>The columns are sorted by ascending order of number of null values in the following bar plot.</p>
{{figure}}
<details>
<summary>[show statistics per column]</summary>
<p style="margin-top: 1rem;">
The following tables show the number and proportion of null values for the {{ncols}}
columns.
The background color of the row indicates the proportion of missing values:
dark blues indicates more missing values than light blues. </p>
<ul>
<li> <b>column</b>: is the column name </li>
<li> <b>null pct</b>: is the percentage of null values in the column </li>
<li> <b>null count</b>: is the number of null values in the column </li>
<li> <b>total count</b>: is the total number of values in the column </li>
</ul>
<div class="container-fluid">
<div class="row align-items-start">
<div class="col align-self-center">
<p><b>Columns sorted by alphabetical order</b></p>
{{table_alpha}}
</div>
<div class="col">
<p><b>Columns sorted by ascending order of missing values</b></p>
{{table_sort}}
</div>
</div>
</div>
</details>
"""
def create_table(frame: pl.DataFrame) -> str:
    r"""Return the HTML code of a table with the number of null values
    per column.

    One table row is created for each row of the input DataFrame, in
    the order of the DataFrame.

    Args:
        frame: The DataFrame to analyze. The columns ``'column'``,
            ``'null'`` and ``'total'`` are read.

    Returns:
        The HTML code of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.null_value import create_table
    >>> frame = pl.DataFrame(
    ...     {"column": ["A", "B", "C"], "null": [0, 1, 2], "total": [4, 4, 4]},
    ...     schema={"column": pl.String, "null": pl.Int64, "total": pl.Int64},
    ... )
    >>> table = create_table(frame)

    ```
    """
    html_rows = []
    for name, num_nulls, num_total in zip(frame["column"], frame["null"], frame["total"]):
        html_rows.append(
            create_table_row(column=name, null_count=num_nulls, total_count=num_total)
        )
    template = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>null pct</th>
<th>null count</th>
<th>total count</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    return template.render({"rows": "\n".join(html_rows)})
def create_table_row(column: str, null_count: int, total_count: int) -> str:
    r"""Create the HTML code of a new table row.

    The background color of the cells uses the fraction of null
    values as opacity. When ``total_count`` is not positive, the
    fraction is shown as ``nan`` and the opacity is set to 0.

    Args:
        column: The column name.
        null_count: The number of null values.
        total_count: The total number of rows.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> from arkas.content.null_value import create_table_row
    >>> row = create_table_row(column="col", null_count=5, total_count=101)

    ```
    """
    pct = null_count / total_count if total_count > 0 else float("nan")
    # ``nan`` is not a valid CSS alpha value, so fall back to a fully
    # transparent background when the fraction is undefined.
    pct_color = pct if total_count > 0 else 0
    return Template(
        "<tr>"
        '<th style="background-color: rgba(0, 191, 255, {{header_alpha}})">{{column}}</th>'
        "<td {{num_style}}>{{null_pct}}</td>"
        "<td {{num_style}}>{{null_count}}</td>"
        "<td {{num_style}}>{{total_count}}</td>"
        "</tr>"
    ).render(
        {
            "num_style": (
                f'style="text-align: right; background-color: rgba(0, 191, 255, {pct_color})"'
            ),
            "column": column,
            "null_count": f"{null_count:,}",
            "null_pct": f"{pct:.4f}",
            # The header cell previously reused ``null_pct`` and produced
            # ``rgba(0, 191, 255, nan)`` for an empty column. The format
            # matches ``null_pct`` so the output is unchanged when the
            # fraction is defined.
            "header_alpha": f"{pct_color:.4f}",
            "total_count": f"{total_count:,}",
        }
    )
| r"""Contain the implementation of a HTML content generator that analyzes | ||
| the temporal distribution of null values.""" | ||
| from __future__ import annotations | ||
| __all__ = ["TemporalNullValueContentGenerator", "create_template"] | ||
| import logging | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from grizz.utils.null import compute_temporal_null_count | ||
| from jinja2 import Template | ||
| from arkas.content.section import BaseSectionContentGenerator | ||
| from arkas.figure.utils import figure2html | ||
| from arkas.plotter.temporal_null_value import TemporalNullValuePlotter | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
| from arkas.state.temporal_dataframe import TemporalDataFrameState | ||
| logger = logging.getLogger(__name__) | ||
class TemporalNullValueContentGenerator(BaseSectionContentGenerator):
    r"""Generate the HTML section reporting the temporal distribution
    of null values.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.content import TemporalNullValueContentGenerator
    >>> from arkas.state import TemporalDataFrameState
    >>> dataframe = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0],
    ...         "col2": [0, 1, 0, 1],
    ...         "col3": [1, 0, 0, 0],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Int64,
    ...         "col2": pl.Int64,
    ...         "col3": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> content = TemporalNullValueContentGenerator(
    ...     TemporalDataFrameState(dataframe, temporal_column="datetime")
    ... )
    >>> content
    TemporalNullValueContentGenerator(
      (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: TemporalDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Objects of another class are never equal; otherwise the
        # comparison is delegated to the states.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def generate_content(self) -> str:
        state = self._state
        nrows, ncols = state.dataframe.shape
        logger.info(
            f"Generating the temporal plot of {ncols} columns using the "
            f"temporal column {state.temporal_column!r}..."
        )
        figures = TemporalNullValuePlotter(state=state).plot()
        context = {
            "nrows": f"{nrows:,}",
            "ncols": f"{ncols:,}",
            "columns": ", ".join(state.dataframe.columns),
            "temporal_column": state.temporal_column,
            "figure": figure2html(figures["temporal_null_value"], close_fig=True),
            "table": create_table(
                frame=state.dataframe,
                temporal_column=state.temporal_column,
                period=state.period,
            ),
        }
        return Template(create_template()).render(context)
def create_template() -> str:
    r"""Return the template of the content.

    The template uses the following placeholders:
    ``temporal_column``, ``figure`` and ``table``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.temporal_null_value import create_template
    >>> template = create_template()

    ```
    """
    lines = (
        "<p>This section analyzes the temporal distribution of null values in all columns.",
        "The column <em>{{temporal_column}}</em> is used as the temporal column.</p>",
        "{{figure}}",
        "<details>",
        "<summary>[show statistics per temporal period]</summary>",
        '<p style="margin-top: 1rem;">The following table shows some statistics for each period.</p>',
        "{{table}}",
        "</details>",
    )
    # Each line, including the last one, ends with a newline.
    return "".join(f"{line}\n" for line in lines)
def create_table(frame: pl.DataFrame, temporal_column: str, period: str) -> str:
    r"""Create a HTML representation of a table with the temporal
    distribution of null values.

    An empty string is returned when the DataFrame is empty.
    Otherwise, the null counts are computed with
    ``compute_temporal_null_count`` over all the columns except the
    temporal column, and one table row is created per returned label.

    Args:
        frame: The DataFrame to analyze.
        temporal_column: The temporal column used to analyze the
            temporal distribution.
        period: The temporal period e.g. monthly or daily.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.content.temporal_null_value import create_table
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [None, 1.0, 0.0, 1.0],
    ...         "col2": [None, 1, 0, None],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Float64,
    ...         "col2": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> table = create_table(frame=frame, temporal_column="datetime", period="1mo")

    ```
    """
    if frame.is_empty():
        return ""
    # Analyze every column except the temporal one.
    value_columns = list(frame.columns)
    value_columns.remove(temporal_column)
    nulls, totals, labels = compute_temporal_null_count(
        frame=frame, columns=value_columns, temporal_column=temporal_column, period=period
    )
    html_rows = "\n".join(
        create_table_row(label=label, num_nulls=null, total=total)
        for label, null, total in zip(labels, nulls, totals)
    )
    template = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>period</th>
<th>number of null values</th>
<th>number of non-null values</th>
<th>total number of values</th>
<th>percentage of null values</th>
<th>percentage of non-null values</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    return template.render({"rows": html_rows, "period": period})
def create_table_row(label: str, num_nulls: int, total: int) -> str:
    r"""Create the HTML code of a new table row.

    Args:
        label: The label of the row.
        num_nulls: The number of null values.
        total: The total number of values.

    Returns:
        The HTML code of a row. The two percentages are rendered as
        ``nan%`` when ``total`` is ``0`` because they are undefined.

    Example usage:

    ```pycon

    >>> from arkas.content.temporal_null_value import create_table_row
    >>> row = create_table_row(label="col", num_nulls=5, total=42)

    ```
    """
    num_non_nulls = total - num_nulls
    if total == 0:
        # The percentages are undefined without any value: show NaN
        # instead of failing with a division by zero.
        num_nulls_pct = num_non_nulls_pct = float("nan")
    else:
        num_nulls_pct = 100 * num_nulls / total
        num_non_nulls_pct = 100 * num_non_nulls / total
    return Template(
        """<tr>
<th>{{label}}</th>
<td {{num_style}}>{{num_nulls}}</td>
<td {{num_style}}>{{num_non_nulls}}</td>
<td {{num_style}}>{{total}}</td>
<td {{num_style}}>{{num_nulls_pct}}</td>
<td {{num_style}}>{{num_non_nulls_pct}}</td>
</tr>"""
    ).render(
        {
            "num_style": 'style="text-align: right;"',
            "label": label,
            "num_nulls": f"{num_nulls:,}",
            "num_non_nulls": f"{num_non_nulls:,}",
            "total": f"{total:,}",
            "num_nulls_pct": f"{num_nulls_pct:.2f}%",
            "num_non_nulls_pct": f"{num_non_nulls_pct:.2f}%",
        }
    )
| r"""Implement an output to analyze a series with continuous values.""" | ||
| from __future__ import annotations | ||
| __all__ = ["ContinuousSeriesOutput"] | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.content.continuous_series import ContinuousSeriesContentGenerator | ||
| from arkas.evaluator2.vanilla import Evaluator | ||
| from arkas.output.lazy import BaseLazyOutput | ||
| from arkas.plotter.continuous_series import ContinuousSeriesPlotter | ||
| if TYPE_CHECKING: | ||
| from arkas.state.series import SeriesState | ||
class ContinuousSeriesOutput(BaseLazyOutput):
    r"""Output used to analyze a series with continuous values.

    The content generator and the plotter are both built from the
    same state, and the evaluator is an empty ``Evaluator``.

    Args:
        state: The state containing the Series to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import ContinuousSeriesOutput
    >>> from arkas.state import SeriesState
    >>> output = ContinuousSeriesOutput(SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7])))
    >>> output
    ContinuousSeriesOutput(
     (state): SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    ContinuousSeriesContentGenerator(
     (state): SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    ContinuousSeriesPlotter(
     (state): SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: SeriesState) -> None:
        self._state = state

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        name = self.__class__.__qualname__
        body = str_indent(str_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two outputs are equal only if they have the same type and
        # their states are equal.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def _get_content_generator(self) -> ContinuousSeriesContentGenerator:
        return ContinuousSeriesContentGenerator(self._state)

    def _get_evaluator(self) -> Evaluator:
        return Evaluator()

    def _get_plotter(self) -> ContinuousSeriesPlotter:
        return ContinuousSeriesPlotter(self._state)
| r"""Implement an output to analyze the number of null values per | ||
| column.""" | ||
| from __future__ import annotations | ||
| __all__ = ["NullValueOutput"] | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.content.null_value import NullValueContentGenerator | ||
| from arkas.evaluator2.vanilla import Evaluator | ||
| from arkas.output.lazy import BaseLazyOutput | ||
| from arkas.plotter.null_value import NullValuePlotter | ||
| if TYPE_CHECKING: | ||
| from arkas.state.null_value import NullValueState | ||
class NullValueOutput(BaseLazyOutput):
    r"""Output used to analyze the number of null values per column.

    The content generator and the plotter are both built from the
    same state, and the evaluator is an empty ``Evaluator``.

    Args:
        state: The state containing the number of null values per
            column.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.output import NullValueOutput
    >>> from arkas.state import NullValueState
    >>> output = NullValueOutput(
    ...     NullValueState(
    ...         null_count=np.array([0, 1, 2]),
    ...         total_count=np.array([5, 5, 5]),
    ...         columns=["col1", "col2", "col3"],
    ...     )
    ... )
    >>> output
    NullValueOutput(
     (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    NullValueContentGenerator(
     (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    NullValuePlotter(
     (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: NullValueState) -> None:
        self._state = state

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        name = self.__class__.__qualname__
        body = str_indent(str_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two outputs are equal only if they have the same type and
        # their states are equal.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def _get_content_generator(self) -> NullValueContentGenerator:
        return NullValueContentGenerator(self._state)

    def _get_evaluator(self) -> Evaluator:
        return Evaluator()

    def _get_plotter(self) -> NullValuePlotter:
        return NullValuePlotter(self._state)
| r"""Implement an output to analyze the number of null values in a | ||
| DataFrame.""" | ||
| from __future__ import annotations | ||
| __all__ = ["TemporalNullValueOutput"] | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.content.temporal_null_value import TemporalNullValueContentGenerator | ||
| from arkas.evaluator2.vanilla import Evaluator | ||
| from arkas.output.lazy import BaseLazyOutput | ||
| from arkas.plotter.temporal_null_value import TemporalNullValuePlotter | ||
| if TYPE_CHECKING: | ||
| from arkas.state.temporal_dataframe import TemporalDataFrameState | ||
class TemporalNullValueOutput(BaseLazyOutput):
    r"""Output used to analyze the number of null values in a
    DataFrame with a temporal column.

    The content generator and the plotter are both built from the
    same state, and the evaluator is an empty ``Evaluator``.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.output import TemporalNullValueOutput
    >>> from arkas.state import TemporalDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0],
    ...         "col2": [0, 1, 0, 1],
    ...         "col3": [1, 0, 0, 0],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Int64,
    ...         "col2": pl.Int64,
    ...         "col3": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> output = TemporalNullValueOutput(
    ...     TemporalDataFrameState(frame, temporal_column="datetime")
    ... )
    >>> output
    TemporalNullValueOutput(
     (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    TemporalNullValueContentGenerator(
     (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    TemporalNullValuePlotter(
     (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: TemporalDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        name = self.__class__.__qualname__
        body = str_indent(str_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two outputs are equal only if they have the same type and
        # their states are equal.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def _get_content_generator(self) -> TemporalNullValueContentGenerator:
        return TemporalNullValueContentGenerator(self._state)

    def _get_evaluator(self) -> Evaluator:
        return Evaluator()

    def _get_plotter(self) -> TemporalNullValuePlotter:
        return TemporalNullValuePlotter(self._state)
| r"""Contain CDF plotting functions.""" | ||
| from __future__ import annotations | ||
| __all__ = ["plot_cdf"] | ||
| from typing import TYPE_CHECKING | ||
| import numpy as np | ||
| from arkas.utils.array import nonnan | ||
| if TYPE_CHECKING: | ||
| from matplotlib.axes import Axes | ||
def plot_cdf(
    ax: Axes,
    array: np.ndarray,
    nbins: int | None = None,
    xmin: float = float("-inf"),
    xmax: float = float("inf"),
    color: str = "tab:blue",
    labelcolor: str = "black",
) -> None:
    r"""Plot the cumulative distribution function (CDF).

    The array is flattened and filtered with ``nonnan``. Nothing is
    plotted if no value remains. Only the values in ``[xmin, xmax]``
    are binned, but the values outside this range are still counted
    so that the curve is normalized by the total number of values.

    Args:
        ax: The axes of the matplotlib figure to update.
        array: The array with the data.
        nbins: The number of bins to use to plot the CDF. If
            ``None`` (or ``0``), ``min(1000, number of values)``
            is used.
        xmin: The minimum value of the range to plot.
        xmax: The maximum value of the range to plot.
        color: The plot color.
        labelcolor: The label color.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import plot_cdf
    >>> fig, ax = plt.subplots()
    >>> plot_cdf(ax, array=np.arange(101))

    ```
    """
    values = nonnan(array.ravel())
    if values.size == 0:
        return
    nbins = nbins or min(1000, values.size)
    # The out-of-range values are not binned but still contribute to
    # the cumulative count and to the normalization.
    num_below = values[values < xmin].size
    num_above = values[values > xmax].size
    inside = values[np.logical_and(values >= xmin, values <= xmax)]
    counts, edges = np.histogram(inside, bins=nbins)
    num_total = np.sum(counts) + num_below + num_above
    cdf = (np.cumsum(counts) + num_below) / num_total
    # Each CDF value is drawn at the center of its bin.
    centers = (edges[:-1] + edges[1:]) * 0.5
    ax.tick_params(axis="y", labelcolor=labelcolor)
    ax.plot(centers, cdf, color=color, label="CDF")
    ax.set_ylim(0.0, 1.0)
    ax.set_ylabel("cumulative distribution function (CDF)", color=labelcolor)
| r"""Contain plotting functions to analyze continuous values.""" | ||
| from __future__ import annotations | ||
| __all__ = [ | ||
| "boxplot_continuous", | ||
| "boxplot_continuous_temporal", | ||
| "hist_continuous", | ||
| "hist_continuous2", | ||
| ] | ||
| from typing import TYPE_CHECKING | ||
| import numpy as np | ||
| from arkas.plot.cdf import plot_cdf | ||
| from arkas.plot.utils import ( | ||
| auto_yscale_continuous, | ||
| axvline_quantile, | ||
| readable_xticklabels, | ||
| ) | ||
| from arkas.utils.array import nonnan | ||
| from arkas.utils.range import find_range | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
| from matplotlib.axes import Axes | ||
def boxplot_continuous(
    ax: Axes,
    array: np.ndarray,
    xmin: float | str | None = None,
    xmax: float | str | None = None,
) -> None:
    r"""Plot the box plot of an array containing continuous values.

    The array is flattened and filtered with ``nonnan`` before
    plotting. Nothing is plotted if no value remains.

    Args:
        ax: The axes of the matplotlib figure to update.
        array: The array with the data.
        xmin: The minimum value of the range or its
            associated quantile. ``q0.1`` means the 10% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        xmax: The maximum value of the range or its
            associated quantile. ``q0.9`` means the 90% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import boxplot_continuous
    >>> fig, ax = plt.subplots()
    >>> boxplot_continuous(ax, array=np.arange(101))

    ```
    """
    # matplotlib does not ignore NaN when it computes the box
    # statistics, so the values are filtered first, in the same way as
    # ``boxplot_continuous_temporal``.
    array = nonnan(array.ravel())
    if array.size == 0:
        return
    xmin, xmax = find_range(array, xmin=xmin, xmax=xmax)
    ax.boxplot(
        array,
        notch=True,
        vert=False,
        widths=0.7,
        patch_artist=True,
        boxprops={"facecolor": "lightblue"},
    )
    readable_xticklabels(ax, max_num_xticks=100)
    # The limits are only set when the range is not degenerate.
    if xmin < xmax:
        ax.set_xlim(xmin, xmax)
    ax.set_ylabel(" ")
def boxplot_continuous_temporal(
    ax: Axes,
    data: Sequence[np.ndarray],
    steps: Sequence,
    ymin: float | str | None = None,
    ymax: float | str | None = None,
    yscale: str = "linear",
) -> None:
    r"""Plot one box plot per time step for continuous values.

    Nothing is plotted if ``data`` is empty.

    Args:
        ax: The axes of the matplotlib figure to update.
        data: The sequence of data where each item is a 1-d array with
            the values of the time step.
        steps: The sequence time step names.
        ymin: The minimum value of the range or its
            associated quantile. ``q0.1`` means the 10% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        ymax: The maximum value of the range or its
            associated quantile. ``q0.9`` means the 90% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        yscale: The y-axis scale. If ``'auto'``, the
            ``'linear'`` or ``'log'/'symlog'`` scale is chosen based
            on the distribution.

    Raises:
        RuntimeError: if ``data`` and ``steps`` have different lengths

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import boxplot_continuous_temporal
    >>> fig, ax = plt.subplots()
    >>> rng = np.random.default_rng()
    >>> data = [rng.standard_normal(1000) for _ in range(10)]
    >>> boxplot_continuous_temporal(ax, data=data, steps=list(range(len(data))))

    ```
    """
    if len(data) == 0:
        return
    if len(data) != len(steps):
        msg = f"data and steps have different lengths: {len(data):,} vs {len(steps):,}"
        raise RuntimeError(msg)
    data = [nonnan(x) for x in data]
    ax.boxplot(
        data,
        notch=True,
        vert=True,
        widths=0.7,
        patch_artist=True,
        boxprops={"facecolor": "lightblue"},
    )
    # The range and the scale are computed on all the steps together.
    array = np.concatenate(data)
    ymin, ymax = find_range(array, xmin=ymin, xmax=ymax)
    if ymin < ymax:
        ax.set_ylim(ymin, ymax)
    # ``boxplot`` draws the boxes at the positions 1..N by default, so
    # the tick of each step has to be set at the same position to be
    # aligned with its box.
    ax.set_xticks(np.arange(1, len(steps) + 1), labels=steps)
    if yscale == "auto":
        yscale = auto_yscale_continuous(array=array, nbins=100)
    ax.set_yscale(yscale)
    readable_xticklabels(ax)
def hist_continuous(
    ax: Axes,
    array: np.ndarray,
    nbins: int | None = None,
    density: bool = False,
    yscale: str = "linear",
    xmin: float | str | None = None,
    xmax: float | str | None = None,
    cdf: bool = True,
    quantile: bool = True,
) -> None:
    r"""Draw the histogram of an array containing continuous values.

    The array is flattened first, and nothing is plotted if it is
    empty. The CDF is drawn on a twin y-axis.

    Args:
        ax: The axes of the matplotlib figure to update.
        array: The array with the data.
        nbins: The number of bins to use to plot.
        density: If True, draw and return a probability density:
            each bin will display the bin's raw count divided by the
            total number of counts and the bin width, so that the area
            under the histogram integrates to 1.
        yscale: The y-axis scale. If ``'auto'``, the
            ``'linear'`` or ``'log'/'symlog'`` scale is chosen based
            on the distribution.
        xmin: The minimum value of the range or its
            associated quantile. ``q0.1`` means the 10% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        xmax: The maximum value of the range or its
            associated quantile. ``q0.9`` means the 90% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        cdf: If ``True``, the CDF is added to the plot.
        quantile: If ``True``, the 5% and 95% quantiles are added to
            the plot when they are strictly inside the plotted range.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import hist_continuous
    >>> fig, ax = plt.subplots()
    >>> hist_continuous(ax, array=np.arange(101))

    ```
    """
    flat = array.ravel()
    if flat.size == 0:
        return
    xmin, xmax = find_range(flat, xmin=xmin, xmax=xmax)
    ax.hist(flat, bins=nbins, range=(xmin, xmax), color="tab:blue", alpha=0.9, density=density)
    readable_xticklabels(ax, max_num_xticks=100)
    # The limits are only set when the range is not degenerate.
    if xmin < xmax:
        ax.set_xlim(xmin, xmax)
    if density:
        ax.set_ylabel("density (number of occurrences/total)")
    else:
        ax.set_ylabel("number of occurrences")
    if yscale == "auto":
        yscale = auto_yscale_continuous(array=flat, nbins=nbins)
    ax.set_yscale(yscale)
    if cdf:
        plot_cdf(
            ax=ax.twinx(),
            array=flat,
            nbins=nbins,
            xmin=xmin,
            xmax=xmax,
            color="tab:red",
            labelcolor="tab:red",
        )
    if quantile:
        q05, q95 = np.quantile(flat, q=[0.05, 0.95])
        # (value, label, alignment) of each quantile line. A line is
        # only drawn if it is strictly inside the plotted range.
        markers = ((q05, "q0.05 ", "right"), (q95, " q0.95", "left"))
        for value, text, alignment in markers:
            if xmin < value < xmax:
                axvline_quantile(ax, quantile=value, label=text, horizontalalignment=alignment)
def hist_continuous2(
    ax: Axes,
    array1: np.ndarray,
    array2: np.ndarray,
    label1: str = "first",
    label2: str = "second",
    nbins: int | None = None,
    density: bool = False,
    yscale: str = "linear",
    xmin: float | str | None = None,
    xmax: float | str | None = None,
) -> None:
    r"""Draw the histograms of two arrays on the same axes to compare
    their distributions.

    Both arrays are flattened first. The range and the automatic
    y-axis scale are computed on the concatenation of the two arrays,
    and nothing is plotted if this concatenation is empty.

    Args:
        ax: The axes of the matplotlib figure to update.
        array1: The first array with the data.
        array2: The second array with the data.
        label1: The label associated to the first array.
        label2: The label associated to the second array.
        nbins: The number of bins to use to plot.
        density: If True, draw and return a probability density:
            each bin will display the bin's raw count divided by the
            total number of counts and the bin width, so that the area
            under the histogram integrates to 1.
        yscale: The y-axis scale. If ``'auto'``, the
            ``'linear'`` or ``'log'/'symlog'`` scale is chosen based
            on the distribution.
        xmin: The minimum value of the range or its
            associated quantile. ``q0.1`` means the 10% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        xmax: The maximum value of the range or its
            associated quantile. ``q0.9`` means the 90% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import hist_continuous2
    >>> fig, ax = plt.subplots()
    >>> hist_continuous2(ax, array1=np.arange(101), array2=np.arange(51))

    ```
    """
    first = array1.ravel()
    second = array2.ravel()
    merged = np.concatenate([first, second])
    if merged.size == 0:
        return
    xmin, xmax = find_range(merged, xmin=xmin, xmax=xmax)
    # The two histograms share the same bins, range and opacity so
    # that they can be compared.
    shared = {"bins": nbins, "range": (xmin, xmax), "alpha": 0.5, "density": density}
    ax.hist(first, color="tab:blue", label=label1, **shared)
    ax.hist(second, color="tab:orange", label=label2, **shared)
    readable_xticklabels(ax, max_num_xticks=100)
    # The limits are only set when the range is not degenerate.
    if xmin < xmax:
        ax.set_xlim(xmin, xmax)
    if density:
        ax.set_ylabel("density (number of occurrences/total)")
    else:
        ax.set_ylabel("number of occurrences")
    if yscale == "auto":
        yscale = auto_yscale_continuous(array=merged, nbins=nbins)
    ax.set_yscale(yscale)
    ax.legend()
| r"""Contain plotting functions to analyze discrete values.""" | ||
| from __future__ import annotations | ||
| __all__ = ["bar_discrete", "bar_discrete_temporal"] | ||
| from typing import TYPE_CHECKING | ||
| import numpy as np | ||
| from matplotlib import pyplot as plt | ||
| from arkas.plot.utils import auto_yscale_discrete, readable_xticklabels | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
| from matplotlib.axes import Axes | ||
def bar_discrete(
    ax: Axes,
    names: Sequence,
    counts: Sequence[int],
    yscale: str = "auto",
) -> None:
    r"""Draw a bar plot with the number of occurrences of discrete
    values.

    Nothing is plotted if ``names`` is empty.

    Args:
        ax: The axes of the matplotlib figure to update.
        names: The name of the values to plot.
        counts: The number of value occurrences.
        yscale: The y-axis scale. If ``'auto'``, the
            ``'linear'`` or ``'log'/'symlog'`` scale is chosen based
            on the distribution.

    Example usage:

    ```pycon

    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import bar_discrete
    >>> fig, ax = plt.subplots()
    >>> bar_discrete(ax, names=["a", "b", "c", "d"], counts=[5, 100, 42, 27])

    ```
    """
    num_names = len(names)
    if num_names == 0:
        return
    positions = np.arange(num_names)
    # The gap between the bars is removed when there are many bars.
    bar_width = 1 if num_names >= 50 else 0.9
    ax.bar(positions, counts, width=bar_width, color="tab:blue")
    if yscale == "auto":
        yscale = auto_yscale_discrete(min_count=min(counts), max_count=max(counts))
    ax.set_yscale(yscale)
    ax.set_xticks(positions, labels=[str(name) for name in names])
    readable_xticklabels(ax, max_num_xticks=100)
    ax.set_xlim(-0.5, num_names - 0.5)
    ax.set_xlabel("values")
    ax.set_ylabel("number of occurrences")
def bar_discrete_temporal(
    ax: Axes,
    counts: np.ndarray,
    steps: Sequence | None = None,
    values: Sequence | None = None,
    proportion: bool = False,
) -> None:
    r"""Plot the temporal distribution of discrete values.

    A stacked bar is drawn for each step, with one segment per value.
    Nothing is plotted if ``counts`` is empty.

    Args:
        ax: The axes of the matplotlib figure to update.
        counts: A 2-d array that indicates the number of occurrences
            for each value and time step. The first dimension
            represents the value and the second dimension
            represents the steps.
        steps: The name associated to each step.
        values: The name associated to each value.
        proportion: If ``True``, it plots the normalized number of
            occurrences for each step.

    Raises:
        RuntimeError: if the length of ``values`` or ``steps`` does
            not match the shape of ``counts``.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import bar_discrete_temporal
    >>> fig, ax = plt.subplots()
    >>> bar_discrete_temporal(
    ...     ax, counts=np.ones((5, 20)), values=list(range(5)), steps=list(range(20))
    ... )

    ```
    """
    if counts.size == 0:
        return
    num_values, num_steps = counts.shape
    values = _prepare_values_bar_discrete_temporal(values=values, num_values=num_values)
    steps = _prepare_steps_bar_discrete_temporal(steps=steps, num_steps=num_steps)
    counts = _prepare_counts_bar_discrete_temporal(counts=counts, proportion=proportion)
    x = np.arange(num_steps, dtype=np.int64)
    # ``bottom`` tracks the current height of each stacked bar.
    bottom = np.zeros(num_steps, dtype=counts.dtype)
    width = 0.9 if num_steps < 50 else 1
    my_cmap = plt.get_cmap("viridis")
    for i in range(num_values):
        count = counts[i]
        ax.bar(x, count, label=values[i], bottom=bottom, width=width, color=my_cmap(i / num_values))
        bottom += count
    # The legend is only shown when there are a few named values.
    num_valid_values = sum(value is not None for value in values)
    if 0 < num_valid_values <= 10:
        ax.legend()
    ax.set_xticks(x, labels=steps)
    readable_xticklabels(ax, max_num_xticks=100)
    ax.set_xlim(-0.5, num_steps - 0.5)
    # The steps are on the x-axis. The label was previously set on
    # the y-axis and immediately overwritten by the next call.
    ax.set_xlabel("steps")
    ax.set_ylabel("proportion" if proportion else "number of occurrences")
| def _prepare_values_bar_discrete_temporal(values: Sequence | None, num_values: int) -> list: | ||
| r"""Return the list of values. | ||
| This function was designed to be used in ``bar_discrete_temporal``. | ||
| Args: | ||
| values: The sequence of values. | ||
| num_values: The expected number of values. | ||
| Returns: | ||
| The values. If ``values`` is ``None``, a list filled with | ||
| ``None`` is returned. | ||
| Raises: | ||
| RuntimeError: if the length of ``values`` does not match with | ||
| ``num_values``. | ||
| """ | ||
| if values is None: | ||
| return [None] * num_values | ||
| if len(values) != num_values: | ||
| msg = ( | ||
| f"values length ({len(values):,}) do not match with the count matrix " | ||
| f"first dimension ({num_values:,})" | ||
| ) | ||
| raise RuntimeError(msg) | ||
| return list(values) | ||
| def _prepare_steps_bar_discrete_temporal(steps: Sequence | None, num_steps: int) -> list: | ||
| r"""Return the list of steps. | ||
| This function was designed to be used in ``bar_discrete_temporal``. | ||
| Args: | ||
| steps: The sequence of steps. | ||
| num_steps: The expected number of steps. | ||
| Returns: | ||
| The steps. If ``steps`` is ``None``, a list filled with | ||
| ``None`` is returned. | ||
| Raises: | ||
| RuntimeError: if the length of ``steps`` does not match with | ||
| ``num_steps``. | ||
| """ | ||
| if steps is None: | ||
| return list(range(num_steps)) | ||
| if len(steps) != num_steps: | ||
| msg = ( | ||
| f"steps length ({len(steps):,}) do not match with the count matrix " | ||
| f"second dimension ({num_steps:,})" | ||
| ) | ||
| raise RuntimeError(msg) | ||
| return list(steps) | ||
| def _prepare_counts_bar_discrete_temporal(counts: np.ndarray, proportion: bool) -> np.ndarray: | ||
| r"""Prepare the count matrix. | ||
| This function was designed to be used in ``bar_discrete_temporal``. | ||
| Args: | ||
| counts: A 2-d array that indicates the number of occurrences | ||
| for each value and time step. The first dimension | ||
| represents the value and the second dimension | ||
| represents the steps. | ||
| proportion: If ``True``, the count matrix is normalized number | ||
| of occurrences for each step. | ||
| Returns: | ||
| The count matrix. | ||
| """ | ||
| if not proportion: | ||
| return counts | ||
| return counts / np.clip(counts.sum(axis=0), a_min=1, a_max=None) |
| r"""Contain functionalities to plot the temporal distribution of the | ||
| number of missing values.""" | ||
| from __future__ import annotations | ||
| __all__ = ["plot_null_temporal"] | ||
| from typing import TYPE_CHECKING | ||
| import numpy as np | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
| from matplotlib.axes import Axes | ||
def plot_null_temporal(ax: Axes, nulls: Sequence, totals: Sequence, labels: Sequence) -> None:
    r"""Draw the temporal distribution of the number of missing
    values.

    The total and null counts are drawn as bars, and the ratio
    ``nulls / totals`` is drawn as a line on a twin y-axis.
    ``nulls``, ``totals``, and ``labels`` must have the same length
    and have the same order. Nothing is plotted if they are empty.

    Args:
        ax: The Axes object that encapsulates all the elements of an
            individual (sub-)plot in a figure.
        nulls: The number of null values for each temporal period.
        totals: The number of total values for each temporal period.
        labels: The labels for each temporal period.

    Raises:
        ValueError: if ``nulls``, ``totals``, and ``labels`` have
            different lengths.

    Example usage:

    ```pycon

    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import plot_null_temporal
    >>> fig, ax = plt.subplots()
    >>> plot_null_temporal(
    ...     ax, nulls=[1, 2, 3, 4], totals=[10, 12, 14, 16], labels=["jan", "feb", "mar", "apr"]
    ... )

    ```
    """
    num_nulls, num_totals, num_labels = len(nulls), len(totals), len(labels)
    if num_nulls != num_totals:
        msg = f"nulls ({num_nulls:,}) and totals ({num_totals:,}) have different lengths"
        raise ValueError(msg)
    if num_labels != num_totals:
        msg = f"nulls ({num_nulls:,}) and labels ({num_labels:,}) have different lengths"
        raise ValueError(msg)
    if num_nulls == 0:
        return
    tick_labels = [str(label) for label in labels]
    null_counts = np.asarray(nulls)
    total_counts = np.asarray(totals)
    positions = np.arange(len(tick_labels))

    # Left axis: bars with the total and null counts.
    count_color = "tab:blue"
    ax.set_ylabel("number of null/total values", color=count_color)
    ax.tick_params(axis="y", labelcolor=count_color)
    ax.bar(x=positions, height=total_counts, color="tab:cyan", alpha=0.5, label="total")
    ax.bar(x=positions, height=null_counts, color=count_color, alpha=0.8, label="null")
    ax.legend()

    # Right axis: line with the ratio of null values.
    ratio_color = "black"
    ratio_ax = ax.twinx()
    ratio_ax.set_ylabel("percentage", color=ratio_color)
    ratio_ax.tick_params(axis="y", labelcolor=ratio_color)
    ratio_ax.plot(positions, null_counts / total_counts, "o-", color=ratio_color)

    ax.set_xticks(positions, labels=tick_labels)
    ax.set_xlim(-0.5, len(tick_labels) - 0.5)
| r"""Contain the implementation of a plotter to analyze a Series with | ||
| continuous values.""" | ||
| from __future__ import annotations | ||
| __all__ = ["BaseFigureCreator", "ContinuousSeriesPlotter", "MatplotlibFigureCreator"] | ||
| from abc import ABC, abstractmethod | ||
| from typing import TYPE_CHECKING, Any | ||
| import matplotlib.pyplot as plt | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.figure.creator import FigureCreatorRegistry | ||
| from arkas.figure.html import HtmlFigure | ||
| from arkas.figure.matplotlib import MatplotlibFigure, MatplotlibFigureConfig | ||
| from arkas.figure.utils import MISSING_FIGURE_MESSAGE | ||
| from arkas.plot.continuous import hist_continuous | ||
| from arkas.plot.utils.hist import adjust_nbins | ||
| from arkas.plotter.base import BasePlotter | ||
| from arkas.plotter.vanilla import Plotter | ||
| from arkas.utils.array import filter_range, nonnan, to_array | ||
| from arkas.utils.range import find_range | ||
| if TYPE_CHECKING: | ||
| from arkas.figure.base import BaseFigure | ||
| from arkas.state.series import SeriesState | ||
class BaseFigureCreator(ABC):
    r"""Define the base class to create a figure with the content of the
    column.

    Subclasses implement ``create`` to build the figure from a
    ``SeriesState``.
    """

    @abstractmethod
    def create(self, state: SeriesState) -> BaseFigure:
        r"""Create a figure with the content of the column.

        Args:
            state: The state containing the Series to analyze.

        Returns:
            The generated figure.

        Example usage:

        ```pycon

        >>> import polars as pl
        >>> from arkas.figure import MatplotlibFigureConfig
        >>> from arkas.state import SeriesState
        >>> creator = MatplotlibFigureCreator()
        >>> fig = creator.create(SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7])))

        ```
        """
class MatplotlibFigureCreator(BaseFigureCreator):
    r"""Build a matplotlib histogram figure from the values of a Series.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.figure import MatplotlibFigureConfig
    >>> from arkas.state import SeriesState
    >>> creator = MatplotlibFigureCreator()
    >>> fig = creator.create(SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7])))

    ```
    """

    def __repr__(self) -> str:
        return f"{type(self).__qualname__}()"

    def create(self, state: SeriesState) -> BaseFigure:
        r"""Create the histogram figure for the Series stored in the state.

        Args:
            state: The state containing the Series to analyze and the
                figure configuration (``init``, ``xmin``, ``xmax``,
                ``nbins`` and ``yscale`` arguments are read from it).

        Returns:
            A ``MatplotlibFigure`` with the histogram, or an
                ``HtmlFigure`` with the missing-figure message when no
                value is left after ``nonnan``.
        """
        values = nonnan(to_array(state.series))
        if values.size == 0:
            # No value remains after ``nonnan``: return the placeholder figure.
            return HtmlFigure(MISSING_FIGURE_MESSAGE)

        config = state.figure_config
        fig, ax = plt.subplots(**config.get_arg("init", {}))
        xmin, xmax = find_range(
            values,
            xmin=config.get_arg("xmin"),
            xmax=config.get_arg("xmax"),
        )
        # The number of bins is adjusted on the values inside the range,
        # but the histogram itself receives all the values.
        requested_nbins = config.get_arg("nbins")
        in_range = filter_range(values, xmin=xmin, xmax=xmax)
        nbins = adjust_nbins(nbins=requested_nbins, array=in_range)
        hist_continuous(
            ax=ax,
            array=values,
            nbins=nbins,
            xmin=xmin,
            xmax=xmax,
            yscale=config.get_arg("yscale", default="linear"),
        )
        ax.set_title(f"data distribution for column {state.series.name!r}")
        fig.tight_layout()
        return MatplotlibFigure(fig)
class ContinuousSeriesPlotter(BasePlotter):
    r"""Plotter that generates a histogram figure for a Series with
    continuous values.

    The figure is built by the creator registered in ``registry`` for
    the backend of the state's figure configuration.

    Args:
        state: The state containing the Series to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.plotter import ContinuousSeriesPlotter
    >>> from arkas.state import SeriesState
    >>> plotter = ContinuousSeriesPlotter(SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7])))
    >>> plotter
    ContinuousSeriesPlotter(
     (state): SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    # Maps a figure backend to the creator that builds the figure for it.
    registry = FigureCreatorRegistry[BaseFigureCreator](
        {MatplotlibFigureConfig.backend(): MatplotlibFigureCreator()}
    )

    def __init__(self, state: SeriesState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def compute(self) -> Plotter:
        figures = self.plot()
        return Plotter(figures)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two plotters are equal when they share the type and their states are equal.
        return isinstance(other, self.__class__) and self._state.equal(
            other._state, equal_nan=equal_nan
        )

    def plot(self, prefix: str = "", suffix: str = "") -> dict:
        backend = self._state.figure_config.backend()
        creator = self.registry.find_creator(backend)
        return {f"{prefix}continuous_histogram{suffix}": creator.create(self._state)}
| r"""Contain the implementation of a plotter that plots the number of | ||
| null values for each column.""" | ||
| from __future__ import annotations | ||
| __all__ = ["BaseFigureCreator", "MatplotlibFigureCreator", "NullValuePlotter"] | ||
| from abc import ABC, abstractmethod | ||
| from typing import TYPE_CHECKING, Any | ||
| import matplotlib.pyplot as plt | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from arkas.figure.creator import FigureCreatorRegistry | ||
| from arkas.figure.html import HtmlFigure | ||
| from arkas.figure.matplotlib import MatplotlibFigure, MatplotlibFigureConfig | ||
| from arkas.figure.utils import MISSING_FIGURE_MESSAGE | ||
| from arkas.plot.utils import readable_xticklabels | ||
| from arkas.plotter.base import BasePlotter | ||
| from arkas.plotter.vanilla import Plotter | ||
| if TYPE_CHECKING: | ||
| from arkas.figure.base import BaseFigure | ||
| from arkas.state.null_value import NullValueState | ||
class BaseFigureCreator(ABC):
    r"""Define the base class to create a bar plot figure with the
    number of null values for each column.

    Subclasses implement ``create`` to build the figure from a
    ``NullValueState``.
    """

    @abstractmethod
    def create(self, state: NullValueState) -> BaseFigure:
        r"""Create a bar plot figure with the number of null values for
        each column.

        Args:
            state: The state containing the number of null values per
                column.

        Returns:
            The generated figure.

        Example usage:

        ```pycon

        >>> import numpy as np
        >>> from arkas.figure import MatplotlibFigureConfig
        >>> from arkas.state import NullValueState
        >>> creator = MatplotlibFigureCreator()
        >>> fig = creator.create(
        ...     NullValueState(
        ...         null_count=np.array([0, 1, 2]),
        ...         total_count=np.array([5, 5, 5]),
        ...         columns=["col1", "col2", "col3"],
        ...     )
        ... )

        ```
        """
class MatplotlibFigureCreator(BaseFigureCreator):
    r"""Build a matplotlib bar plot with the number of null values for
    each column.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.figure import MatplotlibFigureConfig
    >>> from arkas.state import NullValueState
    >>> creator = MatplotlibFigureCreator()
    >>> fig = creator.create(
    ...     NullValueState(
    ...         null_count=np.array([0, 1, 2]),
    ...         total_count=np.array([5, 5, 5]),
    ...         columns=["col1", "col2", "col3"],
    ...     )
    ... )

    ```
    """

    def __repr__(self) -> str:
        return f"{type(self).__qualname__}()"

    def create(self, state: NullValueState) -> BaseFigure:
        r"""Create the bar plot for the null counts stored in the state.

        Args:
            state: The state containing the number of null values per
                column.

        Returns:
            A ``MatplotlibFigure`` with the bar plot, or an
                ``HtmlFigure`` with the missing-figure message when the
                state has no column.
        """
        if state.null_count.shape[0] == 0:
            return HtmlFigure(MISSING_FIGURE_MESSAGE)

        fig, ax = plt.subplots(**state.figure_config.get_arg("init", {}))
        # Bars are ordered by number of null values, then by column name.
        ordered = state.to_dataframe().sort(by=["null", "column"])
        names = ordered["column"].to_list()
        counts = ordered["null"].to_numpy()
        ax.bar(x=names, height=counts, color="tab:blue")
        ax.set_xlim(-0.5, len(state.columns) - 0.5)
        readable_xticklabels(ax, max_num_xticks=100)
        ax.set_xlabel("column")
        ax.set_ylabel("number of null values")
        ax.set_title("number of null values per column")
        fig.tight_layout()
        return MatplotlibFigure(fig)
class NullValuePlotter(BasePlotter):
    r"""Plotter that generates a figure with the number of null values
    for each column.

    The figure is built by the creator registered in ``registry`` for
    the backend of the state's figure configuration.

    Args:
        state: The state containing the number of null values per
            column.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.plotter import NullValuePlotter
    >>> from arkas.state import NullValueState
    >>> plotter = NullValuePlotter(
    ...     NullValueState(
    ...         null_count=np.array([0, 1, 2]),
    ...         total_count=np.array([5, 5, 5]),
    ...         columns=["col1", "col2", "col3"],
    ...     )
    ... )
    >>> plotter
    NullValuePlotter(
     (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    # Maps a figure backend to the creator that builds the figure for it.
    registry = FigureCreatorRegistry[BaseFigureCreator](
        {MatplotlibFigureConfig.backend(): MatplotlibFigureCreator()}
    )

    def __init__(self, state: NullValueState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def compute(self) -> Plotter:
        figures = self.plot()
        return Plotter(figures)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two plotters are equal when they share the type and their states are equal.
        return isinstance(other, self.__class__) and self._state.equal(
            other._state, equal_nan=equal_nan
        )

    def plot(self, prefix: str = "", suffix: str = "") -> dict:
        backend = self._state.figure_config.backend()
        creator = self.registry.find_creator(backend)
        return {f"{prefix}null_values{suffix}": creator.create(self._state)}
| r"""Contain the implementation of a plotter that plots the number of null values for each temporal period.""" | ||
| from __future__ import annotations | ||
| __all__ = ["BaseFigureCreator", "MatplotlibFigureCreator", "TemporalNullValuePlotter"] | ||
| from abc import ABC, abstractmethod | ||
| from typing import TYPE_CHECKING, Any | ||
| import matplotlib.pyplot as plt | ||
| from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping | ||
| from grizz.utils.null import compute_temporal_null_count | ||
| from arkas.figure.creator import FigureCreatorRegistry | ||
| from arkas.figure.html import HtmlFigure | ||
| from arkas.figure.matplotlib import MatplotlibFigure, MatplotlibFigureConfig | ||
| from arkas.figure.utils import MISSING_FIGURE_MESSAGE | ||
| from arkas.plot import plot_null_temporal | ||
| from arkas.plot.utils import readable_xticklabels | ||
| from arkas.plotter.base import BasePlotter | ||
| from arkas.plotter.vanilla import Plotter | ||
| if TYPE_CHECKING: | ||
| from arkas.figure.base import BaseFigure | ||
| from arkas.state.temporal_dataframe import TemporalDataFrameState | ||
class BaseFigureCreator(ABC):
    r"""Define the base class to create a figure with the content of
    each column.

    Subclasses implement ``create`` to build the figure from a
    ``TemporalDataFrameState``.
    """

    @abstractmethod
    def create(self, state: TemporalDataFrameState) -> BaseFigure:
        r"""Create a figure with the content of each column.

        Args:
            state: The state containing the DataFrame to analyze.

        Returns:
            The generated figure.

        Example usage:

        ```pycon

        >>> from datetime import datetime, timezone
        >>> import polars as pl
        >>> from arkas.plotter.temporal_null_value import MatplotlibFigureCreator
        >>> from arkas.state import TemporalDataFrameState
        >>> creator = MatplotlibFigureCreator()
        >>> frame = pl.DataFrame(
        ...     {
        ...         "col1": [0, 1, 1, 0],
        ...         "col2": [0, 1, 0, 1],
        ...         "col3": [1, 0, 0, 0],
        ...         "datetime": [
        ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
        ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
        ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
        ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
        ...         ],
        ...     },
        ...     schema={
        ...         "col1": pl.Int64,
        ...         "col2": pl.Int64,
        ...         "col3": pl.Int64,
        ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
        ...     },
        ... )
        >>> fig = creator.create(
        ...     TemporalDataFrameState(frame, temporal_column="datetime", period="1d")
        ... )

        ```
        """
class MatplotlibFigureCreator(BaseFigureCreator):
    r"""Build a matplotlib figure with the number of null values for
    each temporal period.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.plotter.temporal_null_value import MatplotlibFigureCreator
    >>> from arkas.state import TemporalDataFrameState
    >>> creator = MatplotlibFigureCreator()
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0],
    ...         "col2": [0, 1, 0, 1],
    ...         "col3": [1, 0, 0, 0],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Int64,
    ...         "col2": pl.Int64,
    ...         "col3": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> fig = creator.create(
    ...     TemporalDataFrameState(frame, temporal_column="datetime", period="1d")
    ... )

    ```
    """

    def __repr__(self) -> str:
        return f"{type(self).__qualname__}()"

    def create(self, state: TemporalDataFrameState) -> BaseFigure:
        r"""Create the temporal null value figure for the DataFrame
        stored in the state.

        Args:
            state: The state containing the DataFrame to analyze, the
                name of the temporal column and the temporal period.

        Returns:
            A ``MatplotlibFigure`` with the plot, or an ``HtmlFigure``
                with the missing-figure message when the DataFrame has
                no row.
        """
        frame = state.dataframe
        if frame.shape[0] == 0:
            return HtmlFigure(MISSING_FIGURE_MESSAGE)

        fig, ax = plt.subplots(**state.figure_config.get_arg("init", {}))
        # Null values are counted on every column except the temporal one.
        data_columns = list(frame.columns)
        data_columns.remove(state.temporal_column)
        nulls, totals, labels = compute_temporal_null_count(
            frame=state.dataframe,
            columns=data_columns,
            temporal_column=state.temporal_column,
            period=state.period,
        )
        plot_null_temporal(ax=ax, labels=labels, nulls=nulls, totals=totals)
        readable_xticklabels(ax, max_num_xticks=100)
        fig.tight_layout()
        return MatplotlibFigure(fig)
class TemporalNullValuePlotter(BasePlotter):
    r"""Plotter that generates a figure with the number of null values
    for each temporal period.

    The figure is built by the creator registered in ``registry`` for
    the backend of the state's figure configuration.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.plotter import TemporalNullValuePlotter
    >>> from arkas.state import TemporalDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0],
    ...         "col2": [0, 1, 0, 1],
    ...         "col3": [1, 0, 0, 0],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Int64,
    ...         "col2": pl.Int64,
    ...         "col3": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> plotter = TemporalNullValuePlotter(
    ...     TemporalDataFrameState(frame, temporal_column="datetime", period="1d")
    ... )
    >>> plotter
    TemporalNullValuePlotter(
     (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    # Maps a figure backend to the creator that builds the figure for it.
    registry = FigureCreatorRegistry[BaseFigureCreator](
        {MatplotlibFigureConfig.backend(): MatplotlibFigureCreator()}
    )

    def __init__(self, state: TemporalDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{type(self).__qualname__}(\n {body}\n)"

    def compute(self) -> Plotter:
        figures = self.plot()
        return Plotter(figures)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two plotters are equal when they share the type and their states are equal.
        return isinstance(other, self.__class__) and self._state.equal(
            other._state, equal_nan=equal_nan
        )

    def plot(self, prefix: str = "", suffix: str = "") -> dict:
        backend = self._state.figure_config.backend()
        creator = self.registry.find_creator(backend)
        return {f"{prefix}temporal_null_value{suffix}": creator.create(self._state)}
| r"""Implement a state that contains the number of null values per | ||
| column.""" | ||
| from __future__ import annotations | ||
| __all__ = ["NullValueState"] | ||
| import sys | ||
| from typing import TYPE_CHECKING, Any | ||
| import numpy as np | ||
| import polars as pl | ||
| from coola import objects_are_equal | ||
| from coola.utils.format import repr_mapping_line | ||
| from grizz.utils.null import compute_null_count | ||
| from arkas.figure import BaseFigureConfig, get_default_config | ||
| from arkas.state.base import BaseState | ||
| if sys.version_info >= (3, 11): | ||
| from typing import Self | ||
| else: # pragma: no cover | ||
| from typing_extensions import ( | ||
| Self, # use backport because it was added in python 3.11 | ||
| ) | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
class NullValueState(BaseState):
    r"""Implement a state that contains the number of null values per
    column.

    Args:
        null_count: The array (or array-like) with the number of null
            values for each column. It is flattened to 1-d.
        total_count: The array (or array-like) with the total number
            of values for each column. It is flattened to 1-d.
        columns: The column names.
        figure_config: An optional figure configuration. The default
            configuration is used if ``None``.

    Raises:
        ValueError: if the number of columns does not match the size
            of ``null_count`` or ``total_count``.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.state import NullValueState
    >>> state = NullValueState(
    ...     null_count=np.array([0, 1, 2]),
    ...     total_count=np.array([5, 5, 5]),
    ...     columns=["col1", "col2", "col3"],
    ... )
    >>> state
    NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())

    ```
    """

    def __init__(
        self,
        null_count: np.ndarray | Sequence[int],
        total_count: np.ndarray | Sequence[int],
        columns: Sequence[str],
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        # ``np.asarray`` returns ndarray inputs unchanged and converts other
        # array-likes (e.g. lists), which have no ``ravel`` method.
        self._null_count = np.asarray(null_count).ravel()
        self._total_count = np.asarray(total_count).ravel()
        self._columns = tuple(columns)
        self._figure_config = figure_config or get_default_config()

        if len(self._columns) != self._null_count.shape[0]:
            msg = (
                f"'columns' ({len(self._columns):,}) and 'null_count' "
                f"({self._null_count.shape[0]:,}) do not match"
            )
            raise ValueError(msg)
        if len(self._columns) != self._total_count.shape[0]:
            msg = (
                f"'columns' ({len(self._columns):,}) and 'total_count' "
                f"({self._total_count.shape[0]:,}) do not match"
            )
            raise ValueError(msg)

    def __repr__(self) -> str:
        args = repr_mapping_line(
            {
                "num_columns": self._null_count.shape[0],
                "figure_config": self._figure_config,
            }
        )
        return f"{self.__class__.__qualname__}({args})"

    @property
    def columns(self) -> tuple[str, ...]:
        r"""The column names."""
        return self._columns

    @property
    def null_count(self) -> np.ndarray:
        r"""The 1-d array with the number of null values per column."""
        return self._null_count

    @property
    def total_count(self) -> np.ndarray:
        r"""The 1-d array with the total number of values per column."""
        return self._total_count

    @property
    def figure_config(self) -> BaseFigureConfig:
        r"""The figure configuration.

        The constructor replaces a missing configuration by the default
        one.
        """
        return self._figure_config

    def clone(self, deep: bool = True) -> Self:
        # A deep clone copies the arrays and the figure config; a shallow
        # clone shares them with the current state.
        return self.__class__(
            null_count=self._null_count.copy() if deep else self._null_count,
            total_count=self._total_count.copy() if deep else self._total_count,
            columns=self._columns,
            figure_config=self._figure_config.clone() if deep else self._figure_config,
        )

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        if not isinstance(other, self.__class__):
            return False
        return (
            objects_are_equal(self.null_count, other.null_count, equal_nan=equal_nan)
            and objects_are_equal(self.total_count, other.total_count, equal_nan=equal_nan)
            and objects_are_equal(self.columns, other.columns, equal_nan=equal_nan)
            and objects_are_equal(self.figure_config, other.figure_config, equal_nan=equal_nan)
        )

    def to_dataframe(self) -> pl.DataFrame:
        r"""Export the content of the state to a DataFrame.

        Returns:
            The DataFrame with the columns ``column``, ``null`` and
                ``total``.

        Example usage:

        ```pycon

        >>> import numpy as np
        >>> from arkas.state import NullValueState
        >>> state = NullValueState(
        ...     null_count=np.array([0, 1, 2]),
        ...     total_count=np.array([5, 5, 5]),
        ...     columns=["col1", "col2", "col3"],
        ... )
        >>> state.to_dataframe()
        shape: (3, 3)
        ┌────────┬──────┬───────┐
        │ column ┆ null ┆ total │
        │ ---    ┆ ---  ┆ ---   │
        │ str    ┆ i64  ┆ i64   │
        ╞════════╪══════╪═══════╡
        │ col1   ┆ 0    ┆ 5     │
        │ col2   ┆ 1    ┆ 5     │
        │ col3   ┆ 2    ┆ 5     │
        └────────┴──────┴───────┘

        ```
        """
        return pl.DataFrame(
            {"column": self._columns, "null": self._null_count, "total": self._total_count},
            schema={"column": pl.String, "null": pl.Int64, "total": pl.Int64},
        )

    @classmethod
    def from_dataframe(
        cls, dataframe: pl.DataFrame, figure_config: BaseFigureConfig | None = None
    ) -> Self:
        r"""Instantiate a ``NullValueState`` object from a DataFrame.

        Args:
            dataframe: The DataFrame.
            figure_config: An optional figure configuration.

        Returns:
            The instantiated ``NullValueState`` object.

        Example usage:

        ```pycon

        >>> import polars as pl
        >>> from arkas.state import NullValueState
        >>> frame = pl.DataFrame(
        ...     {
        ...         "col1": [0, 1, 1, 0, 0, 1, None],
        ...         "col2": [0, 1, None, None, 0, 1, 0],
        ...         "col3": [None, 0, 0, 0, None, 1, None],
        ...     }
        ... )
        >>> state = NullValueState.from_dataframe(frame)
        >>> state
        NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())

        ```
        """
        nrows, ncols = dataframe.shape
        return cls(
            columns=list(dataframe.columns),
            null_count=compute_null_count(dataframe),
            # Every column has the same total: the number of rows.
            total_count=np.full((ncols,), nrows),
            figure_config=figure_config,
        )
| r"""Implement the Series state.""" | ||
| from __future__ import annotations | ||
| __all__ = ["SeriesState"] | ||
| import sys | ||
| from typing import TYPE_CHECKING, Any | ||
| from coola import objects_are_equal | ||
| from coola.utils.format import repr_mapping_line, str_indent, str_mapping | ||
| from arkas.figure.utils import get_default_config | ||
| from arkas.state.base import BaseState | ||
| if sys.version_info >= (3, 11): | ||
| from typing import Self | ||
| else: # pragma: no cover | ||
| from typing_extensions import ( | ||
| Self, # use backport because it was added in python 3.11 | ||
| ) | ||
| if TYPE_CHECKING: | ||
| import polars as pl | ||
| from arkas.figure.base import BaseFigureConfig | ||
class SeriesState(BaseState):
    r"""State that stores a Series and its figure configuration.

    Args:
        series: The Series.
        figure_config: An optional figure configuration. The default
            configuration is used if it is not provided.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.state import SeriesState
    >>> state = SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7]))
    >>> state
    SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())

    ```
    """

    def __init__(
        self,
        series: pl.Series,
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        self._series = series
        self._figure_config = figure_config or get_default_config()

    def __repr__(self) -> str:
        # Only the name and the shape of the Series are shown, not its values.
        summary = {
            "name": self._series.name,
            "values": self._series.shape,
            "figure_config": self._figure_config,
        }
        return f"{type(self).__qualname__}({repr_mapping_line(summary)})"

    def __str__(self) -> str:
        summary = {
            "name": self._series.name,
            "values": self._series.shape,
            "figure_config": self._figure_config,
        }
        body = str_indent(str_mapping(summary))
        return f"{type(self).__qualname__}({body})"

    @property
    def series(self) -> pl.Series:
        return self._series

    @property
    def figure_config(self) -> BaseFigureConfig | None:
        return self._figure_config

    def clone(self, deep: bool = True) -> Self:
        # A shallow clone shares the Series and the figure config with this state.
        if deep:
            return self.__class__(
                series=self._series.clone(),
                figure_config=self._figure_config.clone(),
            )
        return self.__class__(series=self._series, figure_config=self._figure_config)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        return isinstance(other, self.__class__) and objects_are_equal(
            self.get_args(), other.get_args(), equal_nan=equal_nan
        )

    def get_args(self) -> dict:
        return {"series": self._series, "figure_config": self._figure_config}
| r"""Contain utility functions to manage ranges of values.""" | ||
| from __future__ import annotations | ||
| __all__ = ["find_range"] | ||
| import numpy as np | ||
def _parse_quantile(bound: str) -> float:
    r"""Extract the quantile level from a string like ``'q0.1'``.

    Args:
        bound: The quantile string. It must start with ``'q'``
            followed by the quantile level.

    Returns:
        The quantile level as a float.

    Raises:
        ValueError: if the string does not start with ``'q'`` or if
            the level cannot be converted to a float.
    """
    if not bound.startswith("q"):
        # Without this check the first character is silently dropped,
        # e.g. '10' would be interpreted as the quantile 0.
        msg = f"Incorrect quantile {bound!r}: the expected format is 'q<level>' e.g. 'q0.1'"
        raise ValueError(msg)
    return float(bound[1:])


def find_range(
    values: np.ndarray,
    xmin: float | str | None = None,
    xmax: float | str | None = None,
) -> tuple[float, float]:
    r"""Find a valid range of values.

    Args:
        values: The values used to find the quantiles.
        xmin: The minimum value of the range or its
            associated quantile. ``q0.1`` means the 10% quantile.
            ``q0`` is the minimum value and ``q1`` is the maximum
            value. If ``None``, the minimum (ignoring NaNs) of
            ``values`` is used.
        xmax: The maximum value of the range or its
            associated quantile. ``q0.9`` means the 90% quantile.
            ``q0`` is the minimum value and ``q1`` is the maximum
            value. If ``None``, the maximum (ignoring NaNs) of
            ``values`` is used.

    Returns:
        The range of values in the format ``(min, max)``.
            It returns ``(nan, nan)`` if the input array is empty.

    Raises:
        ValueError: if a quantile string is not in the format
            ``'q<level>'``.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.utils.range import find_range
    >>> data = np.arange(101)
    >>> find_range(data)
    (0, 100)
    >>> find_range(data, xmin=5, xmax=50)
    (5, 50)
    >>> find_range(data, xmin="q0.1", xmax="q0.9")
    (10.0, 90.0)

    ```
    """
    if values.size == 0:
        return float("nan"), float("nan")
    if xmin is None:
        xmin = np.nanmin(values).item()
    if xmax is None:
        xmax = np.nanmax(values).item()

    levels = [_parse_quantile(x) for x in (xmin, xmax) if isinstance(x, str)]
    if levels:  # skip the quantile computation when no quantile is requested
        quantiles = np.nanquantile(values, levels)
        # ``levels`` has 1 or 2 items: the first belongs to ``xmin`` when it
        # is a quantile, and the last always belongs to ``xmax`` when it is one.
        if isinstance(xmin, str):
            xmin = quantiles[0]
        if isinstance(xmax, str):
            xmax = quantiles[-1]

    # Convert numpy scalars to native python numbers.
    if isinstance(xmin, np.number):
        xmin = xmin.item()
    if isinstance(xmax, np.number):
        xmax = xmax.item()
    return (xmin, xmax)
| r"""Contain statistics utility functions.""" | ||
| from __future__ import annotations | ||
| __all__ = [ | ||
| "compute_statistics_continuous", | ||
| "compute_statistics_continuous_array", | ||
| "compute_statistics_continuous_series", | ||
| "quantile", | ||
| ] | ||
| from typing import TYPE_CHECKING | ||
| import numpy as np | ||
| import polars as pl | ||
| from scipy.stats import kurtosis, skew | ||
| from arkas.utils.array import nonnan | ||
| if TYPE_CHECKING: | ||
| from collections.abc import Sequence | ||
def compute_statistics_continuous(data: np.ndarray | pl.Series) -> dict[str, float]:
    r"""Compute descriptive statistics for data with continuous values.

    The computation is delegated to
    ``compute_statistics_continuous_series`` for a ``polars.Series``
    and to ``compute_statistics_continuous_array`` otherwise.

    Args:
        data: The data to analyze.

    Returns:
        The descriptive statistics for the input data.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.utils.stats import compute_statistics_continuous
    >>> compute_statistics_continuous(np.arange(101))
    {'count': 101, 'nunique': 101, 'num_non_nulls': 101, 'num_nulls': 0,
     'mean': 50.0, 'std': 29.15...,
     'skewness': 0.0, 'kurtosis': -1.20..., 'min': 0.0, 'q001': 0.1, 'q01': 1.0,
     'q05': 5.0, 'q10': 10.0, 'q25': 25.0, 'median': 50.0, 'q75': 75.0, 'q90': 90.0,
     'q95': 95.0, 'q99': 99.0, 'q999': 99.9, 'max': 100.0, '>0': 100, '<0': 0, '=0': 1}

    ```
    """
    if not isinstance(data, pl.Series):
        return compute_statistics_continuous_array(data)
    return compute_statistics_continuous_series(data)
def compute_statistics_continuous_array(array: np.ndarray) -> dict[str, float]:
    r"""Compute descriptive statistics for an array with continuous
    values.

    The array is flattened and converted to ``float64``. The values
    kept by ``nonnan`` are used for all the statistics except
    ``count``, ``nunique`` and the ``>0``/``<0``/``=0`` counts, which
    are computed on the full array.

    Args:
        array: The data to analyze.

    Returns:
        The descriptive statistics for the input data. The float
            statistics are NaN and the sign counts are 0 if no value
            is left after ``nonnan``.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.utils.stats import compute_statistics_continuous_array
    >>> compute_statistics_continuous_array(np.arange(101))
    {'count': 101, 'nunique': 101, 'num_non_nulls': 101, 'num_nulls': 0,
     'mean': 50.0, 'std': 29.15...,
     'skewness': 0.0, 'kurtosis': -1.20..., 'min': 0.0, 'q001': 0.1, 'q01': 1.0,
     'q05': 5.0, 'q10': 10.0, 'q25': 25.0, 'median': 50.0, 'q75': 75.0, 'q90': 90.0,
     'q95': 95.0, 'q99': 99.0, 'q999': 99.9, 'max': 100.0, '>0': 100, '<0': 0, '=0': 1}

    ```
    """
    flat = array.ravel().astype(np.float64)
    valid = nonnan(flat)
    count = int(flat.size)
    num_non_nulls = int(valid.size)
    stats = {
        "count": count,
        "nunique": int(np.unique(flat).size),
        "num_non_nulls": num_non_nulls,
        "num_nulls": count - num_non_nulls,
    }

    if valid.size == 0:
        # The key order matches the order used in the non-empty case.
        float_keys = (
            "mean",
            "std",
            "skewness",
            "kurtosis",
            "min",
            "q001",
            "q01",
            "q05",
            "q10",
            "q25",
            "median",
            "q75",
            "q90",
            "q95",
            "q99",
            "q999",
            "max",
        )
        missing = {key: float("nan") for key in float_keys}
        return stats | missing | {">0": 0, "<0": 0, "=0": 0}

    qs = quantile(valid, q=[0.001, 0.01, 0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.99, 0.999])
    summary = {
        "mean": np.mean(valid).item(),
        "std": np.std(valid).item(),
        "skewness": float(skew(valid)),
        "kurtosis": float(kurtosis(valid)),
        "min": np.min(valid).item(),
        "q001": qs[0.001],
        "q01": qs[0.01],
        "q05": qs[0.05],
        "q10": qs[0.1],
        "q25": qs[0.25],
        "median": np.median(valid).item(),
        "q75": qs[0.75],
        "q90": qs[0.9],
        "q95": qs[0.95],
        "q99": qs[0.99],
        "q999": qs[0.999],
        "max": np.max(valid).item(),
        # The sign counts use the full array: comparisons with NaN are False.
        ">0": (flat > 0).sum().item(),
        "<0": (flat < 0).sum().item(),
        "=0": (flat == 0).sum().item(),
    }
    return stats | summary
def compute_statistics_continuous_series(series: pl.Series) -> dict[str, float]:
    r"""Compute descriptive statistics for a Series with continuous
    values.

    The statistics are computed by
    ``compute_statistics_continuous_array`` on the non-null values,
    and the ``count``, ``nunique``, ``num_nulls`` and ``num_non_nulls``
    entries are then replaced by the values computed on the Series.

    Args:
        series: The series to analyze.

    Returns:
        The descriptive statistics for the input data.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.utils.stats import compute_statistics_continuous_series
    >>> compute_statistics_continuous_series(pl.Series(list(range(101))))
    {'count': 101, 'nunique': 101, 'num_non_nulls': 101, 'num_nulls': 0,
     'mean': 50.0, 'std': 29.15...,
     'skewness': 0.0, 'kurtosis': -1.20..., 'min': 0.0, 'q001': 0.1, 'q01': 1.0,
     'q05': 5.0, 'q10': 10.0, 'q25': 25.0, 'median': 50.0, 'q75': 75.0, 'q90': 90.0,
     'q95': 95.0, 'q99': 99.0, 'q999': 99.9, 'max': 100.0, '>0': 100, '<0': 0, '=0': 1}

    ```
    """
    total = int(series.shape[0])
    num_nulls = int(series.null_count())
    overrides = {
        "count": total,
        "nunique": series.n_unique(),
        "num_nulls": num_nulls,
        "num_non_nulls": total - num_nulls,
    }
    array_stats = compute_statistics_continuous_array(series.drop_nulls().to_numpy())
    # The right operand wins for the shared keys; the key order of the
    # left operand is kept.
    return array_stats | overrides
def quantile(array: np.ndarray | Sequence[float], q: Sequence[float]) -> dict[float, float]:
    r"""Compute the q-th quantile of the data.

    Args:
        array: The input data. It can be an array or any array-like
            (e.g. a list); it is flattened before computing the
            quantiles.
        q: The quantiles to compute. Values must be between 0 and 1
            inclusive.

    Returns:
        A dictionary that maps each quantile level in ``q`` to its
            value. All the values are NaN if the input is empty.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.utils.stats import quantile
    >>> quantile(np.arange(101), q=[0.001, 0.01, 0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.99, 0.999])
    {0.001: 0.1, 0.01: 1.0, 0.05: 5.0, 0.1: 10.0, 0.25: 25.0, 0.75: 75.0,
     0.9: 90.0, 0.95: 95.0, 0.99: 99.0, 0.999: 99.9}

    ```
    """
    # ``np.asarray`` returns ndarray inputs unchanged and converts other
    # array-likes, which have no ``ravel`` method.
    flat = np.asarray(array).ravel()
    if flat.size == 0:
        return {level: float("nan") for level in q}
    values = np.quantile(flat.astype(np.float64), q).tolist()
    return dict(zip(q, values))
+3
-3
@@ -1,4 +0,4 @@ | ||
| Metadata-Version: 2.3 | ||
| Metadata-Version: 2.1 | ||
| Name: arkas | ||
| Version: 0.0.1a9 | ||
| Version: 0.0.1a10 | ||
| Summary: Library to evaluate ML model performances | ||
@@ -29,3 +29,3 @@ Home-page: https://github.com/durandtibo/arkas | ||
| Requires-Dist: coola (>=0.8.2,<1.0) | ||
| Requires-Dist: grizz (>=0.2.0a15,<1.0) | ||
| Requires-Dist: grizz (>=0.2.0a16,<1.0) | ||
| Requires-Dist: hya (>=0.2,<1.0) ; extra == "all" or extra == "cli" | ||
@@ -32,0 +32,0 @@ Requires-Dist: hydra-core (>=1.3,<2.0) ; extra == "all" or extra == "cli" |
+2
-2
| [tool.poetry] | ||
| name = "arkas" | ||
| version = "0.0.1a9" | ||
| version = "0.0.1a10" | ||
| description = "Library to evaluate ML model performances" | ||
@@ -35,3 +35,3 @@ readme = "README.md" | ||
| coola = ">=0.8.2,<1.0" | ||
| grizz = ">=0.2.0a15,<1.0" | ||
| grizz = ">=0.2.0a16,<1.0" | ||
| iden = ">=0.1,<1.0" | ||
@@ -38,0 +38,0 @@ jinja2 = ">=3.0,<4.0" |
@@ -14,6 +14,9 @@ r"""Contain DataFrame analyzers.""" | ||
| "ContentAnalyzer", | ||
| "ContinuousColumnAnalyzer", | ||
| "DataFrameSummaryAnalyzer", | ||
| "MappingAnalyzer", | ||
| "NullValueAnalyzer", | ||
| "PlotColumnAnalyzer", | ||
| "ScatterColumnAnalyzer", | ||
| "TemporalNullValueAnalyzer", | ||
| "TemporalPlotColumnAnalyzer", | ||
@@ -31,8 +34,11 @@ "TransformAnalyzer", | ||
| from arkas.analyzer.content import ContentAnalyzer | ||
| from arkas.analyzer.continuous_column import ContinuousColumnAnalyzer | ||
| from arkas.analyzer.frame_summary import DataFrameSummaryAnalyzer | ||
| from arkas.analyzer.lazy import BaseInNLazyAnalyzer, BaseLazyAnalyzer | ||
| from arkas.analyzer.mapping import MappingAnalyzer | ||
| from arkas.analyzer.null_value import NullValueAnalyzer | ||
| from arkas.analyzer.plot_column import PlotColumnAnalyzer | ||
| from arkas.analyzer.scatter_column import ScatterColumnAnalyzer | ||
| from arkas.analyzer.temporal_null_value import TemporalNullValueAnalyzer | ||
| from arkas.analyzer.temporal_plot_column import TemporalPlotColumnAnalyzer | ||
| from arkas.analyzer.transform import TransformAnalyzer |
@@ -12,5 +12,8 @@ r"""Contain HTML content generators.""" | ||
| "ContentGeneratorDict", | ||
| "ContinuousSeriesContentGenerator", | ||
| "DataFrameSummaryContentGenerator", | ||
| "NullValueContentGenerator", | ||
| "PlotColumnContentGenerator", | ||
| "ScatterColumnContentGenerator", | ||
| "TemporalNullValueContentGenerator", | ||
| "TemporalPlotColumnContentGenerator", | ||
@@ -23,7 +26,10 @@ ] | ||
| from arkas.content.column_cooccurrence import ColumnCooccurrenceContentGenerator | ||
| from arkas.content.continuous_series import ContinuousSeriesContentGenerator | ||
| from arkas.content.frame_summary import DataFrameSummaryContentGenerator | ||
| from arkas.content.mapping import ContentGeneratorDict | ||
| from arkas.content.null_value import NullValueContentGenerator | ||
| from arkas.content.plot_column import PlotColumnContentGenerator | ||
| from arkas.content.scatter_column import ScatterColumnContentGenerator | ||
| from arkas.content.temporal_null_value import TemporalNullValueContentGenerator | ||
| from arkas.content.temporal_plot_column import TemporalPlotColumnContentGenerator | ||
| from arkas.content.vanilla import ContentGenerator |
@@ -50,3 +50,3 @@ r"""Contain the base class to implement an output exporter.""" | ||
| (exist_ok): False | ||
| (show_metrics): True | ||
| (show_metrics): False | ||
| ) | ||
@@ -155,3 +155,3 @@ | ||
| (exist_ok): False | ||
| (show_metrics): True | ||
| (show_metrics): False | ||
| ) | ||
@@ -158,0 +158,0 @@ |
@@ -67,3 +67,3 @@ r"""Contain the implementation of a metric exporter.""" | ||
| exist_ok: bool = False, | ||
| show_metrics: bool = True, | ||
| show_metrics: bool = False, | ||
| ) -> None: | ||
@@ -70,0 +70,0 @@ self._path = sanitize_path(path) |
@@ -68,3 +68,3 @@ r"""Contain an exporter that sequentially calls several exporters.""" | ||
| (exist_ok): False | ||
| (show_metrics): True | ||
| (show_metrics): False | ||
| ) | ||
@@ -71,0 +71,0 @@ (1): FigureExporter( |
@@ -12,4 +12,6 @@ r"""Contain data outputs.""" | ||
| "ContentOutput", | ||
| "ContinuousSeriesOutput", | ||
| "DataFrameSummaryOutput", | ||
| "EmptyOutput", | ||
| "NullValueOutput", | ||
| "Output", | ||
@@ -19,2 +21,3 @@ "OutputDict", | ||
| "ScatterColumnOutput", | ||
| "TemporalNullValueOutput", | ||
| "TemporalPlotColumnOutput", | ||
@@ -28,2 +31,3 @@ ] | ||
| from arkas.output.content import ContentOutput | ||
| from arkas.output.continuous_series import ContinuousSeriesOutput | ||
| from arkas.output.empty import EmptyOutput | ||
@@ -33,5 +37,7 @@ from arkas.output.frame_summary import DataFrameSummaryOutput | ||
| from arkas.output.mapping import OutputDict | ||
| from arkas.output.null_value import NullValueOutput | ||
| from arkas.output.plot_column import PlotColumnOutput | ||
| from arkas.output.scatter_column import ScatterColumnOutput | ||
| from arkas.output.temporal_null_value import TemporalNullValueOutput | ||
| from arkas.output.temporal_plot_column import TemporalPlotColumnOutput | ||
| from arkas.output.vanilla import Output |
@@ -5,5 +5,25 @@ r"""Contain plotting functionalities.""" | ||
| __all__ = ["binary_precision_recall_curve", "binary_roc_curve"] | ||
| __all__ = [ | ||
| "bar_discrete", | ||
| "bar_discrete_temporal", | ||
| "binary_precision_recall_curve", | ||
| "binary_roc_curve", | ||
| "boxplot_continuous", | ||
| "boxplot_continuous_temporal", | ||
| "hist_continuous", | ||
| "hist_continuous2", | ||
| "plot_cdf", | ||
| "plot_null_temporal", | ||
| ] | ||
| from arkas.plot.cdf import plot_cdf | ||
| from arkas.plot.continuous import ( | ||
| boxplot_continuous, | ||
| boxplot_continuous_temporal, | ||
| hist_continuous, | ||
| hist_continuous2, | ||
| ) | ||
| from arkas.plot.discrete import bar_discrete, bar_discrete_temporal | ||
| from arkas.plot.null_temporal import plot_null_temporal | ||
| from arkas.plot.pr import binary_precision_recall_curve | ||
| from arkas.plot.roc import binary_roc_curve |
@@ -8,2 +8,4 @@ r"""Contain data plotters.""" | ||
| "ColumnCooccurrencePlotter", | ||
| "ContinuousSeriesPlotter", | ||
| "NullValuePlotter", | ||
| "PlotColumnPlotter", | ||
@@ -13,2 +15,3 @@ "Plotter", | ||
| "ScatterColumnPlotter", | ||
| "TemporalNullValuePlotter", | ||
| "TemporalPlotColumnPlotter", | ||
@@ -19,6 +22,9 @@ ] | ||
| from arkas.plotter.column_cooccurrence import ColumnCooccurrencePlotter | ||
| from arkas.plotter.continuous_series import ContinuousSeriesPlotter | ||
| from arkas.plotter.mapping import PlotterDict | ||
| from arkas.plotter.null_value import NullValuePlotter | ||
| from arkas.plotter.plot_column import PlotColumnPlotter | ||
| from arkas.plotter.scatter_column import ScatterColumnPlotter | ||
| from arkas.plotter.temporal_null_value import TemporalNullValuePlotter | ||
| from arkas.plotter.temporal_plot_column import TemporalPlotColumnPlotter | ||
| from arkas.plotter.vanilla import Plotter |
@@ -35,3 +35,3 @@ r"""Contain the implementation of a DataFrame column plotter.""" | ||
| Args: | ||
| state: The state containing the DataFrame to analyze. | ||
| state: The state containing the DataFrame to analyze. | ||
@@ -95,9 +95,10 @@ Returns: | ||
| fig, ax = plt.subplots(**state.figure_config.get_arg("init", {})) | ||
| for col in state.dataframe: | ||
| ax.plot(col.to_numpy(), label=col.name) | ||
| xmin, xmax = 0, state.dataframe.shape[0] - 1 | ||
| if xmin < xmax: | ||
| ax.set_xlim(xmin, xmax) | ||
| if yscale := state.figure_config.get_arg("yscale"): | ||
| ax.set_yscale(yscale) | ||
| ax.legend() | ||
@@ -104,0 +105,0 @@ fig.tight_layout() |
@@ -19,2 +19,3 @@ r"""Contain the implementation of a DataFrame column plotter.""" | ||
| from arkas.plotter.vanilla import Plotter | ||
| from arkas.utils.range import find_range | ||
@@ -95,8 +96,5 @@ if TYPE_CHECKING: | ||
| color = state.dataframe[state.color].to_numpy() if state.color else None | ||
| s = ax.scatter( | ||
| state.dataframe[state.x].to_numpy(), | ||
| state.dataframe[state.y].to_numpy(), | ||
| c=color, | ||
| label=state.color, | ||
| ) | ||
| x = state.dataframe[state.x].to_numpy() | ||
| y = state.dataframe[state.y].to_numpy() | ||
| s = ax.scatter(x=x, y=y, c=color, label=state.color) | ||
| if color is not None: | ||
@@ -106,2 +104,16 @@ fig.colorbar(s) | ||
| xmin, xmax = find_range( | ||
| x, | ||
| xmin=state.figure_config.get_arg("xmin"), | ||
| xmax=state.figure_config.get_arg("xmax"), | ||
| ) | ||
| if xmin < xmax: | ||
| ax.set_xlim(xmin, xmax) | ||
| ymin, ymax = find_range( | ||
| y, | ||
| xmin=state.figure_config.get_arg("ymin"), | ||
| xmax=state.figure_config.get_arg("ymax"), | ||
| ) | ||
| if ymin < ymax: | ||
| ax.set_ylim(ymin, ymax) | ||
| ax.set_xlabel(state.x) | ||
@@ -108,0 +120,0 @@ ax.set_ylabel(state.y) |
@@ -25,2 +25,3 @@ r"""Contain the implementation of a DataFrame column plotter.""" | ||
| from arkas.plotter.vanilla import Plotter | ||
| from arkas.utils.range import find_range | ||
@@ -133,2 +134,10 @@ if TYPE_CHECKING: | ||
| xmin, xmax = find_range( | ||
| time, | ||
| xmin=state.figure_config.get_arg("xmin"), | ||
| xmax=state.figure_config.get_arg("xmax"), | ||
| ) | ||
| if xmin < xmax: | ||
| ax.set_xlim(xmin, xmax) | ||
| ax.set_xlabel(state.temporal_column) | ||
| if yscale := state.figure_config.get_arg("yscale"): | ||
@@ -135,0 +144,0 @@ ax.set_yscale(yscale) |
@@ -68,3 +68,3 @@ r"""Contain a simple runner to analyze data.""" | ||
| (exist_ok): False | ||
| (show_metrics): True | ||
| (show_metrics): False | ||
| ) | ||
@@ -71,0 +71,0 @@ (lazy): True |
@@ -57,3 +57,3 @@ r"""Contain the base class to implement a runner.""" | ||
| (exist_ok): False | ||
| (show_metrics): True | ||
| (show_metrics): False | ||
| ) | ||
@@ -212,3 +212,3 @@ (lazy): True | ||
| (exist_ok): False | ||
| (show_metrics): True | ||
| (show_metrics): False | ||
| ) | ||
@@ -215,0 +215,0 @@ (lazy): True |
@@ -10,4 +10,6 @@ r"""Contain states.""" | ||
| "DataFrameState", | ||
| "NullValueState", | ||
| "PrecisionRecallState", | ||
| "ScatterDataFrameState", | ||
| "SeriesState", | ||
| "TemporalDataFrameState", | ||
@@ -20,4 +22,6 @@ ] | ||
| from arkas.state.dataframe import DataFrameState | ||
| from arkas.state.null_value import NullValueState | ||
| from arkas.state.precision_recall import PrecisionRecallState | ||
| from arkas.state.scatter_dataframe import ScatterDataFrameState | ||
| from arkas.state.series import SeriesState | ||
| from arkas.state.temporal_dataframe import TemporalDataFrameState |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
914081
12.3%250
8.7%22903
12.23%