Latest Threat Research:SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains.Details
Socket
Book a DemoInstallSign in
Socket

arkas

Package Overview
Dependencies
Maintainers
1
Versions
16
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

arkas - npm Package Compare versions

Comparing version
0.0.1a9
to
0.0.1a10
+83
src/arkas/analyzer/continuous_column.py
r"""Implement an analyzer that analyzes a column with continuous
values."""
from __future__ import annotations
__all__ = ["ContinuousColumnAnalyzer"]
import logging
from typing import TYPE_CHECKING, Any
from coola import objects_are_equal
from coola.utils.format import repr_mapping_line
from arkas.analyzer.lazy import BaseLazyAnalyzer
from arkas.output.continuous_series import ContinuousSeriesOutput
from arkas.state.series import SeriesState
if TYPE_CHECKING:
import polars as pl
from arkas.figure import BaseFigureConfig
logger = logging.getLogger(__name__)
class ContinuousColumnAnalyzer(BaseLazyAnalyzer):
    r"""Analyzer dedicated to a single column holding continuous values.

    The analyzer picks the requested column out of the input
    DataFrame and wraps it in a ``ContinuousSeriesOutput``.

    Args:
        column: The name of the column to analyze.
        figure_config: The figure configuration. It is forwarded
            as-is to the state built during the analysis.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import ContinuousColumnAnalyzer
    >>> analyzer = ContinuousColumnAnalyzer(column="col1")
    >>> analyzer
    ContinuousColumnAnalyzer(column='col1', figure_config=None)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 0, 1],
    ...         "col2": [1, 0, 1, 0],
    ...         "col3": [1, 1, 1, 1],
    ...     },
    ...     schema={"col1": pl.Int64, "col2": pl.Int64, "col3": pl.Int64},
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    ContinuousSeriesOutput(
     (state): SeriesState(name='col1', values=(4,), figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, column: str, figure_config: BaseFigureConfig | None = None) -> None:
        self._column = column
        self._figure_config = figure_config

    def __repr__(self) -> str:
        # One-line representation built from the constructor arguments.
        return f"{self.__class__.__qualname__}({repr_mapping_line(self.get_args())})"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two analyzers are equal when they share the same class and arguments.
        return isinstance(other, self.__class__) and objects_are_equal(
            self.get_args(), other.get_args(), equal_nan=equal_nan
        )

    def get_args(self) -> dict:
        return {"column": self._column, "figure_config": self._figure_config}

    def _analyze(self, frame: pl.DataFrame) -> ContinuousSeriesOutput:
        logger.info(f"Analyzing the continuous distribution of column {self._column!r}...")
        series_state = SeriesState(
            series=frame[self._column],
            figure_config=self._figure_config,
        )
        return ContinuousSeriesOutput(state=series_state)
r"""Implement an analyzer that analyzes the number of null values per column."""
from __future__ import annotations
__all__ = ["NullValueAnalyzer"]
import logging
from typing import TYPE_CHECKING
from grizz.utils.format import str_shape_diff
from arkas.analyzer.lazy import BaseInNLazyAnalyzer
from arkas.output.null_value import NullValueOutput
from arkas.state.null_value import NullValueState
if TYPE_CHECKING:
from collections.abc import Sequence
import polars as pl
from arkas.figure import BaseFigureConfig
logger = logging.getLogger(__name__)
class NullValueAnalyzer(BaseInNLazyAnalyzer):
    r"""Implement an analyzer that analyzes the number of null values
    per column.

    Args:
        columns: The columns to analyze. If ``None``, it analyzes all
            the columns.
        exclude_columns: The columns to exclude from the input
            ``columns``. If any column is not found, it will be ignored
            during the filtering process.
        missing_policy: The policy on how to handle missing columns.
            The following options are available: ``'ignore'``,
            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
            is raised if at least one column is missing.
            If ``'warn'``, a warning is raised if at least one column
            is missing and the missing columns are ignored.
            If ``'ignore'``, the missing columns are ignored and
            no warning message appears.
        figure_config: The figure configuration.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.analyzer import NullValueAnalyzer
    >>> analyzer = NullValueAnalyzer()
    >>> analyzer
    NullValueAnalyzer(columns=None, exclude_columns=(), missing_policy='raise', figure_config=None)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0, 0, 1, None],
    ...         "col2": [0, 1, None, None, 0, 1, 0],
    ...         "col3": [None, 0, 0, 0, None, 1, None],
    ...     }
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    NullValueOutput(
     (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(
        self,
        columns: Sequence[str] | None = None,
        exclude_columns: Sequence[str] = (),
        missing_policy: str = "raise",
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        super().__init__(
            columns=columns,
            exclude_columns=exclude_columns,
            missing_policy=missing_policy,
        )
        self._figure_config = figure_config

    def get_args(self) -> dict:
        # Extend the arguments of the base class with the figure configuration.
        return super().get_args() | {
            "figure_config": self._figure_config,
        }

    def _analyze(self, frame: pl.DataFrame) -> NullValueOutput:
        # The previous message ("Plotting the content of ...") was copied from
        # another analyzer and did not describe what this analyzer does.
        logger.info(
            f"Analyzing the number of null values of {len(self.find_columns(frame)):,} columns..."
        )
        columns = self.find_common_columns(frame)
        dataframe = frame.select(columns)
        logger.info(str_shape_diff(orig=frame.shape, final=dataframe.shape))
        return NullValueOutput(
            state=NullValueState.from_dataframe(
                dataframe=dataframe, figure_config=self._figure_config
            )
        )
r"""Implement an analyzer that analyzes the temporal distribution of null values."""
from __future__ import annotations
__all__ = ["TemporalNullValueAnalyzer"]
import logging
from typing import TYPE_CHECKING
from grizz.utils.format import str_shape_diff
from arkas.analyzer.lazy import BaseInNLazyAnalyzer
from arkas.output.temporal_null_value import TemporalNullValueOutput
from arkas.state.temporal_dataframe import TemporalDataFrameState
if TYPE_CHECKING:
from collections.abc import Sequence
import polars as pl
from arkas.figure import BaseFigureConfig
logger = logging.getLogger(__name__)
class TemporalNullValueAnalyzer(BaseInNLazyAnalyzer):
    r"""Analyzer that looks at the number of null values in a DataFrame
    over time.

    Args:
        temporal_column: The temporal column in the DataFrame.
        period: The temporal period e.g. monthly or daily.
        columns: The columns to analyze. If ``None``, it analyzes all
            the columns.
        exclude_columns: The columns to exclude from the input
            ``columns``. If any column is not found, it will be ignored
            during the filtering process.
        missing_policy: The policy on how to handle missing columns.
            The following options are available: ``'ignore'``,
            ``'warn'``, and ``'raise'``. If ``'raise'``, an exception
            is raised if at least one column is missing.
            If ``'warn'``, a warning is raised if at least one column
            is missing and the missing columns are ignored.
            If ``'ignore'``, the missing columns are ignored and
            no warning message appears.
        figure_config: The figure configuration.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.analyzer import TemporalNullValueAnalyzer
    >>> analyzer = TemporalNullValueAnalyzer(temporal_column="datetime", period="1d")
    >>> analyzer
    TemporalNullValueAnalyzer(columns=None, exclude_columns=(), missing_policy='raise', temporal_column='datetime', period='1d', figure_config=None)
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0],
    ...         "col2": [0, 1, 0, 1],
    ...         "col3": [1, 0, 0, 0],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Int64,
    ...         "col2": pl.Int64,
    ...         "col3": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> output = analyzer.analyze(frame)
    >>> output
    TemporalNullValueOutput(
     (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(
        self,
        temporal_column: str,
        period: str,
        columns: Sequence[str] | None = None,
        exclude_columns: Sequence[str] = (),
        missing_policy: str = "raise",
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        super().__init__(
            columns=columns,
            exclude_columns=exclude_columns,
            missing_policy=missing_policy,
        )
        self._temporal_column = temporal_column
        self._period = period
        self._figure_config = figure_config

    def get_args(self) -> dict:
        # Arguments specific to this analyzer, appended to the base class ones.
        own_args = {
            "temporal_column": self._temporal_column,
            "period": self._period,
            "figure_config": self._figure_config,
        }
        return super().get_args() | own_args

    def _analyze(self, frame: pl.DataFrame) -> TemporalNullValueOutput:
        logger.info(
            f"Plotting the number of null values of {len(self.find_columns(frame)):,} columns "
            f"using the temporal column {self._temporal_column!r} and period {self._period!r}..."
        )
        selected = list(self.find_common_columns(frame))
        # The temporal column must always be part of the selected columns.
        if self._temporal_column not in selected:
            selected.append(self._temporal_column)
        subframe = frame.select(selected)
        logger.info(str_shape_diff(orig=frame.shape, final=subframe.shape))
        state = TemporalDataFrameState(
            dataframe=subframe,
            temporal_column=self._temporal_column,
            period=self._period,
            figure_config=self._figure_config,
        )
        return TemporalNullValueOutput(state=state)
r"""Contain the implementation of a HTML content generator that analyzes
a Series with continuous values."""
from __future__ import annotations
__all__ = ["ContinuousSeriesContentGenerator", "create_template"]
import logging
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from jinja2 import Template
from arkas.content.section import BaseSectionContentGenerator
from arkas.figure.utils import figure2html
from arkas.plotter.continuous_series import ContinuousSeriesPlotter
from arkas.utils.range import find_range
from arkas.utils.stats import compute_statistics_continuous
if TYPE_CHECKING:
from arkas.state.series import SeriesState
logger = logging.getLogger(__name__)
class ContinuousSeriesContentGenerator(BaseSectionContentGenerator):
    r"""Implement a content generator that analyzes a Series with
    continuous values.

    Args:
        state: The state containing the Series to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content import ContinuousSeriesContentGenerator
    >>> from arkas.state import SeriesState
    >>> content = ContinuousSeriesContentGenerator(
    ...     SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7]))
    ... )
    >>> content
    ContinuousSeriesContentGenerator(
     (state): SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: SeriesState) -> None:
        self._state = state

    def __repr__(self) -> str:
        args = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {args}\n)"

    def __str__(self) -> str:
        args = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {args}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        if not isinstance(other, self.__class__):
            return False
        return self._state.equal(other._state, equal_nan=equal_nan)

    def generate_content(self) -> str:
        r"""Generate the HTML content of the section.

        Returns:
            The rendered HTML, which contains the summary statistics,
            the histogram figure and the statistics table.
        """
        logger.info(f"Generating the continuous distribution of {self._state.series.name}...")
        figures = ContinuousSeriesPlotter(state=self._state).plot()
        stats = compute_statistics_continuous(self._state.series)
        # Avoid a division by zero when the series is empty.
        null_values_pct = (
            f"{100 * stats['num_nulls'] / stats['count']:.2f}" if stats["count"] > 0 else "N/A"
        )
        xmin, xmax = find_range(
            self._state.series.drop_nulls().to_numpy(),
            xmin=self._state.figure_config.get_arg("xmin"),
            # The upper bound must come from the "xmax" argument; the
            # previous code read "xmin" twice, ignoring a configured xmax.
            xmax=self._state.figure_config.get_arg("xmax"),
        )
        return Template(create_template()).render(
            {
                "column": self._state.series.name,
                "figure": figure2html(figures["continuous_histogram"], close_fig=True),
                "table": create_table(stats),
                "total_values": f"{stats['count']:,}",
                "unique_values": f"{stats['nunique']:,}",
                "null_values": f"{stats['num_nulls']:,}",
                "null_values_pct": null_values_pct,
                "min_value": f"{stats['min']:,}",
                "max_value": f"{stats['max']:,}",
                "xmin": f"{xmin:,}",
                "xmax": f"{xmax:,}",
                "dtype": str(self._state.series.dtype),
            }
        )
def create_template() -> str:
    r"""Return the template of the content.

    The template is a Jinja2 template string. It expects the
    variables ``column``, ``total_values``, ``unique_values``,
    ``null_values``, ``null_values_pct``, ``min_value``,
    ``max_value``, ``dtype``, ``xmin``, ``xmax``, ``figure`` and
    ``table``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.continuous_series import create_template
    >>> template = create_template()

    ```
    """
    # The last <em> around {{column}} is now properly closed; the previous
    # template opened a second <em> instead of closing the first one.
    return """<p>This section analyzes the distribution of continuous values for column <em>{{column}}</em>.</p>
<ul>
<li> <b>total values:</b> {{total_values}} </li>
<li> <b>number of unique values:</b> {{unique_values}} </li>
<li> <b>number of null values:</b> {{null_values}} / {{total_values}} ({{null_values_pct}}%) </li>
<li> <b>range of values:</b> [{{min_value}}, {{max_value}}] </li>
<li> <b>data type:</b> <em>{{dtype}}</em> </li>
</ul>
<p>The histogram shows the distribution of values in the range [{{xmin}}, {{xmax}}].</p>
{{figure}}
<details>
<summary>[show statistics]</summary>
<p style="margin-top: 1rem;">
The following table shows some statistics about the distribution for column <em>{{column}}</em>.
</p>
{{table}}
</details>
"""
def create_table(stats: dict) -> str:
    r"""Create the HTML code of the table with statistics.

    Args:
        stats: Specifies a dictionary with the statistics. The
            following keys are read: ``'count'``, ``'mean'``,
            ``'std'``, ``'skewness'``, ``'kurtosis'``, ``'min'``,
            ``'q001'``, ``'q01'``, ``'q05'``, ``'q10'``, ``'q25'``,
            ``'median'``, ``'q75'``, ``'q90'``, ``'q95'``, ``'q99'``,
            ``'q999'``, ``'max'``, ``'>0'``, ``'<0'`` and ``'=0'``.

    Returns:
        The HTML code of the table.

    Example usage:

    ```pycon

    >>> from arkas.content.continuous_series import create_table
    >>> table = create_table(
    ...     stats={
    ...         "count": 101,
    ...         "nunique": 101,
    ...         "num_non_nulls": 101,
    ...         "num_nulls": 0,
    ...         "mean": 50.0,
    ...         "std": 29.15,
    ...         "skewness": 0.0,
    ...         "kurtosis": -1.20,
    ...         "min": 0.0,
    ...         "q001": 0.1,
    ...         "q01": 1.0,
    ...         "q05": 5.0,
    ...         "q10": 10.0,
    ...         "q25": 25.0,
    ...         "median": 50.0,
    ...         "q75": 75.0,
    ...         "q90": 90.0,
    ...         "q95": 95.0,
    ...         "q99": 99.0,
    ...         "q999": 99.9,
    ...         "max": 100.0,
    ...         ">0": 100,
    ...         "<0": 0,
    ...         "=0": 1,
    ...     },
    ... )

    ```
    """
    # The 0.1% and 99.9% rows now display q001 and q999. They previously
    # repeated the 1% and 99% quantiles (q01 and q99).
    return Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr><th>stat</th><th>value</th></tr>
</thead>
<tbody class="tbody table-group-divider">
<tr><th>count</th><td {{num_style}}>{{count}}</td></tr>
<tr><th>mean</th><td {{num_style}}>{{mean}}</td></tr>
<tr><th>std</th><td {{num_style}}>{{std}}</td></tr>
<tr><th>skewness</th><td {{num_style}}>{{skewness}}</td></tr>
<tr><th>kurtosis</th><td {{num_style}}>{{kurtosis}}</td></tr>
<tr><th>min</th><td {{num_style}}>{{min}}</td></tr>
<tr><th>quantile 0.1%</th><td {{num_style}}>{{q001}}</td></tr>
<tr><th>quantile 1%</th><td {{num_style}}>{{q01}}</td></tr>
<tr><th>quantile 5%</th><td {{num_style}}>{{q05}}</td></tr>
<tr><th>quantile 10%</th><td {{num_style}}>{{q10}}</td></tr>
<tr><th>quantile 25%</th><td {{num_style}}>{{q25}}</td></tr>
<tr><th>median</th><td {{num_style}}>{{median}}</td></tr>
<tr><th>quantile 75%</th><td {{num_style}}>{{q75}}</td></tr>
<tr><th>quantile 90%</th><td {{num_style}}>{{q90}}</td></tr>
<tr><th>quantile 95%</th><td {{num_style}}>{{q95}}</td></tr>
<tr><th>quantile 99%</th><td {{num_style}}>{{q99}}</td></tr>
<tr><th>quantile 99.9%</th><td {{num_style}}>{{q999}}</td></tr>
<tr><th>max</th><td {{num_style}}>{{max}}</td></tr>
<tr><th>number of zeros</th><td {{num_style}}>{{num_zeros}}</td></tr>
<tr><th>number of positive values</th><td {{num_style}}>{{num_pos}}</td></tr>
<tr><th>number of negative values</th><td {{num_style}}>{{num_neg}}</td></tr>
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    ).render(
        {
            "num_style": 'style="text-align: right;"',
            "count": f"{stats['count']:,}",
            "mean": f"{stats['mean']:,.4f}",
            "std": f"{stats['std']:,.4f}",
            "skewness": f"{stats['skewness']:,.4f}",
            "kurtosis": f"{stats['kurtosis']:,.4f}",
            "min": f"{stats['min']:,.4f}",
            "q001": f"{stats['q001']:,.4f}",
            "q01": f"{stats['q01']:,.4f}",
            "q05": f"{stats['q05']:,.4f}",
            "q10": f"{stats['q10']:,.4f}",
            "q25": f"{stats['q25']:,.4f}",
            "median": f"{stats['median']:,.4f}",
            "q75": f"{stats['q75']:,.4f}",
            "q90": f"{stats['q90']:,.4f}",
            "q95": f"{stats['q95']:,.4f}",
            "q99": f"{stats['q99']:,.4f}",
            "q999": f"{stats['q999']:,.4f}",
            "max": f"{stats['max']:,.4f}",
            "num_pos": f"{stats['>0']:,}",
            "num_neg": f"{stats['<0']:,}",
            "num_zeros": f"{stats['=0']:,}",
        }
    )
r"""Contain the implementation of a HTML content generator that analyzes
the number of null values per column."""
from __future__ import annotations
__all__ = ["NullValueContentGenerator", "create_template"]
import logging
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from jinja2 import Template
from arkas.content.section import BaseSectionContentGenerator
from arkas.figure.utils import figure2html
from arkas.plotter.null_value import NullValuePlotter
if TYPE_CHECKING:
import polars as pl
from arkas.state.null_value import NullValueState
logger = logging.getLogger(__name__)
class NullValueContentGenerator(BaseSectionContentGenerator):
    r"""Content generator that reports the number of null values for
    each column.

    Args:
        state: The state containing the number of null values per
            column.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.content import NullValueContentGenerator
    >>> from arkas.state import NullValueState
    >>> content = NullValueContentGenerator(
    ...     NullValueState(
    ...         null_count=np.array([0, 1, 2]),
    ...         total_count=np.array([5, 5, 5]),
    ...         columns=["col1", "col2", "col3"],
    ...     )
    ... )
    >>> content
    NullValueContentGenerator(
     (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: NullValueState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Objects of a different class are never equal; otherwise compare states.
        return isinstance(other, self.__class__) and self._state.equal(
            other._state, equal_nan=equal_nan
        )

    def generate_content(self) -> str:
        ncols = len(self._state.columns)
        logger.info(f"Generating the null values bar plot for {ncols:,} columns...")
        figures = NullValuePlotter(state=self._state).plot()
        frame = self._state.to_dataframe()
        template = Template(create_template())
        # The same data are shown twice: sorted by column name and by null count.
        context = {
            "ncols": f"{ncols:,}",
            "columns": ", ".join(self._state.columns),
            "figure": figure2html(figures["null_values"], close_fig=True),
            "table_alpha": create_table(frame.sort(by="column")),
            "table_sort": create_table(frame.sort(by="null")),
        }
        return template.render(context)
def create_template() -> str:
    r"""Return the template of the content.

    The template is a Jinja2 template string. It expects the
    variables ``ncols``, ``columns``, ``figure``, ``table_alpha``
    and ``table_sort``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.null_value import create_template
    >>> template = create_template()

    ```
    """
    # The number of columns is referenced with {{ncols}} everywhere. The
    # previous template also used {{num_columns}}, a variable that the
    # content generator never provides, so it was rendered as an empty string.
    return """This section analyzes the number and proportion of null values for the {{ncols}}
columns: <em>{{columns}}</em>.
<p>The columns are sorted by ascending order of number of null values in the following bar plot.</p>
{{figure}}
<details>
<summary>[show statistics per column]</summary>
<p style="margin-top: 1rem;">
The following tables show the number and proportion of null values for the {{ncols}}
columns.
The background color of the row indicates the proportion of missing values:
dark blues indicates more missing values than light blues. </p>
<ul>
<li> <b>column</b>: is the column name </li>
<li> <b>null pct</b>: is the percentage of null values in the column </li>
<li> <b>null count</b>: is the number of null values in the column </li>
<li> <b>total count</b>: is the total number of values in the column </li>
</ul>
<div class="container-fluid">
<div class="row align-items-start">
<div class="col align-self-center">
<p><b>Columns sorted by alphabetical order</b></p>
{{table_alpha}}
</div>
<div class="col">
<p><b>Columns sorted by ascending order of missing values</b></p>
{{table_sort}}
</div>
</div>
</div>
</details>
"""
def create_table(frame: pl.DataFrame) -> str:
    r"""Build the HTML table listing the number of null values of each
    column.

    Args:
        frame: The DataFrame to analyze. The columns ``'column'``,
            ``'null'`` and ``'total'`` are read, and one table row
            is created per DataFrame row.

    Returns:
        The HTML code of the table.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.content.null_value import create_table
    >>> frame = pl.DataFrame(
    ...     {"column": ["A", "B", "C"], "null": [0, 1, 2], "total": [4, 4, 4]},
    ...     schema={"column": pl.String, "null": pl.Int64, "total": pl.Int64},
    ... )
    >>> table = create_table(frame)

    ```
    """
    html_rows = []
    for name, num_nulls, num_values in zip(frame["column"], frame["null"], frame["total"]):
        html_rows.append(
            create_table_row(column=name, null_count=num_nulls, total_count=num_values)
        )
    template = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>column</th>
<th>null pct</th>
<th>null count</th>
<th>total count</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    return template.render({"rows": "\n".join(html_rows)})
def create_table_row(column: str, null_count: int, total_count: int) -> str:
    r"""Create the HTML code of a new table row.

    The background color of the cells uses the proportion of null
    values as the alpha channel. When ``total_count`` is not
    positive, the proportion is shown as ``nan`` and the alpha
    channel is set to 0.

    Args:
        column: The column name.
        null_count: The number of null values.
        total_count: The total number of rows.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> from arkas.content.null_value import create_table_row
    >>> row = create_table_row(column="col", null_count=5, total_count=101)

    ```
    """
    pct = null_count / total_count if total_count > 0 else float("nan")
    # ``nan`` is not a valid alpha value in CSS, so fall back to 0 (fully
    # transparent) when the proportion is undefined.
    pct_color = pct if total_count > 0 else 0
    return Template(
        "<tr>"
        # The header cell now uses the guarded alpha value, like the data
        # cells; it previously used the displayed percentage, which can be nan.
        '<th style="background-color: rgba(0, 191, 255, {{null_alpha}})">{{column}}</th>'
        "<td {{num_style}}>{{null_pct}}</td>"
        "<td {{num_style}}>{{null_count}}</td>"
        "<td {{num_style}}>{{total_count}}</td>"
        "</tr>"
    ).render(
        {
            "num_style": (
                f'style="text-align: right; background-color: rgba(0, 191, 255, {pct_color})"'
            ),
            "column": column,
            "null_count": f"{null_count:,}",
            "null_pct": f"{pct:.4f}",
            # Same formatting as before, so the output is unchanged when
            # total_count is positive.
            "null_alpha": f"{pct_color:.4f}",
            "total_count": f"{total_count:,}",
        }
    )
r"""Contain the implementation of a HTML content generator that analyzes
the temporal distribution of null values."""
from __future__ import annotations
__all__ = ["TemporalNullValueContentGenerator", "create_template"]
import logging
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from grizz.utils.null import compute_temporal_null_count
from jinja2 import Template
from arkas.content.section import BaseSectionContentGenerator
from arkas.figure.utils import figure2html
from arkas.plotter.temporal_null_value import TemporalNullValuePlotter
if TYPE_CHECKING:
import polars as pl
from arkas.state.temporal_dataframe import TemporalDataFrameState
logger = logging.getLogger(__name__)
class TemporalNullValueContentGenerator(BaseSectionContentGenerator):
    r"""Content generator that reports how null values are distributed
    over time.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.content import TemporalNullValueContentGenerator
    >>> from arkas.state import TemporalDataFrameState
    >>> dataframe = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0],
    ...         "col2": [0, 1, 0, 1],
    ...         "col3": [1, 0, 0, 0],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Int64,
    ...         "col2": pl.Int64,
    ...         "col3": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> content = TemporalNullValueContentGenerator(
    ...     TemporalDataFrameState(dataframe, temporal_column="datetime")
    ... )
    >>> content
    TemporalNullValueContentGenerator(
     (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: TemporalDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Objects of a different class are never equal; otherwise compare states.
        return isinstance(other, self.__class__) and self._state.equal(
            other._state, equal_nan=equal_nan
        )

    def generate_content(self) -> str:
        nrows, ncols = self._state.dataframe.shape
        logger.info(
            f"Generating the temporal plot of {ncols} columns using the "
            f"temporal column {self._state.temporal_column!r}..."
        )
        figures = TemporalNullValuePlotter(state=self._state).plot()
        template = Template(create_template())
        # Variables made available to the template.
        context = {
            "nrows": f"{nrows:,}",
            "ncols": f"{ncols:,}",
            "columns": ", ".join(self._state.dataframe.columns),
            "temporal_column": self._state.temporal_column,
            "figure": figure2html(figures["temporal_null_value"], close_fig=True),
            "table": create_table(
                frame=self._state.dataframe,
                temporal_column=self._state.temporal_column,
                period=self._state.period,
            ),
        }
        return template.render(context)
def create_template() -> str:
    r"""Return the template of the content.

    The template is a Jinja2 template string that references the
    variables ``temporal_column``, ``figure`` and ``table``.

    Returns:
        The content template.

    Example usage:

    ```pycon

    >>> from arkas.content.temporal_null_value import create_template
    >>> template = create_template()

    ```
    """
    lines = (
        "<p>This section analyzes the temporal distribution of null values in all columns.",
        "The column <em>{{temporal_column}}</em> is used as the temporal column.</p>",
        "{{figure}}",
        "<details>",
        "<summary>[show statistics per temporal period]</summary>",
        '<p style="margin-top: 1rem;">The following table shows some statistics for each period.</p>',
        "{{table}}",
        "</details>",
        "",  # the template ends with a newline
    )
    return "\n".join(lines)
def create_table(frame: pl.DataFrame, temporal_column: str, period: str) -> str:
    r"""Build the HTML table showing the number of null values for
    each temporal period.

    Args:
        frame: The DataFrame to analyze. An empty string is returned
            if the DataFrame is empty.
        temporal_column: The temporal column used to analyze the
            temporal distribution. All the other columns of ``frame``
            are analyzed.
        period: The temporal period e.g. monthly or daily.

    Returns:
        The HTML representation of the table.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.content.temporal_null_value import create_table
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [None, 1.0, 0.0, 1.0],
    ...         "col2": [None, 1, 0, None],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Float64,
    ...         "col2": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> table = create_table(frame=frame, temporal_column="datetime", period="1mo")

    ```
    """
    if frame.is_empty():
        return ""

    # Every column except the temporal column is analyzed.
    value_columns = list(frame.columns)
    value_columns.remove(temporal_column)
    nulls, totals, labels = compute_temporal_null_count(
        frame=frame, columns=value_columns, temporal_column=temporal_column, period=period
    )
    html_rows = [
        create_table_row(label=label, num_nulls=null, total=total)
        for label, null, total in zip(labels, nulls, totals)
    ]
    template = Template(
        """<table class="table table-hover table-responsive w-auto" >
<thead class="thead table-group-divider">
<tr>
<th>period</th>
<th>number of null values</th>
<th>number of non-null values</th>
<th>total number of values</th>
<th>percentage of null values</th>
<th>percentage of non-null values</th>
</tr>
</thead>
<tbody class="tbody table-group-divider">
{{rows}}
<tr class="table-group-divider"></tr>
</tbody>
</table>
"""
    )
    return template.render({"rows": "\n".join(html_rows), "period": period})
def create_table_row(label: str, num_nulls: int, total: int) -> str:
    r"""Create the HTML code of a new table row.

    Args:
        label: The label of the row.
        num_nulls: The number of null values.
        total: The total number of values. If it is not positive,
            the percentages are undefined and shown as ``nan%``.

    Returns:
        The HTML code of a row.

    Example usage:

    ```pycon

    >>> from arkas.content.temporal_null_value import create_table_row
    >>> row = create_table_row(label="col", num_nulls=5, total=42)

    ```
    """
    num_non_nulls = total - num_nulls
    # Guard against a division by zero when the period has no value.
    if total > 0:
        num_nulls_pct = 100 * num_nulls / total
        num_non_nulls_pct = 100 * num_non_nulls / total
    else:
        num_nulls_pct = num_non_nulls_pct = float("nan")
    return Template(
        """<tr>
<th>{{label}}</th>
<td {{num_style}}>{{num_nulls}}</td>
<td {{num_style}}>{{num_non_nulls}}</td>
<td {{num_style}}>{{total}}</td>
<td {{num_style}}>{{num_nulls_pct}}</td>
<td {{num_style}}>{{num_non_nulls_pct}}</td>
</tr>"""
    ).render(
        {
            "num_style": 'style="text-align: right;"',
            "label": label,
            "num_nulls": f"{num_nulls:,}",
            "num_non_nulls": f"{num_non_nulls:,}",
            "total": f"{total:,}",
            "num_nulls_pct": f"{num_nulls_pct:.2f}%",
            "num_non_nulls_pct": f"{num_non_nulls_pct:.2f}%",
        }
    )
r"""Implement an output to analyze a series with continuous values."""
from __future__ import annotations
__all__ = ["ContinuousSeriesOutput"]
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.content.continuous_series import ContinuousSeriesContentGenerator
from arkas.evaluator2.vanilla import Evaluator
from arkas.output.lazy import BaseLazyOutput
from arkas.plotter.continuous_series import ContinuousSeriesPlotter
if TYPE_CHECKING:
from arkas.state.series import SeriesState
class ContinuousSeriesOutput(BaseLazyOutput):
    r"""Output used to analyze a series with continuous values.

    The content generator and the plotter are built from the state,
    whereas the evaluator is created without arguments.

    Args:
        state: The state containing the Series to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.output import ContinuousSeriesOutput
    >>> from arkas.state import SeriesState
    >>> output = ContinuousSeriesOutput(SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7])))
    >>> output
    ContinuousSeriesOutput(
     (state): SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    ContinuousSeriesContentGenerator(
     (state): SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    ContinuousSeriesPlotter(
     (state): SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: SeriesState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Objects of a different class are never equal; otherwise compare states.
        return isinstance(other, self.__class__) and self._state.equal(
            other._state, equal_nan=equal_nan
        )

    def _get_content_generator(self) -> ContinuousSeriesContentGenerator:
        generator = ContinuousSeriesContentGenerator(self._state)
        return generator

    def _get_evaluator(self) -> Evaluator:
        evaluator = Evaluator()
        return evaluator

    def _get_plotter(self) -> ContinuousSeriesPlotter:
        plotter = ContinuousSeriesPlotter(self._state)
        return plotter
r"""Implement an output to analyze the number of null values per
column."""
from __future__ import annotations
__all__ = ["NullValueOutput"]
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.content.null_value import NullValueContentGenerator
from arkas.evaluator2.vanilla import Evaluator
from arkas.output.lazy import BaseLazyOutput
from arkas.plotter.null_value import NullValuePlotter
if TYPE_CHECKING:
from arkas.state.null_value import NullValueState
class NullValueOutput(BaseLazyOutput):
    r"""Output used to analyze the number of null values per column.

    The content generator and the plotter are built from the state,
    whereas the evaluator is created without arguments.

    Args:
        state: The state containing the number of null values per
            column.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.output import NullValueOutput
    >>> from arkas.state import NullValueState
    >>> output = NullValueOutput(
    ...     NullValueState(
    ...         null_count=np.array([0, 1, 2]),
    ...         total_count=np.array([5, 5, 5]),
    ...         columns=["col1", "col2", "col3"],
    ...     )
    ... )
    >>> output
    NullValueOutput(
     (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    NullValueContentGenerator(
     (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    NullValuePlotter(
     (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: NullValueState) -> None:
        self._state = state

    def __repr__(self) -> str:
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def __str__(self) -> str:
        body = str_indent(str_mapping({"state": self._state}))
        return f"{self.__class__.__qualname__}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Objects of a different class are never equal; otherwise compare states.
        return isinstance(other, self.__class__) and self._state.equal(
            other._state, equal_nan=equal_nan
        )

    def _get_content_generator(self) -> NullValueContentGenerator:
        generator = NullValueContentGenerator(self._state)
        return generator

    def _get_evaluator(self) -> Evaluator:
        evaluator = Evaluator()
        return evaluator

    def _get_plotter(self) -> NullValuePlotter:
        plotter = NullValuePlotter(self._state)
        return plotter
r"""Implement an output to analyze the number of null values in a
DataFrame."""
from __future__ import annotations
__all__ = ["TemporalNullValueOutput"]
from typing import TYPE_CHECKING, Any
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.content.temporal_null_value import TemporalNullValueContentGenerator
from arkas.evaluator2.vanilla import Evaluator
from arkas.output.lazy import BaseLazyOutput
from arkas.plotter.temporal_null_value import TemporalNullValuePlotter
if TYPE_CHECKING:
from arkas.state.temporal_dataframe import TemporalDataFrameState
class TemporalNullValueOutput(BaseLazyOutput):
    r"""Output that exposes the number of null values in a DataFrame
    over time.

    The output keeps a single state object and hands it to the content
    generator and the plotter that it creates.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.output import TemporalNullValueOutput
    >>> from arkas.state import TemporalDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0],
    ...         "col2": [0, 1, 0, 1],
    ...         "col3": [1, 0, 0, 0],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Int64,
    ...         "col2": pl.Int64,
    ...         "col3": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> output = TemporalNullValueOutput(
    ...     TemporalDataFrameState(frame, temporal_column="datetime")
    ... )
    >>> output
    TemporalNullValueOutput(
     (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_content_generator()
    TemporalNullValueContentGenerator(
     (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
    )
    >>> output.get_evaluator()
    Evaluator(count=0)
    >>> output.get_plotter()
    TemporalNullValuePlotter(
     (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period=None, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    def __init__(self, state: TemporalDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        name = self.__class__.__qualname__
        body = str_indent(str_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Objects of a different class are never equal; otherwise the
        # comparison is delegated to the states.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def _get_content_generator(self) -> TemporalNullValueContentGenerator:
        generator = TemporalNullValueContentGenerator(self._state)
        return generator

    def _get_evaluator(self) -> Evaluator:
        evaluator = Evaluator()
        return evaluator

    def _get_plotter(self) -> TemporalNullValuePlotter:
        plotter = TemporalNullValuePlotter(self._state)
        return plotter
r"""Contain CDF plotting functions."""
from __future__ import annotations
__all__ = ["plot_cdf"]
from typing import TYPE_CHECKING
import numpy as np
from arkas.utils.array import nonnan
if TYPE_CHECKING:
from matplotlib.axes import Axes
def plot_cdf(
    ax: Axes,
    array: np.ndarray,
    nbins: int | None = None,
    xmin: float = float("-inf"),
    xmax: float = float("inf"),
    color: str = "tab:blue",
    labelcolor: str = "black",
) -> None:
    r"""Plot the cumulative distribution function (CDF).

    The array is flattened and passed through ``nonnan`` first. Values
    outside ``[xmin, xmax]`` are not binned, but they still contribute
    to the CDF: values below ``xmin`` are added to every bin and the
    normalisation uses the total number of values.

    Args:
        ax: The axes of the matplotlib figure to update.
        array: The array with the data.
        nbins: The number of bins to use to plot the CDF. If ``None``
            (or ``0``), ``min(1000, number of values)`` is used.
        xmin: The lower bound (numeric value) of the range to bin.
            Unlike the histogram helpers, quantile strings are not
            interpreted here; the value is compared numerically.
        xmax: The upper bound (numeric value) of the range to bin.
        color: The plot color.
        labelcolor: The label color.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import plot_cdf
    >>> fig, ax = plt.subplots()
    >>> plot_cdf(ax, array=np.arange(101))

    ```
    """
    array = nonnan(array.ravel())
    if array.size == 0:
        return
    nbins = nbins or min(1000, array.size)
    # Values outside the plotted range still count towards the CDF.
    nleft = np.count_nonzero(array < xmin)
    nright = np.count_nonzero(array > xmax)
    inside = array[(array >= xmin) & (array <= xmax)]
    counts, edges = np.histogram(inside, bins=nbins)
    cdf = (np.cumsum(counts) + nleft) / (counts.sum() + nleft + nright)
    # The CDF is drawn at the centre of each bin.
    x = (edges[:-1] + edges[1:]) * 0.5
    ax.tick_params(axis="y", labelcolor=labelcolor)
    ax.plot(x, cdf, color=color, label="CDF")
    ax.set_ylim(0.0, 1.0)
    ax.set_ylabel("cumulative distribution function (CDF)", color=labelcolor)
r"""Contain plotting functions to analyze continuous values."""
from __future__ import annotations
__all__ = [
"boxplot_continuous",
"boxplot_continuous_temporal",
"hist_continuous",
"hist_continuous2",
]
from typing import TYPE_CHECKING
import numpy as np
from arkas.plot.cdf import plot_cdf
from arkas.plot.utils import (
auto_yscale_continuous,
axvline_quantile,
readable_xticklabels,
)
from arkas.utils.array import nonnan
from arkas.utils.range import find_range
if TYPE_CHECKING:
from collections.abc import Sequence
from matplotlib.axes import Axes
def boxplot_continuous(
    ax: Axes,
    array: np.ndarray,
    xmin: float | str | None = None,
    xmax: float | str | None = None,
) -> None:
    r"""Plot the boxplot of an array containing continuous values.

    The array is flattened and passed through ``nonnan`` before
    plotting, as in ``boxplot_continuous_temporal``. Nothing is drawn
    if no value is left.

    Args:
        ax: The axes of the matplotlib figure to update.
        array: The array with the data.
        xmin: The minimum value of the range or its
            associated quantile. ``q0.1`` means the 10% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        xmax: The maximum value of the range or its
            associated quantile. ``q0.9`` means the 90% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import boxplot_continuous
    >>> fig, ax = plt.subplots()
    >>> boxplot_continuous(ax, array=np.arange(101))

    ```
    """
    # NaN values would make the box statistics NaN, so they are removed
    # first (same convention as the other plotting helpers of this module).
    array = nonnan(array.ravel())
    if array.size == 0:
        return
    xmin, xmax = find_range(array, xmin=xmin, xmax=xmax)
    ax.boxplot(
        array,
        notch=True,
        vert=False,
        widths=0.7,
        patch_artist=True,
        boxprops={"facecolor": "lightblue"},
    )
    readable_xticklabels(ax, max_num_xticks=100)
    # Only restrict the x-axis when the range is not degenerate.
    if xmin < xmax:
        ax.set_xlim(xmin, xmax)
    ax.set_ylabel(" ")
def boxplot_continuous_temporal(
    ax: Axes,
    data: Sequence[np.ndarray],
    steps: Sequence,
    ymin: float | str | None = None,
    ymax: float | str | None = None,
    yscale: str = "linear",
) -> None:
    r"""Plot one boxplot per time step for continuous values.

    Args:
        ax: The axes of the matplotlib figure to update.
        data: The sequence of data where each item is a 1-d array with
            the values of the time step.
        steps: The sequence time step names.
        ymin: The minimum value of the range or its
            associated quantile. ``q0.1`` means the 10% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        ymax: The maximum value of the range or its
            associated quantile. ``q0.9`` means the 90% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        yscale: The y-axis scale. If ``'auto'``, the
            ``'linear'`` or ``'log'/'symlog'`` scale is chosen based
            on the distribution.

    Raises:
        RuntimeError: if ``data`` and ``steps`` have different lengths

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import boxplot_continuous_temporal
    >>> fig, ax = plt.subplots()
    >>> rng = np.random.default_rng()
    >>> data = [rng.standard_normal(1000) for _ in range(10)]
    >>> boxplot_continuous_temporal(ax, data=data, steps=list(range(len(data))))

    ```
    """
    if len(data) == 0:
        return
    if len(data) != len(steps):
        msg = f"data and steps have different lengths: {len(data):,} vs {len(steps):,}"
        raise RuntimeError(msg)
    data = [nonnan(x) for x in data]
    # matplotlib places boxes at 1..N by default. The positions are given
    # explicitly so that the tick locations set below line up with the boxes.
    positions = np.arange(len(steps))
    ax.boxplot(
        data,
        positions=positions,
        notch=True,
        vert=True,
        widths=0.7,
        patch_artist=True,
        boxprops={"facecolor": "lightblue"},
    )
    array = np.concatenate(data)
    ymin, ymax = find_range(array, xmin=ymin, xmax=ymax)
    # Only restrict the y-axis when the range is not degenerate.
    if ymin < ymax:
        ax.set_ylim(ymin, ymax)
    ax.set_xticks(positions, labels=steps)
    if yscale == "auto":
        yscale = auto_yscale_continuous(array=array, nbins=100)
    ax.set_yscale(yscale)
    readable_xticklabels(ax)
def hist_continuous(
    ax: Axes,
    array: np.ndarray,
    nbins: int | None = None,
    density: bool = False,
    yscale: str = "linear",
    xmin: float | str | None = None,
    xmax: float | str | None = None,
    cdf: bool = True,
    quantile: bool = True,
) -> None:
    r"""Plot the histogram of an array containing continuous values.

    Optionally, the CDF is drawn on a twin y-axis and vertical lines
    mark the 5% and 95% quantiles when they fall strictly inside the
    plotted range.

    Args:
        ax: The axes of the matplotlib figure to update.
        array: The array with the data.
        nbins: The number of bins to use to plot.
        density: If True, draw and return a probability density:
            each bin will display the bin's raw count divided by the
            total number of counts and the bin width, so that the area
            under the histogram integrates to 1.
        yscale: The y-axis scale. If ``'auto'``, the
            ``'linear'`` or ``'log'/'symlog'`` scale is chosen based
            on the distribution.
        xmin: The minimum value of the range or its
            associated quantile. ``q0.1`` means the 10% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        xmax: The maximum value of the range or its
            associated quantile. ``q0.9`` means the 90% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        cdf: If ``True``, the CDF is added to the plot.
        quantile: If ``True``, the 5% and 95% quantiles are added to
            the plot.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import hist_continuous
    >>> fig, ax = plt.subplots()
    >>> hist_continuous(ax, array=np.arange(101))

    ```
    """
    array = array.ravel()
    if array.size == 0:
        return

    xmin, xmax = find_range(array, xmin=xmin, xmax=xmax)
    ax.hist(array, bins=nbins, range=(xmin, xmax), color="tab:blue", alpha=0.9, density=density)
    readable_xticklabels(ax, max_num_xticks=100)
    # Only restrict the x-axis when the range is not degenerate.
    if xmin < xmax:
        ax.set_xlim(xmin, xmax)

    if density:
        ylabel = "density (number of occurrences/total)"
    else:
        ylabel = "number of occurrences"
    ax.set_ylabel(ylabel)

    if yscale == "auto":
        yscale = auto_yscale_continuous(array=array, nbins=nbins)
    ax.set_yscale(yscale)

    if cdf:
        # The CDF lives on its own y-axis so that it does not share the
        # scale of the histogram.
        plot_cdf(
            ax=ax.twinx(),
            array=array,
            nbins=nbins,
            xmin=xmin,
            xmax=xmax,
            color="tab:red",
            labelcolor="tab:red",
        )

    if quantile:
        low, high = np.quantile(array, q=[0.05, 0.95])
        # A quantile line is only drawn when it is strictly inside the range.
        if xmin < low < xmax:
            axvline_quantile(ax, quantile=low, label="q0.05 ", horizontalalignment="right")
        if xmin < high < xmax:
            axvline_quantile(ax, quantile=high, label=" q0.95", horizontalalignment="left")
def hist_continuous2(
    ax: Axes,
    array1: np.ndarray,
    array2: np.ndarray,
    label1: str = "first",
    label2: str = "second",
    nbins: int | None = None,
    density: bool = False,
    yscale: str = "linear",
    xmin: float | str | None = None,
    xmax: float | str | None = None,
) -> None:
    r"""Plot the histogram of two arrays to compare the distributions.

    Both histograms share the same range, which is computed from the
    concatenation of the two flattened arrays.

    Args:
        ax: The axes of the matplotlib figure to update.
        array1: The first array with the data.
        array2: The second array with the data.
        label1: The label associated to the first array.
        label2: The label associated to the second array.
        nbins: The number of bins to use to plot.
        density: If True, draw and return a probability density:
            each bin will display the bin's raw count divided by the
            total number of counts and the bin width, so that the area
            under the histogram integrates to 1.
        yscale: The y-axis scale. If ``'auto'``, the
            ``'linear'`` or ``'log'/'symlog'`` scale is chosen based
            on the distribution.
        xmin: The minimum value of the range or its
            associated quantile. ``q0.1`` means the 10% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.
        xmax: The maximum value of the range or its
            associated quantile. ``q0.9`` means the 90% quantile.
            ``0`` is the minimum value and ``1`` is the maximum value.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import hist_continuous2
    >>> fig, ax = plt.subplots()
    >>> hist_continuous2(ax, array1=np.arange(101), array2=np.arange(51))

    ```
    """
    array1 = array1.ravel()
    array2 = array2.ravel()
    array = np.concatenate([array1, array2])
    if array.size == 0:
        return

    xmin, xmax = find_range(array, xmin=xmin, xmax=xmax)
    # The two histograms only differ by their data, color and label.
    series = ((array1, "tab:blue", label1), (array2, "tab:orange", label2))
    for values, color, label in series:
        ax.hist(
            values,
            bins=nbins,
            range=(xmin, xmax),
            color=color,
            alpha=0.5,
            label=label,
            density=density,
        )
    readable_xticklabels(ax, max_num_xticks=100)
    # Only restrict the x-axis when the range is not degenerate.
    if xmin < xmax:
        ax.set_xlim(xmin, xmax)

    if density:
        ylabel = "density (number of occurrences/total)"
    else:
        ylabel = "number of occurrences"
    ax.set_ylabel(ylabel)

    if yscale == "auto":
        yscale = auto_yscale_continuous(array=array, nbins=nbins)
    ax.set_yscale(yscale)
    ax.legend()
r"""Contain plotting functions to analyze discrete values."""
from __future__ import annotations
__all__ = ["bar_discrete", "bar_discrete_temporal"]
from typing import TYPE_CHECKING
import numpy as np
from matplotlib import pyplot as plt
from arkas.plot.utils import auto_yscale_discrete, readable_xticklabels
if TYPE_CHECKING:
from collections.abc import Sequence
from matplotlib.axes import Axes
def bar_discrete(
    ax: Axes,
    names: Sequence,
    counts: Sequence[int],
    yscale: str = "auto",
) -> None:
    r"""Plot a bar chart with the number of occurrences of discrete
    values.

    Nothing is drawn if ``names`` is empty.

    Args:
        ax: The axes of the matplotlib figure to update.
        names: The name of the values to plot.
        counts: The number of value occurrences.
        yscale: The y-axis scale. If ``'auto'``, the
            ``'linear'`` or ``'log'/'symlog'`` scale is chosen based
            on the distribution.

    Example usage:

    ```pycon

    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import bar_discrete
    >>> fig, ax = plt.subplots()
    >>> bar_discrete(ax, names=["a", "b", "c", "d"], counts=[5, 100, 42, 27])

    ```
    """
    n = len(names)
    if n == 0:
        return
    x = np.arange(n)
    # Bars touch each other when there are many values.
    ax.bar(x, counts, width=0.9 if n < 50 else 1, color="tab:blue")
    if yscale == "auto":
        yscale = auto_yscale_discrete(min_count=min(counts), max_count=max(counts))
    ax.set_yscale(yscale)
    # matplotlib documents ``labels`` as a list of str, so a concrete list
    # is passed instead of a one-shot iterator.
    ax.set_xticks(x, labels=[str(name) for name in names])
    readable_xticklabels(ax, max_num_xticks=100)
    ax.set_xlim(-0.5, n - 0.5)
    ax.set_xlabel("values")
    ax.set_ylabel("number of occurrences")
def bar_discrete_temporal(
    ax: Axes,
    counts: np.ndarray,
    steps: Sequence | None = None,
    values: Sequence | None = None,
    proportion: bool = False,
) -> None:
    r"""Plot the temporal distribution of discrete values.

    One stacked bar is drawn per step, with one segment per value.
    A legend is added only when there are between 1 and 10 values
    whose name is not ``None``.

    Args:
        ax: The axes of the matplotlib figure to update.
        counts: A 2-d array that indicates the number of occurrences
            for each value and time step. The first dimension
            represents the value and the second dimension
            represents the steps.
        steps: The name associated to each step.
        values: The name associated to each value.
        proportion: If ``True``, it plots the normalized number of
            occurrences for each step.

    Raises:
        RuntimeError: if ``values`` or ``steps`` does not match the
            corresponding dimension of ``counts``.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import bar_discrete_temporal
    >>> fig, ax = plt.subplots()
    >>> bar_discrete_temporal(
    ...     ax, counts=np.ones((5, 20)), values=list(range(5)), steps=list(range(20))
    ... )

    ```
    """
    if counts.size == 0:
        return
    num_values, num_steps = counts.shape
    values = _prepare_values_bar_discrete_temporal(values=values, num_values=num_values)
    steps = _prepare_steps_bar_discrete_temporal(steps=steps, num_steps=num_steps)
    counts = _prepare_counts_bar_discrete_temporal(counts=counts, proportion=proportion)
    x = np.arange(num_steps, dtype=np.int64)
    # Running height of each stacked bar.
    bottom = np.zeros(num_steps, dtype=counts.dtype)
    # Bars touch each other when there are many steps.
    width = 0.9 if num_steps < 50 else 1
    cmap = plt.get_cmap("viridis")
    for i, (value, count) in enumerate(zip(values, counts)):
        ax.bar(x, count, label=value, bottom=bottom, width=width, color=cmap(i / num_values))
        bottom += count
    num_valid_values = sum(value is not None for value in values)
    if 0 < num_valid_values <= 10:
        ax.legend()
    ax.set_xticks(x, labels=steps)
    readable_xticklabels(ax, max_num_xticks=100)
    ax.set_xlim(-0.5, num_steps - 0.5)
    # The steps are on the x-axis. The original code set this label on the
    # y-axis, where it was immediately overwritten by the next call.
    ax.set_xlabel("steps")
    ax.set_ylabel("proportion" if proportion else "number of occurrences")
def _prepare_values_bar_discrete_temporal(values: Sequence | None, num_values: int) -> list:
    r"""Return the value names as a list of the expected length.

    This function was designed to be used in ``bar_discrete_temporal``.

    Args:
        values: The sequence of values.
        num_values: The expected number of values.

    Returns:
        A new list with the items of ``values``. If ``values`` is
            ``None``, a list of ``num_values`` ``None`` items is
            returned.

    Raises:
        RuntimeError: if the length of ``values`` does not match with
            ``num_values``.
    """
    if values is not None:
        if len(values) == num_values:
            return list(values)
        msg = (
            f"values length ({len(values):,}) do not match with the count matrix "
            f"first dimension ({num_values:,})"
        )
        raise RuntimeError(msg)
    return [None for _ in range(num_values)]
def _prepare_steps_bar_discrete_temporal(steps: Sequence | None, num_steps: int) -> list:
    r"""Return the step names as a list of the expected length.

    This function was designed to be used in ``bar_discrete_temporal``.

    Args:
        steps: The sequence of steps.
        num_steps: The expected number of steps.

    Returns:
        A new list with the items of ``steps``. If ``steps`` is
            ``None``, the list ``[0, 1, ..., num_steps - 1]`` is
            returned.

    Raises:
        RuntimeError: if the length of ``steps`` does not match with
            ``num_steps``.
    """
    if steps is not None:
        if len(steps) == num_steps:
            return list(steps)
        msg = (
            f"steps length ({len(steps):,}) do not match with the count matrix "
            f"second dimension ({num_steps:,})"
        )
        raise RuntimeError(msg)
    return [*range(num_steps)]
def _prepare_counts_bar_discrete_temporal(counts: np.ndarray, proportion: bool) -> np.ndarray:
    r"""Prepare the count matrix.

    This function was designed to be used in ``bar_discrete_temporal``.

    Args:
        counts: A 2-d array that indicates the number of occurrences
            for each value and time step. The first dimension
            represents the value and the second dimension
            represents the steps.
        proportion: If ``True``, each column (step) of the count
            matrix is divided by its sum.

    Returns:
        The input matrix itself if ``proportion`` is ``False``,
            otherwise the normalized matrix.
    """
    if proportion:
        # The per-step totals are clipped to at least 1, so a step with a
        # zero total yields zeros rather than a division by zero.
        totals = np.clip(counts.sum(axis=0), a_min=1, a_max=None)
        return counts / totals
    return counts
r"""Contain functionalities to plot the temporal distribution of the
number of missing values."""
from __future__ import annotations
__all__ = ["plot_null_temporal"]
from typing import TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from collections.abc import Sequence
from matplotlib.axes import Axes
def plot_null_temporal(ax: Axes, nulls: Sequence, totals: Sequence, labels: Sequence) -> None:
    r"""Plot the temporal distribution of the number of missing values.

    ``nulls``, ``totals``, and ``labels`` must have the same length
    and have the same order. The total and null counts are drawn as
    bars on ``ax`` and the ratio ``nulls / totals`` is drawn as a line
    on a twin y-axis. Nothing is drawn if the inputs are empty.

    Args:
        ax: The Axes object that encapsulates all the elements of an
            individual (sub-)plot in a figure.
        nulls: The number of null values for each temporal period.
        totals: The number of total values for each temporal period.
        labels: The labels for each temporal period.

    Raises:
        ValueError: if ``nulls``, ``totals``, and ``labels`` have
            different lengths.

    Example usage:

    ```pycon

    >>> from matplotlib import pyplot as plt
    >>> from arkas.plot import plot_null_temporal
    >>> fig, ax = plt.subplots()
    >>> plot_null_temporal(
    ...     ax, nulls=[1, 2, 3, 4], totals=[10, 12, 14, 16], labels=["jan", "feb", "mar", "apr"]
    ... )

    ```
    """
    if len(nulls) != len(totals):
        msg = f"nulls ({len(nulls):,}) and totals ({len(totals):,}) have different lengths"
        raise ValueError(msg)
    if len(labels) != len(totals):
        msg = f"nulls ({len(nulls):,}) and labels ({len(labels):,}) have different lengths"
        raise ValueError(msg)
    if len(nulls) == 0:
        return
    labels = list(map(str, labels))
    nulls = np.asarray(nulls)
    totals = np.asarray(totals)
    # A period without any value has an undefined ratio: it is left as NaN
    # instead of triggering a division-by-zero warning.
    ratio = np.divide(
        nulls, totals, out=np.full(totals.shape, np.nan, dtype=float), where=totals != 0
    )

    color = "tab:blue"
    x = np.arange(len(labels))
    ax.set_ylabel("number of null/total values", color=color)
    ax.tick_params(axis="y", labelcolor=color)
    ax.bar(x=x, height=totals, color="tab:cyan", alpha=0.5, label="total")
    ax.bar(x=x, height=nulls, color=color, alpha=0.8, label="null")
    ax.legend()

    # The ratio uses its own y-axis because its scale differs from the counts.
    ax2 = ax.twinx()
    color = "black"
    ax2.set_ylabel("percentage", color=color)
    ax2.tick_params(axis="y", labelcolor=color)
    ax2.plot(x, ratio, "o-", color=color)

    ax.set_xticks(x, labels=labels)
    ax.set_xlim(-0.5, len(labels) - 0.5)
r"""Contain the implementation of a plotter to analyze a Series with
continuous values."""
from __future__ import annotations
__all__ = ["BaseFigureCreator", "ContinuousSeriesPlotter", "MatplotlibFigureCreator"]
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any
import matplotlib.pyplot as plt
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.figure.creator import FigureCreatorRegistry
from arkas.figure.html import HtmlFigure
from arkas.figure.matplotlib import MatplotlibFigure, MatplotlibFigureConfig
from arkas.figure.utils import MISSING_FIGURE_MESSAGE
from arkas.plot.continuous import hist_continuous
from arkas.plot.utils.hist import adjust_nbins
from arkas.plotter.base import BasePlotter
from arkas.plotter.vanilla import Plotter
from arkas.utils.array import filter_range, nonnan, to_array
from arkas.utils.range import find_range
if TYPE_CHECKING:
from arkas.figure.base import BaseFigure
from arkas.state.series import SeriesState
class BaseFigureCreator(ABC):
    r"""Define the base class to create a figure with the content of the
    column.

    Concrete creators implement ``create``; ``MatplotlibFigureCreator``
    in this module is the matplotlib implementation.
    """

    @abstractmethod
    def create(self, state: SeriesState) -> BaseFigure:
        r"""Create a figure with the content of the column.

        Args:
            state: The state containing the Series to analyze.

        Returns:
            The generated figure.

        Example usage:

        ```pycon

        >>> import polars as pl
        >>> from arkas.figure import MatplotlibFigureConfig
        >>> from arkas.state import SeriesState
        >>> creator = MatplotlibFigureCreator()
        >>> fig = creator.create(SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7])))

        ```
        """
class MatplotlibFigureCreator(BaseFigureCreator):
    r"""Create a matplotlib figure with the distribution of the values
    of a column.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.figure import MatplotlibFigureConfig
    >>> from arkas.state import SeriesState
    >>> creator = MatplotlibFigureCreator()
    >>> fig = creator.create(SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7])))

    ```
    """

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        return f"{name}()"

    def create(self, state: SeriesState) -> BaseFigure:
        values = nonnan(to_array(state.series))
        if values.size == 0:
            # Nothing to plot: a placeholder figure is returned instead.
            return HtmlFigure(MISSING_FIGURE_MESSAGE)

        config = state.figure_config
        fig, ax = plt.subplots(**config.get_arg("init", {}))
        xmin, xmax = find_range(
            values,
            xmin=config.get_arg("xmin"),
            xmax=config.get_arg("xmax"),
        )
        # The number of bins is adjusted from the values inside the range.
        nbins = adjust_nbins(
            nbins=config.get_arg("nbins"),
            array=filter_range(values, xmin=xmin, xmax=xmax),
        )
        hist_continuous(
            ax=ax,
            array=values,
            nbins=nbins,
            xmin=xmin,
            xmax=xmax,
            yscale=config.get_arg("yscale", default="linear"),
        )
        ax.set_title(f"data distribution for column {state.series.name!r}")
        fig.tight_layout()
        return MatplotlibFigure(fig)
class ContinuousSeriesPlotter(BasePlotter):
    r"""Plotter that analyzes a column with continuous values.

    The figure is produced by the creator registered in ``registry``
    for the backend of the state's figure configuration.

    Args:
        state: The state containing the Series to analyze.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.plotter import ContinuousSeriesPlotter
    >>> from arkas.state import SeriesState
    >>> plotter = ContinuousSeriesPlotter(SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7])))
    >>> plotter
    ContinuousSeriesPlotter(
     (state): SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    registry = FigureCreatorRegistry[BaseFigureCreator](
        {MatplotlibFigureConfig.backend(): MatplotlibFigureCreator()}
    )

    def __init__(self, state: SeriesState) -> None:
        self._state = state

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        name = self.__class__.__qualname__
        body = str_indent(str_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def compute(self) -> Plotter:
        figures = self.plot()
        return Plotter(figures)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Objects of a different class are never equal; otherwise the
        # comparison is delegated to the states.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def plot(self, prefix: str = "", suffix: str = "") -> dict:
        backend = self._state.figure_config.backend()
        creator = self.registry.find_creator(backend)
        figure = creator.create(self._state)
        return {f"{prefix}continuous_histogram{suffix}": figure}
r"""Contain the implementation of a plotter that plots the number of
null values for each column."""
from __future__ import annotations
__all__ = ["BaseFigureCreator", "MatplotlibFigureCreator", "NullValuePlotter"]
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any
import matplotlib.pyplot as plt
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from arkas.figure.creator import FigureCreatorRegistry
from arkas.figure.html import HtmlFigure
from arkas.figure.matplotlib import MatplotlibFigure, MatplotlibFigureConfig
from arkas.figure.utils import MISSING_FIGURE_MESSAGE
from arkas.plot.utils import readable_xticklabels
from arkas.plotter.base import BasePlotter
from arkas.plotter.vanilla import Plotter
if TYPE_CHECKING:
from arkas.figure.base import BaseFigure
from arkas.state.null_value import NullValueState
class BaseFigureCreator(ABC):
    r"""Define the base class to create a bar plot figure with the
    number of null values for each column.

    Concrete creators implement ``create``; ``MatplotlibFigureCreator``
    in this module is the matplotlib implementation.
    """

    @abstractmethod
    def create(self, state: NullValueState) -> BaseFigure:
        r"""Create a bar plot figure with the number of null values for
        each column.

        Args:
            state: The state containing the number of null values per
                column.

        Returns:
            The generated figure.

        Example usage:

        ```pycon

        >>> import numpy as np
        >>> from arkas.plotter.null_value import MatplotlibFigureCreator
        >>> from arkas.state import NullValueState
        >>> creator = MatplotlibFigureCreator()
        >>> fig = creator.create(
        ...     NullValueState(
        ...         null_count=np.array([0, 1, 2]),
        ...         total_count=np.array([5, 5, 5]),
        ...         columns=["col1", "col2", "col3"],
        ...     )
        ... )

        ```
        """
class MatplotlibFigureCreator(BaseFigureCreator):
    r"""Create a matplotlib bar plot with the number of null values for
    each column.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.figure import MatplotlibFigureConfig
    >>> from arkas.state import NullValueState
    >>> creator = MatplotlibFigureCreator()
    >>> fig = creator.create(
    ...     NullValueState(
    ...         null_count=np.array([0, 1, 2]),
    ...         total_count=np.array([5, 5, 5]),
    ...         columns=["col1", "col2", "col3"],
    ...     )
    ... )

    ```
    """

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        return f"{name}()"

    def create(self, state: NullValueState) -> BaseFigure:
        if state.null_count.shape[0] == 0:
            # Nothing to plot: a placeholder figure is returned instead.
            return HtmlFigure(MISSING_FIGURE_MESSAGE)

        fig, ax = plt.subplots(**state.figure_config.get_arg("init", {}))
        # The bars are ordered by the "null" column, then by column name.
        frame = state.to_dataframe().sort(by=["null", "column"])
        names = frame["column"].to_list()
        heights = frame["null"].to_numpy()
        ax.bar(x=names, height=heights, color="tab:blue")
        ax.set_xlim(-0.5, len(state.columns) - 0.5)
        readable_xticklabels(ax, max_num_xticks=100)
        ax.set_xlabel("column")
        ax.set_ylabel("number of null values")
        ax.set_title("number of null values per column")
        fig.tight_layout()
        return MatplotlibFigure(fig)
class NullValuePlotter(BasePlotter):
    r"""Plotter that plots the number of null values for each column.

    The figure is produced by the creator registered in ``registry``
    for the backend of the state's figure configuration.

    Args:
        state: The state containing the number of null values per
            column.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.plotter import NullValuePlotter
    >>> from arkas.state import NullValueState
    >>> plotter = NullValuePlotter(
    ...     NullValueState(
    ...         null_count=np.array([0, 1, 2]),
    ...         total_count=np.array([5, 5, 5]),
    ...         columns=["col1", "col2", "col3"],
    ...     )
    ... )
    >>> plotter
    NullValuePlotter(
     (state): NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    registry = FigureCreatorRegistry[BaseFigureCreator](
        {MatplotlibFigureConfig.backend(): MatplotlibFigureCreator()}
    )

    def __init__(self, state: NullValueState) -> None:
        self._state = state

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        body = repr_indent(repr_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def __str__(self) -> str:
        name = self.__class__.__qualname__
        body = str_indent(str_mapping({"state": self._state}))
        return f"{name}(\n {body}\n)"

    def compute(self) -> Plotter:
        figures = self.plot()
        return Plotter(figures)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Objects of a different class are never equal; otherwise the
        # comparison is delegated to the states.
        if isinstance(other, self.__class__):
            return self._state.equal(other._state, equal_nan=equal_nan)
        return False

    def plot(self, prefix: str = "", suffix: str = "") -> dict:
        backend = self._state.figure_config.backend()
        creator = self.registry.find_creator(backend)
        figure = creator.create(self._state)
        return {f"{prefix}null_values{suffix}": figure}
r"""Contain the implementation of a plotter that plots the temporal
distribution of the number of null values."""
from __future__ import annotations
__all__ = ["BaseFigureCreator", "MatplotlibFigureCreator", "TemporalNullValuePlotter"]
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any
import matplotlib.pyplot as plt
from coola.utils import repr_indent, repr_mapping, str_indent, str_mapping
from grizz.utils.null import compute_temporal_null_count
from arkas.figure.creator import FigureCreatorRegistry
from arkas.figure.html import HtmlFigure
from arkas.figure.matplotlib import MatplotlibFigure, MatplotlibFigureConfig
from arkas.figure.utils import MISSING_FIGURE_MESSAGE
from arkas.plot import plot_null_temporal
from arkas.plot.utils import readable_xticklabels
from arkas.plotter.base import BasePlotter
from arkas.plotter.vanilla import Plotter
if TYPE_CHECKING:
from arkas.figure.base import BaseFigure
from arkas.state.temporal_dataframe import TemporalDataFrameState
class BaseFigureCreator(ABC):
    r"""Define the base class to create a figure from the state of a
    temporal DataFrame.

    Concrete creators implement ``create``; ``MatplotlibFigureCreator``
    in this module is the matplotlib implementation.
    """

    @abstractmethod
    def create(self, state: TemporalDataFrameState) -> BaseFigure:
        r"""Create a figure from the DataFrame stored in the state.

        Args:
            state: The state containing the DataFrame to analyze.

        Returns:
            The generated figure.

        Example usage:

        ```pycon

        >>> from datetime import datetime, timezone
        >>> import polars as pl
        >>> from arkas.plotter.temporal_null_value import MatplotlibFigureCreator
        >>> from arkas.state import TemporalDataFrameState
        >>> creator = MatplotlibFigureCreator()
        >>> frame = pl.DataFrame(
        ...     {
        ...         "col1": [0, 1, 1, 0],
        ...         "col2": [0, 1, 0, 1],
        ...         "col3": [1, 0, 0, 0],
        ...         "datetime": [
        ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
        ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
        ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
        ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
        ...         ],
        ...     },
        ...     schema={
        ...         "col1": pl.Int64,
        ...         "col2": pl.Int64,
        ...         "col3": pl.Int64,
        ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
        ...     },
        ... )
        >>> fig = creator.create(
        ...     TemporalDataFrameState(frame, temporal_column="datetime", period="1d")
        ... )

        ```
        """
class MatplotlibFigureCreator(BaseFigureCreator):
    r"""Create a matplotlib figure that shows the null values of each
    column over time.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.plotter.temporal_null_value import MatplotlibFigureCreator
    >>> from arkas.state import TemporalDataFrameState
    >>> creator = MatplotlibFigureCreator()
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0],
    ...         "col2": [0, 1, 0, 1],
    ...         "col3": [1, 0, 0, 0],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Int64,
    ...         "col2": pl.Int64,
    ...         "col3": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> fig = creator.create(
    ...     TemporalDataFrameState(frame, temporal_column="datetime", period="1d")
    ... )

    ```
    """

    def __repr__(self) -> str:
        return f"{type(self).__qualname__}()"

    def create(self, state: TemporalDataFrameState) -> BaseFigure:
        r"""Create the figure for the given state.

        Args:
            state: The state containing the DataFrame to analyze.

        Returns:
            An ``HtmlFigure`` with the missing figure message if the
                DataFrame has no row, otherwise a ``MatplotlibFigure``.
        """
        frame = state.dataframe
        if not frame.shape[0]:
            # There is nothing to plot for a DataFrame without rows.
            return HtmlFigure(MISSING_FIGURE_MESSAGE)

        init_kwargs = state.figure_config.get_arg("init", {})
        fig, ax = plt.subplots(**init_kwargs)

        # The null values are computed for every column except the
        # temporal column, which is only used to group the rows.
        value_columns = list(frame.columns)
        value_columns.remove(state.temporal_column)
        nulls, totals, labels = compute_temporal_null_count(
            frame=frame,
            columns=value_columns,
            temporal_column=state.temporal_column,
            period=state.period,
        )

        plot_null_temporal(ax=ax, labels=labels, nulls=nulls, totals=totals)
        readable_xticklabels(ax, max_num_xticks=100)
        fig.tight_layout()
        return MatplotlibFigure(fig)
class TemporalNullValuePlotter(BasePlotter):
    r"""Implement a plotter that generates the temporal null value
    figure of a DataFrame.

    The figure is created by the figure creator registered for the
    backend of the figure configuration stored in the state.

    Args:
        state: The state containing the DataFrame to analyze.

    Example usage:

    ```pycon

    >>> from datetime import datetime, timezone
    >>> import polars as pl
    >>> from arkas.plotter import TemporalNullValuePlotter
    >>> from arkas.state import TemporalDataFrameState
    >>> frame = pl.DataFrame(
    ...     {
    ...         "col1": [0, 1, 1, 0],
    ...         "col2": [0, 1, 0, 1],
    ...         "col3": [1, 0, 0, 0],
    ...         "datetime": [
    ...             datetime(year=2020, month=1, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=2, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=3, day=3, tzinfo=timezone.utc),
    ...             datetime(year=2020, month=4, day=3, tzinfo=timezone.utc),
    ...         ],
    ...     },
    ...     schema={
    ...         "col1": pl.Int64,
    ...         "col2": pl.Int64,
    ...         "col3": pl.Int64,
    ...         "datetime": pl.Datetime(time_unit="us", time_zone="UTC"),
    ...     },
    ... )
    >>> plotter = TemporalNullValuePlotter(
    ...     TemporalDataFrameState(frame, temporal_column="datetime", period="1d")
    ... )
    >>> plotter
    TemporalNullValuePlotter(
     (state): TemporalDataFrameState(dataframe=(4, 4), temporal_column='datetime', period='1d', figure_config=MatplotlibFigureConfig())
    )

    ```
    """

    # Maps a figure backend to the object that creates the figure.
    registry = FigureCreatorRegistry[BaseFigureCreator](
        {MatplotlibFigureConfig.backend(): MatplotlibFigureCreator()}
    )

    def __init__(self, state: TemporalDataFrameState) -> None:
        self._state = state

    def __repr__(self) -> str:
        name = self.__class__.__qualname__
        args = repr_indent(repr_mapping({"state": self._state}))
        return f"{name}(\n {args}\n)"

    def __str__(self) -> str:
        name = self.__class__.__qualname__
        args = str_indent(str_mapping({"state": self._state}))
        return f"{name}(\n {args}\n)"

    def compute(self) -> Plotter:
        figures = self.plot()
        return Plotter(figures)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        # Two plotters are equal if they have the same type and state.
        return isinstance(other, self.__class__) and self._state.equal(
            other._state, equal_nan=equal_nan
        )

    def plot(self, prefix: str = "", suffix: str = "") -> dict:
        backend = self._state.figure_config.backend()
        creator = self.registry.find_creator(backend)
        figure = creator.create(self._state)
        return {f"{prefix}temporal_null_value{suffix}": figure}
r"""Implement a state that contains the number of null values per
column."""
from __future__ import annotations
__all__ = ["NullValueState"]
import sys
from typing import TYPE_CHECKING, Any
import numpy as np
import polars as pl
from coola import objects_are_equal
from coola.utils.format import repr_mapping_line
from grizz.utils.null import compute_null_count
from arkas.figure import BaseFigureConfig, get_default_config
from arkas.state.base import BaseState
if sys.version_info >= (3, 11):
from typing import Self
else: # pragma: no cover
from typing_extensions import (
Self, # use backport because it was added in python 3.11
)
if TYPE_CHECKING:
from collections.abc import Sequence
class NullValueState(BaseState):
    r"""Implement a state that stores the number of null values per
    column.

    The count arrays are flattened before being stored, and each of
    them must have one item per column.

    Args:
        null_count: The array with the number of null values for each
            column.
        total_count: The total number of values for each column.
        columns: The column names.
        figure_config: An optional figure configuration. The default
            configuration is used if no configuration is given.

    Raises:
        ValueError: if the number of columns does not match the number
            of items in ``null_count`` or ``total_count``.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.state import NullValueState
    >>> state = NullValueState(
    ...     null_count=np.array([0, 1, 2]),
    ...     total_count=np.array([5, 5, 5]),
    ...     columns=["col1", "col2", "col3"],
    ... )
    >>> state
    NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())

    ```
    """

    def __init__(
        self,
        null_count: np.ndarray,
        total_count: np.ndarray,
        columns: Sequence[str],
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        self._null_count = null_count.ravel()
        self._total_count = total_count.ravel()
        self._columns = tuple(columns)
        self._figure_config = figure_config or get_default_config()

        # Each count array must have exactly one item per column.
        num_columns = len(self._columns)
        if num_columns != self._null_count.shape[0]:
            msg = (
                f"'columns' ({len(self._columns):,}) and 'null_count' "
                f"({self._null_count.shape[0]:,}) do not match"
            )
            raise ValueError(msg)
        if num_columns != self._total_count.shape[0]:
            msg = (
                f"'columns' ({len(self._columns):,}) and 'total_count' "
                f"({self._total_count.shape[0]:,}) do not match"
            )
            raise ValueError(msg)

    def __repr__(self) -> str:
        fields = {
            "num_columns": self._null_count.shape[0],
            "figure_config": self._figure_config,
        }
        return f"{self.__class__.__qualname__}({repr_mapping_line(fields)})"

    @property
    def columns(self) -> tuple[str, ...]:
        return self._columns

    @property
    def null_count(self) -> np.ndarray:
        return self._null_count

    @property
    def total_count(self) -> np.ndarray:
        return self._total_count

    @property
    def figure_config(self) -> BaseFigureConfig | None:
        return self._figure_config

    def clone(self, deep: bool = True) -> Self:
        if deep:
            # A deep clone copies the arrays and the figure config.
            return self.__class__(
                null_count=self._null_count.copy(),
                total_count=self._total_count.copy(),
                columns=self._columns,
                figure_config=self._figure_config.clone(),
            )
        return self.__class__(
            null_count=self._null_count,
            total_count=self._total_count,
            columns=self._columns,
            figure_config=self._figure_config,
        )

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        if not isinstance(other, self.__class__):
            return False
        # The attributes are compared lazily, in this order.
        names = ("null_count", "total_count", "columns", "figure_config")
        return all(
            objects_are_equal(getattr(self, name), getattr(other, name), equal_nan=equal_nan)
            for name in names
        )

    def to_dataframe(self) -> pl.DataFrame:
        r"""Export the content of the state to a DataFrame.

        Returns:
            The DataFrame with the columns ``column``, ``null`` and
                ``total``.

        Example usage:

        ```pycon

        >>> import numpy as np
        >>> from arkas.state import NullValueState
        >>> state = NullValueState(
        ...     null_count=np.array([0, 1, 2]),
        ...     total_count=np.array([5, 5, 5]),
        ...     columns=["col1", "col2", "col3"],
        ... )
        >>> state.to_dataframe()
        shape: (3, 3)
        ┌────────┬──────┬───────┐
        │ column ┆ null ┆ total │
        │ ---    ┆ ---  ┆ ---   │
        │ str    ┆ i64  ┆ i64   │
        ╞════════╪══════╪═══════╡
        │ col1   ┆ 0    ┆ 5     │
        │ col2   ┆ 1    ┆ 5     │
        │ col3   ┆ 2    ┆ 5     │
        └────────┴──────┴───────┘

        ```
        """
        data = {"column": self._columns, "null": self._null_count, "total": self._total_count}
        schema = {"column": pl.String, "null": pl.Int64, "total": pl.Int64}
        return pl.DataFrame(data, schema=schema)

    @classmethod
    def from_dataframe(
        cls, dataframe: pl.DataFrame, figure_config: BaseFigureConfig | None = None
    ) -> NullValueState:
        r"""Instantiate a ``NullValueState`` object from a DataFrame.

        The total count of each column is the number of rows in the
        DataFrame.

        Args:
            dataframe: The DataFrame.
            figure_config: An optional figure configuration.

        Returns:
            The instantiated ``NullValueState`` object.

        Example usage:

        ```pycon

        >>> import polars as pl
        >>> from arkas.state import NullValueState
        >>> frame = pl.DataFrame(
        ...     {
        ...         "col1": [0, 1, 1, 0, 0, 1, None],
        ...         "col2": [0, 1, None, None, 0, 1, 0],
        ...         "col3": [None, 0, 0, 0, None, 1, None],
        ...     }
        ... )
        >>> state = NullValueState.from_dataframe(frame)
        >>> state
        NullValueState(num_columns=3, figure_config=MatplotlibFigureConfig())

        ```
        """
        num_rows, num_cols = dataframe.shape
        names = list(dataframe.columns)
        nulls = compute_null_count(dataframe)
        totals = np.full((num_cols,), num_rows)
        return cls(
            columns=names,
            null_count=nulls,
            total_count=totals,
            figure_config=figure_config,
        )
r"""Implement the Series state."""
from __future__ import annotations
__all__ = ["SeriesState"]
import sys
from typing import TYPE_CHECKING, Any
from coola import objects_are_equal
from coola.utils.format import repr_mapping_line, str_indent, str_mapping
from arkas.figure.utils import get_default_config
from arkas.state.base import BaseState
if sys.version_info >= (3, 11):
from typing import Self
else: # pragma: no cover
from typing_extensions import (
Self, # use backport because it was added in python 3.11
)
if TYPE_CHECKING:
import polars as pl
from arkas.figure.base import BaseFigureConfig
class SeriesState(BaseState):
    r"""Implement a state that stores a Series and its figure
    configuration.

    Args:
        series: The Series.
        figure_config: An optional figure configuration. The default
            configuration is used if no configuration is given.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.state import SeriesState
    >>> state = SeriesState(pl.Series("col1", [1, 2, 3, 4, 5, 6, 7]))
    >>> state
    SeriesState(name='col1', values=(7,), figure_config=MatplotlibFigureConfig())

    ```
    """

    def __init__(
        self,
        series: pl.Series,
        figure_config: BaseFigureConfig | None = None,
    ) -> None:
        self._series = series
        self._figure_config = figure_config or get_default_config()

    def __repr__(self) -> str:
        fields = {
            "name": self._series.name,
            "values": self._series.shape,
            "figure_config": self._figure_config,
        }
        return f"{self.__class__.__qualname__}({repr_mapping_line(fields)})"

    def __str__(self) -> str:
        fields = {
            "name": self._series.name,
            "values": self._series.shape,
            "figure_config": self._figure_config,
        }
        args = str_indent(str_mapping(fields))
        return f"{self.__class__.__qualname__}({args})"

    @property
    def series(self) -> pl.Series:
        return self._series

    @property
    def figure_config(self) -> BaseFigureConfig | None:
        return self._figure_config

    def clone(self, deep: bool = True) -> Self:
        if deep:
            # A deep clone copies the Series and the figure config.
            return self.__class__(
                series=self._series.clone(),
                figure_config=self._figure_config.clone(),
            )
        return self.__class__(series=self._series, figure_config=self._figure_config)

    def equal(self, other: Any, equal_nan: bool = False) -> bool:
        if isinstance(other, self.__class__):
            mine, theirs = self.get_args(), other.get_args()
            return objects_are_equal(mine, theirs, equal_nan=equal_nan)
        return False

    def get_args(self) -> dict:
        r"""Return the arguments that define the state.

        Returns:
            A dictionary with the keys ``series`` and
                ``figure_config``.
        """
        return {
            "series": self._series,
            "figure_config": self._figure_config,
        }
r"""Contain utility functions to manage ranges of values."""
from __future__ import annotations
__all__ = ["find_range"]
import numpy as np
def find_range(
    values: np.ndarray,
    xmin: float | str | None = None,
    xmax: float | str | None = None,
) -> tuple[float, float]:
    r"""Find a valid range of values.

    Each bound can be given as a number, as a quantile string, or
    left to ``None``. A number is returned unchanged, a quantile
    string is replaced by the associated quantile of ``values``, and
    ``None`` is replaced by the minimum (for ``xmin``) or maximum
    (for ``xmax``) of ``values``. NaN values are ignored when the
    bounds are computed from ``values``.

    Args:
        values: The values used to find the quantiles.
        xmin: The minimum value of the range or its associated
            quantile. A quantile is a string made of one prefix
            character followed by the quantile level in ``[0, 1]``,
            e.g. ``q0.1`` means the 10% quantile.
        xmax: The maximum value of the range or its associated
            quantile, e.g. ``q0.9`` means the 90% quantile.

    Returns:
        The range of values in the format ``(min, max)``.
            It returns ``(nan, nan)`` if the input array is empty.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.utils.range import find_range
    >>> data = np.arange(101)
    >>> find_range(data)
    (0, 100)
    >>> find_range(data, xmin=5, xmax=50)
    (5, 50)
    >>> find_range(data, xmin="q0.1", xmax="q0.9")
    (10.0, 90.0)

    ```
    """
    if not values.size:
        return float("nan"), float("nan")

    lower = np.nanmin(values).item() if xmin is None else xmin
    upper = np.nanmax(values).item() if xmax is None else xmax

    # Only the bounds given as strings need a quantile; the first
    # character of the string is dropped to get the quantile level.
    levels = [float(bound[1:]) for bound in (lower, upper) if isinstance(bound, str)]
    resolved = np.nanquantile(values, levels)
    # With a single quantile, ``resolved`` has one item, so the first
    # item is always for the lower bound and the last for the upper.
    if isinstance(lower, str):
        lower = resolved[0]
    if isinstance(upper, str):
        upper = resolved[-1]

    # Convert the numpy scalars to python numbers.
    return tuple(
        bound.item() if isinstance(bound, np.number) else bound for bound in (lower, upper)
    )
r"""Contain statistics utility functions."""
from __future__ import annotations
__all__ = [
"compute_statistics_continuous",
"compute_statistics_continuous_array",
"compute_statistics_continuous_series",
"quantile",
]
from typing import TYPE_CHECKING
import numpy as np
import polars as pl
from scipy.stats import kurtosis, skew
from arkas.utils.array import nonnan
if TYPE_CHECKING:
from collections.abc import Sequence
def compute_statistics_continuous(data: np.ndarray | pl.Series) -> dict[str, float]:
    r"""Return several descriptive statistics for the data with
    continuous values.

    The computation is delegated to
    ``compute_statistics_continuous_series`` for a ``polars.Series``
    and to ``compute_statistics_continuous_array`` for any other
    input.

    Args:
        data: The data to analyze.

    Returns:
        The descriptive statistics for the input data.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.utils.stats import compute_statistics_continuous
    >>> compute_statistics_continuous(np.arange(101))
    {'count': 101, 'nunique': 101, 'num_non_nulls': 101, 'num_nulls': 0,
     'mean': 50.0, 'std': 29.15...,
     'skewness': 0.0, 'kurtosis': -1.20..., 'min': 0.0, 'q001': 0.1, 'q01': 1.0,
     'q05': 5.0, 'q10': 10.0, 'q25': 25.0, 'median': 50.0, 'q75': 75.0, 'q90': 90.0,
     'q95': 95.0, 'q99': 99.0, 'q999': 99.9, 'max': 100.0, '>0': 100, '<0': 0, '=0': 1}

    ```
    """
    compute = (
        compute_statistics_continuous_series
        if isinstance(data, pl.Series)
        else compute_statistics_continuous_array
    )
    return compute(data)
def compute_statistics_continuous_array(array: np.ndarray) -> dict[str, float]:
    r"""Return several descriptive statistics for an array with
    continuous values.

    The array is flattened and converted to ``float64``. The values
    kept by ``nonnan`` are counted as non-null and are used to
    compute the mean, standard deviation, skewness, kurtosis,
    min/max and quantiles. The ``>0``, ``<0`` and ``=0`` counts are
    computed on the whole flattened array.

    Args:
        array: The data to analyze.

    Returns:
        The descriptive statistics for the input data. If there is
            no non-null value, the statistics are set to NaN and the
            ``>0``, ``<0`` and ``=0`` counts are set to 0.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.utils.stats import compute_statistics_continuous_array
    >>> compute_statistics_continuous_array(np.arange(101))
    {'count': 101, 'nunique': 101, 'num_non_nulls': 101, 'num_nulls': 0,
     'mean': 50.0, 'std': 29.15...,
     'skewness': 0.0, 'kurtosis': -1.20..., 'min': 0.0, 'q001': 0.1, 'q01': 1.0,
     'q05': 5.0, 'q10': 10.0, 'q25': 25.0, 'median': 50.0, 'q75': 75.0, 'q90': 90.0,
     'q95': 95.0, 'q99': 99.0, 'q999': 99.9, 'max': 100.0, '>0': 100, '<0': 0, '=0': 1}

    ```
    """
    flat = array.ravel().astype(np.float64)
    valid = nonnan(flat)
    count = int(flat.size)
    num_non_nulls = int(valid.size)
    stats = {
        "count": count,
        "nunique": int(np.unique(flat).size),
        "num_non_nulls": num_non_nulls,
        "num_nulls": count - num_non_nulls,
    }

    if valid.size == 0:
        # No value to summarize: the statistics are undefined.
        undefined = (
            "mean",
            "std",
            "skewness",
            "kurtosis",
            "min",
            "q001",
            "q01",
            "q05",
            "q10",
            "q25",
            "median",
            "q75",
            "q90",
            "q95",
            "q99",
            "q999",
            "max",
        )
        placeholders = {key: float("nan") for key in undefined}
        return stats | placeholders | {">0": 0, "<0": 0, "=0": 0}

    levels = [0.001, 0.01, 0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.99, 0.999]
    quantiles = quantile(valid, q=levels)
    distribution = {
        "mean": np.mean(valid).item(),
        "std": np.std(valid).item(),
        "skewness": float(skew(valid)),
        "kurtosis": float(kurtosis(valid)),
        "min": np.min(valid).item(),
        "q001": quantiles[0.001],
        "q01": quantiles[0.01],
        "q05": quantiles[0.05],
        "q10": quantiles[0.1],
        "q25": quantiles[0.25],
        "median": np.median(valid).item(),
        "q75": quantiles[0.75],
        "q90": quantiles[0.9],
        "q95": quantiles[0.95],
        "q99": quantiles[0.99],
        "q999": quantiles[0.999],
        "max": np.max(valid).item(),
        # The sign counts use the whole array; NaN entries match none
        # of the comparisons.
        ">0": (flat > 0).sum().item(),
        "<0": (flat < 0).sum().item(),
        "=0": (flat == 0).sum().item(),
    }
    return stats | distribution
def compute_statistics_continuous_series(series: pl.Series) -> dict[str, float]:
    r"""Return several descriptive statistics for a Series with
    continuous values.

    The statistics are computed by
    ``compute_statistics_continuous_array`` on the non-null values of
    the Series. The ``count``, ``nunique``, ``num_nulls`` and
    ``num_non_nulls`` values are then replaced by the values computed
    on the Series itself.

    Args:
        series: The series to analyze.

    Returns:
        The descriptive statistics for the input data.

    Example usage:

    ```pycon

    >>> import polars as pl
    >>> from arkas.utils.stats import compute_statistics_continuous_series
    >>> compute_statistics_continuous_series(pl.Series(list(range(101))))
    {'count': 101, 'nunique': 101, 'num_non_nulls': 101, 'num_nulls': 0,
     'mean': 50.0, 'std': 29.15...,
     'skewness': 0.0, 'kurtosis': -1.20..., 'min': 0.0, 'q001': 0.1, 'q01': 1.0,
     'q05': 5.0, 'q10': 10.0, 'q25': 25.0, 'median': 50.0, 'q75': 75.0, 'q90': 90.0,
     'q95': 95.0, 'q99': 99.0, 'q999': 99.9, 'max': 100.0, '>0': 100, '<0': 0, '=0': 1}

    ```
    """
    count = int(series.shape[0])
    nunique = series.n_unique()
    num_nulls = int(series.null_count())
    overrides = {
        "count": count,
        "nunique": nunique,
        "num_nulls": num_nulls,
        "num_non_nulls": count - num_nulls,
    }
    values = series.drop_nulls().to_numpy()
    # The Series-level counts take precedence over the array-level ones.
    return compute_statistics_continuous_array(values) | overrides
def quantile(array: np.ndarray, q: Sequence[float]) -> dict[float, float]:
    r"""Compute the q-th quantiles of the data.

    The array is flattened and converted to ``float64`` before the
    quantiles are computed.

    Args:
        array: The input data.
        q: The quantiles to compute. Values must be between 0 and 1
            inclusive.

    Returns:
        A dictionary that maps each quantile level to its value.
            All the values are NaN if the input array is empty.

    Example usage:

    ```pycon

    >>> import numpy as np
    >>> from arkas.utils.stats import quantile
    >>> quantile(np.arange(101), q=[0.001, 0.01, 0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.99, 0.999])
    {0.001: 0.1, 0.01: 1.0, 0.05: 5.0, 0.1: 10.0, 0.25: 25.0, 0.75: 75.0,
     0.9: 90.0, 0.95: 95.0, 0.99: 99.0, 0.999: 99.9}

    ```
    """
    flat = array.ravel()
    if not flat.size:
        return {level: float("nan") for level in q}
    values = np.quantile(flat.astype(np.float64), q).tolist()
    return {level: value for level, value in zip(q, values)}
+3
-3

@@ -1,4 +0,4 @@

Metadata-Version: 2.3
Metadata-Version: 2.1
Name: arkas
Version: 0.0.1a9
Version: 0.0.1a10
Summary: Library to evaluate ML model performances

@@ -29,3 +29,3 @@ Home-page: https://github.com/durandtibo/arkas

Requires-Dist: coola (>=0.8.2,<1.0)
Requires-Dist: grizz (>=0.2.0a15,<1.0)
Requires-Dist: grizz (>=0.2.0a16,<1.0)
Requires-Dist: hya (>=0.2,<1.0) ; extra == "all" or extra == "cli"

@@ -32,0 +32,0 @@ Requires-Dist: hydra-core (>=1.3,<2.0) ; extra == "all" or extra == "cli"

[tool.poetry]
name = "arkas"
version = "0.0.1a9"
version = "0.0.1a10"
description = "Library to evaluate ML model performances"

@@ -35,3 +35,3 @@ readme = "README.md"

coola = ">=0.8.2,<1.0"
grizz = ">=0.2.0a15,<1.0"
grizz = ">=0.2.0a16,<1.0"
iden = ">=0.1,<1.0"

@@ -38,0 +38,0 @@ jinja2 = ">=3.0,<4.0"

@@ -14,6 +14,9 @@ r"""Contain DataFrame analyzers."""

"ContentAnalyzer",
"ContinuousColumnAnalyzer",
"DataFrameSummaryAnalyzer",
"MappingAnalyzer",
"NullValueAnalyzer",
"PlotColumnAnalyzer",
"ScatterColumnAnalyzer",
"TemporalNullValueAnalyzer",
"TemporalPlotColumnAnalyzer",

@@ -31,8 +34,11 @@ "TransformAnalyzer",

from arkas.analyzer.content import ContentAnalyzer
from arkas.analyzer.continuous_column import ContinuousColumnAnalyzer
from arkas.analyzer.frame_summary import DataFrameSummaryAnalyzer
from arkas.analyzer.lazy import BaseInNLazyAnalyzer, BaseLazyAnalyzer
from arkas.analyzer.mapping import MappingAnalyzer
from arkas.analyzer.null_value import NullValueAnalyzer
from arkas.analyzer.plot_column import PlotColumnAnalyzer
from arkas.analyzer.scatter_column import ScatterColumnAnalyzer
from arkas.analyzer.temporal_null_value import TemporalNullValueAnalyzer
from arkas.analyzer.temporal_plot_column import TemporalPlotColumnAnalyzer
from arkas.analyzer.transform import TransformAnalyzer

@@ -12,5 +12,8 @@ r"""Contain HTML content generators."""

"ContentGeneratorDict",
"ContinuousSeriesContentGenerator",
"DataFrameSummaryContentGenerator",
"NullValueContentGenerator",
"PlotColumnContentGenerator",
"ScatterColumnContentGenerator",
"TemporalNullValueContentGenerator",
"TemporalPlotColumnContentGenerator",

@@ -23,7 +26,10 @@ ]

from arkas.content.column_cooccurrence import ColumnCooccurrenceContentGenerator
from arkas.content.continuous_series import ContinuousSeriesContentGenerator
from arkas.content.frame_summary import DataFrameSummaryContentGenerator
from arkas.content.mapping import ContentGeneratorDict
from arkas.content.null_value import NullValueContentGenerator
from arkas.content.plot_column import PlotColumnContentGenerator
from arkas.content.scatter_column import ScatterColumnContentGenerator
from arkas.content.temporal_null_value import TemporalNullValueContentGenerator
from arkas.content.temporal_plot_column import TemporalPlotColumnContentGenerator
from arkas.content.vanilla import ContentGenerator

@@ -50,3 +50,3 @@ r"""Contain the base class to implement an output exporter."""

(exist_ok): False
(show_metrics): True
(show_metrics): False
)

@@ -155,3 +155,3 @@

(exist_ok): False
(show_metrics): True
(show_metrics): False
)

@@ -158,0 +158,0 @@

@@ -67,3 +67,3 @@ r"""Contain the implementation of a metric exporter."""

exist_ok: bool = False,
show_metrics: bool = True,
show_metrics: bool = False,
) -> None:

@@ -70,0 +70,0 @@ self._path = sanitize_path(path)

@@ -68,3 +68,3 @@ r"""Contain an exporter that sequentially calls several exporters."""

(exist_ok): False
(show_metrics): True
(show_metrics): False
)

@@ -71,0 +71,0 @@ (1): FigureExporter(

@@ -12,4 +12,6 @@ r"""Contain data outputs."""

"ContentOutput",
"ContinuousSeriesOutput",
"DataFrameSummaryOutput",
"EmptyOutput",
"NullValueOutput",
"Output",

@@ -19,2 +21,3 @@ "OutputDict",

"ScatterColumnOutput",
"TemporalNullValueOutput",
"TemporalPlotColumnOutput",

@@ -28,2 +31,3 @@ ]

from arkas.output.content import ContentOutput
from arkas.output.continuous_series import ContinuousSeriesOutput
from arkas.output.empty import EmptyOutput

@@ -33,5 +37,7 @@ from arkas.output.frame_summary import DataFrameSummaryOutput

from arkas.output.mapping import OutputDict
from arkas.output.null_value import NullValueOutput
from arkas.output.plot_column import PlotColumnOutput
from arkas.output.scatter_column import ScatterColumnOutput
from arkas.output.temporal_null_value import TemporalNullValueOutput
from arkas.output.temporal_plot_column import TemporalPlotColumnOutput
from arkas.output.vanilla import Output

@@ -5,5 +5,25 @@ r"""Contain plotting functionalities."""

__all__ = ["binary_precision_recall_curve", "binary_roc_curve"]
__all__ = [
"bar_discrete",
"bar_discrete_temporal",
"binary_precision_recall_curve",
"binary_roc_curve",
"boxplot_continuous",
"boxplot_continuous_temporal",
"hist_continuous",
"hist_continuous2",
"plot_cdf",
"plot_null_temporal",
]
from arkas.plot.cdf import plot_cdf
from arkas.plot.continuous import (
boxplot_continuous,
boxplot_continuous_temporal,
hist_continuous,
hist_continuous2,
)
from arkas.plot.discrete import bar_discrete, bar_discrete_temporal
from arkas.plot.null_temporal import plot_null_temporal
from arkas.plot.pr import binary_precision_recall_curve
from arkas.plot.roc import binary_roc_curve

@@ -8,2 +8,4 @@ r"""Contain data plotters."""

"ColumnCooccurrencePlotter",
"ContinuousSeriesPlotter",
"NullValuePlotter",
"PlotColumnPlotter",

@@ -13,2 +15,3 @@ "Plotter",

"ScatterColumnPlotter",
"TemporalNullValuePlotter",
"TemporalPlotColumnPlotter",

@@ -19,6 +22,9 @@ ]

from arkas.plotter.column_cooccurrence import ColumnCooccurrencePlotter
from arkas.plotter.continuous_series import ContinuousSeriesPlotter
from arkas.plotter.mapping import PlotterDict
from arkas.plotter.null_value import NullValuePlotter
from arkas.plotter.plot_column import PlotColumnPlotter
from arkas.plotter.scatter_column import ScatterColumnPlotter
from arkas.plotter.temporal_null_value import TemporalNullValuePlotter
from arkas.plotter.temporal_plot_column import TemporalPlotColumnPlotter
from arkas.plotter.vanilla import Plotter

@@ -35,3 +35,3 @@ r"""Contain the implementation of a DataFrame column plotter."""

Args:
state: The state containing the DataFrame to analyze.
state: The state containing the DataFrame to analyze.

@@ -95,9 +95,10 @@ Returns:

fig, ax = plt.subplots(**state.figure_config.get_arg("init", {}))
for col in state.dataframe:
ax.plot(col.to_numpy(), label=col.name)
xmin, xmax = 0, state.dataframe.shape[0] - 1
if xmin < xmax:
ax.set_xlim(xmin, xmax)
if yscale := state.figure_config.get_arg("yscale"):
ax.set_yscale(yscale)
ax.legend()

@@ -104,0 +105,0 @@ fig.tight_layout()

@@ -19,2 +19,3 @@ r"""Contain the implementation of a DataFrame column plotter."""

from arkas.plotter.vanilla import Plotter
from arkas.utils.range import find_range

@@ -95,8 +96,5 @@ if TYPE_CHECKING:

color = state.dataframe[state.color].to_numpy() if state.color else None
s = ax.scatter(
state.dataframe[state.x].to_numpy(),
state.dataframe[state.y].to_numpy(),
c=color,
label=state.color,
)
x = state.dataframe[state.x].to_numpy()
y = state.dataframe[state.y].to_numpy()
s = ax.scatter(x=x, y=y, c=color, label=state.color)
if color is not None:

@@ -106,2 +104,16 @@ fig.colorbar(s)

xmin, xmax = find_range(
x,
xmin=state.figure_config.get_arg("xmin"),
xmax=state.figure_config.get_arg("xmax"),
)
if xmin < xmax:
ax.set_xlim(xmin, xmax)
ymin, ymax = find_range(
y,
xmin=state.figure_config.get_arg("ymin"),
xmax=state.figure_config.get_arg("ymax"),
)
if ymin < ymax:
ax.set_ylim(ymin, ymax)
ax.set_xlabel(state.x)

@@ -108,0 +120,0 @@ ax.set_ylabel(state.y)

@@ -25,2 +25,3 @@ r"""Contain the implementation of a DataFrame column plotter."""

from arkas.plotter.vanilla import Plotter
from arkas.utils.range import find_range

@@ -133,2 +134,10 @@ if TYPE_CHECKING:

xmin, xmax = find_range(
time,
xmin=state.figure_config.get_arg("xmin"),
xmax=state.figure_config.get_arg("xmax"),
)
if xmin < xmax:
ax.set_xlim(xmin, xmax)
ax.set_xlabel(state.temporal_column)
if yscale := state.figure_config.get_arg("yscale"):

@@ -135,0 +144,0 @@ ax.set_yscale(yscale)

@@ -68,3 +68,3 @@ r"""Contain a simple runner to analyze data."""

(exist_ok): False
(show_metrics): True
(show_metrics): False
)

@@ -71,0 +71,0 @@ (lazy): True

@@ -57,3 +57,3 @@ r"""Contain the base class to implement a runner."""

(exist_ok): False
(show_metrics): True
(show_metrics): False
)

@@ -212,3 +212,3 @@ (lazy): True

(exist_ok): False
(show_metrics): True
(show_metrics): False
)

@@ -215,0 +215,0 @@ (lazy): True

@@ -10,4 +10,6 @@ r"""Contain states."""

"DataFrameState",
"NullValueState",
"PrecisionRecallState",
"ScatterDataFrameState",
"SeriesState",
"TemporalDataFrameState",

@@ -20,4 +22,6 @@ ]

from arkas.state.dataframe import DataFrameState
from arkas.state.null_value import NullValueState
from arkas.state.precision_recall import PrecisionRecallState
from arkas.state.scatter_dataframe import ScatterDataFrameState
from arkas.state.series import SeriesState
from arkas.state.temporal_dataframe import TemporalDataFrameState