deepchecks - npm Package Compare versions

Comparing version 0.17.3 to 0.17.4

+324 deepchecks/nlp/utils/text_data_plot.py
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""A module containing utils for displaying information on TextData object."""
from typing import List, Optional
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from deepchecks.nlp.task_type import TaskType, TTextLabel
from deepchecks.nlp.utils.text import break_to_lines_and_trim
from deepchecks.nlp.utils.text_properties import TEXT_PROPERTIES_DESCRIPTION
from deepchecks.utils.dataframes import un_numpy
from deepchecks.utils.distribution.plot import get_density
from deepchecks.utils.plot import feature_distribution_colors
from deepchecks.utils.strings import format_percent, get_docs_link
__all__ = ['text_data_describe_plot']
def _calculate_annotation_ratio(label, n_samples, is_multi_label, task_type):
if label is None:
return format_percent(0)
if is_multi_label or task_type == TaskType.TOKEN_CLASSIFICATION:
annotated_count = _calculate_number_of_annotated_samples(label=label,
is_multi_label=is_multi_label,
task_type=task_type)
return format_percent(annotated_count / n_samples)
else:
return format_percent(pd.notna(label).sum() / n_samples)
def _get_table_row_data(n_samples, annotation_ratio, categorical_metadata, numerical_metadata,
categorical_properties, numerical_properties, max_values_to_show: int = 5):
info_cell = [n_samples, annotation_ratio]
if categorical_metadata is None or len(categorical_metadata) == 0:
info_cell.append('No categorical metadata')
else:
info_cell.append(', '.join(categorical_metadata) if len(categorical_metadata) <= max_values_to_show
else f'{len(categorical_metadata)} metadata columns')
if numerical_metadata is None or len(numerical_metadata) == 0:
info_cell.append('No numerical metadata')
else:
info_cell.append(', '.join(numerical_metadata) if len(numerical_metadata) <= max_values_to_show
else f'{len(numerical_metadata)} metadata columns')
if categorical_properties is None or len(categorical_properties) == 0:
info_cell.append('No categorical properties')
else:
info_cell.append(', '.join(categorical_properties) if len(categorical_properties) <= max_values_to_show
else f'{len(categorical_properties)} properties')
if numerical_properties is None or len(numerical_properties) == 0:
info_cell.append('No numerical properties')
else:
info_cell.append(', '.join(numerical_properties) if len(numerical_properties) <= max_values_to_show
else f'{len(numerical_properties)} properties')
return info_cell
def _generate_table_trace(n_samples, annotation_ratio, categorical_metadata, numerical_metadata,
categorical_properties, numerical_properties):
data_cell = ['<b>Number of samples</b>', '<b>Annotation ratio</b>', '<b>Metadata categorical columns</b>',
'<b>Metadata numerical columns</b>', '<b>Categorical properties</b>', '<b>Numerical properties</b>']
info_cell = _get_table_row_data(n_samples=n_samples, annotation_ratio=annotation_ratio,
categorical_metadata=categorical_metadata, numerical_metadata=numerical_metadata,
categorical_properties=categorical_properties,
numerical_properties=numerical_properties, max_values_to_show=7)
trace = go.Table(header={'fill': {'color': 'white'}},
cells={'values': [data_cell, info_cell], 'align': ['left'], 'font_size': 12,
'height': 30})
return trace
def _generate_categorical_distribution_plot(data, property_name):
dist_counts = data.value_counts(normalize=True).to_dict()
counts = list(dist_counts.values())
categories_list = list(dist_counts.keys())
cat_df = pd.DataFrame({property_name: counts}, index=[un_numpy(cat) for cat in categories_list])
trace = go.Bar(x=cat_df.index, y=cat_df[property_name], showlegend=False,
marker={'color': feature_distribution_colors['feature']},
hovertemplate='<b>Value:</b> %{x}<br><b>Frequency:</b> %{y}<extra></extra>')
yaxis_layout = dict(type='log', title='Frequency (Log Scale)')
xaxis_layout = dict(title=property_name)
return trace, xaxis_layout, yaxis_layout
def _get_distribution_values(data):
mean = data.mean()
median = data.median()
x_range = (data.min(), data.max())
if all(int(x) == x for x in data if x is not None):
# If the distribution is discrete, we take all the values in it:
xs = sorted(np.unique(data))
if len(xs) > 50:
# If there are too many values, we take only 50, using a constant interval between them:
xs = list(range(int(xs[0]), int(xs[-1]) + 1, int((xs[-1] - xs[0]) // 50)))
else:
xs = sorted(np.concatenate((np.linspace(x_range[0], x_range[1], 50),
np.quantile(data, q=np.arange(0.02, 1, 0.02)),
[mean, median]
)))
ixs = np.searchsorted(sorted(data), xs, side='left')
xs = [xs[i] for i in range(len(ixs)) if ixs[i] != ixs[i - 1]]
y_value = get_density(data, xs)
return y_value, xs
def _calculate_number_of_annotated_samples(label, is_multi_label, task_type):
if is_multi_label or task_type == TaskType.TOKEN_CLASSIFICATION:
annotated_count = 0
for label_data in label:
annotated_count = annotated_count + 1 if len(label_data) > 0 and pd.isna(label_data).sum() == 0 \
else annotated_count
return annotated_count
else:
return pd.notna(label).sum()
def _generate_numeric_distribution_plot(data, x_value, y_value, property_name):
mean = data.mean()
percentile_90 = data.quantile(0.9)
percentile_10 = data.quantile(0.1)
median = data.median()
trace = go.Scatter(x=x_value, y=y_value, fill='tozeroy', showlegend=False,
hovertemplate=f'<b>{property_name}:</b> ''%{x}<br><b>Density:</b> %{y}<extra></extra>',
line={'color': feature_distribution_colors['feature'],
'shape': 'linear', 'width': 5})
shapes = []
annotations = []
shapes.append(dict(type='line', x0=mean, y0=0, x1=mean, y1=max(y_value),
line={'color': feature_distribution_colors['measure'], 'dash': 'dash', 'width': 3}))
mean_xpos = mean + max(x_value) * 0.02 if median < mean else mean - max(x_value) * 0.02
annotations.append(dict(x=mean_xpos, y=max(y_value)/2, text='<b>Mean</b>', showarrow=False,
textangle=-90, font={'size': 12}))
shapes.append(dict(type='line', x0=median, y0=0, x1=median, y1=max(y_value),
line={'color': feature_distribution_colors['measure'], 'dash': 'dot', 'width': 3}))
median_xpos = median - max(x_value) * 0.02 if median < mean else median + max(x_value) * 0.02
annotations.append(dict(x=median_xpos, y=max(y_value)/2, text='<b>Median</b>', showarrow=False,
textangle=-90, font={'size': 12}))
shapes.append(dict(type='line', x0=percentile_10, y0=0, x1=percentile_10, y1=max(y_value),
line={'color': feature_distribution_colors['measure'], 'dash': 'dashdot', 'width': 3}))
annotations.append(dict(x=percentile_10 - max(x_value)*0.02, y=max(y_value)/2, textangle=-90,
text='<b>10<sup>th</sup> Percentile</b>', showarrow=False, font={'size': 12}))
shapes.append(dict(type='line', x0=percentile_90, y0=0, x1=percentile_90, y1=max(y_value),
line={'color': feature_distribution_colors['measure'], 'dash': 'dashdot', 'width': 3}))
annotations.append(dict(x=percentile_90 + max(x_value)*0.02, y=max(y_value)/2, textangle=-90,
text='<b>90<sup>th</sup> Percentile</b>', showarrow=False, font={'size': 12}))
xaxis_layout = dict(title=property_name)
yaxis_layout = dict(title='Density')
return trace, shapes, annotations, xaxis_layout, yaxis_layout
def text_data_describe_plot(n_samples: int, max_num_labels_to_show: int,
is_multi_label: bool, task_type: str,
properties: pd.DataFrame,
categorical_metadata: Optional[List[str]] = None,
numerical_metadata: Optional[List[str]] = None,
categorical_properties: Optional[List[str]] = None,
numerical_properties: Optional[List[str]] = None,
model_classes: Optional[List[str]] = None,
label: Optional[TTextLabel] = None):
"""Return a plotly figure instance.
Parameters
----------
properties: pd.DataFrame
The DataFrame consisting of the text properties data. If no properties are available, an
empty DataFrame can be passed as well.
n_samples: int
The total number of samples present in the TextData object.
max_num_labels_to_show : int
The maximum number of labels to display on the label distribution pie chart; the remaining
labels are grouped under an "Others" category.
is_multi_label: bool
True if the TextData contains multi-label data, otherwise False.
task_type: str
The task type for the text data. Can be either 'text_classification' or 'token_classification'.
categorical_metadata: Optional[List[str]], default: None
The names of the categorical metadata columns.
numerical_metadata: Optional[List[str]], default: None
The names of the numerical metadata columns.
categorical_properties: Optional[List[str]], default: None
The names of the categorical properties columns.
numerical_properties: Optional[List[str]], default: None
The names of the numerical text properties columns.
label: Optional[TTextLabel], default: None
The label for the text data. Can be either a text_classification label or a token_classification label.
If None, the label distribution graph is not generated.
- text_classification label - For text classification the accepted label format differs between multilabel and
single label cases. For single label data, the label should be passed as a sequence of labels, with one entry
per sample that can be either a string or an integer. For multilabel data, the label should be passed as a
sequence of sequences, with the sequence for each sample being a binary vector, representing the presence of
the i-th label in that sample.
- token_classification label - For token classification the accepted label format is the IOB format or similar
to it. The Label must be a sequence of sequences of strings or integers, with each sequence corresponding to
a sample in the tokenized text, and exactly the length of the corresponding tokenized text.
model_classes: Optional[List[str]], default: None
List of classes names to use for multi-label display. Only used if the dataset is multi-label.
Returns
-------
Plotly Figure instance.
"""
specs = [[{'type': 'pie'}, {'type': 'table'}] if label is not None else [{'type': 'table', 'colspan': 2}, None]] + \
[[{'type': 'xy', 'colspan': 2}, None] for _ in range(len(properties.columns))]
subplot_titles = []
if label is not None:
annotated_samples = _calculate_number_of_annotated_samples(label, is_multi_label, task_type)
subplot_titles.append(f'Label Distribution<br><sup>Out of {annotated_samples} annotated samples</sup><br><br>')
subplot_titles.append('') # Empty title for table figure
if not properties.empty:
for prop_name in properties:
if prop_name in TEXT_PROPERTIES_DESCRIPTION:
subplot_titles.append(f'{prop_name} Property Distribution<sup><a href="{get_docs_link()}nlp/'
'usage_guides/nlp_properties.html#deepchecks-built-in-properties">&#x24D8;</a>'
f'</sup><br><sup>{TEXT_PROPERTIES_DESCRIPTION[prop_name]}</sup>')
fig = make_subplots(rows=len(properties.columns) + 1, cols=2, specs=specs, subplot_titles=subplot_titles,
row_heights=[1.5] + [1.0] * len(properties.columns))
# Create label distribution if label is provided
if label is not None:
if is_multi_label:
df_label = pd.DataFrame(label).fillna(0)
if model_classes is not None:
hashmap = {}
for val in label:
model_array = np.array([model_classes[i] for i, class_val in enumerate(val) if class_val == 1])
for class_name in model_array:
hashmap[class_name] = hashmap[class_name] + 1 if class_name in hashmap else 1
label_counts = pd.Series(list(hashmap.values()), index=list(hashmap))
else:
label_counts = pd.Series(np.sum(df_label.to_numpy(), axis=0))
elif task_type == TaskType.TOKEN_CLASSIFICATION:
hashmap = {}
for val in label:
flattened_array = pd.Series(np.array(val).flatten()).fillna('NaN').to_numpy()
unique_values, counts = np.unique(flattened_array, return_counts=True)
for label_value, count in zip(unique_values, counts):
if label_value != 'NaN':
hashmap[label_value] = hashmap[label_value] + count if label_value in hashmap else count
label_counts = pd.Series(list(hashmap.values()), index=list(hashmap))
else:
label_counts = pd.Series(label).value_counts()
label_counts.sort_values(ascending=False, inplace=True)
labels_to_display = label_counts[:max_num_labels_to_show]
labels_to_display.index = [break_to_lines_and_trim(str(label)) for label in list(labels_to_display.index)]
count_other_labels = label_counts[max_num_labels_to_show:].sum()
labels_to_display['Others'] = count_other_labels
# Pie chart for label distribution
fig.add_trace(go.Pie(labels=list(labels_to_display.index), values=list(labels_to_display),
textposition='inside', showlegend=False, textinfo='label+percent',
hovertemplate='%{label}: %{value} samples<extra></extra>'), row=1, col=1)
# Table figure for displaying some statistics
annotation_ratio = _calculate_annotation_ratio(label, n_samples, is_multi_label, task_type)
table_trace = _generate_table_trace(n_samples, annotation_ratio, categorical_metadata, numerical_metadata,
categorical_properties, numerical_properties)
fig.add_trace(table_trace, row=1, col=2 if label is not None else 1)
# Looping over all the properties to generate respective property distribution graphs
curr_row = 2 # Since row 1 is occupied with Pie and Table
for property_name in properties.columns:
if property_name in categorical_properties:
# Creating bar plots for categorical properties
trace, xaxis_layout, yaxis_layout = _generate_categorical_distribution_plot(
properties[property_name], property_name
)
fig.add_trace(trace, row=curr_row, col=1)
fig.update_xaxes(xaxis_layout, row=curr_row, col=1)
fig.update_yaxes(yaxis_layout, row=curr_row, col=1)
else:
# Creating scatter plots for numerical properties
y_value, xs = _get_distribution_values(properties[property_name])
trace, shapes, annotations, xaxis_layout, yaxis_layout = _generate_numeric_distribution_plot(
properties[property_name],
xs, y_value, property_name
)
fig.add_trace(trace, row=curr_row, col=1)
for shape, annotation in zip(shapes, annotations):
fig.add_shape(shape, row=curr_row, col=1)
fig.add_annotation(annotation, row=curr_row, col=1)
fig.update_yaxes(yaxis_layout, row=curr_row, col=1)
fig.update_xaxes(xaxis_layout, row=curr_row, col=1)
curr_row += 1
fig.update_layout(height=450*(len(properties.columns) + 1))
return fig
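For orientation, a minimal usage sketch of the new public helper in this file. The toy data and argument values below are illustrative assumptions, not taken from the package docs:

import pandas as pd
from deepchecks.nlp.utils.text_data_plot import text_data_describe_plot

# Toy single-label classification data; 'Text Length' is one of the built-in numeric properties.
properties = pd.DataFrame({'Text Length': [12, 54, 33, 7]})
fig = text_data_describe_plot(
    n_samples=4,
    max_num_labels_to_show=5,
    is_multi_label=False,
    task_type='text_classification',
    properties=properties,
    categorical_properties=[],  # none of the toy properties are categorical
    numerical_properties=['Text Length'],
    label=['spam', 'ham', 'ham', 'spam'],
)
fig.show()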
deepchecks/nlp/utils/text_properties_models.py
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module containing the text properties models for the NLP module."""
import importlib
import pathlib
from functools import lru_cache
from typing import Optional, Union
import requests
from nltk import corpus
MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models'
def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None):
"""Get the models storage directory and create it if needed."""
if models_storage is None:
models_storage = MODELS_STORAGE
else:
if isinstance(models_storage, str):
models_storage = pathlib.Path(models_storage)
if not isinstance(models_storage, pathlib.Path):
raise ValueError(
f'Unexpected type of the "models_storage" parameter - {type(models_storage)}'
)
if not models_storage.exists():
models_storage.mkdir(parents=True)
if not models_storage.is_dir():
raise ValueError('"models_storage" is expected to be a directory')
return models_storage
def _get_transformer_model(
property_name: str,
model_name: str,
device: Optional[str] = None,
quantize_model: bool = False,
models_storage: Union[pathlib.Path, str, None] = None
):
"""Get the transformer model and decide if to use optimum.onnxruntime.
optimum.onnxruntime is used to optimize running times on CPU.
"""
models_storage = get_create_model_storage(models_storage)
if device not in (None, 'cpu'):
transformers = import_optional_property_dependency('transformers', property_name=property_name)
# TODO: quantize if 'quantize_model' is True
return transformers.AutoModelForSequenceClassification.from_pretrained(
model_name,
cache_dir=models_storage,
device_map=device
)
onnx = import_optional_property_dependency(
'optimum.onnxruntime',
property_name=property_name,
error_template=(
f'The device was set to {device} while computing the {property_name} property, '
'in which case deepchecks resorts to accelerating the inference by using optimum, '
'but it is not installed. Either:\n'
'\t- Set the device according to your hardware;\n'
'\t- Install optimum by running "pip install optimum";\n'
'\t- Install all dependencies needed for text properties by running '
'"pip install deepchecks[nlp-properties]";\n'
)
)
if quantize_model is False:
model_path = models_storage / 'onnx' / model_name
if model_path.exists():
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
model = onnx.ORTModelForSequenceClassification.from_pretrained(
model_name,
export=True,
cache_dir=models_storage,
).to(device or -1)
# NOTE:
# 'optimum', after exporting/converting a model to the ONNX format,
# does not store it on disk, so we save it here to avoid reconverting
# it on every call
model.save_pretrained(model_path)
return model
model_path = models_storage / 'onnx' / 'quantized' / model_name
if model_path.exists():
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
not_quantized_model = _get_transformer_model(
property_name,
model_name,
device,
quantize_model=False,
models_storage=models_storage
)
quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model).to(device or -1)
quantizer.quantize(
save_dir=model_path,
# TODO: make it possible to provide a config as a parameter
quantization_config=onnx.configuration.AutoQuantizationConfig.avx512_vnni(
is_static=False,
per_channel=False
)
)
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
def import_optional_property_dependency(
module: str,
property_name: str,
package_name: Optional[str] = None,
error_template: Optional[str] = None
):
"""Import additional modules in runtime."""
try:
lib = importlib.import_module(module)
except ImportError as error:
package_name = package_name or module.split('.', maxsplit=1)[0]
error_template = error_template or (
'property {property_name} requires the {package_name} python package. '
'To get it, run:\n'
'>> pip install {package_name}\n\n'
'You may install dependencies for all text properties by running:\n'
'>> pip install deepchecks[nlp-properties]\n'
)
raise ImportError(error_template.format(
property_name=property_name,
package_name=package_name
)) from error
else:
return lib
def get_transformer_pipeline(
property_name: str,
model_name: str,
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None,
use_cache=False
):
"""Return a transformers pipeline for the given model name."""
if use_cache:
return _get_transformer_pipeline(property_name, model_name, device, models_storage)
# __wrapped__ is simply the function without decoration, in our case - without caching
return _get_transformer_pipeline.__wrapped__(property_name, model_name, device, models_storage)
@lru_cache(maxsize=5)
def _get_transformer_pipeline(
property_name: str,
model_name: str,
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None
):
"""Return a transformers pipeline for the given model name."""
transformers = import_optional_property_dependency('transformers', property_name=property_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, device_map=device)
model = _get_transformer_model(
property_name=property_name,
model_name=model_name,
device=device,
models_storage=models_storage
)
return transformers.pipeline(
'text-classification',
model=model,
tokenizer=tokenizer,
device=device
)
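A hypothetical call tying the helpers above together. The model name matches TOXICITY_MODEL_NAME used further down in this release; on first use the model is downloaded and, on CPU, ONNX-exported into the models storage:

classifier = get_transformer_pipeline(
    property_name='toxicity',
    model_name='unitary/toxic-bert',
    device='cpu',    # CPU path routes through optimum.onnxruntime above
    use_cache=True,  # reuse the lru_cache-d pipeline across repeated calls
)
print(classifier('you are lovely')[0])  # e.g. {'label': 'toxic', 'score': ...}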
def get_cmudict_dict(use_cache=False):
"""Return corpus as dict."""
if use_cache:
return _get_cmudict_dict()
return _get_cmudict_dict.__wrapped__()
@lru_cache(maxsize=1)
def _get_cmudict_dict():
cmudict_dict = corpus.cmudict.dict()
return cmudict_dict
FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
def get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None, use_cache=False):
"""Return fasttext model."""
if use_cache:
return _get_fasttext_model(models_storage)
return _get_fasttext_model.__wrapped__(models_storage)
@lru_cache(maxsize=1)
def _get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None):
"""Return fasttext model."""
fasttext = import_optional_property_dependency(module='fasttext', property_name='language')
model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1]
model_path = get_create_model_storage(models_storage)
model_path = model_path / 'fasttext'
if not model_path.exists():
model_path.mkdir(parents=True)
model_path = model_path / model_name
# Download the model file if it is not already cached
if not model_path.exists():
response = requests.get(FASTTEXT_LANG_MODEL, timeout=240)
if response.status_code != 200:
raise RuntimeError('Failed to download fasttext model')
model_path.write_bytes(response.content)
# Monkey-patch fasttext's eprint to suppress a warning about a deprecated function
fasttext.FastText.eprint = lambda *args, **kwargs: None
fasttext_model = fasttext.load_model(str(model_path))
return fasttext_model
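A small sketch of what the downloaded fasttext language-ID model returns. The sample sentence is illustrative; fasttext labels carry a '__label__' prefix:

model = get_fasttext_model(use_cache=True)
labels, probs = model.predict('This is an English sentence', k=1)
print(labels[0].replace('__label__', ''), float(probs[0]))  # e.g. en 0.97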
+2 -2
Metadata-Version: 2.1
Name: deepchecks
Version: 0.17.3
Version: 0.17.4
Summary: Package for validating your machine learning model and data

@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks

License: UNKNOWN
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.3/deepchecks-0.17.3.tar.gz
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.4/deepchecks-0.17.4.tar.gz
Project-URL: Documentation, https://docs.deepchecks.com

@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks

@@ -9,3 +9,3 @@ pandas>=1.1.5

category-encoders>=2.3.0
scipy>=1.4.1
scipy<=1.10.1,>=1.4.1
plotly>=5.13.1

@@ -46,3 +46,3 @@ matplotlib>=3.3.4

[nlp-properties]
optimum[onnxruntime]>=1.7.0
optimum[onnxruntime]>=1.8.8
fasttext>=0.8.0

@@ -49,0 +49,0 @@

@@ -119,4 +119,6 @@ DESCRIPTION.rst

deepchecks/nlp/utils/text.py
deepchecks/nlp/utils/text_data_plot.py
deepchecks/nlp/utils/text_embeddings.py
deepchecks/nlp/utils/text_properties.py
deepchecks/nlp/utils/text_properties_models.py
deepchecks/nlp/utils/token_classification_utils.py

@@ -123,0 +125,0 @@ deepchecks/nlp/utils/weak_segments.py

@@ -50,3 +50,4 @@ # ----------------------------------------------------------------------------

deepchecks.__version__)
os.environ['DISABLE_DEEPCHECKS_ANONYMOUS_TELEMETRY'] = 'True' # to ignore joblib
except Exception: # pylint: disable=broad-except
pass

@@ -91,2 +91,4 @@ # ----------------------------------------------------------------------------

labels = dataset.label
elif dataset.task_type is TaskType.OTHER:
raise DeepchecksValueError('Check is irrelevant when task type is not specified')
else:

@@ -93,0 +95,0 @@ raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}')

@@ -105,2 +105,5 @@ # ----------------------------------------------------------------------------

sample = random.sample(sample, self.max_chars_to_review_per_sample)
if len(sample) == 0:
percent_special_chars_in_sample[idx] = 0
continue
special_chars_in_sample = [char for char in sample if char in self.special_characters_deny_list]

@@ -107,0 +110,0 @@ percent_special_chars_in_sample[idx] = len(special_chars_in_sample) / len(sample)

@@ -219,2 +219,4 @@ # ----------------------------------------------------------------------------

continue
if isinstance(info, str):
continue
if info['outlier_ratio'] > threshold:

@@ -221,0 +223,0 @@ failed_properties.append(property_name)

@@ -12,3 +12,3 @@ # ----------------------------------------------------------------------------

"""Module of the under annotated segments check."""
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union

@@ -22,3 +22,3 @@ import numpy as np

from deepchecks.core.check_result import DisplayMap
from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck

@@ -35,2 +35,6 @@ from deepchecks.nlp.utils.text import break_to_lines_and_trim

MAX_SAMPLES_IN_FIGURE = 1000
# The annotation-ratio threshold above which UnderAnnotatedSegments considers
# the data well annotated and skips the check
ANNOTATION_RATIO_THRESHOLD = 95.0
MIN_TEXT_SAMPLES = 10 # Min samples to calculate under annotated segments

@@ -43,4 +47,4 @@

ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: int,
segment_minimum_size_ratio: float, n_samples: int,
categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
segment_minimum_size_ratio: float, n_samples: int, n_to_show: int,
categorical_aggregation_threshold: float, multiple_segments_per_feature: bool, **kwargs):
super().__init__(**kwargs)

@@ -55,2 +59,4 @@ self.segment_by = segment_by

self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.annotation_ratio_threshold = ANNOTATION_RATIO_THRESHOLD
self.multiple_segments_per_feature = multiple_segments_per_feature

@@ -67,2 +73,13 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:

score_per_sample = pd.Series([1 - is_label_none(x) for x in text_data.label], index=features.index)
annotation_ratio = round(score_per_sample.sum() * 100 / text_data.n_samples, 2)
if annotation_ratio > self.annotation_ratio_threshold:
display_msg = f'Under annotated {self.segment_by} segments check is skipped since your data ' \
f'annotation ratio is > {self.annotation_ratio_threshold}%. Try increasing the ' \
'annotation_ratio_threshold parameter.'
return CheckResult(value={'message': display_msg}, display=[display_msg])
if text_data.n_samples < MIN_TEXT_SAMPLES:
raise NotEnoughSamplesError(f'Not enough samples to calculate under annotated {self.segment_by} '
f'segments. Minimum {MIN_TEXT_SAMPLES} samples required.')
encoded_dataset = self._target_encode_categorical_features_fill_na(features, score_per_sample,

@@ -74,8 +91,9 @@ cat_features)

score_per_sample=score_per_sample,
scorer_name='Annotation Ratio')
scorer_name='Annotation Ratio',
multiple_segments_per_feature=self.multiple_segments_per_feature)
if len(weak_segments) == 0:
raise DeepchecksProcessError('Check was unable to find under annotated segments. This is expected if '
'your data is well annotated. If this is not the case, try increasing '
f'n_samples or supply more {self.segment_by}.')
display_msg = 'Check was unable to find under annotated segments. Try ' \
f'supplying more {self.segment_by}.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

@@ -233,3 +251,3 @@ check_result_value = self._generate_check_result_value(weak_segments, cat_features, avg_score)

Properties to ignore, if none given checks based on properties variable
n_top_properties : int , default: 10
n_top_properties : Optional[int] , default: 10
Number of properties to use for segment search. Top properties are selected based on feature importance.

@@ -245,2 +263,5 @@ segment_minimum_size_ratio: float , default: 0.05

In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_property : bool , default: False
If True, will allow the same property to be a segmenting feature in multiple segments,
otherwise each property can appear in one segment at most.
"""

@@ -251,3 +272,3 @@

ignore_properties: Union[Hashable, List[Hashable], None] = None,
n_top_properties: int = 15,
n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,

@@ -257,2 +278,3 @@ n_samples: int = 10_000,

n_to_show: int = 3,
multiple_segments_per_property: bool = False,
**kwargs):

@@ -267,2 +289,3 @@ super().__init__(segment_by='properties',

categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)

@@ -289,3 +312,3 @@

Columns to ignore, if none given checks based on columns variable
n_top_columns : int , default: 10
n_top_columns : Optional[int] , default: 10
Number of features to use for segment search. Top columns are selected based on feature importance.

@@ -301,2 +324,5 @@ segment_minimum_size_ratio: float , default: 0.05

In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_column : bool , default: True
If True, will allow the same metadata column to be a segmenting column in multiple segments,
otherwise each metadata column can appear in one segment at most.
"""

@@ -307,3 +333,3 @@

ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: int = 10,
n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,

@@ -313,2 +339,3 @@ n_samples: int = 10_000,

n_to_show: int = 3,
multiple_segments_per_column: bool = True,
**kwargs):

@@ -323,2 +350,3 @@ super().__init__(segment_by='metadata',

categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_column,
**kwargs)

@@ -20,3 +20,3 @@ # ----------------------------------------------------------------------------

from deepchecks.core.check_result import DisplayMap
from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksProcessError
from deepchecks.core.errors import DeepchecksNotSupportedError, NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck

@@ -31,3 +31,5 @@ from deepchecks.nlp.utils.weak_segments import get_relevant_data_table

MIN_TEXT_SAMPLES = 10 # Min samples to calculate weak segments performance
class WeakSegmentsAbstractText(SingleDatasetCheck, WeakSegmentAbstract):

@@ -40,3 +42,4 @@ """Check the performance of the model on different segments of the data."""

score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int,
categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
categorical_aggregation_threshold: float, n_to_show: int,
multiple_segments_per_feature: bool = False, **kwargs):
super().__init__(**kwargs)

@@ -53,2 +56,3 @@ self.segment_by = segment_by

self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.multiple_segments_per_feature = multiple_segments_per_feature

@@ -62,2 +66,5 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:

if text_data.n_samples < MIN_TEXT_SAMPLES:
raise NotEnoughSamplesError(f'Not enough samples to find weak {self.segment_by} segments.'
f' Minimum {MIN_TEXT_SAMPLES} samples required.')
features, cat_features = get_relevant_data_table(text_data, data_type=self.segment_by,

@@ -110,7 +117,9 @@ columns=self.columns, ignore_columns=self.ignore_columns,

feature_rank_for_search=np.asarray(encoded_dataset.features),
dummy_model=dummy_model, scorer=scorer)
dummy_model=dummy_model, scorer=scorer,
multiple_segments_per_feature=self.multiple_segments_per_feature)
if len(weak_segments) == 0:
raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak '
f'segments. Try increasing n_samples or supply more {self.segment_by}.')
display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments. '\
f'Try supplying additional {self.segment_by}.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

@@ -172,2 +181,5 @@ if context.with_display:

In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_property : bool , default: False
If True, will allow the same property to be a segmenting feature in multiple segments,
otherwise each property can appear in one segment at most.
"""

@@ -178,3 +190,3 @@

ignore_properties: Union[Hashable, List[Hashable], None] = None,
n_top_properties: Optional[int] = 15,
n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,

@@ -186,2 +198,3 @@ alternative_scorer: Dict[str, Union[str, Callable]] = None,

n_to_show: int = 3,
multiple_segments_per_property: bool = False,
**kwargs):

@@ -198,2 +211,3 @@ super().__init__(segment_by='properties',

categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)

@@ -242,2 +256,5 @@

In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_column : bool , default: True
If True, will allow the same metadata column to be a segmenting column in multiple segments,
otherwise each metadata column can appear in one segment at most.
"""

@@ -248,3 +265,3 @@

ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: Optional[int] = 15,
n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,

@@ -256,2 +273,3 @@ alternative_scorer: Dict[str, Union[str, Callable]] = None,

n_to_show: int = 3,
multiple_segments_column: bool = True,
**kwargs):

@@ -268,2 +286,3 @@ super().__init__(segment_by='metadata',

categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_column,
**kwargs)

@@ -346,3 +346,5 @@ # ----------------------------------------------------------------------------

self._model_classes = self._observed_classes
get_logger().warning('Could not find model\'s classes, using the observed classes')
get_logger().warning('Could not find model\'s classes, using the observed classes. '
'In order to make sure the classes used by the model are inferred correctly, '
'please use the model_classes argument')
return self._model_classes

@@ -349,0 +351,0 @@

@@ -26,2 +26,3 @@ # ----------------------------------------------------------------------------

from deepchecks.nlp.utils.text import break_to_lines_and_trim
from deepchecks.nlp.utils.text_data_plot import text_data_describe_plot
from deepchecks.nlp.utils.text_embeddings import calculate_builtin_embeddings

@@ -415,2 +416,3 @@ from deepchecks.nlp.utils.text_properties import calculate_builtin_properties, get_builtin_properties_types

include_long_calculation_properties: bool = False,
ignore_non_english_samples_for_english_properties: bool = True,
device: t.Optional[str] = None

@@ -431,2 +433,8 @@ ):

ignored.
ignore_non_english_samples_for_english_properties : bool, default True
Whether to ignore samples that are not in English when calculating English properties. If False, samples
that are not in English will be calculated as well. This parameter is ignored when calculating non-English
properties.
English-only properties WILL NOT work properly on non-English samples, so this parameter should be
set to False only when you are sure that all the samples are in English.
device : str, default None

@@ -443,2 +451,3 @@ The device to use for the calculation. If None, the default device will be used.

include_long_calculation_properties=include_long_calculation_properties,
ignore_non_english_samples_for_english_properties=ignore_non_english_samples_for_english_properties,
device=device

@@ -536,2 +545,10 @@ )

@property
def numerical_properties(self) -> t.List[str]:
"""Return numerical properties names."""
if self._properties is not None:
return [prop for prop in self._properties.columns if prop not in self._cat_properties]
else:
return []
@property
def task_type(self) -> t.Optional[TaskType]:

@@ -741,3 +758,57 @@ """Return the task type.

def describe(self, n_properties_to_show: t.Optional[int] = 4, properties_to_show: t.Optional[t.List[str]] = None,
max_num_labels_to_show: t.Optional[int] = 5, model_classes: t.Optional[t.List[str]] = None):
"""Provide holistic view of the data.
Generates the following plots:
1. Label distribution
2. Statistics about the data such as number of samples, annotation ratio, list of metadata columns, list of
text properties and so on.
3. Property distribution for the text properties defined either by n_properties_to_show or properties_to_show
parameter.
Parameters
----------
n_properties_to_show : int, default: 4
Number of properties to consider for generating property distribution graphs. If properties_to_show
is provided, this value is ignored.
properties_to_show : List[str], default: None
List of property names to consider for generating property distribution graphs. If None, the first
n_properties_to_show properties are used.
max_num_labels_to_show : int, default: 5
The maximum number of labels to display on the label distribution pie chart; the remaining
labels are grouped under an "Others" category.
model_classes : Optional[List[str]], default: None
List of classes names to use for multi-label display. Only used if the dataset is multi-label.
Returns
-------
Plotly Figure instance (also rendered for display).
"""
prop_names = []
all_properties_data = pd.DataFrame()
if self._properties is None and properties_to_show is not None:
raise DeepchecksValueError('No properties exist!')
elif self._properties is not None:
if properties_to_show is not None:
prop_names = [prop for prop in properties_to_show if prop in self.properties.columns]
if len(prop_names) != len(properties_to_show):
raise DeepchecksValueError(f'{set(properties_to_show)-set(prop_names)} '
'properties do not exist in the TextData object')
else:
prop_names = list(self.properties.columns)[:n_properties_to_show]
all_properties_data = self.properties[prop_names]
fig = text_data_describe_plot(properties=all_properties_data, n_samples=self.n_samples,
is_multi_label=self.is_multi_label_classification(), task_type=self.task_type,
categorical_metadata=self.categorical_metadata,
numerical_metadata=self.numerical_metadata,
categorical_properties=self.categorical_properties,
numerical_properties=self.numerical_properties, label=self._label,
model_classes=model_classes,
max_num_labels_to_show=max_num_labels_to_show)
return fig
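A hypothetical end-to-end call of the new describe() method, assuming the standard TextData constructor; the toy samples are illustrative:

from deepchecks.nlp import TextData

data = TextData(raw_text=['good product', 'bad service', 'okay experience'],
                label=['pos', 'neg', 'pos'],
                task_type='text_classification')
data.calculate_builtin_properties()  # property plots need properties to exist
data.describe(n_properties_to_show=2, max_num_labels_to_show=5)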
@contextlib.contextmanager

@@ -744,0 +815,0 @@ def disable_deepchecks_logger():

@@ -189,4 +189,7 @@ # ----------------------------------------------------------------------------

text_embedding = np.average(text_embeddings, axis=0, weights=text_lens)
text_embedding = text_embedding / np.linalg.norm(text_embedding) # normalizes length to 1
if sum(text_lens) == 0:
text_embedding = np.ones((EMBEDDING_DIM, )) * np.nan
else:
text_embedding = np.average(text_embeddings, axis=0, weights=text_lens)
text_embedding = text_embedding / np.linalg.norm(text_embedding) # normalizes length to 1
result_embeddings.append(text_embedding.tolist())

@@ -193,0 +196,0 @@

@@ -13,3 +13,2 @@ # ----------------------------------------------------------------------------

import gc
import importlib
import pathlib

@@ -19,2 +18,3 @@ import re

import warnings
from collections import defaultdict
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union

@@ -24,3 +24,2 @@

import pandas as pd
import requests
import textblob

@@ -30,2 +29,3 @@ from nltk import corpus

from nltk import sent_tokenize, word_tokenize
from tqdm import tqdm
from typing_extensions import TypedDict

@@ -35,5 +35,5 @@

from deepchecks.nlp.utils.text import cut_string, hash_text, normalize_text, remove_punctuation
from deepchecks.nlp.utils.text_properties_models import get_cmudict_dict, get_fasttext_model, get_transformer_pipeline
from deepchecks.utils.function import run_available_kwargs
from deepchecks.utils.ipython import create_progress_bar
from deepchecks.utils.strings import format_list, truncate_string
from deepchecks.utils.strings import SPECIAL_CHARACTERS, format_list

@@ -44,6 +44,8 @@ __all__ = ['calculate_builtin_properties', 'get_builtin_properties_types']

MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models'
FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
DEFAULT_SENTENCE_SAMPLE_SIZE = 300
MAX_CHARS = 512 # Bert accepts max of 512 tokens, so without counting tokens we go for the lower bound.
# all SPECIAL_CHARACTERS - all string.punctuation except for <>[]^_`{|}~ - all whitespace
NON_PUNCTUATION_SPECIAL_CHARS = frozenset(set(SPECIAL_CHARACTERS) - set(r"""!"#$%&'()*+,-./:;=?\@""")
- set(string.whitespace))
textblob_cache = {}

@@ -107,27 +109,2 @@ words_cache = {}

def _import_optional_property_dependency(
module: str,
property_name: str,
package_name: Optional[str] = None,
error_template: Optional[str] = None
):
try:
lib = importlib.import_module(module)
except ImportError as error:
package_name = package_name or module.split('.', maxsplit=1)[0]
error_template = error_template or (
'property {property_name} requires the {package_name} python package. '
'To get it, run:\n'
'>> pip install {package_name}\n\n'
'You may install dependencies for all text properties by running:\n'
'>> pip install deepchecks[nlp-properties]\n'
)
raise ImportError(error_template.format(
property_name=property_name,
package_name=package_name
)) from error
else:
return lib
def _warn_if_missing_nltk_dependencies(dependency: str, property_name: str):

@@ -139,123 +116,2 @@ """Warn if NLTK dependency is missing."""

def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None):
"""Get the models storage directory and create it if needed."""
if models_storage is None:
models_storage = MODELS_STORAGE
else:
if isinstance(models_storage, str):
models_storage = pathlib.Path(models_storage)
if not isinstance(models_storage, pathlib.Path):
raise ValueError(
f'Unexpected type of the "models_storage" parameter - {type(models_storage)}'
)
if not models_storage.exists():
models_storage.mkdir(parents=True)
if not models_storage.is_dir():
raise ValueError('"model_storage" expected to be a directory')
return models_storage
def get_transformer_model(
property_name: str,
model_name: str,
device: Optional[str] = None,
quantize_model: bool = False,
models_storage: Union[pathlib.Path, str, None] = None
):
"""Get the transformer model and decide if to use optimum.onnxruntime.
optimum.onnxruntime is used to optimize running times on CPU.
"""
models_storage = get_create_model_storage(models_storage)
if device not in (None, 'cpu'):
transformers = _import_optional_property_dependency('transformers', property_name=property_name)
# TODO: quantize if 'quantize_model' is True
return transformers.AutoModelForSequenceClassification.from_pretrained(
model_name,
cache_dir=models_storage
)
onnx = _import_optional_property_dependency(
'optimum.onnxruntime',
property_name=property_name,
error_template=(
f'The device was set to {device} while computing the {property_name} property, '
'in which case deepchecks resorts to accelerating the inference by using optimum, '
'but it is not installed. Either:\n'
'\t- Set the device according to your hardware;\n'
'\t- Install optimum by running "pip install optimum";\n'
'\t- Install all dependencies needed for text properties by running '
'"pip install deepchecks[nlp-properties]";\n'
)
)
if quantize_model is False:
model_path = models_storage / 'onnx' / model_name
if model_path.exists():
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path)
model = onnx.ORTModelForSequenceClassification.from_pretrained(
model_name,
export=True,
cache_dir=models_storage
)
# NOTE:
# 'optimum', after exporting/converting a model to the ONNX format,
# does not store it on disk, so we save it here to avoid reconverting
# it on every call
model.save_pretrained(model_path)
return model
model_path = models_storage / 'onnx' / 'quantized' / model_name
if model_path.exists():
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path)
not_quantized_model = get_transformer_model(
property_name,
model_name,
device,
quantize_model=False,
models_storage=models_storage
)
quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model)
quantizer.quantize(
save_dir=model_path,
# TODO: make it possible to provide a config as a parameter
quantization_config=onnx.configuration.AutoQuantizationConfig.avx512_vnni(
is_static=False,
per_channel=False
)
)
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path)
def get_transformer_pipeline(
property_name: str,
model_name: str,
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None
):
"""Return a transformers pipeline for the given model name."""
transformers = _import_optional_property_dependency('transformers', property_name=property_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = get_transformer_model(
property_name=property_name,
model_name=model_name,
device=device,
models_storage=models_storage
)
return transformers.pipeline(
'text-classification',
model=model,
tokenizer=tokenizer,
device=device
)
def text_length(text: str) -> int:

@@ -274,2 +130,7 @@ """Return text length."""

"""Return percentage of special characters (as float between 0 and 1)."""
return len([c for c in text if c in NON_PUNCTUATION_SPECIAL_CHARS]) / len(text) if len(text) != 0 else 0
def percentage_punctuation(text: str) -> float:
"""Return percentage of punctuation (as float between 0 and 1)."""
return len([c for c in text if c in string.punctuation]) / len(text) if len(text) != 0 else 0

@@ -284,32 +145,2 @@

def _get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None):
"""Return fasttext model."""
fasttext = _import_optional_property_dependency(module='fasttext', property_name='language')
model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1]
model_path = get_create_model_storage(models_storage)
model_path = model_path / 'fasttext'
if not model_path.exists():
model_path.mkdir(parents=True)
model_path = model_path / model_name
# Save the model to a file
if not model_path.exists():
response = requests.get(FASTTEXT_LANG_MODEL, timeout=240)
if response.status_code != 200:
raise RuntimeError('Failed to download fasttext model')
model_path.write_bytes(response.content)
# This weird code is to suppress a warning from fasttext about a deprecated function
try:
fasttext.FastText.eprint = lambda *args, **kwargs: None
fasttext_model = fasttext.load_model(str(model_path))
except Exception as exp:
raise exp
return fasttext_model
def language(

@@ -323,5 +154,6 @@ text: str,

return None
# Not recommended, takes a long time. Here only to enable to call this function from outside:
# Load the model if it wasn't received as a parameter. This is done to avoid loading the model
# each time the function is called.
if fasttext_model is None:
fasttext_model = _get_fasttext_model()
fasttext_model = get_fasttext_model()

@@ -335,2 +167,16 @@ # Predictions are the first prediction (k=1), only if the probability is above the threshold

def english_text(
text: str,
lang_certainty_threshold: float = 0.8,
fasttext_model: Optional[Dict[object, Any]] = None,
language_property_result: Optional[str] = None
) -> Union[bool, None]:
"""Return whether text is in English or not."""
if not text:
return None
if language_property_result is None:
language_property_result = language(text, lang_certainty_threshold, fasttext_model)
return language_property_result == 'en'
def sentiment(text: str) -> float:

@@ -358,7 +204,12 @@ """Return float representing sentiment."""

def _predict(text: str, classifier, kind: str) -> float:
def predict_on_batch(text_batch: Sequence[str], classifier,
output_formatter: Callable[[Dict[str, Any]], float]) -> Sequence[float]:
"""Return prediction of huggingface Pipeline classifier."""
try:
# TODO: make this way smarter, and not just a hack. Count tokens, for a start. Then not just sample sentences.
# If text is longer than classifier context window, sample it:
# TODO: make this way smarter, and not just a hack. Count tokens, for a start. Then not just sample sentences.
# If text is longer than classifier context window, sample it:
text_list_to_predict = []
reduced_batch_size = len(text_batch) # Initialize the reduced batch size
retry_count = 0
for text in text_batch:
if len(text) > MAX_CHARS:

@@ -375,26 +226,33 @@ sentences = _sample_for_property(text, mode='sentences', limit=10, return_as_list=True)

text_to_use = cut_string(sentences[0], MAX_CHARS)
text = text_to_use
v = classifier(text)
except Exception: # pylint: disable=broad-except
return np.nan
else:
if not v:
return np.nan
v = v[0]
if kind == 'toxicity':
return v['score']
elif kind == 'fluency':
label_value = 'LABEL_1'
elif kind == 'formality':
label_value = 'formal'
text_list_to_predict.append(text_to_use)
else:
raise ValueError('Unsupported value for "kind" parameter')
return (
v['score']
if v['label'] == label_value
else 1 - v['score']
)
text_list_to_predict.append(text)
while reduced_batch_size >= 1:
try:
if reduced_batch_size == 1 or retry_count == 3:
results = []
for text in text_list_to_predict:
try:
v = classifier(text)[0]
results.append(output_formatter(v))
except Exception: # pylint: disable=broad-except
results.append(np.nan)
return results # Return the results if prediction is successful
v_list = classifier(text_list_to_predict, batch_size=reduced_batch_size)
results = []
for v in v_list:
results.append(output_formatter(v))
return results # Return the results if prediction is successful
except Exception: # pylint: disable=broad-except
reduced_batch_size = max(reduced_batch_size // 2, 1) # Reduce the batch size by half
retry_count += 1
return [np.nan] * len(text_batch) # Prediction failed, return NaN values for the original batch size
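The new predict_on_batch is interleaved with the removed _predict in this hunk; a condensed sketch of its retry strategy, under the assumption that classifier is a HuggingFace pipeline, reads:

import numpy as np

def predict_with_backoff(texts, classifier, max_retries=3):
    """Halve the batch size on every failure; after enough retries, score one by one."""
    batch_size, retries = len(texts), 0
    while True:
        try:
            if batch_size == 1 or retries >= max_retries:
                results = []
                for text in texts:
                    try:
                        results.append(classifier(text)[0])
                    except Exception:  # a single bad sample becomes NaN
                        results.append(np.nan)
                return results
            return classifier(texts, batch_size=batch_size)
        except Exception:  # e.g. an out-of-memory error on the full batch
            batch_size = max(batch_size // 2, 1)
            retries += 1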
TOXICITY_MODEL_NAME = 'unitary/toxic-bert'

@@ -406,7 +264,7 @@ FLUENCY_MODEL_NAME = 'prithivida/parrot_fluency_model'

def toxicity(
text: str,
text_batch: Sequence[str],
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None,
toxicity_classifier: Optional[object] = None
) -> float:
) -> Sequence[float]:
"""Return float representing toxicity."""

@@ -416,11 +274,15 @@ if toxicity_classifier is None:

property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, models_storage=models_storage)
return _predict(text, toxicity_classifier, 'toxicity')
def output_formatter(v):
return v['score']
return predict_on_batch(text_batch, toxicity_classifier, output_formatter)
def fluency(
text: str,
text_batch: Sequence[str],
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None,
fluency_classifier: Optional[object] = None
) -> float:
) -> Sequence[float]:
"""Return float representing fluency."""

@@ -430,11 +292,15 @@ if fluency_classifier is None:

property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, models_storage=models_storage)
return _predict(text, fluency_classifier, 'fluency')
def output_formatter(v):
return v['score'] if v['label'] == 'LABEL_1' else 1 - v['score']
return predict_on_batch(text_batch, fluency_classifier, output_formatter)
def formality(
text: str,
text_batch: Sequence[str],
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None,
formality_classifier: Optional[object] = None
) -> float:
) -> Sequence[float]:
"""Return float representing formality."""

@@ -444,5 +310,9 @@ if formality_classifier is None:

property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, models_storage=models_storage)
return _predict(text, formality_classifier, 'formality')
def output_formatter(v):
return v['score'] if v['label'] == 'formal' else 1 - v['score']
return predict_on_batch(text_batch, formality_classifier, output_formatter)
def lexical_density(text: str) -> float:

@@ -467,3 +337,3 @@ """Return a float representing lexical density.

def unique_noun_count(text: str) -> int:
def unique_noun_count(text: Sequence[str]) -> int:
"""Return the number of unique noun words in the text."""

@@ -491,3 +361,3 @@ if pd.isna(text):

if not nltk_download('cmudict', quiet=True):
_warn_if_missing_nltk_dependencies('cmudict', 'Readability Score')
_warn_if_missing_nltk_dependencies('cmudict', 'Reading Ease')
return np.nan

@@ -622,2 +492,18 @@ cmudict_dict = corpus.cmudict.dict()

def _batch_wrapper(text_batch: Sequence[str], func: Callable, **kwargs) -> List[Any]:
"""Wrap the non-batched properties execution with batches API."""
results = []
language_property_result = []
if 'language_property_result' in kwargs:
language_property_result = kwargs.pop('language_property_result')
language_property_exists = len(language_property_result) > 0
for i, text in enumerate(text_batch):
kwargs['language_property_result'] = language_property_result[i] if language_property_exists else None
results.append(run_available_kwargs(func, text=text, **kwargs))
return results
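A hypothetical use of _batch_wrapper, reusing the per-sample sentiment property defined earlier in this module:

scores = _batch_wrapper(text_batch=['I love it', 'This is awful'], func=sentiment)
# e.g. [0.5, -1.0] -- one TextBlob polarity score per sample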
class TextProperty(TypedDict):

@@ -629,46 +515,46 @@ name: str

DEFAULT_PROPERTIES: Tuple[TextProperty, ...] = (
{'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'},
{'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'},
{'name': 'Max Word Length', 'method': max_word_length, 'output_type': 'numeric'},
{'name': '% Special Characters', 'method': percentage_special_characters, 'output_type': 'numeric'},
{'name': 'Language', 'method': language, 'output_type': 'categorical'},
{'name': 'Sentiment', 'method': sentiment, 'output_type': 'numeric'},
{'name': 'Subjectivity', 'method': subjectivity, 'output_type': 'numeric'},
{'name': 'Average Words Per Sentence', 'method': average_words_per_sentence, 'output_type': 'numeric'},
{'name': 'Readability Score', 'method': readability_score, 'output_type': 'numeric'},
{'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'},
{'name': 'Toxicity', 'method': toxicity, 'output_type': 'numeric'},
{'name': 'Fluency', 'method': fluency, 'output_type': 'numeric'},
{'name': 'Formality', 'method': formality, 'output_type': 'numeric'},
{'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'},
)
DEFAULT_PROPERTIES: Tuple[TextProperty, ...] = \
(
{'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'},
{'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'},
{'name': 'Max Word Length', 'method': max_word_length, 'output_type': 'numeric'},
{'name': '% Special Characters', 'method': percentage_special_characters, 'output_type': 'numeric'},
{'name': '% Punctuation', 'method': percentage_punctuation, 'output_type': 'numeric'},
{'name': 'Language', 'method': language, 'output_type': 'categorical'},
{'name': 'Sentiment', 'method': sentiment, 'output_type': 'numeric'},
{'name': 'Subjectivity', 'method': subjectivity, 'output_type': 'numeric'},
{'name': 'Average Words Per Sentence', 'method': average_words_per_sentence, 'output_type': 'numeric'},
{'name': 'Reading Ease', 'method': readability_score, 'output_type': 'numeric'},
{'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'},
{'name': 'Toxicity', 'method': toxicity, 'output_type': 'numeric'},
{'name': 'Fluency', 'method': fluency, 'output_type': 'numeric'},
{'name': 'Formality', 'method': formality, 'output_type': 'numeric'},
{'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'},
)
ALL_PROPERTIES: Tuple[TextProperty, ...] = (
{'name': 'URLs Count', 'method': urls_count, 'output_type': 'numeric'},
{'name': 'Email Addresses Count', 'method': email_addresses_count,
'output_type': 'numeric'},
{'name': 'Unique URLs Count', 'method': unique_urls_count,
'output_type': 'numeric'},
{'name': 'Unique Email Addresses Count',
'method': unique_email_addresses_count, 'output_type': 'numeric'},
{'name': 'Unique Syllables Count', 'method': unique_syllables_count,
'output_type': 'numeric'},
{'name': 'Reading Time', 'method': reading_time,
'output_type': 'numeric'},
{'name': 'Sentences Count', 'method': sentences_count,
'output_type': 'numeric'},
{'name': 'Average Syllable Length', 'method': average_syllable_length,
'output_type': 'numeric'},
) + DEFAULT_PROPERTIES
ALL_PROPERTIES: Tuple[TextProperty, ...] = \
(
{'name': 'English Text', 'method': english_text, 'output_type': 'categorical'},
{'name': 'URLs Count', 'method': urls_count, 'output_type': 'numeric'},
{'name': 'Email Addresses Count', 'method': email_addresses_count, 'output_type': 'numeric'},
{'name': 'Unique URLs Count', 'method': unique_urls_count, 'output_type': 'numeric'},
{'name': 'Unique Email Addresses Count', 'method': unique_email_addresses_count, 'output_type': 'numeric'},
{'name': 'Unique Syllables Count', 'method': unique_syllables_count, 'output_type': 'numeric'},
{'name': 'Reading Time', 'method': reading_time, 'output_type': 'numeric'},
{'name': 'Sentences Count', 'method': sentences_count, 'output_type': 'numeric'},
{'name': 'Average Syllable Length', 'method': average_syllable_length, 'output_type': 'numeric'},
) + DEFAULT_PROPERTIES
LONG_RUN_PROPERTIES = ('Toxicity', 'Fluency', 'Formality', 'Unique Noun Count')
BATCH_PROPERTIES = ('Toxicity', 'Fluency', 'Formality')
LARGE_SAMPLE_SIZE = 10_000
ENGLISH_ONLY_PROPERTIES = (
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Readability Score',
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Reading Ease',
'Unique Noun Count', 'Unique Syllables Count', 'Sentences Count', 'Average Syllable Length'
)
CMUDICT_PROPERTIES = ('Average Syllable Length', 'Unique Syllables Count', 'Readability Score')
CMUDICT_PROPERTIES = ('Average Syllable Length', 'Unique Syllables Count', 'Reading Ease')

@@ -679,12 +565,21 @@ TEXT_PROPERTIES_DESCRIPTION = {

'Max Word Length': 'Maximum number of characters in a word',
'% Special Characters': 'Percentage of special characters in the text',
'% Special Characters': 'Percentage of special characters in the text. Special characters are non-alphanumeric '
'unicode characters, excluding whitespaces and any of !\"#$%&\'()*+,-./:;=?\\@.',
'% Punctuation': 'Percentage of punctuation characters in the text. Punctuation characters are any of '
'!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~',
'Language': 'Language of the text, using the fasttext language detection model',
'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model',
'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model',
'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model.'
' Ranging from -1 (negative) to 1 (positive)',
'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model. Ranging from 0 '
'(objective) to 1 (subjective)',
'Average Words Per Sentence': 'Average number of words per sentence in the text',
'Readability Score': 'A score calculated based on Flesch reading-ease per text sample',
'Reading Ease': 'How easy to read a text sample is, typically ranges from around 0 (hard to read) to around '
'100 (very easy). Based on Flesch reading-ease score',
'Lexical Density': 'Percentage of unique words in the text',
'Toxicity': 'Toxicity score using unitary/toxic-bert HuggingFace model',
'Fluency': 'Fluency score using prithivida/parrot_fluency_model HuggingFace model',
'Formality': 'Formality score using s-nlp/roberta-base-formality-ranker HuggingFace model',
'Toxicity': 'A measure of how harmful or offensive a text sample is (0 to 1), uses the Detoxify library '
'unitary/toxic-bert model',
'Fluency': 'A measure of the fluency of the text (0 to 1), using the prithivida/parrot_fluency_model'
' model from the authors of the Parrot Paraphraser library',
'Formality': 'The formality / register of the text (0 to 1), using the s-nlp/roberta-base-formality-ranker'
' model by the Skolkovo Institute of Science and Technology',
'Unique Noun Count': 'Number of unique noun words in the text',

@@ -694,2 +589,3 @@ 'URLs Count': 'Number of URLs per text sample',

'Unique URLs Count': 'Number of unique URLs per text sample',
'English Text': 'Whether the text is in English (1) or not (0)',
'Unique Email Addresses Count': 'Number of unique email addresses per text sample',

@@ -778,4 +674,7 @@ 'Unique Syllables Count': 'Number of unique syllables per text sample',

include_long_calculation_properties: bool = False,
ignore_non_english_samples_for_english_properties: bool = True,
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None
models_storage: Union[pathlib.Path, str, None] = None,
batch_size: Optional[int] = 16,
cache_models: bool = False
) -> Tuple[Dict[str, List[float]], Dict[str, str]]:

@@ -791,12 +690,12 @@ """Calculate properties on provided text samples.

together with ignore_properties parameter. Available properties are:
['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', 'Language',
['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', '% Punctuation', 'Language',
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count',
'Readability Score', 'Average Words Per Sentence', 'URLs Count', 'Unique URLs Count', 'Email Address Count',
'Reading Ease', 'Average Words Per Sentence', 'URLs Count', 'Unique URLs Count', 'Email Address Count',
'Unique Email Address Count', 'Unique Syllables Count', 'Reading Time', 'Sentences Count',
'Average Syllable Length']
List of default properties are: ['Text Length', 'Average Word Length', 'Max Word Length',
'% Special Characters', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality',
'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Words Per Sentence']
'% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency',
'Formality', 'Lexical Density', 'Unique Noun Count', 'Reading Ease', 'Average Words Per Sentence']
To calculate all the default properties, the include_properties and ignore_properties parameters should
be None. If you pass either include_properties or ignore_properties then the only the properties specified
be None. If you pass either include_properties or ignore_properties then only the properties specified
in the list will be calculated or ignored.
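A hedged usage sketch of the include/ignore semantics described above, assuming the documented function is `calculate_builtin_properties` from deepchecks.nlp.utils.text_properties:

```python
from deepchecks.nlp.utils.text_properties import calculate_builtin_properties

samples = ['Great product, works as advertised!', 'Terrible. Would not buy again.']

# Only the listed properties are calculated; all others are skipped.
properties, property_types = calculate_builtin_properties(
    samples, include_properties=['Text Length', 'Sentiment'])
print(properties['Text Length'])    # one value per sample
print(property_types['Sentiment'])  # 'numeric'
```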

@@ -812,2 +711,8 @@ Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may

ignored, unless they are specified in the include_properties parameter explicitly.
ignore_non_english_samples_for_english_properties : bool, default True
Whether to ignore samples that are not in English when calculating English properties. If False, samples
that are not in English will be calculated as well. This parameter is ignored when calculating non-English
properties.
English-Only properties WILL NOT work properly on non-English samples, so this parameter should be set to
False only when you are sure that all the samples are in English.
device : str, default None

@@ -819,2 +724,6 @@ The device to use for the calculation. If None, the default device will be used.

Also, if a folder already contains the relevant resources, they are not re-downloaded.
batch_size : int, default 16
The number of samples to process together in each batch.
cache_models : bool, default False
Whether to cache the models used by this function, to save load time on subsequent executions.
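A sketch of the batching/caching parameters documented in this hunk (same assumed `calculate_builtin_properties` entry point, reusing `samples` from the sketch above; Toxicity and Fluency are among the long-running, batched properties):

```python
properties, _ = calculate_builtin_properties(
    samples,
    include_properties=['Toxicity', 'Fluency'],
    include_long_calculation_properties=True,  # both properties are long-running
    batch_size=32,       # samples per batch fed to the transformer pipelines
    cache_models=True,   # keep the models loaded for the next call
    device='cuda',       # assumed string device id, matching Optional[str] above
)
```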

@@ -847,3 +756,3 @@ Returns

if 'fasttext_model' not in kwargs:
kwargs['fasttext_model'] = _get_fasttext_model(models_storage=models_storage)
kwargs['fasttext_model'] = get_fasttext_model(models_storage=models_storage, use_cache=cache_models)

@@ -857,16 +766,18 @@ if 'cmudict_dict' not in kwargs:

calculated_properties[prop] = [np.nan] * len(raw_text)
cmudict_dict = corpus.cmudict.dict()
kwargs['cmudict_dict'] = cmudict_dict
kwargs['cmudict_dict'] = get_cmudict_dict(use_cache=cache_models)
if 'Toxicity' in text_properties_names and 'toxicity_classifier' not in kwargs:
kwargs['toxicity_classifier'] = get_transformer_pipeline(
property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, models_storage=models_storage)
property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device,
models_storage=models_storage, use_cache=cache_models)
if 'Formality' in text_properties_names and 'formality_classifier' not in kwargs:
kwargs['formality_classifier'] = get_transformer_pipeline(
property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, models_storage=models_storage)
property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device,
models_storage=models_storage, use_cache=cache_models)
if 'Fluency' in text_properties_names and 'fluency_classifier' not in kwargs:
kwargs['fluency_classifier'] = get_transformer_pipeline(
property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, models_storage=models_storage)
property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device,
models_storage=models_storage, use_cache=cache_models)

@@ -885,34 +796,52 @@ is_language_property_requested = 'Language' in [prop['name'] for prop in text_properties]

progress_bar = create_progress_bar(
iterable=list(raw_text),
name='Text Samples Calculation',
unit='Text Sample'
)
for text in progress_bar:
progress_bar.set_postfix(
{'Sample': truncate_string(text, max_length=20) if text else 'EMPTY STRING'},
refresh=False
)
if pd.isna(text):
for prop in text_properties:
calculated_properties[prop['name']].append(np.nan)
continue
sample_language = run_available_kwargs(language, text=text, **kwargs)
for i in tqdm(range(0, len(raw_text), batch_size)):
batch = raw_text[i:i + batch_size]
batch_properties = defaultdict(list)
# Filter out missing (NaN) sequences
nan_indices = {i for i, seq in enumerate(batch) if pd.isna(seq)}
filtered_sequences = [e for i, e in enumerate(batch) if i not in nan_indices]
samples_language = _batch_wrapper(text_batch=filtered_sequences, func=language, **kwargs)
if is_language_property_requested:
calculated_properties['Language'].append(sample_language)
batch_properties['Language'].extend(samples_language)
calculated_properties['Language'].extend(samples_language)
kwargs['language_property_result'] = samples_language # Pass the language property to other properties
non_english_indices = set()
if ignore_non_english_samples_for_english_properties:
non_english_indices = {i for i, (seq, lang) in enumerate(zip(filtered_sequences, samples_language))
if lang != 'en'}
for prop in text_properties:
if prop['name'] in import_warnings: # Skip properties that failed to import:
calculated_properties[prop['name']].append(np.nan)
elif sample_language != 'en' and prop['name'] in english_properties_names:
calculated_properties[prop['name']].append(np.nan)
batch_properties[prop['name']].extend([np.nan] * len(batch))
else:
if prop['name'] in english_properties_names \
and ignore_non_english_samples_for_english_properties is True:
filtered_sequences = [e for i, e in enumerate(filtered_sequences) if i not in non_english_indices]
kwargs['batch_size'] = batch_size
try:
value = run_available_kwargs(prop['method'], text=text, **kwargs)
calculated_properties[prop['name']].append(value)
if prop['name'] in BATCH_PROPERTIES:
value = run_available_kwargs(func=prop['method'], text_batch=filtered_sequences, **kwargs)
else:
value = _batch_wrapper(text_batch=filtered_sequences, func=prop['method'], **kwargs)
batch_properties[prop['name']].extend(value)
except ImportError as e:
warnings.warn(warning_message.format(prop['name'], str(e)))
calculated_properties[prop['name']].append(np.nan)
batch_properties[prop['name']].extend([np.nan] * len(batch))
import_warnings.add(prop['name'])
result_index = 0
for index, seq in enumerate(batch):
if index in nan_indices or (index in non_english_indices and
ignore_non_english_samples_for_english_properties and
prop['name'] in english_properties_names):
calculated_properties[prop['name']].append(np.nan)
else:
calculated_properties[prop['name']].append(batch_properties[prop['name']][result_index])
result_index += 1
filtered_sequences = [e for i, e in enumerate(batch) if i not in nan_indices]
# Clear property caches:

@@ -924,3 +853,4 @@ textblob_cache.clear()

# Clean all remaining RAM:
gc.collect()
if not cache_models:
gc.collect()

@@ -927,0 +857,0 @@ if not calculated_properties:

@@ -35,7 +35,9 @@ # ----------------------------------------------------------------------------

_UNSUPPORTED_MODEL_ERROR = (
'Check is relevant for Boosting models of type '
'{supported_models}, but received model of type {model_type}'
)
_UNSUPPORTED_MODEL_ERROR = \
'Check is relevant for Boosting models of type {supported_models}, but received model of type {model_type}'
_NO_MODEL_ERROR = \
'Check is relevant only when receiving the model, but predictions/probabilities were received instead. ' \
'In order to use this check, please pass the model to the run() method.'
_SUPPORTED_CLASSIFICATION_MODELS = (

@@ -82,2 +84,12 @@ 'AdaBoostClassifier',

@classmethod
def _raise_not_supported_model_error(cls, model_class):
if model_class != '_DummyModel':
raise ModelValidationError(cls._UNSUPPORTED_MODEL_ERROR.format(
supported_models=cls._SUPPORTED_MODELS,
model_type=model_class
))
else:
raise ModelValidationError(cls._NO_MODEL_ERROR)
def predict_proba(self, x):

@@ -93,6 +105,3 @@ if self.model_class in ['AdaBoostClassifier', 'GradientBoostingClassifier']:

else:
raise ModelValidationError(self._UNSUPPORTED_MODEL_ERROR.format(
supported_models=self._SUPPORTED_CLASSIFICATION_MODELS,
model_type=self.model_class
))
self._raise_not_supported_model_error(self.model_class)

@@ -110,6 +119,3 @@ def predict(self, x):

else:
raise ModelValidationError(self._UNSUPPORTED_MODEL_ERROR.format(
supported_models=self._SUPPORTED_MODELS,
model_type=self.model_class
))
self._raise_not_supported_model_error(self.model_class)

@@ -130,6 +136,3 @@ @classmethod

else:
raise ModelValidationError(cls._UNSUPPORTED_MODEL_ERROR.format(
supported_models=cls._SUPPORTED_MODELS,
model_type=model_class
))
cls._raise_not_supported_model_error(model_class=model_class)

@@ -136,0 +139,0 @@

@@ -374,3 +374,4 @@ # ----------------------------------------------------------------------------

Used in classification models to flag whether to run the condition on the average of the classes, or on
each class individually
each class individually. If a scorer that returns a single value is used, this parameter
is ignored (it will act as if average=True).
"""

@@ -394,4 +395,9 @@ name = f'Model performance gain over simple model is greater than {format_percent(min_allowed_gain)}'

# If the depth of the nested scores dict is 2, average is not relevant and is set to True
inner_dict = scores[list(scores.keys())[0]]
inner_inner_dict = inner_dict[list(inner_dict.keys())[0]]
force_average = isinstance(inner_inner_dict, Number)
passed_condition = True
if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not average:
if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not average and not force_average:
passed_metrics = {}

@@ -438,3 +444,3 @@ failed_classes = defaultdict(dict)

perfect_metrics = []
if task_type in [TaskType.MULTICLASS, TaskType.BINARY]:
if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not force_average:
scores = average_scores(scores, include_classes)
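The depth heuristic in this hunk is easiest to see on concrete shapes. A hedged illustration (metric, model, and class names are made up):

```python
from numbers import Number

per_class_scores = {'F1': {'my_model': {0: 0.91, 1: 0.78}}}  # depth 3: per-class values
averaged_scores = {'F1': {'my_model': 0.85}}                 # depth 2: single value

def is_forced_average(scores):
    inner_dict = scores[list(scores.keys())[0]]
    inner_inner = inner_dict[list(inner_dict.keys())[0]]
    return isinstance(inner_inner, Number)  # mirrors the force_average check above

assert is_forced_average(per_class_scores) is False
assert is_forced_average(averaged_scores) is True
```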

@@ -441,0 +447,0 @@ for metric, models_scores in scores.items():

@@ -13,3 +13,3 @@ # ----------------------------------------------------------------------------

import warnings
from typing import TYPE_CHECKING, Callable, Dict, List, Union
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

@@ -53,3 +53,3 @@ import numpy as np

Columns to ignore; if none are given, the check is based on the columns variable
n_top_features : int , default: 5
n_top_features : Optional[int] , default: 10
Number of features to use for segment search. Top columns are selected based on feature importance.

@@ -78,2 +78,5 @@ segment_minimum_size_ratio: float , default: 0.05

Random seed for all check internals.
multiple_segments_per_feature : bool , default: True
If True, will allow the same feature to be a segmenting feature in multiple segments,
otherwise each feature can appear in one segment at most.
"""

@@ -85,3 +88,3 @@

ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_features: int = 5,
n_top_features: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,

@@ -95,2 +98,3 @@ alternative_scorer: Dict[str, Union[str, Callable]] = None,

random_state: int = 42,
multiple_segments_per_feature: bool = True,
**kwargs

@@ -116,2 +120,3 @@ ):

self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.multiple_segments_per_feature = multiple_segments_per_feature

@@ -169,3 +174,4 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:

feature_rank_for_search=feature_rank,
dummy_model=dummy_model, scorer=scorer)
dummy_model=dummy_model, scorer=scorer,
multiple_segments_per_feature=self.multiple_segments_per_feature)

@@ -172,0 +178,0 @@ if len(weak_segments) == 0:

@@ -104,3 +104,6 @@ # ----------------------------------------------------------------------------

sample_size = min(self.n_samples, train_dataset.n_samples, test_dataset.n_samples)
if self.n_samples is None:
sample_size = min(train_dataset.n_samples, test_dataset.n_samples)
else:
sample_size = min(train_dataset.n_samples, test_dataset.n_samples, self.n_samples)

@@ -107,0 +110,0 @@ headnote = """

@@ -294,4 +294,7 @@ # ----------------------------------------------------------------------------

# If infer_task_type didn't find classes on the model, or the user didn't pass any, then use the observed
get_logger().warning('Could not find model\'s classes, using the observed classes')
return self.observed_classes
get_logger().warning('Could not find model\'s classes, using the observed classes. '
'In order to make sure the classes used by the model are inferred correctly, '
'please use the model_classes argument')
self._model_classes = self.observed_classes
return self._model_classes

@@ -298,0 +301,0 @@

@@ -34,5 +34,5 @@ # ----------------------------------------------------------------------------

if with_display:
fig = create_confusion_matrix_figure(result, total_classes, normalize_display)
displays = create_confusion_matrix_figure(result, total_classes, normalize_display)
else:
fig = None
displays = None

@@ -42,3 +42,3 @@ # For accessing the class names from the condition

return CheckResult(result, display=fig)
return CheckResult(result, display=displays)

@@ -65,27 +65,44 @@

"""
confusion_matrix_norm = confusion_matrix_data.astype('float') / \
(confusion_matrix_data.sum(axis=1)[:, np.newaxis] + np.finfo(float).eps) * 100
if normalize_display:
confusion_matrix_norm = confusion_matrix_data.astype('float') / \
(confusion_matrix_data.sum(axis=1)[:, np.newaxis] + np.finfo(float).eps) * 100
z = np.vectorize(format_number_if_not_nan)(confusion_matrix_norm)
text_template = '%{z}%<br>(%{text})'
color_bar_title = '% out of<br>True Values'
plot_title = 'Percent Out of True Values (Count)'
else:
z = confusion_matrix_data
color_bar_title = None
text_template = '%{text}'
plot_title = 'Value Count'
fig = go.Figure(data=go.Heatmap(
x=classes_names, y=classes_names, z=z,
text=confusion_matrix_data, texttemplate=text_template))
fig.data[0].colorbar.title = color_bar_title
fig.update_layout(title=plot_title)
accuracy_array = np.diag(confusion_matrix_norm).round(decimals=2)
display = []
display_msg = f'The overall accuracy of your model is: {round(np.sum(accuracy_array)/len(accuracy_array), 2)}%.'
if min(accuracy_array) < 100:
display_msg += f'<br>Best accuracy achieved on samples with <b>{classes_names[np.argmax(accuracy_array)]}' \
f'</b> label ({np.max(accuracy_array)}%).'
display_msg += f'<br>Worst accuracy achieved on samples with <b>{classes_names[np.argmin(accuracy_array)]}' \
f'</b> label ({np.min(accuracy_array)}%).'
display.append(display_msg)
total_samples = np.nansum(confusion_matrix_data)
percent_data_each_row = np.round(confusion_matrix_norm, decimals=2)
percent_data_each_cell = np.round(np.divide(np.nan_to_num(confusion_matrix_data, nan=0.0), total_samples) * 100,
decimals=2)
percent_data_each_col = (confusion_matrix_data.astype('float') /
(confusion_matrix_data.sum(axis=0)[:, np.newaxis] +
np.finfo(float).eps) * 100).round(decimals=2)
custom_hoverdata = np.dstack((percent_data_each_cell, percent_data_each_row, percent_data_each_col))
fig = go.Figure(data=go.Heatmap(x=classes_names, y=classes_names, z=z, customdata=custom_hoverdata,
xgap=1, ygap=1, text=confusion_matrix_data, texttemplate='%{text}',
hovertemplate='% out of all data: <b>%{customdata[0]}%</b><br>% out '
'of row: <b>%{customdata[1]}%</b><br>% out of column: '
'<b>%{customdata[2]}%</b><extra></extra>',
showscale=False))
fig.update_layout(title='Confusion Matrix (# Samples)', title_x=0.5)
fig.update_layout(height=600)
fig.update_xaxes(title='Predicted Value', type='category', scaleanchor='y', constrain='domain')
fig.update_yaxes(title='True Value', type='category', constrain='domain', autorange='reversed')
display.append(fig)
return display
return fig
def misclassified_samples_lower_than_condition(value: pd.DataFrame,

@@ -92,0 +109,0 @@ misclassified_samples_threshold: float) -> ConditionResult:
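The three hover percentages added in this hunk are worth a worked example. A small sketch with plain numpy (a hypothetical 2x2 matrix; rows are true labels, columns are predictions):

```python
import numpy as np

cm = np.array([[40, 10],
               [ 5, 45]])
pct_of_all = cm / cm.sum() * 100                        # cell / all samples
pct_of_row = cm / cm.sum(axis=1, keepdims=True) * 100   # cell / true-class total
pct_of_col = cm / cm.sum(axis=0, keepdims=True) * 100   # cell / predicted-class total
# Top-left cell: 40.0% of all data, 80.0% of its row, ~88.9% of its column.
```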

@@ -84,3 +84,3 @@ # ----------------------------------------------------------------------------

with_display=with_display,
dataset_names=(test_dataframe_name, train_dataframe_name)
dataset_names=(train_dataframe_name, test_dataframe_name)
)

@@ -87,0 +87,0 @@

@@ -173,4 +173,4 @@ # ----------------------------------------------------------------------------

feature_rank_for_search: Optional[np.ndarray] = None,
dummy_model: Optional[_DummyModel] = None,
scorer: Optional[DeepcheckScorer] = None, scorer_name: Optional[str] = None) \
dummy_model: Optional[_DummyModel] = None, scorer: Optional[DeepcheckScorer] = None,
scorer_name: Optional[str] = None, multiple_segments_per_feature: bool = False) \
-> pd.DataFrame:

@@ -217,8 +217,21 @@ """Search for weak segments based on scorer."""

# Drop duplicates without considering column 'Samples in Segment'
result_no_duplicates = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
result_no_duplicates['Samples in Segment'] = weak_segments.loc[result_no_duplicates.index, 'Samples in Segment']
return result_no_duplicates.sort_values(score_title).reset_index(drop=True)
# Sort and drop relevant columns
weak_segments = weak_segments.sort_values(score_title).reset_index(drop=True)
if multiple_segments_per_feature:
    result = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
    result['Samples in Segment'] = weak_segments.loc[result.index, 'Samples in Segment']
else:
    used_features = set()
    result = pd.DataFrame(columns=weak_segments.columns)
    for _, row in weak_segments.iterrows():
        if row['Feature1'] in used_features or row['Feature2'] in used_features:
            continue
        result.loc[len(result)] = row
        used_features.add(row['Feature1'])
        if row['Feature2'] != '':
            used_features.add(row['Feature2'])
return result
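A toy illustration of the greedy one-segment-per-feature filter above (column names follow the Feature1/Feature2 convention in this hunk; the score column title is illustrative):

```python
import pandas as pd

weak_segments = pd.DataFrame({
    'Accuracy Score': [0.61, 0.64, 0.70],   # already sorted, weakest first
    'Feature1': ['age', 'age', 'income'],
    'Feature2': ['income', '', ''],
})
# Greedy scan: the first row claims both 'age' and 'income', so with
# multiple_segments_per_feature=False the remaining two rows are skipped.
```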
def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series,

@@ -335,2 +348,5 @@ label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None,

def condition(result: Dict) -> ConditionResult:
if 'message' in result:
return ConditionResult(ConditionCategory.PASS, result['message'])
weakest_segment_score = result['weak_segments_list'].iloc[0, 0]

@@ -337,0 +353,0 @@ scorer_name = result['weak_segments_list'].columns[0].lower()

@@ -34,2 +34,7 @@ # ----------------------------------------------------------------------------

feature_distribution_colors = {
'measure': '#00008b', # dark blue
'feature': 'rgba(105, 179, 162, 1)'
}
common_and_outlier_colors = {'common': 'rgba(105, 179, 162, 1)',

@@ -36,0 +41,0 @@ 'outliers': 'rgba(179, 106, 106, 1)',

@@ -148,3 +148,3 @@ # ----------------------------------------------------------------------------

description.append(
description.extend(
create_confusion_matrix_figure(confusion_matrix, x, self.normalized)

@@ -151,0 +151,0 @@ )

@@ -69,2 +69,5 @@ # ----------------------------------------------------------------------------

For each categorical property, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_property : bool , default: True
If True, will allow the same property to be a segmenting feature in multiple segments,
otherwise each property can appear in one segment at most.
{additional_check_init_params:2*indent}

@@ -80,2 +83,3 @@ """

categorical_aggregation_threshold: float = 0.05,
multiple_segments_per_property: bool = True,
**kwargs

@@ -91,2 +95,3 @@ ):

self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.multiple_segments_per_property = multiple_segments_per_property
self._properties_results = None

@@ -142,3 +147,4 @@ self._sample_scores = None

score_per_sample=score_per_sample_col,
scorer_name=self._scorer_name)
scorer_name=self._scorer_name,
multiple_segments_per_feature=self.multiple_segments_per_property)
if len(weak_segments) == 0:

@@ -145,0 +151,0 @@ raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak '

@@ -228,3 +228,5 @@ # ----------------------------------------------------------------------------

while jump_size > 1:
if font.getsize(text)[0] < desired_width:
left, _, right, _ = font.getbbox(text)
width = right - left
if width < desired_width:
font_size += jump_size
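The getsize-to-getbbox change here tracks Pillow's API: ImageFont.getsize was deprecated in Pillow 9.2 and removed in 10.0, while getbbox returns a (left, top, right, bottom) tuple. A minimal sketch of the replacement width calculation:

```python
from PIL import ImageFont

font = ImageFont.load_default()
left, _, right, _ = font.getbbox('deepchecks')
width = right - left  # replaces the removed font.getsize(text)[0]
```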

@@ -270,4 +272,4 @@ else:

size = (int(image.size[0] * factor), int(image.size[1] * factor))
# Resize the image
image = image.resize(size, pilimage.ANTIALIAS)
# Resize the image using Image.LANCZOS resampling (Image.ANTIALIAS was removed in Pillow 10)
image = image.resize(size, pilimage.LANCZOS)
else:

@@ -274,0 +276,0 @@ image = ensure_image(image, copy=False)

Metadata-Version: 2.1
Name: deepchecks
Version: 0.17.3
Version: 0.17.4
Summary: Package for validating your machine learning model and data

@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks

License: UNKNOWN
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.3/deepchecks-0.17.3.tar.gz
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.4/deepchecks-0.17.4.tar.gz
Project-URL: Documentation, https://docs.deepchecks.com

@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks

@@ -111,3 +111,3 @@ <!--

pip install deepchecks-installer
deepchecks-installer monitoring-install
deepchecks-installer install-monitoring
```

@@ -374,2 +374,2 @@

This project follows the [all-contributors](https://allcontributors.org)
specification. Contributions of any kind are welcome!

@@ -40,3 +40,5 @@ twine

requests>=2.23.0
scipy>=1.4.1
# Remove the <=1.10.1 dependency below once sklearn's issue is fixed. The higher version causes
# issues with sklearn's _most_frequent() function using scipy's mode() function
scipy>=1.4.1, <=1.10.1
tqdm>=4.41.0

@@ -43,0 +45,0 @@ seaborn>=0.11.0

@@ -1,2 +0,2 @@

optimum[onnxruntime]>=1.7.0
optimum[onnxruntime]>=1.8.8
fasttext>=0.8.0

@@ -28,3 +28,5 @@ pandas>=1.1.5

statsmodels>=0.13.5; python_version >= '3.7'
scipy>=1.4.1
# Remove the <=1.10.1 dependency below once sklearn's issue is fixed. The higher version causes
# issues with sklearn's _most_frequent() function using scipy's mode() function
scipy>=1.4.1, <=1.10.1
dataclasses>=0.6; python_version < '3.7'

@@ -31,0 +33,0 @@ plotly>=5.13.1

@@ -1,1 +0,1 @@

0.17.3
0.17.4