deepchecks - npm Package Compare versions

Comparing version 0.17.3 to 0.17.4

+324 deepchecks/nlp/utils/text_data_plot.py
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""A module containing utils for displaying information on TextData object."""
from typing import List, Optional
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from deepchecks.nlp.task_type import TaskType, TTextLabel
from deepchecks.nlp.utils.text import break_to_lines_and_trim
from deepchecks.nlp.utils.text_properties import TEXT_PROPERTIES_DESCRIPTION
from deepchecks.utils.dataframes import un_numpy
from deepchecks.utils.distribution.plot import get_density
from deepchecks.utils.plot import feature_distribution_colors
from deepchecks.utils.strings import format_percent, get_docs_link
__all__ = ['text_data_describe_plot']
def _calculate_annotation_ratio(label, n_samples, is_multi_label, task_type):
if label is None:
return format_percent(0)
if is_multi_label or task_type == TaskType.TOKEN_CLASSIFICATION:
annotated_count = _calculate_number_of_annotated_samples(label=label,
is_multi_label=is_multi_label,
task_type=task_type)
return format_percent(annotated_count / n_samples)
else:
return format_percent(pd.notna(label).sum() / n_samples)
def _get_table_row_data(n_samples, annotation_ratio, categorical_metadata, numerical_metadata,
categorical_properties, numerical_properties, max_values_to_show: int = 5):
info_cell = [n_samples, annotation_ratio]
if categorical_metadata is None or len(categorical_metadata) == 0:
info_cell.append('No categorical metadata')
else:
info_cell.append(', '.join(categorical_metadata) if len(categorical_metadata) <= max_values_to_show
else f'{len(categorical_metadata)} metadata columns')
if numerical_metadata is None or len(numerical_metadata) == 0:
info_cell.append('No numerical metadata')
else:
info_cell.append(', '.join(numerical_metadata) if len(numerical_metadata) <= max_values_to_show
else f'{len(numerical_metadata)} metadata columns')
if categorical_properties is None or len(categorical_properties) == 0:
info_cell.append('No categorical properties')
else:
info_cell.append(', '.join(categorical_properties) if len(categorical_properties) <= max_values_to_show
else f'{len(categorical_properties)} properties')
if numerical_properties is None or len(numerical_properties) == 0:
info_cell.append('No numerical properties')
else:
info_cell.append(', '.join(numerical_properties) if len(numerical_properties) <= max_values_to_show
else f'{len(numerical_properties)} properties')
return info_cell
def _generate_table_trace(n_samples, annotation_ratio, categorical_metadata, numerical_metadata,
categorical_properties, numerical_properties):
data_cell = ['<b>Number of samples</b>', '<b>Annotation ratio</b>', '<b>Metadata categorical columns</b>',
'<b>Metadata numerical columns</b>', '<b>Categorical properties</b>', '<b>Numerical properties</b>']
info_cell = _get_table_row_data(n_samples=n_samples, annotation_ratio=annotation_ratio,
categorical_metadata=categorical_metadata, numerical_metadata=numerical_metadata,
categorical_properties=categorical_properties,
numerical_properties=numerical_properties, max_values_to_show=7)
trace = go.Table(header={'fill': {'color': 'white'}},
cells={'values': [data_cell, info_cell], 'align': ['left'], 'font_size': 12,
'height': 30})
return trace
def _generate_categorical_distribution_plot(data, property_name):
dist_counts = data.value_counts(normalize=True).to_dict()
counts = list(dist_counts.values())
categories_list = list(dist_counts.keys())
cat_df = pd.DataFrame({property_name: counts}, index=[un_numpy(cat) for cat in categories_list])
trace = go.Bar(x=cat_df.index, y=cat_df[property_name], showlegend=False,
marker={'color': feature_distribution_colors['feature']},
hovertemplate='<b>Value:</b> %{x}<br><b>Frequency:</b> %{y}<extra></extra>')
yaxis_layout = dict(type='log', title='Frequency (Log Scale)')
xaxis_layout = dict(title=property_name)
return trace, xaxis_layout, yaxis_layout
def _get_distribution_values(data):
mean = data.mean()
median = data.median()
x_range = (data.min(), data.max())
if all(int(x) == x for x in data if x is not None):
# If the distribution is discrete, we take all the values in it:
xs = sorted(np.unique(data))
if len(xs) > 50:
# If there are too many values, we take only 50, using a constant interval between them:
xs = list(range(int(xs[0]), int(xs[-1]) + 1, int((xs[-1] - xs[0]) // 50)))
else:
xs = sorted(np.concatenate((np.linspace(x_range[0], x_range[1], 50),
np.quantile(data, q=np.arange(0.02, 1, 0.02)),
[mean, median]
)))
ixs = np.searchsorted(sorted(data), xs, side='left')
xs = [xs[i] for i in range(len(ixs)) if ixs[i] != ixs[i - 1]]
y_value = get_density(data, xs)
return y_value, xs
def _calculate_number_of_annotated_samples(label, is_multi_label, task_type):
if is_multi_label or task_type == TaskType.TOKEN_CLASSIFICATION:
annotated_count = 0
for label_data in label:
annotated_count = annotated_count + 1 if len(label_data) > 0 and pd.isna(label_data).sum() == 0 \
else annotated_count
return annotated_count
else:
return pd.notna(label).sum()
def _generate_numeric_distribution_plot(data, x_value, y_value, property_name):
mean = data.mean()
percentile_90 = data.quantile(0.9)
percentile_10 = data.quantile(0.1)
median = data.median()
trace = go.Scatter(x=x_value, y=y_value, fill='tozeroy', showlegend=False,
hovertemplate=f'<b>{property_name}:</b> ''%{x}<br><b>Density:</b> %{y}<extra></extra>',
line={'color': feature_distribution_colors['feature'],
'shape': 'linear', 'width': 5})
shapes = []
annotations = []
shapes.append(dict(type='line', x0=mean, y0=0, x1=mean, y1=max(y_value),
line={'color': feature_distribution_colors['measure'], 'dash': 'dash', 'width': 3}))
mean_xpos = mean + max(x_value) * 0.02 if median < mean else mean - max(x_value) * 0.02
annotations.append(dict(x=mean_xpos, y=max(y_value)/2, text='<b>Mean</b>', showarrow=False,
textangle=-90, font={'size': 12}))
shapes.append(dict(type='line', x0=median, y0=0, x1=median, y1=max(y_value),
line={'color': feature_distribution_colors['measure'], 'dash': 'dot', 'width': 3}))
median_xpos = median - max(x_value) * 0.02 if median < mean else median + max(x_value) * 0.02
annotations.append(dict(x=median_xpos, y=max(y_value)/2, text='<b>Median</b>', showarrow=False,
textangle=-90, font={'size': 12}))
shapes.append(dict(type='line', x0=percentile_10, y0=0, x1=percentile_10, y1=max(y_value),
line={'color': feature_distribution_colors['measure'], 'dash': 'dashdot', 'width': 3}))
annotations.append(dict(x=percentile_10 - max(x_value)*0.02, y=max(y_value)/2, textangle=-90,
text='<b>10<sup>th</sup> Percentile</b>', showarrow=False, font={'size': 12}))
shapes.append(dict(type='line', x0=percentile_90, y0=0, x1=percentile_90, y1=max(y_value),
line={'color': feature_distribution_colors['measure'], 'dash': 'dashdot', 'width': 3}))
annotations.append(dict(x=percentile_90 + max(x_value)*0.02, y=max(y_value)/2, textangle=-90,
text='<b>90<sup>th</sup> Percentile</b>', showarrow=False, font={'size': 12}))
xaxis_layout = dict(title=property_name)
yaxis_layout = dict(title='Density')
return trace, shapes, annotations, xaxis_layout, yaxis_layout
def text_data_describe_plot(n_samples: int, max_num_labels_to_show: int,
is_multi_label: bool, task_type: str,
properties: pd.DataFrame,
categorical_metadata: Optional[List[str]] = None,
numerical_metadata: Optional[List[str]] = None,
categorical_properties: Optional[List[str]] = None,
numerical_properties: Optional[List[str]] = None,
model_classes: Optional[List[str]] = None,
label: Optional[TTextLabel] = None):
"""Return a plotly figure instance.
Parameters
----------
properties: pd.DataFrame
The DataFrame consisting of the text properties data. If no properties are available, an
empty DataFrame can be passed as well.
n_samples: int
The total number of samples present in the TextData object.
max_num_labels_to_show : int
The maximum number of labels to display on the label distribution pie chart; the remaining
labels are grouped under an "Others" category.
is_multi_label: bool
True if the TextData contains multi-label data, otherwise False.
task_type: str
The task type for the text data. Can be either 'text_classification' or 'token_classification'.
categorical_metadata: Optional[List[str]], default: None
The names of the categorical metadata columns.
numerical_metadata: Optional[List[str]], default: None
The names of the numerical metadata columns.
categorical_properties: Optional[List[str]], default: None
The names of the categorical properties columns.
numerical_properties: Optional[List[str]], default: None
The names of the numerical text properties columns.
label: Optional[TTextLabel], default: None
The label for the text data. Can be either a text_classification label or a token_classification label.
If None, the label distribution graph is not generated.
- text_classification label - For text classification the accepted label format differs between multilabel and
single label cases. For single label data, the label should be passed as a sequence of labels, with one entry
per sample that can be either a string or an integer. For multilabel data, the label should be passed as a
sequence of sequences, with the sequence for each sample being a binary vector, representing the presence of
the i-th label in that sample.
- token_classification label - For token classification the accepted label format is the IOB format or similar
to it. The Label must be a sequence of sequences of strings or integers, with each sequence corresponding to
a sample in the tokenized text, and exactly the length of the corresponding tokenized text.
model_classes: Optional[List[str]], default: None
List of classes names to use for multi-label display. Only used if the dataset is multi-label.
Returns
-------
Plotly Figure instance.
"""
specs = [[{'type': 'pie'}, {'type': 'table'}] if label is not None else [{'type': 'table', 'colspan': 2}, None]] + \
[[{'type': 'xy', 'colspan': 2}, None] for _ in range(len(properties.columns))]
subplot_titles = []
if label is not None:
annotated_samples = _calculate_number_of_annotated_samples(label, is_multi_label, task_type)
subplot_titles.append(f'Label Distribution<br><sup>Out of {annotated_samples} annotated samples</sup><br><br>')
subplot_titles.append('') # Empty title for table figure
if not properties.empty:
for prop_name in properties:
if prop_name in TEXT_PROPERTIES_DESCRIPTION:
subplot_titles.append(f'{prop_name} Property Distribution<sup><a href="{get_docs_link()}nlp/'
'usage_guides/nlp_properties.html#deepchecks-built-in-properties">&#x24D8;</a>'
f'</sup><br><sup>{TEXT_PROPERTIES_DESCRIPTION[prop_name]}</sup>')
fig = make_subplots(rows=len(properties.columns) + 1, cols=2, specs=specs, subplot_titles=subplot_titles,
row_heights=[1.5] + [1.0] * len(properties.columns))
# Create label distribution if label is provided
if label is not None:
if is_multi_label:
df_label = pd.DataFrame(label).fillna(0)
if model_classes is not None:
hashmap = {}
for val in label:
model_array = np.array([model_classes[i] for i, class_val in enumerate(val) if class_val == 1])
for class_name in model_array:
hashmap[class_name] = hashmap[class_name] + 1 if class_name in hashmap else 1
label_counts = pd.Series(list(hashmap.values()), index=list(hashmap))
else:
label_counts = pd.Series(np.sum(df_label.to_numpy(), axis=0))
elif task_type == TaskType.TOKEN_CLASSIFICATION:
hashmap = {}
for val in label:
flattened_array = pd.Series(np.array(val).flatten()).fillna('NaN').to_numpy()
unique_values, counts = np.unique(flattened_array, return_counts=True)
for label_value, count in zip(unique_values, counts):
if label_value != 'NaN':
hashmap[label_value] = hashmap[label_value] + count if label_value in hashmap else count
label_counts = pd.Series(list(hashmap.values()), index=list(hashmap))
else:
label_counts = pd.Series(label).value_counts()
label_counts.sort_values(ascending=False, inplace=True)
labels_to_display = label_counts[:max_num_labels_to_show]
labels_to_display.index = [break_to_lines_and_trim(str(label)) for label in list(labels_to_display.index)]
count_other_labels = label_counts[max_num_labels_to_show:].sum()
labels_to_display['Others'] = count_other_labels
# Pie chart for label distribution
fig.add_trace(go.Pie(labels=list(labels_to_display.index), values=list(labels_to_display),
textposition='inside', showlegend=False, textinfo='label+percent',
hovertemplate='%{label}: %{value} samples<extra></extra>'), row=1, col=1)
# Table figure for displaying some statistics
annotation_ratio = _calculate_annotation_ratio(label, n_samples, is_multi_label, task_type)
table_trace = _generate_table_trace(n_samples, annotation_ratio, categorical_metadata, numerical_metadata,
categorical_properties, numerical_properties)
fig.add_trace(table_trace, row=1, col=2 if label is not None else 1)
# Looping over all the properties to generate respective property distribution graphs
curr_row = 2 # Since row 1 is occupied with Pie and Table
for property_name in properties.columns:
if property_name in categorical_properties:
# Creating bar plots for categorical properties
trace, xaxis_layout, yaxis_layout = _generate_categorical_distribution_plot(
properties[property_name], property_name
)
fig.add_trace(trace, row=curr_row, col=1)
fig.update_xaxes(xaxis_layout, row=curr_row, col=1)
fig.update_yaxes(yaxis_layout, row=curr_row, col=1)
else:
# Creating scatter plots for numerical properties
y_value, xs = _get_distribution_values(properties[property_name])
trace, shapes, annotations, xaxis_layout, yaxis_layout = _generate_numeric_distribution_plot(
properties[property_name],
xs, y_value, property_name
)
fig.add_trace(trace, row=curr_row, col=1)
for shape, annotation in zip(shapes, annotations):
fig.add_shape(shape, row=curr_row, col=1)
fig.add_annotation(annotation, row=curr_row, col=1)
fig.update_yaxes(yaxis_layout, row=curr_row, col=1)
fig.update_xaxes(xaxis_layout, row=curr_row, col=1)
curr_row += 1
fig.update_layout(height=450*(len(properties.columns) + 1))
return fig
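For orientation, a minimal usage sketch of the new public helper in this file. The toy data and argument values below are illustrative assumptions, not taken from the package docs:

import pandas as pd
from deepchecks.nlp.utils.text_data_plot import text_data_describe_plot

# Toy single-label classification data; 'Text Length' is one of the built-in numeric properties.
properties = pd.DataFrame({'Text Length': [12, 54, 33, 7]})
fig = text_data_describe_plot(
    n_samples=4,
    max_num_labels_to_show=5,
    is_multi_label=False,
    task_type='text_classification',
    properties=properties,
    categorical_properties=[],  # none of the toy properties are categorical
    numerical_properties=['Text Length'],
    label=['spam', 'ham', 'ham', 'spam'],
)
fig.show()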
deepchecks/nlp/utils/text_properties_models.py
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module containing the text properties models for the NLP module."""
import importlib
import pathlib
from functools import lru_cache
from typing import Optional, Union
import requests
from nltk import corpus
MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models'
def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None):
"""Get the models storage directory and create it if needed."""
if models_storage is None:
models_storage = MODELS_STORAGE
else:
if isinstance(models_storage, str):
models_storage = pathlib.Path(models_storage)
if not isinstance(models_storage, pathlib.Path):
raise ValueError(
f'Unexpected type of the "models_storage" parameter - {type(models_storage)}'
)
if not models_storage.exists():
models_storage.mkdir(parents=True)
if not models_storage.is_dir():
raise ValueError('"models_storage" is expected to be a directory')
return models_storage
def _get_transformer_model(
property_name: str,
model_name: str,
device: Optional[str] = None,
quantize_model: bool = False,
models_storage: Union[pathlib.Path, str, None] = None
):
"""Get the transformer model and decide if to use optimum.onnxruntime.
optimum.onnxruntime is used to optimize running times on CPU.
"""
models_storage = get_create_model_storage(models_storage)
if device not in (None, 'cpu'):
transformers = import_optional_property_dependency('transformers', property_name=property_name)
# TODO: quantize if 'quantize_model' is True
return transformers.AutoModelForSequenceClassification.from_pretrained(
model_name,
cache_dir=models_storage,
device_map=device
)
onnx = import_optional_property_dependency(
'optimum.onnxruntime',
property_name=property_name,
error_template=(
f'The device was set to {device} while computing the {property_name} property, '
'in which case deepchecks resorts to accelerating the inference by using optimum, '
'but it is not installed. Either:\n'
'\t- Set the device according to your hardware;\n'
'\t- Install optimum by running "pip install optimum";\n'
'\t- Install all dependencies needed for text properties by running '
'"pip install deepchecks[nlp-properties]";\n'
)
)
if quantize_model is False:
model_path = models_storage / 'onnx' / model_name
if model_path.exists():
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
model = onnx.ORTModelForSequenceClassification.from_pretrained(
model_name,
export=True,
cache_dir=models_storage,
).to(device or -1)
# NOTE:
# 'optimum', after exporting/converting a model to the ONNX format,
# does not store it on disk, so we save it here to avoid reconverting
# it on every call
model.save_pretrained(model_path)
return model
model_path = models_storage / 'onnx' / 'quantized' / model_name
if model_path.exists():
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
not_quantized_model = _get_transformer_model(
property_name,
model_name,
device,
quantize_model=False,
models_storage=models_storage
)
quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model).to(device or -1)
quantizer.quantize(
save_dir=model_path,
# TODO: make it possible to provide a config as a parameter
quantization_config=onnx.configuration.AutoQuantizationConfig.avx512_vnni(
is_static=False,
per_channel=False
)
)
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
def import_optional_property_dependency(
module: str,
property_name: str,
package_name: Optional[str] = None,
error_template: Optional[str] = None
):
"""Import additional modules in runtime."""
try:
lib = importlib.import_module(module)
except ImportError as error:
package_name = package_name or module.split('.', maxsplit=1)[0]
error_template = error_template or (
'property {property_name} requires the {package_name} python package. '
'To get it, run:\n'
'>> pip install {package_name}\n\n'
'You may install dependencies for all text properties by running:\n'
'>> pip install deepchecks[nlp-properties]\n'
)
raise ImportError(error_template.format(
property_name=property_name,
package_name=package_name
)) from error
else:
return lib
def get_transformer_pipeline(
property_name: str,
model_name: str,
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None,
use_cache=False
):
"""Return a transformers pipeline for the given model name."""
if use_cache:
return _get_transformer_pipeline(property_name, model_name, device, models_storage)
# __wrapped__ is simply the function without decoration, in our case - without caching
return _get_transformer_pipeline.__wrapped__(property_name, model_name, device, models_storage)
@lru_cache(maxsize=5)
def _get_transformer_pipeline(
property_name: str,
model_name: str,
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None
):
"""Return a transformers pipeline for the given model name."""
transformers = import_optional_property_dependency('transformers', property_name=property_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, device_map=device)
model = _get_transformer_model(
property_name=property_name,
model_name=model_name,
device=device,
models_storage=models_storage
)
return transformers.pipeline(
'text-classification',
model=model,
tokenizer=tokenizer,
device=device
)
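A hypothetical call tying the helpers above together. The model name matches TOXICITY_MODEL_NAME used further down in this release; on first use the model is downloaded and, on CPU, ONNX-exported into the models storage:

classifier = get_transformer_pipeline(
    property_name='toxicity',
    model_name='unitary/toxic-bert',
    device='cpu',    # CPU path routes through optimum.onnxruntime above
    use_cache=True,  # reuse the lru_cache-d pipeline across repeated calls
)
print(classifier('you are lovely')[0])  # e.g. {'label': 'toxic', 'score': ...}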
def get_cmudict_dict(use_cache=False):
"""Return corpus as dict."""
if use_cache:
return _get_cmudict_dict()
return _get_cmudict_dict.__wrapped__()
@lru_cache(maxsize=1)
def _get_cmudict_dict():
cmudict_dict = corpus.cmudict.dict()
return cmudict_dict
FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
def get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None, use_cache=False):
"""Return fasttext model."""
if use_cache:
return _get_fasttext_model(models_storage)
return _get_fasttext_model.__wrapped__(models_storage)
@lru_cache(maxsize=1)
def _get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None):
"""Return fasttext model."""
fasttext = import_optional_property_dependency(module='fasttext', property_name='language')
model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1]
model_path = get_create_model_storage(models_storage)
model_path = model_path / 'fasttext'
if not model_path.exists():
model_path.mkdir(parents=True)
model_path = model_path / model_name
# Download the model file if it is not already cached
if not model_path.exists():
response = requests.get(FASTTEXT_LANG_MODEL, timeout=240)
if response.status_code != 200:
raise RuntimeError('Failed to download fasttext model')
model_path.write_bytes(response.content)
# Monkey-patch fasttext's eprint to suppress a warning about a deprecated function
fasttext.FastText.eprint = lambda *args, **kwargs: None
fasttext_model = fasttext.load_model(str(model_path))
return fasttext_model
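A small sketch of what the downloaded fasttext language-ID model returns. The sample sentence is illustrative; fasttext labels carry a '__label__' prefix:

model = get_fasttext_model(use_cache=True)
labels, probs = model.predict('This is an English sentence', k=1)
print(labels[0].replace('__label__', ''), float(probs[0]))  # e.g. en 0.97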
+2 -2
Metadata-Version: 2.1
Name: deepchecks
Version: 0.17.3
Version: 0.17.4
Summary: Package for validating your machine learning model and data

@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks

License: UNKNOWN
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.3/deepchecks-0.17.3.tar.gz
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.4/deepchecks-0.17.4.tar.gz
Project-URL: Documentation, https://docs.deepchecks.com

@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks

@@ -9,3 +9,3 @@ pandas>=1.1.5

category-encoders>=2.3.0
scipy>=1.4.1
scipy<=1.10.1,>=1.4.1
plotly>=5.13.1

@@ -46,3 +46,3 @@ matplotlib>=3.3.4

[nlp-properties]
optimum[onnxruntime]>=1.7.0
optimum[onnxruntime]>=1.8.8
fasttext>=0.8.0

@@ -49,0 +49,0 @@

@@ -119,4 +119,6 @@ DESCRIPTION.rst

deepchecks/nlp/utils/text.py
deepchecks/nlp/utils/text_data_plot.py
deepchecks/nlp/utils/text_embeddings.py
deepchecks/nlp/utils/text_properties.py
deepchecks/nlp/utils/text_properties_models.py
deepchecks/nlp/utils/token_classification_utils.py

@@ -123,0 +125,0 @@ deepchecks/nlp/utils/weak_segments.py

@@ -50,3 +50,4 @@ # ----------------------------------------------------------------------------

deepchecks.__version__)
os.environ['DISABLE_DEEPCHECKS_ANONYMOUS_TELEMETRY'] = 'True' # to ignore joblib
except Exception: # pylint: disable=broad-except
pass

@@ -91,2 +91,4 @@ # ----------------------------------------------------------------------------

labels = dataset.label
elif dataset.task_type is TaskType.OTHER:
raise DeepchecksValueError('Check is irrelevant when task type is not specified')
else:

@@ -93,0 +95,0 @@ raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}')

@@ -105,2 +105,5 @@ # ----------------------------------------------------------------------------

sample = random.sample(sample, self.max_chars_to_review_per_sample)
if len(sample) == 0:
percent_special_chars_in_sample[idx] = 0
continue
special_chars_in_sample = [char for char in sample if char in self.special_characters_deny_list]

@@ -107,0 +110,0 @@ percent_special_chars_in_sample[idx] = len(special_chars_in_sample) / len(sample)

@@ -219,2 +219,4 @@ # ----------------------------------------------------------------------------

continue
if isinstance(info, str):
continue
if info['outlier_ratio'] > threshold:

@@ -221,0 +223,0 @@ failed_properties.append(property_name)

@@ -12,3 +12,3 @@ # ----------------------------------------------------------------------------

"""Module of the under annotated segments check."""
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union

@@ -22,3 +22,3 @@ import numpy as np

from deepchecks.core.check_result import DisplayMap
from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.core.errors import NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck

@@ -35,2 +35,6 @@ from deepchecks.nlp.utils.text import break_to_lines_and_trim

MAX_SAMPLES_IN_FIGURE = 1000
# The annotation-ratio threshold above which UnderAnnotatedSegments considers
# the data well annotated and skips the check
ANNOTATION_RATIO_THRESHOLD = 95.0
MIN_TEXT_SAMPLES = 10 # Min samples to calculate under annotated segments

@@ -43,4 +47,4 @@

ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: int,
segment_minimum_size_ratio: float, n_samples: int,
categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
segment_minimum_size_ratio: float, n_samples: int, n_to_show: int,
categorical_aggregation_threshold: float, multiple_segments_per_feature: bool, **kwargs):
super().__init__(**kwargs)

@@ -55,2 +59,4 @@ self.segment_by = segment_by

self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.annotation_ratio_threshold = ANNOTATION_RATIO_THRESHOLD
self.multiple_segments_per_feature = multiple_segments_per_feature

@@ -67,2 +73,13 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:

score_per_sample = pd.Series([1 - is_label_none(x) for x in text_data.label], index=features.index)
annotation_ratio = round(score_per_sample.sum() * 100 / text_data.n_samples, 2)
if annotation_ratio > self.annotation_ratio_threshold:
display_msg = f'Under annotated {self.segment_by} segments check is skipped since your data ' \
f'annotation ratio is > {self.annotation_ratio_threshold}%. Try increasing the ' \
'annotation_ratio_threshold parameter.'
return CheckResult(value={'message': display_msg}, display=[display_msg])
if text_data.n_samples < MIN_TEXT_SAMPLES:
raise NotEnoughSamplesError(f'Not enough samples to calculate under annotated {self.segment_by} '
f'segments. Minimum {MIN_TEXT_SAMPLES} samples required.')
encoded_dataset = self._target_encode_categorical_features_fill_na(features, score_per_sample,

@@ -74,8 +91,9 @@ cat_features)

score_per_sample=score_per_sample,
scorer_name='Annotation Ratio')
scorer_name='Annotation Ratio',
multiple_segments_per_feature=self.multiple_segments_per_feature)
if len(weak_segments) == 0:
raise DeepchecksProcessError('Check was unable to find under annotated segments. This is expected if '
'your data is well annotated. If this is not the case, try increasing '
f'n_samples or supply more {self.segment_by}.')
display_msg = 'Check was unable to find under annotated segments. Try ' \
f'supplying more {self.segment_by}.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

@@ -233,3 +251,3 @@ check_result_value = self._generate_check_result_value(weak_segments, cat_features, avg_score)

Properties to ignore, if none given checks based on properties variable
n_top_properties : int , default: 10
n_top_properties : Optional[int] , default: 10
Number of properties to use for segment search. Top properties are selected based on feature importance.

@@ -245,2 +263,5 @@ segment_minimum_size_ratio: float , default: 0.05

In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_property : bool , default: False
If True, will allow the same property to be a segmenting feature in multiple segments,
otherwise each property can appear in one segment at most.
"""

@@ -251,3 +272,3 @@

ignore_properties: Union[Hashable, List[Hashable], None] = None,
n_top_properties: int = 15,
n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,

@@ -257,2 +278,3 @@ n_samples: int = 10_000,

n_to_show: int = 3,
multiple_segments_per_property: bool = False,
**kwargs):

@@ -267,2 +289,3 @@ super().__init__(segment_by='properties',

categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)

@@ -289,3 +312,3 @@

Columns to ignore, if none given checks based on columns variable
n_top_columns : int , default: 10
n_top_columns : Optional[int] , default: 10
Number of features to use for segment search. Top columns are selected based on feature importance.

@@ -301,2 +324,5 @@ segment_minimum_size_ratio: float , default: 0.05

In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_column : bool , default: True
If True, will allow the same metadata column to be a segmenting column in multiple segments,
otherwise each metadata column can appear in one segment at most.
"""

@@ -307,3 +333,3 @@

ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: int = 10,
n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,

@@ -313,2 +339,3 @@ n_samples: int = 10_000,

n_to_show: int = 3,
multiple_segments_per_column: bool = True,
**kwargs):

@@ -323,2 +350,3 @@ super().__init__(segment_by='metadata',

categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_column,
**kwargs)

@@ -20,3 +20,3 @@ # ----------------------------------------------------------------------------

from deepchecks.core.check_result import DisplayMap
from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksProcessError
from deepchecks.core.errors import DeepchecksNotSupportedError, NotEnoughSamplesError
from deepchecks.nlp import Context, SingleDatasetCheck

@@ -31,3 +31,5 @@ from deepchecks.nlp.utils.weak_segments import get_relevant_data_table

MIN_TEXT_SAMPLES = 10 # Min samples to calculate weak segments performance
class WeakSegmentsAbstractText(SingleDatasetCheck, WeakSegmentAbstract):

@@ -40,3 +42,4 @@ """Check the performance of the model on different segments of the data."""

score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int,
categorical_aggregation_threshold: float, n_to_show: int, **kwargs):
categorical_aggregation_threshold: float, n_to_show: int,
multiple_segments_per_feature: bool = False, **kwargs):
super().__init__(**kwargs)

@@ -53,2 +56,3 @@ self.segment_by = segment_by

self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.multiple_segments_per_feature = multiple_segments_per_feature

@@ -62,2 +66,5 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:

if text_data.n_samples < MIN_TEXT_SAMPLES:
raise NotEnoughSamplesError(f'Not enough samples to find weak {self.segment_by} segments.'
f' Minimum {MIN_TEXT_SAMPLES} samples required.')
features, cat_features = get_relevant_data_table(text_data, data_type=self.segment_by,

@@ -110,7 +117,9 @@ columns=self.columns, ignore_columns=self.ignore_columns,

feature_rank_for_search=np.asarray(encoded_dataset.features),
dummy_model=dummy_model, scorer=scorer)
dummy_model=dummy_model, scorer=scorer,
multiple_segments_per_feature=self.multiple_segments_per_feature)
if len(weak_segments) == 0:
raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak '
f'segments. Try increasing n_samples or supply more {self.segment_by}.')
display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments. '\
f'Try supplying additional {self.segment_by}.'
return CheckResult(value={'message': display_msg}, display=[display_msg])

@@ -172,2 +181,5 @@ if context.with_display:

In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_property : bool , default: False
If True, will allow the same property to be a segmenting feature in multiple segments,
otherwise each property can appear in one segment at most.
"""

@@ -178,3 +190,3 @@

ignore_properties: Union[Hashable, List[Hashable], None] = None,
n_top_properties: Optional[int] = 15,
n_top_properties: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,

@@ -186,2 +198,3 @@ alternative_scorer: Dict[str, Union[str, Callable]] = None,

n_to_show: int = 3,
multiple_segments_per_property: bool = False,
**kwargs):

@@ -198,2 +211,3 @@ super().__init__(segment_by='properties',

categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_per_property,
**kwargs)

@@ -242,2 +256,5 @@

In each categorical column, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_column : bool , default: True
If True, will allow the same metadata column to be a segmenting column in multiple segments,
otherwise each metadata column can appear in one segment at most.
"""

@@ -248,3 +265,3 @@

ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_columns: Optional[int] = 15,
n_top_columns: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,

@@ -256,2 +273,3 @@ alternative_scorer: Dict[str, Union[str, Callable]] = None,

n_to_show: int = 3,
multiple_segments_column: bool = True,
**kwargs):

@@ -268,2 +286,3 @@ super().__init__(segment_by='metadata',

categorical_aggregation_threshold=categorical_aggregation_threshold,
multiple_segments_per_feature=multiple_segments_column,
**kwargs)

@@ -346,3 +346,5 @@ # ----------------------------------------------------------------------------

self._model_classes = self._observed_classes
get_logger().warning('Could not find model\'s classes, using the observed classes')
get_logger().warning('Could not find model\'s classes, using the observed classes. '
'In order to make sure the classes used by the model are inferred correctly, '
'please use the model_classes argument')
return self._model_classes

@@ -349,0 +351,0 @@

@@ -26,2 +26,3 @@ # ----------------------------------------------------------------------------

from deepchecks.nlp.utils.text import break_to_lines_and_trim
from deepchecks.nlp.utils.text_data_plot import text_data_describe_plot
from deepchecks.nlp.utils.text_embeddings import calculate_builtin_embeddings

@@ -415,2 +416,3 @@ from deepchecks.nlp.utils.text_properties import calculate_builtin_properties, get_builtin_properties_types

include_long_calculation_properties: bool = False,
ignore_non_english_samples_for_english_properties: bool = True,
device: t.Optional[str] = None

@@ -431,2 +433,8 @@ ):

ignored.
ignore_non_english_samples_for_english_properties : bool, default True
Whether to ignore samples that are not in English when calculating English properties. If False, samples
that are not in English will be calculated as well. This parameter is ignored when calculating non-English
properties.
English-only properties WILL NOT work properly on non-English samples, so this parameter should be
set to False only when you are sure that all the samples are in English.
device : str, default None

@@ -443,2 +451,3 @@ The device to use for the calculation. If None, the default device will be used.

include_long_calculation_properties=include_long_calculation_properties,
ignore_non_english_samples_for_english_properties=ignore_non_english_samples_for_english_properties,
device=device

@@ -536,2 +545,10 @@ )

@property
def numerical_properties(self) -> t.List[str]:
"""Return numerical properties names."""
if self._properties is not None:
return [prop for prop in self._properties.columns if prop not in self._cat_properties]
else:
return []
@property
def task_type(self) -> t.Optional[TaskType]:

@@ -741,3 +758,57 @@ """Return the task type.

def describe(self, n_properties_to_show: t.Optional[int] = 4, properties_to_show: t.Optional[t.List[str]] = None,
max_num_labels_to_show: t.Optional[int] = 5, model_classes: t.Optional[t.List[str]] = None):
"""Provide holistic view of the data.
Generates the following plots:
1. Label distribution
2. Statistics about the data such as number of samples, annotation ratio, list of metadata columns, list of
text properties and so on.
3. Property distribution for the text properties defined either by n_properties_to_show or properties_to_show
parameter.
Parameters
----------
n_properties_to_show : int, default: 4
Number of properties to consider for generating property distribution graphs. If properties_to_show
is provided, this value is ignored.
properties_to_show : List[str], default: None
List of property names to consider for generating property distribution graphs. If None, the first
n_properties_to_show properties are used.
max_num_labels_to_show : int, default: 5
The maximum number of labels to display on the label distribution pie chart; the remaining
labels are grouped under an "Others" category.
model_classes : Optional[List[str]], default: None
List of classes names to use for multi-label display. Only used if the dataset is multi-label.
Returns
-------
Plotly Figure instance (also rendered for display).
"""
prop_names = []
all_properties_data = pd.DataFrame()
if self._properties is None and properties_to_show is not None:
raise DeepchecksValueError('No properties exist!')
elif self._properties is not None:
if properties_to_show is not None:
prop_names = [prop for prop in properties_to_show if prop in self.properties.columns]
if len(prop_names) != len(properties_to_show):
raise DeepchecksValueError(f'{set(properties_to_show)-set(prop_names)} '
'properties do not exist in the TextData object')
else:
prop_names = list(self.properties.columns)[:n_properties_to_show]
all_properties_data = self.properties[prop_names]
fig = text_data_describe_plot(properties=all_properties_data, n_samples=self.n_samples,
is_multi_label=self.is_multi_label_classification(), task_type=self.task_type,
categorical_metadata=self.categorical_metadata,
numerical_metadata=self.numerical_metadata,
categorical_properties=self.categorical_properties,
numerical_properties=self.numerical_properties, label=self._label,
model_classes=model_classes,
max_num_labels_to_show=max_num_labels_to_show)
return fig
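A hypothetical end-to-end call of the new describe() method, assuming the standard TextData constructor; the toy samples are illustrative:

from deepchecks.nlp import TextData

data = TextData(raw_text=['good product', 'bad service', 'okay experience'],
                label=['pos', 'neg', 'pos'],
                task_type='text_classification')
data.calculate_builtin_properties()  # property plots need properties to exist
data.describe(n_properties_to_show=2, max_num_labels_to_show=5)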
@contextlib.contextmanager

@@ -744,0 +815,0 @@ def disable_deepchecks_logger():

@@ -189,4 +189,7 @@ # ----------------------------------------------------------------------------

text_embedding = np.average(text_embeddings, axis=0, weights=text_lens)
text_embedding = text_embedding / np.linalg.norm(text_embedding) # normalizes length to 1
if sum(text_lens) == 0:
text_embedding = np.ones((EMBEDDING_DIM, )) * np.nan
else:
text_embedding = np.average(text_embeddings, axis=0, weights=text_lens)
text_embedding = text_embedding / np.linalg.norm(text_embedding) # normalizes length to 1
result_embeddings.append(text_embedding.tolist())

@@ -193,0 +196,0 @@

@@ -13,3 +13,2 @@ # ----------------------------------------------------------------------------

import gc
import importlib
import pathlib

@@ -19,2 +18,3 @@ import re

import warnings
from collections import defaultdict
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union

@@ -24,3 +24,2 @@

import pandas as pd
import requests
import textblob

@@ -30,2 +29,3 @@ from nltk import corpus

from nltk import sent_tokenize, word_tokenize
from tqdm import tqdm
from typing_extensions import TypedDict

@@ -35,5 +35,5 @@

from deepchecks.nlp.utils.text import cut_string, hash_text, normalize_text, remove_punctuation
from deepchecks.nlp.utils.text_properties_models import get_cmudict_dict, get_fasttext_model, get_transformer_pipeline
from deepchecks.utils.function import run_available_kwargs
from deepchecks.utils.ipython import create_progress_bar
from deepchecks.utils.strings import format_list, truncate_string
from deepchecks.utils.strings import SPECIAL_CHARACTERS, format_list

@@ -44,6 +44,8 @@ __all__ = ['calculate_builtin_properties', 'get_builtin_properties_types']

MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models'
FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
DEFAULT_SENTENCE_SAMPLE_SIZE = 300
MAX_CHARS = 512 # Bert accepts max of 512 tokens, so without counting tokens we go for the lower bound.
# all SPECIAL_CHARACTERS - all string.punctuation except for <>[]^_`{|}~ - all whitespace
NON_PUNCTUATION_SPECIAL_CHARS = frozenset(set(SPECIAL_CHARACTERS) - set(r"""!"#$%&'()*+,-./:;=?\@""")
- set(string.whitespace))
textblob_cache = {}

@@ -107,27 +109,2 @@ words_cache = {}

def _import_optional_property_dependency(
module: str,
property_name: str,
package_name: Optional[str] = None,
error_template: Optional[str] = None
):
try:
lib = importlib.import_module(module)
except ImportError as error:
package_name = package_name or module.split('.', maxsplit=1)[0]
error_template = error_template or (
'property {property_name} requires the {package_name} python package. '
'To get it, run:\n'
'>> pip install {package_name}\n\n'
'You may install dependencies for all text properties by running:\n'
'>> pip install deepchecks[nlp-properties]\n'
)
raise ImportError(error_template.format(
property_name=property_name,
package_name=package_name
)) from error
else:
return lib
def _warn_if_missing_nltk_dependencies(dependency: str, property_name: str):

@@ -139,123 +116,2 @@ """Warn if NLTK dependency is missing."""

def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None):
"""Get the models storage directory and create it if needed."""
if models_storage is None:
models_storage = MODELS_STORAGE
else:
if isinstance(models_storage, str):
models_storage = pathlib.Path(models_storage)
if not isinstance(models_storage, pathlib.Path):
raise ValueError(
f'Unexpected type of the "models_storage" parameter - {type(models_storage)}'
)
if not models_storage.exists():
models_storage.mkdir(parents=True)
if not models_storage.is_dir():
raise ValueError('"model_storage" expected to be a directory')
return models_storage
def get_transformer_model(
property_name: str,
model_name: str,
device: Optional[str] = None,
quantize_model: bool = False,
models_storage: Union[pathlib.Path, str, None] = None
):
"""Get the transformer model and decide if to use optimum.onnxruntime.
optimum.onnxruntime is used to optimize running times on CPU.
"""
models_storage = get_create_model_storage(models_storage)
if device not in (None, 'cpu'):
transformers = _import_optional_property_dependency('transformers', property_name=property_name)
# TODO: quantize if 'quantize_model' is True
return transformers.AutoModelForSequenceClassification.from_pretrained(
model_name,
cache_dir=models_storage
)
onnx = _import_optional_property_dependency(
'optimum.onnxruntime',
property_name=property_name,
error_template=(
f'The device was set to {device} while computing the {property_name} property, '
'in which case deepchecks resorts to accelerating the inference by using optimum, '
'but it is not installed. Either:\n'
'\t- Set the device according to your hardware;\n'
'\t- Install optimum by running "pip install optimum";\n'
'\t- Install all dependencies needed for text properties by running '
'"pip install deepchecks[nlp-properties]";\n'
)
)
if quantize_model is False:
model_path = models_storage / 'onnx' / model_name
if model_path.exists():
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path)
model = onnx.ORTModelForSequenceClassification.from_pretrained(
model_name,
export=True,
cache_dir=models_storage
)
# NOTE:
# 'optimum', after exporting/converting a model to the ONNX format,
# does not store it on disk, so we save it here to avoid reconverting
# it on every call
model.save_pretrained(model_path)
return model
model_path = models_storage / 'onnx' / 'quantized' / model_name
if model_path.exists():
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path)
not_quantized_model = get_transformer_model(
property_name,
model_name,
device,
quantize_model=False,
models_storage=models_storage
)
quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model)
quantizer.quantize(
save_dir=model_path,
# TODO: make it possible to provide a config as a parameter
quantization_config=onnx.configuration.AutoQuantizationConfig.avx512_vnni(
is_static=False,
per_channel=False
)
)
return onnx.ORTModelForSequenceClassification.from_pretrained(model_path)
def get_transformer_pipeline(
property_name: str,
model_name: str,
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None
):
"""Return a transformers pipeline for the given model name."""
transformers = _import_optional_property_dependency('transformers', property_name=property_name)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = get_transformer_model(
property_name=property_name,
model_name=model_name,
device=device,
models_storage=models_storage
)
return transformers.pipeline(
'text-classification',
model=model,
tokenizer=tokenizer,
device=device
)
def text_length(text: str) -> int:

@@ -274,2 +130,7 @@ """Return text length."""

"""Return percentage of special characters (as float between 0 and 1)."""
return len([c for c in text if c in NON_PUNCTUATION_SPECIAL_CHARS]) / len(text) if len(text) != 0 else 0
def percentage_punctuation(text: str) -> float:
"""Return percentage of punctuation (as float between 0 and 1)."""
return len([c for c in text if c in string.punctuation]) / len(text) if len(text) != 0 else 0

@@ -284,32 +145,2 @@

def _get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None):
"""Return fasttext model."""
fasttext = _import_optional_property_dependency(module='fasttext', property_name='language')
model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1]
model_path = get_create_model_storage(models_storage)
model_path = model_path / 'fasttext'
if not model_path.exists():
model_path.mkdir(parents=True)
model_path = model_path / model_name
# Save the model to a file
if not model_path.exists():
response = requests.get(FASTTEXT_LANG_MODEL, timeout=240)
if response.status_code != 200:
raise RuntimeError('Failed to download fasttext model')
model_path.write_bytes(response.content)
# This weird code is to suppress a warning from fasttext about a deprecated function
try:
fasttext.FastText.eprint = lambda *args, **kwargs: None
fasttext_model = fasttext.load_model(str(model_path))
except Exception as exp:
raise exp
return fasttext_model
def language(

@@ -323,5 +154,6 @@ text: str,

return None
# Not recommended, takes a long time. Here only to enable to call this function from outside:
# Load the model if it wasn't received as a parameter. This is done to avoid loading the model
# each time the function is called.
if fasttext_model is None:
fasttext_model = _get_fasttext_model()
fasttext_model = get_fasttext_model()

@@ -335,2 +167,16 @@ # Predictions are the first prediction (k=1), only if the probability is above the threshold

def english_text(
text: str,
lang_certainty_threshold: float = 0.8,
fasttext_model: Optional[Dict[object, Any]] = None,
language_property_result: Optional[str] = None
) -> Union[bool, None]:
"""Return whether text is in English or not."""
if not text:
return None
if language_property_result is None:
language_property_result = language(text, lang_certainty_threshold, fasttext_model)
return language_property_result == 'en'
def sentiment(text: str) -> float:

@@ -358,7 +204,12 @@ """Return float representing sentiment."""

def _predict(text: str, classifier, kind: str) -> float:
def predict_on_batch(text_batch: Sequence[str], classifier,
output_formatter: Callable[[Dict[str, Any]], float]) -> Sequence[float]:
"""Return prediction of huggingface Pipeline classifier."""
try:
# TODO: make this way smarter, and not just a hack. Count tokens, for a start. Then not just sample sentences.
# If text is longer than classifier context window, sample it:
# TODO: make this way smarter, and not just a hack. Count tokens, for a start. Then not just sample sentences.
# If text is longer than classifier context window, sample it:
text_list_to_predict = []
reduced_batch_size = len(text_batch) # Initialize the reduced batch size
retry_count = 0
for text in text_batch:
if len(text) > MAX_CHARS:

@@ -375,26 +226,33 @@ sentences = _sample_for_property(text, mode='sentences', limit=10, return_as_list=True)

text_to_use = cut_string(sentences[0], MAX_CHARS)
text = text_to_use
v = classifier(text)
except Exception: # pylint: disable=broad-except
return np.nan
else:
if not v:
return np.nan
v = v[0]
if kind == 'toxicity':
return v['score']
elif kind == 'fluency':
label_value = 'LABEL_1'
elif kind == 'formality':
label_value = 'formal'
text_list_to_predict.append(text_to_use)
else:
raise ValueError('Unsupported value for "kind" parameter')
return (
v['score']
if v['label'] == label_value
else 1 - v['score']
)
text_list_to_predict.append(text)
while reduced_batch_size >= 1:
try:
if reduced_batch_size == 1 or retry_count == 3:
results = []
for text in text_list_to_predict:
try:
v = classifier(text)[0]
results.append(output_formatter(v))
except Exception: # pylint: disable=broad-except
results.append(np.nan)
return results # Return the results if prediction is successful
v_list = classifier(text_list_to_predict, batch_size=reduced_batch_size)
results = []
for v in v_list:
results.append(output_formatter(v))
return results # Return the results if prediction is successful
except Exception: # pylint: disable=broad-except
reduced_batch_size = max(reduced_batch_size // 2, 1) # Reduce the batch size by half
retry_count += 1
return [np.nan] * len(text_batch) # Prediction failed, return NaN values for the original batch size
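The new predict_on_batch is interleaved with the removed _predict in this hunk; a condensed sketch of its retry strategy, under the assumption that classifier is a HuggingFace pipeline, reads:

import numpy as np

def predict_with_backoff(texts, classifier, max_retries=3):
    """Halve the batch size on every failure; after enough retries, score one by one."""
    batch_size, retries = len(texts), 0
    while True:
        try:
            if batch_size == 1 or retries >= max_retries:
                results = []
                for text in texts:
                    try:
                        results.append(classifier(text)[0])
                    except Exception:  # a single bad sample becomes NaN
                        results.append(np.nan)
                return results
            return classifier(texts, batch_size=batch_size)
        except Exception:  # e.g. an out-of-memory error on the full batch
            batch_size = max(batch_size // 2, 1)
            retries += 1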
TOXICITY_MODEL_NAME = 'unitary/toxic-bert'

@@ -406,7 +264,7 @@ FLUENCY_MODEL_NAME = 'prithivida/parrot_fluency_model'

def toxicity(
text: str,
text_batch: Sequence[str],
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None,
toxicity_classifier: Optional[object] = None
) -> float:
) -> Sequence[float]:
"""Return float representing toxicity."""

@@ -416,11 +274,15 @@ if toxicity_classifier is None:

property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, models_storage=models_storage)
return _predict(text, toxicity_classifier, 'toxicity')
def output_formatter(v):
return v['score']
return predict_on_batch(text_batch, toxicity_classifier, output_formatter)
def fluency(
text: str,
text_batch: Sequence[str],
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None,
fluency_classifier: Optional[object] = None
) -> float:
) -> Sequence[float]:
"""Return float representing fluency."""

@@ -430,11 +292,15 @@ if fluency_classifier is None:

property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, models_storage=models_storage)
return _predict(text, fluency_classifier, 'fluency')
def output_formatter(v):
return v['score'] if v['label'] == 'LABEL_1' else 1 - v['score']
return predict_on_batch(text_batch, fluency_classifier, output_formatter)
def formality(
text: str,
text_batch: Sequence[str],
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None,
formality_classifier: Optional[object] = None
) -> float:
) -> Sequence[float]:
"""Return float representing formality."""

@@ -444,5 +310,9 @@ if formality_classifier is None:

property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, models_storage=models_storage)
return _predict(text, formality_classifier, 'formality')
def output_formatter(v):
return v['score'] if v['label'] == 'formal' else 1 - v['score']
return predict_on_batch(text_batch, formality_classifier, output_formatter)
def lexical_density(text: str) -> float:

@@ -467,3 +337,3 @@ """Return a float representing lexical density.

def unique_noun_count(text: str) -> int:
def unique_noun_count(text: Sequence[str]) -> int:
"""Return the number of unique noun words in the text."""

@@ -491,3 +361,3 @@ if pd.isna(text):

if not nltk_download('cmudict', quiet=True):
_warn_if_missing_nltk_dependencies('cmudict', 'Readability Score')
_warn_if_missing_nltk_dependencies('cmudict', 'Reading Ease')
return np.nan

@@ -622,2 +492,18 @@ cmudict_dict = corpus.cmudict.dict()

def _batch_wrapper(text_batch: Sequence[str], func: Callable, **kwargs) -> List[Any]:
"""Wrap the non-batched properties execution with batches API."""
results = []
language_property_result = []
if 'language_property_result' in kwargs:
language_property_result = kwargs.pop('language_property_result')
language_property_exists = len(language_property_result) > 0
for i, text in enumerate(text_batch):
kwargs['language_property_result'] = language_property_result[i] if language_property_exists else None
results.append(run_available_kwargs(func, text=text, **kwargs))
return results
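A hypothetical use of _batch_wrapper, reusing the per-sample sentiment property defined earlier in this module:

scores = _batch_wrapper(text_batch=['I love it', 'This is awful'], func=sentiment)
# e.g. [0.5, -1.0] -- one TextBlob polarity score per sample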
class TextProperty(TypedDict):

@@ -629,46 +515,46 @@ name: str

DEFAULT_PROPERTIES: Tuple[TextProperty, ...] = (
{'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'},
{'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'},
{'name': 'Max Word Length', 'method': max_word_length, 'output_type': 'numeric'},
{'name': '% Special Characters', 'method': percentage_special_characters, 'output_type': 'numeric'},
{'name': 'Language', 'method': language, 'output_type': 'categorical'},
{'name': 'Sentiment', 'method': sentiment, 'output_type': 'numeric'},
{'name': 'Subjectivity', 'method': subjectivity, 'output_type': 'numeric'},
{'name': 'Average Words Per Sentence', 'method': average_words_per_sentence, 'output_type': 'numeric'},
{'name': 'Readability Score', 'method': readability_score, 'output_type': 'numeric'},
{'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'},
{'name': 'Toxicity', 'method': toxicity, 'output_type': 'numeric'},
{'name': 'Fluency', 'method': fluency, 'output_type': 'numeric'},
{'name': 'Formality', 'method': formality, 'output_type': 'numeric'},
{'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'},
)
DEFAULT_PROPERTIES: Tuple[TextProperty, ...] = \
(
{'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'},
{'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'},
{'name': 'Max Word Length', 'method': max_word_length, 'output_type': 'numeric'},
{'name': '% Special Characters', 'method': percentage_special_characters, 'output_type': 'numeric'},
{'name': '% Punctuation', 'method': percentage_punctuation, 'output_type': 'numeric'},
{'name': 'Language', 'method': language, 'output_type': 'categorical'},
{'name': 'Sentiment', 'method': sentiment, 'output_type': 'numeric'},
{'name': 'Subjectivity', 'method': subjectivity, 'output_type': 'numeric'},
{'name': 'Average Words Per Sentence', 'method': average_words_per_sentence, 'output_type': 'numeric'},
{'name': 'Reading Ease', 'method': readability_score, 'output_type': 'numeric'},
{'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'},
{'name': 'Toxicity', 'method': toxicity, 'output_type': 'numeric'},
{'name': 'Fluency', 'method': fluency, 'output_type': 'numeric'},
{'name': 'Formality', 'method': formality, 'output_type': 'numeric'},
{'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'},
)
ALL_PROPERTIES: Tuple[TextProperty, ...] = (
{'name': 'URLs Count', 'method': urls_count, 'output_type': 'numeric'},
{'name': 'Email Addresses Count', 'method': email_addresses_count,
'output_type': 'numeric'},
{'name': 'Unique URLs Count', 'method': unique_urls_count,
'output_type': 'numeric'},
{'name': 'Unique Email Addresses Count',
'method': unique_email_addresses_count, 'output_type': 'numeric'},
{'name': 'Unique Syllables Count', 'method': unique_syllables_count,
'output_type': 'numeric'},
{'name': 'Reading Time', 'method': reading_time,
'output_type': 'numeric'},
{'name': 'Sentences Count', 'method': sentences_count,
'output_type': 'numeric'},
{'name': 'Average Syllable Length', 'method': average_syllable_length,
'output_type': 'numeric'},
) + DEFAULT_PROPERTIES
ALL_PROPERTIES: Tuple[TextProperty, ...] = \
(
{'name': 'English Text', 'method': english_text, 'output_type': 'categorical'},
{'name': 'URLs Count', 'method': urls_count, 'output_type': 'numeric'},
{'name': 'Email Addresses Count', 'method': email_addresses_count, 'output_type': 'numeric'},
{'name': 'Unique URLs Count', 'method': unique_urls_count, 'output_type': 'numeric'},
{'name': 'Unique Email Addresses Count', 'method': unique_email_addresses_count, 'output_type': 'numeric'},
{'name': 'Unique Syllables Count', 'method': unique_syllables_count, 'output_type': 'numeric'},
{'name': 'Reading Time', 'method': reading_time, 'output_type': 'numeric'},
{'name': 'Sentences Count', 'method': sentences_count, 'output_type': 'numeric'},
{'name': 'Average Syllable Length', 'method': average_syllable_length, 'output_type': 'numeric'},
) + DEFAULT_PROPERTIES
LONG_RUN_PROPERTIES = ('Toxicity', 'Fluency', 'Formality', 'Unique Noun Count')
BATCH_PROPERTIES = ('Toxicity', 'Fluency', 'Formality')
LARGE_SAMPLE_SIZE = 10_000
ENGLISH_ONLY_PROPERTIES = (
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Readability Score',
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Reading Ease',
'Unique Noun Count', 'Unique Syllables Count', 'Sentences Count', 'Average Syllable Length'
)
CMUDICT_PROPERTIES = ('Average Syllable Length', 'Unique Syllables Count', 'Readability Score')
CMUDICT_PROPERTIES = ('Average Syllable Length', 'Unique Syllables Count', 'Reading Ease')

@@ -679,12 +565,21 @@ TEXT_PROPERTIES_DESCRIPTION = {

'Max Word Length': 'Maximum number of characters in a word',
'% Special Characters': 'Percentage of special characters in the text',
'% Special Characters': 'Percentage of special characters in the text. Special characters are non-alphanumeric '
'unicode characters, excluding whitespaces and any of !\"#$%&\'()*+,-./:;=?\\@.',
'% Punctuation': 'Percentage of punctuation characters in the text. Punctuation characters are any of '
'!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~',
'Language': 'Language of the text, using the fasttext language detection model',
'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model',
'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model',
'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model.'
' Ranging from -1 (negative) to 1 (positive)',
'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model. Ranging from 0 '
'(objective) to 1 (subjective)',
'Average Words Per Sentence': 'Average number of words per sentence in the text',
'Readability Score': 'A score calculated based on Flesch reading-ease per text sample',
'Reading Ease': 'How easy to read a text sample is, typically ranges from around 0 (hard to read) to around '
'100 (very easy). Based on Flesch reading-ease score',
'Lexical Density': 'Percentage of unique words in the text',
'Toxicity': 'Toxicity score using unitary/toxic-bert HuggingFace model',
'Fluency': 'Fluency score using prithivida/parrot_fluency_model HuggingFace model',
'Formality': 'Formality score using s-nlp/roberta-base-formality-ranker HuggingFace model',
'Toxicity': 'A measure of how harmful or offensive a text sample is (0 to 1), uses the Detoxify library '
'unitary/toxic-bert model',
'Fluency': 'A measure of the fluency of the text (0 to 1), using the prithivida/parrot_fluency_model'
' model from the authors of the Parrot Paraphraser library',
'Formality': 'The formality / register of the text (0 to 1), using the s-nlp/roberta-base-formality-ranker'
' model by the Skolkovo Institute of Science and Technology',
'Unique Noun Count': 'Number of unique noun words in the text',

@@ -694,2 +589,3 @@ 'URLs Count': 'Number of URLs per text sample',

'Unique URLs Count': 'Number of unique URLs per text sample',
'English Text': 'Whether the text is in English (1) or not (0)',
'Unique Email Addresses Count': 'Number of unique email addresses per text sample',

@@ -778,4 +674,7 @@ 'Unique Syllables Count': 'Number of unique syllables per text sample',

include_long_calculation_properties: bool = False,
ignore_non_english_samples_for_english_properties: bool = True,
device: Optional[str] = None,
models_storage: Union[pathlib.Path, str, None] = None
models_storage: Union[pathlib.Path, str, None] = None,
batch_size: Optional[int] = 16,
cache_models: bool = False
) -> Tuple[Dict[str, List[float]], Dict[str, str]]:

@@ -791,12 +690,12 @@ """Calculate properties on provided text samples.

together with ignore_properties parameter. Available properties are:
['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', 'Language',
['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', '% Punctuation', 'Language',
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count',
'Readability Score', 'Average Words Per Sentence', 'URLs Count', 'Unique URLs Count', 'Email Address Count',
'Reading Ease', 'Average Words Per Sentence', 'URLs Count', 'Unique URLs Count', 'Email Address Count',
'Unique Email Address Count', 'Unique Syllables Count', 'Reading Time', 'Sentences Count',
'Average Syllable Length']
List of default properties are: ['Text Length', 'Average Word Length', 'Max Word Length',
'% Special Characters', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality',
'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Words Per Sentence']
'% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency',
'Formality', 'Lexical Density', 'Unique Noun Count', 'Reading Ease', 'Average Words Per Sentence']
To calculate all the default properties, the include_properties and ignore_properties parameters should
be None. If you pass either include_properties or ignore_properties then the only the properties specified
be None. If you pass either include_properties or ignore_properties then only the properties specified
in the list will be calculated or ignored.
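A hedged usage sketch of the include/ignore semantics described above, assuming the documented function is `calculate_builtin_properties` from deepchecks.nlp.utils.text_properties:

```python
from deepchecks.nlp.utils.text_properties import calculate_builtin_properties

samples = ['Great product, works as advertised!', 'Terrible. Would not buy again.']

# Only the listed properties are calculated; all others are skipped.
properties, property_types = calculate_builtin_properties(
    samples, include_properties=['Text Length', 'Sentiment'])
print(properties['Text Length'])    # one value per sample
print(property_types['Sentiment'])  # 'numeric'
```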

@@ -812,2 +711,8 @@ Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may

ignored, unless they are specified in the include_properties parameter explicitly.
ignore_non_english_samples_for_english_properties : bool, default True
Whether to ignore samples that are not in English when calculating English properties. If False, samples
that are not in English will be calculated as well. This parameter is ignored when calculating non-English
properties.
English-Only properties WILL NOT work properly on non-English samples, so this parameter should be set to
False only when you are sure that all the samples are in English.
device : str, default None

@@ -819,2 +724,6 @@ The device to use for the calculation. If None, the default device will be used.

Also, if a folder already contains the relevant resources, they are not re-downloaded.
batch_size : int, default 16
The number of samples to process together in each batch.
cache_models : bool, default False
Whether to cache the models used by this function, to save load time on subsequent executions.
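A sketch of the batching/caching parameters documented in this hunk (same assumed `calculate_builtin_properties` entry point, reusing `samples` from the sketch above; Toxicity and Fluency are among the long-running, batched properties):

```python
properties, _ = calculate_builtin_properties(
    samples,
    include_properties=['Toxicity', 'Fluency'],
    include_long_calculation_properties=True,  # both properties are long-running
    batch_size=32,       # samples per batch fed to the transformer pipelines
    cache_models=True,   # keep the models loaded for the next call
    device='cuda',       # assumed string device id, matching Optional[str] above
)
```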

@@ -847,3 +756,3 @@ Returns

if 'fasttext_model' not in kwargs:
kwargs['fasttext_model'] = _get_fasttext_model(models_storage=models_storage)
kwargs['fasttext_model'] = get_fasttext_model(models_storage=models_storage, use_cache=cache_models)

@@ -857,16 +766,18 @@ if 'cmudict_dict' not in kwargs:

calculated_properties[prop] = [np.nan] * len(raw_text)
cmudict_dict = corpus.cmudict.dict()
kwargs['cmudict_dict'] = cmudict_dict
kwargs['cmudict_dict'] = get_cmudict_dict(use_cache=cache_models)
if 'Toxicity' in text_properties_names and 'toxicity_classifier' not in kwargs:
kwargs['toxicity_classifier'] = get_transformer_pipeline(
property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, models_storage=models_storage)
property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device,
models_storage=models_storage, use_cache=cache_models)
if 'Formality' in text_properties_names and 'formality_classifier' not in kwargs:
kwargs['formality_classifier'] = get_transformer_pipeline(
property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, models_storage=models_storage)
property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device,
models_storage=models_storage, use_cache=cache_models)
if 'Fluency' in text_properties_names and 'fluency_classifier' not in kwargs:
kwargs['fluency_classifier'] = get_transformer_pipeline(
property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, models_storage=models_storage)
property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device,
models_storage=models_storage, use_cache=cache_models)

@@ -885,34 +796,52 @@ is_language_property_requested = 'Language' in [prop['name'] for prop in text_properties]

progress_bar = create_progress_bar(
iterable=list(raw_text),
name='Text Samples Calculation',
unit='Text Sample'
)
for text in progress_bar:
progress_bar.set_postfix(
{'Sample': truncate_string(text, max_length=20) if text else 'EMPTY STRING'},
refresh=False
)
if pd.isna(text):
for prop in text_properties:
calculated_properties[prop['name']].append(np.nan)
continue
sample_language = run_available_kwargs(language, text=text, **kwargs)
for i in tqdm(range(0, len(raw_text), batch_size)):
batch = raw_text[i:i + batch_size]
batch_properties = defaultdict(list)
# Filter out missing (NaN) sequences
nan_indices = {i for i, seq in enumerate(batch) if pd.isna(seq)}
filtered_sequences = [e for i, e in enumerate(batch) if i not in nan_indices]
samples_language = _batch_wrapper(text_batch=filtered_sequences, func=language, **kwargs)
if is_language_property_requested:
calculated_properties['Language'].append(sample_language)
batch_properties['Language'].extend(samples_language)
calculated_properties['Language'].extend(samples_language)
kwargs['language_property_result'] = samples_language # Pass the language property to other properties
non_english_indices = set()
if ignore_non_english_samples_for_english_properties:
non_english_indices = {i for i, (seq, lang) in enumerate(zip(filtered_sequences, samples_language))
if lang != 'en'}
for prop in text_properties:
if prop['name'] in import_warnings: # Skip properties that failed to import:
calculated_properties[prop['name']].append(np.nan)
elif sample_language != 'en' and prop['name'] in english_properties_names:
calculated_properties[prop['name']].append(np.nan)
batch_properties[prop['name']].extend([np.nan] * len(batch))
else:
if prop['name'] in english_properties_names \
and ignore_non_english_samples_for_english_properties is True:
filtered_sequences = [e for i, e in enumerate(filtered_sequences) if i not in non_english_indices]
kwargs['batch_size'] = batch_size
try:
value = run_available_kwargs(prop['method'], text=text, **kwargs)
calculated_properties[prop['name']].append(value)
if prop['name'] in BATCH_PROPERTIES:
value = run_available_kwargs(func=prop['method'], text_batch=filtered_sequences, **kwargs)
else:
value = _batch_wrapper(text_batch=filtered_sequences, func=prop['method'], **kwargs)
batch_properties[prop['name']].extend(value)
except ImportError as e:
warnings.warn(warning_message.format(prop['name'], str(e)))
calculated_properties[prop['name']].append(np.nan)
batch_properties[prop['name']].extend([np.nan] * len(batch))
import_warnings.add(prop['name'])
result_index = 0
for index, seq in enumerate(batch):
if index in nan_indices or (index in non_english_indices and
ignore_non_english_samples_for_english_properties and
prop['name'] in english_properties_names):
calculated_properties[prop['name']].append(np.nan)
else:
calculated_properties[prop['name']].append(batch_properties[prop['name']][result_index])
result_index += 1
filtered_sequences = [e for i, e in enumerate(batch) if i not in nan_indices]
# Clear property caches:

@@ -924,3 +853,4 @@ textblob_cache.clear()

# Clean all remaining RAM:
gc.collect()
if not cache_models:
gc.collect()

@@ -927,0 +857,0 @@ if not calculated_properties:

@@ -35,7 +35,9 @@ # ----------------------------------------------------------------------------

_UNSUPPORTED_MODEL_ERROR = (
'Check is relevant for Boosting models of type '
'{supported_models}, but received model of type {model_type}'
)
_UNSUPPORTED_MODEL_ERROR = \
'Check is relevant for Boosting models of type {supported_models}, but received model of type {model_type}'
_NO_MODEL_ERROR = \
'Check is relevant only when receiving the model, but predictions/probabilities were received instead. ' \
'In order to use this check, please pass the model to the run() method.'
_SUPPORTED_CLASSIFICATION_MODELS = (

@@ -82,2 +84,12 @@ 'AdaBoostClassifier',

@classmethod
def _raise_not_supported_model_error(cls, model_class):
if model_class != '_DummyModel':
raise ModelValidationError(cls._UNSUPPORTED_MODEL_ERROR.format(
supported_models=cls._SUPPORTED_MODELS,
model_type=model_class
))
else:
raise ModelValidationError(cls._NO_MODEL_ERROR)
def predict_proba(self, x):

@@ -93,6 +105,3 @@ if self.model_class in ['AdaBoostClassifier', 'GradientBoostingClassifier']:

else:
raise ModelValidationError(self._UNSUPPORTED_MODEL_ERROR.format(
supported_models=self._SUPPORTED_CLASSIFICATION_MODELS,
model_type=self.model_class
))
self._raise_not_supported_model_error(self.model_class)

@@ -110,6 +119,3 @@ def predict(self, x):

else:
raise ModelValidationError(self._UNSUPPORTED_MODEL_ERROR.format(
supported_models=self._SUPPORTED_MODELS,
model_type=self.model_class
))
self._raise_not_supported_model_error(self.model_class)

@@ -130,6 +136,3 @@ @classmethod

else:
raise ModelValidationError(cls._UNSUPPORTED_MODEL_ERROR.format(
supported_models=cls._SUPPORTED_MODELS,
model_type=model_class
))
cls._raise_not_supported_model_error(model_class=model_class)

@@ -136,0 +139,0 @@

@@ -374,3 +374,4 @@ # ----------------------------------------------------------------------------

Used in classification models to flag whether to run the condition on the average of the classes, or on
each class individually
each class individually. If a scorer that returns a single value is used, this parameter
is ignored (it will act as if average=True).
"""

@@ -394,4 +395,9 @@ name = f'Model performance gain over simple model is greater than {format_percent(min_allowed_gain)}'

# If the depth of the nested scores dict is 2, average is not relevant and is set to True
inner_dict = scores[list(scores.keys())[0]]
inner_inner_dict = inner_dict[list(inner_dict.keys())[0]]
force_average = isinstance(inner_inner_dict, Number)
passed_condition = True
if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not average:
if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not average and not force_average:
passed_metrics = {}

@@ -438,3 +444,3 @@ failed_classes = defaultdict(dict)

perfect_metrics = []
if task_type in [TaskType.MULTICLASS, TaskType.BINARY]:
if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not force_average:
scores = average_scores(scores, include_classes)
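The depth heuristic in this hunk is easiest to see on concrete shapes. A hedged illustration (metric, model, and class names are made up):

```python
from numbers import Number

per_class_scores = {'F1': {'my_model': {0: 0.91, 1: 0.78}}}  # depth 3: per-class values
averaged_scores = {'F1': {'my_model': 0.85}}                 # depth 2: single value

def is_forced_average(scores):
    inner_dict = scores[list(scores.keys())[0]]
    inner_inner = inner_dict[list(inner_dict.keys())[0]]
    return isinstance(inner_inner, Number)  # mirrors the force_average check above

assert is_forced_average(per_class_scores) is False
assert is_forced_average(averaged_scores) is True
```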

@@ -441,0 +447,0 @@ for metric, models_scores in scores.items():

@@ -13,3 +13,3 @@ # ----------------------------------------------------------------------------

import warnings
from typing import TYPE_CHECKING, Callable, Dict, List, Union
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union

@@ -53,3 +53,3 @@ import numpy as np

Columns to ignore; if none are given, the check is based on the columns variable
n_top_features : int , default: 5
n_top_features : Optional[int] , default: 10
Number of features to use for segment search. Top columns are selected based on feature importance.

@@ -78,2 +78,5 @@ segment_minimum_size_ratio: float , default: 0.05

Random seed for all check internals.
multiple_segments_per_feature : bool , default: True
If True, will allow the same feature to be a segmenting feature in multiple segments,
otherwise each feature can appear in one segment at most.
"""

@@ -85,3 +88,3 @@

ignore_columns: Union[Hashable, List[Hashable], None] = None,
n_top_features: int = 5,
n_top_features: Optional[int] = 10,
segment_minimum_size_ratio: float = 0.05,

@@ -95,2 +98,3 @@ alternative_scorer: Dict[str, Union[str, Callable]] = None,

random_state: int = 42,
multiple_segments_per_feature: bool = True,
**kwargs

@@ -116,2 +120,3 @@ ):

self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.multiple_segments_per_feature = multiple_segments_per_feature

@@ -169,3 +174,4 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult:

feature_rank_for_search=feature_rank,
dummy_model=dummy_model, scorer=scorer)
dummy_model=dummy_model, scorer=scorer,
multiple_segments_per_feature=self.multiple_segments_per_feature)

@@ -172,0 +178,0 @@ if len(weak_segments) == 0:

@@ -104,3 +104,6 @@ # ----------------------------------------------------------------------------

sample_size = min(self.n_samples, train_dataset.n_samples, test_dataset.n_samples)
if self.n_samples is None:
sample_size = min(train_dataset.n_samples, test_dataset.n_samples)
else:
sample_size = min(train_dataset.n_samples, test_dataset.n_samples, self.n_samples)

@@ -107,0 +110,0 @@ headnote = """

@@ -294,4 +294,7 @@ # ----------------------------------------------------------------------------

# If infer_task_type didn't find classes on the model, or the user didn't pass any, then use the observed
get_logger().warning('Could not find model\'s classes, using the observed classes')
return self.observed_classes
get_logger().warning('Could not find model\'s classes, using the observed classes. '
'In order to make sure the classes used by the model are inferred correctly, '
'please use the model_classes argument')
self._model_classes = self.observed_classes
return self._model_classes

@@ -298,0 +301,0 @@

@@ -34,5 +34,5 @@ # ----------------------------------------------------------------------------

if with_display:
fig = create_confusion_matrix_figure(result, total_classes, normalize_display)
displays = create_confusion_matrix_figure(result, total_classes, normalize_display)
else:
fig = None
displays = None

@@ -42,3 +42,3 @@ # For accessing the class names from the condition

return CheckResult(result, display=fig)
return CheckResult(result, display=displays)

@@ -65,27 +65,44 @@

"""
confusion_matrix_norm = confusion_matrix_data.astype('float') / \
(confusion_matrix_data.sum(axis=1)[:, np.newaxis] + np.finfo(float).eps) * 100
if normalize_display:
confusion_matrix_norm = confusion_matrix_data.astype('float') / \
(confusion_matrix_data.sum(axis=1)[:, np.newaxis] + np.finfo(float).eps) * 100
z = np.vectorize(format_number_if_not_nan)(confusion_matrix_norm)
text_template = '%{z}%<br>(%{text})'
color_bar_title = '% out of<br>True Values'
plot_title = 'Percent Out of True Values (Count)'
else:
z = confusion_matrix_data
color_bar_title = None
text_template = '%{text}'
plot_title = 'Value Count'
fig = go.Figure(data=go.Heatmap(
x=classes_names, y=classes_names, z=z,
text=confusion_matrix_data, texttemplate=text_template))
fig.data[0].colorbar.title = color_bar_title
fig.update_layout(title=plot_title)
accuracy_array = np.diag(confusion_matrix_norm).round(decimals=2)
display = []
display_msg = f'The overall accuracy of your model is: {round(np.sum(accuracy_array)/len(accuracy_array), 2)}%.'
if min(accuracy_array) < 100:
display_msg += f'<br>Best accuracy achieved on samples with <b>{classes_names[np.argmax(accuracy_array)]}' \
f'</b> label ({np.max(accuracy_array)}%).'
display_msg += f'<br>Worst accuracy achieved on samples with <b>{classes_names[np.argmin(accuracy_array)]}' \
f'</b> label ({np.min(accuracy_array)}%).'
display.append(display_msg)
total_samples = np.nansum(confusion_matrix_data)
percent_data_each_row = np.round(confusion_matrix_norm, decimals=2)
percent_data_each_cell = np.round(np.divide(np.nan_to_num(confusion_matrix_data, nan=0.0), total_samples) * 100,
decimals=2)
percent_data_each_col = (confusion_matrix_data.astype('float') /
(confusion_matrix_data.sum(axis=0)[:, np.newaxis] +
np.finfo(float).eps) * 100).round(decimals=2)
custom_hoverdata = np.dstack((percent_data_each_cell, percent_data_each_row, percent_data_each_col))
fig = go.Figure(data=go.Heatmap(x=classes_names, y=classes_names, z=z, customdata=custom_hoverdata,
xgap=1, ygap=1, text=confusion_matrix_data, texttemplate='%{text}',
hovertemplate='% out of all data: <b>%{customdata[0]}%</b><br>% out '
'of row: <b>%{customdata[1]}%</b><br>% out of column: '
'<b>%{customdata[2]}%</b><extra></extra>',
showscale=False))
fig.update_layout(title='Confusion Matrix (# Samples)', title_x=0.5)
fig.update_layout(height=600)
fig.update_xaxes(title='Predicted Value', type='category', scaleanchor='y', constrain='domain')
fig.update_yaxes(title='True Value', type='category', constrain='domain', autorange='reversed')
display.append(fig)
return display
return fig
def misclassified_samples_lower_than_condition(value: pd.DataFrame,

@@ -92,0 +109,0 @@ misclassified_samples_threshold: float) -> ConditionResult:
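The three hover percentages added in this hunk are worth a worked example. A small sketch with plain numpy (a hypothetical 2x2 matrix; rows are true labels, columns are predictions):

```python
import numpy as np

cm = np.array([[40, 10],
               [ 5, 45]])
pct_of_all = cm / cm.sum() * 100                        # cell / all samples
pct_of_row = cm / cm.sum(axis=1, keepdims=True) * 100   # cell / true-class total
pct_of_col = cm / cm.sum(axis=0, keepdims=True) * 100   # cell / predicted-class total
# Top-left cell: 40.0% of all data, 80.0% of its row, ~88.9% of its column.
```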

@@ -84,3 +84,3 @@ # ----------------------------------------------------------------------------

with_display=with_display,
dataset_names=(test_dataframe_name, train_dataframe_name)
dataset_names=(train_dataframe_name, test_dataframe_name)
)

@@ -87,0 +87,0 @@

@@ -173,4 +173,4 @@ # ----------------------------------------------------------------------------

feature_rank_for_search: Optional[np.ndarray] = None,
dummy_model: Optional[_DummyModel] = None,
scorer: Optional[DeepcheckScorer] = None, scorer_name: Optional[str] = None) \
dummy_model: Optional[_DummyModel] = None, scorer: Optional[DeepcheckScorer] = None,
scorer_name: Optional[str] = None, multiple_segments_per_feature: bool = False) \
-> pd.DataFrame:

@@ -217,8 +217,21 @@ """Search for weak segments based on scorer."""

# Drop duplicates without considering column 'Samples in Segment'
result_no_duplicates = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
result_no_duplicates['Samples in Segment'] = weak_segments.loc[result_no_duplicates.index, 'Samples in Segment']
return result_no_duplicates.sort_values(score_title).reset_index(drop=True)
# Sort and drop relevant columns
weak_segments = weak_segments.sort_values(score_title).reset_index(drop=True)
if multiple_segments_per_feature:
    result = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
    result['Samples in Segment'] = weak_segments.loc[result.index, 'Samples in Segment']
else:
    used_features = set()
    result = pd.DataFrame(columns=weak_segments.columns)
    for _, row in weak_segments.iterrows():
        if row['Feature1'] in used_features or row['Feature2'] in used_features:
            continue
        result.loc[len(result)] = row
        used_features.add(row['Feature1'])
        if row['Feature2'] != '':
            used_features.add(row['Feature2'])
return result
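A toy illustration of the greedy one-segment-per-feature filter above (column names follow the Feature1/Feature2 convention in this hunk; the score column title is illustrative):

```python
import pandas as pd

weak_segments = pd.DataFrame({
    'Accuracy Score': [0.61, 0.64, 0.70],   # already sorted, weakest first
    'Feature1': ['age', 'age', 'income'],
    'Feature2': ['income', '', ''],
})
# Greedy scan: the first row claims both 'age' and 'income', so with
# multiple_segments_per_feature=False the remaining two rows are skipped.
```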
def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series,

@@ -335,2 +348,5 @@ label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None,

def condition(result: Dict) -> ConditionResult:
if 'message' in result:
return ConditionResult(ConditionCategory.PASS, result['message'])
weakest_segment_score = result['weak_segments_list'].iloc[0, 0]

@@ -337,0 +353,0 @@ scorer_name = result['weak_segments_list'].columns[0].lower()

@@ -34,2 +34,7 @@ # ----------------------------------------------------------------------------

feature_distribution_colors = {
'measure': '#00008b', # dark blue
'feature': 'rgba(105, 179, 162, 1)'
}
common_and_outlier_colors = {'common': 'rgba(105, 179, 162, 1)',

@@ -36,0 +41,0 @@ 'outliers': 'rgba(179, 106, 106, 1)',

@@ -148,3 +148,3 @@ # ----------------------------------------------------------------------------

description.append(
description.extend(
create_confusion_matrix_figure(confusion_matrix, x, self.normalized)

@@ -151,0 +151,0 @@ )

@@ -69,2 +69,5 @@ # ----------------------------------------------------------------------------

For each categorical property, categories with frequency below threshold will be merged into "Other" category.
multiple_segments_per_property : bool , default: True
If True, will allow the same property to be a segmenting feature in multiple segments,
otherwise each property can appear in one segment at most.
{additional_check_init_params:2*indent}

@@ -80,2 +83,3 @@ """

categorical_aggregation_threshold: float = 0.05,
multiple_segments_per_property: bool = True,
**kwargs

@@ -91,2 +95,3 @@ ):

self.categorical_aggregation_threshold = categorical_aggregation_threshold
self.multiple_segments_per_property = multiple_segments_per_property
self._properties_results = None

@@ -142,3 +147,4 @@ self._sample_scores = None

score_per_sample=score_per_sample_col,
scorer_name=self._scorer_name)
scorer_name=self._scorer_name,
multiple_segments_per_feature=self.multiple_segments_per_property)
if len(weak_segments) == 0:

@@ -145,0 +151,0 @@ raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak '

@@ -228,3 +228,5 @@ # ----------------------------------------------------------------------------

while jump_size > 1:
if font.getsize(text)[0] < desired_width:
left, _, right, _ = font.getbbox(text)
width = right - left
if width < desired_width:
font_size += jump_size
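The getsize-to-getbbox change here tracks Pillow's API: ImageFont.getsize was deprecated in Pillow 9.2 and removed in 10.0, while getbbox returns a (left, top, right, bottom) tuple. A minimal sketch of the replacement width calculation:

```python
from PIL import ImageFont

font = ImageFont.load_default()
left, _, right, _ = font.getbbox('deepchecks')
width = right - left  # replaces the removed font.getsize(text)[0]
```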

@@ -270,4 +272,4 @@ else:

size = (int(image.size[0] * factor), int(image.size[1] * factor))
# Resize the image
image = image.resize(size, pilimage.ANTIALIAS)
# Resize the image using Image.LANCZOS resampling (Image.ANTIALIAS was removed in Pillow 10)
image = image.resize(size, pilimage.LANCZOS)
else:

@@ -274,0 +276,0 @@ image = ensure_image(image, copy=False)

Metadata-Version: 2.1
Name: deepchecks
Version: 0.17.3
Version: 0.17.4
Summary: Package for validating your machine learning model and data

@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks

License: UNKNOWN
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.3/deepchecks-0.17.3.tar.gz
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.4/deepchecks-0.17.4.tar.gz
Project-URL: Documentation, https://docs.deepchecks.com

@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks

@@ -111,3 +111,3 @@ <!--

pip install deepchecks-installer
deepchecks-installer monitoring-install
deepchecks-installer install-monitoring
```

@@ -374,2 +374,2 @@

This project follows the [all-contributors](https://allcontributors.org)
specification. Contributions of any kind are welcome!

@@ -40,3 +40,5 @@ twine

requests>=2.23.0
scipy>=1.4.1
# Remove the <=1.10.1 dependency below once sklearn's issue is fixed. The higher version causes
# issues with sklearn's _most_frequent() function using scipy's mode() function
scipy>=1.4.1, <=1.10.1
tqdm>=4.41.0

@@ -43,0 +45,0 @@ seaborn>=0.11.0

@@ -1,2 +0,2 @@

optimum[onnxruntime]>=1.7.0
optimum[onnxruntime]>=1.8.8
fasttext>=0.8.0

@@ -28,3 +28,5 @@ pandas>=1.1.5

statsmodels>=0.13.5; python_version >= '3.7'
scipy>=1.4.1
# Remove the <=1.10.1 dependency below once sklearn's issue is fixed. The higher version causes
# issues with sklearn's _most_frequent() function using scipy's mode() function
scipy>=1.4.1, <=1.10.1
dataclasses>=0.6; python_version < '3.7'

@@ -31,0 +33,0 @@ plotly>=5.13.1

@@ -1,1 +0,1 @@

0.17.3
0.17.4