deepchecks
Advanced tools
| # ---------------------------------------------------------------------------- | ||
| # Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com) | ||
| # | ||
| # This file is part of Deepchecks. | ||
| # Deepchecks is distributed under the terms of the GNU Affero General | ||
| # Public License (version 3 or later). | ||
| # You should have received a copy of the GNU Affero General Public License | ||
| # along with Deepchecks. If not, see <http://www.gnu.org/licenses/>. | ||
| # ---------------------------------------------------------------------------- | ||
| # | ||
| """A module containing utils for displaying information on TextData object.""" | ||
| from typing import List, Optional | ||
| import numpy as np | ||
| import pandas as pd | ||
| import plotly.graph_objs as go | ||
| from plotly.subplots import make_subplots | ||
| from deepchecks.nlp.task_type import TaskType, TTextLabel | ||
| from deepchecks.nlp.utils.text import break_to_lines_and_trim | ||
| from deepchecks.nlp.utils.text_properties import TEXT_PROPERTIES_DESCRIPTION | ||
| from deepchecks.utils.dataframes import un_numpy | ||
| from deepchecks.utils.distribution.plot import get_density | ||
| from deepchecks.utils.plot import feature_distribution_colors | ||
| from deepchecks.utils.strings import format_percent, get_docs_link | ||
| __all__ = ['text_data_describe_plot'] | ||
def _calculate_annoation_ratio(label, n_samples, is_mutli_label, task_type):
    """Return the ratio of annotated samples as a formatted percent string.

    Parameters
    ----------
    label
        The label data; ``None`` means nothing is annotated (ratio 0%).
    n_samples : int
        Total number of samples in the dataset.
    is_mutli_label : bool
        Whether the label is multi-label (a sequence of binary vectors).
    task_type
        The task type, compared against TaskType.TOKEN_CLASSIFICATION.
    """
    if label is None:
        return format_percent(0)
    # _calculate_number_of_annotated_samples already covers both the
    # multi-label/token-classification case and the single-label case
    # (pd.notna(label).sum()), so delegate instead of duplicating that logic.
    annotated_count = _calculate_number_of_annotated_samples(label=label,
                                                            is_multi_label=is_mutli_label,
                                                            task_type=task_type)
    return format_percent(annotated_count / n_samples)
| def _get_table_row_data(n_samples, annotation_ratio, categorical_metadata, numerical_metadata, | ||
| categorical_properties, numerical_properties, max_values_to_show: int = 5): | ||
| info_cell = [n_samples, annotation_ratio] | ||
| if categorical_metadata is None or len(categorical_metadata) == 0: | ||
| info_cell.append('No categorical metadata') | ||
| else: | ||
| info_cell.append(', '.join(categorical_metadata) if len(categorical_metadata) <= max_values_to_show | ||
| else f'{len(categorical_metadata)} metadata columns') | ||
| if numerical_metadata is None or len(numerical_metadata) == 0: | ||
| info_cell.append('No numerical metadata') | ||
| else: | ||
| info_cell.append(', '.join(numerical_metadata) if len(numerical_metadata) <= max_values_to_show | ||
| else f'{len(numerical_metadata)} metadata columns') | ||
| if categorical_properties is None or len(categorical_properties) == 0: | ||
| info_cell.append('No categorical properties') | ||
| else: | ||
| info_cell.append(', '.join(categorical_properties) if len(categorical_properties) <= max_values_to_show | ||
| else f'{len(categorical_properties)} properties') | ||
| if numerical_properties is None or len(numerical_properties) == 0: | ||
| info_cell.append('No numerical properties') | ||
| else: | ||
| info_cell.append(', '.join(numerical_properties) if len(numerical_properties) <= max_values_to_show | ||
| else f'{len(numerical_properties)} properties') | ||
| return info_cell | ||
def _generate_table_trace(n_samples, annotation_ratio, categorical_metadata, numerical_metadata,
                          categorical_properties, numerical_properties):
    """Create the plotly Table trace summarizing the TextData statistics."""
    header_cell = ['<b>Number of samples</b>', '<b>Annotation ratio</b>', '<b>Metadata categorical columns</b>',
                   '<b>Metadata numerical columns</b>', '<b>Categorical properties</b>', '<b>Numerical properties</b>']
    # The table shows at most 7 names per list before collapsing to a count.
    value_cell = _get_table_row_data(n_samples=n_samples, annotation_ratio=annotation_ratio,
                                     categorical_metadata=categorical_metadata,
                                     numerical_metadata=numerical_metadata,
                                     categorical_properties=categorical_properties,
                                     numerical_properties=numerical_properties,
                                     max_values_to_show=7)
    return go.Table(header={'fill': {'color': 'white'}},
                    cells={'values': [header_cell, value_cell], 'align': ['left'],
                           'font_size': 12, 'height': 30})
def _generate_categorical_distribution_plot(data, property_name):
    """Create a bar trace (plus axis layouts) of normalized category frequencies."""
    frequencies = data.value_counts(normalize=True)
    index_labels = [un_numpy(category) for category in frequencies.index]
    bar_df = pd.DataFrame({property_name: list(frequencies.values)}, index=index_labels)
    trace = go.Bar(x=bar_df.index, y=bar_df[property_name], showlegend=False,
                   marker={'color': feature_distribution_colors['feature']},
                   hovertemplate='<b>Value:</b> %{x}<br><b>Frequency:</b> %{y}<extra></extra>')
    # Log scale keeps rare categories visible next to dominant ones.
    xaxis_layout = dict(title=property_name)
    yaxis_layout = dict(type='log', title='Frequency (Log Scale)')
    return trace, xaxis_layout, yaxis_layout
def _get_distribution_values(data):
    """Compute (density values, x positions) for plotting a numeric distribution.

    Discrete (all-integer) data uses the observed unique values (capped at ~50
    evenly spaced points); continuous data uses a blend of a linear grid,
    2%-step quantiles, and the mean/median, deduplicated by rank in the data.
    """
    mean = data.mean()
    median = data.median()
    x_range = (data.min(), data.max())
    if all(int(x) == x for x in data if x is not None):
        # If the distribution is discrete, we take all the values in it:
        xs = sorted(np.unique(data))
        if len(xs) > 50:
            # If there are too many values, we take only 50, using a constant interval between them:
            xs = list(range(int(xs[0]), int(xs[-1]) + 1, int((xs[-1] - xs[0]) // 50)))
    else:
        # Candidate x positions: uniform grid + fine-grained quantiles + the
        # two landmark statistics that get annotated on the plot.
        xs = sorted(np.concatenate((np.linspace(x_range[0], x_range[1], 50),
                                    np.quantile(data, q=np.arange(0.02, 1, 0.02)),
                                    [mean, median]
                                    )))
        # Keep only one candidate per insertion rank so near-duplicate
        # positions collapse to a single point.
        ixs = np.searchsorted(sorted(data), xs, side='left')
        # NOTE(review): at i == 0 this compares against ixs[-1] (Python's
        # negative indexing), so the first candidate is kept only when its rank
        # differs from the last one's — presumably always true since ranks span
        # 0..len(data); confirm this wraparound is intentional.
        xs = [xs[i] for i in range(len(ixs)) if ixs[i] != ixs[i - 1]]
    y_value = get_density(data, xs)
    return y_value, xs
| def _calculate_number_of_annotated_samples(label, is_multi_label, task_type): | ||
| if is_multi_label or task_type == TaskType.TOKEN_CLASSIFICATION: | ||
| annotated_count = 0 | ||
| for label_data in label: | ||
| annotated_count = annotated_count + 1 if len(label_data) > 0 and pd.isna(label_data).sum() == 0 \ | ||
| else annotated_count | ||
| return annotated_count | ||
| else: | ||
| return pd.notna(label).sum() | ||
def _generate_numeric_distribution_plot(data, x_value, y_value, property_name):
    """Create a density trace plus vertical marker lines for the mean, median,
    and 10th/90th percentiles, each with a rotated text annotation.

    Returns (trace, shapes, annotations, xaxis_layout, yaxis_layout).
    """
    mean = data.mean()
    median = data.median()
    percentile_10 = data.quantile(0.1)
    percentile_90 = data.quantile(0.9)
    peak = max(y_value)
    offset = max(x_value) * 0.02

    trace = go.Scatter(x=x_value, y=y_value, fill='tozeroy', showlegend=False,
                       hovertemplate=f'<b>{property_name}:</b> ''%{x}<br><b>Density:</b> %{y}<extra></extra>',
                       line={'color': feature_distribution_colors['feature'],
                             'shape': 'linear', 'width': 5})

    # The mean and median labels are nudged to opposite sides so that the two
    # annotations do not overlap when the values are close together.
    mean_xpos = mean + offset if median < mean else mean - offset
    median_xpos = median - offset if median < mean else median + offset
    markers = [
        (mean, 'dash', '<b>Mean</b>', mean_xpos),
        (median, 'dot', '<b>Median</b>', median_xpos),
        (percentile_10, 'dashdot', '<b>10<sup>th</sup> Percentile</b>', percentile_10 - offset),
        (percentile_90, 'dashdot', '<b>90<sup>th</sup> Percentile</b>', percentile_90 + offset),
    ]
    shapes = []
    annotations = []
    for value, dash_style, text, xpos in markers:
        shapes.append(dict(type='line', x0=value, y0=0, x1=value, y1=peak,
                           line={'color': feature_distribution_colors['measure'],
                                 'dash': dash_style, 'width': 3}))
        annotations.append(dict(x=xpos, y=peak / 2, text=text, showarrow=False,
                                textangle=-90, font={'size': 12}))

    return trace, shapes, annotations, dict(title=property_name), dict(title='Density')
def text_data_describe_plot(n_samples: int, max_num_labels_to_show: int,
                            is_multi_label: bool, task_type: str,
                            properties: pd.DataFrame,
                            categorical_metadata: Optional[List[str]] = None,
                            numerical_metadata: Optional[List[str]] = None,
                            categorical_properties: Optional[List[str]] = None,
                            numerical_properties: Optional[List[str]] = None,
                            model_classes: Optional[List[str]] = None,
                            label: Optional[TTextLabel] = None):
    """Return a plotly figure instance.

    Parameters
    ----------
    properties: pd.DataFrame
        The DataFrame consisting of the text properties data. If no properties are there, you can pass an
        empty DataFrame as well.
    n_samples: int
        The total number of samples present in the TextData object.
    max_num_labels_to_show : int
        The threshold to display the maximum number of labels on the label distribution pie chart and display
        rest of the labels under "Others" category.
    is_multi_label: bool
        A boolean where True denotes that the TextData contains multi labeled data otherwise false.
    task_type: str
        The task type for the text data. Can be either 'text_classification' or 'token_classification'.
    categorical_metadata: Optional[List[str]], default: None
        The names of the categorical metadata columns.
    numerical_metadata: Optional[List[str]], default: None
        The names of the numerical metadata columns.
    categorical_properties: Optional[List[str]], default: None
        The names of the categorical properties columns.
    numerical_properties: Optional[List[str]], default: None
        The names of the numerical text properties columns.
    label: Optional[TTextLabel], default: None
        The label for the text data. Can be either a text_classification label or a token_classification label.
        If None, the label distribution graph is not generated.

        - text_classification label - For text classification the accepted label format differs between multilabel
          and single label cases. For single label data, the label should be passed as a sequence of labels, with one
          entry per sample that can be either a string or an integer. For multilabel data, the label should be passed
          as a sequence of sequences, with the sequence for each sample being a binary vector, representing the
          presence of the i-th label in that sample.
        - token_classification label - For token classification the accepted label format is the IOB format or
          similar to it. The Label must be a sequence of sequences of strings or integers, with each sequence
          corresponding to a sample in the tokenized text, and exactly the length of the corresponding tokenized text.
    model_classes: Optional[List[str]], default: None
        List of classes names to use for multi-label display. Only used if the dataset is multi-label.

    Returns
    -------
    Plotly Figure instance.
    """
    # First row holds the label pie + stats table (or a full-width table when
    # there is no label); each property then gets its own full-width row.
    specs = [[{'type': 'pie'}, {'type': 'table'}] if label is not None else [{'type': 'table', 'colspan': 2}, None]] + \
            [[{'type': 'xy', 'colspan': 2}, None] for _ in range(len(properties.columns))]
    subplot_titles = []
    if label is not None:
        annotated_samples = _calculate_number_of_annotated_samples(label, is_multi_label, task_type)
        subplot_titles.append(f'Label Distribution<br><sup>Out of {annotated_samples} annotated samples</sup><br><br>')
        subplot_titles.append('')  # Empty title for table figure
    if not properties.empty:
        for prop_name in properties:
            if prop_name in TEXT_PROPERTIES_DESCRIPTION:
                subplot_titles.append(f'{prop_name} Property Distribution<sup><a href="{get_docs_link()}nlp/'
                                      'usage_guides/nlp_properties.html#deepchecks-built-in-properties">ⓘ</a>'
                                      f'</sup><br><sup>{TEXT_PROPERTIES_DESCRIPTION[prop_name]}</sup>')

    fig = make_subplots(rows=len(properties.columns) + 1, cols=2, specs=specs, subplot_titles=subplot_titles,
                        row_heights=[1.5] + [1.0] * len(properties.columns))

    # Create label distribution if label is provided
    if label is not None:
        if is_multi_label:
            df_label = pd.DataFrame(label).fillna(0)
            if model_classes is not None:
                # Count, per class name, in how many samples it appears.
                hashmap = {}
                for sample_label in label:
                    class_names = [model_classes[i] for i, class_flag in enumerate(sample_label) if class_flag == 1]
                    for class_name in class_names:
                        hashmap[class_name] = hashmap.get(class_name, 0) + 1
                label_counts = pd.Series(list(hashmap.values()), index=list(hashmap))
            else:
                label_counts = pd.Series(np.sum(df_label.to_numpy(), axis=0))
        elif task_type == TaskType.TOKEN_CLASSIFICATION:
            # Count each token label over all samples, ignoring missing tokens.
            hashmap = {}
            for sample_label in label:
                flattened_array = pd.Series(np.array(sample_label).flatten()).fillna('NaN').to_numpy()
                unique_values, counts = np.unique(flattened_array, return_counts=True)
                for label_value, count in zip(unique_values, counts):
                    if label_value != 'NaN':
                        hashmap[label_value] = hashmap.get(label_value, 0) + count
            label_counts = pd.Series(list(hashmap.values()), index=list(hashmap))
        else:
            label_counts = pd.Series(label).value_counts()

        label_counts.sort_values(ascending=False, inplace=True)
        labels_to_display = label_counts[:max_num_labels_to_show]
        labels_to_display.index = [break_to_lines_and_trim(str(label_name))
                                   for label_name in list(labels_to_display.index)]
        # BUG FIX: this was label_counts[max_num_labels_to_show + 1:], which
        # silently dropped the count of the first label beyond the threshold.
        count_other_labels = label_counts[max_num_labels_to_show:].sum()
        if count_other_labels > 0:
            # Only add the "Others" slice when it actually aggregates something.
            labels_to_display['Others'] = count_other_labels

        # Pie chart for label distribution
        fig.add_trace(go.Pie(labels=list(labels_to_display.index), values=list(labels_to_display),
                             textposition='inside', showlegend=False, textinfo='label+percent',
                             hovertemplate='%{label}: %{value} samples<extra></extra>'), row=1, col=1)

    # Table figure for displaying some statistics
    annotation_ratio = _calculate_annoation_ratio(label, n_samples, is_multi_label, task_type)
    table_trace = _generate_table_trace(n_samples, annotation_ratio, categorical_metadata, numerical_metadata,
                                        categorical_properties, numerical_properties)
    fig.add_trace(table_trace, row=1, col=2 if label is not None else 1)

    # Looping over all the properties to generate respective property distribution graphs
    curr_row = 2  # Since row 1 is occupied with Pie and Table
    for property_name in properties.columns:
        if property_name in categorical_properties:
            # Creating bar plots for categorical properties
            trace, xaxis_layout, yaxis_layout = _generate_categorical_distribution_plot(
                properties[property_name], property_name
            )
            fig.add_trace(trace, row=curr_row, col=1)
            fig.update_xaxes(xaxis_layout, row=curr_row, col=1)
            fig.update_yaxes(yaxis_layout, row=curr_row, col=1)
        else:
            # Creating scatter plots for numerical properties
            y_value, xs = _get_distribution_values(properties[property_name])
            trace, shapes, annotations, xaxis_layout, yaxis_layout = _generate_numeric_distribution_plot(
                properties[property_name], xs, y_value, property_name
            )
            fig.add_trace(trace, row=curr_row, col=1)
            for shape, annotation in zip(shapes, annotations):
                fig.add_shape(shape, row=curr_row, col=1)
                fig.add_annotation(annotation, row=curr_row, col=1)
            fig.update_yaxes(yaxis_layout, row=curr_row, col=1)
            fig.update_xaxes(xaxis_layout, row=curr_row, col=1)
        curr_row += 1

    fig.update_layout(height=450 * (len(properties.columns) + 1))
    return fig
| # ---------------------------------------------------------------------------- | ||
| # Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com) | ||
| # | ||
| # This file is part of Deepchecks. | ||
| # Deepchecks is distributed under the terms of the GNU Affero General | ||
| # Public License (version 3 or later). | ||
| # You should have received a copy of the GNU Affero General Public License | ||
| # along with Deepchecks. If not, see <http://www.gnu.org/licenses/>. | ||
| # ---------------------------------------------------------------------------- | ||
| # | ||
| """Module containing the text properties models for the NLP module.""" | ||
| import importlib | ||
| import pathlib | ||
| from functools import lru_cache | ||
| from typing import Optional, Union | ||
| import requests | ||
| from nltk import corpus | ||
| MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models' | ||
def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None):
    """Get the models storage directory and create it if needed.

    Parameters
    ----------
    models_storage : Union[pathlib.Path, str, None], default: None
        Target directory; ``None`` falls back to the package-level MODELS_STORAGE.

    Returns
    -------
    pathlib.Path
        The (created) storage directory.

    Raises
    ------
    ValueError
        If the argument has an unexpected type or points to a non-directory.
    """
    if models_storage is None:
        models_storage = MODELS_STORAGE
    elif isinstance(models_storage, str):
        models_storage = pathlib.Path(models_storage)
    if not isinstance(models_storage, pathlib.Path):
        raise ValueError(
            f'Unexpected type of the "models_storage" parameter - {type(models_storage)}'
        )
    if not models_storage.exists():
        models_storage.mkdir(parents=True)
    if not models_storage.is_dir():
        # BUG FIX: message previously said "model_storage"; the parameter is
        # named "models_storage".
        raise ValueError('"models_storage" expected to be a directory')
    return models_storage
def _get_transformer_model(
        property_name: str,
        model_name: str,
        device: Optional[str] = None,
        quantize_model: bool = False,
        models_storage: Union[pathlib.Path, str, None] = None
):
    """Get the transformer model and decide if to use optimum.onnxruntime.

    optimum.onnxruntime is used to optimize running times on CPU.

    Parameters
    ----------
    property_name : str
        Name of the property the model computes (used in error messages).
    model_name : str
        Hugging Face model identifier to load.
    device : Optional[str], default: None
        Target device; None or 'cpu' routes through the ONNX runtime path.
    quantize_model : bool, default: False
        Whether to load/create a quantized ONNX model.
    models_storage : Union[pathlib.Path, str, None], default: None
        Directory for caching downloaded/converted models.
    """
    models_storage = get_create_model_storage(models_storage)

    if device not in (None, 'cpu'):
        transformers = import_optional_property_dependency('transformers', property_name=property_name)
        # TODO: quantize if 'quantize_model' is True
        return transformers.AutoModelForSequenceClassification.from_pretrained(
            model_name,
            cache_dir=models_storage,
            device_map=device
        )

    onnx = import_optional_property_dependency(
        'optimum.onnxruntime',
        property_name=property_name,
        error_template=(
            # BUG FIX: the adjacent string literals concatenated without
            # spaces ("property,in which case", "optimum,bit"), and "bit"
            # was a typo for "but".
            f'The device was set to {device} while computing the {property_name} property, '
            'in which case deepchecks resorts to accelerating the inference by using optimum, '
            'but it is not installed. Either:\n'
            '\t- Set the device according to your hardware;\n'
            '\t- Install optimum by running "pip install optimum";\n'
            '\t- Install all dependencies needed for text properties by running '
            '"pip install deepchecks[nlp-properties]";\n'
        )
    )

    if quantize_model is False:
        model_path = models_storage / 'onnx' / model_name
        if model_path.exists():
            return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
        model = onnx.ORTModelForSequenceClassification.from_pretrained(
            model_name,
            export=True,
            cache_dir=models_storage,
        ).to(device or -1)
        # NOTE:
        # 'optimum', after exporting/converting a model to the ONNX format,
        # does not store it onto disk, so we save it now to avoid reconverting
        # it on every call.
        model.save_pretrained(model_path)
        return model

    model_path = models_storage / 'onnx' / 'quantized' / model_name
    if model_path.exists():
        return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)

    # Quantization needs the non-quantized ONNX model first (recursive call
    # with quantize_model=False hits the branch above).
    not_quantized_model = _get_transformer_model(
        property_name,
        model_name,
        device,
        quantize_model=False,
        models_storage=models_storage
    )
    quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model).to(device or -1)
    quantizer.quantize(
        save_dir=model_path,
        # TODO: make it possible to provide a config as a parameter
        quantization_config=onnx.configuration.AutoQuantizationConfig.avx512_vnni(
            is_static=False,
            per_channel=False
        )
    )
    return onnx.ORTModelForSequenceClassification.from_pretrained(model_path).to(device or -1)
def import_optional_property_dependency(
        module: str,
        property_name: str,
        package_name: Optional[str] = None,
        error_template: Optional[str] = None
):
    """Import additional modules in runtime.

    Raises an ImportError with installation instructions (formatted from
    ``error_template``) when the module is not available.
    """
    try:
        return importlib.import_module(module)
    except ImportError as error:
        if package_name is None:
            # Default the pip package name to the top-level module name.
            package_name = module.split('.', maxsplit=1)[0]
        if error_template is None:
            error_template = (
                'property {property_name} requires the {package_name} python package. '
                'To get it, run:\n'
                '>> pip install {package_name}\n\n'
                'You may install dependencies for all text properties by running:\n'
                '>> pip install deepchecks[nlp-properties]\n'
            )
        raise ImportError(error_template.format(
            property_name=property_name,
            package_name=package_name
        )) from error
def get_transformer_pipeline(
        property_name: str,
        model_name: str,
        device: Optional[str] = None,
        models_storage: Union[pathlib.Path, str, None] = None,
        use_cache=False
):
    """Return a transformers pipeline for the given model name."""
    # __wrapped__ is the undecorated function — calling it bypasses the
    # lru_cache and always builds a fresh pipeline.
    factory = _get_transformer_pipeline if use_cache else _get_transformer_pipeline.__wrapped__
    return factory(property_name, model_name, device, models_storage)
@lru_cache(maxsize=5)
def _get_transformer_pipeline(
        property_name: str,
        model_name: str,
        device: Optional[str] = None,
        models_storage: Union[pathlib.Path, str, None] = None
):
    """Return a transformers pipeline for the given model name."""
    transformers = import_optional_property_dependency('transformers', property_name=property_name)
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, device_map=device)
    model = _get_transformer_model(
        property_name=property_name,
        model_name=model_name,
        device=device,
        models_storage=models_storage,
    )
    # Assemble tokenizer + model into a ready-to-use classification pipeline.
    return transformers.pipeline('text-classification', model=model, tokenizer=tokenizer, device=device)
def get_cmudict_dict(use_cache=False):
    """Return corpus as dict."""
    if not use_cache:
        # __wrapped__ is the undecorated function, i.e. without lru_cache.
        return _get_cmudict_dict.__wrapped__()
    return _get_cmudict_dict()
@lru_cache(maxsize=1)
def _get_cmudict_dict():
    """Load and return the CMU pronouncing dictionary from the nltk corpus."""
    return corpus.cmudict.dict()
| FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin' | ||
def get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None, use_cache=False):
    """Return fasttext model."""
    if not use_cache:
        # __wrapped__ bypasses the lru_cache, forcing a fresh load.
        return _get_fasttext_model.__wrapped__(models_storage)
    return _get_fasttext_model(models_storage)
@lru_cache(maxsize=1)
def _get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None):
    """Return fasttext model, downloading and caching it on disk if needed.

    Parameters
    ----------
    models_storage : Union[pathlib.Path, str, None], default: None
        Directory for caching the downloaded model; ``None`` uses the default
        package storage.

    Raises
    ------
    RuntimeError
        If the model download does not return HTTP 200.
    """
    fasttext = import_optional_property_dependency(module='fasttext', property_name='language')

    model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1]
    model_dir = get_create_model_storage(models_storage) / 'fasttext'
    model_dir.mkdir(parents=True, exist_ok=True)
    model_path = model_dir / model_name

    # Download the model once and cache it on disk for subsequent calls.
    if not model_path.exists():
        response = requests.get(FASTTEXT_LANG_MODEL, timeout=240)
        if response.status_code != 200:
            # BUG FIX: message previously read "donwload".
            raise RuntimeError('Failed to download fasttext model')
        model_path.write_bytes(response.content)

    # Suppress a deprecation warning fasttext prints through its eprint hook.
    # (The previous try/except here only re-raised the same exception, so it
    # was removed as dead code.)
    fasttext.FastText.eprint = lambda *args, **kwargs: None
    return fasttext.load_model(str(model_path))
| Metadata-Version: 2.1 | ||
| Name: deepchecks | ||
| Version: 0.17.3 | ||
| Version: 0.17.4 | ||
| Summary: Package for validating your machine learning model and data | ||
@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks | ||
| License: UNKNOWN | ||
| Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.3/deepchecks-0.17.3.tar.gz | ||
| Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.4/deepchecks-0.17.4.tar.gz | ||
| Project-URL: Documentation, https://docs.deepchecks.com | ||
@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks |
@@ -9,3 +9,3 @@ pandas>=1.1.5 | ||
| category-encoders>=2.3.0 | ||
| scipy>=1.4.1 | ||
| scipy<=1.10.1,>=1.4.1 | ||
| plotly>=5.13.1 | ||
@@ -46,3 +46,3 @@ matplotlib>=3.3.4 | ||
| [nlp-properties] | ||
| optimum[onnxruntime]>=1.7.0 | ||
| optimum[onnxruntime]>=1.8.8 | ||
| fasttext>=0.8.0 | ||
@@ -49,0 +49,0 @@ |
@@ -119,4 +119,6 @@ DESCRIPTION.rst | ||
| deepchecks/nlp/utils/text.py | ||
| deepchecks/nlp/utils/text_data_plot.py | ||
| deepchecks/nlp/utils/text_embeddings.py | ||
| deepchecks/nlp/utils/text_properties.py | ||
| deepchecks/nlp/utils/text_properties_models.py | ||
| deepchecks/nlp/utils/token_classification_utils.py | ||
@@ -123,0 +125,0 @@ deepchecks/nlp/utils/weak_segments.py |
@@ -50,3 +50,4 @@ # ---------------------------------------------------------------------------- | ||
| deepchecks.__version__) | ||
| os.environ['DISABLE_DEEPCHECKS_ANONYMOUS_TELEMETRY'] = 'True' # to ignore joblib | ||
| except Exception: # pylint: disable=broad-except | ||
| pass |
@@ -91,2 +91,4 @@ # ---------------------------------------------------------------------------- | ||
| labels = dataset.label | ||
| elif dataset.task_type is TaskType.OTHER: | ||
| raise DeepchecksValueError('Check is irrelevant when task type is not specified') | ||
| else: | ||
@@ -93,0 +95,0 @@ raise DeepchecksValueError(f'Unknown task type - {dataset.task_type}') |
@@ -105,2 +105,5 @@ # ---------------------------------------------------------------------------- | ||
| sample = random.sample(sample, self.max_chars_to_review_per_sample) | ||
| if len(sample) == 0: | ||
| percent_special_chars_in_sample[idx] = 0 | ||
| continue | ||
| special_chars_in_sample = [char for char in sample if char in self.special_characters_deny_list] | ||
@@ -107,0 +110,0 @@ percent_special_chars_in_sample[idx] = len(special_chars_in_sample) / len(sample) |
@@ -219,2 +219,4 @@ # ---------------------------------------------------------------------------- | ||
| continue | ||
| if isinstance(info, str): | ||
| continue | ||
| if info['outlier_ratio'] > threshold: | ||
@@ -221,0 +223,0 @@ failed_properties.append(property_name) |
@@ -12,3 +12,3 @@ # ---------------------------------------------------------------------------- | ||
| """Module of the under annotated segments check.""" | ||
| from typing import Dict, List, Tuple, Union | ||
| from typing import Dict, List, Optional, Tuple, Union | ||
@@ -22,3 +22,3 @@ import numpy as np | ||
| from deepchecks.core.check_result import DisplayMap | ||
| from deepchecks.core.errors import DeepchecksProcessError | ||
| from deepchecks.core.errors import NotEnoughSamplesError | ||
| from deepchecks.nlp import Context, SingleDatasetCheck | ||
@@ -35,2 +35,6 @@ from deepchecks.nlp.utils.text import break_to_lines_and_trim | ||
| MAX_SAMPLES_IN_FIGURE = 1000 | ||
| # The threshold the UnderAnnotatedSegments considers the data to be well | ||
| # annotated and skips the checks | ||
| ANNOTATION_RATIO_THRESHOLD = 95.0 | ||
| MIN_TEXT_SAMPLES = 10 # Min samples to calculate under annotated segments | ||
@@ -43,4 +47,4 @@ | ||
| ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: int, | ||
| segment_minimum_size_ratio: float, n_samples: int, | ||
| categorical_aggregation_threshold: float, n_to_show: int, **kwargs): | ||
| segment_minimum_size_ratio: float, n_samples: int, n_to_show: int, | ||
| categorical_aggregation_threshold: float, multiple_segments_per_feature: bool, **kwargs): | ||
| super().__init__(**kwargs) | ||
@@ -55,2 +59,4 @@ self.segment_by = segment_by | ||
| self.categorical_aggregation_threshold = categorical_aggregation_threshold | ||
| self.annotation_ratio_threshold = ANNOTATION_RATIO_THRESHOLD | ||
| self.multiple_segments_per_feature = multiple_segments_per_feature | ||
@@ -67,2 +73,13 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: | ||
| score_per_sample = pd.Series([1 - is_label_none(x) for x in text_data.label], index=features.index) | ||
| annotation_ratio = round(score_per_sample.sum() * 100 / text_data.n_samples, 2) | ||
| if annotation_ratio > self.annotation_ratio_threshold: | ||
| display_msg = f'Under annotated {self.segment_by} segments check is skipped since your data ' \ | ||
| f'annotation ratio is > {self.annotation_ratio_threshold}%. Try increasing the ' \ | ||
| 'annotation_ratio_threshold parameter.' | ||
| return CheckResult(value={'message': display_msg}, display=[display_msg]) | ||
| if text_data.n_samples < MIN_TEXT_SAMPLES: | ||
| raise NotEnoughSamplesError(f'Not enough samples to calculate under annotated {self.segment_by} ' | ||
| 'segments. Minimum 10 samples required.') | ||
| encoded_dataset = self._target_encode_categorical_features_fill_na(features, score_per_sample, | ||
@@ -74,8 +91,9 @@ cat_features) | ||
| score_per_sample=score_per_sample, | ||
| scorer_name='Annotation Ratio') | ||
| scorer_name='Annotation Ratio', | ||
| multiple_segments_per_feature=self.multiple_segments_per_feature) | ||
| if len(weak_segments) == 0: | ||
| raise DeepchecksProcessError('Check was unable to find under annotated segments. This is expected if ' | ||
| 'your data is well annotated. If this is not the case, try increasing ' | ||
| f'n_samples or supply more {self.segment_by}.') | ||
| display_msg = 'Check was unable to find under annotated segments. Try ' \ | ||
| f'supplying more {self.segment_by}.' | ||
| return CheckResult(value={'message': display_msg}, display=[display_msg]) | ||
@@ -233,3 +251,3 @@ check_result_value = self._generate_check_result_value(weak_segments, cat_features, avg_score) | ||
| Properties to ignore, if none given checks based on properties variable | ||
| n_top_properties : int , default: 10 | ||
| n_top_properties : Optional[int] , default: 10 | ||
| Number of properties to use for segment search. Top properties are selected based on feature importance. | ||
@@ -245,2 +263,5 @@ segment_minimum_size_ratio: float , default: 0.05 | ||
| In each categorical column, categories with frequency below threshold will be merged into "Other" category. | ||
| multiple_segments_per_property : bool , default: False | ||
| If True, will allow the same property to be a segmenting feature in multiple segments, | ||
| otherwise each property can appear in one segment at most. | ||
| """ | ||
@@ -251,3 +272,3 @@ | ||
| ignore_properties: Union[Hashable, List[Hashable], None] = None, | ||
| n_top_properties: int = 15, | ||
| n_top_properties: Optional[int] = 10, | ||
| segment_minimum_size_ratio: float = 0.05, | ||
@@ -257,2 +278,3 @@ n_samples: int = 10_000, | ||
| n_to_show: int = 3, | ||
| multiple_segments_per_property: bool = False, | ||
| **kwargs): | ||
@@ -267,2 +289,3 @@ super().__init__(segment_by='properties', | ||
| categorical_aggregation_threshold=categorical_aggregation_threshold, | ||
| multiple_segments_per_feature=multiple_segments_per_property, | ||
| **kwargs) | ||
@@ -289,3 +312,3 @@ | ||
| Columns to ignore, if none given checks based on columns variable | ||
| n_top_columns : int , default: 10 | ||
| n_top_columns : Optional[int] , default: 10 | ||
| Number of features to use for segment search. Top columns are selected based on feature importance. | ||
@@ -301,2 +324,5 @@ segment_minimum_size_ratio: float , default: 0.05 | ||
| In each categorical column, categories with frequency below threshold will be merged into "Other" category. | ||
| multiple_segments_per_column : bool , default: True | ||
| If True, will allow the same metadata column to be a segmenting column in multiple segments, | ||
| otherwise each metadata column can appear in one segment at most. | ||
| """ | ||
@@ -307,3 +333,3 @@ | ||
| ignore_columns: Union[Hashable, List[Hashable], None] = None, | ||
| n_top_columns: int = 10, | ||
| n_top_columns: Optional[int] = 10, | ||
| segment_minimum_size_ratio: float = 0.05, | ||
@@ -313,2 +339,3 @@ n_samples: int = 10_000, | ||
| n_to_show: int = 3, | ||
| multiple_segments_per_column: bool = True, | ||
| **kwargs): | ||
@@ -323,2 +350,3 @@ super().__init__(segment_by='metadata', | ||
| categorical_aggregation_threshold=categorical_aggregation_threshold, | ||
| multiple_segments_per_feature=multiple_segments_per_column, | ||
| **kwargs) |
@@ -20,3 +20,3 @@ # ---------------------------------------------------------------------------- | ||
| from deepchecks.core.check_result import DisplayMap | ||
| from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksProcessError | ||
| from deepchecks.core.errors import DeepchecksNotSupportedError, NotEnoughSamplesError | ||
| from deepchecks.nlp import Context, SingleDatasetCheck | ||
@@ -31,3 +31,5 @@ from deepchecks.nlp.utils.weak_segments import get_relevant_data_table | ||
| MIN_TEXT_SAMPLES = 10 # Min samples to calculate weak segments performance | ||
| class WeakSegmentsAbstractText(SingleDatasetCheck, WeakSegmentAbstract): | ||
@@ -40,3 +42,4 @@ """Check the performance of the model on different segments of the data.""" | ||
| score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int, | ||
| categorical_aggregation_threshold: float, n_to_show: int, **kwargs): | ||
| categorical_aggregation_threshold: float, n_to_show: int, | ||
| multiple_segments_per_feature: bool = False, **kwargs): | ||
| super().__init__(**kwargs) | ||
@@ -53,2 +56,3 @@ self.segment_by = segment_by | ||
| self.categorical_aggregation_threshold = categorical_aggregation_threshold | ||
| self.multiple_segments_per_feature = multiple_segments_per_feature | ||
@@ -62,2 +66,5 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: | ||
| if text_data.n_samples < MIN_TEXT_SAMPLES: | ||
| raise NotEnoughSamplesError(f'Not enough samples to find weak {self.segment_by} segments.' | ||
| f' Minimum {MIN_TEXT_SAMPLES} samples required.') | ||
| features, cat_features = get_relevant_data_table(text_data, data_type=self.segment_by, | ||
@@ -110,7 +117,9 @@ columns=self.columns, ignore_columns=self.ignore_columns, | ||
| feature_rank_for_search=np.asarray(encoded_dataset.features), | ||
| dummy_model=dummy_model, scorer=scorer) | ||
| dummy_model=dummy_model, scorer=scorer, | ||
| multiple_segments_per_feature=self.multiple_segments_per_feature) | ||
| if len(weak_segments) == 0: | ||
| raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak ' | ||
| f'segments. Try increasing n_samples or supply more {self.segment_by}.') | ||
| display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments.'\ | ||
| f'Try supplying additional {self.segment_by}.' | ||
| return CheckResult(value={'message': display_msg}, display=[display_msg]) | ||
@@ -172,2 +181,5 @@ if context.with_display: | ||
| In each categorical column, categories with frequency below threshold will be merged into "Other" category. | ||
| multiple_segments_per_property : bool , default: False | ||
| If True, will allow the same property to be a segmenting feature in multiple segments, | ||
| otherwise each property can appear in one segment at most. | ||
| """ | ||
@@ -178,3 +190,3 @@ | ||
| ignore_properties: Union[Hashable, List[Hashable], None] = None, | ||
| n_top_properties: Optional[int] = 15, | ||
| n_top_properties: Optional[int] = 10, | ||
| segment_minimum_size_ratio: float = 0.05, | ||
@@ -186,2 +198,3 @@ alternative_scorer: Dict[str, Union[str, Callable]] = None, | ||
| n_to_show: int = 3, | ||
| multiple_segments_per_property: bool = False, | ||
| **kwargs): | ||
@@ -198,2 +211,3 @@ super().__init__(segment_by='properties', | ||
| categorical_aggregation_threshold=categorical_aggregation_threshold, | ||
| multiple_segments_per_feature=multiple_segments_per_property, | ||
| **kwargs) | ||
@@ -242,2 +256,5 @@ | ||
| In each categorical column, categories with frequency below threshold will be merged into "Other" category. | ||
| multiple_segments_column : bool , default: True | ||
| If True, will allow the same metadata column to be a segmenting column in multiple segments, | ||
| otherwise each metadata column can appear in one segment at most. | ||
| """ | ||
@@ -248,3 +265,3 @@ | ||
| ignore_columns: Union[Hashable, List[Hashable], None] = None, | ||
| n_top_columns: Optional[int] = 15, | ||
| n_top_columns: Optional[int] = 10, | ||
| segment_minimum_size_ratio: float = 0.05, | ||
@@ -256,2 +273,3 @@ alternative_scorer: Dict[str, Union[str, Callable]] = None, | ||
| n_to_show: int = 3, | ||
| multiple_segments_column: bool = True, | ||
| **kwargs): | ||
@@ -268,2 +286,3 @@ super().__init__(segment_by='metadata', | ||
| categorical_aggregation_threshold=categorical_aggregation_threshold, | ||
| multiple_segments_per_feature=multiple_segments_column, | ||
| **kwargs) |
@@ -346,3 +346,5 @@ # ---------------------------------------------------------------------------- | ||
| self._model_classes = self._observed_classes | ||
| get_logger().warning('Could not find model\'s classes, using the observed classes') | ||
| get_logger().warning('Could not find model\'s classes, using the observed classes. ' | ||
| 'In order to make sure the classes used by the model are inferred correctly, ' | ||
| 'please use the model_classes argument') | ||
| return self._model_classes | ||
@@ -349,0 +351,0 @@ |
@@ -26,2 +26,3 @@ # ---------------------------------------------------------------------------- | ||
| from deepchecks.nlp.utils.text import break_to_lines_and_trim | ||
| from deepchecks.nlp.utils.text_data_plot import text_data_describe_plot | ||
| from deepchecks.nlp.utils.text_embeddings import calculate_builtin_embeddings | ||
@@ -415,2 +416,3 @@ from deepchecks.nlp.utils.text_properties import calculate_builtin_properties, get_builtin_properties_types | ||
| include_long_calculation_properties: bool = False, | ||
| ignore_non_english_samples_for_english_properties: bool = True, | ||
| device: t.Optional[str] = None | ||
@@ -431,2 +433,8 @@ ): | ||
| ignored. | ||
| ignore_non_english_samples_for_english_properties : bool, default True | ||
| Whether to ignore samples that are not in English when calculating English properties. If False, samples | ||
| that are not in English will be calculated as well. This parameter is ignored when calculating non-English | ||
| properties. | ||
| English-Only properties WILL NOT work properly on non-English samples, and this parameter should be used | ||
| only when you are sure that all the samples are in English. | ||
| device : int, default None | ||
@@ -443,2 +451,3 @@ The device to use for the calculation. If None, the default device will be used. | ||
| include_long_calculation_properties=include_long_calculation_properties, | ||
| ignore_non_english_samples_for_english_properties=ignore_non_english_samples_for_english_properties, | ||
| device=device | ||
@@ -536,2 +545,10 @@ ) | ||
| @property | ||
| def numerical_properties(self) -> t.List[str]: | ||
| """Return numerical properties names.""" | ||
| if self._properties is not None: | ||
| return [prop for prop in self._properties.columns if prop not in self._cat_properties] | ||
| else: | ||
| return [] | ||
| @property | ||
| def task_type(self) -> t.Optional[TaskType]: | ||
@@ -741,3 +758,57 @@ """Return the task type. | ||
| def describe(self, n_properties_to_show: t.Optional[int] = 4, properties_to_show: t.Optional[t.List[str]] = None, | ||
| max_num_labels_to_show: t.Optional[int] = 5, model_classes: t.Optional[t.List[str]] = None): | ||
| """Provide holistic view of the data. | ||
| Generates the following plots: | ||
| 1. Label distribution | ||
| 2. Statistics about the data such as number of samples, annotation ratio, list of metadata columns, list of | ||
| text properties and so on. | ||
| 3. Property distribution for the text properties defined either by n_properties_to_show or properties_to_show | ||
| parameter. | ||
| Parameters | ||
| ---------- | ||
| n_properties_to_show : int, default: 4 | ||
| Number of properties to consider for generating property distribution graphs. If properties_to_show | ||
| is provided, this value is ignored. | ||
| properties_to_show : List[str], default: None | ||
| List of property names to consider for generating property distribution graphs. If None, all the | ||
| properties are considered. | ||
| max_num_labels_to_show : int, default: 5 | ||
| The threshold to display the maximum number of labels on the label distribution pie chart and | ||
| display rest of the labels under "Others" category. | ||
| model_classes : Optional[List[str]], default: None | ||
| List of classes names to use for multi-label display. Only used if the dataset is multi-label. | ||
| Returns | ||
| ------- | ||
| Displays the Plotly Figure. | ||
| """ | ||
| prop_names = [] | ||
| all_properties_data = pd.DataFrame() | ||
| if self._properties is None and properties_to_show is not None: | ||
| raise DeepchecksValueError('No properties exist!') | ||
| elif self._properties is not None: | ||
| if properties_to_show is not None: | ||
| prop_names = [prop for prop in properties_to_show if prop in self.properties.columns] | ||
| if len(prop_names) != len(properties_to_show): | ||
| raise DeepchecksValueError(f'{set(properties_to_show)-set(prop_names)} ' | ||
| 'properties does not exist in the TextData object') | ||
| else: | ||
| prop_names = list(self.properties.columns)[:n_properties_to_show] | ||
| all_properties_data = self.properties[prop_names] | ||
| fig = text_data_describe_plot(properties=all_properties_data, n_samples=self.n_samples, | ||
| is_multi_label=self.is_multi_label_classification(), task_type=self.task_type, | ||
| categorical_metadata=self.categorical_metadata, | ||
| numerical_metadata=self.numerical_metadata, | ||
| categorical_properties=self.categorical_properties, | ||
| numerical_properties=self.numerical_properties, label=self._label, | ||
| model_classes=model_classes, | ||
| max_num_labels_to_show=max_num_labels_to_show) | ||
| return fig | ||
| @contextlib.contextmanager | ||
@@ -744,0 +815,0 @@ def disable_deepchecks_logger(): |
@@ -189,4 +189,7 @@ # ---------------------------------------------------------------------------- | ||
| text_embedding = np.average(text_embeddings, axis=0, weights=text_lens) | ||
| text_embedding = text_embedding / np.linalg.norm(text_embedding) # normalizes length to 1 | ||
| if sum(text_lens) == 0: | ||
| text_embedding = np.ones((EMBEDDING_DIM, )) * np.nan | ||
| else: | ||
| text_embedding = np.average(text_embeddings, axis=0, weights=text_lens) | ||
| text_embedding = text_embedding / np.linalg.norm(text_embedding) # normalizes length to 1 | ||
| result_embeddings.append(text_embedding.tolist()) | ||
@@ -193,0 +196,0 @@ |
@@ -13,3 +13,2 @@ # ---------------------------------------------------------------------------- | ||
| import gc | ||
| import importlib | ||
| import pathlib | ||
@@ -19,2 +18,3 @@ import re | ||
| import warnings | ||
| from collections import defaultdict | ||
| from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union | ||
@@ -24,3 +24,2 @@ | ||
| import pandas as pd | ||
| import requests | ||
| import textblob | ||
@@ -30,2 +29,3 @@ from nltk import corpus | ||
| from nltk import sent_tokenize, word_tokenize | ||
| from tqdm import tqdm | ||
| from typing_extensions import TypedDict | ||
@@ -35,5 +35,5 @@ | ||
| from deepchecks.nlp.utils.text import cut_string, hash_text, normalize_text, remove_punctuation | ||
| from deepchecks.nlp.utils.text_properties_models import get_cmudict_dict, get_fasttext_model, get_transformer_pipeline | ||
| from deepchecks.utils.function import run_available_kwargs | ||
| from deepchecks.utils.ipython import create_progress_bar | ||
| from deepchecks.utils.strings import format_list, truncate_string | ||
| from deepchecks.utils.strings import SPECIAL_CHARACTERS, format_list | ||
@@ -44,6 +44,8 @@ __all__ = ['calculate_builtin_properties', 'get_builtin_properties_types'] | ||
| MODELS_STORAGE = pathlib.Path(__file__).absolute().parent / '.nlp-models' | ||
| FASTTEXT_LANG_MODEL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin' | ||
| DEFAULT_SENTENCE_SAMPLE_SIZE = 300 | ||
| MAX_CHARS = 512 # Bert accepts max of 512 tokens, so without counting tokens we go for the lower bound. | ||
| # all SPECIAL_CHARACTERS - all string.punctuation except for <>@[]^_`{|}~ - all whitespace | ||
| NON_PUNCTUATION_SPECIAL_CHARS = frozenset(set(SPECIAL_CHARACTERS) - set(r"""!"#$%&'()*+,-./:;=?\@""") | ||
| - set(string.whitespace)) | ||
| textblob_cache = {} | ||
@@ -107,27 +109,2 @@ words_cache = {} | ||
| def _import_optional_property_dependency( | ||
| module: str, | ||
| property_name: str, | ||
| package_name: Optional[str] = None, | ||
| error_template: Optional[str] = None | ||
| ): | ||
| try: | ||
| lib = importlib.import_module(module) | ||
| except ImportError as error: | ||
| package_name = package_name or module.split('.', maxsplit=1)[0] | ||
| error_template = error_template or ( | ||
| 'property {property_name} requires the {package_name} python package. ' | ||
| 'To get it, run:\n' | ||
| '>> pip install {package_name}\n\n' | ||
| 'You may install dependencies for all text properties by running:\n' | ||
| '>> pip install deepchecks[nlp-properties]\n' | ||
| ) | ||
| raise ImportError(error_template.format( | ||
| property_name=property_name, | ||
| package_name=package_name | ||
| )) from error | ||
| else: | ||
| return lib | ||
| def _warn_if_missing_nltk_dependencies(dependency: str, property_name: str): | ||
@@ -139,123 +116,2 @@ """Warn if NLTK dependency is missing.""" | ||
| def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None): | ||
| """Get the models storage directory and create it if needed.""" | ||
| if models_storage is None: | ||
| models_storage = MODELS_STORAGE | ||
| else: | ||
| if isinstance(models_storage, str): | ||
| models_storage = pathlib.Path(models_storage) | ||
| if not isinstance(models_storage, pathlib.Path): | ||
| raise ValueError( | ||
| f'Unexpected type of the "models_storage" parameter - {type(models_storage)}' | ||
| ) | ||
| if not models_storage.exists(): | ||
| models_storage.mkdir(parents=True) | ||
| if not models_storage.is_dir(): | ||
| raise ValueError('"model_storage" expected to be a directory') | ||
| return models_storage | ||
| def get_transformer_model( | ||
| property_name: str, | ||
| model_name: str, | ||
| device: Optional[str] = None, | ||
| quantize_model: bool = False, | ||
| models_storage: Union[pathlib.Path, str, None] = None | ||
| ): | ||
| """Get the transformer model and decide if to use optimum.onnxruntime. | ||
| optimum.onnxruntime is used to optimize running times on CPU. | ||
| """ | ||
| models_storage = get_create_model_storage(models_storage) | ||
| if device not in (None, 'cpu'): | ||
| transformers = _import_optional_property_dependency('transformers', property_name=property_name) | ||
| # TODO: quantize if 'quantize_model' is True | ||
| return transformers.AutoModelForSequenceClassification.from_pretrained( | ||
| model_name, | ||
| cache_dir=models_storage | ||
| ) | ||
| onnx = _import_optional_property_dependency( | ||
| 'optimum.onnxruntime', | ||
| property_name=property_name, | ||
| error_template=( | ||
| f'The device was set to {device} while computing the {property_name} property,' | ||
| 'in which case deepchecks resorts to accelerating the inference by using optimum,' | ||
| 'bit it is not installed. Either:\n' | ||
| '\t- Set the device according to your hardware;\n' | ||
| '\t- Install optimum by running "pip install optimum";\n' | ||
| '\t- Install all dependencies needed for text properties by running ' | ||
| '"pip install deepchecks[nlp-properties]";\n' | ||
| ) | ||
| ) | ||
| if quantize_model is False: | ||
| model_path = models_storage / 'onnx' / model_name | ||
| if model_path.exists(): | ||
| return onnx.ORTModelForSequenceClassification.from_pretrained(model_path) | ||
| model = onnx.ORTModelForSequenceClassification.from_pretrained( | ||
| model_name, | ||
| export=True, | ||
| cache_dir=models_storage | ||
| ) | ||
| # NOTE: | ||
| # 'optimum', after exporting/converting a model to the ONNX format, | ||
| # does not store it onto disk we need to save it now to not reconvert | ||
| # it each time | ||
| model.save_pretrained(model_path) | ||
| return model | ||
| model_path = models_storage / 'onnx' / 'quantized' / model_name | ||
| if model_path.exists(): | ||
| return onnx.ORTModelForSequenceClassification.from_pretrained(model_path) | ||
| not_quantized_model = get_transformer_model( | ||
| property_name, | ||
| model_name, | ||
| device, | ||
| quantize_model=False, | ||
| models_storage=models_storage | ||
| ) | ||
| quantizer = onnx.ORTQuantizer.from_pretrained(not_quantized_model) | ||
| quantizer.quantize( | ||
| save_dir=model_path, | ||
| # TODO: make it possible to provide a config as a parameter | ||
| quantization_config=onnx.configuration.AutoQuantizationConfig.avx512_vnni( | ||
| is_static=False, | ||
| per_channel=False | ||
| ) | ||
| ) | ||
| return onnx.ORTModelForSequenceClassification.from_pretrained(model_path) | ||
| def get_transformer_pipeline( | ||
| property_name: str, | ||
| model_name: str, | ||
| device: Optional[str] = None, | ||
| models_storage: Union[pathlib.Path, str, None] = None | ||
| ): | ||
| """Return a transformers pipeline for the given model name.""" | ||
| transformers = _import_optional_property_dependency('transformers', property_name=property_name) | ||
| tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) | ||
| model = get_transformer_model( | ||
| property_name=property_name, | ||
| model_name=model_name, | ||
| device=device, | ||
| models_storage=models_storage | ||
| ) | ||
| return transformers.pipeline( | ||
| 'text-classification', | ||
| model=model, | ||
| tokenizer=tokenizer, | ||
| device=device | ||
| ) | ||
| def text_length(text: str) -> int: | ||
@@ -274,2 +130,7 @@ """Return text length.""" | ||
| """Return percentage of special characters (as float between 0 and 1).""" | ||
| return len([c for c in text if c in NON_PUNCTUATION_SPECIAL_CHARS]) / len(text) if len(text) != 0 else 0 | ||
| def percentage_punctuation(text: str) -> float: | ||
| """Return percentage of punctuation (as float between 0 and 1).""" | ||
| return len([c for c in text if c in string.punctuation]) / len(text) if len(text) != 0 else 0 | ||
@@ -284,32 +145,2 @@ | ||
| def _get_fasttext_model(models_storage: Union[pathlib.Path, str, None] = None): | ||
| """Return fasttext model.""" | ||
| fasttext = _import_optional_property_dependency(module='fasttext', property_name='language') | ||
| model_name = FASTTEXT_LANG_MODEL.rsplit('/', maxsplit=1)[-1] | ||
| model_path = get_create_model_storage(models_storage) | ||
| model_path = model_path / 'fasttext' | ||
| if not model_path.exists(): | ||
| model_path.mkdir(parents=True) | ||
| model_path = model_path / model_name | ||
| # Save the model to a file | ||
| if not model_path.exists(): | ||
| response = requests.get(FASTTEXT_LANG_MODEL, timeout=240) | ||
| if response.status_code != 200: | ||
| raise RuntimeError('Failed to donwload fasttext model') | ||
| model_path.write_bytes(response.content) | ||
| # This weird code is to suppress a warning from fasttext about a deprecated function | ||
| try: | ||
| fasttext.FastText.eprint = lambda *args, **kwargs: None | ||
| fasttext_model = fasttext.load_model(str(model_path)) | ||
| except Exception as exp: | ||
| raise exp | ||
| return fasttext_model | ||
| def language( | ||
@@ -323,5 +154,6 @@ text: str, | ||
| return None | ||
| # Not recommended, takes a long time. Here only to enable to call this function from outside: | ||
| # Load the model if it wasn't received as a parameter. This is done to avoid loading the model | ||
| # each time the function is called. | ||
| if fasttext_model is None: | ||
| fasttext_model = _get_fasttext_model() | ||
| fasttext_model = get_fasttext_model() | ||
@@ -335,2 +167,16 @@ # Predictions are the first prediction (k=1), only if the probability is above the threshold | ||
| def english_text( | ||
| text: str, | ||
| lang_certainty_threshold: float = 0.8, | ||
| fasttext_model: Optional[Dict[object, Any]] = None, | ||
| language_property_result: Optional[str] = None | ||
| ) -> Union[bool, None]: | ||
| """Return whether text is in English or not.""" | ||
| if not text: | ||
| return None | ||
| if language_property_result is None: | ||
| language_property_result = language(text, lang_certainty_threshold, fasttext_model) | ||
| return language_property_result == 'en' | ||
| def sentiment(text: str) -> float: | ||
@@ -358,7 +204,12 @@ """Return float representing sentiment.""" | ||
| def _predict(text: str, classifier, kind: str) -> float: | ||
| def predict_on_batch(text_batch: Sequence[str], classifier, | ||
| output_formatter: Callable[[Dict[str, Any]], float]) -> Sequence[float]: | ||
| """Return prediction of huggingface Pipeline classifier.""" | ||
| try: | ||
| # TODO: make this way smarter, and not just a hack. Count tokens, for a start. Then not just sample sentences. | ||
| # If text is longer than classifier context window, sample it: | ||
| # TODO: make this way smarter, and not just a hack. Count tokens, for a start. Then not just sample sentences. | ||
| # If text is longer than classifier context window, sample it: | ||
| text_list_to_predict = [] | ||
| reduced_batch_size = len(text_batch) # Initialize the reduced batch size | ||
| retry_count = 0 | ||
| for text in text_batch: | ||
| if len(text) > MAX_CHARS: | ||
@@ -375,26 +226,33 @@ sentences = _sample_for_property(text, mode='sentences', limit=10, return_as_list=True) | ||
| text_to_use = cut_string(sentences[0], MAX_CHARS) | ||
| text = text_to_use | ||
| v = classifier(text) | ||
| except Exception: # pylint: disable=broad-except | ||
| return np.nan | ||
| else: | ||
| if not v: | ||
| return np.nan | ||
| v = v[0] | ||
| if kind == 'toxicity': | ||
| return v['score'] | ||
| elif kind == 'fluency': | ||
| label_value = 'LABEL_1' | ||
| elif kind == 'formality': | ||
| label_value = 'formal' | ||
| text_list_to_predict.append(text_to_use) | ||
| else: | ||
| raise ValueError('Unsupported value for "kind" parameter') | ||
| return ( | ||
| v['score'] | ||
| if v['label'] == label_value | ||
| else 1 - v['score'] | ||
| ) | ||
| text_list_to_predict.append(text) | ||
| while reduced_batch_size >= 1: | ||
| try: | ||
| if reduced_batch_size == 1 or retry_count == 3: | ||
| results = [] | ||
| for text in text_list_to_predict: | ||
| try: | ||
| v = classifier(text)[0] | ||
| results.append(output_formatter(v)) | ||
| except Exception: # pylint: disable=broad-except | ||
| results.append(np.nan) | ||
| return results # Return the results if prediction is successful | ||
| v_list = classifier(text_list_to_predict, batch_size=reduced_batch_size) | ||
| results = [] | ||
| for v in v_list: | ||
| results.append(output_formatter(v)) | ||
| return results # Return the results if prediction is successful | ||
| except Exception: # pylint: disable=broad-except | ||
| reduced_batch_size = max(reduced_batch_size // 2, 1) # Reduce the batch size by half | ||
| retry_count += 1 | ||
| return [np.nan] * len(text_batch) # Prediction failed, return NaN values for the original batch size | ||
| TOXICITY_MODEL_NAME = 'unitary/toxic-bert' | ||
@@ -406,7 +264,7 @@ FLUENCY_MODEL_NAME = 'prithivida/parrot_fluency_model' | ||
| def toxicity( | ||
| text: str, | ||
| text_batch: Sequence[str], | ||
| device: Optional[str] = None, | ||
| models_storage: Union[pathlib.Path, str, None] = None, | ||
| toxicity_classifier: Optional[object] = None | ||
| ) -> float: | ||
| ) -> Sequence[float]: | ||
| """Return float representing toxicity.""" | ||
@@ -416,11 +274,15 @@ if toxicity_classifier is None: | ||
| property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, models_storage=models_storage) | ||
| return _predict(text, toxicity_classifier, 'toxicity') | ||
| def output_formatter(v): | ||
| return v['score'] | ||
| return predict_on_batch(text_batch, toxicity_classifier, output_formatter) | ||
| def fluency( | ||
| text: str, | ||
| text_batch: Sequence[str], | ||
| device: Optional[str] = None, | ||
| models_storage: Union[pathlib.Path, str, None] = None, | ||
| fluency_classifier: Optional[object] = None | ||
| ) -> float: | ||
| ) -> Sequence[float]: | ||
| """Return float representing fluency.""" | ||
@@ -430,11 +292,15 @@ if fluency_classifier is None: | ||
| property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, models_storage=models_storage) | ||
| return _predict(text, fluency_classifier, 'fluency') | ||
| def output_formatter(v): | ||
| return v['score'] if v['label'] == 'LABEL_1' else 1 - v['score'] | ||
| return predict_on_batch(text_batch, fluency_classifier, output_formatter) | ||
| def formality( | ||
| text: str, | ||
| text_batch: Sequence[str], | ||
| device: Optional[str] = None, | ||
| models_storage: Union[pathlib.Path, str, None] = None, | ||
| formality_classifier: Optional[object] = None | ||
| ) -> float: | ||
| ) -> Sequence[float]: | ||
| """Return float representing formality.""" | ||
@@ -444,5 +310,9 @@ if formality_classifier is None: | ||
| property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, models_storage=models_storage) | ||
| return _predict(text, formality_classifier, 'formality') | ||
| def output_formatter(v): | ||
| return v['score'] if v['label'] == 'formal' else 1 - v['score'] | ||
| return predict_on_batch(text_batch, formality_classifier, output_formatter) | ||
| def lexical_density(text: str) -> float: | ||
@@ -467,3 +337,3 @@ """Return a float representing lexical density. | ||
| def unique_noun_count(text: str) -> int: | ||
| def unique_noun_count(text: Sequence[str]) -> int: | ||
| """Return the number of unique noun words in the text.""" | ||
@@ -491,3 +361,3 @@ if pd.isna(text): | ||
| if not nltk_download('cmudict', quiet=True): | ||
| _warn_if_missing_nltk_dependencies('cmudict', 'Readability Score') | ||
| _warn_if_missing_nltk_dependencies('cmudict', 'Reading Ease') | ||
| return np.nan | ||
@@ -622,2 +492,18 @@ cmudict_dict = corpus.cmudict.dict() | ||
| def _batch_wrapper(text_batch: Sequence[str], func: Callable, **kwargs) -> List[Any]: | ||
| """Wrap the non-batched properties execution with batches API.""" | ||
| results = [] | ||
| language_property_result = [] | ||
| if 'language_property_result' in kwargs: | ||
| language_property_result = kwargs.pop('language_property_result') | ||
| language_property_exists = len(language_property_result) > 0 | ||
| for i, text in enumerate(text_batch): | ||
| kwargs['language_property_result'] = language_property_result[i] if language_property_exists else None | ||
| results.append(run_available_kwargs(func, text=text, **kwargs)) | ||
| return results | ||
| class TextProperty(TypedDict): | ||
@@ -629,46 +515,46 @@ name: str | ||
| DEFAULT_PROPERTIES: Tuple[TextProperty, ...] = ( | ||
| {'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'}, | ||
| {'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'}, | ||
| {'name': 'Max Word Length', 'method': max_word_length, 'output_type': 'numeric'}, | ||
| {'name': '% Special Characters', 'method': percentage_special_characters, 'output_type': 'numeric'}, | ||
| {'name': 'Language', 'method': language, 'output_type': 'categorical'}, | ||
| {'name': 'Sentiment', 'method': sentiment, 'output_type': 'numeric'}, | ||
| {'name': 'Subjectivity', 'method': subjectivity, 'output_type': 'numeric'}, | ||
| {'name': 'Average Words Per Sentence', 'method': average_words_per_sentence, 'output_type': 'numeric'}, | ||
| {'name': 'Readability Score', 'method': readability_score, 'output_type': 'numeric'}, | ||
| {'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'}, | ||
| {'name': 'Toxicity', 'method': toxicity, 'output_type': 'numeric'}, | ||
| {'name': 'Fluency', 'method': fluency, 'output_type': 'numeric'}, | ||
| {'name': 'Formality', 'method': formality, 'output_type': 'numeric'}, | ||
| {'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'}, | ||
| ) | ||
| DEFAULT_PROPERTIES: Tuple[TextProperty, ...] = \ | ||
| ( | ||
| {'name': 'Text Length', 'method': text_length, 'output_type': 'numeric'}, | ||
| {'name': 'Average Word Length', 'method': average_word_length, 'output_type': 'numeric'}, | ||
| {'name': 'Max Word Length', 'method': max_word_length, 'output_type': 'numeric'}, | ||
| {'name': '% Special Characters', 'method': percentage_special_characters, 'output_type': 'numeric'}, | ||
| {'name': '% Punctuation', 'method': percentage_punctuation, 'output_type': 'numeric'}, | ||
| {'name': 'Language', 'method': language, 'output_type': 'categorical'}, | ||
| {'name': 'Sentiment', 'method': sentiment, 'output_type': 'numeric'}, | ||
| {'name': 'Subjectivity', 'method': subjectivity, 'output_type': 'numeric'}, | ||
| {'name': 'Average Words Per Sentence', 'method': average_words_per_sentence, 'output_type': 'numeric'}, | ||
| {'name': 'Reading Ease', 'method': readability_score, 'output_type': 'numeric'}, | ||
| {'name': 'Lexical Density', 'method': lexical_density, 'output_type': 'numeric'}, | ||
| {'name': 'Toxicity', 'method': toxicity, 'output_type': 'numeric'}, | ||
| {'name': 'Fluency', 'method': fluency, 'output_type': 'numeric'}, | ||
| {'name': 'Formality', 'method': formality, 'output_type': 'numeric'}, | ||
| {'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'}, | ||
| ) | ||
| ALL_PROPERTIES: Tuple[TextProperty, ...] = ( | ||
| {'name': 'URLs Count', 'method': urls_count, 'output_type': 'numeric'}, | ||
| {'name': 'Email Addresses Count', 'method': email_addresses_count, | ||
| 'output_type': 'numeric'}, | ||
| {'name': 'Unique URLs Count', 'method': unique_urls_count, | ||
| 'output_type': 'numeric'}, | ||
| {'name': 'Unique Email Addresses Count', | ||
| 'method': unique_email_addresses_count, 'output_type': 'numeric'}, | ||
| {'name': 'Unique Syllables Count', 'method': unique_syllables_count, | ||
| 'output_type': 'numeric'}, | ||
| {'name': 'Reading Time', 'method': reading_time, | ||
| 'output_type': 'numeric'}, | ||
| {'name': 'Sentences Count', 'method': sentences_count, | ||
| 'output_type': 'numeric'}, | ||
| {'name': 'Average Syllable Length', 'method': average_syllable_length, | ||
| 'output_type': 'numeric'}, | ||
| ) + DEFAULT_PROPERTIES | ||
| ALL_PROPERTIES: Tuple[TextProperty, ...] = \ | ||
| ( | ||
| {'name': 'English Text', 'method': english_text, 'output_type': 'categorical'}, | ||
| {'name': 'URLs Count', 'method': urls_count, 'output_type': 'numeric'}, | ||
| {'name': 'Email Addresses Count', 'method': email_addresses_count, 'output_type': 'numeric'}, | ||
| {'name': 'Unique URLs Count', 'method': unique_urls_count, 'output_type': 'numeric'}, | ||
| {'name': 'Unique Email Addresses Count', 'method': unique_email_addresses_count, 'output_type': 'numeric'}, | ||
| {'name': 'Unique Syllables Count', 'method': unique_syllables_count, 'output_type': 'numeric'}, | ||
| {'name': 'Reading Time', 'method': reading_time, 'output_type': 'numeric'}, | ||
| {'name': 'Sentences Count', 'method': sentences_count, 'output_type': 'numeric'}, | ||
| {'name': 'Average Syllable Length', 'method': average_syllable_length, 'output_type': 'numeric'}, | ||
| ) + DEFAULT_PROPERTIES | ||
| LONG_RUN_PROPERTIES = ('Toxicity', 'Fluency', 'Formality', 'Unique Noun Count') | ||
| BATCH_PROPERTIES = ('Toxicity', 'Fluency', 'Formality') | ||
| LARGE_SAMPLE_SIZE = 10_000 | ||
| ENGLISH_ONLY_PROPERTIES = ( | ||
| 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Readability Score', | ||
| 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Reading Ease', | ||
| 'Unique Noun Count', 'Unique Syllables Count', 'Sentences Count', 'Average Syllable Length' | ||
| ) | ||
| CMUDICT_PROPERTIES = ('Average Syllable Length', 'Unique Syllables Count', 'Readability Score') | ||
| CMUDICT_PROPERTIES = ('Average Syllable Length', 'Unique Syllables Count', 'Reading Ease') | ||
@@ -679,12 +565,21 @@ TEXT_PROPERTIES_DESCRIPTION = { | ||
| 'Max Word Length': 'Maximum number of characters in a word', | ||
| '% Special Characters': 'Percentage of special characters in the text', | ||
| '% Special Characters': 'Percentage of special characters in the text. Special characters are non-alphanumeric ' | ||
| 'unicode characters, excluding whitespaces and any of !\"#$%&\'()*+,-./:;=?\\@.', | ||
| '% Punctuation': 'Percentage of punctuation characters in the text. Punctuation characters are any of ' | ||
| '!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~', | ||
| 'Language': 'Language of the text, using the fasttext language detection model', | ||
| 'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model', | ||
| 'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model', | ||
| 'Sentiment': 'Sentiment of the text, calculated using the TextBlob sentiment analysis model.' | ||
| ' Ranging from -1 (negative) to 1 (positive)', | ||
| 'Subjectivity': 'Subjectivity of the text, calculated using the TextBlob sentiment analysis model. Ranging from 0 ' | ||
| '(objective) to 1 (subjective)', | ||
| 'Average Words Per Sentence': 'Average number of words per sentence in the text', | ||
| 'Readability Score': 'A score calculated based on Flesch reading-ease per text sample', | ||
| 'Reading Ease': 'How easy to read a text sample is, typically ranges from around 0 (hard to read) to around ' | ||
| '100 (very easy). Based on Flesch reading-ease score', | ||
| 'Lexical Density': 'Percentage of unique words in the text', | ||
| 'Toxicity': 'Toxicity score using unitary/toxic-bert HuggingFace model', | ||
| 'Fluency': 'Fluency score using prithivida/parrot_fluency_model HuggingFace model', | ||
| 'Formality': 'Formality score using s-nlp/roberta-base-formality-ranker HuggingFace model', | ||
| 'Toxicity': 'A measure of how harmful or offensive a text sample is (0 to 1), uses the Detoxify library ' | ||
| 'unitary/toxic-bert model', | ||
| 'Fluency': 'A measure of the fluency of the text (0 to 1), using the prithivida/parrot_fluency_model' | ||
| ' model from the authors of the Parrot Paraphraser library', | ||
| 'Formality': 'The formality / register of the text (0 to 1), using the s-nlp/roberta-base-formality-ranker' | ||
| ' model by the Skolkovo Institute of Science and Technology', | ||
| 'Unique Noun Count': 'Number of unique noun words in the text', | ||
@@ -694,2 +589,3 @@ 'URLs Count': 'Number of URLS per text sample', | ||
| 'Unique URLs Count': 'Number of unique URLS per text sample', | ||
| 'English Text': 'Whether the text is in English (1) or not (0)', | ||
| 'Unique Email Addresses Count': 'Number of unique email addresses per text sample', | ||
@@ -778,4 +674,7 @@ 'Unique Syllables Count': 'Number of unique syllables per text sample', | ||
| include_long_calculation_properties: bool = False, | ||
| ignore_non_english_samples_for_english_properties: bool = True, | ||
| device: Optional[str] = None, | ||
| models_storage: Union[pathlib.Path, str, None] = None | ||
| models_storage: Union[pathlib.Path, str, None] = None, | ||
| batch_size: Optional[int] = 16, | ||
| cache_models: bool = False | ||
| ) -> Tuple[Dict[str, List[float]], Dict[str, str]]: | ||
@@ -791,12 +690,12 @@ """Calculate properties on provided text samples. | ||
| together with ignore_properties parameter. Available properties are: | ||
| ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', 'Language', | ||
| ['Text Length', 'Average Word Length', 'Max Word Length', '% Special Characters', '% Punctuation', 'Language', | ||
| 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count', | ||
| 'Readability Score', 'Average Words Per Sentence', 'URLs Count', Unique URLs Count', 'Email Address Count', | ||
| 'Reading Ease', 'Average Words Per Sentence', 'URLs Count', 'Unique URLs Count', 'Email Address Count', | ||
| 'Unique Email Address Count', 'Unique Syllables Count', 'Reading Time', 'Sentences Count', | ||
| 'Average Syllable Length'] | ||
| List of default properties are: ['Text Length', 'Average Word Length', 'Max Word Length', | ||
| '% Special Characters', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', | ||
| 'Lexical Density', 'Unique Noun Count', 'Readability Score', 'Average Words Per Sentence'] | ||
| '% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', | ||
| 'Formality', 'Lexical Density', 'Unique Noun Count', 'Reading Ease', 'Average Words Per Sentence'] | ||
| To calculate all the default properties, the include_properties and ignore_properties parameters should | ||
| be None. If you pass either include_properties or ignore_properties then the only the properties specified | ||
| be None. If you pass either include_properties or ignore_properties then only the properties specified | ||
| in the list will be calculated or ignored. | ||
@@ -812,2 +711,8 @@ Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may | ||
| ignored, unless they are specified in the include_properties parameter explicitly. | ||
| ignore_non_english_samples_for_english_properties : bool, default True | ||
| Whether to ignore samples that are not in English when calculating English properties. If False, samples | ||
| that are not in English will be calculated as well. This parameter is ignored when calculating non-English | ||
| properties. | ||
| English-Only properties WILL NOT work properly on non-English samples, and this parameter should be used | ||
| only when you are sure that all the samples are in English. | ||
| device : str, default None | ||
@@ -819,2 +724,6 @@ The device to use for the calculation. If None, the default device will be used. | ||
| Also, if a folder already contains relevant resources they are not re-downloaded. | ||
| batch_size : int, default 16 | ||
| The batch size. | ||
| cache_models : bool, default False | ||
| Whether to cache the models used in this function, to save load time on the next execution. | ||
@@ -847,3 +756,3 @@ Returns | ||
| if 'fasttext_model' not in kwargs: | ||
| kwargs['fasttext_model'] = _get_fasttext_model(models_storage=models_storage) | ||
| kwargs['fasttext_model'] = get_fasttext_model(models_storage=models_storage, use_cache=cache_models) | ||
@@ -857,16 +766,18 @@ if 'cmudict_dict' not in kwargs: | ||
| calculated_properties[prop] = [np.nan] * len(raw_text) | ||
| cmudict_dict = corpus.cmudict.dict() | ||
| kwargs['cmudict_dict'] = cmudict_dict | ||
| kwargs['cmudict_dict'] = get_cmudict_dict(use_cache=cache_models) | ||
| if 'Toxicity' in text_properties_names and 'toxicity_classifier' not in kwargs: | ||
| kwargs['toxicity_classifier'] = get_transformer_pipeline( | ||
| property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, models_storage=models_storage) | ||
| property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, | ||
| models_storage=models_storage, use_cache=cache_models) | ||
| if 'Formality' in text_properties_names and 'formality_classifier' not in kwargs: | ||
| kwargs['formality_classifier'] = get_transformer_pipeline( | ||
| property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, models_storage=models_storage) | ||
| property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, | ||
| models_storage=models_storage, use_cache=cache_models) | ||
| if 'Fluency' in text_properties_names and 'fluency_classifier' not in kwargs: | ||
| kwargs['fluency_classifier'] = get_transformer_pipeline( | ||
| property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, models_storage=models_storage) | ||
| property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, | ||
| models_storage=models_storage, use_cache=cache_models) | ||
@@ -885,34 +796,52 @@ is_language_property_requested = 'Language' in [prop['name'] for prop in text_properties] | ||
| progress_bar = create_progress_bar( | ||
| iterable=list(raw_text), | ||
| name='Text Samples Calculation', | ||
| unit='Text Sample' | ||
| ) | ||
| for text in progress_bar: | ||
| progress_bar.set_postfix( | ||
| {'Sample': truncate_string(text, max_length=20) if text else 'EMPTY STRING'}, | ||
| refresh=False | ||
| ) | ||
| if pd.isna(text): | ||
| for prop in text_properties: | ||
| calculated_properties[prop['name']].append(np.nan) | ||
| continue | ||
| sample_language = run_available_kwargs(language, text=text, **kwargs) | ||
| for i in tqdm(range(0, len(raw_text), batch_size)): | ||
| batch = raw_text[i:i + batch_size] | ||
| batch_properties = defaultdict(list) | ||
| # filtering out empty sequences | ||
| nan_indices = {i for i, seq in enumerate(batch) if pd.isna(seq) is True} | ||
| filtered_sequences = [e for i, e in enumerate(batch) if i not in nan_indices] | ||
| samples_language = _batch_wrapper(text_batch=filtered_sequences, func=language, **kwargs) | ||
| if is_language_property_requested: | ||
| calculated_properties['Language'].append(sample_language) | ||
| batch_properties['Language'].extend(samples_language) | ||
| calculated_properties['Language'].extend(samples_language) | ||
| kwargs['language_property_result'] = samples_language # Pass the language property to other properties | ||
| non_english_indices = set() | ||
| if ignore_non_english_samples_for_english_properties: | ||
| non_english_indices = {i for i, (seq, lang) in enumerate(zip(filtered_sequences, samples_language)) | ||
| if lang != 'en'} | ||
| for prop in text_properties: | ||
| if prop['name'] in import_warnings: # Skip properties that failed to import: | ||
| calculated_properties[prop['name']].append(np.nan) | ||
| elif sample_language != 'en' and prop['name'] in english_properties_names: | ||
| calculated_properties[prop['name']].append(np.nan) | ||
| batch_properties[prop['name']].extend([np.nan] * len(batch)) | ||
| else: | ||
| if prop['name'] in english_properties_names \ | ||
| and ignore_non_english_samples_for_english_properties is True: | ||
| filtered_sequences = [e for i, e in enumerate(filtered_sequences) if i not in non_english_indices] | ||
| kwargs['batch_size'] = batch_size | ||
| try: | ||
| value = run_available_kwargs(prop['method'], text=text, **kwargs) | ||
| calculated_properties[prop['name']].append(value) | ||
| if prop['name'] in BATCH_PROPERTIES: | ||
| value = run_available_kwargs(func=prop['method'], text_batch=filtered_sequences, **kwargs) | ||
| else: | ||
| value = _batch_wrapper(text_batch=filtered_sequences, func=prop['method'], **kwargs) | ||
| batch_properties[prop['name']].extend(value) | ||
| except ImportError as e: | ||
| warnings.warn(warning_message.format(prop['name'], str(e))) | ||
| calculated_properties[prop['name']].append(np.nan) | ||
| batch_properties[prop['name']].extend([np.nan] * len(batch)) | ||
| import_warnings.add(prop['name']) | ||
| result_index = 0 | ||
| for index, seq in enumerate(batch): | ||
| if index in nan_indices or (index in non_english_indices and | ||
| ignore_non_english_samples_for_english_properties and | ||
| prop['name'] in english_properties_names): | ||
| calculated_properties[prop['name']].append(np.nan) | ||
| else: | ||
| calculated_properties[prop['name']].append(batch_properties[prop['name']][result_index]) | ||
| result_index += 1 | ||
| filtered_sequences = [e for i, e in enumerate(batch) if i not in nan_indices] | ||
| # Clear property caches: | ||
@@ -924,3 +853,4 @@ textblob_cache.clear() | ||
| # Clean all remaining RAM: | ||
| gc.collect() | ||
| if not cache_models: | ||
| gc.collect() | ||
@@ -927,0 +857,0 @@ if not calculated_properties: |
@@ -35,7 +35,9 @@ # ---------------------------------------------------------------------------- | ||
| _UNSUPPORTED_MODEL_ERROR = ( | ||
| 'Check is relevant for Boosting models of type ' | ||
| '{supported_models}, but received model of type {model_type}' | ||
| ) | ||
| _UNSUPPORTED_MODEL_ERROR = \ | ||
| 'Check is relevant for Boosting models of type {supported_models}, but received model of type {model_type}' | ||
| _NO_MODEL_ERROR = \ | ||
| 'Check is relevant only when receiving the model, but predictions/probabilities were received instead. ' \ | ||
| 'In order to use this check, please pass the model to the run() method.' | ||
| _SUPPORTED_CLASSIFICATION_MODELS = ( | ||
@@ -82,2 +84,12 @@ 'AdaBoostClassifier', | ||
| @classmethod | ||
| def _raise_not_supported_model_error(cls, model_class): | ||
| if model_class != '_DummyModel': | ||
| raise ModelValidationError(cls._UNSUPPORTED_MODEL_ERROR.format( | ||
| supported_models=cls._SUPPORTED_MODELS, | ||
| model_type=model_class | ||
| )) | ||
| else: | ||
| raise ModelValidationError(cls._NO_MODEL_ERROR) | ||
| def predict_proba(self, x): | ||
@@ -93,6 +105,3 @@ if self.model_class in ['AdaBoostClassifier', 'GradientBoostingClassifier']: | ||
| else: | ||
| raise ModelValidationError(self._UNSUPPORTED_MODEL_ERROR.format( | ||
| supported_models=self._SUPPORTED_CLASSIFICATION_MODELS, | ||
| model_type=self.model_class | ||
| )) | ||
| self._raise_not_supported_model_error(self.model_class) | ||
@@ -110,6 +119,3 @@ def predict(self, x): | ||
| else: | ||
| raise ModelValidationError(self._UNSUPPORTED_MODEL_ERROR.format( | ||
| supported_models=self._SUPPORTED_MODELS, | ||
| model_type=self.model_class | ||
| )) | ||
| self._raise_not_supported_model_error(self.model_class) | ||
@@ -130,6 +136,3 @@ @classmethod | ||
| else: | ||
| raise ModelValidationError(cls._UNSUPPORTED_MODEL_ERROR.format( | ||
| supported_models=cls._SUPPORTED_MODELS, | ||
| model_type=model_class | ||
| )) | ||
| cls._raise_not_supported_model_error(model_class=model_class) | ||
@@ -136,0 +139,0 @@ |
@@ -374,3 +374,4 @@ # ---------------------------------------------------------------------------- | ||
| Used in classification models to flag if to run condition on average of classes, or on | ||
| each class individually | ||
| each class individually. If any scorer that return a single value is used, this parameter | ||
| is ignored (will act as if average=True). | ||
| """ | ||
@@ -394,4 +395,9 @@ name = f'Model performance gain over simple model is greater than {format_percent(min_allowed_gain)}' | ||
| # If the depth of the nested scores dict is 2, average is not relevant and is set to True | ||
| inner_dict = scores[list(scores.keys())[0]] | ||
| inner_inner_dict = inner_dict[list(inner_dict.keys())[0]] | ||
| force_average = isinstance(inner_inner_dict, Number) | ||
| passed_condition = True | ||
| if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not average: | ||
| if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not average and not force_average: | ||
| passed_metrics = {} | ||
@@ -438,3 +444,3 @@ failed_classes = defaultdict(dict) | ||
| perfect_metrics = [] | ||
| if task_type in [TaskType.MULTICLASS, TaskType.BINARY]: | ||
| if task_type in [TaskType.MULTICLASS, TaskType.BINARY] and not force_average: | ||
| scores = average_scores(scores, include_classes) | ||
@@ -441,0 +447,0 @@ for metric, models_scores in scores.items(): |
@@ -13,3 +13,3 @@ # ---------------------------------------------------------------------------- | ||
| import warnings | ||
| from typing import TYPE_CHECKING, Callable, Dict, List, Union | ||
| from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union | ||
@@ -53,3 +53,3 @@ import numpy as np | ||
| Columns to ignore, if none given checks based on columns variable | ||
| n_top_features : int , default: 5 | ||
| n_top_features : Optional[int] , default: 10 | ||
| Number of features to use for segment search. Top columns are selected based on feature importance. | ||
@@ -78,2 +78,5 @@ segment_minimum_size_ratio: float , default: 0.05 | ||
| random seed for all check internals. | ||
| multiple_segments_per_feature : bool , default: True | ||
| If True, will allow the same feature to be a segmenting feature in multiple segments, | ||
| otherwise each feature can appear in one segment at most. | ||
| """ | ||
@@ -85,3 +88,3 @@ | ||
| ignore_columns: Union[Hashable, List[Hashable], None] = None, | ||
| n_top_features: int = 5, | ||
| n_top_features: Optional[int] = 10, | ||
| segment_minimum_size_ratio: float = 0.05, | ||
@@ -95,2 +98,3 @@ alternative_scorer: Dict[str, Union[str, Callable]] = None, | ||
| random_state: int = 42, | ||
| multiple_segments_per_feature: bool = True, | ||
| **kwargs | ||
@@ -116,2 +120,3 @@ ): | ||
| self.categorical_aggregation_threshold = categorical_aggregation_threshold | ||
| self.multiple_segments_per_feature = multiple_segments_per_feature | ||
@@ -169,3 +174,4 @@ def run_logic(self, context: Context, dataset_kind) -> CheckResult: | ||
| feature_rank_for_search=feature_rank, | ||
| dummy_model=dummy_model, scorer=scorer) | ||
| dummy_model=dummy_model, scorer=scorer, | ||
| multiple_segments_per_feature=self.multiple_segments_per_feature) | ||
@@ -172,0 +178,0 @@ if len(weak_segments) == 0: |
@@ -104,3 +104,6 @@ # ---------------------------------------------------------------------------- | ||
| sample_size = min(self.n_samples, train_dataset.n_samples, test_dataset.n_samples) | ||
| if self.n_samples is None: | ||
| sample_size = min(train_dataset.n_samples, test_dataset.n_samples) | ||
| else: | ||
| sample_size = min(train_dataset.n_samples, test_dataset.n_samples, self.n_samples) | ||
@@ -107,0 +110,0 @@ headnote = """ |
@@ -294,4 +294,7 @@ # ---------------------------------------------------------------------------- | ||
| # If in infer_task_type we didn't find classes on model, or user didn't pass any, then using the observed | ||
| get_logger().warning('Could not find model\'s classes, using the observed classes') | ||
| return self.observed_classes | ||
| get_logger().warning('Could not find model\'s classes, using the observed classes. ' | ||
| 'In order to make sure the classes used by the model are inferred correctly, ' | ||
| 'please use the model_classes argument') | ||
| self._model_classes = self.observed_classes | ||
| return self._model_classes | ||
@@ -298,0 +301,0 @@ |
@@ -34,5 +34,5 @@ # ---------------------------------------------------------------------------- | ||
| if with_display: | ||
| fig = create_confusion_matrix_figure(result, total_classes, normalize_display) | ||
| displays = create_confusion_matrix_figure(result, total_classes, normalize_display) | ||
| else: | ||
| fig = None | ||
| displays = None | ||
@@ -42,3 +42,3 @@ # For accessing the class names from the condition | ||
| return CheckResult(result, display=fig) | ||
| return CheckResult(result, display=displays) | ||
@@ -65,27 +65,44 @@ | ||
| """ | ||
| confusion_matrix_norm = confusion_matrix_data.astype('float') / \ | ||
| (confusion_matrix_data.sum(axis=1)[:, np.newaxis] + np.finfo(float).eps) * 100 | ||
| if normalize_display: | ||
| confusion_matrix_norm = confusion_matrix_data.astype('float') / \ | ||
| (confusion_matrix_data.sum(axis=1)[:, np.newaxis] + np.finfo(float).eps) * 100 | ||
| z = np.vectorize(format_number_if_not_nan)(confusion_matrix_norm) | ||
| text_template = '%{z}%<br>(%{text})' | ||
| color_bar_title = '% out of<br>True Values' | ||
| plot_title = 'Percent Out of True Values (Count)' | ||
| else: | ||
| z = confusion_matrix_data | ||
| color_bar_title = None | ||
| text_template = '%{text}' | ||
| plot_title = 'Value Count' | ||
| fig = go.Figure(data=go.Heatmap( | ||
| x=classes_names, y=classes_names, z=z, | ||
| text=confusion_matrix_data, texttemplate=text_template)) | ||
| fig.data[0].colorbar.title = color_bar_title | ||
| fig.update_layout(title=plot_title) | ||
| accuracy_array = np.diag(confusion_matrix_norm).round(decimals=2) | ||
| display = [] | ||
| display_msg = f'The overall accuracy of your model is: {round(np.sum(accuracy_array)/len(accuracy_array), 2)}%.' | ||
| if min(accuracy_array) < 100: | ||
| display_msg += f'<br>Best accuracy achieved on samples with <b>{classes_names[np.argmax(accuracy_array)]}' \ | ||
| f'</b> label ({np.max(accuracy_array)}%).' | ||
| display_msg += f'<br>Worst accuracy achieved on samples with <b>{classes_names[np.argmin(accuracy_array)]}' \ | ||
| f'</b> label ({np.min(accuracy_array)}%).' | ||
| display.append(display_msg) | ||
| total_samples = np.nansum(confusion_matrix_data) | ||
| percent_data_each_row = np.round(confusion_matrix_norm, decimals=2) | ||
| percent_data_each_cell = np.round(np.divide(np.nan_to_num(confusion_matrix_data, nan=0.0), total_samples) * 100, | ||
| decimals=2) | ||
| percent_data_each_col = (confusion_matrix_data.astype('float') / | ||
| (confusion_matrix_data.sum(axis=0)[:, np.newaxis] + | ||
| np.finfo(float).eps) * 100).round(decimals=2) | ||
| custom_hoverdata = np.dstack((percent_data_each_cell, percent_data_each_row, percent_data_each_col)) | ||
| fig = go.Figure(data=go.Heatmap(x=classes_names, y=classes_names, z=z, customdata=custom_hoverdata, | ||
| xgap=1, ygap=1, text=confusion_matrix_data, texttemplate='%{text}', | ||
| hovertemplate='% out of all data: <b>%{customdata[0]}%</b><br>% out ' | ||
| 'of row: <b>%{customdata[1]}%</b><br>% out of column: ' | ||
| '<b>%{customdata[2]}%</b><extra></extra>', | ||
| showscale=False)) | ||
| fig.update_layout(title='Confusion Matrix (# Samples)', title_x=0.5) | ||
| fig.update_layout(height=600) | ||
| fig.update_xaxes(title='Predicted Value', type='category', scaleanchor='y', constrain='domain') | ||
| fig.update_yaxes(title='True Value', type='category', constrain='domain', autorange='reversed') | ||
| display.append(fig) | ||
| return display | ||
| return fig | ||
| def misclassified_samples_lower_than_condition(value: pd.DataFrame, | ||
@@ -92,0 +109,0 @@ misclassified_samples_threshold: float) -> ConditionResult: |
@@ -84,3 +84,3 @@ # ---------------------------------------------------------------------------- | ||
| with_display=with_display, | ||
| dataset_names=(test_dataframe_name, train_dataframe_name) | ||
| dataset_names=(train_dataframe_name, test_dataframe_name) | ||
| ) | ||
@@ -87,0 +87,0 @@ |
@@ -173,4 +173,4 @@ # ---------------------------------------------------------------------------- | ||
| feature_rank_for_search: Optional[np.ndarray] = None, | ||
| dummy_model: Optional[_DummyModel] = None, | ||
| scorer: Optional[DeepcheckScorer] = None, scorer_name: Optional[str] = None) \ | ||
| dummy_model: Optional[_DummyModel] = None, scorer: Optional[DeepcheckScorer] = None, | ||
| scorer_name: Optional[str] = None, multiple_segments_per_feature: bool = False) \ | ||
| -> pd.DataFrame: | ||
@@ -217,8 +217,21 @@ """Search for weak segments based on scorer.""" | ||
| # Drop duplicates without considering column 'Samples in Segment' | ||
| result_no_duplicates = weak_segments.drop(columns='Samples in Segment').drop_duplicates() | ||
| result_no_duplicates['Samples in Segment'] = weak_segments.loc[result_no_duplicates.index, 'Samples in Segment'] | ||
| # Sort and drop relevant columns | ||
| weak_segments = weak_segments.sort_values(score_title).reset_index(drop=True) | ||
| if multiple_segments_per_feature: | ||
| result = weak_segments.drop(columns='Samples in Segment').drop_duplicates() | ||
| result['Samples in Segment'] = weak_segments.loc[result.index, 'Samples in Segment'] | ||
| else: | ||
| used_features = set() | ||
| result = pd.DataFrame(columns=weak_segments.columns) | ||
| for _, row in weak_segments.iterrows(): | ||
| if row['Feature1'] in used_features or row['Feature2'] in used_features: | ||
| continue | ||
| return result_no_duplicates.sort_values(score_title).reset_index(drop=True) | ||
| result.loc[len(result)] = row | ||
| used_features.add(row['Feature1']) | ||
| if row['Feature2'] != '': | ||
| used_features.add(row['Feature2']) | ||
| return result | ||
| def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series, | ||
@@ -335,2 +348,5 @@ label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None, | ||
| def condition(result: Dict) -> ConditionResult: | ||
| if 'message' in result: | ||
| return ConditionResult(ConditionCategory.PASS, result['message']) | ||
| weakest_segment_score = result['weak_segments_list'].iloc[0, 0] | ||
@@ -337,0 +353,0 @@ scorer_name = result['weak_segments_list'].columns[0].lower() |
@@ -34,2 +34,7 @@ # ---------------------------------------------------------------------------- | ||
| feature_distribution_colors = { | ||
| 'measure': '#00008b', # dark blue | ||
| 'feature': 'rgba(105, 179, 162, 1)' | ||
| } | ||
| common_and_outlier_colors = {'common': 'rgba(105, 179, 162, 1)', | ||
@@ -36,0 +41,0 @@ 'outliers': 'rgba(179, 106, 106, 1)', |
@@ -148,3 +148,3 @@ # ---------------------------------------------------------------------------- | ||
| description.append( | ||
| description.extend( | ||
| create_confusion_matrix_figure(confusion_matrix, x, self.normalized) | ||
@@ -151,0 +151,0 @@ ) |
@@ -69,2 +69,5 @@ # ---------------------------------------------------------------------------- | ||
| For each categorical property, categories with frequency below threshold will be merged into "Other" category. | ||
| multiple_segments_per_property : bool , default: True | ||
| If True, will allow the same property to be a segmenting feature in multiple segments, | ||
| otherwise each property can appear in one segment at most. | ||
| {additional_check_init_params:2*indent} | ||
@@ -80,2 +83,3 @@ """ | ||
| categorical_aggregation_threshold: float = 0.05, | ||
| multiple_segments_per_property: bool = True, | ||
| **kwargs | ||
@@ -91,2 +95,3 @@ ): | ||
| self.categorical_aggregation_threshold = categorical_aggregation_threshold | ||
| self.multiple_segments_per_property = multiple_segments_per_property | ||
| self._properties_results = None | ||
@@ -142,3 +147,4 @@ self._sample_scores = None | ||
| score_per_sample=score_per_sample_col, | ||
| scorer_name=self._scorer_name) | ||
| scorer_name=self._scorer_name, | ||
| multiple_segments_per_feature=self.multiple_segments_per_property) | ||
| if len(weak_segments) == 0: | ||
@@ -145,0 +151,0 @@ raise DeepchecksProcessError('WeakSegmentsPerformance was unable to train an error model to find weak ' |
@@ -228,3 +228,5 @@ # ---------------------------------------------------------------------------- | ||
| while jump_size > 1: | ||
| if font.getsize(text)[0] < desired_width: | ||
| left, _, right, _ = font.getbbox(text) | ||
| width = right - left | ||
| if width < desired_width: | ||
| font_size += jump_size | ||
@@ -270,4 +272,4 @@ else: | ||
| size = (int(image.size[0] * factor), int(image.size[1] * factor)) | ||
| # Resize the image | ||
| image = image.resize(size, pilimage.ANTIALIAS) | ||
| # Resize the image by Image.LANCZOS | ||
| image = image.resize(size, pilimage.LANCZOS) | ||
| else: | ||
@@ -274,0 +276,0 @@ image = ensure_image(image, copy=False) |
+2
-2
| Metadata-Version: 2.1 | ||
| Name: deepchecks | ||
| Version: 0.17.3 | ||
| Version: 0.17.4 | ||
| Summary: Package for validating your machine learning model and data | ||
@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks | ||
| License: UNKNOWN | ||
| Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.3/deepchecks-0.17.3.tar.gz | ||
| Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.4/deepchecks-0.17.4.tar.gz | ||
| Project-URL: Documentation, https://docs.deepchecks.com | ||
@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks |
+2
-2
@@ -111,3 +111,3 @@ <!-- | ||
| pip install deepchecks-installer | ||
| deepchecks-installer monitoring-install | ||
| deepchecks-installer install-monitoring | ||
| ``` | ||
@@ -374,2 +374,2 @@ | ||
| This project follows the [all-contributors](https://allcontributors.org) | ||
| specification. Contributions of any kind are welcome! | ||
| specification. Contributions of any kind are welcome! |
@@ -40,3 +40,5 @@ twine | ||
| requests>=2.23.0 | ||
| scipy>=1.4.1 | ||
| # Remove the <=1.10.1 dependency below once sklearn's issue is fixed. The higher version causes | ||
| # issues with sklearn's _most_frequent() function using scipy's mode() function | ||
| scipy>=1.4.1, <=1.10.1 | ||
| tqdm>=4.41.0 | ||
@@ -43,0 +45,0 @@ seaborn>=0.11.0 |
@@ -1,2 +0,2 @@ | ||
| optimum[onnxruntime]>=1.7.0 | ||
| optimum[onnxruntime]>=1.8.8 | ||
| fasttext>=0.8.0 |
@@ -28,3 +28,5 @@ pandas>=1.1.5 | ||
| statsmodels>=0.13.5; python_version >= '3.7' | ||
| scipy>=1.4.1 | ||
| # Remove the <=1.10.1 dependency below once sklearn's issue is fixed. The higher version causes | ||
| # issues with sklearn's _most_frequent() function using scipy's mode() function | ||
| scipy>=1.4.1, <=1.10.1 | ||
| dataclasses>=0.6; python_version < '3.7' | ||
@@ -31,0 +33,0 @@ plotly>=5.13.1 |
+1
-1
@@ -1,1 +0,1 @@ | ||
| 0.17.3 | ||
| 0.17.4 |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
19882660
0.19%339
0.59%69577
0.83%