Latest Threat Research:SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains.Details
Socket
Book a DemoInstallSign in
Socket

deepchecks

Package Overview
Dependencies
Maintainers
1
Versions
59
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

deepchecks - npm Package Compare versions

Comparing version
0.17.5
to
0.18.0
+2
-2
deepchecks.egg-info/PKG-INFO
Metadata-Version: 2.1
Name: deepchecks
Version: 0.17.5
Version: 0.18.0
Summary: Package for validating your machine learning model and data

@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks

License: UNKNOWN
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.5/deepchecks-0.17.5.tar.gz
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.18.0/deepchecks-0.18.0.tar.gz
Project-URL: Documentation, https://docs.deepchecks.com

@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks

pandas<2.2.0,>=1.1.5
numpy>=1.19
scikit-learn>=0.23.2
scikit-learn<1.4.0,>=0.23.2
jsonpickle>=2

@@ -12,3 +11,2 @@ PyNomaly>=0.3.3

matplotlib>=3.3.4
pyzmq<24.0.0
beautifulsoup4>=4.11.1

@@ -22,3 +20,4 @@ requests>=2.22.0

[:python_version < "3.8"]
ipython<8,>=5.5.0
numpy>=1.19
ipython>=5.5.0
ipykernel>=4.10.1

@@ -35,12 +34,13 @@ ipywidgets<8,>=7.5.0

[:python_version >= "3.8"]
ipython<8,>=7.15.0
numpy>=1.22.2
ipython>=7.15.0
ipykernel>=5.3.0
ipywidgets<8,>=7.6.5
jupyter-server>=2.7.2
[nlp]
seqeval>=1.0.0
nltk<=3.6.7,>=3.4.0
textblob>=0.17.1
umap-learn
transformers>=4.0.0
transformers<4.37.0,>=4.0.0

@@ -50,2 +50,8 @@ [nlp-properties]

[nlp:python_version < "3.7"]
nltk<=3.6.7
[nlp:python_version >= "3.7"]
nltk>=3.8.1
[nlp:python_version >= "3.8"]

@@ -52,0 +58,0 @@ tiktoken

@@ -109,11 +109,32 @@ # ----------------------------------------------------------------------------

def select_results(self, idx: Set[int]) -> List[Union[
def select_results(self, idx: Set[int] = None, names: Set[str] = None) -> List[Union[
'check_types.CheckResult',
'check_types.CheckFailure'
]]:
"""Select results by indexes."""
output = []
for index, result in enumerate(self.results):
if index in idx:
output.append(result)
"""Select results either by indexes or result header names.
Parameters
----------
idx : Set[int], default None
The list of indexes to filter the check results from the results list. If
names is None, then this parameter is required.
names : Set[str], default None
The list of names denoting the header of the check results. If idx is None,
this parameter is required. idx and names cannot both be passed.
Returns
-------
List[Union['check_types.CheckResult', 'check_types.CheckFailure']] :
A list of check results filtered either by the indexes or by their names.
"""
if idx is None and names is None:
raise DeepchecksNotSupportedError('Either idx or names should be passed')
if idx and names:
raise DeepchecksNotSupportedError('Only one of idx or names should be passed')
if names:
names = [name.lower().replace('_', ' ').strip() for name in names]
output = [result for name in names for result in self.results if result.get_header().lower() == name]
else:
output = [result for index, result in enumerate(self.results) if index in idx]
return output

@@ -425,3 +446,3 @@

def get_not_passed_checks(self, fail_if_warning=True) -> List['check_types.CheckResult']:
"""Get all the check results that have not passing condition.
"""Get all the check results that have not passing condition. This does not include checks that failed to run.

@@ -448,3 +469,3 @@ Parameters

def get_passed_checks(self, fail_if_warning=True) -> List['check_types.CheckResult']:
"""Get all the check results that have passing condition.
"""Get all the check results that have passing condition. This does not include checks that failed to run.

@@ -451,0 +472,0 @@ Parameters

@@ -45,2 +45,3 @@ # ----------------------------------------------------------------------------

segment_minimum_size_ratio: float, n_samples: int, n_to_show: int,
max_categories_weak_segment: Optional[int],
categorical_aggregation_threshold: float, multiple_segments_per_feature: bool, **kwargs):

@@ -53,2 +54,3 @@ super().__init__(**kwargs)

self.segment_minimum_size_ratio = segment_minimum_size_ratio
self.max_categories_weak_segment = max_categories_weak_segment
self.n_samples = n_samples

@@ -251,2 +253,5 @@ self.n_to_show = n_to_show

size >= segment_minimum_size_ratio * data_size.
max_categories_weak_segment: Optional[int] , default: None
Maximum number of categories that can be included in a weak segment per categorical feature.
If None, the number of categories is not limited.
n_samples : int , default: 10_000

@@ -268,2 +273,3 @@ Maximum number of samples to use for this check.

segment_minimum_size_ratio: float = 0.05,
max_categories_weak_segment: Optional[int] = None,
n_samples: int = 10_000,

@@ -279,2 +285,3 @@ categorical_aggregation_threshold: float = 0.05,

segment_minimum_size_ratio=segment_minimum_size_ratio,
max_categories_weak_segment=max_categories_weak_segment,
n_samples=n_samples,

@@ -310,2 +317,5 @@ n_to_show=n_to_show,

size >= segment_minimum_size_ratio * data_size.
max_categories_weak_segment: Optional[int] , default: None
Maximum number of categories that can be included in a weak segment per categorical feature.
If None, the number of categories is not limited.
n_samples : int , default: 10_000

@@ -327,2 +337,3 @@ Maximum number of samples to use for this check.

segment_minimum_size_ratio: float = 0.05,
max_categories_weak_segment: Optional[int] = None,
n_samples: int = 10_000,

@@ -338,2 +349,3 @@ categorical_aggregation_threshold: float = 0.05,

segment_minimum_size_ratio=segment_minimum_size_ratio,
max_categories_weak_segment=max_categories_weak_segment,
n_samples=n_samples,

@@ -340,0 +352,0 @@ n_to_show=n_to_show,

@@ -38,3 +38,4 @@ # ----------------------------------------------------------------------------

ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: Optional[int],
segment_minimum_size_ratio: float, alternative_scorer: Dict[str, Union[str, Callable]],
segment_minimum_size_ratio: float, max_categories_weak_segment: Optional[int],
alternative_scorer: Dict[str, Union[str, Callable]],
score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int,

@@ -49,2 +50,3 @@ categorical_aggregation_threshold: float, n_to_show: int,

self.segment_minimum_size_ratio = segment_minimum_size_ratio
self.max_categories_weak_segment = max_categories_weak_segment
self.n_samples = n_samples

@@ -118,3 +120,3 @@ self.n_to_show = n_to_show

if len(weak_segments) == 0:
display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments.'\
display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments.' \
f'Try supplying additional {self.segment_by}.'

@@ -164,2 +166,5 @@ return CheckResult(value={'message': display_msg}, display=[display_msg])

size >= segment_minimum_size_ratio * data_size.
max_categories_weak_segment: Optional[int] , default: None
Maximum number of categories that can be included in a weak segment per categorical feature.
If None, the number of categories is not limited.
alternative_scorer : Dict[str, Union[str, Callable]] , default: None

@@ -189,2 +194,3 @@ Scorer to use as performance measure, either function or sklearn scorer name.

segment_minimum_size_ratio: float = 0.05,
max_categories_weak_segment: Optional[int] = None,
alternative_scorer: Dict[str, Union[str, Callable]] = None,

@@ -202,2 +208,3 @@ score_per_sample: Union[np.ndarray, pd.Series, None] = None,

segment_minimum_size_ratio=segment_minimum_size_ratio,
max_categories_weak_segment=max_categories_weak_segment,
n_samples=n_samples,

@@ -238,2 +245,5 @@ n_to_show=n_to_show,

size >= segment_minimum_size_ratio * data_size.
max_categories_weak_segment: Optional[int] , default: None
Maximum number of categories that can be included in a weak segment per categorical feature.
If None, the number of categories is not limited.
alternative_scorer : Dict[str, Union[str, Callable]] , default: None

@@ -263,2 +273,3 @@ Scorer to use as performance measure, either function or sklearn scorer name.

segment_minimum_size_ratio: float = 0.05,
max_categories_weak_segment: Optional[int] = None,
alternative_scorer: Dict[str, Union[str, Callable]] = None,

@@ -276,2 +287,3 @@ score_per_sample: Union[np.ndarray, pd.Series, None] = None,

segment_minimum_size_ratio=segment_minimum_size_ratio,
max_categories_weak_segment=max_categories_weak_segment,
n_samples=n_samples,

@@ -278,0 +290,0 @@ n_to_show=n_to_show,

@@ -13,2 +13,3 @@ # ----------------------------------------------------------------------------

import contextlib
import pathlib
import typing as t

@@ -389,5 +390,5 @@ import warnings

def set_metadata(
self,
metadata: pd.DataFrame,
categorical_metadata: t.Optional[t.Sequence[str]] = None
self,
metadata: pd.DataFrame,
categorical_metadata: t.Optional[t.Sequence[str]] = None
):

@@ -413,8 +414,12 @@ """Set the metadata of the dataset."""

def calculate_builtin_properties(
self,
include_properties: t.Optional[t.List[str]] = None,
ignore_properties: t.Optional[t.List[str]] = None,
include_long_calculation_properties: bool = False,
ignore_non_english_samples_for_english_properties: bool = True,
device: t.Optional[str] = None
self,
include_properties: t.Optional[t.List[str]] = None,
ignore_properties: t.Optional[t.List[str]] = None,
include_long_calculation_properties: bool = False,
ignore_non_english_samples_for_english_properties: bool = True,
device: t.Optional[str] = None,
models_storage: t.Union[pathlib.Path, str, None] = None,
batch_size: t.Optional[int] = 16,
cache_models: bool = False,
use_onnx_models: bool = True,
):

@@ -426,10 +431,25 @@ """Calculate the default properties of the dataset.

include_properties : List[str], default None
The properties to calculate. If None, all default properties will be calculated. Cannot be used together
with ignore_properties parameter.
The properties to calculate. If None, all default properties will be calculated. Cannot be used
together with ignore_properties parameter. Available properties are:
['Text Length', 'Average Word Length', 'Max Word Length',
'% Special Characters', '% Punctuation', 'Language',
'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count',
'Reading Ease', 'Average Words Per Sentence', 'URLs Count', 'Unique URLs Count', 'Email Address Count',
'Unique Email Address Count', 'Unique Syllables Count', 'Reading Time', 'Sentences Count',
'Average Syllable Length']
List of default properties are: ['Text Length', 'Average Word Length', 'Max Word Length',
'% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency',
'Formality', 'Lexical Density', 'Unique Noun Count', 'Reading Ease', 'Average Words Per Sentence']
To calculate all the default properties, the include_properties and ignore_properties parameters should
be None. If you pass either include_properties or ignore_properties then only the properties specified
in the list will be calculated or ignored.
Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may
take a long time to calculate. If include_long_calculation_properties is False, these properties will be
ignored, even if they are in the include_properties parameter.
ignore_properties : List[str], default None
The properties to ignore. If None, no properties will be ignored. Cannot be used together with
properties parameter.
The properties to ignore from the list of default properties. If None, no properties will be ignored and
all the default properties will be calculated. Cannot be used together with include_properties parameter.
include_long_calculation_properties : bool, default False
Whether to include properties that may take a long time to calculate. If False, these properties will be
ignored.
ignored, unless they are specified in the include_properties parameter explicitly.
ignore_non_english_samples_for_english_properties : bool, default True

@@ -441,4 +461,16 @@ Whether to ignore samples that are not in English when calculating English properties. If False, samples

only when you are sure that all the samples are in English.
device : int, default None
The device to use for the calculation. If None, the default device will be used.
device : Optional[str], default None
The device to use for the calculation. If None, the default device will be used. For onnx based models it is
recommended to set device to None for optimized performance.
models_storage : Union[str, pathlib.Path, None], default None
A directory to store the models.
If not provided, models will be stored in `DEEPCHECKS_LIB_PATH/nlp/.nlp-models`.
Also, if a folder already contains relevant resources they are not re-downloaded.
batch_size : int, default 16
The batch size.
cache_models : bool, default False
If True, will store the models in device RAM memory. This will speed up the calculation for future calls.
use_onnx_models : bool, default True
If True, will use onnx gpu optimized models for the calculation. Requires the optimum[onnxruntime-gpu]
library to be installed as well as the availability of GPU.
"""

@@ -454,3 +486,7 @@ if self._properties is not None:

ignore_non_english_samples_for_english_properties=ignore_non_english_samples_for_english_properties,
device=device
device=device,
models_storage=models_storage,
batch_size=batch_size,
cache_models=cache_models,
use_onnx_models=use_onnx_models,
)

@@ -462,5 +498,5 @@

def set_properties(
self,
properties: pd.DataFrame,
categorical_properties: t.Optional[t.Sequence[str]] = None
self,
properties: pd.DataFrame,
categorical_properties: t.Optional[t.Sequence[str]] = None
):

@@ -797,3 +833,3 @@ """Set the properties of the dataset."""

if len(prop_names) != len(properties_to_show):
raise DeepchecksValueError(f'{set(properties_to_show)-set(prop_names)} '
raise DeepchecksValueError(f'{set(properties_to_show) - set(prop_names)} '
'properties does not exist in the TextData object')

@@ -800,0 +836,0 @@ else:

@@ -12,2 +12,3 @@ # ----------------------------------------------------------------------------

"""Utils module for calculating embeddings for text."""
import re
import sys

@@ -26,3 +27,7 @@ import warnings

PATTERN_SPECIAL_CHARS = re.compile(r"[!@#$%^&*()_+{}|:\"<>?~`\-=\[\]\;',.\/]")
PATTERN_SPACE_CHARS = re.compile(r'\s')
PATTERN_BR_CHARS = re.compile(r'<br />')
def batched(iterable, n):

@@ -210,8 +215,19 @@ """Batch data into tuples of length n. The last batch may be shorter."""

def _clean_special_chars(text):
special_chars = r'!@#$%^&*()_+{}|:"<>?~`-=[]\;\',./'
for char in special_chars:
text = text.replace(char, '')
text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
text = text.replace('<br />', ' ')
def _clean_special_chars(text: str) -> str:
    """Remove special characters and normalize whitespace in a text sample.

    Parameters
    ----------
    text : str
        The input text that needs to be cleaned.

    Returns
    -------
    str
        Cleaned text string.
    """
    # Strip "<br />" tags first: PATTERN_SPECIAL_CHARS removes '<', '>' and '/',
    # so running it first would mangle the tag into 'br ' and PATTERN_BR_CHARS
    # would then never match.
    text = PATTERN_BR_CHARS.sub(' ', text)
    text = PATTERN_SPECIAL_CHARS.sub('', text)
    text = PATTERN_SPACE_CHARS.sub(' ', text)
    return text

@@ -18,7 +18,5 @@ # ----------------------------------------------------------------------------

from importlib import import_module
from importlib.util import find_spec
from typing import Optional, Union
import requests
import torch
from nltk import corpus

@@ -80,17 +78,24 @@ from transformers.utils import logging as transformers_logging

models_storage: Union[pathlib.Path, str, None] = None,
quantize_model: bool = True,
use_onnx_model: bool = False,
use_cache=False
):
"""Return a transformers' pipeline for the given model name."""
transformers = import_optional_property_dependency('transformers', property_name=property_name)
if use_onnx_model and 'onnx' not in model_name.lower():
raise ValueError("use_onnx_model=True, but model_name is not for a 'onnx' model")
if use_cache:
model, tokenizer = _get_transformer_model_and_tokenizer(property_name, model_name,
models_storage, quantize_model)
models_storage, use_onnx_model)
else:
# __wrapped__ is simply the function without decoration, in our case - without caching
model, tokenizer = _get_transformer_model_and_tokenizer.__wrapped__(property_name, model_name,
models_storage, quantize_model)
models_storage, use_onnx_model)
pipeline_kwargs = {'device_map': 'auto'} if find_spec('accelerate') is not None else {'device': device}
return transformers.pipeline('text-classification', model=model, tokenizer=tokenizer, **pipeline_kwargs)
if use_onnx_model:
onnx_pipe = import_optional_property_dependency('optimum.pipelines', property_name=property_name)
return onnx_pipe.pipeline('text-classification', model=model, tokenizer=tokenizer,
accelerator='ort', device=device)
else:
transformers = import_optional_property_dependency('transformers', property_name=property_name)
return transformers.pipeline('text-classification', model=model, tokenizer=tokenizer, device=device)

@@ -102,7 +107,5 @@

user_logger_level = logging.getLogger('transformers').getEffectiveLevel()
deepchecks_logger_level = logging.getLogger(__name__).getEffectiveLevel()
is_progress_bar_enabled = transformers_logging.is_progress_bar_enabled()
transformers_logging.set_verbosity_error()
logging.getLogger(__name__).setLevel(50)
transformers_logging.disable_progress_bar()

@@ -116,3 +119,2 @@ logging.getLogger('transformers').setLevel(50)

logging.getLogger('transformers').setLevel(user_logger_level)
logging.getLogger(__name__).setLevel(deepchecks_logger_level)
if is_progress_bar_enabled:

@@ -122,19 +124,2 @@ transformers_logging.enable_progress_bar()

def get_transformer_loader_params(model_name: str,
                                  models_storage: Union[pathlib.Path, str, None] = None,
                                  quantize_model: bool = True, ):
    """Return the params for transformers' model and tokenizer auto classification loaders."""
    storage_root = get_create_model_storage(models_storage=models_storage)
    # Both loaders get an explicit (disabled) device map by default.
    model_kwargs = {'device_map': None}
    tokenizer_kwargs = {'device_map': None}
    if not quantize_model:
        model_path = storage_root / model_name
    else:
        # Quantized weights live in a dedicated sub-directory and are loaded
        # in 8-bit with fp16 tensors.
        model_path = storage_root / 'quantized' / model_name
        model_kwargs.update(load_in_8bit=True, torch_dtype=torch.float16)
    return model_path, model_kwargs, tokenizer_kwargs
@lru_cache(maxsize=5)

@@ -145,3 +130,3 @@ def _get_transformer_model_and_tokenizer(

models_storage: Union[pathlib.Path, str, None] = None,
quantize_model: bool = True,
use_onnx_model: bool = True,
):

@@ -152,17 +137,29 @@ """Return a transformers' model and tokenizer in cpu memory."""

with _log_suppressor():
model_path, model_kwargs, tokenizer_kwargs = get_transformer_loader_params(model_name, models_storage,
quantize_model)
models_storage = get_create_model_storage(models_storage=models_storage)
model_path = models_storage / model_name
model_path_exists = model_path.exists()
if model_path.exists():
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_path, **model_kwargs)
if use_onnx_model:
onnx_runtime = import_optional_property_dependency('optimum.onnxruntime', property_name=property_name)
classifier_cls = onnx_runtime.ORTModelForSequenceClassification
if model_path_exists:
model = classifier_cls.from_pretrained(model_path, provider='CUDAExecutionProvider')
else:
model = classifier_cls.from_pretrained(model_name, provider='CUDAExecutionProvider')
model.save_pretrained(model_path)
else:
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name, **model_kwargs)
model.save_pretrained(model_path)
if model_path_exists:
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_path)
else:
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name)
model.save_pretrained(model_path)
model.eval()
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)
if model_path_exists:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
else:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(model_path)
model.eval()
return model, tokenizer
return model, tokenizer

@@ -169,0 +166,0 @@

@@ -35,3 +35,2 @@ # ----------------------------------------------------------------------------

from deepchecks.utils.function import run_available_kwargs
from deepchecks.utils.gpu_utils import empty_gpu
from deepchecks.utils.strings import SPECIAL_CHARACTERS, format_list

@@ -157,2 +156,5 @@

language_code = prediction[0].replace('__label__', '') if prediction else None
if language_code == 'eng': # both are english but different labels
return 'en'
return language_code

@@ -251,4 +253,7 @@

TOXICITY_MODEL_NAME = 'SkolkovoInstitute/roberta_toxicity_classifier'
TOXICITY_MODEL_NAME_ONNX = 'Deepchecks/roberta_toxicity_classifier_onnx'
FLUENCY_MODEL_NAME = 'prithivida/parrot_fluency_model'
FLUENCY_MODEL_NAME_ONNX = 'Deepchecks/parrot_fluency_model_onnx'
FORMALITY_MODEL_NAME = 's-nlp/roberta-base-formality-ranker'
FORMALITY_MODEL_NAME_ONNX = 'Deepchecks/roberta_base_formality_ranker_onnx'

@@ -260,2 +265,3 @@

models_storage: Union[pathlib.Path, str, None] = None,
use_onnx_models: bool = True,
toxicity_classifier: Optional[object] = None

@@ -265,4 +271,7 @@ ) -> Sequence[float]:

if toxicity_classifier is None:
use_onnx_models = _validate_onnx_model_availability(use_onnx_models, device)
model_name = TOXICITY_MODEL_NAME_ONNX if use_onnx_models else TOXICITY_MODEL_NAME
toxicity_classifier = get_transformer_pipeline(
property_name='toxicity', model_name=TOXICITY_MODEL_NAME, models_storage=models_storage, device=device)
property_name='toxicity', model_name=model_name, device=device,
models_storage=models_storage, use_onnx_model=use_onnx_models)

@@ -293,2 +302,3 @@ class UnitModel:

models_storage: Union[pathlib.Path, str, None] = None,
use_onnx_models: bool = True,
fluency_classifier: Optional[object] = None

@@ -298,4 +308,7 @@ ) -> Sequence[float]:

if fluency_classifier is None:
use_onnx_models = _validate_onnx_model_availability(use_onnx_models, device)
model_name = FLUENCY_MODEL_NAME_ONNX if use_onnx_models else FLUENCY_MODEL_NAME
fluency_classifier = get_transformer_pipeline(
property_name='fluency', model_name=FLUENCY_MODEL_NAME, models_storage=models_storage, device=device)
property_name='fluency', model_name=model_name, device=device,
models_storage=models_storage, use_onnx_model=use_onnx_models)

@@ -312,2 +325,3 @@ def output_formatter(v):

models_storage: Union[pathlib.Path, str, None] = None,
use_onnx_models: bool = True,
formality_classifier: Optional[object] = None

@@ -317,4 +331,7 @@ ) -> Sequence[float]:

if formality_classifier is None:
use_onnx_models = _validate_onnx_model_availability(use_onnx_models, device)
model_name = FORMALITY_MODEL_NAME_ONNX if use_onnx_models else FORMALITY_MODEL_NAME
formality_classifier = get_transformer_pipeline(
property_name='formality', model_name=FORMALITY_MODEL_NAME, models_storage=models_storage, device=device)
property_name='formality', model_name=model_name, device=device,
models_storage=models_storage, use_onnx_model=use_onnx_models)

@@ -343,3 +360,3 @@ def output_formatter(v):

total_unique_words = len(set(all_words))
return round(total_unique_words * 100 / len(all_words), 2)
return round(total_unique_words / len(all_words), 2)

@@ -583,3 +600,3 @@

'100 (very easy). Based on Flesch reading-ease score',
'Lexical Density': 'Percentage of unique words in the text',
'Lexical Density': 'Ratio of unique words in the text',
'Toxicity': 'A measure of how harmful or offensive a text sample is (0 to 1), '

@@ -605,7 +622,5 @@ 'uses the SkolkovoInstitute/roberta_toxicity_classifier model',

def _select_properties(
n_of_samples: int,
include_properties: Optional[List[str]] = None,
ignore_properties: Optional[List[str]] = None,
include_long_calculation_properties: bool = False,
device: Optional[str] = None,
) -> Sequence[TextProperty]:

@@ -651,17 +666,4 @@ """Select properties to calculate based on provided parameters."""

]
else:
heavy_properties = [
prop['name'] for prop in properties
if prop['name'] in LONG_RUN_PROPERTIES
]
if heavy_properties and n_of_samples > LARGE_SAMPLE_SIZE:
warning_message = (
f'Calculating the properties {heavy_properties} on a large dataset may take a long time. '
'Consider using a smaller sample size or running this code on better hardware.'
)
if device is None or device == 'cpu':
warning_message += ' Consider using a GPU or a similar device to run these properties.'
warnings.warn(warning_message, UserWarning)
return properties
return properties

@@ -679,3 +681,3 @@

cache_models: bool = False,
quantize_models: bool = True,
use_onnx_models: bool = True,
) -> Tuple[Dict[str, List[float]], Dict[str, str]]:

@@ -717,4 +719,5 @@ """Calculate properties on provided text samples.

only when you are sure that all the samples are in English.
device : int, default None
The device to use for the calculation. If None, the default device will be used.
device : Optional[str], default None
The device to use for the calculation. If None, the default device will be used. For onnx based models it is
recommended to set device to None for optimized performance.
models_storage : Union[str, pathlib.Path, None], default None

@@ -727,8 +730,6 @@ A directory to store the models.

cache_models : bool, default False
If True, will store the models in CPU RAM memory. This will speed up the calculation, but will take up
more memory. If device is not CPU, the models will be moved from CPU RAM memory to relevant device before
calculation.
quantize_models : bool, default True
If True, will quantize the models to reduce their size and speed up the calculation. Requires the
accelerate and bitsandbytes libraries to be installed as well as the availability of GPU.
If True, will store the models in device RAM memory. This will speed up the calculation for future calls.
use_onnx_models : bool, default True
If True, will use onnx gpu optimized models for the calculation. Requires the optimum[onnxruntime-gpu] library
to be installed as well as the availability of GPU.

@@ -742,18 +743,9 @@ Returns

"""
if quantize_models:
if find_spec('accelerate') is None or find_spec('bitsandbytes') is None:
warnings.warn('Quantization requires the accelerate and bitsandbytes libraries to be installed. '
'Calculating without quantization.')
quantize_models = False
if not torch.cuda.is_available():
warnings.warn('GPU is required for the quantization process. Calculating without quantization.')
quantize_models = False
use_onnx_models = _validate_onnx_model_availability(use_onnx_models, device)
text_properties = _select_properties(
include_properties=include_properties,
ignore_properties=ignore_properties,
device=device,
include_long_calculation_properties=include_long_calculation_properties,
n_of_samples=len(raw_text)
include_long_calculation_properties=include_long_calculation_properties
)
properties_types = {

@@ -763,2 +755,3 @@ it['name']: it['output_type']

}
_warn_long_compute(device, properties_types, len(raw_text), use_onnx_models)

@@ -779,16 +772,19 @@ kwargs = dict(device=device, models_storage=models_storage)

if 'Toxicity' in properties_types:
if 'Toxicity' in properties_types and 'toxicity_classifier' not in kwargs:
model_name = TOXICITY_MODEL_NAME_ONNX if use_onnx_models else TOXICITY_MODEL_NAME
kwargs['toxicity_classifier'] = get_transformer_pipeline(
property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device,
models_storage=models_storage, use_cache=cache_models, quantize_model=quantize_models)
property_name='toxicity', model_name=model_name, device=device,
models_storage=models_storage, use_cache=cache_models, use_onnx_model=use_onnx_models)
if 'Formality' in properties_types and 'formality_classifier' not in kwargs:
model_name = FORMALITY_MODEL_NAME_ONNX if use_onnx_models else FORMALITY_MODEL_NAME
kwargs['formality_classifier'] = get_transformer_pipeline(
property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device,
models_storage=models_storage, use_cache=cache_models, quantize_model=quantize_models)
property_name='formality', model_name=model_name, device=device,
models_storage=models_storage, use_cache=cache_models, use_onnx_model=use_onnx_models)
if 'Fluency' in properties_types and 'fluency_classifier' not in kwargs:
model_name = FLUENCY_MODEL_NAME_ONNX if use_onnx_models else FLUENCY_MODEL_NAME
kwargs['fluency_classifier'] = get_transformer_pipeline(
property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device,
models_storage=models_storage, use_cache=cache_models, quantize_model=quantize_models)
property_name='fluency', model_name=model_name, device=device,
models_storage=models_storage, use_cache=cache_models, use_onnx_model=use_onnx_models)

@@ -861,3 +857,2 @@ # Remove language property from the list of properties to calculate as it will be calculated separately:

sentences_cache.clear()
empty_gpu(device)

@@ -873,14 +868,33 @@ if not calculated_properties:

if cache_models:
# Move the transformers models to CPU RAM memory
for model_name in ['toxicity_classifier', 'formality_classifier', 'fluency_classifier']:
if model_name in kwargs:
kwargs[model_name].model.to('cpu')
return calculated_properties, properties_types
# Clean all remaining RAM:
empty_gpu(device)
return calculated_properties, properties_types
def _warn_long_compute(device, properties_types, n_samples, use_onnx_models):
    """Warn when long-running properties are requested for a large dataset."""
    slow_props = [name for name in properties_types if name in LONG_RUN_PROPERTIES]
    if not slow_props or n_samples <= LARGE_SAMPLE_SIZE:
        return
    message = (
        f'Calculating the properties {slow_props} on a large dataset may take a long time. '
        'Consider using a smaller sample size or running this code on better hardware.'
    )
    # Only suggest a GPU when the computation will actually run on the CPU.
    runs_on_cpu = device == 'cpu' or (device is None and not use_onnx_models)
    if runs_on_cpu:
        message += ' Consider using a GPU or a similar device to run these properties.'
    warnings.warn(message, UserWarning)
def _validate_onnx_model_availability(use_onnx_models: bool, device: Optional[str]):
if not use_onnx_models:
return False
if find_spec('optimum') is None or find_spec('onnxruntime') is None:
warnings.warn('Onnx models require the optimum[onnxruntime-gpu] library to be installed. '
'Calculating using the default models.')
return False
if not torch.cuda.is_available():
warnings.warn('GPU is required for the onnx models. Calculating using the default models.')
return False
if device is not None and device.lower() == 'cpu':
warnings.warn('Onnx models are not supported on device CPU. Calculating using the default models.')
return False
return True
def get_builtin_properties_types():

@@ -887,0 +901,0 @@ """

@@ -57,2 +57,5 @@ # ----------------------------------------------------------------------------

size >= segment_minimum_size_ratio * data_size.
max_categories_weak_segment: Optional[int] , default: None
Maximum number of categories that can be included in a weak segment per categorical feature.
If None, the number of categories is not limited.
alternative_scorer : Dict[str, Union[str, Callable]] , default: None

@@ -88,2 +91,3 @@ Scorer to use as performance measure, either function or sklearn scorer name.

segment_minimum_size_ratio: float = 0.05,
max_categories_weak_segment: Optional[int] = None,
alternative_scorer: Dict[str, Union[str, Callable]] = None,

@@ -110,2 +114,3 @@ loss_per_sample: Union[np.ndarray, pd.Series, None] = None,

self.segment_minimum_size_ratio = segment_minimum_size_ratio
self.max_categories_weak_segment = max_categories_weak_segment
self.n_samples = n_samples

@@ -112,0 +117,0 @@ self.n_to_show = n_to_show

@@ -260,3 +260,6 @@ # ----------------------------------------------------------------------------

self.is_binary = self.model_classes and len(self.model_classes) == 2
self.predictions = pd.Series(self.user_model.predict(data).squeeze(), index=data.index)
predictions = self.user_model.predict(data)
if len(data) > 1:
predictions = predictions.squeeze()
self.predictions = pd.Series(predictions, index=data.index)

@@ -263,0 +266,0 @@ def predict(self, data: pd.DataFrame) -> np.ndarray:

@@ -43,2 +43,3 @@ # ----------------------------------------------------------------------------

segment_minimum_size_ratio: float = 0.05
max_categories_weak_segment: Optional[int] = None
random_state: int = 42

@@ -199,2 +200,5 @@ add_condition: Callable[..., Any]

feature1, feature2 = feature_rank_for_search[[i, j]]
# Categorical feature come first
if feature1 not in self.encoder_mapping and feature2 in self.encoder_mapping:
feature2, feature1 = feature_rank_for_search[[i, j]]
weak_segment_score, weak_segment_filter = self._find_weak_segment(data, [feature1, feature2],

@@ -218,21 +222,66 @@ score_per_sample, label_col,

# Sort and drop relevant columns
weak_segments = weak_segments.sort_values(score_title).reset_index(drop=True)
if multiple_segments_per_feature:
result = weak_segments.drop(columns='Samples in Segment').drop_duplicates()
result['Samples in Segment'] = weak_segments.loc[result.index, 'Samples in Segment']
else:
used_features = set()
result = pd.DataFrame(columns=weak_segments.columns)
for _, row in weak_segments.iterrows():
if row['Feature1'] in used_features or row['Feature2'] in used_features:
continue
# Filter and adapt the weak segments results
result = pd.DataFrame(columns=weak_segments.columns)
used_features = set()
for _, row in weak_segments.sort_values(score_title).iterrows():
new_row = row.copy()
if not multiple_segments_per_feature and \
(row['Feature1'] in used_features or row['Feature2'] in used_features):
continue
result.loc[len(result)] = row
used_features.add(row['Feature1'])
if row['Feature2'] != '':
used_features.add(row['Feature2'])
# Make sure segments based on categorical features are based only on a single category
if self.max_categories_weak_segment is not None and row['Feature1'] in self.encoder_mapping:
unique_values_in_range = [x for x in self.encoder_mapping[row['Feature1']]['encoded_value'].values if
row['Feature1 Range'][1] > x > row['Feature1 Range'][0]]
if len(unique_values_in_range) > self.max_categories_weak_segment:
subset = data.loc[new_row['Samples in Segment']]
value_segment_size = [len(subset[subset[row['Feature1']] == x]) for x in unique_values_in_range]
# If all sub segments are too small, remove feature 2 filter
if max(value_segment_size) < len(data) * self.segment_minimum_size_ratio and row['Feature2'] != '':
subset = data
value_segment_size = [len(data[data[row['Feature1']] == x]) for x in unique_values_in_range]
new_row['Feature2'] = ''
new_row['Feature2 Range'] = None
if max(value_segment_size) < len(data) * self.segment_minimum_size_ratio:
continue
return result
value_to_use = unique_values_in_range[np.argmax(value_segment_size)]
subset = subset[subset[row['Feature1']] == value_to_use]
new_row['Samples in Segment'] = list(subset.index)
new_row['% of Data'] = round(100 * len(new_row['Samples in Segment']) / len(data), 2)
new_row['Feature1 Range'] = [value_to_use - 0.5, value_to_use + 0.5]
if dummy_model is not None and label_col is not None and scorer is not None:
new_row[score_title] = scorer.run_on_data_and_label(dummy_model,
subset, label_col[list(subset.index)])
else:
new_row[score_title] = score_per_sample[list(subset.index)].mean()
if self.max_categories_weak_segment is not None and \
new_row['Feature2'] != '' and row['Feature2'] in self.encoder_mapping:
unique_values_in_range = [x for x in self.encoder_mapping[row['Feature2']]['encoded_value'].values if
row['Feature2 Range'][1] > x > row['Feature2 Range'][0]]
if len(unique_values_in_range) > self.max_categories_weak_segment:
subset = data.loc[new_row['Samples in Segment']]
value_segment_size = [len(subset[subset[row['Feature2']] == x]) for x in unique_values_in_range]
# Feature 1 cannot be empty so if feature 2 do not have a large enough segment, ignore the row
if max(value_segment_size) < len(data) * self.segment_minimum_size_ratio:
continue
value_to_use = unique_values_in_range[np.argmax(value_segment_size)]
subset = subset[subset[row['Feature2']] == value_to_use]
new_row['Samples in Segment'] = list(subset.index)
new_row['% of Data'] = round(100 * len(new_row['Samples in Segment']) / len(data), 2)
new_row['Feature2 Range'] = [value_to_use - 0.5, value_to_use + 0.5]
if dummy_model is not None and label_col is not None and scorer is not None:
new_row[score_title] = scorer.run_on_data_and_label(dummy_model,
subset, label_col[list(subset.index)])
else:
new_row[score_title] = score_per_sample[list(subset.index)].mean()
result.loc[len(result)] = new_row
used_features.add(new_row['Feature1'])
if new_row['Feature2'] != '':
used_features.add(new_row['Feature2'])
return result.sort_values(score_title).drop_duplicates(subset=['Feature1', 'Feature2'])
def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series,

@@ -252,2 +301,11 @@ label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None,

"""
# Remove rows with na values in the relevant columns
data_for_search = data[features_for_segment].dropna()
if len(data_for_search) == 0:
return None, None
segment_minimum_size_ratio = self.segment_minimum_size_ratio * len(data) / len(data_for_search)
score_per_sample_for_search = score_per_sample.loc[data_for_search.index]
if label_col is not None:
label_col_for_search = label_col.loc[data_for_search.index]
if version.parse(sklearn.__version__) < version.parse('1.0.0'):

@@ -259,3 +317,3 @@ criterion = ['mse', 'mae']

'max_depth': [5],
'min_weight_fraction_leaf': [self.segment_minimum_size_ratio],
'min_weight_fraction_leaf': [segment_minimum_size_ratio],
'min_samples_leaf': [5],

@@ -272,6 +330,6 @@ 'criterion': criterion,

if scorer is not None and dummy_model is not None and label_col is not None:
leaf_data, leaf_labels = leaf_filter.filter(data, label_col)
leaf_data, leaf_labels = leaf_filter.filter(data_for_search, label_col_for_search)
leaf_score = scorer.run_on_data_and_label(dummy_model, leaf_data, leaf_labels)
else: # if no scorer is provided, use the average loss_per_sample of samples in the leaf as the score
leaf_score = score_per_sample[list(leaf_filter.filter(data).index)].mean()
leaf_score = score_per_sample_for_search[list(leaf_filter.filter(data_for_search).index)].mean()

@@ -294,3 +352,3 @@ if leaf_score < min_score:

try:
grid_searcher.fit(data[features_for_segment], score_per_sample)
grid_searcher.fit(data_for_search, score_per_sample_for_search)
# Get the worst leaf filter out of the selected tree

@@ -297,0 +355,0 @@ segment_score, segment_filter = get_worst_leaf_filter(grid_searcher.best_estimator_.tree_)

@@ -46,10 +46,10 @@ # ----------------------------------------------------------------------------

"""Fill NaN values based on column type if possible otherwise returns None."""
if is_cat_column:
if is_cat_column and 'None' not in col.astype('object').dropna().unique():
return col.astype('object').fillna('None')
elif is_numeric_dtype(col):
return col.astype('float64').fillna(col.mean())
else:
common_values_list = col.mode()
if isinstance(common_values_list, pd.Series) and len(common_values_list) > 0:
return col.fillna(common_values_list[0])
return col.astype('float64').fillna(np.nan)
common_values_list = col.mode()
if isinstance(common_values_list, pd.Series) and len(common_values_list) > 0:
return col.fillna(common_values_list[0])
return None

@@ -56,0 +56,0 @@

@@ -63,2 +63,5 @@ # ----------------------------------------------------------------------------

size >= segment_minimum_size_ratio * data_size.
max_categories_weak_segment: Optional[int] , default: None
Maximum number of categories that can be included in a weak segment per categorical feature.
If None, the number of categories is not limited.
n_samples : Optional[int] , default: 10_000

@@ -81,2 +84,3 @@ number of samples to use for this check.

segment_minimum_size_ratio: float = 0.05,
max_categories_weak_segment: Optional[int] = None,
n_samples: Optional[int] = 10000,

@@ -94,2 +98,3 @@ categorical_aggregation_threshold: float = 0.05,

self.segment_minimum_size_ratio = segment_minimum_size_ratio
self.max_categories_weak_segment = max_categories_weak_segment
self.categorical_aggregation_threshold = categorical_aggregation_threshold

@@ -96,0 +101,0 @@ self.multiple_segments_per_property = multiple_segments_per_property

@@ -230,11 +230,11 @@ # ----------------------------------------------------------------------------

if show_only == 'largest':
tests_metrics_df = tests_metrics_df.sort_values(by='Number of samples', ascending=False)
tests_metrics_df = tests_metrics_df.sort_values(by=['Number of samples', 'Value', 'Class'], ascending=False)
elif show_only == 'smallest':
tests_metrics_df = tests_metrics_df.sort_values(by='Number of samples', ascending=True)
tests_metrics_df = tests_metrics_df.sort_values(by=['Number of samples', 'Value', 'Class'], ascending=True)
elif show_only == 'random':
tests_metrics_df = tests_metrics_df.sample(frac=1)
elif show_only == 'best':
tests_metrics_df = tests_metrics_df.sort_values(by='Value', ascending=False)
tests_metrics_df = tests_metrics_df.sort_values(by=['Value', 'Number of samples'], ascending=False)
elif show_only == 'worst':
tests_metrics_df = tests_metrics_df.sort_values(by='Value', ascending=True)
tests_metrics_df = tests_metrics_df.sort_values(by=['Value', 'Number of samples'], ascending=True)
else:

@@ -241,0 +241,0 @@ raise ValueError(f'Unknown show_only value: {show_only}')

Metadata-Version: 2.1
Name: deepchecks
Version: 0.17.5
Version: 0.18.0
Summary: Package for validating your machine learning model and data

@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks

License: UNKNOWN
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.5/deepchecks-0.17.5.tar.gz
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.18.0/deepchecks-0.18.0.tar.gz
Project-URL: Documentation, https://docs.deepchecks.com

@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks

@@ -22,3 +22,3 @@ <!--

<!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->
[![All Contributors](https://img.shields.io/badge/all_contributors-42-orange.svg?style=flat-round)](#https://github.com/deepchecks/deepchecks/blob/main/CONTRIBUTING.rst)
[![All Contributors](https://img.shields.io/badge/all_contributors-43-orange.svg?style=flat-round)](#https://github.com/deepchecks/deepchecks/blob/main/CONTRIBUTING.rst)
<!-- ALL-CONTRIBUTORS-BADGE:END -->

@@ -116,3 +116,3 @@

This will automatically download the necessary dependencies, run the installation prcoess
This will automatically download the necessary dependencies, run the installation process
and then start the application locally.

@@ -223,3 +223,3 @@

distribution (e.g. detect drifts or leakages), data integrity (e.g. find conflicting labels) and more.
These checks results can be run manually (e.g. during research) or trigerred automatically (e.g. during CI
These checks results can be run manually (e.g. during research) or triggered automatically (e.g. during CI
and production monitoring) and enable automatically making informed decisions regarding your model pipelines'

@@ -239,3 +239,3 @@ production-readiness, and behavior over time.

the Deepchecks Testing Python package (this repo).
- Each check tests for a specific potential problem. Deepchecks has many preimplemented checks for
- Each check tests for a specific potential problem. Deepchecks has many pre-implemented checks for
finding issues with the model's performance (e.g. identifying weak segments),

@@ -260,3 +260,3 @@ data distribution (e.g. detect drifts or leakages) and data integrity (e.g. find conflicting labels).

The check's results can be examined with visual reports (by saving them to an HTML file, or seeing them in Jupyter),
processed with code (using their json output), and inspected and colloaborated upon with a dynamic UI
processed with code (using their json output), and inspected and collaborated upon with a dynamic UI
(for examining test results and for production monitoring).

@@ -371,2 +371,5 @@ Optional conditions can be added to each check, to automatically validate whether it passed or not.

</tr>
<tr>
<td align="center" valign="top" width="14.28%"><a href="https://www.kaggle.com/rayanaay"><img src="https://avatars.githubusercontent.com/u/55285736?v=4?s=100" width="100px;" alt="AIT ALI YAHIA Rayane"/><br /><sub><b>AIT ALI YAHIA Rayane</b></sub></a><br /><a href="#code-RayanAAY-ops" title="Code">💻</a> <a href="#ideas-RayanAAY-ops" title="Ideas, Planning, & Feedback">🤔</a></td>
</tr>
</tbody>

@@ -373,0 +376,0 @@ </table>

@@ -11,3 +11,3 @@ twine

pytest
pytest<8.0.0
pytest-cov

@@ -38,3 +38,4 @@ pyhamcrest

opencv-python>=4.1.2
Pillow>=7.1.2
pillow>=7.1.2; python_version < '3.8'
pillow>=10.0.1; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability
PyYAML>=5.3.1

@@ -51,6 +52,14 @@ requests>=2.23.0

# NLP
nltk<=3.6.7
nltk<=3.6.7; python_version < '3.7'
nltk>=3.8.1; python_version >= '3.7' # not directly required, pinned by Snyk to avoid a vulnerability
datasets
textblob
transformers
sentence-transformers
sentence-transformers
# Avoid Vulnerabilities
tornado>=6.3.3; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability
pyarrow>=14.0.1; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability
aiohttp>=3.9.0; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability
umap-learn==0.5.4
seqeval>=1.0.0
nltk>=3.4.0,<=3.6.7
nltk<=3.6.7; python_version < '3.7'
nltk>=3.8.1; python_version >= '3.7' # not directly required, pinned by Snyk to avoid a vulnerability
textblob>=0.17.1
umap-learn
transformers>=4.0.0
transformers>=4.0.0,<4.37.0
tiktoken; python_version >= '3.8'
pandas>=1.1.5,<2.2.0
numpy>=1.19
scikit-learn>=0.23.2
numpy>=1.19; python_version < '3.8'
numpy>=1.22.2; python_version >= '3.8'
scikit-learn>=0.23.2, <1.4.0
jsonpickle>=2

@@ -8,3 +9,3 @@ PyNomaly>=0.3.3

# require for python 3.8+
ipython>=7.15.0,<8; python_version >= '3.8'
ipython>=7.15.0; python_version >= '3.8'
ipykernel>=5.3.0; python_version >= '3.8'

@@ -14,3 +15,3 @@ ipywidgets>=7.6.5,<8; python_version >= '3.8'

# google colab requirements (python 3.7)
ipython>=5.5.0,<8; python_version < '3.8'
ipython>=5.5.0; python_version < '3.8'
ipykernel>=4.10.1; python_version < '3.8'

@@ -36,4 +37,4 @@ ipywidgets>=7.5.0,<8; python_version < '3.8'

matplotlib>=3.3.4
pyzmq<24.0.0
beautifulsoup4>=4.11.1
requests>=2.22.0
jupyter-server>=2.7.2; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability

@@ -1,1 +0,1 @@

0.17.5
0.18.0