deepchecks
Advanced tools
| Metadata-Version: 2.1 | ||
| Name: deepchecks | ||
| Version: 0.17.5 | ||
| Version: 0.18.0 | ||
| Summary: Package for validating your machine learning model and data | ||
@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks | ||
| License: UNKNOWN | ||
| Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.5/deepchecks-0.17.5.tar.gz | ||
| Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.18.0/deepchecks-0.18.0.tar.gz | ||
| Project-URL: Documentation, https://docs.deepchecks.com | ||
@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks |
| pandas<2.2.0,>=1.1.5 | ||
| numpy>=1.19 | ||
| scikit-learn>=0.23.2 | ||
| scikit-learn<1.4.0,>=0.23.2 | ||
| jsonpickle>=2 | ||
@@ -12,3 +11,2 @@ PyNomaly>=0.3.3 | ||
| matplotlib>=3.3.4 | ||
| pyzmq<24.0.0 | ||
| beautifulsoup4>=4.11.1 | ||
@@ -22,3 +20,4 @@ requests>=2.22.0 | ||
| [:python_version < "3.8"] | ||
| ipython<8,>=5.5.0 | ||
| numpy>=1.19 | ||
| ipython>=5.5.0 | ||
| ipykernel>=4.10.1 | ||
@@ -35,12 +34,13 @@ ipywidgets<8,>=7.5.0 | ||
| [:python_version >= "3.8"] | ||
| ipython<8,>=7.15.0 | ||
| numpy>=1.22.2 | ||
| ipython>=7.15.0 | ||
| ipykernel>=5.3.0 | ||
| ipywidgets<8,>=7.6.5 | ||
| jupyter-server>=2.7.2 | ||
| [nlp] | ||
| seqeval>=1.0.0 | ||
| nltk<=3.6.7,>=3.4.0 | ||
| textblob>=0.17.1 | ||
| umap-learn | ||
| transformers>=4.0.0 | ||
| transformers<4.37.0,>=4.0.0 | ||
@@ -50,2 +50,8 @@ [nlp-properties] | ||
| [nlp:python_version < "3.7"] | ||
| nltk<=3.6.7 | ||
| [nlp:python_version >= "3.7"] | ||
| nltk>=3.8.1 | ||
| [nlp:python_version >= "3.8"] | ||
@@ -52,0 +58,0 @@ tiktoken |
@@ -109,11 +109,32 @@ # ---------------------------------------------------------------------------- | ||
| def select_results(self, idx: Set[int]) -> List[Union[ | ||
| def select_results(self, idx: Set[int] = None, names: Set[str] = None) -> List[Union[ | ||
| 'check_types.CheckResult', | ||
| 'check_types.CheckFailure' | ||
| ]]: | ||
| """Select results by indexes.""" | ||
| output = [] | ||
| for index, result in enumerate(self.results): | ||
| if index in idx: | ||
| output.append(result) | ||
| """Select results either by indexes or result header names. | ||
| Parameters | ||
| ---------- | ||
| idx : Set[int], default None | ||
| The list of indexes to filter the check results from the results list. If | ||
| names is None, then this parameter is required. | ||
| names : Set[str], default None | ||
| The list of names denoting the header of the check results. If idx is None, | ||
| this parameter is required. idx and names cannot both be passed. | ||
| Returns | ||
| ------- | ||
| List[Union['check_types.CheckResult', 'check_types.CheckFailure']] : | ||
| A list of check results filtered either by the indexes or by their names. | ||
| """ | ||
| if idx is None and names is None: | ||
| raise DeepchecksNotSupportedError('Either idx or names should be passed') | ||
| if idx and names: | ||
| raise DeepchecksNotSupportedError('Only one of idx or names should be passed') | ||
| if names: | ||
| names = [name.lower().replace('_', ' ').strip() for name in names] | ||
| output = [result for name in names for result in self.results if result.get_header().lower() == name] | ||
| else: | ||
| output = [result for index, result in enumerate(self.results) if index in idx] | ||
| return output | ||
@@ -425,3 +446,3 @@ | ||
| def get_not_passed_checks(self, fail_if_warning=True) -> List['check_types.CheckResult']: | ||
| """Get all the check results that have not passing condition. | ||
| """Get all the check results that have a non-passing condition. This does not include checks that failed to run. | ||
@@ -448,3 +469,3 @@ Parameters | ||
| def get_passed_checks(self, fail_if_warning=True) -> List['check_types.CheckResult']: | ||
| """Get all the check results that have passing condition. | ||
| """Get all the check results that have passing condition. This does not include checks that failed to run. | ||
@@ -451,0 +472,0 @@ Parameters |
@@ -45,2 +45,3 @@ # ---------------------------------------------------------------------------- | ||
| segment_minimum_size_ratio: float, n_samples: int, n_to_show: int, | ||
| max_categories_weak_segment: Optional[int], | ||
| categorical_aggregation_threshold: float, multiple_segments_per_feature: bool, **kwargs): | ||
@@ -53,2 +54,3 @@ super().__init__(**kwargs) | ||
| self.segment_minimum_size_ratio = segment_minimum_size_ratio | ||
| self.max_categories_weak_segment = max_categories_weak_segment | ||
| self.n_samples = n_samples | ||
@@ -251,2 +253,5 @@ self.n_to_show = n_to_show | ||
| size >= segment_minimum_size_ratio * data_size. | ||
| max_categories_weak_segment: Optional[int] , default: None | ||
| Maximum number of categories that can be included in a weak segment per categorical feature. | ||
| If None, the number of categories is not limited. | ||
| n_samples : int , default: 10_000 | ||
@@ -268,2 +273,3 @@ Maximum number of samples to use for this check. | ||
| segment_minimum_size_ratio: float = 0.05, | ||
| max_categories_weak_segment: Optional[int] = None, | ||
| n_samples: int = 10_000, | ||
@@ -279,2 +285,3 @@ categorical_aggregation_threshold: float = 0.05, | ||
| segment_minimum_size_ratio=segment_minimum_size_ratio, | ||
| max_categories_weak_segment=max_categories_weak_segment, | ||
| n_samples=n_samples, | ||
@@ -310,2 +317,5 @@ n_to_show=n_to_show, | ||
| size >= segment_minimum_size_ratio * data_size. | ||
| max_categories_weak_segment: Optional[int] , default: None | ||
| Maximum number of categories that can be included in a weak segment per categorical feature. | ||
| If None, the number of categories is not limited. | ||
| n_samples : int , default: 10_000 | ||
@@ -327,2 +337,3 @@ Maximum number of samples to use for this check. | ||
| segment_minimum_size_ratio: float = 0.05, | ||
| max_categories_weak_segment: Optional[int] = None, | ||
| n_samples: int = 10_000, | ||
@@ -338,2 +349,3 @@ categorical_aggregation_threshold: float = 0.05, | ||
| segment_minimum_size_ratio=segment_minimum_size_ratio, | ||
| max_categories_weak_segment=max_categories_weak_segment, | ||
| n_samples=n_samples, | ||
@@ -340,0 +352,0 @@ n_to_show=n_to_show, |
@@ -38,3 +38,4 @@ # ---------------------------------------------------------------------------- | ||
| ignore_columns: Union[Hashable, List[Hashable], None], n_top_features: Optional[int], | ||
| segment_minimum_size_ratio: float, alternative_scorer: Dict[str, Union[str, Callable]], | ||
| segment_minimum_size_ratio: float, max_categories_weak_segment: Optional[int], | ||
| alternative_scorer: Dict[str, Union[str, Callable]], | ||
| score_per_sample: Union[np.ndarray, pd.Series, None], n_samples: int, | ||
@@ -49,2 +50,3 @@ categorical_aggregation_threshold: float, n_to_show: int, | ||
| self.segment_minimum_size_ratio = segment_minimum_size_ratio | ||
| self.max_categories_weak_segment = max_categories_weak_segment | ||
| self.n_samples = n_samples | ||
@@ -118,3 +120,3 @@ self.n_to_show = n_to_show | ||
| if len(weak_segments) == 0: | ||
| display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments.'\ | ||
| display_msg = 'WeakSegmentsPerformance was unable to train an error model to find weak segments.' \ | ||
| f'Try supplying additional {self.segment_by}.' | ||
@@ -164,2 +166,5 @@ return CheckResult(value={'message': display_msg}, display=[display_msg]) | ||
| size >= segment_minimum_size_ratio * data_size. | ||
| max_categories_weak_segment: Optional[int] , default: None | ||
| Maximum number of categories that can be included in a weak segment per categorical feature. | ||
| If None, the number of categories is not limited. | ||
| alternative_scorer : Dict[str, Union[str, Callable]] , default: None | ||
@@ -189,2 +194,3 @@ Scorer to use as performance measure, either function or sklearn scorer name. | ||
| segment_minimum_size_ratio: float = 0.05, | ||
| max_categories_weak_segment: Optional[int] = None, | ||
| alternative_scorer: Dict[str, Union[str, Callable]] = None, | ||
@@ -202,2 +208,3 @@ score_per_sample: Union[np.ndarray, pd.Series, None] = None, | ||
| segment_minimum_size_ratio=segment_minimum_size_ratio, | ||
| max_categories_weak_segment=max_categories_weak_segment, | ||
| n_samples=n_samples, | ||
@@ -238,2 +245,5 @@ n_to_show=n_to_show, | ||
| size >= segment_minimum_size_ratio * data_size. | ||
| max_categories_weak_segment: Optional[int] , default: None | ||
| Maximum number of categories that can be included in a weak segment per categorical feature. | ||
| If None, the number of categories is not limited. | ||
| alternative_scorer : Dict[str, Union[str, Callable]] , default: None | ||
@@ -263,2 +273,3 @@ Scorer to use as performance measure, either function or sklearn scorer name. | ||
| segment_minimum_size_ratio: float = 0.05, | ||
| max_categories_weak_segment: Optional[int] = None, | ||
| alternative_scorer: Dict[str, Union[str, Callable]] = None, | ||
@@ -276,2 +287,3 @@ score_per_sample: Union[np.ndarray, pd.Series, None] = None, | ||
| segment_minimum_size_ratio=segment_minimum_size_ratio, | ||
| max_categories_weak_segment=max_categories_weak_segment, | ||
| n_samples=n_samples, | ||
@@ -278,0 +290,0 @@ n_to_show=n_to_show, |
@@ -13,2 +13,3 @@ # ---------------------------------------------------------------------------- | ||
| import contextlib | ||
| import pathlib | ||
| import typing as t | ||
@@ -389,5 +390,5 @@ import warnings | ||
| def set_metadata( | ||
| self, | ||
| metadata: pd.DataFrame, | ||
| categorical_metadata: t.Optional[t.Sequence[str]] = None | ||
| self, | ||
| metadata: pd.DataFrame, | ||
| categorical_metadata: t.Optional[t.Sequence[str]] = None | ||
| ): | ||
@@ -413,8 +414,12 @@ """Set the metadata of the dataset.""" | ||
| def calculate_builtin_properties( | ||
| self, | ||
| include_properties: t.Optional[t.List[str]] = None, | ||
| ignore_properties: t.Optional[t.List[str]] = None, | ||
| include_long_calculation_properties: bool = False, | ||
| ignore_non_english_samples_for_english_properties: bool = True, | ||
| device: t.Optional[str] = None | ||
| self, | ||
| include_properties: t.Optional[t.List[str]] = None, | ||
| ignore_properties: t.Optional[t.List[str]] = None, | ||
| include_long_calculation_properties: bool = False, | ||
| ignore_non_english_samples_for_english_properties: bool = True, | ||
| device: t.Optional[str] = None, | ||
| models_storage: t.Union[pathlib.Path, str, None] = None, | ||
| batch_size: t.Optional[int] = 16, | ||
| cache_models: bool = False, | ||
| use_onnx_models: bool = True, | ||
| ): | ||
@@ -426,10 +431,25 @@ """Calculate the default properties of the dataset. | ||
| include_properties : List[str], default None | ||
| The properties to calculate. If None, all default properties will be calculated. Cannot be used together | ||
| with ignore_properties parameter. | ||
| The properties to calculate. If None, all default properties will be calculated. Cannot be used | ||
| together with ignore_properties parameter. Available properties are: | ||
| ['Text Length', 'Average Word Length', 'Max Word Length', | ||
| '% Special Characters', '% Punctuation', 'Language', | ||
| 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', 'Formality', 'Lexical Density', 'Unique Noun Count', | ||
| 'Reading Ease', 'Average Words Per Sentence', 'URLs Count', 'Unique URLs Count', 'Email Address Count', | ||
| 'Unique Email Address Count', 'Unique Syllables Count', 'Reading Time', 'Sentences Count', | ||
| 'Average Syllable Length'] | ||
| List of default properties are: ['Text Length', 'Average Word Length', 'Max Word Length', | ||
| '% Special Characters', '% Punctuation', 'Language', 'Sentiment', 'Subjectivity', 'Toxicity', 'Fluency', | ||
| 'Formality', 'Lexical Density', 'Unique Noun Count', 'Reading Ease', 'Average Words Per Sentence'] | ||
| To calculate all the default properties, the include_properties and ignore_properties parameters should | ||
| be None. If you pass either include_properties or ignore_properties then only the properties specified | ||
| in the list will be calculated or ignored. | ||
| Note that the properties ['Toxicity', 'Fluency', 'Formality', 'Language', 'Unique Noun Count'] may | ||
| take a long time to calculate. If include_long_calculation_properties is False, these properties will be | ||
| ignored, even if they are in the include_properties parameter. | ||
| ignore_properties : List[str], default None | ||
| The properties to ignore. If None, no properties will be ignored. Cannot be used together with | ||
| properties parameter. | ||
| The properties to ignore from the list of default properties. If None, no properties will be ignored and | ||
| all the default properties will be calculated. Cannot be used together with include_properties parameter. | ||
| include_long_calculation_properties : bool, default False | ||
| Whether to include properties that may take a long time to calculate. If False, these properties will be | ||
| ignored. | ||
| ignored, unless they are specified in the include_properties parameter explicitly. | ||
| ignore_non_english_samples_for_english_properties : bool, default True | ||
@@ -441,4 +461,16 @@ Whether to ignore samples that are not in English when calculating English properties. If False, samples | ||
| only when you are sure that all the samples are in English. | ||
| device : int, default None | ||
| The device to use for the calculation. If None, the default device will be used. | ||
| device : Optional[str], default None | ||
| The device to use for the calculation. If None, the default device will be used. For onnx based models it is | ||
| recommended to set device to None for optimized performance. | ||
| models_storage : Union[str, pathlib.Path, None], default None | ||
| A directory to store the models. | ||
| If not provided, models will be stored in `DEEPCHECKS_LIB_PATH/nlp/.nlp-models`. | ||
| Also, if a folder already contains relevant resources they are not re-downloaded. | ||
| batch_size : Optional[int], default 16 | ||
| The batch size. | ||
| cache_models : bool, default False | ||
| If True, will store the models in device RAM memory. This will speed up the calculation for future calls. | ||
| use_onnx_models : bool, default True | ||
| If True, will use onnx gpu optimized models for the calculation. Requires the optimum[onnxruntime-gpu] | ||
| library to be installed as well as the availability of GPU. | ||
| """ | ||
@@ -454,3 +486,7 @@ if self._properties is not None: | ||
| ignore_non_english_samples_for_english_properties=ignore_non_english_samples_for_english_properties, | ||
| device=device | ||
| device=device, | ||
| models_storage=models_storage, | ||
| batch_size=batch_size, | ||
| cache_models=cache_models, | ||
| use_onnx_models=use_onnx_models, | ||
| ) | ||
@@ -462,5 +498,5 @@ | ||
| def set_properties( | ||
| self, | ||
| properties: pd.DataFrame, | ||
| categorical_properties: t.Optional[t.Sequence[str]] = None | ||
| self, | ||
| properties: pd.DataFrame, | ||
| categorical_properties: t.Optional[t.Sequence[str]] = None | ||
| ): | ||
@@ -797,3 +833,3 @@ """Set the properties of the dataset.""" | ||
| if len(prop_names) != len(properties_to_show): | ||
| raise DeepchecksValueError(f'{set(properties_to_show)-set(prop_names)} ' | ||
| raise DeepchecksValueError(f'{set(properties_to_show) - set(prop_names)} ' | ||
| 'properties does not exist in the TextData object') | ||
@@ -800,0 +836,0 @@ else: |
@@ -12,2 +12,3 @@ # ---------------------------------------------------------------------------- | ||
| """Utils module for calculating embeddings for text.""" | ||
| import re | ||
| import sys | ||
@@ -26,3 +27,7 @@ import warnings | ||
| PATTERN_SPECIAL_CHARS = re.compile(r"[!@#$%^&*()_+{}|:\"<>?~`\-=\[\]\;',.\/]") | ||
| PATTERN_SPACE_CHARS = re.compile(r'\s') | ||
| PATTERN_BR_CHARS = re.compile(r'<br />') | ||
| def batched(iterable, n): | ||
@@ -210,8 +215,19 @@ """Batch data into tuples of length n. The last batch may be shorter.""" | ||
| def _clean_special_chars(text): | ||
| special_chars = r'!@#$%^&*()_+{}|:"<>?~`-=[]\;\',./' | ||
| for char in special_chars: | ||
| text = text.replace(char, '') | ||
| text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ') | ||
| text = text.replace('<br />', ' ') | ||
| def _clean_special_chars(text: str) -> str: | ||
| """ | ||
| Remove special characters and replace whitespace characters with a space. | ||
| Parameters | ||
| ---------- | ||
| text : str | ||
| The `text` parameter is a string that represents the input text that needs to be cleaned. | ||
| Returns | ||
| ------- | ||
| text | ||
| Cleaned text string | ||
| """ | ||
| text = PATTERN_SPECIAL_CHARS.sub('', text) | ||
| text = PATTERN_SPACE_CHARS.sub(' ', text) | ||
| text = PATTERN_BR_CHARS.sub(' ', text) | ||
| return text |
@@ -18,7 +18,5 @@ # ---------------------------------------------------------------------------- | ||
| from importlib import import_module | ||
| from importlib.util import find_spec | ||
| from typing import Optional, Union | ||
| import requests | ||
| import torch | ||
| from nltk import corpus | ||
@@ -80,17 +78,24 @@ from transformers.utils import logging as transformers_logging | ||
| models_storage: Union[pathlib.Path, str, None] = None, | ||
| quantize_model: bool = True, | ||
| use_onnx_model: bool = False, | ||
| use_cache=False | ||
| ): | ||
| """Return a transformers' pipeline for the given model name.""" | ||
| transformers = import_optional_property_dependency('transformers', property_name=property_name) | ||
| if use_onnx_model and 'onnx' not in model_name.lower(): | ||
| raise ValueError("use_onnx_model=True, but model_name is not for a 'onnx' model") | ||
| if use_cache: | ||
| model, tokenizer = _get_transformer_model_and_tokenizer(property_name, model_name, | ||
| models_storage, quantize_model) | ||
| models_storage, use_onnx_model) | ||
| else: | ||
| # __wrapped__ is simply the function without decoration, in our case - without caching | ||
| model, tokenizer = _get_transformer_model_and_tokenizer.__wrapped__(property_name, model_name, | ||
| models_storage, quantize_model) | ||
| models_storage, use_onnx_model) | ||
| pipeline_kwargs = {'device_map': 'auto'} if find_spec('accelerate') is not None else {'device': device} | ||
| return transformers.pipeline('text-classification', model=model, tokenizer=tokenizer, **pipeline_kwargs) | ||
| if use_onnx_model: | ||
| onnx_pipe = import_optional_property_dependency('optimum.pipelines', property_name=property_name) | ||
| return onnx_pipe.pipeline('text-classification', model=model, tokenizer=tokenizer, | ||
| accelerator='ort', device=device) | ||
| else: | ||
| transformers = import_optional_property_dependency('transformers', property_name=property_name) | ||
| return transformers.pipeline('text-classification', model=model, tokenizer=tokenizer, device=device) | ||
@@ -102,7 +107,5 @@ | ||
| user_logger_level = logging.getLogger('transformers').getEffectiveLevel() | ||
| deepchecks_logger_level = logging.getLogger(__name__).getEffectiveLevel() | ||
| is_progress_bar_enabled = transformers_logging.is_progress_bar_enabled() | ||
| transformers_logging.set_verbosity_error() | ||
| logging.getLogger(__name__).setLevel(50) | ||
| transformers_logging.disable_progress_bar() | ||
@@ -116,3 +119,2 @@ logging.getLogger('transformers').setLevel(50) | ||
| logging.getLogger('transformers').setLevel(user_logger_level) | ||
| logging.getLogger(__name__).setLevel(deepchecks_logger_level) | ||
| if is_progress_bar_enabled: | ||
@@ -122,19 +124,2 @@ transformers_logging.enable_progress_bar() | ||
| def get_transformer_loader_params(model_name: str, | ||
| models_storage: Union[pathlib.Path, str, None] = None, | ||
| quantize_model: bool = True, ): | ||
| """Return the params for transformers' model and tokenizer auto classification loaders.""" | ||
| models_storage = get_create_model_storage(models_storage=models_storage) | ||
| model_kwargs = dict(device_map=None) | ||
| tokenizer_kwargs = dict(device_map=None) | ||
| if quantize_model: | ||
| model_path = models_storage / 'quantized' / model_name | ||
| model_kwargs['load_in_8bit'] = True | ||
| model_kwargs['torch_dtype'] = torch.float16 | ||
| else: | ||
| model_path = models_storage / model_name | ||
| return model_path, model_kwargs, tokenizer_kwargs | ||
| @lru_cache(maxsize=5) | ||
@@ -145,3 +130,3 @@ def _get_transformer_model_and_tokenizer( | ||
| models_storage: Union[pathlib.Path, str, None] = None, | ||
| quantize_model: bool = True, | ||
| use_onnx_model: bool = True, | ||
| ): | ||
@@ -152,17 +137,29 @@ """Return a transformers' model and tokenizer in cpu memory.""" | ||
| with _log_suppressor(): | ||
| model_path, model_kwargs, tokenizer_kwargs = get_transformer_loader_params(model_name, models_storage, | ||
| quantize_model) | ||
| models_storage = get_create_model_storage(models_storage=models_storage) | ||
| model_path = models_storage / model_name | ||
| model_path_exists = model_path.exists() | ||
| if model_path.exists(): | ||
| tokenizer = transformers.AutoTokenizer.from_pretrained(model_path, **tokenizer_kwargs) | ||
| model = transformers.AutoModelForSequenceClassification.from_pretrained(model_path, **model_kwargs) | ||
| if use_onnx_model: | ||
| onnx_runtime = import_optional_property_dependency('optimum.onnxruntime', property_name=property_name) | ||
| classifier_cls = onnx_runtime.ORTModelForSequenceClassification | ||
| if model_path_exists: | ||
| model = classifier_cls.from_pretrained(model_path, provider='CUDAExecutionProvider') | ||
| else: | ||
| model = classifier_cls.from_pretrained(model_name, provider='CUDAExecutionProvider') | ||
| model.save_pretrained(model_path) | ||
| else: | ||
| model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name, **model_kwargs) | ||
| model.save_pretrained(model_path) | ||
| if model_path_exists: | ||
| model = transformers.AutoModelForSequenceClassification.from_pretrained(model_path) | ||
| else: | ||
| model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name) | ||
| model.save_pretrained(model_path) | ||
| model.eval() | ||
| tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs) | ||
| if model_path_exists: | ||
| tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) | ||
| else: | ||
| tokenizer = transformers.AutoTokenizer.from_pretrained(model_name) | ||
| tokenizer.save_pretrained(model_path) | ||
| model.eval() | ||
| return model, tokenizer | ||
| return model, tokenizer | ||
@@ -169,0 +166,0 @@ |
@@ -35,3 +35,2 @@ # ---------------------------------------------------------------------------- | ||
| from deepchecks.utils.function import run_available_kwargs | ||
| from deepchecks.utils.gpu_utils import empty_gpu | ||
| from deepchecks.utils.strings import SPECIAL_CHARACTERS, format_list | ||
@@ -157,2 +156,5 @@ | ||
| language_code = prediction[0].replace('__label__', '') if prediction else None | ||
| if language_code == 'eng': # both are english but different labels | ||
| return 'en' | ||
| return language_code | ||
@@ -251,4 +253,7 @@ | ||
| TOXICITY_MODEL_NAME = 'SkolkovoInstitute/roberta_toxicity_classifier' | ||
| TOXICITY_MODEL_NAME_ONNX = 'Deepchecks/roberta_toxicity_classifier_onnx' | ||
| FLUENCY_MODEL_NAME = 'prithivida/parrot_fluency_model' | ||
| FLUENCY_MODEL_NAME_ONNX = 'Deepchecks/parrot_fluency_model_onnx' | ||
| FORMALITY_MODEL_NAME = 's-nlp/roberta-base-formality-ranker' | ||
| FORMALITY_MODEL_NAME_ONNX = 'Deepchecks/roberta_base_formality_ranker_onnx' | ||
@@ -260,2 +265,3 @@ | ||
| models_storage: Union[pathlib.Path, str, None] = None, | ||
| use_onnx_models: bool = True, | ||
| toxicity_classifier: Optional[object] = None | ||
@@ -265,4 +271,7 @@ ) -> Sequence[float]: | ||
| if toxicity_classifier is None: | ||
| use_onnx_models = _validate_onnx_model_availability(use_onnx_models, device) | ||
| model_name = TOXICITY_MODEL_NAME_ONNX if use_onnx_models else TOXICITY_MODEL_NAME | ||
| toxicity_classifier = get_transformer_pipeline( | ||
| property_name='toxicity', model_name=TOXICITY_MODEL_NAME, models_storage=models_storage, device=device) | ||
| property_name='toxicity', model_name=model_name, device=device, | ||
| models_storage=models_storage, use_onnx_model=use_onnx_models) | ||
@@ -293,2 +302,3 @@ class UnitModel: | ||
| models_storage: Union[pathlib.Path, str, None] = None, | ||
| use_onnx_models: bool = True, | ||
| fluency_classifier: Optional[object] = None | ||
@@ -298,4 +308,7 @@ ) -> Sequence[float]: | ||
| if fluency_classifier is None: | ||
| use_onnx_models = _validate_onnx_model_availability(use_onnx_models, device) | ||
| model_name = FLUENCY_MODEL_NAME_ONNX if use_onnx_models else FLUENCY_MODEL_NAME | ||
| fluency_classifier = get_transformer_pipeline( | ||
| property_name='fluency', model_name=FLUENCY_MODEL_NAME, models_storage=models_storage, device=device) | ||
| property_name='fluency', model_name=model_name, device=device, | ||
| models_storage=models_storage, use_onnx_model=use_onnx_models) | ||
@@ -312,2 +325,3 @@ def output_formatter(v): | ||
| models_storage: Union[pathlib.Path, str, None] = None, | ||
| use_onnx_models: bool = True, | ||
| formality_classifier: Optional[object] = None | ||
@@ -317,4 +331,7 @@ ) -> Sequence[float]: | ||
| if formality_classifier is None: | ||
| use_onnx_models = _validate_onnx_model_availability(use_onnx_models, device) | ||
| model_name = FORMALITY_MODEL_NAME_ONNX if use_onnx_models else FORMALITY_MODEL_NAME | ||
| formality_classifier = get_transformer_pipeline( | ||
| property_name='formality', model_name=FORMALITY_MODEL_NAME, models_storage=models_storage, device=device) | ||
| property_name='formality', model_name=model_name, device=device, | ||
| models_storage=models_storage, use_onnx_model=use_onnx_models) | ||
@@ -343,3 +360,3 @@ def output_formatter(v): | ||
| total_unique_words = len(set(all_words)) | ||
| return round(total_unique_words * 100 / len(all_words), 2) | ||
| return round(total_unique_words / len(all_words), 2) | ||
@@ -583,3 +600,3 @@ | ||
| '100 (very easy). Based on Flesch reading-ease score', | ||
| 'Lexical Density': 'Percentage of unique words in the text', | ||
| 'Lexical Density': 'Ratio of unique words in the text', | ||
| 'Toxicity': 'A measure of how harmful or offensive a text sample is (0 to 1), ' | ||
@@ -605,7 +622,5 @@ 'uses the SkolkovoInstitute/roberta_toxicity_classifier model', | ||
| def _select_properties( | ||
| n_of_samples: int, | ||
| include_properties: Optional[List[str]] = None, | ||
| ignore_properties: Optional[List[str]] = None, | ||
| include_long_calculation_properties: bool = False, | ||
| device: Optional[str] = None, | ||
| ) -> Sequence[TextProperty]: | ||
@@ -651,17 +666,4 @@ """Select properties to calculate based on provided parameters.""" | ||
| ] | ||
| else: | ||
| heavy_properties = [ | ||
| prop['name'] for prop in properties | ||
| if prop['name'] in LONG_RUN_PROPERTIES | ||
| ] | ||
| if heavy_properties and n_of_samples > LARGE_SAMPLE_SIZE: | ||
| warning_message = ( | ||
| f'Calculating the properties {heavy_properties} on a large dataset may take a long time. ' | ||
| 'Consider using a smaller sample size or running this code on better hardware.' | ||
| ) | ||
| if device is None or device == 'cpu': | ||
| warning_message += ' Consider using a GPU or a similar device to run these properties.' | ||
| warnings.warn(warning_message, UserWarning) | ||
| return properties | ||
| return properties | ||
@@ -679,3 +681,3 @@ | ||
| cache_models: bool = False, | ||
| quantize_models: bool = True, | ||
| use_onnx_models: bool = True, | ||
| ) -> Tuple[Dict[str, List[float]], Dict[str, str]]: | ||
@@ -717,4 +719,5 @@ """Calculate properties on provided text samples. | ||
| only when you are sure that all the samples are in English. | ||
| device : int, default None | ||
| The device to use for the calculation. If None, the default device will be used. | ||
| device : Optional[str], default None | ||
| The device to use for the calculation. If None, the default device will be used. For onnx based models it is | ||
| recommended to set device to None for optimized performance. | ||
| models_storage : Union[str, pathlib.Path, None], default None | ||
@@ -727,8 +730,6 @@ A directory to store the models. | ||
| cache_models : bool, default False | ||
| If True, will store the models in CPU RAM memory. This will speed up the calculation, but will take up | ||
| more memory. If device is not CPU, the models will be moved from CPU RAM memory to relevant device before | ||
| calculation. | ||
| quantize_models : bool, default True | ||
| If True, will quantize the models to reduce their size and speed up the calculation. Requires the | ||
| accelerate and bitsandbytes libraries to be installed as well as the availability of GPU. | ||
| If True, will store the models in device RAM memory. This will speed up the calculation for future calls. | ||
| use_onnx_models : bool, default True | ||
| If True, will use onnx gpu optimized models for the calculation. Requires the optimum[onnxruntime-gpu] library | ||
| to be installed as well as the availability of GPU. | ||
@@ -742,18 +743,9 @@ Returns | ||
| """ | ||
| if quantize_models: | ||
| if find_spec('accelerate') is None or find_spec('bitsandbytes') is None: | ||
| warnings.warn('Quantization requires the accelerate and bitsandbytes libraries to be installed. ' | ||
| 'Calculating without quantization.') | ||
| quantize_models = False | ||
| if not torch.cuda.is_available(): | ||
| warnings.warn('GPU is required for the quantization process. Calculating without quantization.') | ||
| quantize_models = False | ||
| use_onnx_models = _validate_onnx_model_availability(use_onnx_models, device) | ||
| text_properties = _select_properties( | ||
| include_properties=include_properties, | ||
| ignore_properties=ignore_properties, | ||
| device=device, | ||
| include_long_calculation_properties=include_long_calculation_properties, | ||
| n_of_samples=len(raw_text) | ||
| include_long_calculation_properties=include_long_calculation_properties | ||
| ) | ||
| properties_types = { | ||
@@ -763,2 +755,3 @@ it['name']: it['output_type'] | ||
| } | ||
| _warn_long_compute(device, properties_types, len(raw_text), use_onnx_models) | ||
@@ -779,16 +772,19 @@ kwargs = dict(device=device, models_storage=models_storage) | ||
| if 'Toxicity' in properties_types: | ||
| if 'Toxicity' in properties_types and 'toxicity_classifier' not in kwargs: | ||
| model_name = TOXICITY_MODEL_NAME_ONNX if use_onnx_models else TOXICITY_MODEL_NAME | ||
| kwargs['toxicity_classifier'] = get_transformer_pipeline( | ||
| property_name='toxicity', model_name=TOXICITY_MODEL_NAME, device=device, | ||
| models_storage=models_storage, use_cache=cache_models, quantize_model=quantize_models) | ||
| property_name='toxicity', model_name=model_name, device=device, | ||
| models_storage=models_storage, use_cache=cache_models, use_onnx_model=use_onnx_models) | ||
| if 'Formality' in properties_types and 'formality_classifier' not in kwargs: | ||
| model_name = FORMALITY_MODEL_NAME_ONNX if use_onnx_models else FORMALITY_MODEL_NAME | ||
| kwargs['formality_classifier'] = get_transformer_pipeline( | ||
| property_name='formality', model_name=FORMALITY_MODEL_NAME, device=device, | ||
| models_storage=models_storage, use_cache=cache_models, quantize_model=quantize_models) | ||
| property_name='formality', model_name=model_name, device=device, | ||
| models_storage=models_storage, use_cache=cache_models, use_onnx_model=use_onnx_models) | ||
| if 'Fluency' in properties_types and 'fluency_classifier' not in kwargs: | ||
| model_name = FLUENCY_MODEL_NAME_ONNX if use_onnx_models else FLUENCY_MODEL_NAME | ||
| kwargs['fluency_classifier'] = get_transformer_pipeline( | ||
| property_name='fluency', model_name=FLUENCY_MODEL_NAME, device=device, | ||
| models_storage=models_storage, use_cache=cache_models, quantize_model=quantize_models) | ||
| property_name='fluency', model_name=model_name, device=device, | ||
| models_storage=models_storage, use_cache=cache_models, use_onnx_model=use_onnx_models) | ||
@@ -861,3 +857,2 @@ # Remove language property from the list of properties to calculate as it will be calculated separately: | ||
| sentences_cache.clear() | ||
| empty_gpu(device) | ||
@@ -873,14 +868,33 @@ if not calculated_properties: | ||
| if cache_models: | ||
| # Move the transformers models to CPU RAM memory | ||
| for model_name in ['toxicity_classifier', 'formality_classifier', 'fluency_classifier']: | ||
| if model_name in kwargs: | ||
| kwargs[model_name].model.to('cpu') | ||
| return calculated_properties, properties_types | ||
| # Clean all remaining RAM: | ||
| empty_gpu(device) | ||
| return calculated_properties, properties_types | ||
| def _warn_long_compute(device, properties_types, n_samples, use_onnx_models): | ||
| heavy_properties = [prop for prop in properties_types.keys() if prop in LONG_RUN_PROPERTIES] | ||
| if len(heavy_properties) and n_samples > LARGE_SAMPLE_SIZE: | ||
| warning_message = ( | ||
| f'Calculating the properties {heavy_properties} on a large dataset may take a long time. ' | ||
| 'Consider using a smaller sample size or running this code on better hardware.' | ||
| ) | ||
| if device == 'cpu' or (device is None and not use_onnx_models): | ||
| warning_message += ' Consider using a GPU or a similar device to run these properties.' | ||
| warnings.warn(warning_message, UserWarning) | ||
| def _validate_onnx_model_availability(use_onnx_models: bool, device: Optional[str]): | ||
| if not use_onnx_models: | ||
| return False | ||
| if find_spec('optimum') is None or find_spec('onnxruntime') is None: | ||
| warnings.warn('Onnx models require the optimum[onnxruntime-gpu] library to be installed. ' | ||
| 'Calculating using the default models.') | ||
| return False | ||
| if not torch.cuda.is_available(): | ||
| warnings.warn('GPU is required for the onnx models. Calculating using the default models.') | ||
| return False | ||
| if device is not None and device.lower() == 'cpu': | ||
| warnings.warn('Onnx models are not supported on device CPU. Calculating using the default models.') | ||
| return False | ||
| return True | ||
| def get_builtin_properties_types(): | ||
@@ -887,0 +901,0 @@ """ |
@@ -57,2 +57,5 @@ # ---------------------------------------------------------------------------- | ||
| size >= segment_minimum_size_ratio * data_size. | ||
| max_categories_weak_segment: Optional[int] , default: None | ||
| Maximum number of categories that can be included in a weak segment per categorical feature. | ||
| If None, the number of categories is not limited. | ||
| alternative_scorer : Dict[str, Union[str, Callable]] , default: None | ||
@@ -88,2 +91,3 @@ Scorer to use as performance measure, either function or sklearn scorer name. | ||
| segment_minimum_size_ratio: float = 0.05, | ||
| max_categories_weak_segment: Optional[int] = None, | ||
| alternative_scorer: Dict[str, Union[str, Callable]] = None, | ||
@@ -110,2 +114,3 @@ loss_per_sample: Union[np.ndarray, pd.Series, None] = None, | ||
| self.segment_minimum_size_ratio = segment_minimum_size_ratio | ||
| self.max_categories_weak_segment = max_categories_weak_segment | ||
| self.n_samples = n_samples | ||
@@ -112,0 +117,0 @@ self.n_to_show = n_to_show |
@@ -260,3 +260,6 @@ # ---------------------------------------------------------------------------- | ||
| self.is_binary = self.model_classes and len(self.model_classes) == 2 | ||
| self.predictions = pd.Series(self.user_model.predict(data).squeeze(), index=data.index) | ||
| predictions = self.user_model.predict(data) | ||
| if len(data) > 1: | ||
| predictions = predictions.squeeze() | ||
| self.predictions = pd.Series(predictions, index=data.index) | ||
@@ -263,0 +266,0 @@ def predict(self, data: pd.DataFrame) -> np.ndarray: |
@@ -43,2 +43,3 @@ # ---------------------------------------------------------------------------- | ||
| segment_minimum_size_ratio: float = 0.05 | ||
| max_categories_weak_segment: Optional[int] = None | ||
| random_state: int = 42 | ||
@@ -199,2 +200,5 @@ add_condition: Callable[..., Any] | ||
| feature1, feature2 = feature_rank_for_search[[i, j]] | ||
| # Categorical features come first | ||
| if feature1 not in self.encoder_mapping and feature2 in self.encoder_mapping: | ||
| feature2, feature1 = feature_rank_for_search[[i, j]] | ||
| weak_segment_score, weak_segment_filter = self._find_weak_segment(data, [feature1, feature2], | ||
@@ -218,21 +222,66 @@ score_per_sample, label_col, | ||
| # Sort and drop relevant columns | ||
| weak_segments = weak_segments.sort_values(score_title).reset_index(drop=True) | ||
| if multiple_segments_per_feature: | ||
| result = weak_segments.drop(columns='Samples in Segment').drop_duplicates() | ||
| result['Samples in Segment'] = weak_segments.loc[result.index, 'Samples in Segment'] | ||
| else: | ||
| used_features = set() | ||
| result = pd.DataFrame(columns=weak_segments.columns) | ||
| for _, row in weak_segments.iterrows(): | ||
| if row['Feature1'] in used_features or row['Feature2'] in used_features: | ||
| continue | ||
| # Filter and adapt the weak segments results | ||
| result = pd.DataFrame(columns=weak_segments.columns) | ||
| used_features = set() | ||
| for _, row in weak_segments.sort_values(score_title).iterrows(): | ||
| new_row = row.copy() | ||
| if not multiple_segments_per_feature and \ | ||
| (row['Feature1'] in used_features or row['Feature2'] in used_features): | ||
| continue | ||
| result.loc[len(result)] = row | ||
| used_features.add(row['Feature1']) | ||
| if row['Feature2'] != '': | ||
| used_features.add(row['Feature2']) | ||
| # Make sure segments based on categorical features are based only on a single category | ||
| if self.max_categories_weak_segment is not None and row['Feature1'] in self.encoder_mapping: | ||
| unique_values_in_range = [x for x in self.encoder_mapping[row['Feature1']]['encoded_value'].values if | ||
| row['Feature1 Range'][1] > x > row['Feature1 Range'][0]] | ||
| if len(unique_values_in_range) > self.max_categories_weak_segment: | ||
| subset = data.loc[new_row['Samples in Segment']] | ||
| value_segment_size = [len(subset[subset[row['Feature1']] == x]) for x in unique_values_in_range] | ||
| # If all sub segments are too small, remove feature 2 filter | ||
| if max(value_segment_size) < len(data) * self.segment_minimum_size_ratio and row['Feature2'] != '': | ||
| subset = data | ||
| value_segment_size = [len(data[data[row['Feature1']] == x]) for x in unique_values_in_range] | ||
| new_row['Feature2'] = '' | ||
| new_row['Feature2 Range'] = None | ||
| if max(value_segment_size) < len(data) * self.segment_minimum_size_ratio: | ||
| continue | ||
| return result | ||
| value_to_use = unique_values_in_range[np.argmax(value_segment_size)] | ||
| subset = subset[subset[row['Feature1']] == value_to_use] | ||
| new_row['Samples in Segment'] = list(subset.index) | ||
| new_row['% of Data'] = round(100 * len(new_row['Samples in Segment']) / len(data), 2) | ||
| new_row['Feature1 Range'] = [value_to_use - 0.5, value_to_use + 0.5] | ||
| if dummy_model is not None and label_col is not None and scorer is not None: | ||
| new_row[score_title] = scorer.run_on_data_and_label(dummy_model, | ||
| subset, label_col[list(subset.index)]) | ||
| else: | ||
| new_row[score_title] = score_per_sample[list(subset.index)].mean() | ||
| if self.max_categories_weak_segment is not None and \ | ||
| new_row['Feature2'] != '' and row['Feature2'] in self.encoder_mapping: | ||
| unique_values_in_range = [x for x in self.encoder_mapping[row['Feature2']]['encoded_value'].values if | ||
| row['Feature2 Range'][1] > x > row['Feature2 Range'][0]] | ||
| if len(unique_values_in_range) > self.max_categories_weak_segment: | ||
| subset = data.loc[new_row['Samples in Segment']] | ||
| value_segment_size = [len(subset[subset[row['Feature2']] == x]) for x in unique_values_in_range] | ||
| # Feature 1 cannot be empty, so if feature 2 does not have a large enough segment, ignore the row | ||
| if max(value_segment_size) < len(data) * self.segment_minimum_size_ratio: | ||
| continue | ||
| value_to_use = unique_values_in_range[np.argmax(value_segment_size)] | ||
| subset = subset[subset[row['Feature2']] == value_to_use] | ||
| new_row['Samples in Segment'] = list(subset.index) | ||
| new_row['% of Data'] = round(100 * len(new_row['Samples in Segment']) / len(data), 2) | ||
| new_row['Feature2 Range'] = [value_to_use - 0.5, value_to_use + 0.5] | ||
| if dummy_model is not None and label_col is not None and scorer is not None: | ||
| new_row[score_title] = scorer.run_on_data_and_label(dummy_model, | ||
| subset, label_col[list(subset.index)]) | ||
| else: | ||
| new_row[score_title] = score_per_sample[list(subset.index)].mean() | ||
| result.loc[len(result)] = new_row | ||
| used_features.add(new_row['Feature1']) | ||
| if new_row['Feature2'] != '': | ||
| used_features.add(new_row['Feature2']) | ||
| return result.sort_values(score_title).drop_duplicates(subset=['Feature1', 'Feature2']) | ||
| def _find_weak_segment(self, data: pd.DataFrame, features_for_segment: List[str], score_per_sample: pd.Series, | ||
@@ -252,2 +301,11 @@ label_col: Optional[pd.Series] = None, dummy_model: Optional[_DummyModel] = None, | ||
| """ | ||
| # Remove rows with na values in the relevant columns | ||
| data_for_search = data[features_for_segment].dropna() | ||
| if len(data_for_search) == 0: | ||
| return None, None | ||
| segment_minimum_size_ratio = self.segment_minimum_size_ratio * len(data) / len(data_for_search) | ||
| score_per_sample_for_search = score_per_sample.loc[data_for_search.index] | ||
| if label_col is not None: | ||
| label_col_for_search = label_col.loc[data_for_search.index] | ||
| if version.parse(sklearn.__version__) < version.parse('1.0.0'): | ||
@@ -259,3 +317,3 @@ criterion = ['mse', 'mae'] | ||
| 'max_depth': [5], | ||
| 'min_weight_fraction_leaf': [self.segment_minimum_size_ratio], | ||
| 'min_weight_fraction_leaf': [segment_minimum_size_ratio], | ||
| 'min_samples_leaf': [5], | ||
@@ -272,6 +330,6 @@ 'criterion': criterion, | ||
| if scorer is not None and dummy_model is not None and label_col is not None: | ||
| leaf_data, leaf_labels = leaf_filter.filter(data, label_col) | ||
| leaf_data, leaf_labels = leaf_filter.filter(data_for_search, label_col_for_search) | ||
| leaf_score = scorer.run_on_data_and_label(dummy_model, leaf_data, leaf_labels) | ||
| else: # if no scorer is provided, use the average loss_per_sample of samples in the leaf as the score | ||
| leaf_score = score_per_sample[list(leaf_filter.filter(data).index)].mean() | ||
| leaf_score = score_per_sample_for_search[list(leaf_filter.filter(data_for_search).index)].mean() | ||
@@ -294,3 +352,3 @@ if leaf_score < min_score: | ||
| try: | ||
| grid_searcher.fit(data[features_for_segment], score_per_sample) | ||
| grid_searcher.fit(data_for_search, score_per_sample_for_search) | ||
| # Get the worst leaf filter out of the selected tree | ||
@@ -297,0 +355,0 @@ segment_score, segment_filter = get_worst_leaf_filter(grid_searcher.best_estimator_.tree_) |
@@ -46,10 +46,10 @@ # ---------------------------------------------------------------------------- | ||
| """Fill NaN values based on column type if possible otherwise returns None.""" | ||
| if is_cat_column: | ||
| if is_cat_column and 'None' not in col.astype('object').dropna().unique(): | ||
| return col.astype('object').fillna('None') | ||
| elif is_numeric_dtype(col): | ||
| return col.astype('float64').fillna(col.mean()) | ||
| else: | ||
| common_values_list = col.mode() | ||
| if isinstance(common_values_list, pd.Series) and len(common_values_list) > 0: | ||
| return col.fillna(common_values_list[0]) | ||
| return col.astype('float64').fillna(np.nan) | ||
| common_values_list = col.mode() | ||
| if isinstance(common_values_list, pd.Series) and len(common_values_list) > 0: | ||
| return col.fillna(common_values_list[0]) | ||
| return None | ||
@@ -56,0 +56,0 @@ |
@@ -63,2 +63,5 @@ # ---------------------------------------------------------------------------- | ||
| size >= segment_minimum_size_ratio * data_size. | ||
| max_categories_weak_segment: Optional[int] , default: None | ||
| Maximum number of categories that can be included in a weak segment per categorical feature. | ||
| If None, the number of categories is not limited. | ||
| n_samples : Optional[int] , default: 10_000 | ||
@@ -81,2 +84,3 @@ number of samples to use for this check. | ||
| segment_minimum_size_ratio: float = 0.05, | ||
| max_categories_weak_segment: Optional[int] = None, | ||
| n_samples: Optional[int] = 10000, | ||
@@ -94,2 +98,3 @@ categorical_aggregation_threshold: float = 0.05, | ||
| self.segment_minimum_size_ratio = segment_minimum_size_ratio | ||
| self.max_categories_weak_segment = max_categories_weak_segment | ||
| self.categorical_aggregation_threshold = categorical_aggregation_threshold | ||
@@ -96,0 +101,0 @@ self.multiple_segments_per_property = multiple_segments_per_property |
@@ -230,11 +230,11 @@ # ---------------------------------------------------------------------------- | ||
| if show_only == 'largest': | ||
| tests_metrics_df = tests_metrics_df.sort_values(by='Number of samples', ascending=False) | ||
| tests_metrics_df = tests_metrics_df.sort_values(by=['Number of samples', 'Value', 'Class'], ascending=False) | ||
| elif show_only == 'smallest': | ||
| tests_metrics_df = tests_metrics_df.sort_values(by='Number of samples', ascending=True) | ||
| tests_metrics_df = tests_metrics_df.sort_values(by=['Number of samples', 'Value', 'Class'], ascending=True) | ||
| elif show_only == 'random': | ||
| tests_metrics_df = tests_metrics_df.sample(frac=1) | ||
| elif show_only == 'best': | ||
| tests_metrics_df = tests_metrics_df.sort_values(by='Value', ascending=False) | ||
| tests_metrics_df = tests_metrics_df.sort_values(by=['Value', 'Number of samples'], ascending=False) | ||
| elif show_only == 'worst': | ||
| tests_metrics_df = tests_metrics_df.sort_values(by='Value', ascending=True) | ||
| tests_metrics_df = tests_metrics_df.sort_values(by=['Value', 'Number of samples'], ascending=True) | ||
| else: | ||
@@ -241,0 +241,0 @@ raise ValueError(f'Unknown show_only value: {show_only}') |
+2
-2
| Metadata-Version: 2.1 | ||
| Name: deepchecks | ||
| Version: 0.17.5 | ||
| Version: 0.18.0 | ||
| Summary: Package for validating your machine learning model and data | ||
@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks | ||
| License: UNKNOWN | ||
| Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.17.5/deepchecks-0.17.5.tar.gz | ||
| Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.18.0/deepchecks-0.18.0.tar.gz | ||
| Project-URL: Documentation, https://docs.deepchecks.com | ||
@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks |
+8
-5
@@ -22,3 +22,3 @@ <!-- | ||
| <!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section --> | ||
| [](#https://github.com/deepchecks/deepchecks/blob/main/CONTRIBUTING.rst) | ||
| [](#https://github.com/deepchecks/deepchecks/blob/main/CONTRIBUTING.rst) | ||
| <!-- ALL-CONTRIBUTORS-BADGE:END --> | ||
@@ -116,3 +116,3 @@ | ||
| This will automatically download the necessary dependencies, run the installation prcoess | ||
| This will automatically download the necessary dependencies, run the installation process | ||
| and then start the application locally. | ||
@@ -223,3 +223,3 @@ | ||
| distribution (e.g. detect drifts or leakages), data integrity (e.g. find conflicting labels) and more. | ||
| These checks results can be run manually (e.g. during research) or trigerred automatically (e.g. during CI | ||
| These checks results can be run manually (e.g. during research) or triggered automatically (e.g. during CI | ||
| and production monitoring) and enable automatically making informed decisions regarding your model pipelines' | ||
@@ -239,3 +239,3 @@ production-readiness, and behavior over time. | ||
| the Deepchecks Testing Python package (this repo). | ||
| - Each check tests for a specific potential problem. Deepchecks has many preimplemented checks for | ||
| - Each check tests for a specific potential problem. Deepchecks has many pre-implemented checks for | ||
| finding issues with the model's performance (e.g. identifying weak segments), | ||
@@ -260,3 +260,3 @@ data distribution (e.g. detect drifts or leakages) and data integrity (e.g. find conflicting labels). | ||
| The check's results can be examined with visual reports (by saving them to an HTML file, or seeing them in Jupyter), | ||
| processed with code (using their json output), and inspected and colloaborated upon with a dynamic UI | ||
| processed with code (using their json output), and inspected and collaborated upon with a dynamic UI | ||
| (for examining test results and for production monitoring). | ||
@@ -371,2 +371,5 @@ Optional conditions can be added to each check, to automatically validate whether it passed or not. | ||
| </tr> | ||
| <tr> | ||
| <td align="center" valign="top" width="14.28%"><a href="https://www.kaggle.com/rayanaay"><img src="https://avatars.githubusercontent.com/u/55285736?v=4?s=100" width="100px;" alt="AIT ALI YAHIA Rayane"/><br /><sub><b>AIT ALI YAHIA Rayane</b></sub></a><br /><a href="#code-RayanAAY-ops" title="Code">💻</a> <a href="#ideas-RayanAAY-ops" title="Ideas, Planning, & Feedback">🤔</a></td> | ||
| </tr> | ||
| </tbody> | ||
@@ -373,0 +376,0 @@ </table> |
@@ -11,3 +11,3 @@ twine | ||
| pytest | ||
| pytest<8.0.0 | ||
| pytest-cov | ||
@@ -38,3 +38,4 @@ pyhamcrest | ||
| opencv-python>=4.1.2 | ||
| Pillow>=7.1.2 | ||
| pillow>=7.1.2; python_version < '3.8' | ||
| pillow>=10.0.1; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability | ||
| PyYAML>=5.3.1 | ||
@@ -51,6 +52,14 @@ requests>=2.23.0 | ||
| # NLP | ||
| nltk<=3.6.7 | ||
| nltk<=3.6.7; python_version < '3.7' | ||
| nltk>=3.8.1; python_version >= '3.7' # not directly required, pinned by Snyk to avoid a vulnerability | ||
| datasets | ||
| textblob | ||
| transformers | ||
| sentence-transformers | ||
| sentence-transformers | ||
| # Avoid Vulnerabilities | ||
| tornado>=6.3.3; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability | ||
| pyarrow>=14.0.1; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability | ||
| aiohttp>=3.9.0; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability | ||
| umap-learn==0.5.4 |
| seqeval>=1.0.0 | ||
| nltk>=3.4.0,<=3.6.7 | ||
| nltk<=3.6.7; python_version < '3.7' | ||
| nltk>=3.8.1; python_version >= '3.7' # not directly required, pinned by Snyk to avoid a vulnerability | ||
| textblob>=0.17.1 | ||
| umap-learn | ||
| transformers>=4.0.0 | ||
| transformers>=4.0.0,<4.37.0 | ||
| tiktoken; python_version >= '3.8' |
| pandas>=1.1.5,<2.2.0 | ||
| numpy>=1.19 | ||
| scikit-learn>=0.23.2 | ||
| numpy>=1.19; python_version < '3.8' | ||
| numpy>=1.22.2; python_version >= '3.8' | ||
| scikit-learn>=0.23.2, <1.4.0 | ||
| jsonpickle>=2 | ||
@@ -8,3 +9,3 @@ PyNomaly>=0.3.3 | ||
| # require for python 3.8+ | ||
| ipython>=7.15.0,<8; python_version >= '3.8' | ||
| ipython>=7.15.0; python_version >= '3.8' | ||
| ipykernel>=5.3.0; python_version >= '3.8' | ||
@@ -14,3 +15,3 @@ ipywidgets>=7.6.5,<8; python_version >= '3.8' | ||
| # google colab requirements (python 3.7) | ||
| ipython>=5.5.0,<8; python_version < '3.8' | ||
| ipython>=5.5.0; python_version < '3.8' | ||
| ipykernel>=4.10.1; python_version < '3.8' | ||
@@ -36,4 +37,4 @@ ipywidgets>=7.5.0,<8; python_version < '3.8' | ||
| matplotlib>=3.3.4 | ||
| pyzmq<24.0.0 | ||
| beautifulsoup4>=4.11.1 | ||
| requests>=2.22.0 | ||
| jupyter-server>=2.7.2; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability |
+1
-1
@@ -1,1 +0,1 @@ | ||
| 0.17.5 | ||
| 0.18.0 |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
19920163
0.07%70207
0.24%