Latest Threat Research: SANDWORM_MODE: Shai-Hulud-Style npm Worm Hijacks CI Workflows and Poisons AI Toolchains. Details
Socket
Book a Demo · Install · Sign in
Socket

deepchecks

Package Overview
Dependencies
Maintainers
1
Versions
59
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

deepchecks - npm Package Compare versions

Comparing version
0.18.1
to
0.19.0
+2
-2
deepchecks.egg-info/PKG-INFO
Metadata-Version: 2.1
Name: deepchecks
Version: 0.18.1
Version: 0.19.0
Summary: Package for validating your machine learning model and data

@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks

License: UNKNOWN
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.18.1/deepchecks-0.18.1.tar.gz
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.19.0/deepchecks-0.19.0.tar.gz
Project-URL: Documentation, https://docs.deepchecks.com

@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks

@@ -35,3 +35,3 @@ pandas<2.2.0,>=1.1.5

ipykernel>=5.3.0
ipywidgets<8,>=7.6.5
ipywidgets>=7.6.5
jupyter-server>=2.7.2

@@ -43,6 +43,8 @@

umap-learn
transformers<4.37.0,>=4.0.0
transformers>=4.0.0
huggingface_hub
sentence_transformers>=3.0.0
[nlp-properties]
fasttext>=0.8.0
fasttext<0.9.3,>=0.8.0

@@ -61,3 +63,3 @@ [nlp:python_version < "3.7"]

opencv-python>=4.5.5.62
albumentations>=1.1.0
albumentations<1.4.0,>=1.1.0
imgaug>=0.4.0

@@ -64,0 +66,0 @@ seaborn>=0.1.0

@@ -22,3 +22,4 @@ # ----------------------------------------------------------------------------

from IPython.core.display import display, display_html
from IPython.core.display import display_html
from IPython.display import display
from ipywidgets import Widget

@@ -109,3 +110,3 @@

elif as_widget is True:
display_html(self.widget_serializer.serialize(
display(self.widget_serializer.serialize(
output_id=unique_id,

@@ -112,0 +113,0 @@ **kwargs

@@ -252,5 +252,7 @@ # ----------------------------------------------------------------------------

children = []
titles = [] # Create a list to store titles
for i, (name, display) in enumerate(item.items()):
tab.set_title(i, name)
# First collect all children and titles
for name, display in item.items():
titles.append(name) # Add title to list
children.append(VBox(children=cls.handle_display(

@@ -263,3 +265,6 @@ display,

# Set all properties at once
tab.children = children
tab.titles = tuple(titles) # Set all titles at once
style = '<style>.jupyter-widgets.widget-tab > .p-TabBar .p-TabBar-tab {min-width: fit-content;}</style>'

@@ -266,0 +271,0 @@ return VBox(children=[

@@ -118,7 +118,8 @@ # ----------------------------------------------------------------------------

])
return Accordion(
accordion = Accordion(
children=[content],
_titles={'0': self.value.name},
selected_index='0'
)
accordion.set_title(0, self.value.name)
return accordion

@@ -163,5 +164,5 @@ def prepare_summary(

children=children,
_titles={'0': title},
selected_index=None
))
accordion.set_title(0, title)
return VBox(children=(

@@ -205,5 +206,5 @@ # by putting `section_anchor` before the results accordion

children=(HTML(value='<p>No outputs to show.</p>'),),
_titles={'0': title},
selected_index=None
))
accordion.set_title(0, title)
else:

@@ -232,5 +233,5 @@ section_id = f'{output_id}-section-{get_random_string()}'

children=(VBox(children=children),),
_titles={'0': title},
selected_index=None
))
accordion.set_title(0, title)

@@ -237,0 +238,0 @@ return VBox(children=(

@@ -27,2 +27,3 @@ # ----------------------------------------------------------------------------

from deepchecks.nlp.text_data import TextData
from deepchecks.nlp.utils.text_properties_models import check_nltk_resource
from deepchecks.utils.numbers import round_sig

@@ -183,6 +184,6 @@ from deepchecks.utils.strings import format_list, format_percent, truncate_string

# Choose tokenizer based on availability of nltk
if nltk.download('punkt', quiet=True):
if check_nltk_resource('punkt_tab'):
tokenize = nltk.word_tokenize
else:
warnings.warn('nltk punkt is not available, using str.split instead to identify individual words. '
warnings.warn('nltk punkt_tab is not available, using str.split instead to identify individual words. '
'Please check your internet connection.')

@@ -189,0 +190,0 @@ tokenize = str.split

@@ -21,3 +21,4 @@ # ----------------------------------------------------------------------------

import requests
from nltk import corpus
from nltk import corpus, data
from nltk import download as nltk_download
from transformers.utils import logging as transformers_logging

@@ -28,2 +29,29 @@

def check_nltk_resource(resource_name: str, resource_path: Optional[str] = None) -> bool:
    """Return True if the named NLTK resource is available, downloading it on demand.

    Parameters
    ----------
    resource_name : str
        NLTK resource to look up (and fetch if absent).
    resource_path : str, optional
        Location to probe for the resource; when omitted,
        'tokenizers/{resource_name}' is used.

    Returns
    -------
    bool
        True when the resource is already present or the download succeeded,
        False otherwise.
    """
    lookup_path = resource_path or f'tokenizers/{resource_name}'
    try:
        data.find(lookup_path)
    except LookupError:
        # Resource missing locally -- fall back to a quiet download attempt.
        try:
            return nltk_download(resource_name, quiet=True)
        except Exception:  # pylint: disable=broad-except
            return False
    return True
def get_create_model_storage(models_storage: Union[pathlib.Path, str, None] = None):

@@ -30,0 +58,0 @@ """Get the models storage directory and create it if needed."""

@@ -25,4 +25,2 @@ # ----------------------------------------------------------------------------

import torch.cuda
from nltk import corpus
from nltk import download as nltk_download
from nltk import sent_tokenize, word_tokenize

@@ -34,3 +32,4 @@ from tqdm import tqdm

from deepchecks.nlp.utils.text import cut_string, hash_text, normalize_text, remove_punctuation
from deepchecks.nlp.utils.text_properties_models import get_cmudict_dict, get_fasttext_model, get_transformer_pipeline
from deepchecks.nlp.utils.text_properties_models import (check_nltk_resource, get_cmudict_dict, get_fasttext_model,
get_transformer_pipeline)
from deepchecks.utils.function import run_available_kwargs

@@ -69,4 +68,4 @@ from deepchecks.utils.strings import SPECIAL_CHARACTERS, format_list

if hash_key not in sentences_cache:
if not nltk_download('punkt', quiet=True):
_warn_if_missing_nltk_dependencies('punkt', 'property')
if not check_nltk_resource('punkt_tab'):
_warn_if_missing_nltk_dependencies('punkt_tab', 'property')
return None

@@ -220,3 +219,6 @@ sentences_cache[hash_key] = sent_tokenize(text)

if len(text_to_use) == 0:
text_to_use = cut_string(sentences[0], MAX_CHARS)
if len(sentences) > 0:
text_to_use = cut_string(sentences[0], MAX_CHARS)
else:
text_to_use = None
text_list_to_predict.append(text_to_use)

@@ -231,7 +233,10 @@ else:

for text in text_list_to_predict:
try:
v = classifier(text)[0]
results.append(output_formatter(v))
except Exception: # pylint: disable=broad-except
if text is None:
results.append(np.nan)
else:
try:
v = classifier(text)[0]
results.append(output_formatter(v))
except Exception: # pylint: disable=broad-except
results.append(np.nan)
return results # Return the results if prediction is successful

@@ -348,6 +353,5 @@

return np.nan
if not nltk_download('punkt', quiet=True):
_warn_if_missing_nltk_dependencies('punkt', 'Lexical Density')
if not check_nltk_resource('punkt_tab'):
_warn_if_missing_nltk_dependencies('punkt_tab', 'Lexical Density')
return np.nan
all_words = _split_to_words_with_cache(text)

@@ -364,6 +368,7 @@ if len(all_words) == 0:

return np.nan
if not nltk_download('averaged_perceptron_tagger', quiet=True):
_warn_if_missing_nltk_dependencies('averaged_perceptron_tagger', 'Unique Noun Count')
if not check_nltk_resource('averaged_perceptron_tagger_eng',
'taggers/averaged_perceptron_tagger_eng'):
_warn_if_missing_nltk_dependencies('averaged_perceptron_tagger_eng',
'Unique Noun Count')
return np.nan
unique_words_with_tags = set(textblob.TextBlob(text).tags)

@@ -383,6 +388,7 @@ return sum(1 for (_, tag) in unique_words_with_tags if tag.startswith('N'))

if cmudict_dict is None:
if not nltk_download('cmudict', quiet=True):
if not check_nltk_resource('cmudict', 'corpora/cmudict'):
_warn_if_missing_nltk_dependencies('cmudict', 'Reading Ease')
return np.nan
cmudict_dict = corpus.cmudict.dict()
else:
cmudict_dict = get_cmudict_dict()
text_sentences = _sample_for_property(text, mode='sentences', limit=DEFAULT_SENTENCE_SAMPLE_SIZE,

@@ -454,10 +460,11 @@ return_as_list=True)

return np.nan
if not nltk_download('punkt', quiet=True):
_warn_if_missing_nltk_dependencies('punkt', 'Unique Syllables Count')
if not check_nltk_resource('punkt_tab'):
_warn_if_missing_nltk_dependencies('punkt_tab', 'Unique Syllables Count')
return np.nan
if cmudict_dict is None:
if not nltk_download('cmudict', quiet=True):
if not check_nltk_resource('cmudict', 'corpora/cmudict'):
_warn_if_missing_nltk_dependencies('cmudict', 'Unique Syllables Count')
return np.nan
cmudict_dict = corpus.cmudict.dict()
else:
cmudict_dict = get_cmudict_dict()

@@ -491,4 +498,4 @@ text = remove_punctuation(text.lower())

return np.nan
if not nltk_download('punkt', quiet=True):
_warn_if_missing_nltk_dependencies('punkt', 'Sentences Count')
if not check_nltk_resource('punkt_tab'):
_warn_if_missing_nltk_dependencies('punkt_tab', 'Sentences Count')
return np.nan

@@ -502,10 +509,11 @@ return len(_split_to_sentences_with_cache(text))

return np.nan
if not nltk_download('punkt', quiet=True):
_warn_if_missing_nltk_dependencies('punkt', 'Average Syllable Length')
if not check_nltk_resource('punkt_tab'):
_warn_if_missing_nltk_dependencies('punkt_tab', 'Average Syllable Length')
return np.nan
if cmudict_dict is None:
if not nltk_download('cmudict', quiet=True):
if not check_nltk_resource('cmudict', 'corpora/cmudict'):
_warn_if_missing_nltk_dependencies('cmudict', 'Average Syllable Length')
return np.nan
cmudict_dict = corpus.cmudict.dict()
else:
cmudict_dict = get_cmudict_dict()
sentence_count = len(_split_to_sentences_with_cache(text))

@@ -557,3 +565,3 @@ text = remove_punctuation(text.lower())

{'name': 'Unique Noun Count', 'method': unique_noun_count, 'output_type': 'numeric'},
)
)

@@ -571,3 +579,3 @@ ALL_PROPERTIES: Tuple[TextProperty, ...] = \

{'name': 'Average Syllable Length', 'method': average_syllable_length, 'output_type': 'numeric'},
) + DEFAULT_PROPERTIES
) + DEFAULT_PROPERTIES

@@ -761,7 +769,8 @@ LONG_RUN_PROPERTIES = ('Toxicity', 'Fluency', 'Formality', 'Unique Noun Count')

if properties_requiring_cmudict:
if not nltk_download('cmudict', quiet=True):
if not check_nltk_resource('cmudict', 'corpora/cmudict'):
_warn_if_missing_nltk_dependencies('cmudict', format_list(properties_requiring_cmudict))
for prop in properties_requiring_cmudict:
calculated_properties[prop] = [np.nan] * len(raw_text)
kwargs['cmudict_dict'] = get_cmudict_dict(use_cache=cache_models)
else:
kwargs['cmudict_dict'] = get_cmudict_dict(use_cache=cache_models)

@@ -768,0 +777,0 @@ if 'Toxicity' in properties_types and 'toxicity_classifier' not in kwargs:

@@ -89,3 +89,3 @@ # ----------------------------------------------------------------------------

return text
if nltk.download('punkt', quiet=True):
if nltk.download('punkt', quiet=True) and nltk.download('punkt_tab', quiet=True):
tokenize = word_tokenize

@@ -92,0 +92,0 @@ else:

@@ -39,4 +39,4 @@ """PPS (Predictive Power Score) module."""

import pandas as pd
from pandas.api.types import (is_bool_dtype, is_categorical_dtype, is_datetime64_any_dtype, is_numeric_dtype,
is_object_dtype, is_string_dtype, is_timedelta64_dtype)
from pandas.api.types import (is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype, is_object_dtype,
is_string_dtype, is_timedelta64_dtype)
from sklearn import preprocessing, tree

@@ -228,3 +228,3 @@ from sklearn.metrics import f1_score, mean_absolute_error

def _dtype_represents_categories(series) -> bool:
def _dtype_represents_categories(series: pd.Series) -> bool:
"""Determine if the dtype of the series represents categorical values."""

@@ -235,3 +235,3 @@ return (

or is_string_dtype(series)
or is_categorical_dtype(series)
or isinstance(series.dtype, pd.CategoricalDtype)
)

@@ -238,0 +238,0 @@

@@ -17,3 +17,2 @@ # ----------------------------------------------------------------------------

import pandas as pd
from pandas.api.types import is_categorical_dtype

@@ -109,3 +108,3 @@ from deepchecks.core import CheckResult, ConditionCategory, ConditionResult

column_data = df[column_name]
if is_categorical_dtype(column_data) is True:
if isinstance(column_data.dtype, pd.CategoricalDtype):
# NOTE:

@@ -112,0 +111,0 @@ # 'pandas.Series.value_counts' and 'pandas.Series.apply'

@@ -93,4 +93,6 @@ # ----------------------------------------------------------------------------

_TRAIN_DATA_URL = 'https://drive.google.com/uc?export=download&id=1UWkr1BQlyyUkbsW5hHIFTr-x0evZE3Ie'
_TEST_DATA_URL = 'https://drive.google.com/uc?export=download&id=1lfpWVtDktrnsLUzCN1tkRc1jRbguEz3a'
_TRAIN_DATA_URL = ('https://raw.githubusercontent.com/deepchecks/deepchecks-datasets/'
'8dd24134239b9df5d2a3a13cdce38cc22caaaaf4/airbnb_ref_data.csv')
_TEST_DATA_URL = ('https://raw.githubusercontent.com/deepchecks/deepchecks-datasets/'
'8dd24134239b9df5d2a3a13cdce38cc22caaaaf4/airbnb_prod_data.csv')
_target = 'price'

@@ -97,0 +99,0 @@ _predictions = 'predictions'

@@ -264,2 +264,3 @@ # ----------------------------------------------------------------------------

self.predictions = pd.Series(predictions, index=data.index)
self._estimator_type = 'classifier'

@@ -266,0 +267,0 @@ def predict(self, data: pd.DataFrame) -> np.ndarray:

@@ -325,3 +325,4 @@ # ----------------------------------------------------------------------------

if scorer is not None and dummy_model is not None and label_col is not None:
leaf_data, leaf_labels = leaf_filter.filter(data_for_search, label_col_for_search)
leaf_data = leaf_filter.filter(data_for_search)
leaf_labels = label_col_for_search.loc[leaf_data.index]
leaf_score = scorer.run_on_data_and_label(dummy_model, leaf_data, leaf_labels)

@@ -345,3 +346,3 @@ else: # if no scorer is provided, use the average loss_per_sample of samples in the leaf as the score

grid_searcher = GridSearchCV(DecisionTreeRegressor(random_state=random_state),
scoring=neg_worst_segment_score, param_grid=search_space, n_jobs=-1, cv=3)
scoring=neg_worst_segment_score, param_grid=search_space, n_jobs=1, cv=3)
try:

@@ -348,0 +349,0 @@ grid_searcher.fit(data_for_search, score_per_sample_for_search)

@@ -14,3 +14,3 @@ # ----------------------------------------------------------------------------

from copy import deepcopy
from typing import Callable, List, Optional, Tuple, Union
from typing import Callable, List

@@ -50,16 +50,9 @@ import numpy as np

def filter(self, dataframe: pd.DataFrame, label_col: Optional[pd.Series] = None) -> \
Union[Tuple[pd.DataFrame, pd.Series], pd.DataFrame]:
def filter(self, dataframe: pd.DataFrame) -> pd.DataFrame:
"""Run the filter on given dataframe. Return rows in data frame satisfying the filter properties."""
if label_col is not None:
dataframe['temp_label_col'] = label_col
for func in self.filter_functions:
dataframe = dataframe.loc[func(dataframe)]
return dataframe
if label_col is not None:
return dataframe.drop(columns=['temp_label_col']), dataframe['temp_label_col']
else:
return dataframe
class DeepchecksBaseFilter(DeepchecksFilter):

@@ -219,2 +212,3 @@ """Extend DeepchecksFilter class for feature range based filters.

"""
# pylint: disable=function-redefined,multiple-statements
column = dataset.data[column_name]

@@ -233,6 +227,6 @@ if column_name in dataset.numerical_features:

if end == percentile_values[-1]:
f = lambda df, a=start, b=end: (df[column_name] >= a) & (df[column_name] <= b)
def f(df, a=start, b=end): return (df[column_name] >= a) & (df[column_name] <= b)
label = f'[{format_number(start)} - {format_number(end)}]'
else:
f = lambda df, a=start, b=end: (df[column_name] >= a) & (df[column_name] < b)
def f(df, a=start, b=end): return (df[column_name] >= a) & (df[column_name] < b)
label = f'[{format_number(start)} - {format_number(end)})'

@@ -239,0 +233,0 @@

Metadata-Version: 2.1
Name: deepchecks
Version: 0.18.1
Version: 0.19.0
Summary: Package for validating your machine learning model and data

@@ -9,3 +9,3 @@ Home-page: https://github.com/deepchecks/deepchecks

License: UNKNOWN
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.18.1/deepchecks-0.18.1.tar.gz
Download-URL: https://github.com/deepchecks/deepchecks/releases/download/0.19.0/deepchecks-0.19.0.tar.gz
Project-URL: Documentation, https://docs.deepchecks.com

@@ -12,0 +12,0 @@ Project-URL: Bug Reports, https://github.com/deepchecks/deepchecks

@@ -18,7 +18,6 @@ <!--

![pkgVersion](https://img.shields.io/pypi/v/deepchecks)
[![Maintainability](https://api.codeclimate.com/v1/badges/970b11794144139975fa/maintainability)](https://codeclimate.com/github/deepchecks/deepchecks/maintainability)
[![Coverage
Status](https://coveralls.io/repos/github/deepchecks/deepchecks/badge.svg?branch=main)](https://coveralls.io/github/deepchecks/deepchecks?branch=main)
<!-- ALL-CONTRIBUTORS-BADGE:START - Do not remove or modify this section -->
[![All Contributors](https://img.shields.io/badge/all_contributors-43-orange.svg?style=flat-round)](#https://github.com/deepchecks/deepchecks/blob/main/CONTRIBUTING.rst)
[![All Contributors](https://img.shields.io/badge/all_contributors-44-orange.svg?style=flat-round)](#https://github.com/deepchecks/deepchecks/blob/main/CONTRIBUTING.rst)
<!-- ALL-CONTRIBUTORS-BADGE:END -->

@@ -369,2 +368,3 @@

<td align="center" valign="top" width="14.28%"><a href="https://www.kaggle.com/rayanaay"><img src="https://avatars.githubusercontent.com/u/55285736?v=4?s=100" width="100px;" alt="AIT ALI YAHIA Rayane"/><br /><sub><b>AIT ALI YAHIA Rayane</b></sub></a><br /><a href="#code-RayanAAY-ops" title="Code">💻</a> <a href="#ideas-RayanAAY-ops" title="Ideas, Planning, & Feedback">🤔</a></td>
<td align="center" valign="top" width="14.28%"><a href="https://github.com/chris-santiago"><img src="https://avatars.githubusercontent.com/u/40875112?v=4?s=100" width="100px;" alt="Chris Santiago"/><br /><sub><b>Chris Santiago</b></sub></a><br /><a href="#bug-chris-santiago" title="Bug reports">🐛</a> <a href="#code-chris-santiago" title="Code">💻</a></td>
</tr>

@@ -371,0 +371,0 @@ </tbody>

@@ -24,3 +24,4 @@ twine

catboost
catboost; python_version >= '3.7'
catboost<=1.2.2; python_version < '3.7'
lightgbm

@@ -37,3 +38,4 @@ xgboost<=1.7.5

numpy>=1.18.5
opencv-python>=4.1.2
opencv-python>=4.1.2; python_version > '3.6'
opencv-python<=4.6.0.66; python_version == '3.6'
pillow>=7.1.2; python_version < '3.8'

@@ -40,0 +42,0 @@ pillow>=10.0.1; python_version >= '3.8' # not directly required, pinned by Snyk to avoid a vulnerability

@@ -1,1 +0,1 @@

fasttext>=0.8.0
fasttext>=0.8.0, <0.9.3

@@ -6,3 +6,5 @@ seqeval>=1.0.0

umap-learn
transformers>=4.0.0,<4.37.0
transformers>=4.0.0
huggingface_hub
sentence_transformers>=3.0.0
tiktoken; python_version >= '3.8'

@@ -11,3 +11,3 @@ pandas>=1.1.5,<2.2.0

ipykernel>=5.3.0; python_version >= '3.8'
ipywidgets>=7.6.5,<8; python_version >= '3.8'
ipywidgets>=7.6.5; python_version >= '3.8'

@@ -14,0 +14,0 @@ # google colab requirements (python 3.7)

pytorch-ignite>=0.4.8
opencv-python>=4.5.5.62
albumentations>=1.1.0
albumentations>=1.1.0,<1.4.0
imgaug>=0.4.0

@@ -5,0 +5,0 @@ seaborn>=0.1.0

@@ -1,1 +0,1 @@

0.18.1
0.19.0