
adaptor - PyPI Package Compare versions

Comparing version 0.2.0 to 0.2.1 (+1 / -1)
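
At a glance, 0.2.1 appears to make these changes: the transformers dependency gains an upper bound at 4.19.1 (the inline comment cites a minor bug in eval loss logging); a SequenceClassification objective is imported and exercised by a new test; Objective.__init__ gains an objective_args_for_head_config parameter that is forwarded to head initialization; the per-split text iterator is renamed from _per_split_iterator_single to _per_split_iterator_sources and moved up so that SupervisedObjective can inherit from Objective directly; _per_split_iterators may now return one, two, or three iterators; progress bars are annotated as tqdm rather than trange; the extractive-QA test model switches to bert-base-cased; and the unimplemented Perplexity evaluator stub appears to be removed.
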
adaptor.egg-info/PKG-INFO
 Metadata-Version: 2.1
 Name: adaptor
-Version: 0.2.0
+Version: 0.2.1
 Summary: Adaptor: Objective-centric Adaptation Framework for Language Models.

@@ -5,0 +5,0 @@ Home-page: https://github.com/gaussalgo/adaptor

 torch>=1.7
-transformers>=4.10.2
+transformers>=4.10.2<=4.19.1
 sentencepiece

@@ -4,0 +4,0 @@

@@ -21,3 +21,2 @@ .gitignore

 adaptor/evaluators/generative.py
-adaptor/evaluators/language_modeling.py
 adaptor/evaluators/prism.py

@@ -24,0 +23,0 @@ adaptor/evaluators/question_answering.py

@@ -1,2 +0,2 @@

-from typing import Dict, Iterable, Optional, Union
+from typing import Dict, Iterable, Optional, Union, Iterator

@@ -115,3 +115,3 @@ import torch

-    def _get_inputs_iterator(self, split: str) -> Iterable[Union[BatchEncoding, Dict[str, torch.Tensor]]]:
+    def _get_inputs_iterator(self, split: str) -> Iterator[Union[BatchEncoding, Dict[str, torch.Tensor]]]:
         """

@@ -118,0 +118,0 @@ Constructs input encodings for token classification using Transformers.
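
The annotation change from Iterable to Iterator is the semantically tighter one: the method yields from a generator-like object that is consumed once and supports next(). A minimal sketch of the distinction (illustrative code, not from the package):

from typing import Iterable, Iterator

def as_iterable() -> Iterable[int]:
    return [1, 2, 3]        # a list: re-iterable, but has no __next__

def as_iterator() -> Iterator[int]:
    yield from [1, 2, 3]    # a generator: single-pass, supports next()

assert next(as_iterator()) == 1
# next(as_iterable()) would raise TypeError: a list only implements __iter__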

@@ -63,3 +63,3 @@ from typing import Iterable, Dict, Union, Optional

"""
texts_iter = self._per_split_iterator_single(split)
texts_iter = self._per_split_iterator_sources(split)
collated_iter = self._mask_some_tokens(texts_iter)

@@ -66,0 +66,0 @@ return collated_iter

@@ -7,3 +7,3 @@ import abc

 import torch
-from tqdm import trange
+from tqdm import trange, tqdm
 from transformers import BatchEncoding, DataCollatorWithPadding

@@ -38,3 +38,3 @@

     evaluations_history: Dict[str, Dict[Union[str, EvaluatorBase], List[float]]]
-    progressbar: Dict[str, trange]
+    progressbar: Dict[str, tqdm]
     evaluators: Dict[str, List[EvaluatorBase]]
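
The widened annotation is consistent with tqdm's API: trange(n) is shorthand for tqdm(range(n)) and returns a tqdm instance, so Dict[str, tqdm] covers both constructors. An illustrative snippet (not from the package):

from typing import Dict
from tqdm import tqdm, trange

bars: Dict[str, tqdm] = {
    "train": trange(100),     # trange returns a tqdm wrapping range(100)
    "eval": tqdm(total=50),   # a plain tqdm satisfies the same annotation
}
bars["train"].update(1)
bars["eval"].close()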

@@ -53,2 +53,3 @@

                  objective_module: Optional[torch.nn.Module] = None,
+                 objective_args_for_head_config: Dict[str, Any] = {},
                  objective_id: Optional[str] = "",

@@ -73,2 +74,3 @@ loss_weight: Optional[float] = 1,

         :param objective_module: If given, this module will be registered for this objective.
+        :param objective_args_for_head_config: Extra arguments that can be passed to .from_pretrained() on head init.
         :param objective_id: Identifier of this objective, used in logging and checkpoints persistence.

@@ -95,3 +97,3 @@ Necessary, if you train with multiple objectives of the same type, otherwise they might override each other.

                        share_other_objective_head,
-                       {},
+                       objective_args_for_head_config,
                        objective_module)
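
The previously hard-coded empty dict in this call is now replaced by the new parameter, so head-construction kwargs actually reach .from_pretrained(). One caveat worth noting: a mutable default argument ({}) is shared across calls in Python, which is harmless only as long as the dict is never mutated. A hedged usage sketch (paths and label values are invented for illustration; SequenceClassification and its other arguments are taken from the test later in this diff):

classification = SequenceClassification(
    lang_module=lang_module,
    texts_or_path="train_texts.txt",          # illustrative path
    labels_or_path="train_labels.txt",        # illustrative path
    batch_size=4,
    # forwarded to the head's .from_pretrained() on initialisation
    objective_args_for_head_config={"num_labels": 3,
                                    "id2label": {0: "neg", 1: "neu", 2: "pos"}},
)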

@@ -355,12 +357,40 @@ self.epoch = 0

+    def _per_split_iterator_sources(self, split: str) -> Iterable[str]:
+        """
+        An iterator over source texts.
+        :param split: split to iterate data over
+        :return: Iterable of input texts.
+        """
+        if split == "train":
+            if self.texts is not None:
+                sources_iter = iter(self.texts)
+            else:
+                sources_iter = AdaptationDataset.iter_text_file_per_line(self.texts_path)
+        elif split == "eval":
+            if self.val_texts is not None:
+                sources_iter = iter(self.val_texts)
+            elif self.val_texts_path is not None:
+                sources_iter = AdaptationDataset.iter_text_file_per_line(self.val_texts_path)
+            else:
+                raise ValueError("Objective %s did not get any validation texts :( "
+                                 "Hint: pass `AdaptationArgs(do_eval=False)` to avoid evaluation, "
+                                 "or set Objective(val_texts) param." % self)
+        else:
+            raise ValueError("Unrecognized split: %s" % split)
+
+        return sources_iter
+
     @abc.abstractmethod
-    def _per_split_iterators(self, split: str) -> Union[Iterable[str], Tuple[Iterable[str], Iterable[str]]]:
+    def _per_split_iterators(self, split: str) -> Union[Tuple[Iterable[str], ],
+                                                        Tuple[Iterable[str], Iterable[str]],
+                                                        Tuple[Iterable[str], Iterable[str], Iterable[str]]]:
         """
         Implementations of shared (un/)supervised iterations in (Un/)SupervisedObjective.
-        Not meant to be overriden when implementing custom data set. Instead choose to inherit either
-        from SupervisedObjective, or UnsupervisedObjective (or their ancestors).
+        Not meant to be overriden when implementing custom data set.
+        Choose to inherit either from SupervisedObjective, or UnsupervisedObjective (or their ancestors),
+        or override _get_inputs_iterator() instead.
         :param split: Data split to iterate over
-        :return: A pair of [inputs_iterator, labels_iterator]
+        :return: A pair of [inputs_iterator, [+input_pairs_iterator,] [+labels_iterator]]
         """

@@ -409,27 +439,2 @@ pass

-    def _per_split_iterator_single(self, split: str) -> Iterable[str]:
-        """
-        An iterator over unsupervised texts.
-        :param split: split to iterate data over
-        :return: Iterable of input texts.
-        """
-        if split == "train":
-            if self.texts is not None:
-                sources_iter = iter(self.texts)
-            else:
-                sources_iter = AdaptationDataset.iter_text_file_per_line(self.texts_path)
-        elif split == "eval":
-            if self.val_texts is not None:
-                sources_iter = iter(self.val_texts)
-            elif self.val_texts_path is not None:
-                sources_iter = AdaptationDataset.iter_text_file_per_line(self.val_texts_path)
-            else:
-                raise ValueError("Objective %s did not get any validation texts :( "
-                                 "Hint: pass `AdaptationArgs(do_eval=False)` to avoid evaluation, "
-                                 "or set Objective(val_texts) param." % self)
-        else:
-            raise ValueError("Unrecognized split: %s" % split)
-
-        return sources_iter
     def _per_split_iterators(self, split: str) -> Tuple[Iterable[str], Iterable[str]]:

@@ -442,6 +447,6 @@ """

"""
return self._per_split_iterator_single(split), self._per_split_iterator_single(split)
return self._per_split_iterator_sources(split), self._per_split_iterator_sources(split)
class SupervisedObjective(UnsupervisedObjective, abc.ABC):
class SupervisedObjective(Objective, abc.ABC):
labels_path: Optional[str] = None

@@ -519,3 +524,3 @@ labels: Optional[List[str]] = None

-        objective_args_for_head_config = {"num_labels": len(all_labels),
+        objective_args_for_head_config = {"num_labels": len(self.labels_map),
                                           "label2id": self.labels_map,

@@ -572,3 +577,3 @@ "id2label": {v: k for k, v in self.labels_map.items()},

"""
sources_iter, _ = super(SupervisedObjective, self)._per_split_iterators(split)
sources_iter = self._per_split_iterator_sources(split)

@@ -575,0 +580,0 @@ if split == "train":

@@ -55,3 +55,3 @@ import abc

-    def _get_inputs_iterator(self, split: str) -> Iterable[Union[BatchEncoding, Dict[str, torch.Tensor]]]:
+    def _get_inputs_iterator(self, split: str) -> Iterator[Union[BatchEncoding, Dict[str, torch.Tensor]]]:
         """

@@ -58,0 +58,0 @@ Creates a default iterator over encodings with aligned input and output texts.


@@ -12,3 +12,3 @@ #!/usr/bin/env python

name="adaptor",
version='0.2.0',
version='0.2.1',
description="Adaptor: Objective-centric Adaptation Framework for Language Models.",

@@ -34,3 +34,3 @@ long_description_content_type="text/markdown",

"torch>=1.7",
"transformers>=4.10.2", # intentionally open dependency, but tested with transformers==4.18.0
"transformers>=4.10.2<=4.19.1", # upper-closed on 4.19.1 for now, due to minor bug in eval loss logging
"sentencepiece",

@@ -37,0 +37,0 @@ ],
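
One editorial note on the new constraint: PEP 508/440 require multiple version specifiers to be comma-separated, so ">=4.10.2<=4.19.1" as written is not a valid specifier string and should be rejected by modern pip/setuptools. The intended range would normally be spelled as below (assuming the standard setuptools install_requires list this hunk appears to come from):

install_requires=[
    "torch>=1.7",
    "transformers>=4.10.2,<=4.19.1",  # comma between lower and upper bound
    "sentencepiece",
],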

@@ -1,2 +0,3 @@

-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForTokenClassification
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForTokenClassification, \
+    AutoModelForSequenceClassification

@@ -9,3 +10,3 @@ from adaptor.adapter import Adapter

 from adaptor.objectives.backtranslation import BackTranslation, BackTranslator
-from adaptor.objectives.classification import TokenClassification
+from adaptor.objectives.classification import SequenceClassification, TokenClassification
 from adaptor.objectives.seq2seq import Sequence2Sequence

@@ -115,1 +116,35 @@ from adaptor.schedules import ParallelSchedule, SequentialSchedule

assert "eval_%s_%s" % (eval_ner_objective, evaluator) in evaluation
def test_adaptation_classification():
lang_module = LangModule(test_base_models["sequence_classification"])
mlm = MaskedLanguageModeling(lang_module=lang_module,
texts_or_path=paths["texts"]["unsup"],
batch_size=1)
classification = SequenceClassification(lang_module=lang_module,
texts_or_path=paths["texts"]["classification"],
labels_or_path=paths["labels"]["classification"],
batch_size=1)
parallel_schedule = ParallelSchedule(objectives=[mlm, classification],
args=training_arguments)
# 4. train using Adapter
adapter = Adapter(lang_module=lang_module,
schedule=parallel_schedule,
args=training_arguments)
adapter.train()
# 5. save the trained (multi-headed) lang_module
adapter.save_model("output_models")
# 6. reload and use it like any other Hugging Face model
classifier = AutoModelForSequenceClassification.from_pretrained("output_models/SequenceClassification")
tokenizer = AutoTokenizer.from_pretrained("output_models/SequenceClassification")
inputs = tokenizer("A piece of text to translate.", return_tensors="pt")
output = classifier(**inputs)
output_label_id = output.logits.argmax(-1)[0].item()
print("Prediction: %s" % classifier.config.id2label[output_label_id])

@@ -31,3 +31,3 @@ from adaptor.utils import AdaptationArguments, StoppingStrategy

"sequence_classification": "bert-base-cased",
"extractive_QA": "Unbabel/xlm-roberta-comet-small",
"extractive_QA": "bert-base-cased",
"MLM": "bert-base-cased",

@@ -34,0 +34,0 @@ "MLM_student": "distilbert-base-cased"

-from typing import List
-
-import torch
-from transformers import PreTrainedTokenizer
-
-from .evaluator_base import EvaluatorBase
-from ..utils import Head, AdaptationDataset
-
-
-class Perplexity(EvaluatorBase):
-    compatible_heads: List[Head] = [Head.MLM, Head.CLM, Head.SEQ2SEQ]
-
-    def __call__(self, model: torch.nn.Module, tokenizer: PreTrainedTokenizer, dataset: AdaptationDataset) -> float:
-        raise NotImplementedError()
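
This Perplexity stub (apparently removed in 0.2.1, judging by the SOURCES.txt hunk above) declared compatible heads but never computed anything. For reference, corpus perplexity is conventionally the exponential of the mean per-token negative log-likelihood. A self-contained sketch using a generic Hugging Face causal LM follows; this is not the package's implementation, and the model name and helper are illustrative:

import math

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def perplexity(model, tokenizer, texts) -> float:
    """Corpus perplexity = exp(mean per-token negative log-likelihood)."""
    nll_sum, token_count = 0.0, 0
    model.eval()
    with torch.no_grad():
        for text in texts:
            enc = tokenizer(text, return_tensors="pt")
            # with labels=input_ids, HF causal LMs return the mean shifted-token NLL as .loss
            out = model(**enc, labels=enc["input_ids"])
            n_targets = enc["input_ids"].size(1) - 1  # loss averages over shifted targets
            nll_sum += out.loss.item() * n_targets
            token_count += n_targets
    return math.exp(nll_sum / token_count)


model = AutoModelForCausalLM.from_pretrained("distilgpt2")  # illustrative small model
tok = AutoTokenizer.from_pretrained("distilgpt2")
print(perplexity(model, tok, ["A short test sentence."]))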