adaptor
Advanced tools
| Metadata-Version: 2.1 | ||
| Name: adaptor | ||
| Version: 0.2.0 | ||
| Version: 0.2.1 | ||
| Summary: Adaptor: Objective-centric Adaptation Framework for Language Models. | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/gaussalgo/adaptor |
| torch>=1.7 | ||
| transformers>=4.10.2 | ||
| transformers>=4.10.2,<=4.19.1 | ||
| sentencepiece | ||
@@ -4,0 +4,0 @@ |
@@ -21,3 +21,2 @@ .gitignore | ||
| adaptor/evaluators/generative.py | ||
| adaptor/evaluators/language_modeling.py | ||
| adaptor/evaluators/prism.py | ||
@@ -24,0 +23,0 @@ adaptor/evaluators/question_answering.py |
@@ -1,2 +0,2 @@ | ||
| from typing import Dict, Iterable, Optional, Union | ||
| from typing import Dict, Iterable, Optional, Union, Iterator | ||
@@ -115,3 +115,3 @@ import torch | ||
| def _get_inputs_iterator(self, split: str) -> Iterable[Union[BatchEncoding, Dict[str, torch.Tensor]]]: | ||
| def _get_inputs_iterator(self, split: str) -> Iterator[Union[BatchEncoding, Dict[str, torch.Tensor]]]: | ||
| """ | ||
@@ -118,0 +118,0 @@ Constructs input encodings for token classification using Transformers. |
@@ -63,3 +63,3 @@ from typing import Iterable, Dict, Union, Optional | ||
| """ | ||
| texts_iter = self._per_split_iterator_single(split) | ||
| texts_iter = self._per_split_iterator_sources(split) | ||
| collated_iter = self._mask_some_tokens(texts_iter) | ||
@@ -66,0 +66,0 @@ return collated_iter |
@@ -7,3 +7,3 @@ import abc | ||
| import torch | ||
| from tqdm import trange | ||
| from tqdm import trange, tqdm | ||
| from transformers import BatchEncoding, DataCollatorWithPadding | ||
@@ -38,3 +38,3 @@ | ||
| evaluations_history: Dict[str, Dict[Union[str, EvaluatorBase], List[float]]] | ||
| progressbar: Dict[str, trange] | ||
| progressbar: Dict[str, tqdm] | ||
| evaluators: Dict[str, List[EvaluatorBase]] | ||
@@ -53,2 +53,3 @@ | ||
| objective_module: Optional[torch.nn.Module] = None, | ||
| objective_args_for_head_config: Dict[str, Any] = {}, | ||
| objective_id: Optional[str] = "", | ||
@@ -73,2 +74,3 @@ loss_weight: Optional[float] = 1, | ||
| :param objective_module: If given, this module will be registered for this objective. | ||
| :param objective_args_for_head_config: Extra arguments that can be passed to .from_pretrained() on head init. | ||
| :param objective_id: Identifier of this objective, used in logging and checkpoints persistence. | ||
@@ -95,3 +97,3 @@ Necessary, if you train with multiple objectives of the same type, otherwise they might override each other. | ||
| share_other_objective_head, | ||
| {}, | ||
| objective_args_for_head_config, | ||
| objective_module) | ||
@@ -355,12 +357,40 @@ self.epoch = 0 | ||
| def _per_split_iterator_sources(self, split: str) -> Iterable[str]: | ||
| """ | ||
| An iterator over source texts. | ||
| :param split: split to iterate data over | ||
| :return: Iterable of input texts. | ||
| """ | ||
| if split == "train": | ||
| if self.texts is not None: | ||
| sources_iter = iter(self.texts) | ||
| else: | ||
| sources_iter = AdaptationDataset.iter_text_file_per_line(self.texts_path) | ||
| elif split == "eval": | ||
| if self.val_texts is not None: | ||
| sources_iter = iter(self.val_texts) | ||
| elif self.val_texts_path is not None: | ||
| sources_iter = AdaptationDataset.iter_text_file_per_line(self.val_texts_path) | ||
| else: | ||
| raise ValueError("Objective %s did not get any validation texts :( " | ||
| "Hint: pass `AdaptationArgs(do_eval=False)` to avoid evaluation, " | ||
| "or set Objective(val_texts) param." % self) | ||
| else: | ||
| raise ValueError("Unrecognized split: %s" % split) | ||
| return sources_iter | ||
| @abc.abstractmethod | ||
| def _per_split_iterators(self, split: str) -> Union[Iterable[str], Tuple[Iterable[str], Iterable[str]]]: | ||
| def _per_split_iterators(self, split: str) -> Union[Tuple[Iterable[str], ], | ||
| Tuple[Iterable[str], Iterable[str]], | ||
| Tuple[Iterable[str], Iterable[str], Iterable[str]]]: | ||
| """ | ||
| Implementations of shared (un/)supervised iterations in (Un/)SupervisedObjective. | ||
| Not meant to be overridden when implementing custom data set. Instead choose to inherit either | ||
| from SupervisedObjective, or UnsupervisedObjective (or their ancestors). | ||
| Not meant to be overridden when implementing custom data set. | ||
| Choose to inherit either from SupervisedObjective, or UnsupervisedObjective (or their ancestors), | ||
| or override _get_inputs_iterator() instead. | ||
| :param split: Data split to iterate over | ||
| :return: A pair of [inputs_iterator, labels_iterator] | ||
| :return: A pair of [inputs_iterator, [+input_pairs_iterator,] [+labels_iterator]] | ||
| """ | ||
@@ -409,27 +439,2 @@ pass | ||
| def _per_split_iterator_single(self, split: str) -> Iterable[str]: | ||
| """ | ||
| An iterator over unsupervised texts. | ||
| :param split: split to iterate data over | ||
| :return: Iterable of input texts. | ||
| """ | ||
| if split == "train": | ||
| if self.texts is not None: | ||
| sources_iter = iter(self.texts) | ||
| else: | ||
| sources_iter = AdaptationDataset.iter_text_file_per_line(self.texts_path) | ||
| elif split == "eval": | ||
| if self.val_texts is not None: | ||
| sources_iter = iter(self.val_texts) | ||
| elif self.val_texts_path is not None: | ||
| sources_iter = AdaptationDataset.iter_text_file_per_line(self.val_texts_path) | ||
| else: | ||
| raise ValueError("Objective %s did not get any validation texts :( " | ||
| "Hint: pass `AdaptationArgs(do_eval=False)` to avoid evaluation, " | ||
| "or set Objective(val_texts) param." % self) | ||
| else: | ||
| raise ValueError("Unrecognized split: %s" % split) | ||
| return sources_iter | ||
| def _per_split_iterators(self, split: str) -> Tuple[Iterable[str], Iterable[str]]: | ||
@@ -442,6 +447,6 @@ """ | ||
| """ | ||
| return self._per_split_iterator_single(split), self._per_split_iterator_single(split) | ||
| return self._per_split_iterator_sources(split), self._per_split_iterator_sources(split) | ||
| class SupervisedObjective(UnsupervisedObjective, abc.ABC): | ||
| class SupervisedObjective(Objective, abc.ABC): | ||
| labels_path: Optional[str] = None | ||
@@ -519,3 +524,3 @@ labels: Optional[List[str]] = None | ||
| objective_args_for_head_config = {"num_labels": len(all_labels), | ||
| objective_args_for_head_config = {"num_labels": len(self.labels_map), | ||
| "label2id": self.labels_map, | ||
@@ -572,3 +577,3 @@ "id2label": {v: k for k, v in self.labels_map.items()}, | ||
| """ | ||
| sources_iter, _ = super(SupervisedObjective, self)._per_split_iterators(split) | ||
| sources_iter = self._per_split_iterator_sources(split) | ||
@@ -575,0 +580,0 @@ if split == "train": |
@@ -55,3 +55,3 @@ import abc | ||
| def _get_inputs_iterator(self, split: str) -> Iterable[Union[BatchEncoding, Dict[str, torch.Tensor]]]: | ||
| def _get_inputs_iterator(self, split: str) -> Iterator[Union[BatchEncoding, Dict[str, torch.Tensor]]]: | ||
| """ | ||
@@ -58,0 +58,0 @@ Creates a default iterator over encodings with aligned input and output texts. |
+1
-1
| Metadata-Version: 2.1 | ||
| Name: adaptor | ||
| Version: 0.2.0 | ||
| Version: 0.2.1 | ||
| Summary: Adaptor: Objective-centric Adaptation Framework for Language Models. | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/gaussalgo/adaptor |
+2
-2
@@ -12,3 +12,3 @@ #!/usr/bin/env python | ||
| name="adaptor", | ||
| version='0.2.0', | ||
| version='0.2.1', | ||
| description="Adaptor: Objective-centric Adaptation Framework for Language Models.", | ||
@@ -34,3 +34,3 @@ long_description_content_type="text/markdown", | ||
| "torch>=1.7", | ||
| "transformers>=4.10.2", # intentionally open dependency, but tested with transformers==4.18.0 | ||
| "transformers>=4.10.2,<=4.19.1", # upper-closed on 4.19.1 for now, due to minor bug in eval loss logging | ||
| "sentencepiece", | ||
@@ -37,0 +37,0 @@ ], |
@@ -1,2 +0,3 @@ | ||
| from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForTokenClassification | ||
| from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForTokenClassification, \ | ||
| AutoModelForSequenceClassification | ||
@@ -9,3 +10,3 @@ from adaptor.adapter import Adapter | ||
| from adaptor.objectives.backtranslation import BackTranslation, BackTranslator | ||
| from adaptor.objectives.classification import TokenClassification | ||
| from adaptor.objectives.classification import SequenceClassification, TokenClassification | ||
| from adaptor.objectives.seq2seq import Sequence2Sequence | ||
@@ -115,1 +116,35 @@ from adaptor.schedules import ParallelSchedule, SequentialSchedule | ||
| assert "eval_%s_%s" % (eval_ner_objective, evaluator) in evaluation | ||
| def test_adaptation_classification(): | ||
| lang_module = LangModule(test_base_models["sequence_classification"]) | ||
| mlm = MaskedLanguageModeling(lang_module=lang_module, | ||
| texts_or_path=paths["texts"]["unsup"], | ||
| batch_size=1) | ||
| classification = SequenceClassification(lang_module=lang_module, | ||
| texts_or_path=paths["texts"]["classification"], | ||
| labels_or_path=paths["labels"]["classification"], | ||
| batch_size=1) | ||
| parallel_schedule = ParallelSchedule(objectives=[mlm, classification], | ||
| args=training_arguments) | ||
| # 4. train using Adapter | ||
| adapter = Adapter(lang_module=lang_module, | ||
| schedule=parallel_schedule, | ||
| args=training_arguments) | ||
| adapter.train() | ||
| # 5. save the trained (multi-headed) lang_module | ||
| adapter.save_model("output_models") | ||
| # 6. reload and use it like any other Hugging Face model | ||
| classifier = AutoModelForSequenceClassification.from_pretrained("output_models/SequenceClassification") | ||
| tokenizer = AutoTokenizer.from_pretrained("output_models/SequenceClassification") | ||
| inputs = tokenizer("A piece of text to translate.", return_tensors="pt") | ||
| output = classifier(**inputs) | ||
| output_label_id = output.logits.argmax(-1)[0].item() | ||
| print("Prediction: %s" % classifier.config.id2label[output_label_id]) |
+1
-1
@@ -31,3 +31,3 @@ from adaptor.utils import AdaptationArguments, StoppingStrategy | ||
| "sequence_classification": "bert-base-cased", | ||
| "extractive_QA": "Unbabel/xlm-roberta-comet-small", | ||
| "extractive_QA": "bert-base-cased", | ||
| "MLM": "bert-base-cased", | ||
@@ -34,0 +34,0 @@ "MLM_student": "distilbert-base-cased" |
| from typing import List | ||
| import torch | ||
| from transformers import PreTrainedTokenizer | ||
| from .evaluator_base import EvaluatorBase | ||
| from ..utils import Head, AdaptationDataset | ||
class Perplexity(EvaluatorBase):
    """Evaluator computing perplexity of a language-modeling head on a dataset.

    NOTE(review): this is currently a stub — ``__call__`` unconditionally raises
    ``NotImplementedError``; the actual perplexity computation is not implemented yet.
    """

    # Head types this evaluator may be attached to (masked LM, causal LM, seq2seq).
    compatible_heads: List[Head] = [Head.MLM, Head.CLM, Head.SEQ2SEQ]

    def __call__(self, model: torch.nn.Module, tokenizer: PreTrainedTokenizer, dataset: AdaptationDataset) -> float:
        """Return the model's perplexity over ``dataset``.

        :param model: Model (with an LM head) to evaluate.
        :param tokenizer: Tokenizer matching ``model``.
        :param dataset: Dataset providing the evaluation texts.
        :return: Perplexity value — not implemented yet.
        :raises NotImplementedError: always, until this evaluator is implemented.
        """
        raise NotImplementedError()
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
337073
0.47%3392
0.65%43
-2.27%