adaptor (0.2.3 → 0.2.4)
MANIFEST.in
include LICENSE
include README.md
PKG-INFO
Metadata-Version: 2.1
Name: adaptor
Version: 0.2.3
Version: 0.2.4
Summary: Adaptor: Objective-centric Adaptation Framework for Language Models.
@@ -9,3 +9,2 @@ Home-page: https://github.com/gaussalgo/adaptor
License: MIT
Platform: UNKNOWN
Classifier: Intended Audience :: Developers
@@ -16,7 +15,17 @@ Classifier: Intended Audience :: Education
Classifier: Programming Language :: Python :: 3.8
Requires-Python: >=3.7
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=1.7
Requires-Dist: transformers<=4.30.2
Requires-Dist: sentencepiece
Requires-Dist: accelerate>=0.20.1
Provides-Extra: generative
Requires-Dist: sacrebleu; extra == "generative"
Requires-Dist: rouge-score; extra == "generative"
Requires-Dist: bert-score; extra == "generative"
Requires-Dist: fairseq; extra == "generative"
Requires-Dist: protobuf<=3.20.1; extra == "generative"
Provides-Extra: examples
License-File: LICENSE
Requires-Dist: comet-ml; extra == "examples"
@@ -288,3 +297,1 @@ # Adaptor: Objective-centric Adaptation library
If you have any other question(s), feel free to create an issue.
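Note on the metadata above: the release declares two optional extras, "generative" and "examples". With standard pip extras syntax, pip install "adaptor[generative]==0.2.4" pulls in the generative-evaluation stack (sacrebleu, rouge-score, bert-score, fairseq, pinned protobuf), while a plain pip install adaptor==0.2.4 installs only torch, transformers, sentencepiece and accelerate.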
adaptor.egg-info/SOURCES.txt
@@ -1,5 +1,4 @@
.gitignore
LICENSE
MANIFEST.in
README.md
dockerfile.in
setup.cfg
@@ -34,10 +33,2 @@ setup.py
adaptor/objectives/question_answering.py
adaptor/objectives/seq2seq.py
docs/user_flow.png
tests/adapter_test.py
tests/end2end_usecases_test.py
tests/evaluators_test.py
tests/lang_module_test.py
tests/objectives_test.py
tests/schedules_test.py
tests/utils.py
adaptor/objectives/seq2seq.py
@@ -161,4 +161,4 @@ import logging
list_of_model_specific_inputs = inspect.getfullargspec(selected_head_model.forward).args
model_specific_inputs = {k: v for k, v in inputs.items() if k in list_of_model_specific_inputs}
model_specific_inputs = {k: v for k, v in inputs.items()
                         if k in list_of_model_specific_inputs and k not in ("label", "labels")}
# including labels causes the loss to be computed twice - by the objective and by HF model's forward()
# but labels are also used to infer decoder_input_ids of some models, so we need to pass them
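The filtering change above is self-contained enough to demonstrate in isolation. A minimal standalone sketch of the same pattern, with a dummy model standing in for the selected head model (all names here are illustrative, not the library's API):

import inspect


def filter_model_inputs(forward_fn, inputs: dict) -> dict:
    # keep only the kwargs that forward() actually declares,
    # and drop labels so the loss is computed once, by the objective
    accepted_args = inspect.getfullargspec(forward_fn).args
    return {k: v for k, v in inputs.items()
            if k in accepted_args and k not in ("label", "labels")}


class DummyHeadModel:
    def forward(self, input_ids, attention_mask):
        return input_ids


batch = {"input_ids": [0, 1], "attention_mask": [1, 1], "labels": [0, 1], "unused": None}
print(filter_model_inputs(DummyHeadModel().forward, batch))
# -> {'input_ids': [0, 1], 'attention_mask': [1, 1]}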
adaptor/objectives/CLM.py
import abc
from typing import List, Union, Dict, Iterable, Optional
from typing import Union, Dict, Optional, List, Iterable, Iterator
@@ -33,3 +33,3 @@ import torch
features: List[Union[BatchEncoding, Dict[str, Iterable[Union[int, float]]]]],
return_tensors=None) -> BatchEncoding:
return_tensors=None) -> Iterable[Dict[str, torch.Tensor]]:
"""
@@ -101,4 +101,40 @@ Custom DataCollator allowing to apply CausalLM also on models with fully-attended encoder.
self.collator = DataCollatorForCausalLM(self.tokenizer, self.compatible_head_model)
self.collator = DataCollatorForSeq2Seq(self.tokenizer, self.compatible_head_model, pad_to_multiple_of=8)

def _get_seq2seq_collated_iterator(self,
                                   source_texts: Iterable[str],
                                   target_texts: Iterable[str]) -> Iterator[BatchEncoding]:
    """
    Creates an iterator over batches of encoded `source_texts` as inputs and `target_texts` as labels.
    Override this to implement custom mapping, or unsupervised seq2seq objective. See e.g. BackTranslation.
    :param source_texts: Input texts.
    :param target_texts: Output (expected) texts to translate input texts into.
    :return: Iterator of encoded batches.
    """
    features_batch = []
    asserted_equal = False  # speedup: avoid repeated assertions of string equality
    self.tokenizer.src_lang = self.source_lang_id
    self.tokenizer.tgt_lang = self.target_lang_id

    for source_text, target_text in zip(source_texts, target_texts):
        if not asserted_equal:
            assert source_text == target_text, ("CLM objective expects both texts to be the same. "
                                                "If you override this objective, it's possible that you should "
                                                "rather override SequentialMixin supporting different src and tgt.")
            asserted_equal = True

        sample_features = self.tokenizer(source_text, truncation=True)
        sample_targets = self.tokenizer(target_text, truncation=True)

        features_batch.append({"input_ids": sample_features.input_ids,
                               "attention_mask": sample_features.attention_mask,
                               "labels": sample_targets.input_ids})
        if len(features_batch) == self.batch_size:
            yield self.collator(features_batch)
            features_batch = []

    if features_batch:
        # yield the last nonempty residual batch
        yield self.collator(features_batch)

def _compute_loss(self,
                  logit_outputs: torch.FloatTensor,
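The new iterator above combines per-sample tokenization with fixed-size batching and a residual flush. The batching skeleton on its own, stripped of tokenization and collation (a sketch; the "collation" here just yields the raw list):

from typing import Dict, Iterable, Iterator, List


def collated_batches(samples: Iterable[Dict], batch_size: int) -> Iterator[List[Dict]]:
    features_batch = []
    for sample in samples:
        features_batch.append(sample)
        if len(features_batch) == batch_size:
            yield features_batch  # a full batch; the objective would run its collator here
            features_batch = []
    if features_batch:
        yield features_batch  # last nonempty residual batch


print(list(collated_batches(({"id": i} for i in range(5)), batch_size=2)))
# -> [[{'id': 0}, {'id': 1}], [{'id': 2}, {'id': 3}], [{'id': 4}]]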
adaptor/objectives/objective_base.py
@@ -436,3 +436,6 @@ import abc
Default inputs iterator for unsupervised objectives. Returns input texts as both inputs and labels.
Not meant to be overridden when implementing a custom data set. Instead choose to inherit either
Not meant to be overridden when implementing a custom data set. Instead inherit
from SupervisedObjective, or UnsupervisedObjective (or their ancestors),
or override _get_inputs_iterator().
:param split: Data split to iterate over
:return: a pair of identical [inputs_iterator, inputs_iterator]
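To make the inheritance advice concrete, here is a schematic subclass overriding _get_inputs_iterator(); the base class and the data source are dummies standing in for the library's Objective hierarchy, only the overridden method name comes from the docstring above:

from typing import Dict, Iterator


class DummyObjectiveBase:
    # stand-in for the library's objective base class
    def _get_inputs_iterator(self, split: str) -> Iterator[Dict[str, str]]:
        raise NotImplementedError


class MyCustomObjective(DummyObjectiveBase):
    def _get_inputs_iterator(self, split: str) -> Iterator[Dict[str, str]]:
        # replace the default pairing of identical inputs and labels with custom batches
        corpus = {"train": ["a b c", "d e f"], "eval": ["g h i"]}[split]
        for text in corpus:
            yield {"inputs": text, "labels": text}


print(list(MyCustomObjective()._get_inputs_iterator("train")))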
setup.py
@@ -12,3 +12,3 @@ #!/usr/bin/env python
name="adaptor",
version='0.2.3',
version='0.2.4',
description="Adaptor: Objective-centric Adaptation Framework for Language Models.",
@@ -27,4 +27,5 @@ long_description_content_type="text/markdown",
url="https://github.com/gaussalgo/adaptor",
python_requires=">=3.7",
python_requires=">=3.8",
license="MIT",
license_files=["LICENSE"],
packages=find_packages(include=["adaptor", "adaptor.*"]),
include_package_data=True,
.gitignore
.venv/
**/__pycache__/
adaptor.egg-info/
examples/adaptation_output_dir
tests/adaptation_output_dir
tests/translator_model
entity_detector_model/
bin/
lib/
share/
pyvenv.cfg
m39v1.tar
m39v1/
.idea
dist
dockerfile.in
# this is an example dockerfile that can be used for a quick setup of a GPU-supporting env
FROM pytorch/pytorch:latest
RUN apt update && apt install -y git
RUN git clone <this repo>.git && cd adaptor && python3.8 -m pip install -e .
# DDP training: chmod 777 /dev/shm && chmod 777 /dev/shm/*
tests/adapter_test.py
from adaptor.adapter import Adapter
from adaptor.lang_module import LangModule
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.objectives.backtranslation import BackTranslation, BackTranslator
from adaptor.objectives.classification import TokenClassification
from adaptor.objectives.denoising import DenoisingObjective
from adaptor.objectives.seq2seq import Sequence2Sequence
from adaptor.schedules import SequentialSchedule
from adaptor.utils import AdaptationArguments, StoppingStrategy
from utils import paths, test_base_models

training_arguments = AdaptationArguments(output_dir="adaptation_output_dir",
                                         stopping_strategy=StoppingStrategy.FIRST_OBJECTIVE_NUM_EPOCHS,
                                         do_train=True,
                                         do_eval=True,
                                         gradient_accumulation_steps=2,
                                         log_level="critical",
                                         logging_steps=1,
                                         num_train_epochs=3)


def run_adaptation(adapter: Adapter, trained_model_output_dir: str = "adaptation_output_dir/finished"):
    adapter.train()
    adapter.save_model(trained_model_output_dir)


def test_ner_adaptation():
    lang_module = LangModule(test_base_models["token_classification"])

    objectives = [MaskedLanguageModeling(lang_module,
                                         texts_or_path=paths["texts"]["unsup"],
                                         batch_size=1),
                  TokenClassification(lang_module,
                                      texts_or_path=paths["texts"]["ner"],
                                      labels_or_path=paths["labels"]["ner"],
                                      batch_size=1)]
    schedule = SequentialSchedule(objectives, training_arguments)

    adapter = Adapter(lang_module, schedule, args=training_arguments)
    run_adaptation(adapter)


def test_mt_adaptation():
    lang_module = LangModule(test_base_models["translation_mono"])

    objectives = [DenoisingObjective(lang_module,
                                     texts_or_path=paths["texts"]["unsup"],
                                     batch_size=1),
                  Sequence2Sequence(lang_module,
                                    texts_or_path=paths["texts"]["translation"],
                                    labels_or_path=paths["labels"]["translation"],
                                    batch_size=1)]
    schedule = SequentialSchedule(objectives, training_arguments)

    adapter = Adapter(lang_module, schedule, args=training_arguments)
    run_adaptation(adapter)


def test_mt_adaptation_bt():
    lang_module = LangModule(test_base_models["translation_mono"])
    translator = BackTranslator("Helsinki-NLP/opus-mt-cs-en")

    objectives = [BackTranslation(lang_module,
                                  back_translator=translator,
                                  texts_or_path=paths["texts"]["unsup"],
                                  batch_size=4),
                  Sequence2Sequence(lang_module,
                                    texts_or_path=paths["texts"]["translation"],
                                    labels_or_path=paths["labels"]["translation"],
                                    batch_size=1)]
    schedule = SequentialSchedule(objectives, training_arguments)

    adapter = Adapter(lang_module, schedule, args=training_arguments)
    run_adaptation(adapter)
tests/end2end_usecases_test.py
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForTokenClassification, \
    AutoModelForSequenceClassification

from adaptor.adapter import Adapter
from adaptor.evaluators.generative import BLEU
from adaptor.evaluators.token_classification import MeanFScore, AverageAccuracy
from adaptor.lang_module import LangModule
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.objectives.backtranslation import BackTranslation, BackTranslator
from adaptor.objectives.classification import SequenceClassification, TokenClassification
from adaptor.objectives.seq2seq import Sequence2Sequence
from adaptor.schedules import ParallelSchedule, SequentialSchedule
from utils import training_arguments, paths, test_base_models

unsup_target_domain_texts = paths["texts"]["unsup"]
sup_target_domain_texts = paths["texts"]["ner"]
sup_target_domain_labels = paths["labels"]["ner"]

ner_model_out_dir = "entity_detector_model"


def test_adaptation_ner():
    # 1. pick the models - randomly pre-initialize the appropriate heads
    lang_module = LangModule(test_base_models["token_classification"])

    # 2. pick objectives
    # Objectives take either List[str] for in-memory iteration, or a source file path for streamed iteration
    objectives = [MaskedLanguageModeling(lang_module,
                                         batch_size=1,
                                         texts_or_path=paths["texts"]["unsup"]),
                  TokenClassification(lang_module,
                                      batch_size=1,
                                      texts_or_path=paths["texts"]["ner"],
                                      labels_or_path=paths["labels"]["ner"])]

    # 4. pick a schedule of the selected objectives
    schedule = SequentialSchedule(objectives, training_arguments)

    # 5. Run the training using Adapter, similarly to running HF.Trainer, only adding `schedule`
    adapter = Adapter(lang_module, schedule, training_arguments)
    adapter.train()

    # 6. save the trained lang_module (with all heads)
    adapter.save_model(ner_model_out_dir)

    # 7. reload and use it like any other Hugging Face model
    ner_model = AutoModelForTokenClassification.from_pretrained("%s/TokenClassification" % ner_model_out_dir)
    tokenizer = AutoTokenizer.from_pretrained("%s/TokenClassification" % ner_model_out_dir)

    inputs = tokenizer("Is there any Abraham Lincoln here?", return_tensors="pt")
    outputs = ner_model(**inputs)
    ner_tags = [ner_model.config.id2label[label_id.item()] for label_id in outputs.logits[0].argmax(-1)]

    assert ner_tags


def test_adaptation_translation():
    # 1. pick the models - randomly pre-initialize the appropriate heads
    lang_module = LangModule(test_base_models["translation_mono"])

    # (optional) pick train and validation evaluators for the objectives
    seq2seq_evaluators = [BLEU(use_generate=True, decides_convergence=True)]

    # 2. pick objectives
    objectives = [BackTranslation(lang_module,
                                  back_translator=BackTranslator("Helsinki-NLP/opus-mt-cs-en"),
                                  batch_size=1,
                                  texts_or_path=paths["texts"]["unsup"]),
                  Sequence2Sequence(lang_module, batch_size=1,
                                    texts_or_path=paths["texts"]["translation"],
                                    val_evaluators=seq2seq_evaluators,
                                    labels_or_path=paths["labels"]["translation"])]

    # 3. pick a schedule of the selected objectives
    # this one will shuffle the batches of both objectives
    schedule = ParallelSchedule(objectives, training_arguments)

    # 4. train using Adapter
    adapter = Adapter(lang_module, schedule, training_arguments)
    adapter.train()

    # 5. save the trained (multi-headed) lang_module
    adapter.save_model("translator_model")

    # 6. reload and use it like any other Hugging Face model
    translator_model = AutoModelForSeq2SeqLM.from_pretrained("translator_model/Sequence2Sequence")
    tokenizer = AutoTokenizer.from_pretrained("translator_model/Sequence2Sequence")
    tokenizer.src_lang, tokenizer.tgt_lang = "en", "cs"

    inputs = tokenizer("A piece of text to translate.", return_tensors="pt")
    output_ids = translator_model.generate(**inputs)
    output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

    print(output_text)


def test_evaluation_ner():
    # first, create a model to evaluate:
    test_adaptation_ner()
    # reload LangModule from this directory
    lang_module = LangModule("%s/TokenClassification" % ner_model_out_dir)

    evaluators = [MeanFScore(), AverageAccuracy()]
    # evaluate the result again through the Objective, that takes care of labels alignment
    eval_ner_objective = TokenClassification(lang_module,
                                             batch_size=1,
                                             texts_or_path=[],
                                             labels_or_path=[],
                                             val_texts_or_path=paths["texts"]["ner"],
                                             val_labels_or_path=paths["labels"]["ner"],
                                             val_evaluators=evaluators)
    evaluation = eval_ner_objective.per_objective_log("eval")

    for evaluator in evaluators:
        assert "eval_%s_%s" % (eval_ner_objective, evaluator) in evaluation


def test_adaptation_classification():
    lang_module = LangModule(test_base_models["sequence_classification"])

    mlm = MaskedLanguageModeling(lang_module=lang_module,
                                 texts_or_path=paths["texts"]["unsup"],
                                 batch_size=1)
    classification = SequenceClassification(lang_module=lang_module,
                                            texts_or_path=paths["texts"]["classification"],
                                            labels_or_path=paths["labels"]["classification"],
                                            batch_size=1)
    parallel_schedule = ParallelSchedule(objectives=[mlm, classification],
                                         args=training_arguments)

    # 4. train using Adapter
    adapter = Adapter(lang_module=lang_module,
                      schedule=parallel_schedule,
                      args=training_arguments)
    adapter.train()

    # 5. save the trained (multi-headed) lang_module
    adapter.save_model("output_models")

    # 6. reload and use it like any other Hugging Face model
    classifier = AutoModelForSequenceClassification.from_pretrained("output_models/SequenceClassification")
    tokenizer = AutoTokenizer.from_pretrained("output_models/SequenceClassification")

    inputs = tokenizer("A piece of text to translate.", return_tensors="pt")
    output = classifier(**inputs)
    output_label_id = output.logits.argmax(-1)[0].item()

    print("Prediction: %s" % classifier.config.id2label[output_label_id])
tests/evaluators_test.py
from adaptor.evaluators.generative import GenerativeEvaluator
from adaptor.evaluators.question_answering import ExtractiveQAEvaluator
from adaptor.evaluators.sequence_classification import SeqClassificationEvaluator
from adaptor.evaluators.token_classification import TokenClassificationEvaluator
from adaptor.lang_module import LangModule
from adaptor.objectives.objective_base import Objective
from adaptor.objectives.seq2seq import Sequence2Sequence
from utils import paths, test_base_models


def assert_evaluator_logs(lang_module: LangModule, objective: Objective, split: str) -> None:
    # dataset iteration test
    dataset_sample = next(iter(objective.get_dataset(split, objective_i=0, device="cpu")))

    # providing labels makes HF lang_module compute its own loss, which is redundant here: the Objective computes it
    outputs = lang_module(**dataset_sample)

    # request the objective for its loss
    loss = objective.compute_loss(outputs, dataset_sample["labels"], dataset_sample, split)
    assert loss.item()

    log = objective.per_objective_log(split)
    # assert that the objective's id can be found in each key of the logs
    assert all(str(objective) in k for k in log.keys())
    for split_evaluator in objective.evaluators[split]:
        # assert that each evaluator of the given split was logged and has a value of the expected type
        assert any(str(split_evaluator) in k and isinstance(v, float) for k, v in log.items())


gen_lang_module = LangModule(test_base_models["translation_mono"])
gen_lang_module_multi = LangModule(test_base_models["translation_multi"]["model"])


def assert_gen_evaluator_logs(evaluator: GenerativeEvaluator, split: str) -> None:
    gen_objective = Sequence2Sequence(gen_lang_module,
                                      texts_or_path=paths["texts"]["translation"],
                                      labels_or_path=paths["labels"]["translation"],
                                      batch_size=1,
                                      train_evaluators=[evaluator],
                                      val_evaluators=[evaluator])
    assert_evaluator_logs(gen_lang_module, gen_objective, split)


def assert_gen_evaluator_logs_mbart(evaluator: GenerativeEvaluator, split: str) -> None:
    gen_objective = Sequence2Sequence(gen_lang_module_multi,
                                      texts_or_path=paths["texts"]["translation"],
                                      labels_or_path=paths["labels"]["translation"],
                                      batch_size=1,
                                      train_evaluators=[evaluator],
                                      val_evaluators=[evaluator],
                                      source_lang_id=test_base_models["translation_multi"]["test_src_lang"],
                                      target_lang_id=test_base_models["translation_multi"]["test_tgt_lang"])
    assert_evaluator_logs(gen_lang_module_multi, gen_objective, split)


def assert_ner_evaluator_logs(evaluator: TokenClassificationEvaluator, split: str) -> None:
    from adaptor.objectives.classification import TokenClassification

    lang_module = LangModule(test_base_models["token_classification"])
    gen_objective = TokenClassification(lang_module,
                                        texts_or_path=paths["texts"]["ner"],
                                        labels_or_path=paths["labels"]["ner"],
                                        batch_size=1,
                                        train_evaluators=[evaluator],
                                        val_evaluators=[evaluator])
    assert_evaluator_logs(lang_module, gen_objective, split)


def assert_classification_evaluator_logs(evaluator: SeqClassificationEvaluator, split: str) -> None:
    from adaptor.objectives.classification import SequenceClassification

    lang_module = LangModule(test_base_models["sequence_classification"])
    gen_objective = SequenceClassification(lang_module,
                                           texts_or_path=paths["texts"]["classification"],
                                           labels_or_path=paths["labels"]["classification"],
                                           batch_size=1,
                                           train_evaluators=[evaluator],
                                           val_evaluators=[evaluator])
    assert_evaluator_logs(lang_module, gen_objective, split)


def assert_qa_evaluator_logs(evaluator: ExtractiveQAEvaluator, split: str) -> None:
    from adaptor.objectives.question_answering import ExtractiveQA

    lang_module = LangModule(test_base_models["extractive_QA"])
    qa_objective = ExtractiveQA(lang_module,
                                texts_or_path=paths["texts"]["QA"],
                                text_pair_or_path=paths["text_pair"]["QA"],
                                labels_or_path=paths["labels"]["QA"],
                                batch_size=2,
                                train_evaluators=[evaluator],
                                val_evaluators=[evaluator])
    assert_evaluator_logs(lang_module, qa_objective, split)


def test_bleu():
    from adaptor.evaluators.generative import BLEU
    assert_gen_evaluator_logs(BLEU(use_generate=True, decides_convergence=True), "train")


def test_bleu_mbart():
    from adaptor.evaluators.generative import BLEU
    assert_gen_evaluator_logs_mbart(BLEU(use_generate=True, decides_convergence=True), "train")


def test_rouge():
    from adaptor.evaluators.generative import ROUGE
    assert_gen_evaluator_logs(ROUGE(use_generate=False, decides_convergence=True), "train")


def test_bertscore():
    from adaptor.evaluators.generative import BERTScore
    assert_gen_evaluator_logs(BERTScore(use_generate=False, decides_convergence=True), "train")


def test_meteor():
    from adaptor.evaluators.generative import METEOR
    assert_gen_evaluator_logs(METEOR(decides_convergence=True), "train")


def test_prism():
    """
    PRISM downloads a relatively big model, so we omit it by default.
    """
    # from adaptor.evaluators.generative import PRISM
    # assert_gen_evaluator_logs(PRISM(use_cuda=False, language="en", decides_convergence=True), "train")


def test_divergence():
    """
    Default JS_Divergence uses PRISM - note that this test would download the PRISM model.
    """
    # from adaptor.evaluators.generative import JS_Divergence
    # assert_gen_evaluator_logs(JS_Divergence(decides_convergence=True), "train")


def test_token_fscore():
    from adaptor.evaluators.token_classification import MeanFScore
    assert_ner_evaluator_logs(MeanFScore(decides_convergence=True), "train")


def test_sequence_accuracy():
    from adaptor.evaluators.sequence_classification import SequenceAccuracy
    assert_classification_evaluator_logs(SequenceAccuracy(decides_convergence=False), "train")


def test_QA_exact_match():
    from adaptor.evaluators.question_answering import ExactMatch
    assert_qa_evaluator_logs(ExactMatch(), "train")


def test_QA_fscore():
    from adaptor.evaluators.question_answering import F1ScoreForQA
    assert_qa_evaluator_logs(F1ScoreForQA(), "train")


def test_QA_BLEU():
    from adaptor.evaluators.question_answering import BLEUForQA
    assert_qa_evaluator_logs(BLEUForQA(), "train")
tests/lang_module_test.py
from adaptor.lang_module import LangModule
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.objectives.classification import TokenClassification
from utils import paths


def test_lang_module():
    lang_module = LangModule("bert-base-multilingual-cased")
    assert lang_module


def test_register_head():
    lang_module = LangModule("bert-base-multilingual-cased")
    objective = TokenClassification(lang_module,
                                    texts_or_path=paths["texts"]["ner"],
                                    labels_or_path=paths["labels"]["ner"],
                                    batch_size=4)
    assert objective.compatible_head_model


def test_merge_objectives():
    import torch

    lang_module = LangModule("bert-base-multilingual-cased")

    objective_base = TokenClassification(lang_module,
                                         texts_or_path=paths["texts"]["ner"],
                                         labels_or_path=paths["labels"]["ner"],
                                         batch_size=4)
    objective_new = MaskedLanguageModeling(lang_module,
                                           texts_or_path=paths["texts"]["unsup"],
                                           batch_size=4)

    # check that merge-able modules now refer to the same physical address
    for i, (new_param_key, orig_model_param) in enumerate(objective_base.compatible_head_model.named_parameters()):
        if new_param_key in dict(objective_new.compatible_head_model.named_parameters()):
            new_model_param = objective_new.compatible_head_model.get_parameter(new_param_key)
            if orig_model_param.shape == new_model_param.shape and \
                    torch.all(orig_model_param == new_model_param):
                assert id(new_model_param) == id(orig_model_param)
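The identity check in test_merge_objectives works because PyTorch modules that share a submodule also share its parameter tensors as the same Python objects. A self-contained illustration of that property with toy heads (not the library's models):

import torch

shared_layer = torch.nn.Linear(4, 4)
head_a = torch.nn.Sequential(shared_layer, torch.nn.Linear(4, 2))
head_b = torch.nn.Sequential(shared_layer, torch.nn.Linear(4, 3))

params_a = dict(head_a.named_parameters())
params_b = dict(head_b.named_parameters())

# the shared layer's weight is literally the same tensor object in both heads
assert id(params_a["0.weight"]) == id(params_b["0.weight"])
# the unshared layers are distinct objects
assert id(params_a["1.weight"]) != id(params_b["1.weight"])
print("shared parameters verified")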
tests/objectives_test.py
from adaptor.lang_module import LangModule
from adaptor.objectives.CLM import CausalLanguageModeling
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.objectives.backtranslation import BackTranslation, BackTranslator
from adaptor.objectives.classification import TokenClassification
from adaptor.objectives.denoising import DenoisingObjective
from adaptor.objectives.objective_base import Objective
from adaptor.objectives.question_answering import ExtractiveQA
from adaptor.objectives.seq2seq import Sequence2Sequence
from utils import paths, test_base_models


def assert_module_objective_ok(lang_module: LangModule, objective: Objective, split: str = "train"):
    # dataset iteration test
    dataset_sample = next(iter(objective.get_dataset(split, objective_i=0, device="cpu")))

    # providing labels makes HF lang_module compute its own loss, which is redundant here: the Objective computes it
    outputs = lang_module(**dataset_sample)

    # loss computation test, possible label smoothing is performed by Adapter
    loss = objective.compute_loss(outputs, dataset_sample["labels"], dataset_sample, split)

    # check that the retrieved loss has a backward_fn
    loss.backward()

    assert True


def test_token_classification_objective():
    lang_module = LangModule(test_base_models["token_classification"])
    objective = TokenClassification(lang_module,
                                    texts_or_path=paths["texts"]["ner"],
                                    labels_or_path=paths["labels"]["ner"],
                                    batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_mlm_objective():
    lang_module = LangModule(test_base_models["token_classification"])
    objective = MaskedLanguageModeling(lang_module,
                                       texts_or_path=paths["texts"]["unsup"],
                                       batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_clm_unsup_objective_bert():
    lang_module = LangModule(test_base_models["token_classification"])
    objective = CausalLanguageModeling(lang_module,
                                       texts_or_path=paths["texts"]["unsup"],
                                       batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_clm_unsup_objective_marian():
    lang_module = LangModule(test_base_models["translation_mono"])
    objective = CausalLanguageModeling(lang_module,
                                       texts_or_path=paths["texts"]["unsup"],
                                       batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_denoising_objective():
    lang_module = LangModule(test_base_models["translation_mono"])
    objective = DenoisingObjective(lang_module, texts_or_path=paths["texts"]["unsup"], batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_backtranslation_objective():
    # we are adapting a cs->en translator with back-translation
    lang_module = LangModule(test_base_models["translation_mono"])
    translator = BackTranslator("Helsinki-NLP/opus-mt-en-cs")
    objective = BackTranslation(lang_module,
                                back_translator=translator,
                                texts_or_path=paths["texts"]["unsup"],
                                batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_backtranslation_objective_mbart():
    # we are adapting a cs->en translator with back-translation
    lang_module = LangModule(test_base_models["translation_multi"]["model"])
    translator = BackTranslator("Helsinki-NLP/opus-mt-en-cs")
    objective = BackTranslation(lang_module,
                                back_translator=translator,
                                texts_or_path=paths["texts"]["unsup"],
                                batch_size=4,
                                source_lang_id=test_base_models["translation_multi"]["test_src_lang"],
                                target_lang_id=test_base_models["translation_multi"]["test_tgt_lang"])
    assert_module_objective_ok(lang_module, objective)


def test_supervised_seq2seq_objective():
    lang_module = LangModule(test_base_models["translation_mono"])
    objective = Sequence2Sequence(lang_module,
                                  texts_or_path=paths["texts"]["translation"],
                                  labels_or_path=paths["labels"]["translation"],
                                  batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_supervised_seq2seq_objective_mbart():
    lang_module = LangModule(test_base_models["translation_multi"]["model"])
    objective = Sequence2Sequence(lang_module,
                                  texts_or_path=paths["texts"]["translation"],
                                  labels_or_path=paths["labels"]["translation"],
                                  batch_size=4,
                                  source_lang_id=test_base_models["translation_multi"]["test_src_lang"],
                                  target_lang_id=test_base_models["translation_multi"]["test_tgt_lang"])
    assert_module_objective_ok(lang_module, objective)


def test_supervised_QA_objective():
    lang_module = LangModule(test_base_models["extractive_QA"])
    objective = ExtractiveQA(lang_module,
                             texts_or_path=paths["texts"]["QA"],
                             text_pair_or_path=paths["text_pair"]["QA"],
                             labels_or_path=paths["labels"]["QA"],
                             batch_size=4)
    assert_module_objective_ok(lang_module, objective)
tests/schedules_test.py
from adaptor.lang_module import LangModule
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.objectives.classification import TokenClassification
from adaptor.objectives.denoising import DenoisingObjective
from adaptor.objectives.seq2seq import Sequence2Sequence
from adaptor.schedules import SequentialSchedule, Schedule, ParallelSchedule
from adaptor.utils import AdaptationArguments, StoppingStrategy
from utils import test_base_models, paths

unsup_target_domain_texts = "mock_data/domain_unsup.txt"
sup_target_domain_texts = "mock_data/supervised_texts.txt"
sup_target_domain_labels = "mock_data/supervised_texts_token_labels.txt"
sup_translation_texts_src = "mock_data/seq2seq_sources.txt"
sup_translation_texts_tgt = "mock_data/seq2seq_targets.txt"

args = AdaptationArguments(output_dir="adaptation_output_dir",
                           stopping_strategy=StoppingStrategy.FIRST_OBJECTIVE_CONVERGED,
                           do_train=True,
                           do_eval=True,
                           gradient_accumulation_steps=2,
                           log_level="critical",
                           logging_steps=1,
                           eval_steps=1,
                           num_train_epochs=10)


def assert_schedule(lang_module: LangModule, schedule: Schedule):
    for batch in iter(schedule.iterable_dataset("train")):
        logit_outputs = lang_module(**batch)
        loss_combined = schedule.compute_loss(logit_outputs, batch["labels"], batch)
        loss_combined.backward()

    # every objective has some key in its logs
    train_logs = schedule.objectives_log("train")
    assert all(any(str(obj) for log_key, _ in train_logs.items()) for obj in schedule.objectives["train"].keys())

    for batch in iter(schedule.iterable_dataset("eval")):
        logit_outputs = lang_module(**batch)
        loss_combined = schedule.compute_loss(logit_outputs, batch["labels"], batch)
        loss_combined.backward()

    eval_logs = schedule.objectives_log("eval")
    assert all(any(str(obj) for log_key, _ in eval_logs.items()) for obj in schedule.objectives["eval"].keys())

    assert True


def ner_da_schedule(schedule_type):
    lang_module = LangModule(test_base_models["token_classification"])

    lm_adaptation = MaskedLanguageModeling(lang_module,
                                           texts_or_path=unsup_target_domain_texts,
                                           val_texts_or_path=unsup_target_domain_texts,
                                           batch_size=1)
    token_classification = TokenClassification(lang_module,
                                               texts_or_path=sup_target_domain_texts,
                                               labels_or_path=sup_target_domain_labels,
                                               val_texts_or_path=sup_target_domain_texts,
                                               val_labels_or_path=sup_target_domain_labels,
                                               batch_size=1)

    assert_schedule(lang_module, schedule_type(objectives=[lm_adaptation, token_classification], args=args))


def test_ner_da_schedule_sequential():
    ner_da_schedule(SequentialSchedule)


def test_ner_da_schedule_strided():
    ner_da_schedule(ParallelSchedule)


def test_mt_da_schedule():
    lang_module = LangModule(test_base_models["translation_mono"])

    denoising_adaptation = DenoisingObjective(lang_module,
                                              texts_or_path=unsup_target_domain_texts,
                                              val_texts_or_path=unsup_target_domain_texts,
                                              batch_size=1)
    clm_finetuning = Sequence2Sequence(lang_module,
                                       texts_or_path=sup_translation_texts_src,
                                       labels_or_path=sup_translation_texts_tgt,
                                       val_texts_or_path=sup_translation_texts_src,
                                       val_labels_or_path=sup_translation_texts_tgt,
                                       batch_size=1)

    assert_schedule(lang_module, SequentialSchedule(objectives=[denoising_adaptation, clm_finetuning], args=args))


def test_multilang_multiobj_langs_do_match():
    # we check that BoS tokens in objectives (sharing the tokenizer) are resolved correctly, in both inputs and labels
    lang_module = LangModule(test_base_models["translation_multi"]["model"])

    objectives = [Sequence2Sequence(lang_module,
                                    texts_or_path=paths["texts"]["translation"],
                                    labels_or_path=paths["labels"]["translation"],
                                    batch_size=1,
                                    source_lang_id=test_base_models["translation_multi"]["test_src_lang"],
                                    target_lang_id=test_base_models["translation_multi"]["test_tgt_lang"]),
                  Sequence2Sequence(lang_module,
                                    texts_or_path=paths["labels"]["translation"],
                                    labels_or_path=paths["texts"]["translation"],
                                    batch_size=1,
                                    source_lang_id=test_base_models["translation_multi"]["test_tgt_lang"],
                                    target_lang_id=test_base_models["translation_multi"]["test_src_lang"])]
    schedule = ParallelSchedule(objectives, args=args)

    # we iterate over two batches, associated with objectives in the corresponding order
    for objective, objective_batch in zip(objectives * 2, schedule.iterable_dataset("train")):
        sample_input_lang = lang_module.tokenizer.decode([t_id for t_id in objective_batch["input_ids"][0]
                                                          if t_id in lang_module.tokenizer.lang_code_to_id.values()][0])
        sample_label_lang = lang_module.tokenizer.decode([t_id for t_id in objective_batch["labels"][0]
                                                          if t_id in lang_module.tokenizer.lang_code_to_id.values()][0])
        assert sample_input_lang == objective.source_lang_id
        assert sample_label_lang == objective.target_lang_id
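What the language-resolution assertions above check can be reproduced in isolation: a multilingual tokenizer encodes the configured source language as a dedicated language-code token in input_ids. A hedged sketch against the same tiny mBART checkpoint the tests use (assumes transformers <= 4.30.2, as pinned by this release, where the mBART tokenizer exposes lang_code_to_id):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-mbart")
tokenizer.src_lang = "en_XX"

input_ids = tokenizer("hello world")["input_ids"]
lang_token_ids = set(tokenizer.lang_code_to_id.values())

# exactly one language-code token should appear, matching the configured src_lang
sample_input_lang = [tokenizer.decode([t]) for t in input_ids if t in lang_token_ids]
print(sample_input_lang)  # expected: ['en_XX']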
tests/utils.py
from adaptor.utils import AdaptationArguments, StoppingStrategy

paths = {
    "texts": {
        "ner": "mock_data/supervised_texts.txt",
        "classification": "mock_data/supervised_texts.txt",
        "translation": "mock_data/seq2seq_sources.txt",
        "unsup": "mock_data/domain_unsup.txt",
        "QA": "mock_data/QA_questions.txt"
    },
    "labels": {
        "ner": "mock_data/supervised_texts_token_labels.txt",
        "classification": "mock_data/supervised_texts_sequence_labels.txt",
        "translation": "mock_data/seq2seq_targets.txt",
        "QA": "mock_data/QA_answers.txt"
    },
    "text_pair": {
        "QA": "mock_data/QA_contexts.txt"
    }
}

test_base_models = {
    "translation_mono": "Helsinki-NLP/opus-mt-en-cs",
    "translation_multi": {
        "model": "sshleifer/tiny-mbart",
        "test_src_lang": "en_XX",
        "test_tgt_lang": "cs_CZ"
    },
    "token_classification": "bert-base-cased",
    "sequence_classification": "bert-base-cased",
    "extractive_QA": "bert-base-cased",
    "MLM": "bert-base-cased",
    "MLM_student": "distilbert-base-cased"
}

training_arguments = AdaptationArguments(output_dir="adaptation_output_dir",
                                         stopping_strategy=StoppingStrategy.FIRST_OBJECTIVE_NUM_EPOCHS,
                                         do_train=True,
                                         do_eval=True,
                                         gradient_accumulation_steps=2,
                                         log_level="critical",
                                         logging_steps=1,
                                         num_train_epochs=2)