You're Invited: Meet the Socket Team at RSAC and BSidesSF 2026, March 23–26. RSVP
Socket
Book a DemoSign in
Socket

adaptor

Package Overview
Dependencies
Maintainers
1
Versions
13
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

adaptor - pypi Package Compare versions

Comparing version
0.2.3
to
0.2.4
+2
MANIFEST.in
include LICENSE
include README.md
+13
-6
Metadata-Version: 2.1
Name: adaptor
Version: 0.2.3
Version: 0.2.4
Summary: Adaptor: Objective-centric Adaptation Framework for Language Models.

@@ -9,3 +9,2 @@ Home-page: https://github.com/gaussalgo/adaptor

License: MIT
Platform: UNKNOWN
Classifier: Intended Audience :: Developers

@@ -16,7 +15,17 @@ Classifier: Intended Audience :: Education

Classifier: Programming Language :: Python :: 3.8
Requires-Python: >=3.7
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=1.7
Requires-Dist: transformers<=4.30.2
Requires-Dist: sentencepiece
Requires-Dist: accelerate>=0.20.1
Provides-Extra: generative
Requires-Dist: sacrebleu; extra == "generative"
Requires-Dist: rouge-score; extra == "generative"
Requires-Dist: bert-score; extra == "generative"
Requires-Dist: fairseq; extra == "generative"
Requires-Dist: protobuf<=3.20.1; extra == "generative"
Provides-Extra: examples
License-File: LICENSE
Requires-Dist: comet-ml; extra == "examples"

@@ -288,3 +297,1 @@ # Adaptor: Objective-centric Adaptation library

If you have any other question(s), feel free to create an issue.

@@ -1,5 +0,4 @@

.gitignore
LICENSE
MANIFEST.in
README.md
dockerfile.in
setup.cfg

@@ -34,10 +33,2 @@ setup.py

adaptor/objectives/question_answering.py
adaptor/objectives/seq2seq.py
docs/user_flow.png
tests/adapter_test.py
tests/end2end_usecases_test.py
tests/evaluators_test.py
tests/lang_module_test.py
tests/objectives_test.py
tests/schedules_test.py
tests/utils.py
adaptor/objectives/seq2seq.py

@@ -161,4 +161,4 @@ import logging

list_of_model_specific_inputs = inspect.getfullargspec(selected_head_model.forward).args
model_specific_inputs = {k: v for k, v in inputs.items() if k in list_of_model_specific_inputs}
model_specific_inputs = {k: v for k, v in inputs.items()
if k in list_of_model_specific_inputs and k not in ("label", "labels")}
# including labels cause the loss to be computed twice - by objective + by HF models forward()

@@ -165,0 +165,0 @@ # but labels are also used to infer decoder_input_ids of some models, so we need to pass it

import abc
from typing import List, Union, Dict, Iterable, Optional
from typing import Union, Dict, Optional, List, Iterable, Iterator

@@ -33,3 +33,3 @@ import torch

features: List[Union[BatchEncoding, Dict[str, Iterable[Union[int, float]]]]],
return_tensors=None) -> BatchEncoding:
return_tensors=None) -> Iterable[Dict[str, torch.Tensor]]:
"""

@@ -101,4 +101,40 @@ Custom DataCollator allowing to apply CausalLM also on models with fully-attended encoder.

self.collator = DataCollatorForCausalLM(self.tokenizer, self.compatible_head_model)
self.collator = DataCollatorForSeq2Seq(self.tokenizer, self.compatible_head_model, pad_to_multiple_of=8)
def _get_seq2seq_collated_iterator(self,
                                   source_texts: Iterable[str],
                                   target_texts: Iterable[str]) -> Iterator[BatchEncoding]:
    """
    Creates an iterator over batches of encoded `source_texts` as inputs and `target_texts` as labels.
    Override this to implement custom mapping, or unsupervised seq2seq objective. See e.g. BackTranslation.

    :param source_texts: Input texts.
    :param target_texts: Output (expected) texts to translate input texts into.
    :return: Iterator of encoded batches.
    """
    features_batch = []
    asserted_equal = False  # speedup: avoid repeated assertions of string equality
    # NOTE(review): tokenizer lang ids are set once for the whole iteration; assumes
    # self.source_lang_id / self.target_lang_id do not change mid-stream — confirm with caller.
    self.tokenizer.src_lang = self.source_lang_id
    self.tokenizer.tgt_lang = self.target_lang_id
    for source_text, target_text in zip(source_texts, target_texts):
        if not asserted_equal:
            # Only the first pair is checked; subsequent pairs are trusted to match.
            assert source_text == target_text, ("CLM objective expects both texts to be the same. "
                                                "If you override this objective, it's possible that you should "
                                                "rather override SequentialMixin supporting different src and tgt.")
            asserted_equal = True
        sample_features = self.tokenizer(source_text, truncation=True)
        sample_targets = self.tokenizer(target_text, truncation=True)
        features_batch.append({"input_ids": sample_features.input_ids,
                               "attention_mask": sample_features.attention_mask,
                               "labels": sample_targets.input_ids})
        if len(features_batch) == self.batch_size:
            # Collator pads the accumulated samples into one batch tensor dict.
            yield self.collator(features_batch)
            features_batch = []
    if features_batch:
        # yield last nonempty residual batch
        yield self.collator(features_batch)
def _compute_loss(self,

@@ -105,0 +141,0 @@ logit_outputs: torch.FloatTensor,

@@ -436,3 +436,6 @@ import abc

Default inputs iterator for unsupervised objectives. Returns input texts as both inputs and labels.
Not meant to be overriden when implementing custom data set. Instead choose to inherit either
Not meant to be overriden when implementing custom data set. Instead inherit
from SupervisedObjective, or UnsupervisedObjective (or their ancestors),
or override _get_inputs_iterator() instead.
:param split: Data split to iterate over

@@ -439,0 +442,0 @@ :return: a pair of identical [inputs_iterator, inputs_iterator]

Metadata-Version: 2.1
Name: adaptor
Version: 0.2.3
Version: 0.2.4
Summary: Adaptor: Objective-centric Adaptation Framework for Language Models.

@@ -9,3 +9,2 @@ Home-page: https://github.com/gaussalgo/adaptor

License: MIT
Platform: UNKNOWN
Classifier: Intended Audience :: Developers

@@ -16,7 +15,17 @@ Classifier: Intended Audience :: Education

Classifier: Programming Language :: Python :: 3.8
Requires-Python: >=3.7
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=1.7
Requires-Dist: transformers<=4.30.2
Requires-Dist: sentencepiece
Requires-Dist: accelerate>=0.20.1
Provides-Extra: generative
Requires-Dist: sacrebleu; extra == "generative"
Requires-Dist: rouge-score; extra == "generative"
Requires-Dist: bert-score; extra == "generative"
Requires-Dist: fairseq; extra == "generative"
Requires-Dist: protobuf<=3.20.1; extra == "generative"
Provides-Extra: examples
License-File: LICENSE
Requires-Dist: comet-ml; extra == "examples"

@@ -288,3 +297,1 @@ # Adaptor: Objective-centric Adaptation library

If you have any other question(s), feel free to create an issue.

@@ -12,3 +12,3 @@ #!/usr/bin/env python

name="adaptor",
version='0.2.3',
version='0.2.4',
description="Adaptor: Objective-centric Adaptation Framework for Language Models.",

@@ -27,4 +27,5 @@ long_description_content_type="text/markdown",

url="https://github.com/gaussalgo/adaptor",
python_requires=">=3.7",
python_requires=">=3.8",
license="MIT",
license_files=["LICENSE"],
packages=find_packages(include=["adaptor", "adaptor.*"]),

@@ -31,0 +32,0 @@ include_package_data=True,

.venv/
**/__pycache__/
adaptor.egg-info/
examples/adaptation_output_dir
tests/adaptation_output_dir
tests/translator_model
entity_detector_model/
bin/
lib/
share/
pyvenv.cfg
m39v1.tar
m39v1/
.idea
dist
# this is an example dockerfile, that can be used for a quick resolution of gpu-supporting env
FROM pytorch/pytorch:latest
# Fix: use build-time RUN instead of CMD — Docker honors only the LAST CMD in a
# Dockerfile (as the container's start command), so the original apt/pip steps
# would never have executed during `docker build`.
RUN apt update && apt install -y git
# <this repo> is a placeholder for the repository URL.
RUN git clone <this repo>.git && cd adaptor && python3.8 -m pip install -e .
# DPP training: chmod 777 /dev/shm && chmod 777 /dev/shm/*

Sorry, the diff of this file is not supported yet

from adaptor.adapter import Adapter
from adaptor.lang_module import LangModule
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.objectives.backtranslation import BackTranslation, BackTranslator
from adaptor.objectives.classification import TokenClassification
from adaptor.objectives.denoising import DenoisingObjective
from adaptor.objectives.seq2seq import Sequence2Sequence
from adaptor.schedules import SequentialSchedule
from adaptor.utils import AdaptationArguments, StoppingStrategy
from utils import paths, test_base_models
# Shared adaptation/training configuration used by every test in this module.
# Tiny epoch count and per-step logging keep the end-to-end runs fast.
training_arguments = AdaptationArguments(output_dir="adaptation_output_dir",
                                         stopping_strategy=StoppingStrategy.FIRST_OBJECTIVE_NUM_EPOCHS,
                                         do_train=True,
                                         do_eval=True,
                                         gradient_accumulation_steps=2,
                                         log_level="critical",
                                         logging_steps=1,
                                         num_train_epochs=3)
def run_adaptation(adapter: Adapter, trained_model_output_dir: str = "adaptation_output_dir/finished"):
    """Train the given adapter and persist the resulting multi-head model.

    :param adapter: Fully configured Adapter to train.
    :param trained_model_output_dir: Directory the trained model is saved into.
    """
    adapter.train()
    adapter.save_model(trained_model_output_dir)
def test_ner_adaptation():
    """End-to-end NER adaptation: MLM + token classification trained sequentially."""
    lang_module = LangModule(test_base_models["token_classification"])
    objectives = [MaskedLanguageModeling(lang_module,
                                         texts_or_path=paths["texts"]["unsup"],
                                         batch_size=1),
                  TokenClassification(lang_module,
                                      texts_or_path=paths["texts"]["ner"],
                                      labels_or_path=paths["labels"]["ner"],
                                      batch_size=1)]
    schedule = SequentialSchedule(objectives, training_arguments)
    adapter = Adapter(lang_module, schedule, args=training_arguments)
    run_adaptation(adapter)
def test_mt_adaptation():
    """End-to-end MT adaptation: denoising + supervised seq2seq trained sequentially."""
    lang_module = LangModule(test_base_models["translation_mono"])
    objectives = [DenoisingObjective(lang_module,
                                     texts_or_path=paths["texts"]["unsup"],
                                     batch_size=1),
                  Sequence2Sequence(lang_module,
                                    texts_or_path=paths["texts"]["translation"],
                                    labels_or_path=paths["labels"]["translation"],
                                    batch_size=1)]
    schedule = SequentialSchedule(objectives, training_arguments)
    adapter = Adapter(lang_module, schedule, args=training_arguments)
    run_adaptation(adapter)
def test_mt_adaptation_bt():
    """End-to-end MT adaptation with back-translation as the unsupervised objective."""
    lang_module = LangModule(test_base_models["translation_mono"])
    # Reverse-direction translator (cs->en) generates synthetic sources for back-translation.
    translator = BackTranslator("Helsinki-NLP/opus-mt-cs-en")
    objectives = [BackTranslation(lang_module,
                                  back_translator=translator,
                                  texts_or_path=paths["texts"]["unsup"],
                                  batch_size=4),
                  Sequence2Sequence(lang_module,
                                    texts_or_path=paths["texts"]["translation"],
                                    labels_or_path=paths["labels"]["translation"],
                                    batch_size=1)]
    schedule = SequentialSchedule(objectives, training_arguments)
    adapter = Adapter(lang_module, schedule, args=training_arguments)
    run_adaptation(adapter)
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForTokenClassification, \
AutoModelForSequenceClassification
from adaptor.adapter import Adapter
from adaptor.evaluators.generative import BLEU
from adaptor.evaluators.token_classification import MeanFScore, AverageAccuracy
from adaptor.lang_module import LangModule
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.objectives.backtranslation import BackTranslation, BackTranslator
from adaptor.objectives.classification import SequenceClassification, TokenClassification
from adaptor.objectives.seq2seq import Sequence2Sequence
from adaptor.schedules import ParallelSchedule, SequentialSchedule
from utils import training_arguments, paths, test_base_models
# Mock corpora shared by the use-case tests below (see `paths` in utils).
unsup_target_domain_texts = paths["texts"]["unsup"]
sup_target_domain_texts = paths["texts"]["ner"]
sup_target_domain_labels = paths["labels"]["ner"]
# Output directory for the NER model trained in test_adaptation_ner().
ner_model_out_dir = "entity_detector_model"
def test_adaptation_ner():
    """Full user flow: train MLM + NER heads, save, then reload and tag via plain HF API."""
    # 1. pick the models - randomly pre-initialize the appropriate heads
    lang_module = LangModule(test_base_models["token_classification"])
    # 2. pick objectives
    # Objectives take either List[str] for in-memory iteration, or a source file path for streamed iteration
    objectives = [MaskedLanguageModeling(lang_module,
                                         batch_size=1,
                                         texts_or_path=paths["texts"]["unsup"]),
                  TokenClassification(lang_module,
                                      batch_size=1,
                                      texts_or_path=paths["texts"]["ner"],
                                      labels_or_path=paths["labels"]["ner"])]
    # 4. pick a schedule of the selected objectives
    schedule = SequentialSchedule(objectives, training_arguments)
    # 5. Run the training using Adapter, similarly to running HF.Trainer, only adding `schedule`
    adapter = Adapter(lang_module, schedule, training_arguments)
    adapter.train()
    # 6. save the trained lang_module (with all heads)
    adapter.save_model(ner_model_out_dir)
    # 7. reload and use it like any other Hugging Face model
    ner_model = AutoModelForTokenClassification.from_pretrained("%s/TokenClassification" % ner_model_out_dir)
    tokenizer = AutoTokenizer.from_pretrained("%s/TokenClassification" % ner_model_out_dir)
    inputs = tokenizer("Is there any Abraham Lincoln here?", return_tensors="pt")
    outputs = ner_model(**inputs)
    # Map argmax logits back to label strings; only checks that tagging produced output.
    ner_tags = [ner_model.config.id2label[label_id.item()] for label_id in outputs.logits[0].argmax(-1)]
    assert ner_tags
def test_adaptation_translation():
    """Full user flow: back-translation + supervised seq2seq in parallel, save, reload, generate."""
    # 1. pick the models - randomly pre-initialize the appropriate heads
    lang_module = LangModule(test_base_models["translation_mono"])
    # (optional) pick train and validation evaluators for the objectives
    seq2seq_evaluators = [BLEU(use_generate=True, decides_convergence=True)]
    # 2. pick objectives
    objectives = [BackTranslation(lang_module,
                                  back_translator=BackTranslator("Helsinki-NLP/opus-mt-cs-en"),
                                  batch_size=1,
                                  texts_or_path=paths["texts"]["unsup"]),
                  Sequence2Sequence(lang_module, batch_size=1,
                                    texts_or_path=paths["texts"]["translation"],
                                    val_evaluators=seq2seq_evaluators,
                                    labels_or_path=paths["labels"]["translation"])]
    # 3. pick a schedule of the selected objectives
    # this one will shuffle the batches of both objectives
    schedule = ParallelSchedule(objectives, training_arguments)
    # 4. train using Adapter
    adapter = Adapter(lang_module, schedule, training_arguments)
    adapter.train()
    # 5. save the trained (multi-headed) lang_module
    adapter.save_model("translator_model")
    # 6. reload and use it like any other Hugging Face model
    translator_model = AutoModelForSeq2SeqLM.from_pretrained("translator_model/Sequence2Sequence")
    tokenizer = AutoTokenizer.from_pretrained("translator_model/Sequence2Sequence")
    tokenizer.src_lang, tokenizer.tgt_lang = "en", "cs"
    inputs = tokenizer("A piece of text to translate.", return_tensors="pt")
    output_ids = translator_model.generate(**inputs)
    output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    print(output_text)
def test_evaluation_ner():
    """Evaluate the model trained by test_adaptation_ner() via an eval-only objective."""
    # first, create a model to evaluate:
    test_adaptation_ner()
    # reload LangModule from this directory
    lang_module = LangModule("%s/TokenClassification" % ner_model_out_dir)
    evaluators = [MeanFScore(), AverageAccuracy()]
    # evaluate the result again through the Objective, that takes care of labels alignment
    # (empty train texts/labels make this an evaluation-only objective)
    eval_ner_objective = TokenClassification(lang_module,
                                             batch_size=1,
                                             texts_or_path=[],
                                             labels_or_path=[],
                                             val_texts_or_path=paths["texts"]["ner"],
                                             val_labels_or_path=paths["labels"]["ner"],
                                             val_evaluators=evaluators)
    evaluation = eval_ner_objective.per_objective_log("eval")
    for evaluator in evaluators:
        # each evaluator must have produced a log entry keyed by objective and evaluator ids
        assert "eval_%s_%s" % (eval_ner_objective, evaluator) in evaluation
def test_adaptation_classification():
    """Full user flow: MLM + sequence classification in parallel, save, reload, classify."""
    lang_module = LangModule(test_base_models["sequence_classification"])
    mlm = MaskedLanguageModeling(lang_module=lang_module,
                                 texts_or_path=paths["texts"]["unsup"],
                                 batch_size=1)
    classification = SequenceClassification(lang_module=lang_module,
                                            texts_or_path=paths["texts"]["classification"],
                                            labels_or_path=paths["labels"]["classification"],
                                            batch_size=1)
    parallel_schedule = ParallelSchedule(objectives=[mlm, classification],
                                         args=training_arguments)
    # 4. train using Adapter
    adapter = Adapter(lang_module=lang_module,
                      schedule=parallel_schedule,
                      args=training_arguments)
    adapter.train()
    # 5. save the trained (multi-headed) lang_module
    adapter.save_model("output_models")
    # 6. reload and use it like any other Hugging Face model
    classifier = AutoModelForSequenceClassification.from_pretrained("output_models/SequenceClassification")
    tokenizer = AutoTokenizer.from_pretrained("output_models/SequenceClassification")
    inputs = tokenizer("A piece of text to translate.", return_tensors="pt")
    output = classifier(**inputs)
    output_label_id = output.logits.argmax(-1)[0].item()
    print("Prediction: %s" % classifier.config.id2label[output_label_id])
from adaptor.evaluators.generative import GenerativeEvaluator
from adaptor.evaluators.question_answering import ExtractiveQAEvaluator
from adaptor.evaluators.sequence_classification import SeqClassificationEvaluator
from adaptor.evaluators.token_classification import TokenClassificationEvaluator
from adaptor.lang_module import LangModule
from adaptor.objectives.objective_base import Objective
from adaptor.objectives.seq2seq import Sequence2Sequence
from utils import paths, test_base_models
def assert_evaluator_logs(lang_module: LangModule, objective: Objective, split: str) -> None:
    """Shared check: objective yields a batch, produces a loss, and logs each of its evaluators.

    :param lang_module: Language module the objective's head belongs to.
    :param objective: Objective whose dataset, loss and logging are verified.
    :param split: Data split to exercise ("train" or "eval").
    """
    # dataset iteration test
    dataset_sample = next(iter(objective.get_dataset(split, objective_i=0, device="cpu")))
    # providing labels makes HF lang_module to compute its own loss, which is in DA redundantly done by Objective
    outputs = lang_module(**dataset_sample)
    # request objective for its loss
    loss = objective.compute_loss(outputs, dataset_sample["labels"], dataset_sample, split)
    assert loss.item()
    log = objective.per_objective_log(split)
    # assert that objective's id can be found in each key of the logs
    assert all(str(objective) in k for k in log.keys())
    for split_evaluator in objective.evaluators[split]:
        # assert that each evaluator of given split was logged and has a value of expected type
        assert any(str(split_evaluator) in k and isinstance(v, float) for k, v in log.items())
# Shared language modules constructed once, so individual tests avoid repeated model loading.
gen_lang_module = LangModule(test_base_models["translation_mono"])
gen_lang_module_multi = LangModule(test_base_models["translation_multi"]["model"])
def assert_gen_evaluator_logs(evaluator: GenerativeEvaluator, split: str) -> None:
    """Attach a generative evaluator to a seq2seq objective (mono model) and verify its logs."""
    gen_objective = Sequence2Sequence(gen_lang_module,
                                      texts_or_path=paths["texts"]["translation"],
                                      labels_or_path=paths["labels"]["translation"],
                                      batch_size=1,
                                      train_evaluators=[evaluator],
                                      val_evaluators=[evaluator])
    assert_evaluator_logs(gen_lang_module, gen_objective, split)
def assert_gen_evaluator_logs_mbart(evaluator: GenerativeEvaluator, split: str) -> None:
    """Same as assert_gen_evaluator_logs, but on the multilingual (mBART) model with explicit lang ids."""
    gen_objective = Sequence2Sequence(gen_lang_module_multi,
                                      texts_or_path=paths["texts"]["translation"],
                                      labels_or_path=paths["labels"]["translation"],
                                      batch_size=1,
                                      train_evaluators=[evaluator],
                                      val_evaluators=[evaluator],
                                      source_lang_id=test_base_models["translation_multi"]["test_src_lang"],
                                      target_lang_id=test_base_models["translation_multi"]["test_tgt_lang"])
    assert_evaluator_logs(gen_lang_module_multi, gen_objective, split)
def assert_ner_evaluator_logs(evaluator: TokenClassificationEvaluator, split: str) -> None:
    """Attach a token-classification evaluator to a NER objective and verify its logs."""
    from adaptor.objectives.classification import TokenClassification
    lang_module = LangModule(test_base_models["token_classification"])
    gen_objective = TokenClassification(lang_module,
                                        texts_or_path=paths["texts"]["ner"],
                                        labels_or_path=paths["labels"]["ner"],
                                        batch_size=1,
                                        train_evaluators=[evaluator],
                                        val_evaluators=[evaluator])
    assert_evaluator_logs(lang_module, gen_objective, split)
def assert_classification_evaluator_logs(evaluator: SeqClassificationEvaluator, split: str) -> None:
    """Attach a sequence-classification evaluator to a classification objective and verify its logs."""
    from adaptor.objectives.classification import SequenceClassification
    lang_module = LangModule(test_base_models["sequence_classification"])
    gen_objective = SequenceClassification(lang_module,
                                           texts_or_path=paths["texts"]["classification"],
                                           labels_or_path=paths["labels"]["classification"],
                                           batch_size=1,
                                           train_evaluators=[evaluator],
                                           val_evaluators=[evaluator])
    assert_evaluator_logs(lang_module, gen_objective, split)
def assert_qa_evaluator_logs(evaluator: ExtractiveQAEvaluator, split: str) -> None:
    """Attach an extractive-QA evaluator to an ExtractiveQA objective and verify its logs."""
    from adaptor.objectives.question_answering import ExtractiveQA
    lang_module = LangModule(test_base_models["extractive_QA"])
    qa_objective = ExtractiveQA(lang_module,
                                texts_or_path=paths["texts"]["QA"],
                                text_pair_or_path=paths["text_pair"]["QA"],
                                labels_or_path=paths["labels"]["QA"],
                                batch_size=2,
                                train_evaluators=[evaluator],
                                val_evaluators=[evaluator])
    assert_evaluator_logs(lang_module, qa_objective, split)
# Per-evaluator smoke tests: each imports its evaluator lazily (heavy optional deps)
# and delegates to the shared assertion helper for the "train" split.


def test_bleu():
    from adaptor.evaluators.generative import BLEU
    assert_gen_evaluator_logs(BLEU(use_generate=True, decides_convergence=True), "train")


def test_bleu_mbart():
    from adaptor.evaluators.generative import BLEU
    assert_gen_evaluator_logs_mbart(BLEU(use_generate=True, decides_convergence=True), "train")


def test_rouge():
    from adaptor.evaluators.generative import ROUGE
    assert_gen_evaluator_logs(ROUGE(use_generate=False, decides_convergence=True), "train")


def test_bertscore():
    from adaptor.evaluators.generative import BERTScore
    assert_gen_evaluator_logs(BERTScore(use_generate=False, decides_convergence=True), "train")


def test_meteor():
    from adaptor.evaluators.generative import METEOR
    assert_gen_evaluator_logs(METEOR(decides_convergence=True), "train")


def test_prism():
    """
    PRISM downloads relatively big model, we omit that by default.
    """
    # from adaptor.evaluators.generative import PRISM
    # assert_gen_evaluator_logs(PRISM(use_cuda=False, language="en", decides_convergence=True), "train")


def test_divergence():
    """
    Default JS_Divergence uses PRISM - note that this test will download PRISM model
    """
    # from adaptor.evaluators.generative import JS_Divergence
    # assert_gen_evaluator_logs(JS_Divergence(decides_convergence=True), "train")


def test_token_fscore():
    from adaptor.evaluators.token_classification import MeanFScore
    assert_ner_evaluator_logs(MeanFScore(decides_convergence=True), "train")


def test_sequence_accuracy():
    from adaptor.evaluators.sequence_classification import SequenceAccuracy
    assert_classification_evaluator_logs(SequenceAccuracy(decides_convergence=False), "train")


def test_QA_exact_match():
    from adaptor.evaluators.question_answering import ExactMatch
    assert_qa_evaluator_logs(ExactMatch(), "train")


def test_QA_fscore():
    from adaptor.evaluators.question_answering import F1ScoreForQA
    assert_qa_evaluator_logs(F1ScoreForQA(), "train")


def test_QA_BLEU():
    from adaptor.evaluators.question_answering import BLEUForQA
    assert_qa_evaluator_logs(BLEUForQA(), "train")
from adaptor.lang_module import LangModule
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.objectives.classification import TokenClassification
from utils import paths
def test_lang_module():
    """Smoke test: a LangModule can be constructed from a HF checkpoint name."""
    # Successful construction yielding a truthy object is all this test verifies.
    assert LangModule("bert-base-multilingual-cased")
def test_register_head():
    """Registering a TokenClassification objective attaches a compatible head model."""
    lang_module = LangModule("bert-base-multilingual-cased")
    objective = TokenClassification(lang_module,
                                    texts_or_path=paths["texts"]["ner"],
                                    labels_or_path=paths["labels"]["ner"],
                                    batch_size=4)
    assert objective.compatible_head_model
def test_merge_objectives():
    """Objectives sharing a lang_module should share (merge) identical head parameters in memory."""
    import torch
    lang_module = LangModule("bert-base-multilingual-cased")
    objective_base = TokenClassification(lang_module,
                                         texts_or_path=paths["texts"]["ner"],
                                         labels_or_path=paths["labels"]["ner"],
                                         batch_size=4)
    objective_new = MaskedLanguageModeling(lang_module,
                                           texts_or_path=paths["texts"]["unsup"],
                                           batch_size=4)
    # check that merge-able modules now refer to the same physical address
    for i, (new_param_key, orig_model_param) in enumerate(objective_base.compatible_head_model.named_parameters()):
        if new_param_key in dict(objective_new.compatible_head_model.named_parameters()):
            new_model_param = objective_new.compatible_head_model.get_parameter(new_param_key)
            # same shape + same values => the merge should have made them the SAME tensor object
            if orig_model_param.shape == new_model_param.shape and \
                    torch.all(orig_model_param == new_model_param):
                assert id(new_model_param) == id(orig_model_param)
from adaptor.lang_module import LangModule
from adaptor.objectives.CLM import CausalLanguageModeling
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.objectives.backtranslation import BackTranslation, BackTranslator
from adaptor.objectives.classification import TokenClassification
from adaptor.objectives.denoising import DenoisingObjective
from adaptor.objectives.objective_base import Objective
from adaptor.objectives.question_answering import ExtractiveQA
from adaptor.objectives.seq2seq import Sequence2Sequence
from utils import paths, test_base_models
def assert_module_objective_ok(lang_module: LangModule, objective: Objective, split: str = "train"):
    """Shared check: objective yields a batch, lang_module runs it, and the loss backpropagates.

    :param lang_module: Language module hosting the objective's head.
    :param objective: Objective under test.
    :param split: Data split to draw the sample batch from.
    """
    # dataset iteration test
    dataset_sample = next(iter(objective.get_dataset(split, objective_i=0, device="cpu")))
    # providing labels makes HF lang_module to compute its own loss, which is in DA redundantly done by Objective
    outputs = lang_module(**dataset_sample)
    # loss computation test, possible label smoothing is performed by Adapter
    loss = objective.compute_loss(outputs, dataset_sample["labels"], dataset_sample, split)
    # check that retrieved loss has a backward_fn
    loss.backward()
    assert True
# Per-objective smoke tests: each builds one objective on a small base model and
# delegates to assert_module_objective_ok (dataset -> forward -> loss -> backward).


def test_token_classification_objective():
    lang_module = LangModule(test_base_models["token_classification"])
    objective = TokenClassification(lang_module,
                                    texts_or_path=paths["texts"]["ner"],
                                    labels_or_path=paths["labels"]["ner"],
                                    batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_mlm_objective():
    lang_module = LangModule(test_base_models["token_classification"])
    objective = MaskedLanguageModeling(lang_module,
                                       texts_or_path=paths["texts"]["unsup"],
                                       batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_clm_unsup_objective_bert():
    # CLM on an encoder (BERT) base model
    lang_module = LangModule(test_base_models["token_classification"])
    objective = CausalLanguageModeling(lang_module,
                                       texts_or_path=paths["texts"]["unsup"],
                                       batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_clm_unsup_objective_marian():
    # CLM on an encoder-decoder (Marian) base model
    lang_module = LangModule(test_base_models["translation_mono"])
    objective = CausalLanguageModeling(lang_module,
                                       texts_or_path=paths["texts"]["unsup"],
                                       batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_denoising_objective():
    lang_module = LangModule(test_base_models["translation_mono"])
    objective = DenoisingObjective(lang_module, texts_or_path=paths["texts"]["unsup"], batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_backtranslation_objective():
    # we are adapting cs->en translator with back-translation
    lang_module = LangModule(test_base_models["translation_mono"])
    translator = BackTranslator("Helsinki-NLP/opus-mt-en-cs")
    objective = BackTranslation(lang_module,
                                back_translator=translator,
                                texts_or_path=paths["texts"]["unsup"],
                                batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_backtranslation_objective_mbart():
    # we are adapting cs->en translator with back-translation
    lang_module = LangModule(test_base_models["translation_multi"]["model"])
    translator = BackTranslator("Helsinki-NLP/opus-mt-en-cs")
    objective = BackTranslation(lang_module,
                                back_translator=translator,
                                texts_or_path=paths["texts"]["unsup"],
                                batch_size=4,
                                source_lang_id=test_base_models["translation_multi"]["test_src_lang"],
                                target_lang_id=test_base_models["translation_multi"]["test_tgt_lang"])
    assert_module_objective_ok(lang_module, objective)


def test_supervised_seq2seq_objective():
    lang_module = LangModule(test_base_models["translation_mono"])
    objective = Sequence2Sequence(lang_module,
                                  texts_or_path=paths["texts"]["translation"],
                                  labels_or_path=paths["labels"]["translation"],
                                  batch_size=4)
    assert_module_objective_ok(lang_module, objective)


def test_supervised_seq2seq_objective_mbart():
    # we are adapting cs->en translator with back-translation
    lang_module = LangModule(test_base_models["translation_multi"]["model"])
    objective = Sequence2Sequence(lang_module,
                                  texts_or_path=paths["texts"]["translation"],
                                  labels_or_path=paths["labels"]["translation"],
                                  batch_size=4,
                                  source_lang_id=test_base_models["translation_multi"]["test_src_lang"],
                                  target_lang_id=test_base_models["translation_multi"]["test_tgt_lang"])
    assert_module_objective_ok(lang_module, objective)


def test_supervised_QA_objective():
    lang_module = LangModule(test_base_models["extractive_QA"])
    objective = ExtractiveQA(lang_module,
                             texts_or_path=paths["texts"]["QA"],
                             text_pair_or_path=paths["text_pair"]["QA"],
                             labels_or_path=paths["labels"]["QA"],
                             batch_size=4)
    assert_module_objective_ok(lang_module, objective)
from adaptor.lang_module import LangModule
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.objectives.classification import TokenClassification
from adaptor.objectives.denoising import DenoisingObjective
from adaptor.objectives.seq2seq import Sequence2Sequence
from adaptor.schedules import SequentialSchedule, Schedule, ParallelSchedule
from adaptor.utils import AdaptationArguments, StoppingStrategy
from utils import test_base_models, paths
# Mock corpora for the schedule tests.
unsup_target_domain_texts = "mock_data/domain_unsup.txt"
sup_target_domain_texts = "mock_data/supervised_texts.txt"
sup_target_domain_labels = "mock_data/supervised_texts_token_labels.txt"
sup_translation_texts_src = "mock_data/seq2seq_sources.txt"
sup_translation_texts_tgt = "mock_data/seq2seq_targets.txt"
# Convergence-based stopping (with per-step eval) so schedules terminate early in tests.
args = AdaptationArguments(output_dir="adaptation_output_dir",
                           stopping_strategy=StoppingStrategy.FIRST_OBJECTIVE_CONVERGED,
                           do_train=True,
                           do_eval=True,
                           gradient_accumulation_steps=2,
                           log_level="critical",
                           logging_steps=1,
                           eval_steps=1,
                           num_train_epochs=10)
def assert_schedule(lang_module: LangModule, schedule: Schedule):
    """Run a full pass over the schedule's train and eval streams and verify per-objective logs.

    For each split: forward every batch through `lang_module`, compute the schedule's
    combined loss, backpropagate it, and then check that every objective of that split
    is represented by at least one key in the split's logs.

    :param lang_module: Multi-head language module shared by the schedule's objectives.
    :param schedule: Schedule under test.
    """
    for batch in schedule.iterable_dataset("train"):
        logit_outputs = lang_module(**batch)
        loss_combined = schedule.compute_loss(logit_outputs, batch["labels"], batch)
        loss_combined.backward()
    # every objective has some key in its logs
    train_logs = schedule.objectives_log("train")
    # BUG FIX: the original `any(str(obj) for log_key, _ in logs.items())` never inspected
    # log_key, so it was vacuously true whenever any log entry existed. Require the
    # objective's id to actually appear in some log key (mirrors assert_evaluator_logs).
    assert all(any(str(obj) in log_key for log_key in train_logs)
               for obj in schedule.objectives["train"].keys())
    for batch in schedule.iterable_dataset("eval"):
        logit_outputs = lang_module(**batch)
        loss_combined = schedule.compute_loss(logit_outputs, batch["labels"], batch)
        loss_combined.backward()
    eval_logs = schedule.objectives_log("eval")
    assert all(any(str(obj) in log_key for log_key in eval_logs)
               for obj in schedule.objectives["eval"].keys())
def ner_da_schedule(schedule_type):
    """Build an MLM + token-classification schedule of the given type and run the shared checks.

    :param schedule_type: Schedule subclass to instantiate (e.g. SequentialSchedule, ParallelSchedule).
    """
    lang_module = LangModule(test_base_models["token_classification"])
    lm_adaptation = MaskedLanguageModeling(lang_module,
                                           texts_or_path=unsup_target_domain_texts,
                                           val_texts_or_path=unsup_target_domain_texts,
                                           batch_size=1)
    token_classification = TokenClassification(lang_module,
                                               texts_or_path=sup_target_domain_texts,
                                               labels_or_path=sup_target_domain_labels,
                                               val_texts_or_path=sup_target_domain_texts,
                                               val_labels_or_path=sup_target_domain_labels,
                                               batch_size=1)
    assert_schedule(lang_module, schedule_type(objectives=[lm_adaptation, token_classification], args=args))
def test_ner_da_schedule_sequential():
    # Sequential schedule: objectives are trained one after another.
    ner_da_schedule(SequentialSchedule)


def test_ner_da_schedule_strided():
    # Parallel schedule: batches of the objectives are interleaved.
    ner_da_schedule(ParallelSchedule)
def test_mt_da_schedule():
    """MT domain adaptation schedule: denoising + supervised seq2seq, sequentially."""
    lang_module = LangModule(test_base_models["translation_mono"])
    denoising_adaptation = DenoisingObjective(lang_module,
                                              texts_or_path=unsup_target_domain_texts,
                                              val_texts_or_path=unsup_target_domain_texts,
                                              batch_size=1)
    clm_finetuning = Sequence2Sequence(lang_module,
                                       texts_or_path=sup_translation_texts_src,
                                       labels_or_path=sup_translation_texts_tgt,
                                       val_texts_or_path=sup_translation_texts_src,
                                       val_labels_or_path=sup_translation_texts_tgt,
                                       batch_size=1)
    assert_schedule(lang_module, SequentialSchedule(objectives=[denoising_adaptation, clm_finetuning], args=args))
def test_multilang_multiobj_langs_do_match():
    # we check that BoS tokens in objectives (sharing the tokenizer) are resolved correctly, in both inputs and labels
    lang_module = LangModule(test_base_models["translation_multi"]["model"])
    # Two mirror-image translation objectives (src<->tgt swapped) on one shared tokenizer.
    objectives = [Sequence2Sequence(lang_module,
                                    texts_or_path=paths["texts"]["translation"],
                                    labels_or_path=paths["labels"]["translation"],
                                    batch_size=1,
                                    source_lang_id=test_base_models["translation_multi"]["test_src_lang"],
                                    target_lang_id=test_base_models["translation_multi"]["test_tgt_lang"]),
                  Sequence2Sequence(lang_module,
                                    texts_or_path=paths["labels"]["translation"],
                                    labels_or_path=paths["texts"]["translation"],
                                    batch_size=1,
                                    source_lang_id=test_base_models["translation_multi"]["test_tgt_lang"],
                                    target_lang_id=test_base_models["translation_multi"]["test_src_lang"])]
    schedule = ParallelSchedule(objectives, args=args)
    # we iterate over two batches, associated with objectives in the corresponding order
    for objective, objective_batch in zip(objectives * 2, schedule.iterable_dataset("train")):
        # decode the first language-code token found in the batch's inputs and labels
        sample_input_lang = lang_module.tokenizer.decode([t_id for t_id in objective_batch["input_ids"][0]
                                                          if t_id in lang_module.tokenizer.lang_code_to_id.values()][0])
        sample_label_lang = lang_module.tokenizer.decode([t_id for t_id in objective_batch["labels"][0]
                                                          if t_id in lang_module.tokenizer.lang_code_to_id.values()][0])
        assert sample_input_lang == objective.source_lang_id
        assert sample_label_lang == objective.target_lang_id
from adaptor.utils import AdaptationArguments, StoppingStrategy
# Locations of the mock corpora used across the test suite, keyed by role and task.
paths = {
    "texts": {
        "ner": "mock_data/supervised_texts.txt",
        "classification": "mock_data/supervised_texts.txt",
        "translation": "mock_data/seq2seq_sources.txt",
        "unsup": "mock_data/domain_unsup.txt",
        "QA": "mock_data/QA_questions.txt"
    },
    "labels": {
        "ner": "mock_data/supervised_texts_token_labels.txt",
        "classification": "mock_data/supervised_texts_sequence_labels.txt",
        "translation": "mock_data/seq2seq_targets.txt",
        "QA": "mock_data/QA_answers.txt"
    },
    "text_pair": {
        # secondary input texts (QA contexts) paired with the "texts" questions
        "QA": "mock_data/QA_contexts.txt"
    }
}
# Small Hugging Face checkpoints used as base models in tests, keyed by task.
test_base_models = {
    "translation_mono": "Helsinki-NLP/opus-mt-en-cs",
    "translation_multi": {
        "model": "sshleifer/tiny-mbart",
        "test_src_lang": "en_XX",
        "test_tgt_lang": "cs_CZ"
    },
    "token_classification": "bert-base-cased",
    "sequence_classification": "bert-base-cased",
    "extractive_QA": "bert-base-cased",
    "MLM": "bert-base-cased",
    "MLM_student": "distilbert-base-cased"
}
# Shared training configuration importable by the individual test modules.
training_arguments = AdaptationArguments(output_dir="adaptation_output_dir",
                                         stopping_strategy=StoppingStrategy.FIRST_OBJECTIVE_NUM_EPOCHS,
                                         do_train=True,
                                         do_eval=True,
                                         gradient_accumulation_steps=2,
                                         log_level="critical",
                                         logging_steps=1,
                                         num_train_epochs=2)