(Unofficial) PyTorch Implementation of EncT5: Fine-tuning T5 Encoder for Non-autoregressive Tasks
- `Tokenizer` and `Model` for EncT5.
- A BOS token (`<s>`) is added to the tokenizer, and this token is used for classification & regression.
  - This requires calling `model.resize_token_embeddings()`.
- BOS and EOS tokens are added automatically (see the decoding sketch after the usage snippet below):
  - Single sequence: `<s> X </s>`
  - Pair of sequences: `<s> A </s> B </s>`
- It is highly recommended to use the same version of `transformers` as listed below.
```
transformers==4.15.0
torch==1.8.1
sentencepiece==0.1.96
datasets==1.17.0
scikit-learn==0.24.2
```
```python
from enc_t5 import EncT5ForSequenceClassification, EncT5Tokenizer

model = EncT5ForSequenceClassification.from_pretrained("t5-base")
tokenizer = EncT5Tokenizer.from_pretrained("t5-base")

# Resize the embedding matrix, since the BOS token was added to the vocabulary
if model.config.vocab_size < len(tokenizer.get_vocab()):
    model.resize_token_embeddings(len(tokenizer.get_vocab()))
```
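As a quick sanity check (a sketch, not part of the repo; the example sentences are arbitrary), decoding the tokenized inputs shows the special-token pattern described above:

```python
from enc_t5 import EncT5Tokenizer

tokenizer = EncT5Tokenizer.from_pretrained("t5-base")

# Single sequence -> <s> X </s>
single = tokenizer("premise sentence")
print(tokenizer.convert_ids_to_tokens(single["input_ids"]))

# Pair of sequences -> <s> A </s> B </s>
pair = tokenizer("premise sentence", "hypothesis sentence")
print(tokenizer.convert_ids_to_tokens(pair["input_ids"]))
```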
- Uses the Huggingface Transformers `Trainer` for finetuning on GLUE tasks.
- Uses `T5 1.1 base` for finetuning.
- See `run_glue_tpu.sh` for more details.
- Uses the `AdamW` optimizer instead of `Adafactor`.
- Uses `EarlyStoppingCallback` (see the training sketch after this list).
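As a rough sketch only (not the repo's `run_glue_tpu.sh`): the hyperparameters, `num_labels`, and the choice of SST-2 below are placeholders, and the `Trainer` here simply relies on its default `AdamW` optimizer. Something along these lines should work:

```python
import numpy as np
from datasets import load_dataset
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
from enc_t5 import EncT5ForSequenceClassification, EncT5Tokenizer

model = EncT5ForSequenceClassification.from_pretrained("google/t5-v1_1-base", num_labels=2)
tokenizer = EncT5Tokenizer.from_pretrained("google/t5-v1_1-base")
if model.config.vocab_size < len(tokenizer.get_vocab()):
    model.resize_token_embeddings(len(tokenizer.get_vocab()))

# SST-2 as an example GLUE task
raw = load_dataset("glue", "sst2")
encoded = raw.map(lambda b: tokenizer(b["sentence"], truncation=True, max_length=128), batched=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    return {"accuracy": (np.argmax(logits, axis=-1) == labels).mean()}

args = TrainingArguments(
    output_dir="outputs",
    evaluation_strategy="epoch",      # evaluate each epoch so early stopping can trigger
    save_strategy="epoch",
    load_best_model_at_end=True,      # required by EarlyStoppingCallback
    metric_for_best_model="accuracy",
    learning_rate=1e-4,               # placeholder; see run_glue_tpu.sh for the actual values
    num_train_epochs=10,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    tokenizer=tokenizer,              # enables dynamic padding via the default data collator
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()  # Trainer uses AdamW by default in transformers 4.15
```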
| Task  | Metric      | Result (Paper) | Result (Implementation) |
| :---- | ----------- | :------------: | :---------------------: |
| CoLA  | Matthew     | 53.1           | 52.4                    |
| SST-2 | Acc         | 94.0           | 94.5                    |
| MRPC  | F1/Acc      | 91.5/88.3      | 91.7/88.0               |
| STS-B | PCC/SCC     | 80.5/79.3      | 88.0/88.3               |
| QQP   | F1/Acc      | 72.9/89.8      | 88.4/91.3               |
| MNLI  | Mis/Matched | 88.0/86.7      | 87.5/88.1               |
| QNLI  | Acc         | 93.3           | 93.2                    |
| RTE   | Acc         | 67.8           | 69.7                    |
Integration into the `sentence-transformers` library:

I tried to load this tokenizer with the `sentence-transformers` library, but it failed: `AutoTokenizer` couldn't load this tokenizer. So I simply added code to override `save_pretrained` and its dependencies, so that this tokenizer is saved as `T5Tokenizer`, its superclass.
```python
def save_pretrained(
    self,
    save_directory,
    legacy_format: Optional[bool] = None,
    filename_prefix: Optional[str] = None,
    push_to_hub: bool = False,
    **kwargs,
):
    if os.path.isfile(save_directory):
        logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
        return

    if push_to_hub:
        commit_message = kwargs.pop("commit_message", None)
        repo = self._create_or_get_repo(save_directory, **kwargs)

    os.makedirs(save_directory, exist_ok=True)

    special_tokens_map_file = os.path.join(
        save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
    )
    tokenizer_config_file = os.path.join(
        save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
    )

    tokenizer_config = copy.deepcopy(self.init_kwargs)
    if len(self.init_inputs) > 0:
        tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
    for file_id in self.vocab_files_names.keys():
        tokenizer_config.pop(file_id, None)

    # Sanitize AddedTokens
    def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
        if isinstance(obj, AddedToken):
            out = obj.__getstate__()
            if add_type_field:
                out["__type"] = "AddedToken"
            return out
        elif isinstance(obj, (list, tuple)):
            return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj)
        elif isinstance(obj, dict):
            return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()}
        return obj

    # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization
    tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)

    # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
    ############################################################################
    # Changed: record the superclass name (T5Tokenizer) instead of EncT5Tokenizer
    tokenizer_class = self.__class__.__base__.__name__
    ############################################################################

    # Remove the Fast at the end unless we have a special `PreTrainedTokenizerFast`
    if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast":
        tokenizer_class = tokenizer_class[:-4]
    tokenizer_config["tokenizer_class"] = tokenizer_class
    if getattr(self, "_auto_map", None) is not None:
        tokenizer_config["auto_map"] = self._auto_map
    if getattr(self, "_processor_class", None) is not None:
        tokenizer_config["processor_class"] = self._processor_class

    # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be
    # loaded from the Hub.
    if self._auto_class is not None:
        custom_object_save(self, save_directory, config=tokenizer_config)

    with open(tokenizer_config_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(tokenizer_config, ensure_ascii=False))
    logger.info(f"tokenizer config file saved in {tokenizer_config_file}")

    # Sanitize AddedTokens in special_tokens_map
    write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False)
    with open(special_tokens_map_file, "w", encoding="utf-8") as f:
        f.write(json.dumps(write_dict, ensure_ascii=False))
    logger.info(f"Special tokens file saved in {special_tokens_map_file}")

    file_names = (tokenizer_config_file, special_tokens_map_file)

    save_files = self._save_pretrained(
        save_directory=save_directory,
        file_names=file_names,
        legacy_format=legacy_format,
        filename_prefix=filename_prefix,
    )

    if push_to_hub:
        url = self._push_to_hub(repo, commit_message=commit_message)
        logger.info(f"Tokenizer pushed to the hub in this commit: {url}")

    return save_files
```
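With this override in place, the tokenizer should round-trip through `AutoTokenizer`. A minimal sketch (the directory name `enct5-tokenizer` is arbitrary, and the loaded class may be the fast `T5TokenizerFast` variant):

```python
from transformers import AutoTokenizer
from enc_t5 import EncT5Tokenizer

tokenizer = EncT5Tokenizer.from_pretrained("t5-base")
tokenizer.save_pretrained("enct5-tokenizer")  # tokenizer_config.json now records "T5Tokenizer"

# AutoTokenizer resolves the saved config to T5Tokenizer (or its fast variant) and loads it.
reloaded = AutoTokenizer.from_pretrained("enct5-tokenizer")
print(type(reloaded).__name__)
```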