# Fine-tune DistilBERT for binary sentiment classification on a small IMDB sample.
from datasets import load_dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch

# Load 2% of the IMDB training split, then hold out 20% of that for evaluation.
# Each example has a "text" field and a "label" (0 = negative, 1 = positive).
dataset = load_dataset("imdb", split="train[:2%]").train_test_split(test_size=0.2)

# Load the pretrained tokenizer and a DistilBERT encoder with a fresh two-label
# classification head (the head is randomly initialized and learned during fine-tuning).
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

# Tokenize in batches, padding to the longest example in each map batch and
# truncating anything over the model's maximum sequence length.
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize, batched=True)
# Trainer expects the target column to be named "labels".
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

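# Optional alternative (an assumption, not part of the original script): padding
# inside map() pads every example to the longest one in a 1,000-row map batch.
# A DataCollatorWithPadding instead pads each training batch on the fly, which
# usually yields shorter sequences. To use it, drop padding=True from tokenize()
# above and pass data_collator=data_collator to the Trainer below.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
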
# Keep the run small: one epoch, tiny batches, logs and checkpoints every 10 steps.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # newer transformers versions call this eval_strategy
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=10,
    report_to="none",  # disable external loggers such as W&B and TensorBoard
)

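# Optional (an assumption, not in the original script): a compute_metrics hook so
# each evaluation reports accuracy instead of just the loss. Enable it by passing
# compute_metrics=compute_metrics to the Trainer below.
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}
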
# The Trainer wires model, arguments, and data together and runs the training loop.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Fine-tune the model; evaluation runs at the end of each epoch.
trainer.train()

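# Optional final check (a sketch, not part of the original script):
# trainer.evaluate() runs a full pass over the eval split and returns a metrics dict.
metrics = trainer.evaluate()
print(metrics)
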
# Persist the fine-tuned weights and config for later reuse.
trainer.save_model("my-simple-sentiment-model")

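# Minimal inference sketch (an assumption, not part of the original script).
# save_model() writes the model weights and config but not the tokenizer, so
# save the tokenizer as well to make the directory self-contained:
tokenizer.save_pretrained("my-simple-sentiment-model")

loaded_tokenizer = DistilBertTokenizerFast.from_pretrained("my-simple-sentiment-model")
loaded_model = DistilBertForSequenceClassification.from_pretrained("my-simple-sentiment-model")

# IMDB labels: 0 = negative, 1 = positive.
inputs = loaded_tokenizer("A genuinely delightful film.", return_tensors="pt")
with torch.no_grad():
    logits = loaded_model(**inputs).logits
print("positive" if logits.argmax(dim=-1).item() == 1 else "negative")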