1. MODEL - habibmarzuqi/Atom GitHub Wiki
Install required libraries
!pip install transformers datasets
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
Cek apakah GPU tersedia
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Load dataset
# Load the "imdb" dataset; its "train" and "test" entries are indexed later
# in this script.
dataset = load_dataset("imdb")
Load the pre-trained tokenizer and model
# NOTE: the checkpoint previously used here, "stepfun-ai/GOT-OCR2_0", is an
# OCR vision-language model shipped with custom modelling code. It cannot be
# loaded through AutoModelForSequenceClassification, so a text-encoder
# checkpoint that supports a classification head is used instead.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# num_labels=2 sizes the classification head for a two-class task.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
model.to(device)
Tokenize dataset
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    The module-level ``tokenizer`` is called with truncation and padding
    enabled, and its output is returned unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding=True)
# Run preprocess_function over the dataset in batched mode, so the function
# receives many examples per call rather than one.
tokenized_datasets = dataset.map(preprocess_function, batched=True)
Select shuffled subsets of the existing train and test splits
# Use small shuffled subsets to keep this example quick to run. The seed is
# fixed so the same examples are selected on every run.
# The test_dataset assignment must sit on its own line: when it trailed the
# inline comment it became part of that comment and was never executed.
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10000))
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(2000))
Definisikan argumen pelatihan
# Training configuration: checkpoints and logs go to ./results, and the model
# is evaluated at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is the deprecated spelling of this option; recent
    # Transformers releases expect `eval_strategy` (available from v4.41).
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)
Definisikan Trainer
# Build the Trainer from the model, the training arguments, and the train and
# evaluation subsets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Pad every batch to its own longest sequence at collation time. Without
    # an explicit collator the Trainer only stacks tensors, which works only
    # if all examples already have exactly the same length.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
Melatih model
# Run the training loop on the train_dataset given to the Trainer, using the
# settings in training_args.
trainer.train()
Evaluasi model
# Evaluate on the eval_dataset given to the Trainer. The result is kept and
# printed so it stays visible when this runs as a plain script, where a bare
# expression's value would otherwise be discarded.
eval_metrics = trainer.evaluate()
print(eval_metrics)
Simpan model setelah dilatih
# Write the model and the tokenizer to the same directory.
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")