soft-analytics-02/train_model.py

77 lines
3.0 KiB
Python

import os
import pandas as pd
# Restrict CUDA to the device with index 2. The assignment deliberately sits
# ahead of `import torch` so the variable is already set when torch is loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
from transformers import T5ForConditionalGeneration
from train.evaluate import evaluate_accuracy
from train.finetune import fine_tune_with_eval
from train.dataset import build_pretrain_dataloader, build_fine_tune_dataloader
from train.pretrain import pretrain
from train.load import DataSet
# Directory containing this script; every path below is resolved against it.
_SCRIPT_DIR: str = os.path.dirname(__file__)

# Inputs handed to DataSet.load: the extracted functions (parquet) and the
# USI test set (CSV).
IN_PATH: str = os.path.join(_SCRIPT_DIR, 'dataset', 'extracted', 'functions.pq')
IN_PATH_USI: str = os.path.join(_SCRIPT_DIR, 'dataset', 'extracted', 'test_set_usi.csv')

# Root directory for model checkpoints, the best-epoch marker and the
# prediction CSV files.
OUT_PATH: str = os.path.join(_SCRIPT_DIR, 'models', 'final')

# Seed forwarded to DataSet.load.
RANDOM_STATE: int = 42
def train():
    """Run the pipeline end to end: pre-train, fine-tune, then evaluate.

    Stages whose artefacts are already on disk are skipped:

    * Pre-training: when ``config.json`` exists under ``OUT_PATH/pretrain``
      the model is loaded from that directory; otherwise
      ``Salesforce/codet5-small`` is loaded and handed to ``pretrain``.
    * Fine-tuning: when ``OUT_PATH/best.txt`` is missing,
      ``fine_tune_with_eval`` is run and its return value is written to
      that file.

    The value stored in ``best.txt`` is then read back as an integer and the
    model in ``OUT_PATH/<that integer>`` is evaluated on the regular test
    split and on the USI test set. Predictions are written to
    ``test_outputs.csv`` and ``test_usi_outputs.csv`` inside ``OUT_PATH`` and
    both accuracies are printed.
    """
    dataset = DataSet.load(IN_PATH, IN_PATH_USI, RANDOM_STATE)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')

    # --- Stage 1: obtain a pre-trained model (from disk or by training) ---
    checkpoint_dir = os.path.join(OUT_PATH, "pretrain")
    needs_pretraining = not os.path.isfile(os.path.join(checkpoint_dir, "config.json"))
    weights_source = 'Salesforce/codet5-small' if needs_pretraining else checkpoint_dir
    model = T5ForConditionalGeneration.from_pretrained(weights_source)
    model.to(device)
    if needs_pretraining:
        masked_loader = build_pretrain_dataloader(dataset.pretrain_df)
        # NOTE(review): the literal 1 is presumably the number of epochs -
        # confirm against train.pretrain.
        pretrain(model, masked_loader, device, 1, checkpoint_dir)

    # --- Stage 2: fine-tune unless a best-epoch marker already exists ---
    marker_path = os.path.join(OUT_PATH, "best.txt")
    if not os.path.isfile(marker_path):
        train_loader = build_fine_tune_dataloader(dataset.fine_tune_train_df, 'train')
        val_loader = build_fine_tune_dataloader(dataset.fine_tune_val_df, 'val')
        # NOTE(review): 20 is presumably the epoch budget - confirm against
        # train.finetune.
        winner = fine_tune_with_eval(model, device, train_loader, val_loader, 20, OUT_PATH)
        with open(marker_path, "w") as marker:
            print(winner, file=marker)

    # The marker file is always the source of truth, even right after it has
    # been written above.
    with open(marker_path, "r") as marker:
        best_epoch = int(marker.read().strip())

    # --- Stage 3: evaluate the checkpoint stored under the best epoch ---
    best_model = T5ForConditionalGeneration.from_pretrained(os.path.join(OUT_PATH, str(best_epoch)))
    best_model.to(device)

    def score_and_dump(loader, csv_name):
        """Evaluate ``best_model`` on ``loader``, save predictions, return accuracy."""
        accuracy, _, predictions = evaluate_accuracy(best_model, loader, device, track_predictions=True)
        pd.DataFrame.from_records(predictions).to_csv(os.path.join(OUT_PATH, csv_name))
        return accuracy

    # Both loaders are built up front, before any evaluation starts.
    held_out_loader = build_fine_tune_dataloader(dataset.fine_tune_test_df, 'test')
    usi_loader = build_fine_tune_dataloader(dataset.usi_test_df, 'test_usi')

    held_out_accuracy = score_and_dump(held_out_loader, 'test_outputs.csv')
    print(f"Test Accuracy: {held_out_accuracy * 100:02.02f}%")

    usi_accuracy = score_and_dump(usi_loader, 'test_usi_outputs.csv')
    print(f"USI Test Accuracy: {usi_accuracy * 100:02.02f}%")
# Launch the full pipeline only when this file is executed as a script;
# importing the module does not start training.
if __name__ == "__main__":
    train()