Module monk.gluon.finetune.level_3_training_base
Expand source code
from gluon.finetune.imports import *
from system.imports import *
from gluon.finetune.level_2_model_base import finetune_model
class finetune_training(finetune_model):
    '''
    Base class for training and associated functions

    Args:
        verbose (int): Set verbosity levels
                        0 - Print Nothing
                        1 - Print desired details
    '''

    # File stems of the four per-epoch history logs kept in the log directory.
    _HISTORY_NAMES = ("val_acc_history", "val_loss_history", "train_acc_history", "train_loss_history")

    def __init__(self, verbose=1):
        super().__init__(verbose=verbose)

    ###############################################################################################################################################
    def _split_batch_to_ctx(self, batch):
        '''
        Split one (data, label) batch across the configured contexts

        Args:
            batch (sequence): batch[0] holds the inputs, batch[1] the labels

        Returns:
            list: Per-context input slices
            list: Per-context label slices
        '''
        ctx = self.system_dict["local"]["ctx"]
        data = mx.gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
        label = mx.gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
        return data, label

    def _forward_with_loss(self, data, label):
        '''
        Run the model and the loss criterion on every per-context slice

        Args:
            data (list): Per-context input slices
            label (list): Per-context label slices

        Returns:
            list: Model outputs, one per slice
            list: Loss arrays, one per slice
        '''
        model = self.system_dict["local"]["model"]
        criterion = self.system_dict["local"]["criterion"]
        outputs = [model(x) for x in data]
        losses = [criterion(yhat, y) for yhat, y in zip(outputs, label)]
        return outputs, losses

    @staticmethod
    def _mean_batch_loss(losses):
        '''
        Average the mean loss of every per-context slice into a python scalar

        Args:
            losses (list): Loss arrays, one per slice

        Returns:
            float: Mean loss over the slices
        '''
        # asscalar() copies the value to the host, which also waits for the
        # pending computation of that array to finish.
        return sum(l.mean().asscalar() for l in losses) / len(losses)

    def _progress_bar(self, total):
        '''
        Create a realtime progress bar when realtime display and verbosity are enabled

        Args:
            total (int): Number of steps the bar covers

        Returns:
            tqdm or None: Progress bar, or None when realtime progress is disabled
        '''
        if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
            return tqdm(total=total)
        return None

    @staticmethod
    def _parse_training_time(stamp):
        '''
        Convert a stored "<minutes>m <seconds>s" string into seconds

        Args:
            stamp (str): Training time as written by set_training_final

        Returns:
            int: Number of seconds
        '''
        minutes, seconds = stamp.split(" ")
        # Drop the trailing unit character ("m" / "s") before converting.
        return int(minutes[:-1]) * 60 + int(seconds[:-1])

    def _write_training_logs(self, histories):
        '''
        Save the four history arrays and refresh the accuracy and loss plots

        Args:
            histories (dict): Maps every name in _HISTORY_NAMES to its list of per-epoch values

        Returns:
            None
        '''
        log_dir = self.system_dict["log_dir"]
        for name in self._HISTORY_NAMES:
            np.save(log_dir + name + ".npy", np.array(histories[name]))
        create_train_test_plots_accuracy([histories["train_acc_history"], histories["val_acc_history"]],
                                         ["Epoch Num", "Accuracy"], log_dir, show_img=False, save_img=True)
        create_train_test_plots_loss([histories["train_loss_history"], histories["val_loss_history"]],
                                     ["Epoch Num", "Loss"], log_dir, show_img=False, save_img=True)

    def _estimate_pass_seconds(self, loader, training):
        '''
        Time a sample of batches from a loader and scale it to the full loader length

        Args:
            loader: Data loader supporting len() and iteration over (data, label) batches
            training (bool): If True run forward + backward under autograd recording,
                             else run a plain forward pass

        Returns:
            float: Estimated seconds for one full pass over the loader
        '''
        num_batches = len(loader)
        # Time roughly a tenth of the loader, but always at least one batch.
        sample_size = max(1, num_batches // 10)
        timed = 0
        start = time.time()
        for batch in loader:
            data, label = self._split_batch_to_ctx(batch)
            if(training):
                with ag.record():
                    _, losses = self._forward_with_loss(data, label)
                for l in losses:
                    l.backward()
            else:
                _, losses = self._forward_with_loss(data, label)
            # Fetch the scalar loss so the batch is fully computed before the
            # clock is read; the real training and validation loops do the same.
            self._mean_batch_loss(losses)
            timed += 1
            if(timed >= sample_size):
                break
        if(timed == 0):
            return 0.0
        return (time.time() - start) * num_batches / timed

    ###############################################################################################################################################
    def get_training_estimate(self):
        '''
        Get estimated time for training a single epoch based on all set parameters

        A sample of the training batches (forward + backward, without optimizer updates)
        and of the validation batches (forward only) is timed and scaled up to the
        full length of each loader.

        Args:
            None

        Returns:
            float: Total time per epoch in seconds
        '''
        self.system_dict = load_scheduler(self.system_dict)
        self.system_dict = load_optimizer(self.system_dict)
        self.system_dict = load_loss(self.system_dict)

        loaders = self.system_dict["local"]["data_loaders"]
        total_time_per_epoch = self._estimate_pass_seconds(loaders["train"], training=True)
        total_time_per_epoch += self._estimate_pass_seconds(loaders["val"], training=False)
        return total_time_per_epoch

    ###############################################################################################################################################
    ###############################################################################################################################################
    def set_training_evaluation(self):
        '''
        Base function for running validation while training

        Args:
            None

        Returns:
            tuple: (name, value) pair as returned by mx.metric.Accuracy.get()
            float: Test loss summed over all validation batches (not yet divided by the batch count)
        '''
        val_loader = self.system_dict["local"]["data_loaders"]["val"]
        metric = mx.metric.Accuracy()
        test_loss = 0
        pbar = self._progress_bar(len(val_loader))
        try:
            for batch in val_loader:
                if(pbar is not None):
                    pbar.update()
                data, label = self._split_batch_to_ctx(batch)
                # Deliberately NOT wrapped in ag.record(): recording switches the
                # network to training mode and builds a graph that validation never uses.
                outputs, losses = self._forward_with_loss(data, label)
                test_loss += self._mean_batch_loss(losses)
                metric.update(label, outputs)
        finally:
            if(pbar is not None):
                pbar.close()
        return metric.get(), test_loss

    ###############################################################################################################################################
    ###############################################################################################################################################
    def set_training_final(self):
        '''
        Main training function

        Runs either a fresh training or, when the "resume_train" state is set, continues
        a previous one: epochs already counted in "epochs_completed" are skipped and the
        saved history logs, best validation accuracy and training time are carried over.

        Args:
            None

        Returns:
            None

        Raises:
            ConstraintError: If called in eval_infer mode without resume_train being set
        '''
        resuming = bool(self.system_dict["states"]["resume_train"])
        if((not resuming) and self.system_dict["states"]["eval_infer"]):
            msg = "Cannot train in testing (eval_infer) mode.\n"
            msg += "Tip - use new_experiment function with a copy_from argument.\n"
            raise ConstraintError(msg)

        self.custom_print("Training Resume" if resuming else "Training Start")

        self.system_dict = load_scheduler(self.system_dict)
        self.system_dict = load_optimizer(self.system_dict)
        self.system_dict = load_loss(self.system_dict)

        # Looked up only after the load_* calls, which hand back the system dict.
        settings = self.system_dict["training"]["settings"]
        outputs = self.system_dict["training"]["outputs"]
        model = self.system_dict["local"]["model"]
        model_dir = self.system_dict["model_dir"]
        log_dir = self.system_dict["log_dir"]
        train_loader = self.system_dict["local"]["data_loaders"]["train"]
        num_epochs = self.system_dict["hyper-parameters"]["num_epochs"]
        batch_size = self.system_dict["dataset"]["params"]["batch_size"]
        save_logs = settings["save_training_logs"]

        metric = mx.metric.Accuracy()
        trainer = mx.gluon.Trainer(model.collect_params(), optimizer=self.system_dict["local"]["optimizer"])

        self.system_dict["training"]["status"] = False

        histories = None
        if(save_logs):
            if(resuming):
                # NOTE(review): allow_pickle=True unpickles whatever is stored in these
                # files; only safe as long as the log directory is trusted.
                histories = {name: list(np.load(log_dir + name + ".npy", allow_pickle=True))
                             for name in self._HISTORY_NAMES}
            else:
                histories = {name: [] for name in self._HISTORY_NAMES}

        num_batch_train = len(train_loader)
        num_batch_val = len(self.system_dict["local"]["data_loaders"]["val"])

        best_acc = 0.0
        best_acc_epoch = 0
        if(resuming):
            # Carry over the best result of the interrupted run so that a worse
            # epoch after resuming cannot overwrite "best_model".
            try:
                best_acc = float(outputs.get("best_val_acc", 0.0))
            except (TypeError, ValueError):
                best_acc = 0.0
            stored_epoch = outputs.get("best_val_acc_epoch_num", 0)
            if(isinstance(stored_epoch, int)):
                best_acc_epoch = stored_epoch

        # Stays None until at least one epoch has actually been trained.
        time_elapsed_since = None

        for epoch in range(num_epochs):
            if(settings["display_progress"]):
                self.custom_print(' Epoch {}/{}'.format(epoch+1, num_epochs))
                self.custom_print(' ' + '-' * 10)

            if(resuming and epoch < outputs["epochs_completed"]):
                self.custom_print("Skipping Current Epoch")
                self.custom_print("")
                self.custom_print("")
                continue

            since = time.time()
            train_loss = 0
            metric.reset()

            pbar = self._progress_bar(num_batch_train)
            try:
                for batch in train_loader:
                    if(pbar is not None):
                        pbar.update()
                    data, label = self._split_batch_to_ctx(batch)
                    with ag.record():
                        batch_outputs, losses = self._forward_with_loss(data, label)
                    for l in losses:
                        l.backward()
                    trainer.step(batch_size)
                    train_loss += self._mean_batch_loss(losses)
                    metric.update(label, batch_outputs)
            finally:
                if(pbar is not None):
                    pbar.close()

            _, train_acc = metric.get()
            train_loss /= num_batch_train

            val_metric, val_loss = self.set_training_evaluation()
            val_acc = val_metric[1]
            val_loss /= num_batch_val

            if(self.system_dict["model"]["params"]["use_gpu"]):
                gpus = GPUtil.getGPUs()
                # Only the first reported GPU is sampled; skip when none is reported.
                if(gpus):
                    gpu_memory_used = int(gpus[0].memoryUsed)
                    if(outputs["max_gpu_memory_usage"] < gpu_memory_used):
                        outputs["max_gpu_memory_usage"] = gpu_memory_used

            if(save_logs):
                histories["val_acc_history"].append(val_acc)
                histories["val_loss_history"].append(val_loss)
                histories["train_acc_history"].append(train_acc)
                histories["train_loss_history"].append(train_loss)

            if(settings["save_intermediate_models"]):
                model.export(model_dir + settings["intermediate_model_prefix"], epoch=epoch)

            if(val_acc > best_acc):
                best_acc = val_acc
                best_acc_epoch = epoch
                model.export(model_dir + "best_model", epoch=0)

            outputs["best_val_acc"] = "{:4f}".format(best_acc)
            outputs["best_val_acc_epoch_num"] = best_acc_epoch

            # Duration of this epoch plus whatever training time is already stored,
            # so the stored value accumulates across epochs and resumed runs.
            time_elapsed_since = time.time() - since
            if("training_time" in outputs):
                time_elapsed_since += self._parse_training_time(outputs["training_time"])
            outputs["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60)

            if(save_logs):
                self._write_training_logs(histories)

            # Written after every epoch.
            model.export(model_dir + "resume_state", epoch=0)

            if(settings["display_progress_realtime"] and self.system_dict["verbose"]):
                self.custom_print("")
                self.custom_print("")

            if(settings["display_progress"]):
                curr_lr = trainer.learning_rate
                self.custom_print(" curr_lr - {}".format(curr_lr))
                self.custom_print(' [Epoch %d] Train-acc: %.3f, Train-loss: %.3f | Val-acc: %.3f, Val-loss: %.3f, | time: %.1f sec' %
                                  (epoch+1, train_acc, train_loss, val_acc, val_loss, time.time() - since))
                self.custom_print("")

            outputs["epochs_completed"] = epoch+1
            save(self.system_dict)

        if(time_elapsed_since is None):
            # No epoch ran (zero epochs requested, or everything was already completed
            # before resuming): fall back to the stored training time, if any.
            try:
                time_elapsed_since = self._parse_training_time(outputs["training_time"])
            except (KeyError, ValueError, AttributeError):
                time_elapsed_since = 0

        if(settings["display_progress"]):
            self.custom_print(' Training completed in: {:.0f}m {:.0f}s'.format(time_elapsed_since // 60, time_elapsed_since % 60))
            self.custom_print(' Best val Acc: {:4f}'.format(best_acc))
            self.custom_print("")

        if(not self.system_dict["states"]["eval_infer"]):
            self.custom_print("Training End")
            self.custom_print("")
            outputs["best_val_acc"] = "{:4f}".format(best_acc)
            outputs["best_val_acc_epoch_num"] = best_acc_epoch
            outputs["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60)
            outputs["max_gpu_usage"] = str(outputs["max_gpu_memory_usage"]) + " Mb"

            model.export(model_dir + "final", epoch=0)

            if(save_logs):
                self.custom_print("Training Outputs")
                self.custom_print(" Model Dir: {}".format(model_dir))
                self.custom_print(" Log Dir: {}".format(log_dir))
                self.custom_print(" Final model: {}".format("final"))
                self.custom_print(" Best model: {}".format("best_model"))
                self.custom_print(" Log 1 - Validation accuracy history log: {}".format("val_acc_history.npy"))
                self.custom_print(" Log 2 - Validation loss history log: {}".format("val_loss_history.npy"))
                self.custom_print(" Log 3 - Training accuracy history log: {}".format("train_acc_history.npy"))
                self.custom_print(" Log 4 - Training loss history log: {}".format("train_loss_history.npy"))
                # NOTE(review): Log 5 and Log 6 report the training-loss log file rather
                # than a plot file; presumably these should name the images written by the
                # create_train_test_plots_* helpers - confirm the real file names.
                self.custom_print(" Log 5 - Training curve: {}".format("train_loss_history.npy"))
                self.custom_print(" Log 6 - Validation curve: {}".format("train_loss_history.npy"))
                self.custom_print("")

                self._write_training_logs(histories)

                log_dir_relative = self.system_dict["log_dir_relative"]
                for name in self._HISTORY_NAMES:
                    outputs["log_" + name] = log_dir + name + ".npy"
                for name in self._HISTORY_NAMES:
                    outputs["log_" + name + "_relative"] = log_dir_relative + name + ".npy"

            self.system_dict["training"]["status"] = True
###############################################################################################################################################
Classes
class finetune_training (verbose=1)
-
Base class for training and associated functions
Args
verbose
:int
- Set verbosity level: 0 - print nothing; 1 - print desired details
Expand source code
class finetune_training(finetune_model): ''' Base class for training and associated functions Args: verbose (int): Set verbosity levels 0 - Print Nothing 1 - Print desired details ''' def __init__(self, verbose=1): super().__init__(verbose=verbose); ############################################################################################################################################### def get_training_estimate(self): ''' Get estimated time for training a single epoch based on all set parameters Args: None Returns: float: Total time per epoch in seconds ''' total_time_per_epoch = 0; self.system_dict = load_scheduler(self.system_dict); self.system_dict = load_optimizer(self.system_dict); self.system_dict = load_loss(self.system_dict); num_iterations_train = len(self.system_dict["local"]["data_loaders"]["train"])//10; num_iterations_val = len(self.system_dict["local"]["data_loaders"]["val"])//10; since = time.time(); train_loss = 0; for i, batch in enumerate(self.system_dict["local"]["data_loaders"]["train"]): data = mx.gluon.utils.split_and_load(batch[0], ctx_list=self.system_dict["local"]["ctx"], batch_axis=0, even_split=False) label = mx.gluon.utils.split_and_load(batch[1], ctx_list=self.system_dict["local"]["ctx"], batch_axis=0, even_split=False) with ag.record(): outputs = [self.system_dict["local"]["model"](X) for X in data] loss = [self.system_dict["local"]["criterion"](yhat, y) for yhat, y in zip(outputs, label)] for l in loss: l.backward() train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) if(i==num_iterations_train): break; for i, batch in enumerate(self.system_dict["local"]["data_loaders"]["val"]): data = mx.gluon.utils.split_and_load(batch[0], ctx_list=self.system_dict["local"]["ctx"], batch_axis=0, even_split=False) label = mx.gluon.utils.split_and_load(batch[1], ctx_list=self.system_dict["local"]["ctx"], batch_axis=0, even_split=False) with ag.record(): outputs = [self.system_dict["local"]["model"](X) for X in data] loss = 
[self.system_dict["local"]["criterion"](yhat, y) for yhat, y in zip(outputs, label)] if(i==num_iterations_val): break; total_time_per_epoch = (time.time() - since)*10; return total_time_per_epoch; ############################################################################################################################################### ############################################################################################################################################### def set_training_evaluation(self): ''' Base function for running validation while training Args: None Returns: dict: Validation metrics float: Test Loss ''' num_batch = len(self.system_dict["local"]["data_loaders"]["val"]); metric = mx.metric.Accuracy() if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]): pbar = tqdm(total=num_batch); test_loss = 0; for i, batch in enumerate(self.system_dict["local"]["data_loaders"]["val"]): if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]): pbar.update(); data = mx.gluon.utils.split_and_load(batch[0], ctx_list=self.system_dict["local"]["ctx"], batch_axis=0, even_split=False) label = mx.gluon.utils.split_and_load(batch[1], ctx_list=self.system_dict["local"]["ctx"], batch_axis=0, even_split=False) with ag.record(): outputs = [self.system_dict["local"]["model"](X) for X in data] loss = [self.system_dict["local"]["criterion"](yhat, y) for yhat, y in zip(outputs, label)] test_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) metric.update(label, outputs) return metric.get(), test_loss; ############################################################################################################################################### ############################################################################################################################################### def set_training_final(self): ''' Main training function Args: None 
Returns: None ''' if(self.system_dict["states"]["resume_train"]): self.custom_print("Training Resume"); self.system_dict = load_scheduler(self.system_dict); self.system_dict = load_optimizer(self.system_dict); self.system_dict = load_loss(self.system_dict); metric = mx.metric.Accuracy(); trainer = mx.gluon.Trainer(self.system_dict["local"]["model"].collect_params(), optimizer=self.system_dict["local"]["optimizer"]); self.system_dict["training"]["status"] = False; since = time.time() pid = os.getpid(); if(self.system_dict["training"]["settings"]["save_training_logs"]): val_acc_history = list(np.load(self.system_dict["log_dir"] + "val_acc_history.npy", allow_pickle=True)); train_acc_history = list(np.load(self.system_dict["log_dir"] + "train_acc_history.npy", allow_pickle=True)); val_loss_history = list(np.load(self.system_dict["log_dir"] + "val_loss_history.npy", allow_pickle=True)); train_loss_history = list(np.load(self.system_dict["log_dir"] + "train_loss_history.npy", allow_pickle=True)); num_batch_train = len(self.system_dict["local"]["data_loaders"]["train"]); num_batch_val = len(self.system_dict["local"]["data_loaders"]["val"]); best_acc = 0.0; best_acc_epoch = 0; max_gpu_usage = 0; for epoch in range(self.system_dict["hyper-parameters"]["num_epochs"]): if(self.system_dict["training"]["settings"]["display_progress"]): self.custom_print(' Epoch {}/{}'.format(epoch+1, self.system_dict["hyper-parameters"]["num_epochs"])) self.custom_print(' ' + '-' * 10) if(epoch < self.system_dict["training"]["outputs"]["epochs_completed"]): self.custom_print("Skipping Current Epoch"); self.custom_print(""); self.custom_print(""); continue; since = time.time(); train_loss = 0 metric.reset() if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]): pbar = tqdm(total=num_batch_train); for i, batch in enumerate(self.system_dict["local"]["data_loaders"]["train"]): 
if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]): pbar.update(); data = mx.gluon.utils.split_and_load(batch[0], ctx_list=self.system_dict["local"]["ctx"], batch_axis=0, even_split=False) label = mx.gluon.utils.split_and_load(batch[1], ctx_list=self.system_dict["local"]["ctx"], batch_axis=0, even_split=False) with ag.record(): outputs = [self.system_dict["local"]["model"](X) for X in data] loss = [self.system_dict["local"]["criterion"](yhat, y) for yhat, y in zip(outputs, label)] for l in loss: l.backward() trainer.step(self.system_dict["dataset"]["params"]["batch_size"]); train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) metric.update(label, outputs) _, train_acc = metric.get() train_loss /= num_batch_train; val_acc, val_loss = self.set_training_evaluation(); val_acc = val_acc[1]; val_loss /= num_batch_val; if(self.system_dict["model"]["params"]["use_gpu"]): GPUs = GPUtil.getGPUs() gpuMemoryUsed = GPUs[0].memoryUsed if(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] < int(gpuMemoryUsed)): self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] = int(gpuMemoryUsed); if(self.system_dict["training"]["settings"]["save_training_logs"]): val_acc_history.append(val_acc); val_loss_history.append(val_loss); train_acc_history.append(train_acc); train_loss_history.append(train_loss); if(val_acc > best_acc): best_acc = val_acc; best_acc_epoch = epoch; if(self.system_dict["training"]["settings"]["save_intermediate_models"]): self.system_dict["local"]["model"].export(self.system_dict["model_dir"] + self.system_dict["training"]["settings"]["intermediate_model_prefix"], epoch=epoch) self.system_dict["local"]["model"].export(self.system_dict["model_dir"] + "best_model", epoch=0); self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc); self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch; time_elapsed_since = time.time() - since; 
if("training_time" in self.system_dict["training"]["outputs"].keys()): minutes, seconds = self.system_dict["training"]["outputs"]["training_time"].split(" "); minutes = int(minutes[:len(minutes)-1]); seconds = int(seconds[:len(seconds)-1]); time_elapsed_since += minutes*60 + seconds; self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60); if(self.system_dict["training"]["settings"]["save_training_logs"]): np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history)); np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history)); np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history)); np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history)); create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True); create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True); self.system_dict["local"]["model"].export(self.system_dict["model_dir"] + "resume_state", epoch=0); if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]): self.custom_print(""); self.custom_print(""); if(self.system_dict["training"]["settings"]["display_progress"]): curr_lr = trainer.learning_rate self.custom_print(" curr_lr - {}".format(curr_lr)); self.custom_print(' [Epoch %d] Train-acc: %.3f, Train-loss: %.3f | Val-acc: %3f, Val-loss: %.3f, | time: %.1f sec' % (epoch+1, train_acc, train_loss, val_acc, val_loss, time.time() - since)); self.custom_print(""); self.system_dict["training"]["outputs"]["epochs_completed"] = epoch+1; save(self.system_dict); elif(self.system_dict["states"]["eval_infer"]): msg = "Cannot train in testing (eval_infer) mode.\n"; msg += "Tip - 
use new_experiment function with a copy_from argument.\n"; raise ConstraintError(msg); else: self.custom_print("Training Start"); self.system_dict = load_scheduler(self.system_dict); self.system_dict = load_optimizer(self.system_dict); self.system_dict = load_loss(self.system_dict); metric = mx.metric.Accuracy(); trainer = mx.gluon.Trainer(self.system_dict["local"]["model"].collect_params(), optimizer=self.system_dict["local"]["optimizer"]); self.system_dict["training"]["status"] = False; pid = os.getpid() if(self.system_dict["training"]["settings"]["save_training_logs"]): val_acc_history = []; train_acc_history = []; val_loss_history = []; train_loss_history = []; num_batch_train = len(self.system_dict["local"]["data_loaders"]["train"]); num_batch_val = len(self.system_dict["local"]["data_loaders"]["val"]); best_acc = 0.0; best_acc_epoch = 0; max_gpu_usage = 0; for epoch in range(self.system_dict["hyper-parameters"]["num_epochs"]): if(self.system_dict["training"]["settings"]["display_progress"]): self.custom_print(' Epoch {}/{}'.format(epoch+1, self.system_dict["hyper-parameters"]["num_epochs"])) self.custom_print(' ' + '-' * 10) since = time.time(); train_loss = 0 metric.reset() if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]): pbar = tqdm(total=num_batch_train); for i, batch in enumerate(self.system_dict["local"]["data_loaders"]["train"]): if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]): pbar.update(); data = mx.gluon.utils.split_and_load(batch[0], ctx_list=self.system_dict["local"]["ctx"], batch_axis=0, even_split=False) label = mx.gluon.utils.split_and_load(batch[1], ctx_list=self.system_dict["local"]["ctx"], batch_axis=0, even_split=False) with ag.record(): outputs = [self.system_dict["local"]["model"](X) for X in data] loss = [self.system_dict["local"]["criterion"](yhat, y) for yhat, y in zip(outputs, label)] for l in loss: l.backward() 
trainer.step(self.system_dict["dataset"]["params"]["batch_size"]); train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) metric.update(label, outputs) _, train_acc = metric.get() train_loss /= num_batch_train; val_acc, val_loss = self.set_training_evaluation(); val_acc = val_acc[1]; val_loss /= num_batch_val; if(self.system_dict["model"]["params"]["use_gpu"]): GPUs = GPUtil.getGPUs() gpuMemoryUsed = GPUs[0].memoryUsed if(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] < int(gpuMemoryUsed)): self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] = int(gpuMemoryUsed); if(self.system_dict["training"]["settings"]["save_training_logs"]): val_acc_history.append(val_acc); val_loss_history.append(val_loss); train_acc_history.append(train_acc); train_loss_history.append(train_loss); if(self.system_dict["training"]["settings"]["save_intermediate_models"]): self.system_dict["local"]["model"].export(self.system_dict["model_dir"] + self.system_dict["training"]["settings"]["intermediate_model_prefix"], epoch=epoch) if(val_acc > best_acc): best_acc = val_acc; best_acc_epoch = epoch; self.system_dict["local"]["model"].export(self.system_dict["model_dir"] + "best_model", epoch=0); self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc); self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch; time_elapsed_since = time.time() - since; if("training_time" in self.system_dict["training"]["outputs"].keys()): minutes, seconds = self.system_dict["training"]["outputs"]["training_time"].split(" "); minutes = int(minutes[:len(minutes)-1]); seconds = int(seconds[:len(seconds)-1]); time_elapsed_since += minutes*60 + seconds; self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60); if(self.system_dict["training"]["settings"]["save_training_logs"]): np.save(self.system_dict["log_dir"] + "val_acc_history.npy", 
np.array(val_acc_history)); np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history)); np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history)); np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history)); create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True); create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True); self.system_dict["local"]["model"].export(self.system_dict["model_dir"] + "resume_state", epoch=0); if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]): self.custom_print(""); self.custom_print(""); if(self.system_dict["training"]["settings"]["display_progress"]): curr_lr = trainer.learning_rate self.custom_print(" curr_lr - {}".format(curr_lr)); self.custom_print(' [Epoch %d] Train-acc: %.3f, Train-loss: %.3f | Val-acc: %3f, Val-loss: %.3f, | time: %.1f sec' % (epoch+1, train_acc, train_loss, val_acc, val_loss, time.time() - since)); self.custom_print(""); self.system_dict["training"]["outputs"]["epochs_completed"] = epoch+1; save(self.system_dict); if(self.system_dict["training"]["settings"]["display_progress"]): self.custom_print(' Training completed in: {:.0f}m {:.0f}s'.format(time_elapsed_since // 60, time_elapsed_since % 60)) self.custom_print(' Best val Acc: {:4f}'.format(best_acc)) self.custom_print(""); if(not self.system_dict["states"]["eval_infer"]): self.custom_print("Training End"); self.custom_print(""); self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc); self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch; self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, 
time_elapsed_since % 60); self.system_dict["training"]["outputs"]["max_gpu_usage"] = str(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"]) + " Mb"; self.system_dict["local"]["model"].export(self.system_dict["model_dir"] + "final", epoch=0); if(self.system_dict["training"]["settings"]["save_training_logs"]): self.custom_print("Training Outputs"); self.custom_print(" Model Dir: {}".format(self.system_dict["model_dir"])); self.custom_print(" Log Dir: {}".format(self.system_dict["log_dir"])); self.custom_print(" Final model: {}".format("final")); self.custom_print(" Best model: {}".format("best_model")); self.custom_print(" Log 1 - Validation accuracy history log: {}".format("val_acc_history.npy")); self.custom_print(" Log 2 - Validation loss history log: {}".format("val_loss_history.npy")); self.custom_print(" Log 3 - Training accuracy history log: {}".format("train_acc_history.npy")); self.custom_print(" Log 4 - Training loss history log: {}".format("train_loss_history.npy")); self.custom_print(" Log 5 - Training curve: {}".format("train_loss_history.npy")); self.custom_print(" Log 6 - Validation curve: {}".format("train_loss_history.npy")); self.custom_print(""); np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history)); np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history)); np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history)); np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history)); self.system_dict["training"]["outputs"]["log_val_acc_history"] = self.system_dict["log_dir"] + "val_acc_history.npy"; self.system_dict["training"]["outputs"]["log_val_loss_history"] = self.system_dict["log_dir"] + "val_loss_history.npy"; self.system_dict["training"]["outputs"]["log_train_acc_history"] = self.system_dict["log_dir"] + "train_acc_history.npy"; self.system_dict["training"]["outputs"]["log_train_loss_history"] = 
self.system_dict["log_dir"] + "train_loss_history.npy"; self.system_dict["training"]["outputs"]["log_val_acc_history_relative"] = self.system_dict["log_dir_relative"] + "val_acc_history.npy"; self.system_dict["training"]["outputs"]["log_val_loss_history_relative"] = self.system_dict["log_dir_relative"] + "val_loss_history.npy"; self.system_dict["training"]["outputs"]["log_train_acc_history_relative"] = self.system_dict["log_dir_relative"] + "train_acc_history.npy"; self.system_dict["training"]["outputs"]["log_train_loss_history_relative"] = self.system_dict["log_dir_relative"] + "train_loss_history.npy"; create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True); create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True); self.system_dict["training"]["status"] = True;
Ancestors
- gluon.finetune.level_2_model_base.finetune_model
- gluon.finetune.level_1_dataset_base.finetune_dataset
- system.base_class.system
Methods
def get_training_estimate(self)
-
Get estimated time for training a single epoch based on all set parameters
Args
None
Returns
float
- Total time per epoch in seconds
Expand source code
def get_training_estimate(self):
    '''
    Get estimated time for training a single epoch based on all set parameters

    Times forward + backward passes over roughly a tenth of the training batches
    and forward passes over roughly a tenth of the validation batches, then scales
    each measurement by (total batches / batches actually processed).
    No optimizer step is taken.

    Args:
        None

    Returns:
        float: Total time per epoch in seconds
    '''
    self.system_dict = load_scheduler(self.system_dict)
    self.system_dict = load_optimizer(self.system_dict)
    self.system_dict = load_loss(self.system_dict)

    # Fetched after the load_* calls because they reassign self.system_dict.
    local = self.system_dict["local"]
    ctx = local["ctx"]
    model = local["model"]
    criterion = local["criterion"]

    def split(part):
        # Distribute one half of a batch (inputs or labels) over the configured contexts.
        return mx.gluon.utils.split_and_load(part, ctx_list=ctx, batch_axis=0, even_split=False)

    def timed_sample(loader, with_backward):
        # Time a sample of `loader` and extrapolate to its full length.
        num_batches = len(loader)
        if num_batches == 0:
            return 0.0
        sample_size = max(1, num_batches // 10)
        processed = 0
        since = time.time()
        for batch in loader:
            data = split(batch[0])
            label = split(batch[1])
            if with_backward:
                with ag.record():
                    outputs = [model(X) for X in data]
                    loss = [criterion(yhat, y) for yhat, y in zip(outputs, label)]
                for batch_loss in loss:
                    batch_loss.backward()
            else:
                # Validation only needs the forward pass; nothing is recorded.
                outputs = [model(X) for X in data]
                loss = [criterion(yhat, y) for yhat, y in zip(outputs, label)]
            processed += 1
            if processed >= sample_size:
                break
        # MXNet queues operations asynchronously; wait for them to finish
        # before reading the clock so the queued work is actually measured.
        mx.nd.waitall()
        if processed == 0:
            return 0.0
        # Scale by the real sampling ratio instead of a fixed x10, which
        # over-estimates badly when the loader has fewer than 10 batches.
        return (time.time() - since) * num_batches / processed

    total_time_per_epoch = timed_sample(local["data_loaders"]["train"], True)
    total_time_per_epoch += timed_sample(local["data_loaders"]["val"], False)
    return total_time_per_epoch
def set_training_evaluation(self)
-
Base function for running validation while training
Args
None
Returns
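tuple
- Validation metric as returned by metric.get(); the caller reads the accuracy value at index 1
float
- Validation loss summed over all batches (the caller divides it by the number of batches)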
Expand source code
def set_training_evaluation(self):
    '''
    Base function for running validation while training

    Runs the model over the whole validation loader, accumulating accuracy
    and the per-batch mean loss.

    Args:
        None

    Returns:
        tuple: Validation metric as returned by mx.metric.Accuracy().get()
               (the caller reads the accuracy value at index 1)
        float: Sum over batches of the mean batch loss; the caller divides
               by the number of validation batches
    '''
    local = self.system_dict["local"]
    val_loader = local["data_loaders"]["val"]
    ctx = local["ctx"]
    model = local["model"]
    criterion = local["criterion"]
    show_realtime = (self.system_dict["training"]["settings"]["display_progress_realtime"]
                     and self.system_dict["verbose"])

    metric = mx.metric.Accuracy()
    pbar = tqdm(total=len(val_loader)) if show_realtime else None
    test_loss = 0
    try:
        for batch in val_loader:
            if pbar is not None:
                pbar.update()
            data = mx.gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
            label = mx.gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
            # Deliberately NOT wrapped in ag.record(): recording switches the
            # network to training mode, which would make dropout/BatchNorm
            # behave as in training while scoring validation data and would
            # build a gradient graph that is never used.
            outputs = [model(X) for X in data]
            loss = [criterion(yhat, y) for yhat, y in zip(outputs, label)]
            test_loss += sum([batch_loss.mean().asscalar() for batch_loss in loss]) / len(loss)
            metric.update(label, outputs)
    finally:
        # Release the progress bar even if a batch fails.
        if pbar is not None:
            pbar.close()
    return metric.get(), test_loss
def set_training_final(self)
-
Main training function
Args
None
Returns
None
Expand source code
def set_training_final(self):
    '''
    Main training function

    Trains the model for the configured number of epochs, or resumes a previous
    run when the "resume_train" state is set. After every epoch it runs
    validation, exports "best_model" on improvement, exports "resume_state",
    optionally stores history logs/plots, and saves the system dictionary.
    When finished it exports the "final" model and records summary outputs.

    Args:
        None

    Returns:
        None

    Raises:
        ConstraintError: If called in eval_infer state without resume_train set
    '''
    resuming = self.system_dict["states"]["resume_train"]
    if (not resuming) and self.system_dict["states"]["eval_infer"]:
        msg = "Cannot train in testing (eval_infer) mode.\n"
        msg += "Tip - use new_experiment function with a copy_from argument.\n"
        raise ConstraintError(msg)

    self.custom_print("Training Resume" if resuming else "Training Start")
    self.system_dict = load_scheduler(self.system_dict)
    self.system_dict = load_optimizer(self.system_dict)
    self.system_dict = load_loss(self.system_dict)

    # Everything below is fetched after the load_* calls because they
    # reassign self.system_dict.
    local = self.system_dict["local"]
    model = local["model"]
    criterion = local["criterion"]
    ctx = local["ctx"]
    train_loader = local["data_loaders"]["train"]
    settings = self.system_dict["training"]["settings"]
    show_progress = settings["display_progress"]
    show_realtime = settings["display_progress_realtime"] and self.system_dict["verbose"]
    save_logs = settings["save_training_logs"]
    model_dir = self.system_dict["model_dir"]
    log_dir = self.system_dict["log_dir"]
    num_epochs = self.system_dict["hyper-parameters"]["num_epochs"]
    batch_size = self.system_dict["dataset"]["params"]["batch_size"]

    metric = mx.metric.Accuracy()
    trainer = mx.gluon.Trainer(model.collect_params(), optimizer=local["optimizer"])
    self.system_dict["training"]["status"] = False

    def stored_seconds(outputs):
        # Parse the accumulated "<M>m <S>s" training time; 0 when none is stored yet.
        if "training_time" not in outputs:
            return 0
        minutes, seconds = outputs["training_time"].split(" ")
        return int(minutes[:-1]) * 60 + int(seconds[:-1])

    def write_history_logs():
        # Persist the four history arrays to the log directory.
        np.save(log_dir + "val_acc_history.npy", np.array(val_acc_history))
        np.save(log_dir + "val_loss_history.npy", np.array(val_loss_history))
        np.save(log_dir + "train_acc_history.npy", np.array(train_acc_history))
        np.save(log_dir + "train_loss_history.npy", np.array(train_loss_history))

    def write_history_plots():
        # Regenerate the accuracy and loss curves from the histories.
        create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], log_dir, show_img=False, save_img=True)
        create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], log_dir, show_img=False, save_img=True)

    if save_logs:
        if resuming:
            # Continue the histories written by the interrupted run.
            val_acc_history = list(np.load(log_dir + "val_acc_history.npy", allow_pickle=True))
            train_acc_history = list(np.load(log_dir + "train_acc_history.npy", allow_pickle=True))
            val_loss_history = list(np.load(log_dir + "val_loss_history.npy", allow_pickle=True))
            train_loss_history = list(np.load(log_dir + "train_loss_history.npy", allow_pickle=True))
        else:
            val_acc_history = []
            train_acc_history = []
            val_loss_history = []
            train_loss_history = []

    num_batch_train = len(train_loader)
    num_batch_val = len(local["data_loaders"]["val"])

    best_acc = 0.0
    best_acc_epoch = 0
    # Defined before the loop so the summary below still works when no epoch
    # runs (num_epochs == 0, or every epoch already completed on resume).
    time_elapsed_since = 0.0
    if resuming:
        previous = self.system_dict["training"]["outputs"]
        # Carry over the best score of the interrupted run so a worse resumed
        # epoch does not overwrite "best_model" or the recorded best accuracy.
        try:
            best_acc = float(previous["best_val_acc"])
            best_acc_epoch = int(previous["best_val_acc_epoch_num"])
        except (KeyError, TypeError, ValueError):
            best_acc, best_acc_epoch = 0.0, 0
        time_elapsed_since = stored_seconds(previous)

    for epoch in range(num_epochs):
        if show_progress:
            self.custom_print(' Epoch {}/{}'.format(epoch+1, num_epochs))
            self.custom_print(' ' + '-' * 10)

        # Re-read every epoch rather than caching across save() calls.
        outputs = self.system_dict["training"]["outputs"]

        if resuming and epoch < outputs["epochs_completed"]:
            self.custom_print("Skipping Current Epoch")
            self.custom_print("")
            self.custom_print("")
            continue

        since = time.time()
        train_loss = 0
        metric.reset()

        pbar = tqdm(total=num_batch_train) if show_realtime else None
        try:
            for batch in train_loader:
                if pbar is not None:
                    pbar.update()
                data = mx.gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
                label = mx.gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
                with ag.record():
                    batch_outputs = [model(X) for X in data]
                    loss = [criterion(yhat, y) for yhat, y in zip(batch_outputs, label)]
                for batch_loss in loss:
                    batch_loss.backward()
                trainer.step(batch_size)
                train_loss += sum([batch_loss.mean().asscalar() for batch_loss in loss]) / len(loss)
                metric.update(label, batch_outputs)
        finally:
            if pbar is not None:
                pbar.close()

        _, train_acc = metric.get()
        train_loss /= num_batch_train

        val_acc, val_loss = self.set_training_evaluation()
        val_acc = val_acc[1]
        val_loss /= num_batch_val

        if self.system_dict["model"]["params"]["use_gpu"]:
            # Track the peak memory reported for the first GPU.
            gpu_memory_used = int(GPUtil.getGPUs()[0].memoryUsed)
            if outputs["max_gpu_memory_usage"] < gpu_memory_used:
                outputs["max_gpu_memory_usage"] = gpu_memory_used

        if save_logs:
            val_acc_history.append(val_acc)
            val_loss_history.append(val_loss)
            train_acc_history.append(train_acc)
            train_loss_history.append(train_loss)

        # Intermediate checkpoints are written every epoch in both fresh and
        # resumed runs; the best model only on improvement.
        if settings["save_intermediate_models"]:
            model.export(model_dir + settings["intermediate_model_prefix"], epoch=epoch)

        if val_acc > best_acc:
            best_acc = val_acc
            best_acc_epoch = epoch
            model.export(model_dir + "best_model", epoch=0)
            outputs["best_val_acc"] = "{:4f}".format(best_acc)
            outputs["best_val_acc_epoch_num"] = best_acc_epoch

        # Running total = this epoch's duration + the total stored so far.
        time_elapsed_since = time.time() - since
        time_elapsed_since += stored_seconds(outputs)
        outputs["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60)

        if save_logs:
            write_history_logs()
            write_history_plots()

        model.export(model_dir + "resume_state", epoch=0)

        if show_realtime:
            self.custom_print("")
            self.custom_print("")

        if show_progress:
            curr_lr = trainer.learning_rate
            self.custom_print(" curr_lr - {}".format(curr_lr))
            self.custom_print(' [Epoch %d] Train-acc: %.3f, Train-loss: %.3f | Val-acc: %3f, Val-loss: %.3f, | time: %.1f sec' % (epoch+1, train_acc, train_loss, val_acc, val_loss, time.time() - since))
            self.custom_print("")

        outputs["epochs_completed"] = epoch+1
        save(self.system_dict)

    if show_progress:
        self.custom_print(' Training completed in: {:.0f}m {:.0f}s'.format(time_elapsed_since // 60, time_elapsed_since % 60))
        self.custom_print(' Best val Acc: {:4f}'.format(best_acc))
        self.custom_print("")

    if not self.system_dict["states"]["eval_infer"]:
        self.custom_print("Training End")
        self.custom_print("")
        outputs = self.system_dict["training"]["outputs"]
        outputs["best_val_acc"] = "{:4f}".format(best_acc)
        outputs["best_val_acc_epoch_num"] = best_acc_epoch
        outputs["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60)
        outputs["max_gpu_usage"] = str(outputs["max_gpu_memory_usage"]) + " Mb"

        model.export(model_dir + "final", epoch=0)

        if save_logs:
            self.custom_print("Training Outputs")
            self.custom_print(" Model Dir: {}".format(model_dir))
            self.custom_print(" Log Dir: {}".format(log_dir))
            self.custom_print(" Final model: {}".format("final"))
            self.custom_print(" Best model: {}".format("best_model"))
            self.custom_print(" Log 1 - Validation accuracy history log: {}".format("val_acc_history.npy"))
            self.custom_print(" Log 2 - Validation loss history log: {}".format("val_loss_history.npy"))
            self.custom_print(" Log 3 - Training accuracy history log: {}".format("train_acc_history.npy"))
            self.custom_print(" Log 4 - Training loss history log: {}".format("train_loss_history.npy"))
            # NOTE(review): Log 5/6 print the loss-history .npy name for the
            # curve images; looks like a copy-paste - confirm the plot file names.
            self.custom_print(" Log 5 - Training curve: {}".format("train_loss_history.npy"))
            self.custom_print(" Log 6 - Validation curve: {}".format("train_loss_history.npy"))
            self.custom_print("")

            write_history_logs()

            log_dir_relative = self.system_dict["log_dir_relative"]
            outputs["log_val_acc_history"] = log_dir + "val_acc_history.npy"
            outputs["log_val_loss_history"] = log_dir + "val_loss_history.npy"
            outputs["log_train_acc_history"] = log_dir + "train_acc_history.npy"
            outputs["log_train_loss_history"] = log_dir + "train_loss_history.npy"
            outputs["log_val_acc_history_relative"] = log_dir_relative + "val_acc_history.npy"
            outputs["log_val_loss_history_relative"] = log_dir_relative + "val_loss_history.npy"
            outputs["log_train_acc_history_relative"] = log_dir_relative + "train_acc_history.npy"
            outputs["log_train_loss_history_relative"] = log_dir_relative + "train_loss_history.npy"

            write_history_plots()

        # Training is marked complete whether or not logs were saved.
        self.system_dict["training"]["status"] = True