Module monk.pytorch.finetune.level_3_training_base

Expand source code
from pytorch.finetune.imports import *
from system.imports import *
from pytorch.finetune.level_2_model_base import finetune_model



class finetune_training(finetune_model):
    '''
    Base class for training and associated functions

    Args:
        verbose (int): Set verbosity levels
                        0 - Print Nothing
                        1 - Print desired details
    '''

    def __init__(self, verbose=1):
        # Delegate all state initialization (system_dict, directories, etc.)
        # to the finetune_model base class.
        super().__init__(verbose=verbose);



    ###############################################################################################################################################
    def get_training_estimate(self):
        '''
        Get estimated time for training a single epoch based on all set parameters

        Strategy: run roughly one tenth of a real epoch for both the 'train'
        and 'val' phases (including forward pass, loss, backward pass and
        optimizer step) and multiply the measured wall-clock time by 10.

        NOTE(review): this executes real optimizer steps, so estimation
        mutates the model weights as a side effect — confirm this is intended.
        NOTE(review): if a data loader has fewer than 10 batches,
        required_iters is 0 and the loop exits after the first batch, making
        the x10 extrapolation inaccurate for small datasets.

        Args:
            None

        Returns:
            float: Total time per epoch in seconds
        '''
        total_time_per_epoch = 0;

        # Materialize live optimizer, LR scheduler and loss objects into
        # system_dict["local"] from the stored hyper-parameter settings.
        self.system_dict = load_optimizer(self.system_dict);
        self.system_dict = load_scheduler(self.system_dict);
        self.system_dict = load_loss(self.system_dict);

        since = time.time();

        

        for phase in ['train', 'val']:
            # Switch batch-norm / dropout behaviour per phase.
            if phase == 'train':
                self.system_dict["local"]["model"].train()
            else:
                self.system_dict["local"]["model"].eval()

            running_loss = 0.0
            running_corrects = 0

            # Only time ~1/10th of the batches in each phase; the elapsed
            # time is scaled back up by 10 at the end.
            required_iters = len(self.system_dict["local"]["data_loaders"][phase])//10;
            current_iter = 0;

            for inputs, labels in self.system_dict["local"]["data_loaders"][phase]:

                inputs = inputs.to(self.system_dict["local"]["device"]);
                labels = labels.to(self.system_dict["local"]["device"]);

                self.system_dict["local"]["optimizer"].zero_grad();


                # Gradients only in the train phase; val runs without autograd.
                with torch.set_grad_enabled(phase == 'train'):
                    if(self.system_dict["model"]["params"]["model_name"]):
                        # Inception-style nets return (main, aux) logits in
                        # train mode; the auxiliary loss is weighted 0.4 as in
                        # the standard torchvision fine-tuning recipe.
                        if "inception" in self.system_dict["model"]["params"]["model_name"] and phase == 'train':
                            outputs, aux_outputs = self.system_dict["local"]["model"](inputs)
                            loss1 = self.system_dict["local"]["criterion"](outputs, labels)
                            loss2 = self.system_dict["local"]["criterion"](aux_outputs, labels)
                            loss = loss1 + 0.4*loss2
                        else:
                            outputs = self.system_dict["local"]["model"](inputs)
                            loss = self.system_dict["local"]["criterion"](outputs, labels)
                    else:
                        outputs = self.system_dict["local"]["model"](inputs)
                        loss = self.system_dict["local"]["criterion"](outputs, labels)

                    _, preds = torch.max(outputs, 1)

                    if phase == 'train':
                        loss.backward()
                        self.system_dict["local"]["optimizer"].step()

                # Accumulated but never read in this method; kept so the timed
                # work matches a real training iteration as closely as possible.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

                current_iter += 1;
                if(current_iter >= required_iters):
                    break;




        # Ran ~1/10th of an epoch, so scale the elapsed time by 10.
        total_time_per_epoch = (time.time() - since)*10;
        
        return total_time_per_epoch;

    ###############################################################################################################################################





    ###############################################################################################################################################
    def set_training_final(self):
        '''
        Main training function

        Behaviour depends on the experiment state:
            - resume_train: reload optimizer/scheduler/loss and the saved
              history logs, then continue the epoch loop, skipping epochs
              that were already completed before the interruption.
            - eval_infer: training is forbidden; raises ConstraintError.
            - otherwise: train from scratch for the configured num_epochs.

        Side effects: saves "best_model", "resume_state", optional
        intermediate models and the "final" model under model_dir; saves
        accuracy/loss history .npy logs and plots under log_dir; updates
        system_dict["training"]["outputs"] and persists it via save().

        Args:
            None

        Returns:
            None
        '''
        if(self.system_dict["states"]["resume_train"]):
            self.custom_print("Training Resume");
            total_time_per_epoch = 0;

            # Rebuild the live optimizer / scheduler / loss objects from the
            # persisted hyper-parameter settings.
            self.system_dict = load_optimizer(self.system_dict);
            self.system_dict = load_scheduler(self.system_dict);
            self.system_dict = load_loss(self.system_dict);
            
            self.system_dict["training"]["status"] = False;

            pid = os.getpid();

            # Reload the history logs so curves continue from the previous run.
            if(self.system_dict["training"]["settings"]["save_training_logs"]):
                val_acc_history = list(np.load(self.system_dict["log_dir"] + "val_acc_history.npy", allow_pickle=True));
                train_acc_history = list(np.load(self.system_dict["log_dir"] + "train_acc_history.npy", allow_pickle=True));
                val_loss_history = list(np.load(self.system_dict["log_dir"] + "val_loss_history.npy", allow_pickle=True));
                train_loss_history = list(np.load(self.system_dict["log_dir"] + "train_loss_history.npy", allow_pickle=True));

            best_acc = 0.0;
            best_acc_epoch = 0;
            max_gpu_usage = 0;
            # NOTE(review): best_model_wts is snapshotted here and on each
            # improvement but never loaded back into the model; the best model
            # is persisted via torch.save("best_model") instead.
            best_model_wts = copy.deepcopy(self.system_dict["local"]["model"].state_dict());

            for epoch in range(self.system_dict["hyper-parameters"]["num_epochs"]):
                if(self.system_dict["training"]["settings"]["display_progress"]):
                    self.custom_print('    Epoch {}/{}'.format(epoch+1, self.system_dict["hyper-parameters"]["num_epochs"]))
                    self.custom_print('    ' + '-' * 10)


                # Fast-forward over epochs already finished before the interruption.
                if(epoch < self.system_dict["training"]["outputs"]["epochs_completed"]):
                    self.custom_print("Skipping Current Epoch");
                    self.custom_print("");
                    self.custom_print("");
                    continue;

                since = time.time();

                for phase in ['train', 'val']:
                    if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                        pbar=tqdm(total=len(self.system_dict["local"]["data_loaders"][phase]));

                    # Switch batch-norm / dropout behaviour per phase.
                    if phase == 'train':
                        self.system_dict["local"]["model"].train()
                    else:
                        self.system_dict["local"]["model"].eval()

                    running_loss = 0.0
                    running_corrects = 0


                    for inputs, labels in self.system_dict["local"]["data_loaders"][phase]:
                        if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                            pbar.update();
                        inputs = inputs.to(self.system_dict["local"]["device"]);
                        labels = labels.to(self.system_dict["local"]["device"]);

                        self.system_dict["local"]["optimizer"].zero_grad();


                        # Gradients only in the train phase; val runs without autograd.
                        with torch.set_grad_enabled(phase == 'train'):
                            if(self.system_dict["model"]["params"]["model_name"]):
                                # Inception aux-logit loss weighted 0.4 (torchvision recipe).
                                if "inception" in self.system_dict["model"]["params"]["model_name"] and phase == 'train':
                                    outputs, aux_outputs = self.system_dict["local"]["model"](inputs)
                                    loss1 = self.system_dict["local"]["criterion"](outputs, labels)
                                    loss2 = self.system_dict["local"]["criterion"](aux_outputs, labels)
                                    loss = loss1 + 0.4*loss2
                                else:
                                    outputs = self.system_dict["local"]["model"](inputs)
                                    loss = self.system_dict["local"]["criterion"](outputs, labels)
                            else:
                                outputs = self.system_dict["local"]["model"](inputs)
                                loss = self.system_dict["local"]["criterion"](outputs, labels)

                            _, preds = torch.max(outputs, 1)

                            if phase == 'train':
                                loss.backward()
                                self.system_dict["local"]["optimizer"].step()


                        # Weight the batch loss by batch size so epoch_loss is
                        # a true per-sample average (handles a smaller last batch).
                        running_loss += loss.item() * inputs.size(0)
                        running_corrects += torch.sum(preds == labels.data)


                    epoch_loss = running_loss / len(self.system_dict["local"]["data_loaders"][phase].dataset)
                    epoch_acc = running_corrects.double() / len(self.system_dict["local"]["data_loaders"][phase].dataset)


                    # Track peak GPU memory usage (first GPU only, via GPUtil).
                    if(self.system_dict["model"]["params"]["use_gpu"]):
                        GPUs = GPUtil.getGPUs()
                        gpuMemoryUsed = GPUs[0].memoryUsed
                        if(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] < int(gpuMemoryUsed)):
                            self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] = int(gpuMemoryUsed);


                    # NOTE(review): val_acc/train_acc (used below for
                    # best-model tracking and printing) are only assigned when
                    # save_training_logs is enabled; with logs disabled the
                    # `val_acc > best_acc` check raises NameError.
                    if(self.system_dict["training"]["settings"]["save_training_logs"]):
                        if phase == 'val':
                            val_acc = epoch_acc;
                            val_loss = epoch_loss;
                            val_acc_history.append(epoch_acc.cpu().detach().numpy());
                            val_loss_history.append(epoch_loss);
                        else:
                            train_acc = epoch_acc;
                            train_loss = epoch_loss;
                            train_acc_history.append(epoch_acc.cpu().detach().numpy());
                            train_loss_history.append(epoch_loss);

                if(self.system_dict["training"]["settings"]["save_intermediate_models"]):
                    torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + 
                        self.system_dict["training"]["settings"]["intermediate_model_prefix"] + "{}".format(epoch));



                # Persist the new best model whenever validation accuracy improves.
                # NOTE(review): "{:4f}" is a field width of 4, not 4 decimal
                # places — probably intended "{:.4f}".
                if(val_acc > best_acc):
                    best_acc = val_acc;
                    best_acc_epoch = epoch;
                    best_model_wts = copy.deepcopy(self.system_dict["local"]["model"].state_dict());
                    torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "best_model");
                    self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc);
                    self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch;

                # Accumulate total training time across runs: the previous
                # total is stored as a "Xm Ys" string, parsed back to seconds
                # here and added to this epoch's elapsed time.
                time_elapsed_since = time.time() - since;
                if("training_time" in self.system_dict["training"]["outputs"].keys()):
                    minutes, seconds = self.system_dict["training"]["outputs"]["training_time"].split(" ");
                    minutes = int(minutes[:len(minutes)-1]);
                    seconds = int(seconds[:len(seconds)-1]);
                    time_elapsed_since += minutes*60 + seconds;
                self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60);


                # Checkpoint logs and plots every epoch so a crash loses at
                # most one epoch of history.
                if(self.system_dict["training"]["settings"]["save_training_logs"]):
                    np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history), allow_pickle=True);

                    create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True);
                    create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True);
                
                # Checkpoint for resume_train.
                torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "resume_state");

                # ReduceLROnPlateau needs the monitored metric (epoch_loss
                # here is from the last phase iterated, i.e. 'val'); other
                # schedulers step unconditionally.
                if(self.system_dict["local"]["learning_rate_scheduler"]):
                    if(self.system_dict["hyper-parameters"]["learning_rate_scheduler"]["name"] == "reduceonplateaulr"):
                        self.system_dict["local"]["learning_rate_scheduler"].step(epoch_loss);
                    else:
                        self.system_dict["local"]["learning_rate_scheduler"].step();

                if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                    self.custom_print("");
                    self.custom_print("");

                # NOTE(review): '%3f' for Val-acc looks like a typo for '%.3f'.
                if(self.system_dict["training"]["settings"]["display_progress"]):
                    for param_group in self.system_dict["local"]["optimizer"].param_groups:
                        curr_lr = param_group['lr'];
                    self.custom_print("    curr_lr - {}".format(curr_lr));
                    self.custom_print('    [Epoch %d] Train-acc: %.3f, Train-loss: %.3f | Val-acc: %3f, Val-loss: %.3f, | time: %.1f sec' %
                             (epoch+1, train_acc, train_loss, val_acc, val_loss, time.time() - since));
                    self.custom_print("");
                # Persist progress so a later resume_train can skip this epoch.
                self.system_dict["training"]["outputs"]["epochs_completed"] = epoch+1;
                save(self.system_dict);

            if(self.system_dict["training"]["settings"]["display_progress"]):
                self.custom_print('    Training completed in: {:.0f}m {:.0f}s'.format(time_elapsed_since // 60, time_elapsed_since % 60))
                self.custom_print('    Best val Acc:          {:4f}'.format(best_acc))
                self.custom_print("");
            
            




        
        elif(self.system_dict["states"]["eval_infer"]):
            # Experiments opened for evaluation/inference must not be trained.
            msg = "Cannot train in testing (eval_infer) mode.\n";
            msg += "Tip - use new_experiment function with a copy_from argument.\n";
            raise ConstraintError(msg);



        else:
            # Fresh training run. NOTE(review): this branch duplicates the
            # resume branch almost line for line (only log initialization and
            # epoch skipping differ) — a candidate for extraction into a
            # shared helper.
            self.custom_print("Training Start");
            self.system_dict = load_optimizer(self.system_dict);
            self.system_dict = load_scheduler(self.system_dict);
            self.system_dict = load_loss(self.system_dict);


            self.system_dict["training"]["status"] = False;

            pid = os.getpid();

            # Fresh run: start with empty history logs.
            if(self.system_dict["training"]["settings"]["save_training_logs"]):
                val_acc_history = [];
                train_acc_history = [];
                val_loss_history = [];
                train_loss_history = [];

            num_batch_train = len(self.system_dict["local"]["data_loaders"]["train"]);
            num_batch_val = len(self.system_dict["local"]["data_loaders"]["val"]);

            best_acc = 0.0;
            best_acc_epoch = 0;
            max_gpu_usage = 0;
            # NOTE(review): snapshotted but never loaded back (see resume branch).
            best_model_wts = copy.deepcopy(self.system_dict["local"]["model"].state_dict());

            for epoch in range(self.system_dict["hyper-parameters"]["num_epochs"]):
                if(self.system_dict["training"]["settings"]["display_progress"]):
                    self.custom_print('    Epoch {}/{}'.format(epoch+1, self.system_dict["hyper-parameters"]["num_epochs"]))
                    self.custom_print('    ' + '-' * 10)

                since = time.time();

                for phase in ['train', 'val']:
                    if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                        pbar=tqdm(total=len(self.system_dict["local"]["data_loaders"][phase]));

                    # Switch batch-norm / dropout behaviour per phase.
                    if phase == 'train':
                        self.system_dict["local"]["model"].train()
                    else:
                        self.system_dict["local"]["model"].eval()

                    running_loss = 0.0
                    running_corrects = 0


                    for inputs, labels in self.system_dict["local"]["data_loaders"][phase]:
                        if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                            pbar.update();
                        inputs = inputs.to(self.system_dict["local"]["device"]);
                        labels = labels.to(self.system_dict["local"]["device"]);

                        self.system_dict["local"]["optimizer"].zero_grad();


                        # Gradients only in the train phase; val runs without autograd.
                        with torch.set_grad_enabled(phase == 'train'):
                            if(self.system_dict["model"]["params"]["model_name"]):
                                # Inception aux-logit loss weighted 0.4 (torchvision recipe).
                                if "inception" in self.system_dict["model"]["params"]["model_name"] and phase == 'train':
                                    outputs, aux_outputs = self.system_dict["local"]["model"](inputs)
                                    loss1 = self.system_dict["local"]["criterion"](outputs, labels)
                                    loss2 = self.system_dict["local"]["criterion"](aux_outputs, labels)
                                    loss = loss1 + 0.4*loss2
                                else:
                                    outputs = self.system_dict["local"]["model"](inputs)
                                    loss = self.system_dict["local"]["criterion"](outputs, labels)
                            else:
                                outputs = self.system_dict["local"]["model"](inputs)
                                loss = self.system_dict["local"]["criterion"](outputs, labels)

                            _, preds = torch.max(outputs, 1)

                            if phase == 'train':
                                loss.backward()
                                self.system_dict["local"]["optimizer"].step()


                        # Per-sample-weighted accumulation (see resume branch).
                        running_loss += loss.item() * inputs.size(0)
                        running_corrects += torch.sum(preds == labels.data)


                    epoch_loss = running_loss / len(self.system_dict["local"]["data_loaders"][phase].dataset)
                    epoch_acc = running_corrects.double() / len(self.system_dict["local"]["data_loaders"][phase].dataset)


                    # Track peak GPU memory usage (first GPU only, via GPUtil).
                    if(self.system_dict["model"]["params"]["use_gpu"]):
                        GPUs = GPUtil.getGPUs()
                        gpuMemoryUsed = GPUs[0].memoryUsed
                        if(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] < int(gpuMemoryUsed)):
                            self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] = int(gpuMemoryUsed);


                    # NOTE(review): same NameError hazard as the resume branch
                    # when save_training_logs is disabled (val_acc unbound below).
                    if(self.system_dict["training"]["settings"]["save_training_logs"]):
                        if phase == 'val':
                            val_acc = epoch_acc;
                            val_loss = epoch_loss;
                            val_acc_history.append(epoch_acc.cpu().detach().numpy());
                            val_loss_history.append(epoch_loss);
                        else:
                            train_acc = epoch_acc;
                            train_loss = epoch_loss;
                            train_acc_history.append(epoch_acc.cpu().detach().numpy());
                            train_loss_history.append(epoch_loss);


                if(self.system_dict["training"]["settings"]["save_intermediate_models"]):
                    torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + 
                        self.system_dict["training"]["settings"]["intermediate_model_prefix"] + "{}".format(epoch));



                # Persist the new best model whenever validation accuracy improves.
                if(val_acc > best_acc):
                    best_acc = val_acc;
                    best_acc_epoch = epoch;
                    best_model_wts = copy.deepcopy(self.system_dict["local"]["model"].state_dict());
                    torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "best_model");
                    self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc);
                    self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch;

                # Accumulate total training time (parse previous "Xm Ys" value).
                time_elapsed_since = time.time() - since;
                if("training_time" in self.system_dict["training"]["outputs"].keys()):
                    minutes, seconds = self.system_dict["training"]["outputs"]["training_time"].split(" ");
                    minutes = int(minutes[:len(minutes)-1]);
                    seconds = int(seconds[:len(seconds)-1]);
                    time_elapsed_since += minutes*60 + seconds;
                self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60);


                # Checkpoint logs and plots every epoch.
                if(self.system_dict["training"]["settings"]["save_training_logs"]):
                    np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history), allow_pickle=True);

                    create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True);
                    create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True);
                
                # Checkpoint for resume_train.
                torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "resume_state");

                # Scheduler step (metric only for ReduceLROnPlateau).
                if(self.system_dict["local"]["learning_rate_scheduler"]):
                    if(self.system_dict["hyper-parameters"]["learning_rate_scheduler"]["name"] == "reduceonplateaulr"):
                        self.system_dict["local"]["learning_rate_scheduler"].step(epoch_loss);
                    else:
                        self.system_dict["local"]["learning_rate_scheduler"].step();

                if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                    self.custom_print("");
                    self.custom_print("");

                # NOTE(review): '%3f' for Val-acc looks like a typo for '%.3f'.
                if(self.system_dict["training"]["settings"]["display_progress"]):
                    for param_group in self.system_dict["local"]["optimizer"].param_groups:
                        curr_lr = param_group['lr'];
                    self.custom_print("    curr_lr - {}".format(curr_lr));
                    self.custom_print('    [Epoch %d] Train-acc: %.3f, Train-loss: %.3f | Val-acc: %3f, Val-loss: %.3f, | time: %.1f sec' %
                             (epoch+1, train_acc, train_loss, val_acc, val_loss, time.time() - since));
                    self.custom_print("");
                # Persist progress so a later resume_train can skip this epoch.
                self.system_dict["training"]["outputs"]["epochs_completed"] = epoch+1;
                save(self.system_dict);

            if(self.system_dict["training"]["settings"]["display_progress"]):
                self.custom_print('    Training completed in: {:.0f}m {:.0f}s'.format(time_elapsed_since // 60, time_elapsed_since % 60))
                self.custom_print('    Best val Acc:          {:4f}'.format(best_acc))
                self.custom_print("");


        
        # Finalization (both resume and fresh paths end up here): record
        # summary outputs, save the final model, persist logs/plots, and mark
        # training complete.
        if(not self.system_dict["states"]["eval_infer"]):
            self.custom_print("Training End");
            self.custom_print("");
            self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc);
            self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch;
            self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60);
            self.system_dict["training"]["outputs"]["max_gpu_usage"] = str(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"]) + " Mb";

            torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "final");

            if(self.system_dict["training"]["settings"]["save_training_logs"]): 
                self.custom_print("Training Outputs");
                self.custom_print("    Model Dir:   {}".format(self.system_dict["model_dir"]));
                self.custom_print("    Log Dir:     {}".format(self.system_dict["log_dir"]));
                self.custom_print("    Final model: {}".format("final"));
                self.custom_print("    Best model:  {}".format("best_model"));
                self.custom_print("    Log 1 - Validation accuracy history log: {}".format("val_acc_history.npy"));
                self.custom_print("    Log 2 - Validation loss history log:     {}".format("val_loss_history.npy"));
                self.custom_print("    Log 3 - Training accuracy history log:   {}".format("train_acc_history.npy"));
                self.custom_print("    Log 4 - Training loss history log:       {}".format("train_loss_history.npy"));
                # NOTE(review): Log 5 and Log 6 print "train_loss_history.npy"
                # again — looks like a copy-paste slip; they presumably should
                # name the training/validation curve image files.
                self.custom_print("    Log 5 - Training curve:                  {}".format("train_loss_history.npy"));
                self.custom_print("    Log 6 - Validation curve:                {}".format("train_loss_history.npy"));
                self.custom_print("");

                np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history), allow_pickle=True);
                
                # Record absolute and relative log paths in the outputs dict.
                self.system_dict["training"]["outputs"]["log_val_acc_history"] = self.system_dict["log_dir"] + "val_acc_history.npy";
                self.system_dict["training"]["outputs"]["log_val_loss_history"] = self.system_dict["log_dir"] + "val_loss_history.npy";
                self.system_dict["training"]["outputs"]["log_train_acc_history"] = self.system_dict["log_dir"] + "train_acc_history.npy";
                self.system_dict["training"]["outputs"]["log_train_loss_history"] = self.system_dict["log_dir"] + "train_loss_history.npy";

                self.system_dict["training"]["outputs"]["log_val_acc_history_relative"] = self.system_dict["log_dir_relative"] + "val_acc_history.npy";
                self.system_dict["training"]["outputs"]["log_val_loss_history_relative"] = self.system_dict["log_dir_relative"] + "val_loss_history.npy";
                self.system_dict["training"]["outputs"]["log_train_acc_history_relative"] = self.system_dict["log_dir_relative"] + "train_acc_history.npy";
                self.system_dict["training"]["outputs"]["log_train_loss_history_relative"] = self.system_dict["log_dir_relative"] + "train_loss_history.npy";


                create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True);
                create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True);

            self.system_dict["training"]["status"] = True;
        

    ###############################################################################################################################################

Classes

class finetune_training (verbose=1)

Base class for training and associated functions

Args

verbose : int
Set verbosity levels 0 - Print Nothing 1 - Print desired details
Expand source code
class finetune_training(finetune_model):
    '''
    Base class wrapping the training loop and its associated helper routines.

    Args:
        verbose (int): Verbosity level.
                        0 - Print Nothing
                        1 - Print desired details
    '''

    def __init__(self, verbose=1):
        '''Initialize the trainer by delegating to the model base class.'''
        super(finetune_training, self).__init__(verbose=verbose);



    ###############################################################################################################################################
    def get_training_estimate(self):
        '''
        Get estimated time for training a single epoch based on all set parameters

        Runs a short sample of batches (roughly a tenth of each data loader)
        through the full train/val step and extrapolates the elapsed time to a
        complete epoch.

        Args:
            None

        Returns:
            float: Estimated total time per epoch in seconds
        '''
        total_time_per_epoch = 0;

        # Optimizer, scheduler and loss must be instantiated before any
        # forward/backward pass can be timed.
        self.system_dict = load_optimizer(self.system_dict);
        self.system_dict = load_scheduler(self.system_dict);
        self.system_dict = load_loss(self.system_dict);

        for phase in ['train', 'val']:
            if phase == 'train':
                self.system_dict["local"]["model"].train()
            else:
                self.system_dict["local"]["model"].eval()

            running_loss = 0.0
            running_corrects = 0

            total_iters = len(self.system_dict["local"]["data_loaders"][phase]);
            # Sample ~10% of the batches, but always at least one so that tiny
            # datasets (fewer than 10 batches) still produce a measurement.
            # (The previous len//10 could be 0, which broke the extrapolation.)
            required_iters = max(1, total_iters // 10);
            current_iter = 0;

            phase_start = time.time();

            for inputs, labels in self.system_dict["local"]["data_loaders"][phase]:

                inputs = inputs.to(self.system_dict["local"]["device"]);
                labels = labels.to(self.system_dict["local"]["device"]);

                self.system_dict["local"]["optimizer"].zero_grad();

                # Gradients are only needed for the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    if(self.system_dict["model"]["params"]["model_name"]):
                        # Inception networks return auxiliary logits while
                        # training; both heads contribute to the loss.
                        if "inception" in self.system_dict["model"]["params"]["model_name"] and phase == 'train':
                            outputs, aux_outputs = self.system_dict["local"]["model"](inputs)
                            loss1 = self.system_dict["local"]["criterion"](outputs, labels)
                            loss2 = self.system_dict["local"]["criterion"](aux_outputs, labels)
                            loss = loss1 + 0.4*loss2
                        else:
                            outputs = self.system_dict["local"]["model"](inputs)
                            loss = self.system_dict["local"]["criterion"](outputs, labels)
                    else:
                        outputs = self.system_dict["local"]["model"](inputs)
                        loss = self.system_dict["local"]["criterion"](outputs, labels)

                    _, preds = torch.max(outputs, 1)

                    if phase == 'train':
                        loss.backward()
                        self.system_dict["local"]["optimizer"].step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

                current_iter += 1;
                if(current_iter >= required_iters):
                    break;

            # Scale the sampled time by the actual fraction of the epoch that
            # was measured, instead of a hard-coded x10 which was inaccurate
            # whenever the loader size was not an exact multiple of 10.
            total_time_per_epoch += (time.time() - phase_start) * (total_iters / required_iters);

        return total_time_per_epoch;

    ###############################################################################################################################################





    ###############################################################################################################################################
    def set_training_final(self):
        '''
        Main training function

        Runs the full training loop over the configured number of epochs, in
        one of three modes selected from ``system_dict`` state:
          - resume_train: reload saved histories, skip completed epochs, continue;
          - eval_infer:   training is not allowed, raises ConstraintError;
          - fresh run:    train from scratch with empty history logs.

        Args:
            None

        Returns:
            None
        '''
        if(self.system_dict["states"]["resume_train"]):
            # ---------------- Resume branch: continue an interrupted run ----------------
            self.custom_print("Training Resume");
            total_time_per_epoch = 0;

            self.system_dict = load_optimizer(self.system_dict);
            self.system_dict = load_scheduler(self.system_dict);
            self.system_dict = load_loss(self.system_dict);

            self.system_dict["training"]["status"] = False;

            pid = os.getpid();

            # Reload accuracy/loss histories saved by the interrupted run so
            # logs and plots continue seamlessly across the resume.
            if(self.system_dict["training"]["settings"]["save_training_logs"]):
                val_acc_history = list(np.load(self.system_dict["log_dir"] + "val_acc_history.npy", allow_pickle=True));
                train_acc_history = list(np.load(self.system_dict["log_dir"] + "train_acc_history.npy", allow_pickle=True));
                val_loss_history = list(np.load(self.system_dict["log_dir"] + "val_loss_history.npy", allow_pickle=True));
                train_loss_history = list(np.load(self.system_dict["log_dir"] + "train_loss_history.npy", allow_pickle=True));

            best_acc = 0.0;
            best_acc_epoch = 0;
            max_gpu_usage = 0;
            best_model_wts = copy.deepcopy(self.system_dict["local"]["model"].state_dict());

            for epoch in range(self.system_dict["hyper-parameters"]["num_epochs"]):
                if(self.system_dict["training"]["settings"]["display_progress"]):
                    self.custom_print('    Epoch {}/{}'.format(epoch+1, self.system_dict["hyper-parameters"]["num_epochs"]))
                    self.custom_print('    ' + '-' * 10)


                # Skip epochs that were already completed before the interruption.
                if(epoch < self.system_dict["training"]["outputs"]["epochs_completed"]):
                    self.custom_print("Skipping Current Epoch");
                    self.custom_print("");
                    self.custom_print("");
                    continue;

                since = time.time();

                for phase in ['train', 'val']:
                    if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                        pbar=tqdm(total=len(self.system_dict["local"]["data_loaders"][phase]));

                    if phase == 'train':
                        self.system_dict["local"]["model"].train() 
                    else:
                        self.system_dict["local"]["model"].eval()  

                    running_loss = 0.0
                    running_corrects = 0


                    for inputs, labels in self.system_dict["local"]["data_loaders"][phase]:
                        if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                            pbar.update();
                        inputs = inputs.to(self.system_dict["local"]["device"]);
                        labels = labels.to(self.system_dict["local"]["device"]);

                        self.system_dict["local"]["optimizer"].zero_grad();


                        # Gradients are only tracked during the training phase.
                        with torch.set_grad_enabled(phase == 'train'):
                            if(self.system_dict["model"]["params"]["model_name"]):
                                # Inception networks return auxiliary logits while
                                # training; both heads contribute to the loss.
                                if "inception" in self.system_dict["model"]["params"]["model_name"] and phase == 'train':
                                    outputs, aux_outputs = self.system_dict["local"]["model"](inputs)
                                    loss1 = self.system_dict["local"]["criterion"](outputs, labels)
                                    loss2 = self.system_dict["local"]["criterion"](aux_outputs, labels)
                                    loss = loss1 + 0.4*loss2
                                else:
                                    outputs = self.system_dict["local"]["model"](inputs)
                                    loss = self.system_dict["local"]["criterion"](outputs, labels)
                            else:
                                outputs = self.system_dict["local"]["model"](inputs)
                                loss = self.system_dict["local"]["criterion"](outputs, labels)

                            _, preds = torch.max(outputs, 1)

                            if phase == 'train':
                                loss.backward()
                                self.system_dict["local"]["optimizer"].step()


                        running_loss += loss.item() * inputs.size(0)
                        running_corrects += torch.sum(preds == labels.data)


                    epoch_loss = running_loss / len(self.system_dict["local"]["data_loaders"][phase].dataset)
                    epoch_acc = running_corrects.double() / len(self.system_dict["local"]["data_loaders"][phase].dataset)


                    # Track peak GPU memory usage across the whole run (GPU 0 only).
                    if(self.system_dict["model"]["params"]["use_gpu"]):
                        GPUs = GPUtil.getGPUs()
                        gpuMemoryUsed = GPUs[0].memoryUsed
                        if(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] < int(gpuMemoryUsed)):
                            self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] = int(gpuMemoryUsed);


                    if(self.system_dict["training"]["settings"]["save_training_logs"]):
                        if phase == 'val':
                            val_acc = epoch_acc;
                            val_loss = epoch_loss;
                            val_acc_history.append(epoch_acc.cpu().detach().numpy());
                            val_loss_history.append(epoch_loss);
                        else:
                            train_acc = epoch_acc;
                            train_loss = epoch_loss;
                            train_acc_history.append(epoch_acc.cpu().detach().numpy());
                            train_loss_history.append(epoch_loss);

                if(self.system_dict["training"]["settings"]["save_intermediate_models"]):
                    torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + 
                        self.system_dict["training"]["settings"]["intermediate_model_prefix"] + "{}".format(epoch));



                # NOTE(review): val_acc is only assigned when save_training_logs is
                # enabled; with logging disabled this comparison raises NameError —
                # confirm the intended coupling between the two settings.
                if(val_acc > best_acc):
                    best_acc = val_acc;
                    best_acc_epoch = epoch;
                    best_model_wts = copy.deepcopy(self.system_dict["local"]["model"].state_dict());
                    torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "best_model");
                    self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc);
                    self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch;

                # Accumulate wall-clock training time across resumes by parsing the
                # previously stored "Xm Ys" string back into seconds.
                time_elapsed_since = time.time() - since;
                if("training_time" in self.system_dict["training"]["outputs"].keys()):
                    minutes, seconds = self.system_dict["training"]["outputs"]["training_time"].split(" ");
                    minutes = int(minutes[:len(minutes)-1]);
                    seconds = int(seconds[:len(seconds)-1]);
                    time_elapsed_since += minutes*60 + seconds;
                self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60);


                # Persist histories and plots after every epoch so a crash loses
                # at most one epoch of logs.
                if(self.system_dict["training"]["settings"]["save_training_logs"]):
                    np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history), allow_pickle=True);

                    create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True);
                    create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True);

                # Checkpoint used by the resume_train path on a later run.
                torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "resume_state");

                # ReduceLROnPlateau needs a metric; epoch_loss here is the loss of
                # the last phase iterated ('val').
                if(self.system_dict["local"]["learning_rate_scheduler"]):
                    if(self.system_dict["hyper-parameters"]["learning_rate_scheduler"]["name"] == "reduceonplateaulr"):
                        self.system_dict["local"]["learning_rate_scheduler"].step(epoch_loss);
                    else:
                        self.system_dict["local"]["learning_rate_scheduler"].step();

                if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                    self.custom_print("");
                    self.custom_print("");

                if(self.system_dict["training"]["settings"]["display_progress"]):
                    for param_group in self.system_dict["local"]["optimizer"].param_groups:
                        curr_lr = param_group['lr'];
                    self.custom_print("    curr_lr - {}".format(curr_lr));
                    # NOTE(review): 'Val-acc: %3f' uses width 3, not precision 3 —
                    # likely intended '%.3f' like the other fields; confirm.
                    self.custom_print('    [Epoch %d] Train-acc: %.3f, Train-loss: %.3f | Val-acc: %3f, Val-loss: %.3f, | time: %.1f sec' %
                             (epoch+1, train_acc, train_loss, val_acc, val_loss, time.time() - since));
                    self.custom_print("");
                self.system_dict["training"]["outputs"]["epochs_completed"] = epoch+1;
                save(self.system_dict);

            if(self.system_dict["training"]["settings"]["display_progress"]):
                self.custom_print('    Training completed in: {:.0f}m {:.0f}s'.format(time_elapsed_since // 60, time_elapsed_since % 60))
                self.custom_print('    Best val Acc:          {:4f}'.format(best_acc))
                self.custom_print("");








        elif(self.system_dict["states"]["eval_infer"]):
            # ---------------- Inference-only experiments cannot be trained ----------------
            msg = "Cannot train in testing (eval_infer) mode.\n";
            msg += "Tip - use new_experiment function with a copy_from argument.\n";
            raise ConstraintError(msg);



        else:
            # ---------------- Fresh run: train from scratch ----------------
            # NOTE(review): this branch duplicates the resume branch above except
            # for history initialization and epoch skipping — candidate for a
            # shared helper in a future refactor.
            self.custom_print("Training Start");
            self.system_dict = load_optimizer(self.system_dict);
            self.system_dict = load_scheduler(self.system_dict);
            self.system_dict = load_loss(self.system_dict);


            self.system_dict["training"]["status"] = False;

            pid = os.getpid();

            if(self.system_dict["training"]["settings"]["save_training_logs"]):
                val_acc_history = [];
                train_acc_history = [];
                val_loss_history = [];
                train_loss_history = [];

            # NOTE(review): num_batch_train / num_batch_val are computed but never
            # read in this function.
            num_batch_train = len(self.system_dict["local"]["data_loaders"]["train"]);
            num_batch_val = len(self.system_dict["local"]["data_loaders"]["val"]);

            best_acc = 0.0;
            best_acc_epoch = 0;
            max_gpu_usage = 0;
            best_model_wts = copy.deepcopy(self.system_dict["local"]["model"].state_dict());

            for epoch in range(self.system_dict["hyper-parameters"]["num_epochs"]):
                if(self.system_dict["training"]["settings"]["display_progress"]):
                    self.custom_print('    Epoch {}/{}'.format(epoch+1, self.system_dict["hyper-parameters"]["num_epochs"]))
                    self.custom_print('    ' + '-' * 10)

                since = time.time();

                for phase in ['train', 'val']:
                    if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                        pbar=tqdm(total=len(self.system_dict["local"]["data_loaders"][phase]));

                    if phase == 'train':
                        self.system_dict["local"]["model"].train() 
                    else:
                        self.system_dict["local"]["model"].eval()  

                    running_loss = 0.0
                    running_corrects = 0


                    for inputs, labels in self.system_dict["local"]["data_loaders"][phase]:
                        if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                            pbar.update();
                        inputs = inputs.to(self.system_dict["local"]["device"]);
                        labels = labels.to(self.system_dict["local"]["device"]);

                        self.system_dict["local"]["optimizer"].zero_grad();


                        # Gradients are only tracked during the training phase.
                        with torch.set_grad_enabled(phase == 'train'):
                            if(self.system_dict["model"]["params"]["model_name"]):
                                # Inception networks return auxiliary logits while
                                # training; both heads contribute to the loss.
                                if "inception" in self.system_dict["model"]["params"]["model_name"] and phase == 'train':
                                    outputs, aux_outputs = self.system_dict["local"]["model"](inputs)
                                    loss1 = self.system_dict["local"]["criterion"](outputs, labels)
                                    loss2 = self.system_dict["local"]["criterion"](aux_outputs, labels)
                                    loss = loss1 + 0.4*loss2
                                else:
                                    outputs = self.system_dict["local"]["model"](inputs)
                                    loss = self.system_dict["local"]["criterion"](outputs, labels)
                            else:
                                outputs = self.system_dict["local"]["model"](inputs)
                                loss = self.system_dict["local"]["criterion"](outputs, labels)

                            _, preds = torch.max(outputs, 1)

                            if phase == 'train':
                                loss.backward()
                                self.system_dict["local"]["optimizer"].step()


                        running_loss += loss.item() * inputs.size(0)
                        running_corrects += torch.sum(preds == labels.data)


                    epoch_loss = running_loss / len(self.system_dict["local"]["data_loaders"][phase].dataset)
                    epoch_acc = running_corrects.double() / len(self.system_dict["local"]["data_loaders"][phase].dataset)


                    # Track peak GPU memory usage across the whole run (GPU 0 only).
                    if(self.system_dict["model"]["params"]["use_gpu"]):
                        GPUs = GPUtil.getGPUs()
                        gpuMemoryUsed = GPUs[0].memoryUsed
                        if(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] < int(gpuMemoryUsed)):
                            self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] = int(gpuMemoryUsed);


                    if(self.system_dict["training"]["settings"]["save_training_logs"]):
                        if phase == 'val':
                            val_acc = epoch_acc;
                            val_loss = epoch_loss;
                            val_acc_history.append(epoch_acc.cpu().detach().numpy());
                            val_loss_history.append(epoch_loss);
                        else:
                            train_acc = epoch_acc;
                            train_loss = epoch_loss;
                            train_acc_history.append(epoch_acc.cpu().detach().numpy());
                            train_loss_history.append(epoch_loss);


                if(self.system_dict["training"]["settings"]["save_intermediate_models"]):
                    torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + 
                        self.system_dict["training"]["settings"]["intermediate_model_prefix"] + "{}".format(epoch));



                # NOTE(review): val_acc is only assigned when save_training_logs is
                # enabled; with logging disabled this comparison raises NameError —
                # confirm the intended coupling between the two settings.
                if(val_acc > best_acc):
                    best_acc = val_acc;
                    best_acc_epoch = epoch;
                    best_model_wts = copy.deepcopy(self.system_dict["local"]["model"].state_dict());
                    torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "best_model");
                    self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc);
                    self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch;

                # Accumulate wall-clock training time across runs by parsing the
                # previously stored "Xm Ys" string back into seconds.
                time_elapsed_since = time.time() - since;
                if("training_time" in self.system_dict["training"]["outputs"].keys()):
                    minutes, seconds = self.system_dict["training"]["outputs"]["training_time"].split(" ");
                    minutes = int(minutes[:len(minutes)-1]);
                    seconds = int(seconds[:len(seconds)-1]);
                    time_elapsed_since += minutes*60 + seconds;
                self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60);


                # Persist histories and plots after every epoch so a crash loses
                # at most one epoch of logs.
                if(self.system_dict["training"]["settings"]["save_training_logs"]):
                    np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history), allow_pickle=True);
                    np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history), allow_pickle=True);

                    create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True);
                    create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True);

                # Checkpoint used by the resume_train path on a later run.
                torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "resume_state");

                # ReduceLROnPlateau needs a metric; epoch_loss here is the loss of
                # the last phase iterated ('val').
                if(self.system_dict["local"]["learning_rate_scheduler"]):
                    if(self.system_dict["hyper-parameters"]["learning_rate_scheduler"]["name"] == "reduceonplateaulr"):
                        self.system_dict["local"]["learning_rate_scheduler"].step(epoch_loss);
                    else:
                        self.system_dict["local"]["learning_rate_scheduler"].step();

                if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                    self.custom_print("");
                    self.custom_print("");

                if(self.system_dict["training"]["settings"]["display_progress"]):
                    for param_group in self.system_dict["local"]["optimizer"].param_groups:
                        curr_lr = param_group['lr'];
                    self.custom_print("    curr_lr - {}".format(curr_lr));
                    # NOTE(review): 'Val-acc: %3f' uses width 3, not precision 3 —
                    # likely intended '%.3f' like the other fields; confirm.
                    self.custom_print('    [Epoch %d] Train-acc: %.3f, Train-loss: %.3f | Val-acc: %3f, Val-loss: %.3f, | time: %.1f sec' %
                             (epoch+1, train_acc, train_loss, val_acc, val_loss, time.time() - since));
                    self.custom_print("");
                self.system_dict["training"]["outputs"]["epochs_completed"] = epoch+1;
                save(self.system_dict);

            if(self.system_dict["training"]["settings"]["display_progress"]):
                self.custom_print('    Training completed in: {:.0f}m {:.0f}s'.format(time_elapsed_since // 60, time_elapsed_since % 60))
                self.custom_print('    Best val Acc:          {:4f}'.format(best_acc))
                self.custom_print("");



        # ---------------- Post-training bookkeeping (both training modes) ----------------
        if(not self.system_dict["states"]["eval_infer"]):
            self.custom_print("Training End");
            self.custom_print("");
            self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc);
            self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch;
            self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60);
            self.system_dict["training"]["outputs"]["max_gpu_usage"] = str(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"]) + " Mb";

            torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "final");

            if(self.system_dict["training"]["settings"]["save_training_logs"]): 
                self.custom_print("Training Outputs");
                self.custom_print("    Model Dir:   {}".format(self.system_dict["model_dir"]));
                self.custom_print("    Log Dir:     {}".format(self.system_dict["log_dir"]));
                self.custom_print("    Final model: {}".format("final"));
                self.custom_print("    Best model:  {}".format("best_model"));
                self.custom_print("    Log 1 - Validation accuracy history log: {}".format("val_acc_history.npy"));
                self.custom_print("    Log 2 - Validation loss history log:     {}".format("val_loss_history.npy"));
                self.custom_print("    Log 3 - Training accuracy history log:   {}".format("train_acc_history.npy"));
                self.custom_print("    Log 4 - Training loss history log:       {}".format("train_loss_history.npy"));
                # NOTE(review): Log 5 and Log 6 both print "train_loss_history.npy";
                # presumably these should name the generated curve image files.
                self.custom_print("    Log 5 - Training curve:                  {}".format("train_loss_history.npy"));
                self.custom_print("    Log 6 - Validation curve:                {}".format("train_loss_history.npy"));
                self.custom_print("");

                np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history), allow_pickle=True);

                self.system_dict["training"]["outputs"]["log_val_acc_history"] = self.system_dict["log_dir"] + "val_acc_history.npy";
                self.system_dict["training"]["outputs"]["log_val_loss_history"] = self.system_dict["log_dir"] + "val_loss_history.npy";
                self.system_dict["training"]["outputs"]["log_train_acc_history"] = self.system_dict["log_dir"] + "train_acc_history.npy";
                self.system_dict["training"]["outputs"]["log_train_loss_history"] = self.system_dict["log_dir"] + "train_loss_history.npy";

                self.system_dict["training"]["outputs"]["log_val_acc_history_relative"] = self.system_dict["log_dir_relative"] + "val_acc_history.npy";
                self.system_dict["training"]["outputs"]["log_val_loss_history_relative"] = self.system_dict["log_dir_relative"] + "val_loss_history.npy";
                self.system_dict["training"]["outputs"]["log_train_acc_history_relative"] = self.system_dict["log_dir_relative"] + "train_acc_history.npy";
                self.system_dict["training"]["outputs"]["log_train_loss_history_relative"] = self.system_dict["log_dir_relative"] + "train_loss_history.npy";


                create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True);
                create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True);

            self.system_dict["training"]["status"] = True;

Ancestors

  • pytorch.finetune.level_2_model_base.finetune_model
  • pytorch.finetune.level_1_dataset_base.finetune_dataset
  • system.base_class.system

Methods

def get_training_estimate(self)

Get estimated time for training a single epoch based on all set parameters

Args

None
 

Returns

float
Total time per epoch in seconds
Expand source code
def get_training_estimate(self):
    '''
    Get estimated time for training a single epoch based on all set parameters

    Runs roughly a tenth of each data loader through the full train/val step
    and extrapolates the elapsed time (x10) to a complete epoch.

    Args:
        None

    Returns:
        float: Total time per epoch in seconds
    '''
    total_time_per_epoch = 0;

    # Optimizer, scheduler and loss must be instantiated before any
    # forward/backward pass can be timed.
    self.system_dict = load_optimizer(self.system_dict);
    self.system_dict = load_scheduler(self.system_dict);
    self.system_dict = load_loss(self.system_dict);

    since = time.time();

    

    for phase in ['train', 'val']:
        if phase == 'train':
            self.system_dict["local"]["model"].train() 
        else:
            self.system_dict["local"]["model"].eval()  

        running_loss = 0.0
        running_corrects = 0

        # NOTE(review): for loaders with fewer than 10 batches this is 0, so
        # exactly one batch runs before the break below; the x10 extrapolation
        # then over-estimates — confirm whether small datasets matter here.
        required_iters = len(self.system_dict["local"]["data_loaders"][phase])//10;
        current_iter = 0;

        for inputs, labels in self.system_dict["local"]["data_loaders"][phase]:

            inputs = inputs.to(self.system_dict["local"]["device"]);
            labels = labels.to(self.system_dict["local"]["device"]);

            self.system_dict["local"]["optimizer"].zero_grad();


            # Gradients are only tracked during the training phase.
            with torch.set_grad_enabled(phase == 'train'):
                if(self.system_dict["model"]["params"]["model_name"]):
                    # Inception networks return auxiliary logits while training;
                    # both heads contribute to the loss.
                    if "inception" in self.system_dict["model"]["params"]["model_name"] and phase == 'train':
                        outputs, aux_outputs = self.system_dict["local"]["model"](inputs)
                        loss1 = self.system_dict["local"]["criterion"](outputs, labels)
                        loss2 = self.system_dict["local"]["criterion"](aux_outputs, labels)
                        loss = loss1 + 0.4*loss2
                    else:
                        outputs = self.system_dict["local"]["model"](inputs)
                        loss = self.system_dict["local"]["criterion"](outputs, labels)
                else:
                    outputs = self.system_dict["local"]["model"](inputs)
                    loss = self.system_dict["local"]["criterion"](outputs, labels)

                _, preds = torch.max(outputs, 1)

                if phase == 'train':
                    loss.backward()
                    self.system_dict["local"]["optimizer"].step()

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)

            # Stop after sampling ~10% of the batches for this phase.
            current_iter += 1;
            if(current_iter >= required_iters):
                break;




    # Measured both phases at ~1/10 of their batches, so scale elapsed time
    # by 10 to estimate a full epoch.
    total_time_per_epoch = (time.time() - since)*10;
    
    return total_time_per_epoch;
def set_training_final(self)

Main training function

Args

None
 

Returns

None
 
Expand source code
def set_training_final(self):
    '''
    Main training function.

    Runs the full train/validation loop for the configured number of
    epochs, tracking per-epoch accuracy and loss, saving intermediate /
    best / resume / final model checkpoints, and (when enabled)
    persisting training logs and plots.

    Behaviour depends on the experiment state:
        - resume_train: reloads saved history logs, restores the best
          recorded validation accuracy, and skips already-completed epochs.
        - eval_infer:   training is not permitted; raises ConstraintError.
        - otherwise:    trains from scratch with fresh logs.

    Args:
        None

    Returns:
        None

    Raises:
        ConstraintError: If called while in testing (eval_infer) mode.
    '''
    if(self.system_dict["states"]["resume_train"]):
        self.custom_print("Training Resume");
        total_time_per_epoch = 0;

        self.system_dict = load_optimizer(self.system_dict);
        self.system_dict = load_scheduler(self.system_dict);
        self.system_dict = load_loss(self.system_dict);

        self.system_dict["training"]["status"] = False;

        if(self.system_dict["training"]["settings"]["save_training_logs"]):
            # Reload the history accumulated before the interruption so the
            # saved logs/plots stay continuous across the resume boundary.
            val_acc_history = list(np.load(self.system_dict["log_dir"] + "val_acc_history.npy", allow_pickle=True));
            train_acc_history = list(np.load(self.system_dict["log_dir"] + "train_acc_history.npy", allow_pickle=True));
            val_loss_history = list(np.load(self.system_dict["log_dir"] + "val_loss_history.npy", allow_pickle=True));
            train_loss_history = list(np.load(self.system_dict["log_dir"] + "train_loss_history.npy", allow_pickle=True));

        best_acc = 0.0;
        best_acc_epoch = 0;
        # Restore the previously recorded best accuracy so resuming cannot
        # overwrite the saved best_model with a worse later epoch.
        if(self.system_dict["training"]["outputs"].get("best_val_acc")):
            try:
                best_acc = float(self.system_dict["training"]["outputs"]["best_val_acc"]);
                best_acc_epoch = self.system_dict["training"]["outputs"].get("best_val_acc_epoch_num", 0);
            except (TypeError, ValueError):
                pass;

        # Defaults so the best-model check, progress printout, and the
        # final summary never hit an unbound name — even when logging is
        # disabled or every remaining epoch is skipped.
        val_acc = 0.0;
        val_loss = 0.0;
        train_acc = 0.0;
        train_loss = 0.0;
        time_elapsed_since = 0.0;

        for epoch in range(self.system_dict["hyper-parameters"]["num_epochs"]):
            if(self.system_dict["training"]["settings"]["display_progress"]):
                self.custom_print('    Epoch {}/{}'.format(epoch+1, self.system_dict["hyper-parameters"]["num_epochs"]))
                self.custom_print('    ' + '-' * 10)

            # Epochs already finished before the interruption are skipped.
            if(epoch < self.system_dict["training"]["outputs"]["epochs_completed"]):
                self.custom_print("Skipping Current Epoch");
                self.custom_print("");
                self.custom_print("");
                continue;

            since = time.time();

            for phase in ['train', 'val']:
                pbar = None;
                if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                    pbar = tqdm(total=len(self.system_dict["local"]["data_loaders"][phase]));

                if phase == 'train':
                    self.system_dict["local"]["model"].train()
                else:
                    self.system_dict["local"]["model"].eval()

                running_loss = 0.0
                running_corrects = 0

                for inputs, labels in self.system_dict["local"]["data_loaders"][phase]:
                    if(pbar is not None):
                        pbar.update();
                    inputs = inputs.to(self.system_dict["local"]["device"]);
                    labels = labels.to(self.system_dict["local"]["device"]);

                    self.system_dict["local"]["optimizer"].zero_grad();

                    # Gradients are only needed for the training phase.
                    with torch.set_grad_enabled(phase == 'train'):
                        if(self.system_dict["model"]["params"]["model_name"]):
                            # Inception networks emit auxiliary logits in train
                            # mode; combine both losses (aux weighted 0.4).
                            if "inception" in self.system_dict["model"]["params"]["model_name"] and phase == 'train':
                                outputs, aux_outputs = self.system_dict["local"]["model"](inputs)
                                loss1 = self.system_dict["local"]["criterion"](outputs, labels)
                                loss2 = self.system_dict["local"]["criterion"](aux_outputs, labels)
                                loss = loss1 + 0.4*loss2
                            else:
                                outputs = self.system_dict["local"]["model"](inputs)
                                loss = self.system_dict["local"]["criterion"](outputs, labels)
                        else:
                            outputs = self.system_dict["local"]["model"](inputs)
                            loss = self.system_dict["local"]["criterion"](outputs, labels)

                        _, preds = torch.max(outputs, 1)

                        if phase == 'train':
                            loss.backward()
                            self.system_dict["local"]["optimizer"].step()

                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)

                if(pbar is not None):
                    pbar.close();

                epoch_loss = running_loss / len(self.system_dict["local"]["data_loaders"][phase].dataset)
                epoch_acc = running_corrects.double() / len(self.system_dict["local"]["data_loaders"][phase].dataset)

                if(self.system_dict["model"]["params"]["use_gpu"]):
                    GPUs = GPUtil.getGPUs()
                    gpuMemoryUsed = GPUs[0].memoryUsed
                    if(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] < int(gpuMemoryUsed)):
                        self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] = int(gpuMemoryUsed);

                # Track per-phase metrics unconditionally — the best-model
                # check and the progress printout below rely on them; only
                # the on-disk history lists are gated by save_training_logs.
                if phase == 'val':
                    val_acc = epoch_acc;
                    val_loss = epoch_loss;
                else:
                    train_acc = epoch_acc;
                    train_loss = epoch_loss;

                if(self.system_dict["training"]["settings"]["save_training_logs"]):
                    if phase == 'val':
                        val_acc_history.append(epoch_acc.cpu().detach().numpy());
                        val_loss_history.append(epoch_loss);
                    else:
                        train_acc_history.append(epoch_acc.cpu().detach().numpy());
                        train_loss_history.append(epoch_loss);

            if(self.system_dict["training"]["settings"]["save_intermediate_models"]):
                torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + 
                    self.system_dict["training"]["settings"]["intermediate_model_prefix"] + "{}".format(epoch));

            if(val_acc > best_acc):
                best_acc = val_acc;
                best_acc_epoch = epoch;
                torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "best_model");
                self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc);
                self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch;

            # Accumulate wall-clock time on top of the time recorded before
            # the interruption ("<M>m <S>s" format).
            time_elapsed_since = time.time() - since;
            if("training_time" in self.system_dict["training"]["outputs"].keys()):
                minutes, seconds = self.system_dict["training"]["outputs"]["training_time"].split(" ");
                minutes = int(minutes[:len(minutes)-1]);
                seconds = int(seconds[:len(seconds)-1]);
                time_elapsed_since += minutes*60 + seconds;
            self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60);

            if(self.system_dict["training"]["settings"]["save_training_logs"]):
                np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history), allow_pickle=True);

                create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True);
                create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True);

            # Checkpoint after every epoch so a later resume can pick up here.
            torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "resume_state");

            if(self.system_dict["local"]["learning_rate_scheduler"]):
                # ReduceLROnPlateau needs the monitored metric; other
                # schedulers step unconditionally.
                if(self.system_dict["hyper-parameters"]["learning_rate_scheduler"]["name"] == "reduceonplateaulr"):
                    self.system_dict["local"]["learning_rate_scheduler"].step(epoch_loss);
                else:
                    self.system_dict["local"]["learning_rate_scheduler"].step();

            if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                self.custom_print("");
                self.custom_print("");

            if(self.system_dict["training"]["settings"]["display_progress"]):
                for param_group in self.system_dict["local"]["optimizer"].param_groups:
                    curr_lr = param_group['lr'];
                self.custom_print("    curr_lr - {}".format(curr_lr));
                self.custom_print('    [Epoch %d] Train-acc: %.3f, Train-loss: %.3f | Val-acc: %3f, Val-loss: %.3f, | time: %.1f sec' %
                         (epoch+1, train_acc, train_loss, val_acc, val_loss, time.time() - since));
                self.custom_print("");
            self.system_dict["training"]["outputs"]["epochs_completed"] = epoch+1;
            save(self.system_dict);

        if(self.system_dict["training"]["settings"]["display_progress"]):
            self.custom_print('    Training completed in: {:.0f}m {:.0f}s'.format(time_elapsed_since // 60, time_elapsed_since % 60))
            self.custom_print('    Best val Acc:          {:4f}'.format(best_acc))
            self.custom_print("");

    elif(self.system_dict["states"]["eval_infer"]):
        msg = "Cannot train in testing (eval_infer) mode.\n";
        msg += "Tip - use new_experiment function with a copy_from argument.\n";
        raise ConstraintError(msg);

    else:
        self.custom_print("Training Start");
        self.system_dict = load_optimizer(self.system_dict);
        self.system_dict = load_scheduler(self.system_dict);
        self.system_dict = load_loss(self.system_dict);

        self.system_dict["training"]["status"] = False;

        if(self.system_dict["training"]["settings"]["save_training_logs"]):
            val_acc_history = [];
            train_acc_history = [];
            val_loss_history = [];
            train_loss_history = [];

        best_acc = 0.0;
        best_acc_epoch = 0;
        # Defaults so the best-model check, progress printout, and the
        # final summary never hit an unbound name when logging is disabled
        # or num_epochs is zero.
        val_acc = 0.0;
        val_loss = 0.0;
        train_acc = 0.0;
        train_loss = 0.0;
        time_elapsed_since = 0.0;

        for epoch in range(self.system_dict["hyper-parameters"]["num_epochs"]):
            if(self.system_dict["training"]["settings"]["display_progress"]):
                self.custom_print('    Epoch {}/{}'.format(epoch+1, self.system_dict["hyper-parameters"]["num_epochs"]))
                self.custom_print('    ' + '-' * 10)

            since = time.time();

            for phase in ['train', 'val']:
                pbar = None;
                if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                    pbar = tqdm(total=len(self.system_dict["local"]["data_loaders"][phase]));

                if phase == 'train':
                    self.system_dict["local"]["model"].train()
                else:
                    self.system_dict["local"]["model"].eval()

                running_loss = 0.0
                running_corrects = 0

                for inputs, labels in self.system_dict["local"]["data_loaders"][phase]:
                    if(pbar is not None):
                        pbar.update();
                    inputs = inputs.to(self.system_dict["local"]["device"]);
                    labels = labels.to(self.system_dict["local"]["device"]);

                    self.system_dict["local"]["optimizer"].zero_grad();

                    # Gradients are only needed for the training phase.
                    with torch.set_grad_enabled(phase == 'train'):
                        if(self.system_dict["model"]["params"]["model_name"]):
                            # Inception networks emit auxiliary logits in train
                            # mode; combine both losses (aux weighted 0.4).
                            if "inception" in self.system_dict["model"]["params"]["model_name"] and phase == 'train':
                                outputs, aux_outputs = self.system_dict["local"]["model"](inputs)
                                loss1 = self.system_dict["local"]["criterion"](outputs, labels)
                                loss2 = self.system_dict["local"]["criterion"](aux_outputs, labels)
                                loss = loss1 + 0.4*loss2
                            else:
                                outputs = self.system_dict["local"]["model"](inputs)
                                loss = self.system_dict["local"]["criterion"](outputs, labels)
                        else:
                            outputs = self.system_dict["local"]["model"](inputs)
                            loss = self.system_dict["local"]["criterion"](outputs, labels)

                        _, preds = torch.max(outputs, 1)

                        if phase == 'train':
                            loss.backward()
                            self.system_dict["local"]["optimizer"].step()

                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)

                if(pbar is not None):
                    pbar.close();

                epoch_loss = running_loss / len(self.system_dict["local"]["data_loaders"][phase].dataset)
                epoch_acc = running_corrects.double() / len(self.system_dict["local"]["data_loaders"][phase].dataset)

                if(self.system_dict["model"]["params"]["use_gpu"]):
                    GPUs = GPUtil.getGPUs()
                    gpuMemoryUsed = GPUs[0].memoryUsed
                    if(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] < int(gpuMemoryUsed)):
                        self.system_dict["training"]["outputs"]["max_gpu_memory_usage"] = int(gpuMemoryUsed);

                # Track per-phase metrics unconditionally — the best-model
                # check and the progress printout below rely on them; only
                # the on-disk history lists are gated by save_training_logs.
                if phase == 'val':
                    val_acc = epoch_acc;
                    val_loss = epoch_loss;
                else:
                    train_acc = epoch_acc;
                    train_loss = epoch_loss;

                if(self.system_dict["training"]["settings"]["save_training_logs"]):
                    if phase == 'val':
                        val_acc_history.append(epoch_acc.cpu().detach().numpy());
                        val_loss_history.append(epoch_loss);
                    else:
                        train_acc_history.append(epoch_acc.cpu().detach().numpy());
                        train_loss_history.append(epoch_loss);

            if(self.system_dict["training"]["settings"]["save_intermediate_models"]):
                torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + 
                    self.system_dict["training"]["settings"]["intermediate_model_prefix"] + "{}".format(epoch));

            if(val_acc > best_acc):
                best_acc = val_acc;
                best_acc_epoch = epoch;
                torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "best_model");
                self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc);
                self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch;

            # Accumulate wall-clock time across epochs ("<M>m <S>s" format).
            time_elapsed_since = time.time() - since;
            if("training_time" in self.system_dict["training"]["outputs"].keys()):
                minutes, seconds = self.system_dict["training"]["outputs"]["training_time"].split(" ");
                minutes = int(minutes[:len(minutes)-1]);
                seconds = int(seconds[:len(seconds)-1]);
                time_elapsed_since += minutes*60 + seconds;
            self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60);

            if(self.system_dict["training"]["settings"]["save_training_logs"]):
                np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history), allow_pickle=True);
                np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history), allow_pickle=True);

                create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True);
                create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True);

            # Checkpoint after every epoch so a later resume can pick up here.
            torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "resume_state");

            if(self.system_dict["local"]["learning_rate_scheduler"]):
                # ReduceLROnPlateau needs the monitored metric; other
                # schedulers step unconditionally.
                if(self.system_dict["hyper-parameters"]["learning_rate_scheduler"]["name"] == "reduceonplateaulr"):
                    self.system_dict["local"]["learning_rate_scheduler"].step(epoch_loss);
                else:
                    self.system_dict["local"]["learning_rate_scheduler"].step();

            if(self.system_dict["training"]["settings"]["display_progress_realtime"] and self.system_dict["verbose"]):
                self.custom_print("");
                self.custom_print("");

            if(self.system_dict["training"]["settings"]["display_progress"]):
                for param_group in self.system_dict["local"]["optimizer"].param_groups:
                    curr_lr = param_group['lr'];
                self.custom_print("    curr_lr - {}".format(curr_lr));
                self.custom_print('    [Epoch %d] Train-acc: %.3f, Train-loss: %.3f | Val-acc: %3f, Val-loss: %.3f, | time: %.1f sec' %
                         (epoch+1, train_acc, train_loss, val_acc, val_loss, time.time() - since));
                self.custom_print("");
            self.system_dict["training"]["outputs"]["epochs_completed"] = epoch+1;
            save(self.system_dict);

        if(self.system_dict["training"]["settings"]["display_progress"]):
            self.custom_print('    Training completed in: {:.0f}m {:.0f}s'.format(time_elapsed_since // 60, time_elapsed_since % 60))
            self.custom_print('    Best val Acc:          {:4f}'.format(best_acc))
            self.custom_print("");

    # Final bookkeeping — runs after either training branch (eval_infer
    # raised above and never reaches here).
    if(not self.system_dict["states"]["eval_infer"]):
        self.custom_print("Training End");
        self.custom_print("");
        self.system_dict["training"]["outputs"]["best_val_acc"] = "{:4f}".format(best_acc);
        self.system_dict["training"]["outputs"]["best_val_acc_epoch_num"] = best_acc_epoch;
        self.system_dict["training"]["outputs"]["training_time"] = "{:.0f}m {:.0f}s".format(time_elapsed_since // 60, time_elapsed_since % 60);
        self.system_dict["training"]["outputs"]["max_gpu_usage"] = str(self.system_dict["training"]["outputs"]["max_gpu_memory_usage"]) + " Mb";

        torch.save(self.system_dict["local"]["model"], self.system_dict["model_dir"] + "final");

        if(self.system_dict["training"]["settings"]["save_training_logs"]): 
            self.custom_print("Training Outputs");
            self.custom_print("    Model Dir:   {}".format(self.system_dict["model_dir"]));
            self.custom_print("    Log Dir:     {}".format(self.system_dict["log_dir"]));
            self.custom_print("    Final model: {}".format("final"));
            self.custom_print("    Best model:  {}".format("best_model"));
            self.custom_print("    Log 1 - Validation accuracy history log: {}".format("val_acc_history.npy"));
            self.custom_print("    Log 2 - Validation loss history log:     {}".format("val_loss_history.npy"));
            self.custom_print("    Log 3 - Training accuracy history log:   {}".format("train_acc_history.npy"));
            self.custom_print("    Log 4 - Training loss history log:       {}".format("train_loss_history.npy"));
            self.custom_print("    Log 5 - Training curve:                  {}".format("train_loss_history.npy"));
            self.custom_print("    Log 6 - Validation curve:                {}".format("train_loss_history.npy"));
            self.custom_print("");

            np.save(self.system_dict["log_dir"] + "val_acc_history.npy", np.array(val_acc_history), allow_pickle=True);
            np.save(self.system_dict["log_dir"] + "val_loss_history.npy", np.array(val_loss_history), allow_pickle=True);
            np.save(self.system_dict["log_dir"] + "train_acc_history.npy", np.array(train_acc_history), allow_pickle=True);
            np.save(self.system_dict["log_dir"] + "train_loss_history.npy", np.array(train_loss_history), allow_pickle=True);

            self.system_dict["training"]["outputs"]["log_val_acc_history"] = self.system_dict["log_dir"] + "val_acc_history.npy";
            self.system_dict["training"]["outputs"]["log_val_loss_history"] = self.system_dict["log_dir"] + "val_loss_history.npy";
            self.system_dict["training"]["outputs"]["log_train_acc_history"] = self.system_dict["log_dir"] + "train_acc_history.npy";
            self.system_dict["training"]["outputs"]["log_train_loss_history"] = self.system_dict["log_dir"] + "train_loss_history.npy";

            self.system_dict["training"]["outputs"]["log_val_acc_history_relative"] = self.system_dict["log_dir_relative"] + "val_acc_history.npy";
            self.system_dict["training"]["outputs"]["log_val_loss_history_relative"] = self.system_dict["log_dir_relative"] + "val_loss_history.npy";
            self.system_dict["training"]["outputs"]["log_train_acc_history_relative"] = self.system_dict["log_dir_relative"] + "train_acc_history.npy";
            self.system_dict["training"]["outputs"]["log_train_loss_history_relative"] = self.system_dict["log_dir_relative"] + "train_loss_history.npy";

            create_train_test_plots_accuracy([train_acc_history, val_acc_history], ["Epoch Num", "Accuracy"], self.system_dict["log_dir"], show_img=False, save_img=True);
            create_train_test_plots_loss([train_loss_history, val_loss_history], ["Epoch Num", "Loss"], self.system_dict["log_dir"], show_img=False, save_img=True);

        self.system_dict["training"]["status"] = True;