from . import RegularizationMethod
import torch


class EWC(RegularizationMethod):
    """Elastic Weight Consolidation (EWC).

    The method mitigates forgetting by penalizing changes to parameters that
    were important for previous experiences, where importance is estimated
    with the diagonal of the Fisher information matrix.
    """

    def __init__(self, EWC_lambda=1, temperature=2):
        """
        :param EWC_lambda: weight of the EWC penalty added to the loss.
        :param temperature: kept for interface parity with distillation-based
            methods; it is not used by EWC.
        """
        self.EWC_lambda = EWC_lambda
        self.temperature = temperature
        self.fisher = {}
        self.optpar = {}

    def adapt(self, output, model, **kwargs):
        """Add the EWC penalty to the task loss in ``output['loss']``."""
        ewc_loss = 0
        for n, p in model.named_parameters():
            # Skip parameters without a stored Fisher estimate (e.g. added
            # after the last snapshot).
            if p.requires_grad and n in self.fisher:
                dev = p.device
                penalty = (
                    self.EWC_lambda
                    * self.fisher[n].to(dev)
                    * (p - self.optpar[n].to(dev)).pow(2)
                )
                ewc_loss += penalty.sum()
        output['loss'] += ewc_loss
        return output

    def init_epoch(self, model):
        """Snapshot the current parameters and reset the Fisher estimates."""
        optpar = {}
        fisher = {}
        for n, p in model.module.base_model.model.named_parameters():
            if p.requires_grad:
                fisher[n] = torch.zeros(p.data.shape)
                optpar[n] = p.detach().clone().cpu()
        self.fisher = fisher
        self.optpar = optpar

    def update_fisher(self, model):
        """Accumulate squared gradients into the diagonal Fisher estimate."""
        for n, p in model.module.base_model.model.named_parameters():
            if p.requires_grad and p.grad is not None:
                self.fisher[n] += p.grad.data.pow(2).cpu()
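
# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the method). It assumes a
# wrapper object exposing `.module.base_model.model`, as `init_epoch` and
# `update_fisher` expect (e.g. a DDP-wrapped PEFT model); here a
# `types.SimpleNamespace` mimics that nesting around a toy `nn.Linear`.
# The lambda value, batch sizes, and number of batches are arbitrary.
# Run via `python -m <package>.<this module>` so the relative import resolves.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import types

    import torch.nn as nn

    net = nn.Linear(8, 4)
    wrapper = types.SimpleNamespace(
        module=types.SimpleNamespace(base_model=types.SimpleNamespace(model=net))
    )

    ewc = EWC(EWC_lambda=0.4)
    criterion = nn.CrossEntropyLoss()

    # End of a previous experience: snapshot parameters and accumulate the
    # Fisher estimate from a few batches of old data.
    ewc.init_epoch(wrapper)
    for _ in range(16):
        x, y = torch.randn(32, 8), torch.randint(0, 4, (32,))
        net.zero_grad()
        criterion(net(x), y).backward()
        ewc.update_fisher(wrapper)

    # Training on the new experience: the EWC penalty is added to the loss.
    x, y = torch.randn(32, 8), torch.randint(0, 4, (32,))
    output = ewc.adapt({'loss': criterion(net(x), y)}, net)
    output['loss'].backward()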