"""
//*************************************************************************
// Example program
//					
// @file	sdg.py					
// @author Luis M. Jimenez
// @date 2025
//
// @brief Course: Computer Vision (1782)
// Dpo. of Systems Engineering and Automation
// Automation, Robotics and Computer Vision Lab (ARVC)
// http://arvc.umh.es
// University Miguel Hernandez
//
// @note Description: 
//	- Stochastic Gradient Descent with Logistic Regression Classifier
//
//  Dependencies:
//     matplotlib, numpy
//*************************************************************************
"""
import numpy as np
import matplotlib.pyplot as plt
import copy


# Plot Data / Classification rules
def plotData(data, labels=None, val_data=None, val_labels=None,
              model=None, fig=None, title="Data", show_ids=False):
    """Scatter-plot a 2-D dataset, optionally overlaying a classifier's
    decision regions/boundary and a validation set.

    data       : (m, 2) array of training samples
    labels     : optional (m,) class labels used to color the points
    val_data   : optional (k, 2) array of validation samples (drawn as 'x')
    val_labels : optional (k,) validation labels
    model      : optional classifier exposing predict() / predict_proba()
    fig        : matplotlib figure number (None -> current/new figure)
    title      : plot title
    show_ids   : when True, annotate each validation point with its index
    """
    plt.figure(fig, figsize=(9, 7))
    plt.clf()
    plt.title(title)
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.gcf().canvas.manager.set_window_title('Logistic Regression Classifier')

    # Decision regions and boundary (drawn first so data points stay on top)
    if model is not None:
        # Grid limits cover training plus (optional) validation data
        pts = data if val_data is None else np.concatenate((data, val_data), axis=0)
        xs = np.linspace(pts[:, 0].min() - 1, pts[:, 0].max() + 1, 200)
        ys = np.linspace(pts[:, 1].min() - 1, pts[:, 1].max() + 1, 200)
        xx, yy = np.meshgrid(xs, ys)
        grid = np.c_[xx.ravel(), yy.ravel()]

        # Hard class labels over the grid -> shaded decision regions
        Z = model.predict(grid).reshape(xx.shape)
        plt.contourf(xx, yy, Z, alpha=0.2, cmap='coolwarm')

        # p = 0.5 iso-probability contour -> decision boundary
        Z_proba = model.predict_proba(grid).reshape(xx.shape)
        plt.contour(xx, yy, Z_proba, levels=[0.5], colors='k', linewidths=1)

    # Training samples (colored by class when labels are given)
    if labels is None:
        plt.scatter(data[:, 0], data[:, 1], s=25, marker='o', alpha=0.6, edgecolors='k')
    else:
        plt.scatter(data[:, 0], data[:, 1], s=25, marker='o', c=labels, cmap='coolwarm', alpha=0.6, edgecolors='k')

    # Validation samples, with optional per-point index annotations
    if val_data is not None and val_labels is not None:
        plt.scatter(val_data[:, 0], val_data[:, 1], marker='x', s=40, c=val_labels, cmap='coolwarm', label="Validation Data")
        if show_ids:
            for idx, (px, py) in enumerate(zip(val_data[:, 0], val_data[:, 1])):
                plt.annotate(str(idx), xy=(px, py), xytext=(-3, 3), textcoords='offset points', ha='right', va='bottom')
        plt.legend(loc='lower right')

    plt.draw()
    plt.pause(0.1)  # yield to the GUI event loop so the figure refreshes
# end plotData


# Plot evolution and final model parameters (only theta_1, theta_2)
# background: colormap loss function evaluated for a grid of thetas and data (X,y)
# Evolution of Loss function and gradients during training
def plotTrainHistory(X, y, model, fig=None, label='SGD'):
    """Plot the training history of a fitted LogReg_SDG model.

    Three figures are produced/updated:
      fig   : trajectory of (theta_1, theta_2) over a contour map of the
              loss surface (a 2-D slice evaluated with theta_0 fixed at 0)
      fig+1 : loss evolution per training iteration
      fig+2 : max |gradient| evolution per training iteration

    The loss-surface background is computed only on the first call, so
    successive calls overlay several trainings on the same axes.
    """
    if fig is None:
        fig = plt.gcf().number  # current figure or a new one

    # Function attribute: True only on the very first invocation
    # (used to draw the expensive loss-surface background exactly once)
    if not hasattr(plotTrainHistory, "first"):
        plotTrainHistory.first = True
    else:
        plotTrainHistory.first = False

    theta = model.theta
    history = model.history

    # --- Figure fig: parameter trajectory in (theta_1, theta_2) space ---
    plt.figure(fig, figsize=(10, 8))
    # trajectory of the parameters during training
    plt.plot(history['theta'][:, 1], history['theta'][:, 2], marker='o', linestyle='-', label=label,  markersize=3)
    # final parameters
    plt.scatter(theta[1], theta[2], marker='X', s=100, label="Final "+label)

    plt.xlabel(r"$\theta_1$")
    plt.ylabel(r"$\theta_2$")
    plt.title(r"Parameters evolution in space $(\theta_1, \theta_2)$")
    plt.gcf().canvas.manager.set_window_title("Parameters evolution")
    plt.legend(loc='upper left')
    plt.grid()
    plt.pause(0.1)

    if plotTrainHistory.first:
        # Background: loss-function contour over the (theta_1, theta_2) plane
        ax = plt.gca()  # current axes give us the trajectory's extent
        xmin, xmax = ax.get_xlim()
        ymin, ymax = ax.get_ylim()

        # Sample the parameter plane (small margins beyond the axis limits)
        theta1_vals = np.linspace(xmin, xmax + 0.4, 100)
        theta2_vals = np.linspace(ymin, ymax + 0.1, 100)
        # FIX: contourf(X, Y, Z) requires Z.shape == (len(Y), len(X)); the
        # original allocated the transposed shape, which only worked because
        # both grids happen to have the same length (100).
        J_vals = np.zeros((len(theta2_vals), len(theta1_vals)))

        X_bias = np.c_[np.ones((X.shape[0], 1)), X]
        for i, t1 in enumerate(theta1_vals):
            for j, t2 in enumerate(theta2_vals):
                theta_temp = np.array([0, t1, t2])  # slice with theta_0 = 0
                y_pred_temp = model.predict_proba(X_bias, theta_temp)
                J_vals[j, i] = model.cross_entropy_loss_l2(y, y_pred_temp, theta=theta_temp)

        plt.contourf(theta1_vals, theta2_vals, J_vals, levels=20, cmap='coolwarm', alpha=0.4)
        plt.colorbar(label="Loss function")
        plt.pause(0.1)
    # end if plotTrainHistory.first:

    # --- Figure fig+1: loss evolution ---
    plt.figure(fig+1, figsize=(8, 5))
    plt.plot(history['loss'], label=label)
    plt.xlabel('Epochs')
    plt.ylabel('Loss (Cross-Entropy)')
    plt.title('Loss evolution')
    plt.gcf().canvas.manager.set_window_title('Loss evolution')
    plt.legend(loc='best')
    plt.grid()
    plt.pause(0.1)

    # --- Figure fig+2: gradient magnitude evolution ---
    plt.figure(fig+2, figsize=(8, 5))
    plt.plot(history['grad'], label=label)
    plt.xlabel('Epochs')
    plt.ylabel('Gradients (max(|grads|)')
    plt.title('Gradients evolution')
    plt.gcf().canvas.manager.set_window_title('Gradients evolution')
    plt.legend(loc='best')
    plt.grid()
    plt.pause(0.1)
#end plotTrainHistory

# Logistic Regression class using Stochastic Gradient Descent
class LogReg_SDG:
    """Binary logistic-regression classifier with L2 regularization,
    trained by (mini-batch) stochastic gradient descent.

    Parameters
    ----------
    lr : float
        Learning rate for the gradient updates.
    lambda_reg : float
        L2 regularization strength (the bias theta_0 is never regularized).
    """

    def __init__(self, lr=0.1, lambda_reg=0.1):
        self.lr = lr                  # learning rate
        self.lambda_reg = lambda_reg  # L2 regularization coefficient
        self.theta = np.array([])     # model parameters [theta_0, theta_1, ...]
        self.history = {
            'theta': [],  # parameter trajectory (one entry per update)
            'loss' : [],  # full-dataset loss per iteration
            'grad' : []   # max |gradient component| per iteration
        }

    # Training function
    def fit(self, X, y, epochs=5, iter=300, batch_size=None, theta_init=None):
        """Train the model with mini-batch stochastic gradient descent.

        Parameters
        ----------
        X : (m, n) array of samples (bias column added internally).
        y : (m,) array of {0, 1} labels.
        epochs : int
            Unused; kept for backward compatibility with existing callers.
        iter : int
            Number of gradient updates. (Shadows the builtin, but the name
            is part of the public signature and is kept unchanged.)
        batch_size : int or None
            Samples per update: None -> full-batch GD, 1 -> pure SGD,
            otherwise mini-batch GD.
        theta_init : array or None
            Initial parameters (copied; the caller's array is NOT modified).
            None -> zeros initialization.

        Returns
        -------
        (theta, history) : copies of the final parameters and the training
        history (keys 'theta', 'loss', 'grad' as numpy arrays).
        """
        m, n = X.shape

        if batch_size is None:
            batch_size = m  # full-batch gradient descent (average over all samples)

        # Init model parameters
        if theta_init is not None:
            # FIX: copy (and force float) so the in-place update below never
            # mutates the caller's array; the original aliased theta_init.
            self.theta = np.array(theta_init, dtype=float)
        else:
            self.theta = np.zeros(n + 1)  # zeros initialization

        # Reset history; initial theta is the first trajectory point
        self.history['theta'] = [self.theta.copy()]
        self.history['loss'] = []
        self.history['grad'] = []

        # Add bias column to X
        X_bias = np.c_[np.ones((m, 1)), X]

        for _ in range(iter):
            # Draw a random batch without replacement
            batch_indices = np.random.choice(m, batch_size, replace=False)
            X_batch, y_batch = X_bias[batch_indices], y[batch_indices]

            # Predictions and average cross-entropy gradient for the batch
            y_pred = self.predict_proba(X_batch)
            gradient = (1 / len(y_batch)) * np.dot(X_batch.T, (y_pred - y_batch))

            # L2 regularization term (bias theta_0 excluded)
            gradient[1:] += (self.lambda_reg / m) * self.theta[1:]

            # Gradient step
            self.theta -= self.lr * gradient

            # Record trajectory and diagnostics (loss over the FULL dataset)
            self.history['theta'].append(self.theta.copy())
            self.history['grad'].append(np.abs(gradient).max())  # max |grads|
            self.history['loss'].append(self.cross_entropy_loss_l2(y, self.predict_proba(X_bias)))

        # Convert history lists to arrays for easy slicing/plotting
        self.history['theta'] = np.array(self.history['theta'])
        self.history['grad'] = np.array(self.history['grad'])
        self.history['loss'] = np.array(self.history['loss'])

        # Return copies so callers cannot corrupt the model state
        return self.theta.copy(), copy.deepcopy(self.history)
    # end fit

    # Cross-entropy loss function with L2 regularization
    def cross_entropy_loss_l2(self, y, y_pred, theta=None, lambda_reg=None):
        """Mean cross-entropy loss plus L2 penalty (theta_0 excluded)."""
        m = len(y)
        if theta is None:
            theta = self.theta
        if lambda_reg is None:
            lambda_reg = self.lambda_reg

        # FIX: clip probabilities away from {0, 1} so log() never produces
        # -inf (saturated sigmoid outputs otherwise yield nan/inf loss).
        eps = 1e-15
        y_pred = np.clip(y_pred, eps, 1 - eps)

        loss = -(1 / m) * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
        reg_term = (lambda_reg / (2 * m)) * np.sum(theta[1:] ** 2)  # no penalty on theta_0
        return loss + reg_term
    # end cross_entropy_loss_l2

    def predict(self, X, theta=None):
        """Return hard {0, 1} class labels (decision threshold at z = 0)."""
        if theta is None:
            theta = self.theta
        if X.shape[1] != len(theta):
            X = np.c_[np.ones((X.shape[0], 1)), X]  # add bias column if missing
        return (np.dot(X, theta) >= 0).astype(int)

    def predict_proba(self, X, theta=None):
        """Return sigmoid(X @ theta): probability of class 1 per sample."""
        if theta is None:
            theta = self.theta
        if X.shape[1] != len(theta):
            X = np.c_[np.ones((X.shape[0], 1)), X]  # add bias column if missing
        return 1 / (1 + np.exp(-np.dot(X, theta)))

#end class LogReg_SGD


#########################################################
# Synthetic 2-class dataset: two isotropic Gaussian clouds
# (independent features with unit variance, different means)
np.random.seed(42)          # fixed seed -> reproducible dataset

N_SAMPLES = 100             # samples per class
MU0, MU1 = [-1, 0], [2, 2]  # class means
COV = [[1, 0], [0, 1]]      # identity covariance: independent, same variance

# Draw N_SAMPLES points per class
class0_pts = np.random.multivariate_normal(MU0, COV, N_SAMPLES)
class1_pts = np.random.multivariate_normal(MU1, COV, N_SAMPLES)

# Stack into one design matrix with {0, 1} labels
X = np.vstack((class0_pts, class1_pts))
y = np.hstack((np.zeros(N_SAMPLES), np.ones(N_SAMPLES)))

#########################################################
# Build one classifier and train it three times with different batch
# sizes, overlaying each run's training history on the same figures.
model = LogReg_SDG(lr=0.1, lambda_reg=0.1)

np.random.seed()  # re-seed from OS entropy: runs differ from this point on
theta0 = np.random.randn(X.shape[1] + 1)  # Random init parameters
# NOTE(review): theta0 is generated but never passed to fit() below, so
# every training starts from zeros — pass theta_init=theta0 if a random
# initialization was intended.
#print(r"Random init values (theta0, theta1, theta2): ", theta0)

n_iter = 200  # gradient updates per training run
# **Train with  Stochastic Gradient Descent** (batch of 1 sample)
model.fit(X, y, iter=n_iter, batch_size=1)
plotTrainHistory(X, y, model, fig=1, label='SGD')

# **Train with Mini-Batch Gradient Descent** (batches of 10 samples)
model.fit(X, y, iter=n_iter, batch_size=10)
plotTrainHistory(X, y, model, fig=1, label='MB-GD')

# **Train with all data Gradient Descent** (full-batch: batch_size=None)
model.fit(X, y, iter=n_iter) 
plotTrainHistory(X, y, model, fig=1, label='GD')

# Plot Data and Decision rule of the last trained model (full-batch GD)
plotData(X, y, model=model, title="Logistic Regression SDG")

# Keep the figures open until the user presses return
input("... press return to finish")


