# (0) Importing necessary libraries

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from torch.utils.data import DataLoader, TensorDataset
import time
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, r2_score


# (1) Preparing the dataset 

data = pd.read_csv("hform_dataset_ABSe3.csv")
print(data.head())

# shuffle the rows randomly
data = data.sample(frac=1,axis=0).reset_index(drop=True)

# select the target value: hform
y = data[["PBE_hof"]].values

# select the input features: 30 sine matrix eigenvalues
x = data.iloc[:, 1:31]

# convert input to numpy array
x = x.to_numpy()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2 , random_state=42)

# Scaling the data

# Feature
xscaler = QuantileTransformer(n_quantiles=100, output_distribution="normal").fit(x_train)
x_train_scaled = xscaler.transform(x_train)
x_test_scaled = xscaler.transform(x_test)

# Target 
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)
yscaler = QuantileTransformer(n_quantiles=100, output_distribution="normal").fit(y_train)
y_train_scaled = yscaler.transform(y_train)
y_test_scaled = yscaler.transform(y_test)


# (2) Create pytorch loader object to load the data
# Convert to PyTorch tensors

x_train_scaled = torch.from_numpy(x_train_scaled).type(torch.float)
x_test_scaled = torch.from_numpy(x_test_scaled).type(torch.float)

y_train_scaled = torch.tensor(y_train_scaled, dtype=torch.float32)
y_test_scaled = torch.tensor(y_test_scaled, dtype=torch.float32)

batch_size = 64
train_loader = DataLoader(dataset=TensorDataset(x_train_scaled, y_train_scaled), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=TensorDataset(x_test_scaled, y_test_scaled), batch_size=batch_size, shuffle=True)

# (3) Build The model

class RegressorANN(nn.Module):
    """
    A simple Multi-Layer Perceptron (MLP) for regression from 10 features to 1 output.
    Includes Dropout regularization suitable for a small dataset (800 samples).
    """
    def __init__(self, input_size=30, hidden_size_1=64, hidden_size_2=16, dropout_rate=0.2):
        super(RegressorANN, self).__init__()

        # Define layers
        self.fc1 = nn.Linear(input_size, hidden_size_1)
        self.fc2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.fc3 = nn.Linear(hidden_size_2, 1) 
        self.act = nn.ReLU()

    def forward(self, x):
        x = self.act(self.fc1(x))
        x = self.act(self.fc2(x))
        x = self.fc3(x) 
        return x


# Setting up the device & initialize the model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Available device device: {device}")

model = RegressorANN().to(device)

print(f"Model Architecture:\n{model}")

# count the total # of parameters

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"\nTotal # of trainable parameters: {total_params}")


# (5) Training the model

lr = 1e-4   # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0)
criterion = nn.MSELoss() 


train_loss_with_epoch = []
test_loss_with_epoch = []

num_epochs = 200

start_time = time.time()

for e in tqdm(range(num_epochs), desc=f"Training Progress", colour="GREEN"):

    #------------------------
    #     Training Phase
    #-------------------------
    model.train()
    # training data
    train_loss = 0.0
    for data, target in tqdm(train_loader, leave=False, desc=f"Epoch {e+1}/{num_epochs}", colour="BLUE"):
        # Move data to device (GPU/CPU)
        data = data.float().to(device)     
        target = target.float().to(device) 

        # Forward Pass
        output = model(data)
        loss = criterion(output,target)

        # Accumulate batch loss
        train_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # Average loss over all training batches
    train_loss = train_loss/len(train_loader)
    train_loss_with_epoch.append(train_loss)

    #------------------------
    #     Test Phase
    #------------------------
    test_loss = 0.0
    for data, target in tqdm(test_loader, leave=False, desc=f"Epoch {e+1}/{num_epochs}", colour="BLUE"):
        model.eval()
        with torch.no_grad():   # No gradient calculation
            data = data.float().to(device)
            target = target.float().to(device)

            # Forward Pass
            output = model(data)
            loss = criterion(output,target)

            # Accumulate loss
            test_loss += loss.item()

    # Average test loss
    test_loss = test_loss/len(test_loader)
    test_loss_with_epoch.append(test_loss)

 
    log_string = f"epoch {e+1} training loss: {train_loss:.4f} test loss: {test_loss:.4f} "
    print(log_string)


#  Save the model   
torch.save(model.state_dict(), "ann.pt")

print("Total time taken to train the model : %.2fs" % (time.time() - start_time))


# Store the history

epochs_list = list(np.arange(0,num_epochs,1))

data_dict = {"epochs":epochs_list, "train_loss":train_loss_with_epoch, "test_loss":test_loss_with_epoch}
# converting to dataframe
df = pd.DataFrame(data_dict)
df.to_csv("epoch-loss.csv",index=False)


# Plotting 
plt.figure(figsize=(7, 5))

# Plot curves
plt.plot(epochs_list, train_loss_with_epoch, linewidth=2, label="Train Loss")
plt.plot(epochs_list, test_loss_with_epoch, linewidth=2, label="Test Loss")

# Labels and title
plt.xlabel("Epochs", fontsize=14)
plt.ylabel("Loss", fontsize=14)
plt.title("Learning Curve", fontsize=16)

# Legend
plt.legend(fontsize=12)

# Grid and layout
plt.grid(True, linestyle="--", alpha=0.5)
plt.tight_layout()

# Show or save
plt.show()
plt.savefig("learning_curve.jpg", dpi=300)


# Prediction on training set

with torch.no_grad():
    y_train_pred = model(x_train_scaled)
    y_train_pred = y_train_pred.detach().cpu().numpy()
    # inverse transform
    y_train_pred = yscaler.inverse_transform(y_train_pred)

# Prediction on test set

with torch.no_grad():
    y_test_pred = model(x_test_scaled)
    y_test_pred = y_test_pred.detach().cpu().numpy()
    # inverse transform
    y_test_pred = yscaler.inverse_transform(y_test_pred)


# Calculate metrics
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae  = mean_absolute_error(y_test, y_test_pred)

train_r2 = r2_score(y_train, y_train_pred)
test_r2  = r2_score(y_test, y_test_pred)

# Print neatly
print("Model Performance:")
print("------------------")
print(f"Training MAE : {train_mae:.4f}")
print(f"Training R²  : {train_r2:.4f}")
print()
print(f"Test MAE     : {test_mae:.4f}")
print(f"Test R²      : {test_r2:.4f}")


plt.figure(figsize=(7, 6))

# Scatter plots
plt.scatter(y_train, y_train_pred, alpha=0.6, label="Train", s=40)
plt.scatter(y_test, y_test_pred, alpha=0.6, label="Test", s=40)

# Perfect prediction line
min_val = min(y_train.min(), y_test.min(), 
              np.min(y_train_pred), np.min(y_test_pred))
max_val = max(y_train.max(), y_test.max(), 
              np.max(y_train_pred), np.max(y_test_pred))

plt.plot([min_val, max_val], [min_val, max_val], 'k--', linewidth=2, label="Ideal")

# Labels and title
plt.xlabel("Actual Values", fontsize=14)
plt.ylabel("Predicted Values", fontsize=14)
plt.title("Parity Plot (Actual vs Predicted)", fontsize=16)

plt.legend(fontsize=12)
plt.grid(True, linestyle="--", alpha=0.4)
plt.tight_layout()

plt.show()
plt.savefig("parity_plot.jpg", dpi=300)