###########################
# This program trains a lasso regression model
# P. Sen, August 21, 2025
############################

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn import linear_model
#from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import train_test_split
# Scale the features and the target
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
import statistics
import pandas as pd

hof_data = pd.read_csv('./hform_dataset_ABSe3.csv', sep=',')
print(hof_data.shape)
Nsamples = hof_data.shape[0]
print('No. of samples = ', Nsamples)

# Shuffle rows randomly
hof_data = hof_data.sample(frac=1, axis=0).reset_index(drop=True)
# sample(frac=1, axis=0) → takes 100% (frac=1) of the rows, along axis=0 (rows), in shuffled order.
# .reset_index(drop=True) → resets the row index after shuffling, giving a clean 0,1,2,… index
# instead of keeping the old shuffled indices.
# Reassignment (hof_data = ...) → updates the DataFrame to the shuffled version.

y = hof_data[['PBE_hof']].values
# This pulls one column out of the DataFrame as a NumPy array:
# hof_data[['PBE_hof']] → selects the column PBE_hof; because of the double brackets [['...']],
# it is returned as a DataFrame (2-D), not a Series.
# .values → converts that DataFrame into a NumPy ndarray.
# The result is therefore a 2-D array of shape (n_samples, 1).

# Show the distribution of y
sb.histplot(y)
plt.title("Distribution of PBE hform before preprocessing")
plt.show()

# Now drop the columns 'Material' and 'PBE_hof' from the hof_data DataFrame.
hof_data.drop(labels=['Material', 'PBE_hof'], axis=1, inplace=True)
# labels=['Material', 'PBE_hof'] → which column(s) to drop.
# axis=1 → drop along columns (axis=0 would drop rows).
# inplace=True → modifies the DataFrame directly instead of returning a new one.

X = hof_data.to_numpy()
# Gives a NumPy array of features, shape (n_samples, n_features), ready to feed into scikit-learn models.

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# 70% training set, 30% test set. random_state=1 produces the same split on every run.
print('Shape ', X_train.shape)
Ntrain = np.shape(y_train)[0]  # no. of training examples

# Preprocessing feature data
Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(X_train)  # handles outliers better
#Xscaler = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
X_train_scaled = Xscaler.transform(X_train)
X_test_scaled = Xscaler.transform(X_test)

# Scaling target data
# First make sure the targets are 2-D arrays.
# reshape(-1, 1) means: "make it 2-D, with 1 column, and as many rows as needed";
# the -1 tells NumPy to infer that dimension automatically from the array length.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
Yscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(y_train)
#Yscaler = MinMaxScaler(feature_range=(-1, 1)).fit(y_train)
y_train_scaled = Yscaler.transform(y_train)
y_test_scaled = Yscaler.transform(y_test)
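# Optional sanity check (an editorial sketch, not part of the original workflow):
# QuantileTransformer should round-trip, i.e. inverse_transform(transform(y))
# recovers y up to interpolation error. Uncomment to verify:
#
# y_roundtrip = Yscaler.inverse_transform(y_train_scaled)
# print('Max round-trip error in y_train:', np.max(np.abs(y_roundtrip - y_train)))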
#######################

# Flatten the test targets into a plain list (used for the Pearson correlation below)
y_test_data = []
for i in range(len(y_test)):
    y_test_data.append(y_test[i][0])

# Create the lasso regressor
reg = linear_model.Lasso(alpha=0.01)
# Fit the regressor
reg.fit(X_train_scaled, y_train_scaled)
print('Regression coefficients ', reg.coef_, '\n')
print('Regression intercept w_0 ', reg.intercept_, '\n')
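# The alpha above is fixed by hand. If a data-driven choice is preferred,
# scikit-learn's LassoCV selects alpha by cross-validation. A minimal sketch
# (the alpha grid below is an arbitrary illustration, not a recommendation):
#
# from sklearn.linear_model import LassoCV
# reg_cv = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5)
# reg_cv.fit(X_train_scaled, y_train_scaled.ravel())  # LassoCV expects a 1-D target
# print('Cross-validated alpha: ', reg_cv.alpha_)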
# Predict results for the test set
y_pred_scaled = reg.predict(X_test_scaled)

# The test set R^2 score on scaled data
print('R^2 score for test set: %.4f' % r2_score(y_test_scaled, y_pred_scaled))

# Pearson's correlation coefficient
y_test_data = np.asarray(y_test_data).ravel()
y_pred_scaled = np.asarray(y_pred_scaled).ravel()
r = np.corrcoef(y_test_data, y_pred_scaled)
print('Test Pearson correlation ', r)

# Predictions for the training set data
y_pred_train = reg.predict(X_train_scaled)

# Explained variance (scaled data)
print('Explained variance for training set %.4f' % explained_variance_score(y_train_scaled, y_pred_train))
print('Explained variance for test set %.4f' % explained_variance_score(y_test_scaled, y_pred_scaled))
print()

# Root mean squared error (scaled data)
print('RMSE for train y: %.4f' % np.sqrt(mean_squared_error(y_train_scaled, y_pred_train)))
print('RMSE for test y: %.4f' % np.sqrt(mean_squared_error(y_test_scaled, y_pred_scaled)))
print()

# Mean absolute error (scaled data)
print('MAE for train y: %.4f' % mean_absolute_error(y_train_scaled, y_pred_train))
print('MAE for test y: %.4f' % mean_absolute_error(y_test_scaled, y_pred_scaled))
print()

print("==========", '\n')
print("Results for original data", '\n')

# Undo the target scaling so the errors below are in physical units
y_pred_train_inv = Yscaler.inverse_transform(y_pred_train.reshape(-1, 1))
y_pred_inv = Yscaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Mean absolute error after the inverse transform
print('MAE for train y: %.4f' % mean_absolute_error(y_train, y_pred_train_inv), 'eV/atom')
print('MAE for test y: %.4f' % mean_absolute_error(y_test, y_pred_inv), 'eV/atom')
print()

# Root mean squared error
print('RMSE for train y: %.4f' % np.sqrt(mean_squared_error(y_train, y_pred_train_inv)), 'eV/atom')
print('RMSE for test y: %.4f' % np.sqrt(mean_squared_error(y_test, y_pred_inv)), 'eV/atom')
print()

# Explained variance
print('Explained variance for training set %.4f' % explained_variance_score(y_train, y_pred_train_inv))
print('Explained variance for test set %.4f' % explained_variance_score(y_test, y_pred_inv))
print()

# The R^2 score on original data
print('R^2 score for test set: %.4f' % r2_score(y_test, y_pred_inv))

# Plot the distribution of relative errors in y
print('Shapes of y_test & y_pred_inv ', y_test.shape, y_pred_inv.shape)
y_rel = (np.ravel(y_test) - np.ravel(y_pred_inv)) / np.ravel(y_test)
print("Shape of y_rel ", y_rel.shape)
print('Mean absolute relative error ', statistics.mean(np.abs(y_rel)))
sb.histplot(np.ravel(y_rel), bins=50, kde=True)
#plt.xlim(-15, 15)
plt.title("Distribution of relative errors")
plt.xlabel('Relative error in y')
plt.ylabel('Number')
plt.show()

# Scatter plot: training set
plt.scatter(y_train, y_pred_train_inv, marker='*', color='red')
plt.title("Scatter plot for training data set")
plt.xlabel('Training hform', fontsize=18)
plt.ylabel('Predicted hform', fontsize=18)
plt.show()

# Scatter plot: test set
plt.scatter(y_test, y_pred_inv, marker='*', color='red')
plt.title("Scatter plot for test data set")
plt.xlabel('$y_{test}$', fontsize=18)
plt.ylabel('$y_{pred}$', fontsize=18)
plt.show()
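# Optional follow-up (a sketch, not part of the original script): persist the
# fitted scalers and model so later predictions don't require retraining.
# joblib is the serializer recommended in the scikit-learn docs for estimators;
# the filename below is made up. Uncomment to use:
#
# import joblib
# joblib.dump({'Xscaler': Xscaler, 'Yscaler': Yscaler, 'model': reg}, 'lasso_hof_model.joblib')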