###########################
# This program trains a linear regression model
# P. Sen, August 21, 2025
############################
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn import preprocessing
# Scale the features and the target
from sklearn.preprocessing import QuantileTransformer, RobustScaler, MinMaxScaler, StandardScaler
import statistics
import pandas as pd
from math import sqrt
import sys

hof_data = pd.read_csv('./hform_dataset_ABSe3.csv', sep=',')
print(hof_data.shape)
Nsamples = hof_data.shape[0]
print('No. of samples = ', Nsamples)

# Shuffle the rows randomly
hof_data = hof_data.sample(frac=1, axis=0).reset_index(drop=True)
# sample(frac=1, axis=0) → takes 100% (frac=1) of the rows, along axis=0 (rows), but shuffles their order.
# .reset_index(drop=True) → resets the row index after shuffling so a 0,1,2,… index order is obtained
#   instead of keeping the old shuffled indices.
# Reassignment (hof_data = ...) → rebinds hof_data to the shuffled DataFrame.

# Pull out one column of the hof_data DataFrame as a NumPy array:
smeig10 = hof_data[['smeig10']].values
# Print the values at different percentiles of the distribution of smeig10
print(np.percentile(smeig10, [0, 25, 50, 75, 100]))

# Pull out the target column as a NumPy array:
y = hof_data[['PBE_hof']].values
# hof_data[['PBE_hof']] → selects the column PBE_hof, but because of the double brackets [['...']]
#   it is returned as a DataFrame (2-D), not a Series.
# .values → converts that DataFrame into a NumPy ndarray.
# So the result is a 2-D array of shape (n_samples, 1).

# Show the distribution of smeig10 before preprocessing/scaling
sb.histplot(smeig10)
plt.title("Distribution of smeig10 before preprocessing")
plt.show()

# Show the distribution of y before scaling
sb.histplot(y)
plt.title("Distribution of PBE hform before preprocessing")
plt.show()

# Drop the columns 'Material' and 'PBE_hof' from the hof_data DataFrame.
hof_data.drop(labels=['Material', 'PBE_hof'], axis=1, inplace=True)
# labels=['Material', 'PBE_hof'] → which columns to drop.
# axis=1 → drop along columns (axis=0 would drop rows).
# inplace=True → modifies the DataFrame directly instead of returning a new one.

# Column index of smeig10 in the feature matrix, used to select it again after scaling.
smeig10_idx = hof_data.columns.get_loc('smeig10')

X = hof_data.to_numpy()
# Gives a NumPy array of features (shape (n_samples, n_features)), ready to feed into scikit-learn models.

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# 70% training set, 30% test set. random_state=1 produces the same split every time the script is run.
print('Shape ', X_train.shape)
Ntrain = np.shape(y_train)[0]  # no. of training examples
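# Added sanity check (not part of the original script): the split above should
# leave 70%/30% of the shuffled samples in train/test, with the feature and
# target arrays still row-aligned.
assert X_train.shape[0] + X_test.shape[0] == Nsamples
assert X_train.shape[0] == y_train.shape[0] and X_test.shape[0] == y_test.shape[0]
print('Train/test sizes: ', X_train.shape[0], X_test.shape[0])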
# Preprocessing of feature data
Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(X_train)  # handles outliers better
#Xscaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = Xscaler.transform(X_train)
X_test_scaled = Xscaler.transform(X_test)

# Select the smeig10 column after data scaling.
# (Note: X_train_scaled[:9] would take the first 9 rows; the smeig10 column is what is wanted here.)
smeig10 = X_train_scaled[:, smeig10_idx]

# Show the distribution of smeig10 after preprocessing/scaling
sb.histplot(smeig10, binwidth=0.1, legend=False)
plt.title("Distribution of smeig10 after preprocessing")
plt.show()
#print(np.percentile(X_train_scaled[:, smeig10_idx], [0, 25, 50, 75, 100]))

# Scaling of target data
# First make it a 2-D array.
y_train = y_train.reshape(-1, 1)
# -1 means: figure out this dimension automatically from the length of the array and the other dimensions specified.
# reshape(-1, 1) means: "make it 2-D, with 1 column, and as many rows as needed."
y_test = y_test.reshape(-1, 1)
Yscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(y_train)
#Yscaler = preprocessing.StandardScaler().fit(y_train)
y_train_scaled = Yscaler.transform(y_train)
y_test_scaled = Yscaler.transform(y_test)

#######################
# Scaled test targets as a flat list, to match y_pred_scaled below.
y_test_data = []
for i in range(len(y_test_scaled)):
    y_test_data.append(y_test_scaled[i][0])

# Create the linear regressor
reg = linear_model.LinearRegression()
reg.fit(X_train_scaled, y_train_scaled)
print('Regression coefficients ', reg.coef_, '\n')
print('Regression intercept w_0 ', reg.intercept_, '\n')
#sys.exit()

y_pred_scaled = reg.predict(X_test_scaled)  # predicts y values from the model reg

# The R^2 value
print('R^2 score for test set: %.4f' % r2_score(y_test_scaled, y_pred_scaled))

#y_train_list = y_train.flatten().tolist()
#y_train_data = np.array(y_train_list)
y_train_data = []
for i in range(len(y_train)):
    y_train_data.append(y_train_scaled[i][0])

# Pearson's correlation coefficient (on the scaled data, matching y_pred_scaled)
y_test_data = np.asarray(y_test_data).ravel()
y_pred_scaled = np.asarray(y_pred_scaled).ravel()
#r = np.corrcoef(y_test, y_pred_scaled)
r = np.corrcoef(y_test_data, y_pred_scaled)
print('Test Pearson correlation ', r)

y_pred_train = reg.predict(X_train_scaled)

# Explained variance
print('Explained variance for training set %.4f' % explained_variance_score(y_train_scaled, y_pred_train))
print('Explained variance for test set %.4f' % explained_variance_score(y_test_scaled, y_pred_scaled))
print()

# Root mean squared error
print('RMSE for train y: %.4f' % np.sqrt(mean_squared_error(y_train_scaled, y_pred_train)))
print('RMSE for test y: %.4f' % np.sqrt(mean_squared_error(y_test_scaled, y_pred_scaled)))
print()

# Mean absolute error
print('MAE for train y: %.4f' % mean_absolute_error(y_train_scaled, y_pred_train))
print('MAE for test y: %.4f' % mean_absolute_error(y_test_scaled, y_pred_scaled))
print()

# Plot distribution of relative errors in y
#y_rel = (y_test_scaled - y_pred_scaled)/y_test_scaled
#print('Mean of y_rel ', statistics.mean(np.ravel(y_rel)))
#sb.histplot(np.ravel(y_rel))
#plt.xlim(-10,10)
#plt.xlabel('Relative error in y')
#plt.ylabel('Number')
#plt.show()

#plt.scatter(y_train_scaled, y_pred_train, marker='*', color='red')
#plt.xlim(0,5)
#plt.ylim(0,5)
#plt.xlabel('Training hform', fontsize=18)
#plt.ylabel('Predicted hform', fontsize=18)
#plt.show()

# Plot outputs
#plt.scatter(y_test_scaled, y_pred_scaled, marker='*', color='red')
#plt.xlim(-3,3)
#plt.xlim(0,10)
#plt.ylim(0,10)
#plt.ylim(-3,3)
#plt.xlabel('$y_{test}$', fontsize=18)
#plt.ylabel('$y_{pred}$', fontsize=18)
#plt.show()
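# Added check (a minimal sketch, not in the original analysis): before
# reporting results in original units, verify that inverse-transforming the
# scaled training targets recovers the originals, up to the small
# interpolation error of the quantile mapping.
roundtrip = Yscaler.inverse_transform(y_train_scaled)
print('Max |round-trip error| in y_train: %.3e' % np.max(np.abs(roundtrip - y_train)))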
# So far the results were on the scaled data.
# The following are on data with original units.
print("==========")
print("Results for original data", '\n')

# Perform the inverse scaling transformations
#X_train_inv = X_train
#X_test_inv = X_test
#y_train_inv = y_train
#y_test_inv = y_test
y_pred_train_inv = Yscaler.inverse_transform(y_pred_train.reshape(-1, 1))
y_pred_inv = Yscaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Mean absolute error after inverse transform
print('MAE for train y after inverse transform: %.4f' % mean_absolute_error(y_train, y_pred_train_inv))
print('MAE for test y after inverse transform: %.4f' % mean_absolute_error(y_test, y_pred_inv))
print()

# Root mean squared error
print('RMSE for train y: %.4f' % np.sqrt(mean_squared_error(y_train, y_pred_train_inv)))
print('RMSE for test y: %.4f' % np.sqrt(mean_squared_error(y_test, y_pred_inv)))
print()

# Explained variance
print('Explained variance for training set %.4f' % explained_variance_score(y_train, y_pred_train_inv))
print('Explained variance for test set %.4f' % explained_variance_score(y_test, y_pred_inv))
print()

# The R^2 value
print('R^2 score for test set: %.4f' % r2_score(y_test, y_pred_inv))

# Plot distribution of relative errors in y
print('Shapes of y_test & y_pred_inv ', y_test.shape, y_pred_inv.shape)
y_rel = (np.ravel(y_test) - np.ravel(y_pred_inv)) / np.ravel(y_test)
print("Shape of y_rel ", y_rel.shape)
print('Mean absolute relative error ', statistics.mean(np.abs(y_rel)))
sb.histplot(np.ravel(y_rel), bins=50, kde=True)
plt.title("Distribution of relative errors")
plt.xlim(-15, 15)
plt.xlabel('Relative error in y')
plt.ylabel('Number')
plt.show()

plt.scatter(y_train, y_pred_train_inv, marker='*', color='red')
#plt.xlim(0,5)
#plt.ylim(0,5)
plt.title("Scatter plot for training data set")
plt.xlabel('True hform', fontsize=18)
plt.ylabel('Predicted hform', fontsize=18)
plt.show()

# Plot outputs
plt.scatter(y_test, y_pred_inv, marker='*', color='red')
plt.title("Scatter plot for test data set")
#plt.xlim(-3,3)
#plt.ylim(-3,3)
plt.xlabel('True hform', fontsize=18)
plt.ylabel('Predicted hform', fontsize=18)
plt.show()
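# Added sketch: cross_val_score is imported above but never used. As a hedged
# example (the 5-fold count and 'r2' scoring are choices made here, not taken
# from the original script), cross-validated R^2 on the scaled training data
# can be obtained as below. Note the scaler was fit on the full training set,
# so folds are not fully independent; this is only a rough check.
cv_scores = cross_val_score(linear_model.LinearRegression(),
                            X_train_scaled, y_train_scaled.ravel(),
                            cv=5, scoring='r2')
print('5-fold CV R^2: mean %.4f, std %.4f' % (cv_scores.mean(), cv_scores.std()))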