###########################
# This program trains a lasso regression model
# P. Sen, August 21, 2025
############################

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn import linear_model
#from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import train_test_split
# Scale the features and the target
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
import statistics
import pandas as pd

hof_data = pd.read_csv('./hform_dataset_ABSe3.csv', sep=',')
print(hof_data.shape)
Nsamples = hof_data.shape[0]
print('No. of samples = ', Nsamples)

# Shuffle rows randomly
hof_data = hof_data.sample(frac=1, axis=0).reset_index(drop=True)
# sample(frac=1, axis=0) → takes 100% (frac=1) of the rows, along axis=0 (rows), in shuffled order.
# .reset_index(drop=True) → resets the row index after shuffling, giving a clean 0,1,2,… index
# instead of keeping the old shuffled indices.
# Reassignment (hof_data = ...) → updates the DataFrame to the shuffled version.

y = hof_data[['PBE_hof']].values
# This pulls one column out of the DataFrame as a NumPy array:
# hof_data[['PBE_hof']] → selects the column PBE_hof; because of the double brackets [['...']],
# it is returned as a DataFrame (2-D), not a Series.
# .values → converts that DataFrame into a NumPy ndarray.
# The result is therefore a 2-D array of shape (n_samples, 1).

# Show the distribution of y
sb.histplot(y)
plt.title("Distribution of PBE hform before preprocessing")
plt.show()

# Now drop the columns 'Material' and 'PBE_hof' from the hof_data DataFrame.
hof_data.drop(labels=['Material', 'PBE_hof'], axis=1, inplace=True)
# labels=['Material', 'PBE_hof'] → which column(s) to drop.
# axis=1 → drop along columns (axis=0 would drop rows).
# inplace=True → modifies the DataFrame directly instead of returning a new one.

X = hof_data.to_numpy()
# Gives a NumPy array of features, shape (n_samples, n_features), ready to feed into scikit-learn models.

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# 70% training set, 30% test set. random_state=1 produces the same split on every run.
print('Shape ', X_train.shape)
Ntrain = np.shape(y_train)[0]  # no. of training examples

# Preprocessing feature data
Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(X_train)  # handles outliers better
#Xscaler = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
X_train_scaled = Xscaler.transform(X_train)
X_test_scaled = Xscaler.transform(X_test)

# Scaling target data
# First make sure the targets are 2-D arrays.
# reshape(-1, 1) means: "make it 2-D, with 1 column, and as many rows as needed";
# the -1 tells NumPy to infer that dimension automatically from the array length.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
Yscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(y_train)
#Yscaler = MinMaxScaler(feature_range=(-1, 1)).fit(y_train)
y_train_scaled = Yscaler.transform(y_train)
y_test_scaled = Yscaler.transform(y_test)
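# Optional sanity check (an editorial sketch, not part of the original workflow):
# QuantileTransformer should round-trip, i.e. inverse_transform(transform(y))
# recovers y up to interpolation error. Uncomment to verify:
#
# y_roundtrip = Yscaler.inverse_transform(y_train_scaled)
# print('Max round-trip error in y_train:', np.max(np.abs(y_roundtrip - y_train)))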
#######################

# Flatten the test targets into a plain list (used for the Pearson correlation below)
y_test_data = []
for i in range(len(y_test)):
    y_test_data.append(y_test[i][0])

# Create the lasso regressor
reg = linear_model.Lasso(alpha=0.01)
# Fit the regressor
reg.fit(X_train_scaled, y_train_scaled)
print('Regression coefficients ', reg.coef_, '\n')
print('Regression intercept w_0 ', reg.intercept_, '\n')
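# The alpha above is fixed by hand. If a data-driven choice is preferred,
# scikit-learn's LassoCV selects alpha by cross-validation. A minimal sketch
# (the alpha grid below is an arbitrary illustration, not a recommendation):
#
# from sklearn.linear_model import LassoCV
# reg_cv = LassoCV(alphas=np.logspace(-4, 0, 50), cv=5)
# reg_cv.fit(X_train_scaled, y_train_scaled.ravel())  # LassoCV expects a 1-D target
# print('Cross-validated alpha: ', reg_cv.alpha_)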
# Predict results for the test set
y_pred_scaled = reg.predict(X_test_scaled)

# The test set R^2 score on scaled data
print('R^2 score for test set: %.4f' % r2_score(y_test_scaled, y_pred_scaled))

# Pearson's correlation coefficient
y_test_data = np.asarray(y_test_data).ravel()
y_pred_scaled = np.asarray(y_pred_scaled).ravel()
r = np.corrcoef(y_test_data, y_pred_scaled)
print('Test Pearson correlation ', r)

# Predictions for the training set data
y_pred_train = reg.predict(X_train_scaled)

# Explained variance (scaled data)
print('Explained variance for training set %.4f' % explained_variance_score(y_train_scaled, y_pred_train))
print('Explained variance for test set %.4f' % explained_variance_score(y_test_scaled, y_pred_scaled))
print()

# Root mean squared error (scaled data)
print('RMSE for train y: %.4f' % np.sqrt(mean_squared_error(y_train_scaled, y_pred_train)))
print('RMSE for test y: %.4f' % np.sqrt(mean_squared_error(y_test_scaled, y_pred_scaled)))
print()

# Mean absolute error (scaled data)
print('MAE for train y: %.4f' % mean_absolute_error(y_train_scaled, y_pred_train))
print('MAE for test y: %.4f' % mean_absolute_error(y_test_scaled, y_pred_scaled))
print()

print("==========", '\n')
print("Results for original data", '\n')

# Undo the target scaling so the errors below are in physical units
y_pred_train_inv = Yscaler.inverse_transform(y_pred_train.reshape(-1, 1))
y_pred_inv = Yscaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Mean absolute error after the inverse transform
print('MAE for train y: %.4f' % mean_absolute_error(y_train, y_pred_train_inv), 'eV/atom')
print('MAE for test y: %.4f' % mean_absolute_error(y_test, y_pred_inv), 'eV/atom')
print()

# Root mean squared error
print('RMSE for train y: %.4f' % np.sqrt(mean_squared_error(y_train, y_pred_train_inv)), 'eV/atom')
print('RMSE for test y: %.4f' % np.sqrt(mean_squared_error(y_test, y_pred_inv)), 'eV/atom')
print()

# Explained variance
print('Explained variance for training set %.4f' % explained_variance_score(y_train, y_pred_train_inv))
print('Explained variance for test set %.4f' % explained_variance_score(y_test, y_pred_inv))
print()

# The R^2 score on original data
print('R^2 score for test set: %.4f' % r2_score(y_test, y_pred_inv))

# Plot the distribution of relative errors in y
print('Shapes of y_test & y_pred_inv ', y_test.shape, y_pred_inv.shape)
y_rel = (np.ravel(y_test) - np.ravel(y_pred_inv)) / np.ravel(y_test)
print("Shape of y_rel ", y_rel.shape)
print('Mean absolute relative error ', statistics.mean(np.abs(y_rel)))
sb.histplot(np.ravel(y_rel), bins=50, kde=True)
#plt.xlim(-15, 15)
plt.title("Distribution of relative errors")
plt.xlabel('Relative error in y')
plt.ylabel('Number')
plt.show()

# Scatter plot: training set
plt.scatter(y_train, y_pred_train_inv, marker='*', color='red')
plt.title("Scatter plot for training data set")
plt.xlabel('Training hform', fontsize=18)
plt.ylabel('Predicted hform', fontsize=18)
plt.show()

# Scatter plot: test set
plt.scatter(y_test, y_pred_inv, marker='*', color='red')
plt.title("Scatter plot for test data set")
plt.xlabel('$y_{test}$', fontsize=18)
plt.ylabel('$y_{pred}$', fontsize=18)
plt.show()
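# Optional follow-up (a sketch, not part of the original script): persist the
# fitted scalers and model so later predictions don't require retraining.
# joblib is the serializer recommended in the scikit-learn docs for estimators;
# the filename below is made up. Uncomment to use:
#
# import joblib
# joblib.dump({'Xscaler': Xscaler, 'Yscaler': Yscaler, 'model': reg}, 'lasso_hof_model.joblib')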