###########################
# This program trains a linear regression model
# P. Sen, August 21, 2025
############################
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn import preprocessing
# Scale the features and the target
from sklearn.preprocessing import QuantileTransformer, RobustScaler, MinMaxScaler, StandardScaler
import statistics
import pandas as pd
from math import sqrt
import sys

hof_data = pd.read_csv('./hform_dataset_ABSe3.csv', sep=',')
print(hof_data.shape)
Nsamples = hof_data.shape[0]
print('No. of samples = ', Nsamples)

# Shuffle the rows randomly
hof_data = hof_data.sample(frac=1, axis=0).reset_index(drop=True)
# sample(frac=1, axis=0) → takes 100% (frac=1) of the rows, along axis=0 (rows), but shuffles their order.
# .reset_index(drop=True) → resets the row index after shuffling so a 0,1,2,… index order is obtained
#   instead of keeping the old shuffled indices.
# Reassignment (hof_data = ...) → rebinds hof_data to the shuffled DataFrame.

# Pull out one column of the hof_data DataFrame as a NumPy array:
smeig10 = hof_data[['smeig10']].values
# Print the values at different percentiles of the distribution of smeig10
print(np.percentile(smeig10, [0, 25, 50, 75, 100]))

# Pull out the target column as a NumPy array:
y = hof_data[['PBE_hof']].values
# hof_data[['PBE_hof']] → selects the column PBE_hof, but because of the double brackets [['...']]
#   it is returned as a DataFrame (2-D), not a Series.
# .values → converts that DataFrame into a NumPy ndarray.
# So the result is a 2-D array of shape (n_samples, 1).

# Show the distribution of smeig10 before preprocessing/scaling
sb.histplot(smeig10)
plt.title("Distribution of smeig10 before preprocessing")
plt.show()

# Show the distribution of y before scaling
sb.histplot(y)
plt.title("Distribution of PBE hform before preprocessing")
plt.show()

# Drop the columns 'Material' and 'PBE_hof' from the hof_data DataFrame.
hof_data.drop(labels=['Material', 'PBE_hof'], axis=1, inplace=True)
# labels=['Material', 'PBE_hof'] → which columns to drop.
# axis=1 → drop along columns (axis=0 would drop rows).
# inplace=True → modifies the DataFrame directly instead of returning a new one.

# Column index of smeig10 in the feature matrix, used to select it again after scaling.
smeig10_idx = hof_data.columns.get_loc('smeig10')

X = hof_data.to_numpy()
# Gives a NumPy array of features (shape (n_samples, n_features)), ready to feed into scikit-learn models.

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# 70% training set, 30% test set. random_state=1 produces the same split every time the script is run.
print('Shape ', X_train.shape)
Ntrain = np.shape(y_train)[0]  # no. of training examples
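# Added sanity check (not part of the original script): the split above should
# leave 70%/30% of the shuffled samples in train/test, with the feature and
# target arrays still row-aligned.
assert X_train.shape[0] + X_test.shape[0] == Nsamples
assert X_train.shape[0] == y_train.shape[0] and X_test.shape[0] == y_test.shape[0]
print('Train/test sizes: ', X_train.shape[0], X_test.shape[0])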
# Preprocessing of feature data
Xscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(X_train)  # handles outliers better
#Xscaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = Xscaler.transform(X_train)
X_test_scaled = Xscaler.transform(X_test)

# Select the smeig10 column after data scaling.
# (Note: X_train_scaled[:9] would take the first 9 rows; the smeig10 column is what is wanted here.)
smeig10 = X_train_scaled[:, smeig10_idx]

# Show the distribution of smeig10 after preprocessing/scaling
sb.histplot(smeig10, binwidth=0.1, legend=False)
plt.title("Distribution of smeig10 after preprocessing")
plt.show()
#print(np.percentile(X_train_scaled[:, smeig10_idx], [0, 25, 50, 75, 100]))

# Scaling of target data
# First make it a 2-D array.
y_train = y_train.reshape(-1, 1)
# -1 means: figure out this dimension automatically from the length of the array and the other dimensions specified.
# reshape(-1, 1) means: "make it 2-D, with 1 column, and as many rows as needed."
y_test = y_test.reshape(-1, 1)
Yscaler = QuantileTransformer(n_quantiles=Ntrain, output_distribution='normal').fit(y_train)
#Yscaler = preprocessing.StandardScaler().fit(y_train)
y_train_scaled = Yscaler.transform(y_train)
y_test_scaled = Yscaler.transform(y_test)

#######################
# Scaled test targets as a flat list, to match y_pred_scaled below.
y_test_data = []
for i in range(len(y_test_scaled)):
    y_test_data.append(y_test_scaled[i][0])

# Create the linear regressor
reg = linear_model.LinearRegression()
reg.fit(X_train_scaled, y_train_scaled)
print('Regression coefficients ', reg.coef_, '\n')
print('Regression intercept w_0 ', reg.intercept_, '\n')
#sys.exit()

y_pred_scaled = reg.predict(X_test_scaled)  # predicts y values from the model reg

# The R^2 value
print('R^2 score for test set: %.4f' % r2_score(y_test_scaled, y_pred_scaled))

#y_train_list = y_train.flatten().tolist()
#y_train_data = np.array(y_train_list)
y_train_data = []
for i in range(len(y_train)):
    y_train_data.append(y_train_scaled[i][0])

# Pearson's correlation coefficient (on the scaled data, matching y_pred_scaled)
y_test_data = np.asarray(y_test_data).ravel()
y_pred_scaled = np.asarray(y_pred_scaled).ravel()
#r = np.corrcoef(y_test, y_pred_scaled)
r = np.corrcoef(y_test_data, y_pred_scaled)
print('Test Pearson correlation ', r)

y_pred_train = reg.predict(X_train_scaled)

# Explained variance
print('Explained variance for training set %.4f' % explained_variance_score(y_train_scaled, y_pred_train))
print('Explained variance for test set %.4f' % explained_variance_score(y_test_scaled, y_pred_scaled))
print()

# Root mean squared error
print('RMSE for train y: %.4f' % np.sqrt(mean_squared_error(y_train_scaled, y_pred_train)))
print('RMSE for test y: %.4f' % np.sqrt(mean_squared_error(y_test_scaled, y_pred_scaled)))
print()

# Mean absolute error
print('MAE for train y: %.4f' % mean_absolute_error(y_train_scaled, y_pred_train))
print('MAE for test y: %.4f' % mean_absolute_error(y_test_scaled, y_pred_scaled))
print()

# Plot distribution of relative errors in y
#y_rel = (y_test_scaled - y_pred_scaled)/y_test_scaled
#print('Mean of y_rel ', statistics.mean(np.ravel(y_rel)))
#sb.histplot(np.ravel(y_rel))
#plt.xlim(-10,10)
#plt.xlabel('Relative error in y')
#plt.ylabel('Number')
#plt.show()

#plt.scatter(y_train_scaled, y_pred_train, marker='*', color='red')
#plt.xlim(0,5)
#plt.ylim(0,5)
#plt.xlabel('Training hform', fontsize=18)
#plt.ylabel('Predicted hform', fontsize=18)
#plt.show()

# Plot outputs
#plt.scatter(y_test_scaled, y_pred_scaled, marker='*', color='red')
#plt.xlim(-3,3)
#plt.xlim(0,10)
#plt.ylim(0,10)
#plt.ylim(-3,3)
#plt.xlabel('$y_{test}$', fontsize=18)
#plt.ylabel('$y_{pred}$', fontsize=18)
#plt.show()
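# Added check (a minimal sketch, not in the original analysis): before
# reporting results in original units, verify that inverse-transforming the
# scaled training targets recovers the originals, up to the small
# interpolation error of the quantile mapping.
roundtrip = Yscaler.inverse_transform(y_train_scaled)
print('Max |round-trip error| in y_train: %.3e' % np.max(np.abs(roundtrip - y_train)))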
# So far the results were on the scaled data.
# The following are on data with original units.
print("==========")
print("Results for original data", '\n')

# Perform the inverse scaling transformations
#X_train_inv = X_train
#X_test_inv = X_test
#y_train_inv = y_train
#y_test_inv = y_test
y_pred_train_inv = Yscaler.inverse_transform(y_pred_train.reshape(-1, 1))
y_pred_inv = Yscaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Mean absolute error after inverse transform
print('MAE for train y after inverse transform: %.4f' % mean_absolute_error(y_train, y_pred_train_inv))
print('MAE for test y after inverse transform: %.4f' % mean_absolute_error(y_test, y_pred_inv))
print()

# Root mean squared error
print('RMSE for train y: %.4f' % np.sqrt(mean_squared_error(y_train, y_pred_train_inv)))
print('RMSE for test y: %.4f' % np.sqrt(mean_squared_error(y_test, y_pred_inv)))
print()

# Explained variance
print('Explained variance for training set %.4f' % explained_variance_score(y_train, y_pred_train_inv))
print('Explained variance for test set %.4f' % explained_variance_score(y_test, y_pred_inv))
print()

# The R^2 value
print('R^2 score for test set: %.4f' % r2_score(y_test, y_pred_inv))

# Plot distribution of relative errors in y
print('Shapes of y_test & y_pred_inv ', y_test.shape, y_pred_inv.shape)
y_rel = (np.ravel(y_test) - np.ravel(y_pred_inv)) / np.ravel(y_test)
print("Shape of y_rel ", y_rel.shape)
print('Mean absolute relative error ', statistics.mean(np.abs(y_rel)))
sb.histplot(np.ravel(y_rel), bins=50, kde=True)
plt.title("Distribution of relative errors")
plt.xlim(-15, 15)
plt.xlabel('Relative error in y')
plt.ylabel('Number')
plt.show()

plt.scatter(y_train, y_pred_train_inv, marker='*', color='red')
#plt.xlim(0,5)
#plt.ylim(0,5)
plt.title("Scatter plot for training data set")
plt.xlabel('True hform', fontsize=18)
plt.ylabel('Predicted hform', fontsize=18)
plt.show()

# Plot outputs
plt.scatter(y_test, y_pred_inv, marker='*', color='red')
plt.title("Scatter plot for test data set")
#plt.xlim(-3,3)
#plt.ylim(-3,3)
plt.xlabel('True hform', fontsize=18)
plt.ylabel('Predicted hform', fontsize=18)
plt.show()
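# Added sketch: cross_val_score is imported above but never used. As a hedged
# example (the 5-fold count and 'r2' scoring are choices made here, not taken
# from the original script), cross-validated R^2 on the scaled training data
# can be obtained as below. Note the scaler was fit on the full training set,
# so folds are not fully independent; this is only a rough check.
cv_scores = cross_val_score(linear_model.LinearRegression(),
                            X_train_scaled, y_train_scaled.ravel(),
                            cv=5, scoring='r2')
print('5-fold CV R^2: mean %.4f, std %.4f' % (cv_scores.mean(), cv_scores.std()))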