Skip to content
Snippets Groups Projects
Commit db3835de authored by a272-jones's avatar a272-jones
Browse files

Deleting Model 1, symbolicRegressor.py is now this model.

parent 2a334f4b
No related branches found
No related tags found
No related merge requests found
import os
import pandas as pd
import numpy as np
from gplearn.genetic import SymbolicRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# Load the data.
# The CSV is looked up relative to the project so the script is not tied to a
# single developer's machine. abspath() keeps the lookup correct even when
# __file__ is a relative path.
# NOTE(review): assumes the "records" directory sits directly under
# project_root (two levels up from this file) — confirm against the repo layout.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
legacy_file_path = r"C:\Users\Charlie1\PycharmProjects\shallowsinks\ActualProjectCode\DjangoProject\records\Synthetic_Data_For_Students.csv"
file_path = os.path.join(project_root, "records", "Synthetic_Data_For_Students.csv")
if not os.path.isfile(file_path):
    # Fall back to the original hard-coded location so existing setups that
    # relied on it keep working unchanged.
    file_path = legacy_file_path
data = pd.read_csv(file_path)
# TODO: the target column is hard-coded; to support other CSV files this
# should become configurable (e.g. ask the user for their target column).
target_col = 'SettlementValue'
X = data.drop(target_col, axis=1)
y = data[target_col]
# Keep only the numeric feature columns; non-numeric columns are discarded.
numeric_columns = X.select_dtypes(include=[np.number]).columns.tolist()
X = X[numeric_columns]
print(f"Features used: {len(numeric_columns)} numeric features")
# Drop every row that has a missing value in any used feature or in the target.
data_clean = data.dropna(subset=[*numeric_columns, target_col])
total_rows = data.shape[0]
kept_rows = data_clean.shape[0]
# Guard the percentage so an empty CSV does not raise ZeroDivisionError here.
kept_pct = kept_rows / total_rows * 100 if total_rows else 0.0
print(f"Rows after dropping missing values: {kept_rows} out of {total_rows} ({kept_pct:.1f}%)")
# Redefine X and y from the cleaned rows; these are what the model is trained on.
X_clean = data_clean[numeric_columns]
y_clean = data_clean[target_col]
# Hold out 20% of the cleaned rows for testing; the fixed random_state makes
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y_clean,
    test_size=0.2,
    random_state=42,
)
# Standardise the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Configure and train the model
print("Training the Symbolic Regressor...")
# All hyperparameters live in one mapping so they can be inspected or logged
# before being handed to the estimator.
regressor_params = {
    "population_size": 5000,
    "generations": 20,
    "p_crossover": 0.7,
    "p_subtree_mutation": 0.1,
    "p_hoist_mutation": 0.05,
    "p_point_mutation": 0.1,
    "max_samples": 0.9,
    "verbose": 1,
    "parsimony_coefficient": 0.01,
    "random_state": 42,
    "function_set": ('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos'),
}
symbolic_reg = SymbolicRegressor(**regressor_params)
# Fit on the standardised training features.
symbolic_reg.fit(X_train_scaled, y_train)
# Predict on both partitions with the fitted regressor
y_pred_train = symbolic_reg.predict(X_train_scaled)
y_pred_test = symbolic_reg.predict(X_test_scaled)
# Training-set metrics: RMSE is the square root of the mean squared error
train_mse = mean_squared_error(y_train, y_pred_train)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(y_train, y_pred_train)
# Test-set metrics, computed the same way on the held-out rows
test_mse = mean_squared_error(y_test, y_pred_test)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred_test)
# Report the four metrics, one per line
report_lines = [
    f"Train RMSE: {train_rmse:.2f}",
    f"Test RMSE: {test_rmse:.2f}",
    f"Train R² Score: {train_r2:.4f}",
    f"Test R² Score: {test_r2:.4f}",
]
for report_line in report_lines:
    print(report_line)
# Show the expression the regressor settled on
print("\nBest symbolic expression:")
print(symbolic_reg._program)
# Plot actual vs predicted values for the held-out test rows
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_test, alpha=0.5)
# Dashed red diagonal spanning the range of actual values: points on this line
# are rows where the prediction equals the actual value.
diag_low, diag_high = y_test.min(), y_test.max()
plt.plot([diag_low, diag_high], [diag_low, diag_high], 'r--')
# Axis labels are derived from target_col so the figure stays correctly
# labelled if the target column is changed.
plt.xlabel(f'Actual {target_col}')
plt.ylabel(f'Predicted {target_col}')
plt.title('Actual vs Predicted Values')
# The figure is written to disk first, then shown interactively.
plt.savefig('symbolic_regression_results.png')
plt.show()
# Save the model expression and its performance metrics to a text file.
# The encoding is set explicitly because the report contains the non-ASCII
# character "²"; relying on the platform default makes the output
# locale-dependent and can raise UnicodeEncodeError.
with open('symbolic_regression_formula.txt', 'w', encoding='utf-8') as f:
    f.writelines([
        str(symbolic_reg._program),
        '\n\nModel Performance:\n',
        f"Train RMSE: {train_rmse:.2f}\n",
        f"Test RMSE: {test_rmse:.2f}\n",
        f"Train R² Score: {train_r2:.4f}\n",
        f"Test R² Score: {test_r2:.4f}\n",
    ])
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment