"""Train a gplearn symbolic regressor to predict ``SettlementValue``.

Original location of this script:
``ActualProjectCode/DjangoProject/mlModels/modelName/model1.py``.

Running the module:

1. loads ``records/Synthetic_Data_For_Students.csv`` from the Django project,
2. keeps only numeric feature columns and drops rows with missing values,
3. fits a ``SymbolicRegressor`` on standardised features,
4. prints train/test RMSE and R², and
5. writes ``symbolic_regression_results.png`` and
   ``symbolic_regression_formula.txt`` to the current working directory.
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns  # noqa: F401 - imported by the original script, unused here
from gplearn.genetic import SymbolicRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data.
# This file sits at <DjangoProject>/mlModels/modelName/model1.py, so two
# dirname() hops reach ``mlModels`` and a third reaches the Django project
# directory that contains ``records/``. Resolving the CSV relative to this
# file replaces the previous machine-specific absolute Windows path.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
django_root = os.path.dirname(project_root)
file_path = os.path.join(django_root, "records", "Synthetic_Data_For_Students.csv")
data = pd.read_csv(file_path)

# TODO: make the target column configurable (e.g. ask the user) so the script
# works with other CSV files.
target_col = 'SettlementValue'

# Keep only numeric feature columns; the target itself is excluded first.
X = data.drop(target_col, axis=1)
numeric_columns = X.select_dtypes(include=[np.number]).columns.tolist()
print(f"Features used: {len(numeric_columns)} numeric features")

# Drop every row that has a missing value in a used feature or in the target.
data_clean = data.dropna(subset=[*numeric_columns, target_col])
if data_clean.empty:
    # Fail early with a clear message instead of a ZeroDivisionError in the
    # percentage below or an opaque error from train_test_split.
    raise ValueError(
        f"No usable rows in {file_path!r} after dropping missing values."
    )
print(f"Rows after dropping missing values: {data_clean.shape[0]} out of {data.shape[0]} ({data_clean.shape[0]/data.shape[0]*100:.1f}%)")

# Features and target restricted to the cleaned rows.
X_clean = data_clean[numeric_columns]
y_clean = data_clean[target_col]

# Split the data (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)

# Scale the features. The scaler is fitted on the training split only so no
# test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Configure and train the model.
print("Training the Symbolic Regressor...")
symbolic_reg = SymbolicRegressor(
    population_size=5000,
    generations=20,
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.05,
    p_point_mutation=0.1,
    max_samples=0.9,
    verbose=1,
    parsimony_coefficient=0.01,
    random_state=42,
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos'),
)

symbolic_reg.fit(X_train_scaled, y_train)

# Make predictions.
y_pred_train = symbolic_reg.predict(X_train_scaled)
y_pred_test = symbolic_reg.predict(X_test_scaled)

# Evaluate the model.
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)

# Format the metrics once so the console output and the saved report agree.
metric_lines = [
    f"Train RMSE: {train_rmse:.2f}",
    f"Test RMSE: {test_rmse:.2f}",
    f"Train R² Score: {train_r2:.4f}",
    f"Test R² Score: {test_r2:.4f}",
]
for metric_line in metric_lines:
    print(metric_line)

# Display the learned expression (gplearn keeps the fitted program in the
# ``_program`` attribute).
best_program = symbolic_reg._program
print("\nBest symbolic expression:")
print(best_program)

# Plot actual vs predicted values; the dashed red diagonal marks a perfect
# prediction.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_test, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel(f'Actual {target_col}')
plt.ylabel(f'Predicted {target_col}')
plt.title('Actual vs Predicted Values')
plt.savefig('symbolic_regression_results.png')
plt.show()

# Save the model expression and metrics to a file. UTF-8 is set explicitly
# because the report contains the non-ASCII character "²".
with open('symbolic_regression_formula.txt', 'w', encoding='utf-8') as f:
    f.write(str(best_program))
    f.write('\n\nModel Performance:\n')
    f.write("".join(f"{metric_line}\n" for metric_line in metric_lines))