"""Fit a gplearn symbolic regressor to a CSV dataset and print its metrics.

Usage: python symbolicRegressor.py <target_column> <csv_path>
"""
import os
import sys
import pandas as pd
import numpy as np
import re
from gplearn.genetic import SymbolicRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import seaborn as sns

### IMPORTANT: Be careful what these model scripts print: everything written
### to stdout is stored as the script's response.

# Load the data.
# Both command-line arguments are required (argv[1] = target column,
# argv[2] = CSV path). Fail early with a usage message instead of a bare
# IndexError; sys.exit(str) writes the message to stderr and exits with
# status 1, so stdout stays clean.
if len(sys.argv) < 3:
    sys.exit(f"Usage: {os.path.basename(sys.argv[0])} <target_column> <csv_path>")

file_path = sys.argv[2]
data = pd.read_csv(file_path)

# TODO: make this work with arbitrary CSV files (e.g. ask the user which
# column is the target).
# ---------------------------------------------------------------------------
# Feature/target preparation, model training and evaluation.
#
# Reads the `data` frame loaded earlier in this script and the target column
# name from sys.argv[1]. Everything printed to stdout is stored as the
# script's response, so diagnostics in this section go to stderr.
# ---------------------------------------------------------------------------
target_col = sys.argv[1]
if target_col not in data.columns:
    # sys.exit(str) reports on stderr and exits with status 1.
    sys.exit(f"Target column '{target_col}' not found in the input data")

X = data.drop(columns=[target_col])
y = data[target_col]

# A (possibly decimal) number followed by a unit made of letters only.
# The unit must not be allowed to contain digits: with `\w+` the string "12"
# would backtrack into number "1" + unit "2" and be parsed as 1.
_NUMBER_WITH_UNIT_RE = re.compile(r'(\d+(?:\.\d+)?)\s*([^\W\d_]+)')
# A bare (possibly decimal) number, used when no unit follows.
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?')
# Detects time-period strings such as "5 months"; case-insensitive so that
# "5 Months" is recognised as well.
_TIME_PERIOD_RE = re.compile(r'\d+\s*(?:day|week|month|year)', re.IGNORECASE)
# Approximate number of days per unit. Matched by substring, in this order.
_DAYS_PER_UNIT = (('day', 1), ('week', 7), ('month', 30), ('year', 365))


def _parse_number(text):
    """Return *text* as an int, or as a float when it has a decimal point."""
    return float(text) if '.' in text else int(text)


def convert_time_period(value):
    """Convert a time period such as "5 months" to a number of days.

    Args:
        value: A number, a string containing a number with an optional unit
            (day/week/month/year, singular or plural, any case), or a
            missing value.

    Returns:
        * NaN for missing values and for strings without any number.
        * Numeric inputs unchanged.
        * The number scaled by 1, 7, 30 or 365 for day, week, month or year
          units. Months and years are approximations.
        * The bare number when there is no unit or the unit is unknown.
    """
    if pd.isna(value):
        return np.nan

    try:
        # Numbers (including NumPy scalars) are returned unchanged.
        if isinstance(value, (int, float, np.number)):
            return value

        # Lower-case so unit matching is case-insensitive.
        text = str(value).lower()

        match = _NUMBER_WITH_UNIT_RE.search(text)
        if match is None:
            # No unit present: fall back to the first number, if any.
            bare = _NUMBER_RE.search(text)
            return _parse_number(bare.group()) if bare else np.nan

        number = _parse_number(match.group(1))
        unit = match.group(2)
        for unit_name, days in _DAYS_PER_UNIT:
            if unit_name in unit:
                return number * days
        # Unrecognised unit: keep the bare number.
        return number
    except Exception as exc:
        # Best effort: report on stderr (stdout is the stored response) and
        # treat the value as missing.
        print(f"Error converting '{value}': {exc}", file=sys.stderr)
        return np.nan


# Encode every object-typed column as numbers.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

for col in categorical_cols:
    # A column is treated as a time period when it is the known
    # 'Injury_Prognosis' column or one of its first 20 non-missing values
    # looks like "<number> <unit>".
    sample = X[col].dropna().iloc[:20]
    is_time_period = col == 'Injury_Prognosis' or any(
        _TIME_PERIOD_RE.search(str(val)) for val in sample
    )

    if is_time_period:
        # Missing or unparseable durations stay NaN here; the median imputer
        # below fills them using statistics from the training split only.
        X[col] = pd.to_numeric(X[col].apply(convert_time_period), errors='coerce')
    else:
        # Label-encode, with a dedicated category for missing values.
        # Assign the result back instead of using a chained
        # `fillna(inplace=True)`, which does not update X under pandas
        # Copy-on-Write. `astype(str)` keeps mixed-type object columns
        # sortable for LabelEncoder.
        filled = X[col].fillna("MISSING_VALUE").astype(str)
        X[col] = LabelEncoder().fit_transform(filled)

# Keep numeric feature columns only; anything else cannot be fed to the model.
X = X.select_dtypes(include=[np.number])
# Columns without a single observed value carry no information and cannot be
# median-imputed.
X = X.dropna(axis=1, how='all')

# Rows with a missing target cannot be used for training or evaluation.
has_target = y.notna()
X_clean = X.loc[has_target]
y_clean = y.loc[has_target]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)

# Impute and scale. Both steps are fitted on the training split only, so no
# information from the test split leaks into training.
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

# Configure and train the symbolic regressor
symbolic_reg = SymbolicRegressor(
    population_size=2000,
    generations=30,
    tournament_size=20,
    p_crossover=0.7,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.05,
    p_point_mutation=0.1,
    max_samples=0.8,
    verbose=0,  # keep stdout clean: it is stored as the response
    parsimony_coefficient=0.05,
    random_state=42,
    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos')
)

symbolic_reg.fit(X_train_scaled, y_train)

# Make predictions
y_pred_train = symbolic_reg.predict(X_train_scaled)
y_pred_test = symbolic_reg.predict(X_test_scaled)

# Evaluate the model
train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
train_r2 = r2_score(y_train, y_pred_train)
# R² on the held-out split completes the set of evaluation metrics.
test_r2 = r2_score(y_true=y_test, y_pred=y_pred_test)

### OUTPUTS
# Everything printed below becomes the stored response of this script.

report_lines = (
    f"Train RMSE: {train_rmse:.2f}",
    f"Test RMSE: {test_rmse:.2f}",
    f"Train R² Score: {train_r2:.4f}",
    f"Test R² Score: {test_r2:.4f}",
)
for report_line in report_lines:
    print(report_line)

# Show the expression the regressor ended up with.
print("\nBest symbolic expression:")
print(symbolic_reg._program)