diff --git a/Working Models/Linear_training.py b/Working Models/Linear_training.py
index 1547e87b2cbf5decd2b2723c057010adc9a423d2..eb3d013719870347cda26e55c11063c8c659e2ce 100644
--- a/Working Models/Linear_training.py
+++ b/Working Models/Linear_training.py
@@ -90,7 +90,6 @@ class PiecewiseLinearRegressor:
         self._program = "\n".join(model_str)
 
     def get_model_params(self):
-        """Export model parameters for later use"""
         model_params = {
             "feature_count": self.linear_models[list(self.linear_models.keys())[0]].coef_.shape[0],
             "segments": {}
@@ -285,8 +284,6 @@ def main():
 
         # Restore original stderr before printing
         sys.stderr = original_stderr
-        print(f"Model parameters saved to: {model_path}")
-        print(f"Training results saved to: {output_path}")
 
     finally:
         # Restore original stderr and close the null file
diff --git a/Working Models/predict_from_json.py b/Working Models/predict_from_json.py
new file mode 100644
index 0000000000000000000000000000000000000000..d02afffc9422c137d2fcab9c83127f212c72eed8
--- /dev/null
+++ b/Working Models/predict_from_json.py
@@ -0,0 +1,65 @@
+import os
+import sys
+import re
+import json
+import pandas as pd
+import numpy as np
+from joblib import load
+from sklearn.metrics import mean_squared_error, r2_score
+
+# Path setup: the CSV to score is passed as the second CLI argument
+script_dir = os.path.dirname(os.path.abspath(__file__))
+csv_path = sys.argv[2]
+model_path = os.path.join(script_dir, "symbolic_model.pkl")
+scaler_path = os.path.join(script_dir, "symbolic_scaler.pkl")
+imputer_path = os.path.join(script_dir, "symbolic_imputer.pkl")
+meta_path = os.path.join(script_dir, "symbolic_model_meta.json")
+
+# Load models
+model = load(model_path)
+scaler = load(scaler_path)
+imputer = load(imputer_path)
+
+# Load metadata
+with open(meta_path, "r") as f:
+    metadata = json.load(f)
+target_col = metadata["target_column"]
+
+# Load and preprocess data
+df = pd.read_csv(csv_path)
+X = df.drop(columns=[target_col]) if target_col in df.columns else df.copy()
+
+# Convert time-period strings (e.g. "5 months") to numbers; note this keeps the
+# bare number rather than converting to days as train_symbolic_model.py does
+for col in X.select_dtypes(include=['object']).columns:
+    if col == 'Injury_Prognosis' or any(re.search(r'\d+\s*(?:day|week|month|year)', str(v)) for v in X[col].dropna().iloc[:10]):
+        X[col] = X[col].fillna("0").apply(lambda v: float(re.findall(r'\d+', str(v))[0]) if re.findall(r'\d+', str(v)) else 0)
+    else:
+        X[col] = X[col].fillna("MISSING_VALUE").astype("category").cat.codes
+
+X = X.select_dtypes(include=[np.number])
+X = pd.DataFrame(imputer.transform(X), columns=X.columns)
+X_scaled = scaler.transform(X)
+
+# Predict
+predictions = model.predict(X_scaled)
+
+# Save output
+output_df = df.copy()
+output_df[f"Predicted_{target_col}"] = predictions
+output_path = os.path.join(script_dir, "new_data_for_prediction_predictions.csv")
+output_df.to_csv(output_path, index=False)
+
+# Optional evaluation when the target column is present
+if target_col in df.columns:
+    y_true = df[target_col].values
+    y_pred = np.array(predictions)
+
+    mask = (~np.isnan(y_true)) & (~np.isnan(y_pred))
+    if np.sum(mask) > 0:
+        rmse = np.sqrt(mean_squared_error(y_true[mask], y_pred[mask]))
+        r2 = r2_score(y_true[mask], y_pred[mask])
+        print(f"RMSE: {rmse:.2f}")
+        print(f"R2: {r2:.4f}")
+    else:
+        print("No rows with both true and predicted values; skipping evaluation.")
diff --git a/Working Models/symbolic_model.pkl b/Working Models/symbolic_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..807ac7ef53533f9d02e7d1ff76767be2f2038941
Binary files /dev/null and b/Working Models/symbolic_model.pkl differ
diff --git a/Working Models/symbolic_scaler.pkl b/Working Models/symbolic_scaler.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..fcfce15c7b0d54035cd13f9d2b8c4a5854172a37
Binary files /dev/null and b/Working Models/symbolic_scaler.pkl differ
diff --git a/Working Models/train_symbolic_model.py b/Working Models/train_symbolic_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aec09d78403f03b6c48ae16df0c2db8b035310d
--- /dev/null
+++ b/Working Models/train_symbolic_model.py
@@ -0,0 +1,140 @@
+import os
+import sys
+import json
+import re
+import pandas as pd
+import numpy as np
+from gplearn.genetic import SymbolicRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.impute import SimpleImputer
+from joblib import dump
+
+# Path construction: argv[1] is the target column, argv[2] the CSV path
+script_dir = os.path.dirname(os.path.abspath(__file__))
+file_path = sys.argv[2]
+target_col = sys.argv[1]
+
+# Load data
+data = pd.read_csv(file_path)
+
+X = data.drop(target_col, axis=1)
+y = data[target_col]
+
+# Function to convert time periods to number of days
+def convert_time_period(value):
+    if pd.isna(value):
+        return np.nan
+
+    try:
+        # Handle numeric values
+        if isinstance(value, (int, float)):
+            return value
+
+        # Convert string to lowercase for consistency
+        value = str(value).lower()
+
+        # Extract number and unit
+        match = re.search(r'(\d+)\s*(\w+)', value)
+        if not match:
+            # Try to extract just a number
+            number_match = re.search(r'(\d+)', value)
+            if number_match:
+                return int(number_match.group(1))
+            return np.nan
+
+        number = int(match.group(1))
+        unit = match.group(2)
+
+        # Convert to days
+        if 'day' in unit:
+            return number
+        elif 'week' in unit:
+            return number * 7
+        elif 'month' in unit:
+            return number * 30
+        elif 'year' in unit:
+            return number * 365
+        else:
+            # If the unit is not recognized, just return the number
+            return number
+    except Exception:
+        return np.nan
+
+# Categorize columns by data type
+categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
+
+# Process each categorical column appropriately
+for col in categorical_cols:
+    # Check if the column contains time periods (e.g., "5 months")
+    if col == 'Injury_Prognosis' or any(re.search(r'\d+\s*(?:day|week|month|year)', str(v)) for v in X[col].dropna().iloc[:10]):
+        X[col] = X[col].apply(convert_time_period)
+        # Fill missing values with the median after conversion
+        X[col] = X[col].fillna(X[col].median())
+    else:
+        # For regular categorical variables, use label encoding with a special category for missing values
+        # First, fill NaN with a placeholder string
+        X[col] = X[col].fillna("MISSING_VALUE")
+        # Then apply label encoding
+        le = LabelEncoder()
+        X[col] = le.fit_transform(X[col])
+
+# Drop non-numeric leftovers
+X = X.select_dtypes(include=[np.number])
+
+# Impute missing values
+imputer = SimpleImputer(strategy='median')
+X_clean = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
+
+# Handle missing targets
+if y.isna().sum() > 0:
+    mask = y.notna()
+    X_clean = X_clean[mask]
+    y = y[mask]
+
+# Scale the data
+scaler = StandardScaler()
+X_scaled = scaler.fit_transform(X_clean)
+
+# Save preprocessing objects
+dump(scaler, os.path.join(script_dir, "symbolic_scaler.pkl"))
+dump(imputer, os.path.join(script_dir, "symbolic_imputer.pkl"))
+
+# Train regressor
+X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
+
+model = SymbolicRegressor(
+    population_size=1000,
+    generations=50,
+    tournament_size=20,
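+    # The evolution probabilities below must sum to <= 1 in gplearn; the
+    # remaining 0.05 is plain reproduction of unchanged tournament winners.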
+    p_crossover=0.7,
+    p_subtree_mutation=0.1,
+    p_hoist_mutation=0.05,
+    p_point_mutation=0.1,
+    max_samples=0.9,
+    verbose=1,
+    parsimony_coefficient=0.01,
+    random_state=42,
+    function_set=('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos')
+)
+model.fit(X_train, y_train)
+
+# Evaluate
+y_pred = model.predict(X_test)
+rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)
+r2 = round(r2_score(y_test, y_pred), 4)
+
+# Save model
+dump(model, os.path.join(script_dir, "symbolic_model.pkl"))
+
+# Save model metadata for reference
+with open(os.path.join(script_dir, "symbolic_model_meta.json"), "w") as f:
+    json.dump({
+        "target_column": target_col,
+        "rmse": rmse,
+        "r2": r2,
+        "expression": str(model._program)
+    }, f, indent=4)
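--
Usage note (not part of the patch): both scripts read their inputs from the
command line. train_symbolic_model.py takes the target column as argv[1] and
the CSV path as argv[2]; predict_from_json.py only reads argv[2] but still
expects a placeholder in argv[1]. A minimal smoke-test sketch, assuming a
hypothetical target column and dataset path (adjust both to your data):

    # smoke_test.py -- TARGET and CSV are illustrative placeholders
    import subprocess
    import sys

    TARGET = "SettlementValue"  # assumption: your dataset's target column
    CSV = "claims.csv"          # assumption: path to your dataset

    # Train: fits the SymbolicRegressor and writes the .pkl/.json artifacts
    subprocess.run([sys.executable, "Working Models/train_symbolic_model.py", TARGET, CSV], check=True)
    # Predict: loads those artifacts and writes *_predictions.csv next to the scripts
    subprocess.run([sys.executable, "Working Models/predict_from_json.py", TARGET, CSV], check=True)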