diff --git a/README.md b/README.md
index 60e188c2c039ca377cb544e48829637a546cdc35..24715bdc17cf86b9d9f0e805090953d3262152c5 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,82 @@
 <h3>DESD Group Project - ShallowSink </h3>
 <h2>Group Members</h2>
-<p>Jack McDaid - 21023808<br>
+<p>
+
+Jack McDaid - 21023808<br>
 Charlie Taylor - 21024059<br>
 Aaron Jones - 21035135<br>
 James Burt - 20016437<br>
+Michael Cook - 21020078<br>
+
+</p>
+<h2>Setting it up</h2>
+<p>
+
+To set the system up, you need a Python IDE and Docker Desktop. Clone the Git repository and navigate a terminal to "ActualProjectCode/DjangoProject". Make sure Docker Desktop is running, then run "docker compose build". This may take a while. After it has finished, run these two commands in order:
+
+docker compose run --rm web sh -c "python manage.py makemigrations"
+docker compose run --rm web sh -c "python manage.py migrate"
+
+This runs makemigrations and migrate inside the Docker container(s), updating the PostgreSQL database held in the Docker Desktop volume.
+
+To start the project, use this command in the same terminal:
+
+docker compose up --build
+
+From here, it will come up on localhost, or on your selected hosting service (AWS, Azure, etc.).
+
+</p>
+<h2>Docker Basics</h2>
+<p>
+
+In Docker there are two main screens you will use: Containers and Volumes.
+
+'Containers' lists the active and inactive containers of the project.
+'Volumes' gives access to the data stored in the PostgreSQL database.
+
+If the system goes down, the data is cached, as are the installed Python libraries, so bringing it back up won't take as long as the first run.
+
+</p>
+<h2>Creating a Superuser</h2>
+<p>
+
+On first startup there are no users or profiles. In the terminal of your Python IDE, run:
+
+docker compose run --rm web sh -c "python manage.py createsuperuser"
+
+This creates a superuser, which is needed to initialise the first admin. This is a flaw in our system, but after this step the Django admin screen is no longer used.
+
+To initialise the admin, go to the host/admin page and log in with the superuser's details. Head to 'Profiles' and change the role of a created user to 'Admin'.
+
+</p>
+<h2>Uploading CSV Format</h2>
+<p>
+
+Users can upload CSV files, along with a target column, to get predictions based on the data and the model they've chosen. A User, AI Engineer, or Administrator can choose a local file to upload, provide the target column (which must match a column in the file), and choose an ML model to produce a prediction.
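+
+For illustration, an upload with target column "SettlementValue" might look like the sample below (apart from Injury_Prognosis, which the bundled model recognises, these column names are invented for the example):
+
+    Injury_Prognosis,Vehicle_Age,SettlementValue
+    6 months,4,5200.50
+    2 weeks,9,750.00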
+
+In return, an invoice is created, and the prediction of the final settlement is returned as a file (CSV, txt, etc.).
+
+Once an invoice has been created, an Administrator or a member of the Finance Team can approve or deny it.
+
+</p>
+
+<h2>Adding New Models</h2>
+<p>
+
+To add a new ML model, the AI Engineer needs to check a few things beforehand.
+First, make sure the new model reads its target column from sys.argv[1] and its input file path from sys.argv[2].
+
+Next, ensure the model writes a txt or csv file containing the data to be shown in the system, and then prints the path of the file it created; whatever the script prints is stored as its response. Using this piece of code, you can do this:
+
+    input_path = file_path
+    input_dir = os.path.dirname(input_path)
+    input_filename = os.path.basename(input_path)
+    input_name = os.path.splitext(input_filename)[0]
+    output_path = os.path.join(input_dir, f"{input_name}_predictions.csv")
+
+    result_df.to_csv(output_path, index=False)  # This can be a txt file, but in this example it's a csv written from a pandas DataFrame.
+    print(output_path)
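+
+Putting this together, here is a minimal sketch of a complete model script that follows the contract (the LinearRegression model is only a placeholder; substitute your own):
+
+    import os
+    import sys
+    import pandas as pd
+    from sklearn.linear_model import LinearRegression
+
+    target_col = sys.argv[1]  # target column name, supplied by the system
+    file_path = sys.argv[2]   # path of the uploaded csv, supplied by the system
+
+    data = pd.read_csv(file_path).dropna()  # a sketch: a real model should impute instead
+    X = data.drop(target_col, axis=1).select_dtypes("number")  # numeric features only, for simplicity
+    y = data[target_col]
+
+    model = LinearRegression().fit(X, y)
+    result_df = pd.DataFrame({"prediction": model.predict(X)})
+
+    input_name = os.path.splitext(os.path.basename(file_path))[0]
+    output_path = os.path.join(os.path.dirname(file_path), f"{input_name}_predictions.csv")
+    result_df.to_csv(output_path, index=False)
+    print(output_path)  # the only print: the system reads this as the response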
+
+Once a model has been created, it can be deleted by the AI Engineer.
 </p>
\ No newline at end of file
diff --git a/Working Models/linearPiecewise.py b/Working Models/linearPiecewise.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a35ae6141d2c8740ce0142d16cf06b34864f99e
--- /dev/null
+++ b/Working Models/linearPiecewise.py
@@ -0,0 +1,250 @@
+import os
+import sys
+import pandas as pd
+import numpy as np
+import re
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.linear_model import LinearRegression
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.impute import SimpleImputer
+
+
+### IMPORTANT: Be careful what these models print; stdout is stored as the model's response, so the only print should be the output file path.
+
+class PiecewiseLinearRegressor:
+    def __init__(self, max_depth=5, min_samples_leaf=20):
+        self.max_depth = max_depth
+        self.min_samples_leaf = min_samples_leaf
+        self.tree = None
+        self.linear_models = {}
+        self.leaf_ids = None
+        self._program = None  # Human-readable representation of the fitted model
+
+    def fit(self, X, y):
+        # First, use a decision tree to partition the feature space
+        self.tree = DecisionTreeRegressor(
+            max_depth=self.max_depth,
+            min_samples_leaf=self.min_samples_leaf,
+            random_state=42
+        )
+        self.tree.fit(X, y)
+
+        # Get leaf node assignments for each sample
+        self.leaf_ids = self.tree.apply(X)
+
+        # Fit a linear model within each leaf
+        unique_leaves = np.unique(self.leaf_ids)
+        for leaf_id in unique_leaves:
+            mask = self.leaf_ids == leaf_id
+            if np.sum(mask) > 1:  # Ensure we have enough samples to fit a line
+                leaf_model = LinearRegression()
+                leaf_model.fit(X[mask], y[mask])
+                self.linear_models[leaf_id] = leaf_model
+
+        # Generate a readable representation of the model
+        self._create_program_representation(X)
+        return self
+
+    def predict(self, X):
+        leaf_ids = self.tree.apply(X)
+        # Start from the tree's own predictions so that any leaf without a
+        # fitted linear model still returns a sensible value (rather than 0)
+        predictions = self.tree.predict(X)
+
+        for leaf_id in self.linear_models:
+            mask = leaf_ids == leaf_id
+            if np.sum(mask) > 0:
+                predictions[mask] = self.linear_models[leaf_id].predict(X[mask])
+
+        return predictions
+
+    def _create_program_representation(self, X):
+        if len(self.linear_models) == 0:
+            self._program = "No valid model could be created"
+            return
+
+        model_str = []
+        model_str.append("Piecewise Linear Model with the following segments:")
+
+        # Sort leaf IDs for consistent output
+        sorted_leaves = sorted(self.linear_models.keys())
+
+        for i, leaf_id in enumerate(sorted_leaves):
+            linear_model = self.linear_models[leaf_id]
+            coefs = linear_model.coef_
+            intercept = linear_model.intercept_
+
+            segment_str = f"\nSegment {i + 1} (Leaf {leaf_id}):"
+
+            # Add the linear equation for this segment
+            equation = f"y = {intercept:.4f}"
+            for j, coef in enumerate(coefs):
+                if j < X.shape[1]:  # Ensure we don't go out of bounds
+                    if coef >= 0:
+                        equation += f" + {coef:.4f} * x{j + 1}"
+                    else:
+                        equation += f" - {abs(coef):.4f} * x{j + 1}"
+
+            segment_str += f"\n {equation}"
+            model_str.append(segment_str)
+
+        self._program = "\n".join(model_str)
+
+
+# Convert time periods such as "5 months" to a number of days
+def convert_time_period(value):
+    if pd.isna(value):
+        return np.nan
+
+    try:
+        # Handle numeric values
+        if isinstance(value, (int, float)):
+            return value
+
+        # Convert string to lowercase for consistency
+        value = str(value).lower()
+
+        # Extract the number and unit
+        match = re.search(r'(\d+)\s*(\w+)', value)
+        if not match:
+            # Try to extract just a number
+            number_match = re.search(r'(\d+)', value)
+            if number_match:
+                return int(number_match.group(1))
+            return np.nan
+
+        number = int(match.group(1))
+        unit = match.group(2)
+
+        # Convert to days
+        if 'day' in unit:
+            return number
+        elif 'week' in unit:
+            return number * 7
+        elif 'month' in unit:
+            return number * 30
+        elif 'year' in unit:
+            return number * 365
+        else:
+            # If the unit is not recognised, just return the number
+            return number
+    except Exception:
+        # Silently treat unparseable values as missing
+        return np.nan
+
+
+# Load the data; the system passes the target column as argv[1] and the csv path as argv[2]
+file_path = sys.argv[2]
+data = pd.read_csv(file_path)
+
+target_col = sys.argv[1]
+X = data.drop(target_col, axis=1)
+y = data[target_col]
+
+# Categorise columns by data type
+numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
+categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
+
+# Process each categorical column appropriately
+for col in categorical_cols:
+    # Check whether the column contains time periods (e.g., "5 months")
+    if col == 'Injury_Prognosis' or any(
+            re.search(r'\d+\s*(?:day|week|month|year)', str(val)) for val in X[col].dropna().iloc[:20]):
+        X[col] = X[col].apply(convert_time_period)
+        # Fill missing values with the median after conversion
+        X[col] = X[col].fillna(X[col].median())
+    else:
+        # For regular categorical variables, use label encoding with a
+        # special category for missing values
+        X[col] = X[col].fillna("MISSING_VALUE")
+        le = LabelEncoder()
+        X[col] = le.fit_transform(X[col])
+
+# Drop any remaining non-numeric columns
+non_numeric_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
+if non_numeric_cols:
+    X = X.drop(columns=non_numeric_cols)
+
+# Check for missing values in the target column
+target_missing = y.isna().sum()
+
+# Handle missing feature values with imputation instead of dropping rows
+num_imputer = SimpleImputer(strategy='median')
+X_imputed = pd.DataFrame(num_imputer.fit_transform(X), columns=X.columns)
+
+# Drop rows whose target is missing (if any)
+if target_missing > 0:
+    mask = y.notna()
+    X_imputed = X_imputed[mask]
+    y_clean = y[mask]
+else:
+    y_clean = y.copy()
+
+# Redefine X with the imputed data
+X_clean = X_imputed
+
+# Split the data
+X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)
+
+# Scale the features
+scaler = StandardScaler()
+X_train_scaled = scaler.fit_transform(X_train)
+X_test_scaled = scaler.transform(X_test)
+
+# Configure and train the piecewise linear model
+# These parameters can be tuned to adjust the model's complexity
+piecewise_model = PiecewiseLinearRegressor(
+    max_depth=5,          # Controls the number of segments
+    min_samples_leaf=20   # Minimum samples in each segment
+)
+
+piecewise_model.fit(X_train_scaled, y_train)
+
+# Make predictions
+y_pred_train = piecewise_model.predict(X_train_scaled)
+y_pred_test = piecewise_model.predict(X_test_scaled)
+
+# Evaluate the model
+train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
+test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
+train_r2 = r2_score(y_train, y_pred_train)
+test_r2 = r2_score(y_test, y_pred_test)
+
+
+results = {
+    "Train RMSE": f"{train_rmse:.2f}",
+    "Test RMSE": f"{test_rmse:.2f}",
+    "Train R² Score": f"{train_r2:.4f}",
+    "Test R² Score": f"{test_r2:.4f}",
+    "Model": piecewise_model._program
+}
+
+if __name__ == "__main__":
+    # Write the results next to the input file and print the output path;
+    # the printed path is what the system stores as this model's response
+    input_path = file_path
+    input_dir = os.path.dirname(input_path)
+    input_filename = os.path.basename(input_path)
+    input_name = os.path.splitext(input_filename)[0]
+
+    output_path = os.path.join(input_dir, f"{input_name}_predictions.txt")
+
+    with open(output_path, "w") as f:
+        f.write(f"Train RMSE: {results['Train RMSE']}\n")
+        f.write(f"Test RMSE: {results['Test RMSE']}\n")
+        f.write(f"Train R² Score: {results['Train R² Score']}\n")
+        f.write(f"Test R² Score: {results['Test R² Score']}\n")
+        f.write("\nBest piecewise linear model:\n")
+        f.write(str(results['Model']))
+
+    print(output_path)
\ No newline at end of file