From 1c59ed05cb83c331d60c54b47ba8d2b18ad3cb09 Mon Sep 17 00:00:00 2001
From: s2-gado <shekwoyeyilo2.gado@live.uwe.ac.uk>
Date: Sun, 2 Mar 2025 20:56:36 +0000
Subject: [PATCH] Upload New File

---
 GA2.ipynb | 405 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 405 insertions(+)
 create mode 100644 GA2.ipynb

diff --git a/GA2.ipynb b/GA2.ipynb
new file mode 100644
index 0000000..1131f01
--- /dev/null
+++ b/GA2.ipynb
@@ -0,0 +1,405 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Genetic Algorithm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import random\n",
+    "from scipy.stats import pearsonr\n",
+    "from deap import base, creator, tools, algorithms\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.model_selection import cross_val_score\n",
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "from sklearn.base import BaseEstimator\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the data, drop the DoctorInCharge and PatientID columns, then split features/target\n",
+    "a_df = pd.read_csv('alzheimers_disease_data.csv')\n",
+    "a_df = a_df.drop(columns=['DoctorInCharge', 'PatientID'])  # reassign: an inplace drop returns None\n",
+    "X = a_df.drop('Diagnosis', axis = 1)\n",
+    "y = a_df['Diagnosis']\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#defining the fitness function\n",
+    "def fitness_function(individual):\n",
+    "    \"\"\"Score a binary feature mask as (mean |Pearson r| with y_train, mean 3-fold CV accuracy).\n",
+    "\n",
+    "    A mask that selects no feature is penalised with (-1.0, -1.0).\n",
+    "    \"\"\"\n",
+    "    selected_indices = [i for i, val in enumerate(individual) if val == 1]\n",
+    "\n",
+    "    if not selected_indices:\n",
+    "        # One penalty value per objective: the fitness is created with two weights\n",
+    "        return -1.0, -1.0\n",
+    "\n",
+    "    # Get selected features\n",
+    "    selected_features = X_train.iloc[:, selected_indices]\n",
+    "    target = y_train.values\n",
+    "    # Absolute Pearson correlation of every selected feature with the target\n",
+    "    correlation = [abs(pearsonr(column.values, target)[0])\n",
+    "                   for _, column in selected_features.items()]\n",
+    "    mean_correlation = np.mean(correlation)\n",
+    "\n",
+    "    # cross_val_score fits its own clone of the estimator per fold, so no separate fit is needed\n",
+    "    model = DecisionTreeClassifier(random_state=42)\n",
+    "    scores = cross_val_score(model, selected_features, y_train, cv=3, scoring='accuracy')\n",
+    "    mean_accuracy = np.mean(scores)\n",
+    "\n",
+    "    return mean_correlation, mean_accuracy\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Using DEAP to create Genetic Algorithm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\s2-gado\\AppData\\Roaming\\Python\\Python312\\site-packages\\deap\\creator.py:185: RuntimeWarning: A class named 'FitnessMax' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.\n",
+      "  warnings.warn(\"A class named '{0}' has already been created and it \"\n",
+      "C:\\Users\\s2-gado\\AppData\\Roaming\\Python\\Python312\\site-packages\\deap\\creator.py:185: RuntimeWarning: A class named 'Individual' has already been created and it will be overwritten. Consider deleting previous creation of that class or rename it.\n",
+      "  warnings.warn(\"A class named '{0}' has already been created and it \"\n"
+     ]
+    }
+   ],
+   "source": [
+    "#setting up the GA (guards keep a cell re-run from re-creating the classes and triggering DEAP's overwrite warning)\n",
+    "if not hasattr(creator, \"FitnessMax\"):  # NOTE: restart the kernel after changing the weights\n",
+    "    creator.create(\"FitnessMax\", base.Fitness, weights=(1.0,1.0))  # Maximize both objectives\n",
+    "if not hasattr(creator, \"Individual\"):\n",
+    "    creator.create(\"Individual\", list, fitness=creator.FitnessMax)\n",
+    "\n",
+    "toolbox = base.Toolbox()\n",
+    "toolbox.register(\"attr_bool\", random.randint, 0, 1)  # Binary encoding (0 = ignore, 1 = select)\n",
+    "toolbox.register(\"individual\", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X.columns))\n",
+    "toolbox.register(\"population\", tools.initRepeat, list, toolbox.individual)\n",
+    "toolbox.register(\"mate\", tools.cxTwoPoint)  # Crossover\n",
+    "toolbox.register(\"mutate\", tools.mutFlipBit, indpb=0.05)  # Per-bit flip probability\n",
+    "toolbox.register(\"select\", tools.selNSGA2)  # Multi-objective selection (Non-dominated Sorting Genetic Algorithm II)\n",
+    "toolbox.register(\"evaluate\", fitness_function)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# --- Define Main GA Function ---\n",
+    "def main(pop_size, generations, cxpb, mutpb):\n",
+    "    \"\"\"Run the feature-selection GA once and summarise its first Pareto front.\n",
+    "\n",
+    "    Args:\n",
+    "        pop_size: number of individuals kept in the population.\n",
+    "        generations: number of generations to evolve.\n",
+    "        cxpb: probability of mating each adjacent pair of offspring.\n",
+    "        mutpb: probability of mutating each offspring.\n",
+    "\n",
+    "    Returns:\n",
+    "        (best_corr_value, best_acc_value, combined_features): the highest\n",
+    "        correlation and the highest accuracy found on the first\n",
+    "        non-dominated front, plus the union of the feature names selected\n",
+    "        by the two individuals that achieved them.\n",
+    "    \"\"\"\n",
+    "    population = toolbox.population(n=pop_size)\n",
+    "\n",
+    "    # Evaluate initial population\n",
+    "    for ind, fit in zip(population, map(toolbox.evaluate, population)):\n",
+    "        ind.fitness.values = fit\n",
+    "\n",
+    "    for _ in range(generations):\n",
+    "        # With k == len(population) selNSGA2 drops nobody; it only orders the parents\n",
+    "        offspring = toolbox.select(population, len(population))\n",
+    "        offspring = list(map(toolbox.clone, offspring))\n",
+    "\n",
+    "        # Apply crossover and mutation\n",
+    "        for child1, child2 in zip(offspring[::2], offspring[1::2]):\n",
+    "            if random.random() < cxpb:\n",
+    "                toolbox.mate(child1, child2)\n",
+    "                del child1.fitness.values\n",
+    "                del child2.fitness.values\n",
+    "\n",
+    "        for mutant in offspring:\n",
+    "            if random.random() < mutpb:\n",
+    "                toolbox.mutate(mutant)\n",
+    "                del mutant.fitness.values\n",
+    "\n",
+    "        # Evaluate only the offspring whose fitness was invalidated\n",
+    "        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]\n",
+    "        for ind, fit in zip(invalid_ind, map(toolbox.evaluate, invalid_ind)):\n",
+    "            ind.fitness.values = fit\n",
+    "\n",
+    "        # Elitist replacement: parents compete with their offspring, so a\n",
+    "        # non-dominated solution that was already found cannot be lost.\n",
+    "        population = toolbox.select(population + offspring, k=pop_size)\n",
+    "\n",
+    "    # Front 1 contains the best non-dominated solutions\n",
+    "    best_front = tools.sortNondominated(population, len(population))[0]\n",
+    "\n",
+    "    # fitness.values[0]: correlation score, fitness.values[1]: accuracy score\n",
+    "    best_corr_individual = max(best_front, key=lambda ind: ind.fitness.values[0])\n",
+    "    best_acc_individual = max(best_front, key=lambda ind: ind.fitness.values[1])\n",
+    "\n",
+    "    # Map every selected bit back to its column name in X_train\n",
+    "    best_corr_features = [col for col, bit in zip(X_train.columns, best_corr_individual) if bit == 1]\n",
+    "    best_acc_features = [col for col, bit in zip(X_train.columns, best_acc_individual) if bit == 1]\n",
+    "\n",
+    "    # Order-preserving union, so the reported feature list is deterministic\n",
+    "    combined_features = list(dict.fromkeys(best_corr_features + best_acc_features))\n",
+    "\n",
+    "    best_corr_value = best_corr_individual.fitness.values[0]\n",
+    "    best_acc_value = best_acc_individual.fitness.values[1]\n",
+    "    return best_corr_value, best_acc_value, combined_features\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "==================================================\n",
+      "Run 1:\n",
+      "Best Correlation Score: 0.1955\n",
+      "Best Accuracy Score: 0.9102\n",
+      "\n",
+      "Selected Features:\n",
+      " ['CardiovascularDisease', 'BehavioralProblems', 'SleepQuality', 'DifficultyCompletingTasks', 'DietQuality', 'PersonalityChanges', 'FamilyHistoryAlzheimers', 'DiastolicBP', 'CholesterolTriglycerides', 'MemoryComplaints', 'FunctionalAssessment', 'ADL', 'CholesterolHDL', 'MMSE', 'AlcoholConsumption']\n",
+      "================================================== \n",
+      "\n",
+      "==================================================\n",
+      "Run 2:\n",
+      "Best Correlation Score: 0.2241\n",
+      "Best Accuracy Score: 0.9136\n",
+      "\n",
+      "Selected Features:\n",
+      " ['Diabetes', 'BehavioralProblems', 'Confusion', 'FamilyHistoryAlzheimers', 'Gender', 'DiastolicBP', 'MemoryComplaints', 'FunctionalAssessment', 'ADL', 'Ethnicity', 'MMSE', 'Hypertension']\n",
+      "================================================== \n",
+      "\n",
+      "==================================================\n",
+      "Run 3:\n",
+      "Best Correlation Score: 0.2480\n",
+      "Best Accuracy Score: 0.9182\n",
+      "\n",
+      "Selected Features:\n",
+      " ['BehavioralProblems', 'Gender', 'MemoryComplaints', 'FunctionalAssessment', 'ADL', 'MMSE']\n",
+      "================================================== \n",
+      "\n",
+      "==================================================\n",
+      "Run 4:\n",
+      "Best Correlation Score: 0.2474\n",
+      "Best Accuracy Score: 0.9129\n",
+      "\n",
+      "Selected Features:\n",
+      " ['Diabetes', 'BehavioralProblems', 'DietQuality', 'EducationLevel', 'Gender', 'CholesterolTriglycerides', 'MemoryComplaints', 'CholesterolHDL', 'FunctionalAssessment', 'CholesterolTotal', 'ADL', 'MMSE']\n",
+      "================================================== \n",
+      "\n",
+      "==================================================\n",
+      "Run 5:\n",
+      "Best Correlation Score: 0.2140\n",
+      "Best Accuracy Score: 0.9122\n",
+      "\n",
+      "Selected Features:\n",
+      " ['BehavioralProblems', 'FunctionalAssessment', 'ADL', 'CholesterolHDL', 'Confusion', 'CardiovascularDisease', 'SleepQuality', 'HeadInjury', 'DietQuality', 'PersonalityChanges', 'PhysicalActivity', 'Disorientation', 'DiastolicBP', 'Ethnicity', 'Smoking', 'MMSE', 'FamilyHistoryAlzheimers', 'MemoryComplaints', 'AlcoholConsumption']\n",
+      "================================================== \n",
+      "\n",
+      "==================================================\n",
+      "Run 6:\n",
+      "Best Correlation Score: 0.2195\n",
+      "Best Accuracy Score: 0.9142\n",
+      "\n",
+      "Selected Features:\n",
+      " ['Diabetes', 'BehavioralProblems', 'SleepQuality', 'Confusion', 'PersonalityChanges', 'EducationLevel', 'MemoryComplaints', 'FunctionalAssessment', 'ADL', 'Ethnicity', 'MMSE', 'SystolicBP']\n",
+      "================================================== \n",
+      "\n",
+      "==================================================\n",
+      "Run 7:\n",
+      "Best Correlation Score: 0.2142\n",
+      "Best Accuracy Score: 0.9209\n",
+      "\n",
+      "Selected Features:\n",
+      " ['CardiovascularDisease', 'BehavioralProblems', 'SleepQuality', 'Forgetfulness', 'DietQuality', 'PhysicalActivity', 'Smoking', 'Disorientation', 'CholesterolTriglycerides', 'MemoryComplaints', 'FunctionalAssessment', 'ADL', 'MMSE', 'Hypertension']\n",
+      "================================================== \n",
+      "\n",
+      "==================================================\n",
+      "Run 8:\n",
+      "Best Correlation Score: 0.2152\n",
+      "Best Accuracy Score: 0.9156\n",
+      "\n",
+      "Selected Features:\n",
+      " ['BehavioralProblems', 'Disorientation', 'MemoryComplaints', 'FunctionalAssessment', 'ADL', 'MMSE', 'Hypertension']\n",
+      "================================================== \n",
+      "\n",
+      "==================================================\n",
+      "Run 9:\n",
+      "Best Correlation Score: 0.1906\n",
+      "Best Accuracy Score: 0.9182\n",
+      "\n",
+      "Selected Features:\n",
+      " ['Diabetes', 'BehavioralProblems', 'SleepQuality', 'DietQuality', 'Disorientation', 'Gender', 'MemoryComplaints', 'FunctionalAssessment', 'ADL', 'Ethnicity', 'MMSE', 'AlcoholConsumption']\n",
+      "================================================== \n",
+      "\n",
+      "==================================================\n",
+      "Run 10:\n",
+      "Best Correlation Score: 0.2151\n",
+      "Best Accuracy Score: 0.9169\n",
+      "\n",
+      "Selected Features:\n",
+      " ['BehavioralProblems', 'DifficultyCompletingTasks', 'SleepQuality', 'PersonalityChanges', 'DietQuality', 'Confusion', 'Forgetfulness', 'Gender', 'HeadInjury', 'MemoryComplaints', 'CholesterolTriglycerides', 'FunctionalAssessment', 'ADL', 'MMSE']\n",
+      "================================================== \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Repeat the GA several times and keep the best scores of every run\n",
+    "runs = 10\n",
+    "corr_values, acc_values = [], []\n",
+    "for i in range(runs):\n",
+    "    best_corr_value, best_acc_value, selected_features = main(\n",
+    "        pop_size=300, generations=300, cxpb=0.9, mutpb=0.2\n",
+    "    )\n",
+    "    corr_values.append(best_corr_value)\n",
+    "    acc_values.append(best_acc_value)\n",
+    "\n",
+    "    # Per-run report\n",
+    "    print(\"=\" * 50)\n",
+    "    print(f\"Run {i+1}:\")\n",
+    "    print(f\"Best Correlation Score: {best_corr_value:.4f}\")\n",
+    "    print(f\"Best Accuracy Score: {best_acc_value:.4f}\")\n",
+    "    print(\"\\nSelected Features:\\n\", selected_features)\n",
+    "    print(\"=\" * 50, \"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "",
+      "text/plain": [
+       "<Figure size 2000x1000 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "plt.figure(figsize=(20,10))\n",
+    "plt.scatter(corr_values, acc_values, c='blue', marker='o', label='Runs')\n",
+    "for corr, acc in zip(corr_values, acc_values):\n",
+    "    plt.annotate(f'({corr:.4f}, {acc:.4f})',\n",
+    "    (corr, acc),\n",
+    "    textcoords=\"offset points\",\n",
+    "    xytext=(5,5),\n",
+    "    ha='right',\n",
+    "    fontsize=10,\n",
+    "    bbox=dict(boxstyle=\"round,pad=0.3\", edgecolor='gray', facecolor='white'))\n",
+    "plt.xlabel('Correlation')\n",
+    "plt.ylabel('Accuracy')\n",
+    "plt.title('Trade-off between Correlation and Accuracy')\n",
+    "plt.grid(True)\n",
+    "plt.legend()\n",
+    "# Save only after labels, title, grid and legend exist, otherwise the PNG is written without them\n",
+    "plt.savefig(\"Tradeoff_plot.png\", dpi = 300, bbox_inches = \"tight\")\n",
+    "plt.show()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best Correlation: 0.2480 (Run 3)\n",
+      "Best Accuracy: 0.9209 (Run 7) \n"
+     ]
+    }
+   ],
+   "source": [
+    "best_C = max(corr_values)\n",
+    "best_CIndex = corr_values.index(best_C)\n",
+    "\n",
+    "best_a = max(acc_values)\n",
+    "best_aindex = acc_values.index(best_a)\n",
+    "\n",
+    "print(f\"Best Correlation: {best_C:.4f} (Run {best_CIndex + 1})\")\n",
+    "print(f\"Best Accuracy: {best_a:.4f} (Run {best_aindex + 1}) \")\n",
+    "# The header names all three tab-separated columns that are written for every run\n",
+    "with open(\"Correlation-Accuracy Results\", \"w\", encoding=\"utf-8\") as f:\n",
+    "    f.write(\"Run\\tCorrelation\\tAccuracy\\n\")\n",
+    "    for run_no, (corr, acc) in enumerate(zip(corr_values, acc_values), start=1):\n",
+    "        f.write(f\"Runs {run_no}\\t{corr:.4f}\\t{acc:.4f}\\n\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-- 
GitLab