From e3bbb730847392231d2f7026fd8852211f0dc0c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CMSS3-ALSULAIMA=E2=80=9D?=
 <Mohammed3.Alsulaimani@live.uwe.ac.uk>
Date: Sun, 12 Nov 2023 06:51:41 +0400
Subject: [PATCH] statistical summary

---
 UFCFVQ-15-M Programming Task 1 Template.ipynb | 216 +++++++++++++++---
 1 file changed, 185 insertions(+), 31 deletions(-)

diff --git a/UFCFVQ-15-M Programming Task 1 Template.ipynb b/UFCFVQ-15-M Programming Task 1 Template.ipynb
index daa0543..08a53ae 100644
--- a/UFCFVQ-15-M Programming Task 1 Template.ipynb	
+++ b/UFCFVQ-15-M Programming Task 1 Template.ipynb	
@@ -19,9 +19,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The mean value is: 14.5\n"
+     ]
+    }
+   ],
    "source": [
     "numbers_list = [\n",
     "    29, 17, 28, 6, 14, 7, 4, 27, 21, 15,\n",
@@ -62,9 +70,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The standard deviation is: 8.65544144839919\n"
+     ]
+    }
+   ],
    "source": [
     "def find_std(numbers, mean):\n",
     "\n",
@@ -104,9 +120,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The minimum value is: 0\n",
+      "The maximum value is: 29\n"
+     ]
+    }
+   ],
    "source": [
     "numbers_list = [\n",
     "    29, 17, 28, 6, 14, 7, 4, 27, 21, 15,\n",
@@ -151,9 +176,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The 25th percentile is: 7\n"
+     ]
+    }
+   ],
    "source": [
     "numbers_list = [\n",
     "    29, 17, 28, 6, 14, 7, 4, 27, 21, 15,\n",
@@ -195,9 +228,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The 50th percentile is: 14\n"
+     ]
+    }
+   ],
    "source": [
     "\n",
     "numbers_list = [\n",
@@ -236,9 +277,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The 75th percentile is: 21\n"
+     ]
+    }
+   ],
    "source": [
     "numbers_list = [\n",
     "    29, 17, 28, 6, 14, 7, 4, 27, 21, 15,\n",
@@ -277,9 +326,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Statistic           Value\n",
+      "-------------------------\n",
+      "Mean                14.50\n",
+      "STD                  8.66\n",
+      "Min                     0\n",
+      "Max                    29\n",
+      "25%                     7\n",
+      "50%                    14\n",
+      "75%                    21\n"
+     ]
+    }
+   ],
    "source": [
     "\n",
     "#  A summary of the statistics, including the standard deviation, minimum and maximum values, as well, as percentiles.\n",
@@ -314,13 +379,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['19', '18', '28', '33', '32', '31', '46', '37', '37', '60', '25', '62', '23', '56', '27', '19', '52', '23', '56', '30', '60', '30', '18', '34', '37', '59', '63', '55', '23', '31', '22', '18', '19', '63', '28', '19', '62', '26', '35', '60', '24', '31', '41', '37', '38', '55', '18', '28', '60', '36', '18', '21', '48', '36', '40', '58', '58', '18', '53', '34', '43', '25', '64', '28', '20', '19', '61', '40', '40', '28', '27', '31', '53', '58', '44', '57', '29', '21', '22', '41', '31', '45', '22', '48', '37', '45', '57', '56', '46', '55', '21', '53', '59', '35', '64', '28', '54', '55', '56', '38', '41', '30', '18', '61', '34', '20', '19', '26', '29', '63', '54', '55', '37', '21', '52', '60', '58', '29', '49', '37', '44', '18', '20', '44', '47', '26', '19', '52', '32', '38', '59', '61', '53', '19', '20', '22', '19', '22', '54', '22', '34', '26', '34', '29', '30', '29', '46', '51', '53', '19', '35', '48', '32', '42', '40', '44', '48', '18', '30', '50', '42', '18', '54', '32', '37', '47', '20', '32', '19', '27', '63', '49', '18', '35', '24', '63', '38', '54', '46', '41', '58', '18', '22', '44', '44', '36', '26', '30', '41', '29', '61', '36', '25', '56', '18', '19', '39', '45', '51', '64', '19', '48', '60', '27', '46', '28', '59', '35', '63', '40', '20', '40', '24', '34', '45', '41', '53', '27', '26', '24', '34', '53', '32', '19', '42', '55', '28', '58', '41', '47', '42', '59', '19', '59', '39', '40', '18', '31', '19', '44', '23', '33', '55', '40', '63', '54', '60', '24', '19', '29', '18', '63', '54', '27', '50', '55', '56', '38', '51', '19', '58', '20', '52', '19', '53', '46', '40', '59', '45', '49', '18', '50', '41', '50', '25', '47', '19', '22', '59', '51', '40', '54', '30', '55', '52', '46', '46', '63', '59', '52', '28', '29', '25', '22', '25', '18', '19', '47', '31', '48', '36', '53', '56', '28', '57', '29', '28', '30', '58', '41', '50', '19', '43', '49', '27', '52', '50', '54', '44', '32', '34', '26', '34', '57', '29', '40', '27', '45', '64', '52', '61', 
'52', '61', '56', '43', '64', '60', '62', '50', '46', '24', '62', '60', '63', '49', '34', '33', '46', '36', '19', '57', '50', '30', '33', '18', '46', '46', '47', '23', '18', '48', '35', '19', '21', '21', '49', '56', '42', '44', '18', '61', '57', '42', '26', '20', '23', '39', '24', '64', '62', '27', '55', '55', '35', '44', '19', '58', '50', '26', '24', '48', '19', '48', '49', '46', '46', '43', '21', '64', '18', '51', '47', '64', '49', '31', '52', '33', '47', '38', '32', '19', '44', '26', '25', '19', '43', '52', '36', '64', '63', '64', '61', '40', '25', '48', '45', '38', '18', '21', '27', '19', '29', '42', '60', '31', '60', '22', '35', '52', '26', '31', '33', '18', '59', '56', '45', '60', '56', '40', '35', '39', '30', '24', '20', '32', '59', '55', '57', '56', '40', '49', '42', '62', '56', '19', '30', '60', '56', '28', '18', '27', '18', '19', '47', '54', '61', '24', '25', '21', '23', '63', '49', '18', '51', '48', '31', '54', '19', '44', '53', '19', '61', '18', '61', '21', '20', '31', '45', '44', '62', '29', '43', '51', '19', '38', '37', '22', '21', '24', '57', '56', '27', '51', '19', '39', '58', '20', '45', '35', '31', '50', '32', '51', '38', '42', '18', '19', '51', '46', '18', '57', '62', '59', '37', '64', '38', '33', '46', '46', '53', '34', '20', '63', '54', '54', '49', '28', '54', '25', '43', '63', '32', '62', '52', '25', '28', '46', '34', '35', '19', '46', '54', '27', '50', '18', '19', '38', '41', '49', '48', '31', '18', '30', '62', '57', '58', '22', '31', '52', '25', '59', '19', '39', '32', '19', '33', '21', '34', '61', '38', '58', '47', '20', '21', '41', '46', '42', '34', '43', '52', '18', '51', '56', '64', '19', '51', '27', '59', '28', '30', '47', '38', '18', '34', '20', '47', '56', '49', '19', '55', '30', '37', '49', '18', '59', '29', '36', '33', '58', '44', '53', '24', '29', '40', '51', '64', '19', '35', '39', '56', '33', '42', '61', '23', '43', '48', '39', '40', '18', '58', '49', '53', '48', '45', '59', '52', '26', '27', '48', '57', '37', '57', '32', '18', 
'64', '43', '49', '40', '62', '40', '30', '29', '36', '41', '44', '45', '55', '60', '56', '49', '21', '19', '39', '53', '33', '53', '42', '40', '47', '27', '21', '47', '20', '24', '27', '26', '53', '41', '56', '23', '21', '50', '53', '34', '47', '33', '51', '49', '31', '36', '18', '50', '43', '20', '24', '60', '49', '60', '51', '58', '51', '53', '62', '19', '50', '30', '41', '29', '18', '41', '35', '53', '24', '48', '59', '49', '37', '26', '23', '29', '45', '27', '53', '31', '50', '50', '34', '19', '47', '28', '37', '21', '64', '58', '24', '31', '39', '47', '30', '18', '22', '23', '33', '27', '45', '57', '47', '42', '64', '38', '61', '53', '44', '19', '41', '51', '40', '45', '35', '53', '30', '18', '51', '50', '31', '35', '60', '21', '29', '62', '39', '19', '22', '53', '39', '27', '30', '30', '58', '33', '42', '64', '21', '18', '23', '45', '40', '19', '18', '25', '46', '33', '54', '28', '36', '20', '24', '23', '47', '33', '45', '26', '18', '44', '60', '64', '56', '36', '41', '39', '63', '36', '28', '58', '36', '42', '36', '56', '35', '59', '21', '59', '23', '57', '53', '60', '51', '23', '27', '55', '37', '61', '46', '53', '49', '20', '48', '25', '25', '57', '37', '38', '55', '36', '51', '40', '18', '57', '61', '25', '50', '26', '42', '43', '44', '23', '49', '33', '41', '37', '22', '23', '21', '51', '25', '32', '57', '36', '22', '57', '64', '36', '54', '47', '62', '61', '43', '19', '18', '19', '49', '60', '26', '49', '60', '26', '27', '44', '63', '32', '22', '18', '59', '44', '33', '24', '43', '45', '61', '35', '62', '62', '38', '34', '43', '50', '19', '57', '62', '41', '26', '39', '46', '45', '32', '59', '44', '39', '18', '53', '18', '50', '18', '19', '62', '56', '42', '37', '42', '25', '57', '51', '30', '44', '34', '31', '54', '24', '43', '48', '19', '29', '63', '46', '52', '35', '51', '44', '21', '39', '50', '34', '22', '19', '26', '29', '48', '26', '45', '36', '54', '34', '31', '27', '20', '44', '43', '45', '34', '24', '26', '38', '50', '38', '27', '39', '39', 
'63', '33', '36', '']\n"
+     ]
+    }
+   ],
    "source": [
     "\n",
     "# The file path\n",
-    "file_pathname = '/Users/mscdatascience/Documents/assignment-PDS/mohammad_alsuulaimani_uwe_23086369_2023/task.dat'\n",
+    "file_pathname = '/Users/mscdatascience/Documents/assignment-PDS/mohammad_alsuulaimani_uwe_23086369_2023/task1.dat'\n",
     "\n",
     "# Retrieve the data, from the file and add each line as an element, in the list.\n",
     "with open(file_pathname, 'r') as file:\n",
@@ -347,10 +420,33 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'age': ['19', '18', '28'],\n",
+       " 'sex': ['female', 'male', 'male'],\n",
+       " 'bmi': ['27.9', '33.77', '33'],\n",
+       " 'children': ['0', '1', '3'],\n",
+       " 'smoker': ['yes', 'no', 'no'],\n",
+       " 'region': ['southwest', 'southeast', 'southeast'],\n",
+       " 'charges': ['16884.924', '1725.5523', '4449.462'],\n",
+       " 'income': ['1037450', '1857149', '1420147']}"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "\n",
     "# Reading the CSV file and filling up the data structure.\n",
@@ -390,11 +486,69 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# add code here"
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Column                Mean        STD        Min        Max        25%        50%        75%\n",
+      "--------------------------------------------------------------------------------------------\n",
+      "age                  39.62      14.15      18.00      64.00      27.00      40.00      52.00\n",
+      "bmi                  30.86       6.04      15.96      50.38      26.60      30.59      35.10\n",
+      "children              1.08       1.20       0.00       5.00       0.00       1.00       2.00\n",
+      "charges           13075.76   11979.93    1121.87   63770.43    4719.52    9282.48   15820.70\n",
+      "income          1386781.87  355771.09  800284.00 1996746.00 1056156.00 1390208.00 1704354.00\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Define a function that calculates statistical summaries from a CSV file\n",
+    "def statistical_summary(file_path):\n",
+    "    # Read the first line to obtain the headers and initialize a dictionary with column names as keys and empty lists as values.\n",
+    "    csv_data = {column: [] for column in open(file_path, 'r').readline().strip().split(',')}\n",
+    "    \n",
+    "    with open(file_path, 'r') as file:\n",
+    "        next(file)  \n",
+    "        \n",
+    "        # Go through each remaining line in the file, pairing the column headers with the values from that line.\n",
+    "        for line in file:\n",
+    "            \n",
+    "            for column, value in zip(csv_data.keys(), line.strip().split(',')):\n",
+    "                try:\n",
+    "                    #  Attempt to convert each value to a float and add it to the column in `csv_data`.\n",
+    "                    csv_data[column].append(float(value))\n",
+    "                except ValueError:\n",
+    "                    pass  \n",
+    "    \n",
+    "    #  Calculate summaries for each column containing numerical data.\n",
+    "    summary = {column: {\n",
+    "                'Mean': arithmetic_mean(values), \n",
+    "                'STD': find_std(values, arithmetic_mean(values)), \n",
+    "                'Min': min(values),  \n",
+    "                'Max': max(values),  \n",
+    "                '25%': calculate_percentile(values, 25),  \n",
+    "                '50%': calculate_percentile(values, 50),  \n",
+    "                '75%': calculate_percentile(values, 75) \n",
+    "            } for column, values in csv_data.items() if values}  # Only for columns that have values\n",
+    "    \n",
+    "    return summary\n",
+    "\n",
+    "# Path to the CSV file\n",
+    "file_path = '/Users/mscdatascience/Desktop/MScproject/pdsproject-1/pdsproject/task1.csv'\n",
+    "\n",
+    "# Call the function to obtain the statistical summary for the CSV data.\n",
+    "summary = statistical_summary(file_path)\n",
+    "\n",
+    "# Print the statistical summary as a table: a header row, a separator line, then one row of statistics per column.\n",
+    "\n",
+    "print(f\"{'Column':<15} {'Mean':>10} {'STD':>10} {'Min':>10} {'Max':>10} {'25%':>10} {'50%':>10} {'75%':>10}\")\n",
+    "print('-' * 92)\n",
+    "\n",
+    "for column, stats in summary.items():\n",
+    "    print(f\"{column:<15} {stats['Mean']:>10.2f} {stats['STD']:>10.2f} {stats['Min']:>10.2f} {stats['Max']:>10.2f} {stats['25%']:>10.2f} {stats['50%']:>10.2f} {stats['75%']:>10.2f}\")\n",
+    "\n"
    ]
   },
   {
-- 
GitLab