From e3bbb730847392231d2f7026fd8852211f0dc0c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMSS3-ALSULAIMA=E2=80=9D?= <Mohammed3.Alsulaimani@live.uwe.ac.uk> Date: Sun, 12 Nov 2023 06:51:41 +0400 Subject: [PATCH] statistical summary --- UFCFVQ-15-M Programming Task 1 Template.ipynb | 216 +++++++++++++++--- 1 file changed, 185 insertions(+), 31 deletions(-) diff --git a/UFCFVQ-15-M Programming Task 1 Template.ipynb b/UFCFVQ-15-M Programming Task 1 Template.ipynb index daa0543..08a53ae 100644 --- a/UFCFVQ-15-M Programming Task 1 Template.ipynb +++ b/UFCFVQ-15-M Programming Task 1 Template.ipynb @@ -19,9 +19,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The mean value is: 14.5\n" + ] + } + ], "source": [ "numbers_list = [\n", " 29, 17, 28, 6, 14, 7, 4, 27, 21, 15,\n", @@ -62,9 +70,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The standard deviation is: 8.65544144839919\n" + ] + } + ], "source": [ "def find_std(numbers, mean):\n", "\n", @@ -104,9 +120,18 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The minimum value is: 0\n", + "The maximum value is: 29\n" + ] + } + ], "source": [ "numbers_list = [\n", " 29, 17, 28, 6, 14, 7, 4, 27, 21, 15,\n", @@ -151,9 +176,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The 25th percentile is: 7\n" + ] + } + ], "source": [ "numbers_list = [\n", " 29, 17, 28, 6, 
14, 7, 4, 27, 21, 15,\n", @@ -195,9 +228,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The 50th percentile is: 14\n" + ] + } + ], "source": [ "\n", "numbers_list = [\n", @@ -236,9 +277,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The 75th percentile is: 21\n" + ] + } + ], "source": [ "numbers_list = [\n", " 29, 17, 28, 6, 14, 7, 4, 27, 21, 15,\n", @@ -277,9 +326,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Statistic Value\n", + "-------------------------\n", + "Mean 14.50\n", + "STD 8.66\n", + "Min 0\n", + "Max 29\n", + "25% 7\n", + "50% 14\n", + "75% 21\n" + ] + } + ], "source": [ "\n", "# A summary of the statistics, including the standard deviation, minimum and maximum values, as well, as percentiles.\n", @@ -314,13 +379,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['19', '18', '28', '33', '32', '31', '46', '37', '37', '60', '25', '62', '23', '56', '27', '19', '52', '23', '56', '30', '60', '30', '18', '34', '37', '59', '63', '55', '23', '31', '22', '18', '19', '63', '28', '19', '62', '26', '35', '60', '24', '31', '41', '37', '38', '55', '18', '28', '60', '36', '18', '21', '48', '36', '40', '58', '58', '18', '53', '34', '43', '25', '64', '28', '20', '19', '61', '40', '40', '28', '27', '31', '53', '58', '44', '57', '29', '21', '22', '41', '31', '45', '22', '48', '37', '45', '57', '56', '46', '55', '21', '53', '59', '35', '64', 
'28', '54', '55', '56', '38', '41', '30', '18', '61', '34', '20', '19', '26', '29', '63', '54', '55', '37', '21', '52', '60', '58', '29', '49', '37', '44', '18', '20', '44', '47', '26', '19', '52', '32', '38', '59', '61', '53', '19', '20', '22', '19', '22', '54', '22', '34', '26', '34', '29', '30', '29', '46', '51', '53', '19', '35', '48', '32', '42', '40', '44', '48', '18', '30', '50', '42', '18', '54', '32', '37', '47', '20', '32', '19', '27', '63', '49', '18', '35', '24', '63', '38', '54', '46', '41', '58', '18', '22', '44', '44', '36', '26', '30', '41', '29', '61', '36', '25', '56', '18', '19', '39', '45', '51', '64', '19', '48', '60', '27', '46', '28', '59', '35', '63', '40', '20', '40', '24', '34', '45', '41', '53', '27', '26', '24', '34', '53', '32', '19', '42', '55', '28', '58', '41', '47', '42', '59', '19', '59', '39', '40', '18', '31', '19', '44', '23', '33', '55', '40', '63', '54', '60', '24', '19', '29', '18', '63', '54', '27', '50', '55', '56', '38', '51', '19', '58', '20', '52', '19', '53', '46', '40', '59', '45', '49', '18', '50', '41', '50', '25', '47', '19', '22', '59', '51', '40', '54', '30', '55', '52', '46', '46', '63', '59', '52', '28', '29', '25', '22', '25', '18', '19', '47', '31', '48', '36', '53', '56', '28', '57', '29', '28', '30', '58', '41', '50', '19', '43', '49', '27', '52', '50', '54', '44', '32', '34', '26', '34', '57', '29', '40', '27', '45', '64', '52', '61', '52', '61', '56', '43', '64', '60', '62', '50', '46', '24', '62', '60', '63', '49', '34', '33', '46', '36', '19', '57', '50', '30', '33', '18', '46', '46', '47', '23', '18', '48', '35', '19', '21', '21', '49', '56', '42', '44', '18', '61', '57', '42', '26', '20', '23', '39', '24', '64', '62', '27', '55', '55', '35', '44', '19', '58', '50', '26', '24', '48', '19', '48', '49', '46', '46', '43', '21', '64', '18', '51', '47', '64', '49', '31', '52', '33', '47', '38', '32', '19', '44', '26', '25', '19', '43', '52', '36', '64', '63', '64', '61', '40', '25', '48', '45', '38', '18', 
'21', '27', '19', '29', '42', '60', '31', '60', '22', '35', '52', '26', '31', '33', '18', '59', '56', '45', '60', '56', '40', '35', '39', '30', '24', '20', '32', '59', '55', '57', '56', '40', '49', '42', '62', '56', '19', '30', '60', '56', '28', '18', '27', '18', '19', '47', '54', '61', '24', '25', '21', '23', '63', '49', '18', '51', '48', '31', '54', '19', '44', '53', '19', '61', '18', '61', '21', '20', '31', '45', '44', '62', '29', '43', '51', '19', '38', '37', '22', '21', '24', '57', '56', '27', '51', '19', '39', '58', '20', '45', '35', '31', '50', '32', '51', '38', '42', '18', '19', '51', '46', '18', '57', '62', '59', '37', '64', '38', '33', '46', '46', '53', '34', '20', '63', '54', '54', '49', '28', '54', '25', '43', '63', '32', '62', '52', '25', '28', '46', '34', '35', '19', '46', '54', '27', '50', '18', '19', '38', '41', '49', '48', '31', '18', '30', '62', '57', '58', '22', '31', '52', '25', '59', '19', '39', '32', '19', '33', '21', '34', '61', '38', '58', '47', '20', '21', '41', '46', '42', '34', '43', '52', '18', '51', '56', '64', '19', '51', '27', '59', '28', '30', '47', '38', '18', '34', '20', '47', '56', '49', '19', '55', '30', '37', '49', '18', '59', '29', '36', '33', '58', '44', '53', '24', '29', '40', '51', '64', '19', '35', '39', '56', '33', '42', '61', '23', '43', '48', '39', '40', '18', '58', '49', '53', '48', '45', '59', '52', '26', '27', '48', '57', '37', '57', '32', '18', '64', '43', '49', '40', '62', '40', '30', '29', '36', '41', '44', '45', '55', '60', '56', '49', '21', '19', '39', '53', '33', '53', '42', '40', '47', '27', '21', '47', '20', '24', '27', '26', '53', '41', '56', '23', '21', '50', '53', '34', '47', '33', '51', '49', '31', '36', '18', '50', '43', '20', '24', '60', '49', '60', '51', '58', '51', '53', '62', '19', '50', '30', '41', '29', '18', '41', '35', '53', '24', '48', '59', '49', '37', '26', '23', '29', '45', '27', '53', '31', '50', '50', '34', '19', '47', '28', '37', '21', '64', '58', '24', '31', '39', '47', '30', '18', '22', 
'23', '33', '27', '45', '57', '47', '42', '64', '38', '61', '53', '44', '19', '41', '51', '40', '45', '35', '53', '30', '18', '51', '50', '31', '35', '60', '21', '29', '62', '39', '19', '22', '53', '39', '27', '30', '30', '58', '33', '42', '64', '21', '18', '23', '45', '40', '19', '18', '25', '46', '33', '54', '28', '36', '20', '24', '23', '47', '33', '45', '26', '18', '44', '60', '64', '56', '36', '41', '39', '63', '36', '28', '58', '36', '42', '36', '56', '35', '59', '21', '59', '23', '57', '53', '60', '51', '23', '27', '55', '37', '61', '46', '53', '49', '20', '48', '25', '25', '57', '37', '38', '55', '36', '51', '40', '18', '57', '61', '25', '50', '26', '42', '43', '44', '23', '49', '33', '41', '37', '22', '23', '21', '51', '25', '32', '57', '36', '22', '57', '64', '36', '54', '47', '62', '61', '43', '19', '18', '19', '49', '60', '26', '49', '60', '26', '27', '44', '63', '32', '22', '18', '59', '44', '33', '24', '43', '45', '61', '35', '62', '62', '38', '34', '43', '50', '19', '57', '62', '41', '26', '39', '46', '45', '32', '59', '44', '39', '18', '53', '18', '50', '18', '19', '62', '56', '42', '37', '42', '25', '57', '51', '30', '44', '34', '31', '54', '24', '43', '48', '19', '29', '63', '46', '52', '35', '51', '44', '21', '39', '50', '34', '22', '19', '26', '29', '48', '26', '45', '36', '54', '34', '31', '27', '20', '44', '43', '45', '34', '24', '26', '38', '50', '38', '27', '39', '39', '63', '33', '36', '']\n" + ] + } + ], "source": [ "\n", "# The file path\n", - "file_pathname = '/Users/mscdatascience/Documents/assignment-PDS/mohammad_alsuulaimani_uwe_23086369_2023/task.dat'\n", + "file_pathname = '/Users/mscdatascience/Documents/assignment-PDS/mohammad_alsuulaimani_uwe_23086369_2023/task1.dat'\n", "\n", "# Retrieve the data, from the file and add each line as an element, in the list.\n", "with open(file_pathname, 'r') as file:\n", @@ -347,10 +420,33 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - 
"outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'age': ['19', '18', '28'],\n", + " 'sex': ['female', 'male', 'male'],\n", + " 'bmi': ['27.9', '33.77', '33'],\n", + " 'children': ['0', '1', '3'],\n", + " 'smoker': ['yes', 'no', 'no'],\n", + " 'region': ['southwest', 'southeast', 'southeast'],\n", + " 'charges': ['16884.924', '1725.5523', '4449.462'],\n", + " 'income': ['1037450', '1857149', '1420147']}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "\n", "# Reading the CSV file and filling up the data structure.\n", @@ -390,11 +486,69 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add code here" + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Column Mean STD Min Max 25% 50% 75%\n", + "--------------------------------------------------------------------------------------------\n", + "age 39.62 14.15 18.00 64.00 27.00 40.00 52.00\n", + "bmi 30.86 6.04 15.96 50.38 26.60 30.59 35.10\n", + "children 1.08 1.20 0.00 5.00 0.00 1.00 2.00\n", + "charges 13075.76 11979.93 1121.87 63770.43 4719.52 9282.48 15820.70\n", + "income 1386781.87 355771.09 800284.00 1996746.00 1056156.00 1390208.00 1704354.00\n" + ] + } + ], + "source": [ + "# Definea function that calculates summaries from a CSV file\n", + "def statistical_summary(file_path):\n", + " # Read the line to obtain the headers and initialize a dictionary, with column names as keys and empty lists as values.\n", + " csv_data = {column: [] for column in open(file_path, 'r').readline().strip().split(',')}\n", + " \n", + " with open(file_path, 'r') as file:\n", + " next(file) \n", + " \n", + " # Go through each line in the file combine the column headers and values from the line and iterate over them.\n", + " for line in 
file:\n", + " \n", + " for column, value in zip(csv_data.keys(), line.strip().split(',')):\n", + " try:\n", + " # Attempt to convert each value to a float and add it to the column in `csv_data`.\n", + " csv_data[column].append(float(value))\n", + " except ValueError:\n", + " pass \n", + " \n", + " # Calculate summaries for each column containing numerical data.\n", + " summary = {column: {\n", + " 'Mean': arithmetic_mean(values), \n", + " 'STD': find_std(values, arithmetic_mean(values)), \n", + " 'Min': min(values), \n", + " 'Max': max(values), \n", + " '25%': calculate_percentile(values, 25), \n", + " '50%': calculate_percentile(values, 50), \n", + " '75%': calculate_percentile(values, 75) \n", + " } for column, values in csv_data.items() if values} # Only for columns that have values\n", + " \n", + " return summary\n", + "\n", + "# Path to the CSV file\n", + "file_path = '/Users/mscdatascience/Desktop/MScproject/pdsproject-1/pdsproject/task1.csv'\n", + "\n", + "# Call the function to obtain the statistical summary for the CSV data.\n", + "summary = statistical_summary(file_path)\n", + "\n", + "# Make the statistical summary look better by displaying the header, followed by a separator, and then presenting the data for each column.\n", + "\n", + "print(f\"{'Column':<15} {'Mean':>10} {'STD':>10} {'Min':>10} {'Max':>10} {'25%':>10} {'50%':>10} {'75%':>10}\")\n", + "print('-' * 92)\n", + "\n", + "for column, stats in summary.items():\n", + " print(f\"{column:<15} {stats['Mean']:>10.2f} {stats['STD']:>10.2f} {stats['Min']:>10.2f} {stats['Max']:>10.2f} {stats['25%']:>10.2f} {stats['50%']:>10.2f} {stats['75%']:>10.2f}\")\n", + "\n" ] }, { -- GitLab