From dc2bc02cd5e28b574cbdff7096b4fa5231d85c48 Mon Sep 17 00:00:00 2001 From: zoonalink <zoonalink@gmail.com> Date: Mon, 19 Dec 2022 12:40:53 +0000 Subject: [PATCH] FR7-9 done; FR10 start over; working file created --- UFCFVQ-15-M_Programming_Task_2_submit.ipynb | 593 ++++++++++++++++++ ...CFVQ-15-M_Programming_Task_2_working.ipynb | 38 +- 2 files changed, 630 insertions(+), 1 deletion(-) create mode 100644 UFCFVQ-15-M_Programming_Task_2_submit.ipynb rename UFCFVQ-15-M_Programming_Task_2.ipynb => UFCFVQ-15-M_Programming_Task_2_working.ipynb (99%) diff --git a/UFCFVQ-15-M_Programming_Task_2_submit.ipynb b/UFCFVQ-15-M_Programming_Task_2_submit.ipynb new file mode 100644 index 0000000..6094911 --- /dev/null +++ b/UFCFVQ-15-M_Programming_Task_2_submit.ipynb @@ -0,0 +1,593 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "# UFCFVQ-15-M Programming for Data Science (Autumn 2022)\n", + "# Programming Task 2\n", + "\n", + "## Student Id: 05976423" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "### Requirement FR7 - Read CSV data from two files and merge it into a single Data Frame " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "deletable": false + }, + "outputs": [], + "source": [ + "# import libraries for use Task 2 code\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "Int64Index: 26746 entries, 0 to 26745\n", + "Data columns (total 8 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id_student 26746 non-null int64 \n", + " 1 gender 26746 non-null object \n", + " 2 region 26746 non-null object \n", + " 3 
# Load the student demographic data. index_col=0 makes the first CSV column
# the DataFrame index instead of a data column.
student_bio = pd.read_csv(r'task2a.csv', sep=',', header=0, index_col=0)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
student_bio.info()

print('\n-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\n')

# Load the click-event data. No index column is specified for this file, so
# pandas assigns its default integer index.
student_clicks = pd.read_csv(r'task2b.csv', sep=',', header=0)

# Inspect the click-event frame (column dtypes and non-null counts).
student_clicks.info()
def _missing_data_summary(df, include_all=True):
    """Build the missing-value summary table that ``missing_data`` prints.

    Returns a DataFrame indexed by column name with the columns
    'Missing Data Count' and 'Percent of Total (%)', ordered by missing
    count, largest first.
    """
    # Number of missing values per column, largest first.
    missing_totals = df.isnull().sum().sort_values(ascending=False)

    row_count = len(df)
    if row_count:
        percent = (missing_totals / row_count * 100).round(2)
    else:
        # An empty frame would give 0/0 = NaN for every column; report 0.0 %.
        percent = missing_totals * 0.0

    summary = pd.concat(
        [missing_totals, percent],
        axis=1,
        keys=['Missing Data Count', 'Percent of Total (%)'],
    )

    if include_all:
        return summary

    # Filter on the raw count, not the rounded percentage: a column with only
    # a few missing values in a large frame rounds to 0.0 % but still has
    # missing data and must be reported.
    return summary[summary['Missing Data Count'] > 0]


def missing_data(df, include_all=True):
    """Print a per-column summary of missing data in a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to inspect.
    include_all : bool, default True
        If true, every column is listed. If false, only columns that contain
        at least one missing value are listed.

    Returns
    -------
    None
        The summary table (missing count and percent of total rows per
        column, largest first) is printed to stdout.
    """
    print(_missing_data_summary(df, include_all=include_all))
# Drop every row that has a missing value in any column.
students_cleaned = students_merged.dropna()

# Confirm the rows with missing values are gone: every column should now
# report the same non-null count. info() prints its own report and returns
# None, so it is not wrapped in print() (that would add a stray "None" line).
students_cleaned.info()

print('\n-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-\n')

# Double check that no missing data remains.
missing_data(students_cleaned, include_all=True)

# Drop columns - 'region', 'final_result', 'highest_education'.
students_reduced = students_cleaned.drop(['region', 'final_result', 'highest_education'], axis=1)

# Confirm the three columns have been removed and the row count is unchanged.
students_reduced.info()

# Select the rows with fewer than 10 click events once and reuse the result
# for both the structural report and the row count.
low_click_rows = students_reduced.loc[students_reduced['click_events'] < 10]
low_click_rows.info()
print('\n+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+\n')
print('There are', len(low_click_rows), 'rows with less than 10 click events.')
# Keep only the rows with 10 or more click events.
students = students_reduced.loc[students_reduced['click_events'] >= 10]

# Confirm the low-click rows were removed by checking the remaining row count.
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would add a stray "None" line to the output).
students.info()
You must include an explanation of your findings to achieve good marks for this requirement.\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">MARK: __%</p>\n", + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">FEEDBACK: </p>" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "### Requirement FR11 - Test the hypothesis that engagement has some effect on levels of attainment " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false + }, + "outputs": [], + "source": [ + "# replace with your code" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">MARK: __%</p>\n", + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">FEEDBACK: </p>" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "### Requirement FR12 - Investigate the effects of disability on levels of attainment " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false + }, + "outputs": [], + "source": [ + "# replace with your code" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">MARK: __%</p>\n", + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">FEEDBACK: </p>" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "### Requirement FR13 - Test if there is any difference between the attainment of disabled and non-disabled students" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false + }, + "outputs": [], + "source": [ + "# replace with your code" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">MARK: __%</p>\n", + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">FEEDBACK: </p>" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "# Coding Standards\n", + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">MARK: __%</p>\n", + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">FEEDBACK: </p>" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "# Process Development Report for Task 2\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "add markdown text here" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false + }, + "source": [ + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">MARK: __%</p>\n", + "<p style=\"color:red; font-weight:bold; font-size:xx-small\">FEEDBACK: </p>" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.11.0 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + }, + "vscode": { + "interpreter": { + "hash": "3a85823825384e2f260493b9b35c69d8eaac198ff59bb0d6c0e72fffbde301e2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/UFCFVQ-15-M_Programming_Task_2.ipynb b/UFCFVQ-15-M_Programming_Task_2_working.ipynb similarity index 99% rename from UFCFVQ-15-M_Programming_Task_2.ipynb rename to UFCFVQ-15-M_Programming_Task_2_working.ipynb index 5afcce3..da1b337 100644 --- a/UFCFVQ-15-M_Programming_Task_2.ipynb +++ b/UFCFVQ-15-M_Programming_Task_2_working.ipynb @@ -186,6 +186,25 @@ " " ] }, + { + "attachments": {}, + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + " ## <font color = 'red'><b>Reflection notes: </b></font>\n", + "\n", + "* Created a reusable function to quickly summarise missing data in a pandas dataframe\n", + "* Took my original lines of code and made them into a function, stringing methods together to make it more readable and reusable\n", + "* Liked making the function to summarise missing data - thinking about it from the user's perspective, with an optional parameter.\n", + "\n", + "`Strengths:` \n", + " * reusable functions, documented\n", + "\n", + "`Weaknesses:` \n", + " * too many comments, over-engineered? (risk of)\n", + " * too much printing??" + ] + }, { "cell_type": "code", "execution_count": 80, @@ -314,6 +333,23 @@ "<input type = \"checkbox\" checked> filter out rows where click_event < 10" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ## <font color = 'red'><b>Reflection notes: </b></font>\n", + "\n", + "* Very simple, just used the pandas query method to filter out rows where click_events < 10\n", + "\n", + "`Strengths:` \n", + " * straightforward\n", + "\n", + "`Weaknesses:` \n", + " * \n", + " * too much printing??" + ] + }, { "cell_type": "code", "execution_count": 90, @@ -1146,7 +1182,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.11.0 (main, Oct 24 2022, 18:26:48) [MSC v.1933 64 bit (AMD64)]" }, "vscode": { "interpreter": { -- GitLab