diff --git a/.ipynb_checkpoints/UFCFVQ-15-M Programming Task 2 Template-checkpoint.ipynb b/.ipynb_checkpoints/UFCFVQ-15-M Programming Task 2 Template-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..49c9d62ad8f3dfbfa8addbcb34b27457eda40d2a --- /dev/null +++ b/.ipynb_checkpoints/UFCFVQ-15-M Programming Task 2 Template-checkpoint.ipynb @@ -0,0 +1,607 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# UFCFVQ-15-M Programming for Data Science\n", + "# Programming Task 2\n", + "\n", + "## Student Id: 23086369" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.1 - Read CSV data from a file (with a header row) into memory " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Unnamed: 0 id_student gender region highest_education \\\n", + "0 0 11391 M East Anglian Region HE Qualification \n", + "1 1 28400 F Scotland HE Qualification \n", + "2 2 31604 F South East Region A Level or Equivalent \n", + "3 3 32885 F West Midlands Region Lower Than A Level \n", + "4 4 38053 M Wales A Level or Equivalent \n", + "\n", + " age_band disability final_result score \n", + "0 55<= N Pass 82.0 \n", + "1 35-55 N Pass 67.0 \n", + "2 35-55 N Pass 76.0 \n", + "3 0-35 N Pass 55.0 \n", + "4 35-55 N Pass 68.0 \n" + ] + } + ], + "source": [ + "# Importing the pandas library\n", + "\n", + "import pandas as pd\n", + "\n", + "# Read data, from a CSV file. 
Store it in a DataFrame.\n", + "\n", + "df = pd.read_csv(\"/Users/mscdatascience/Documents/assignment-PDS/mohammad_alsuulaimani_uwe_23086369_2023/task2a.csv\")\n", + "\n", + "# Display the first five rows of the DataFrame to quickly examine its structure and content.\n", + "\n", + "print(df.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.2 - Read CSV data from a file (without a header row) into memory" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " id_student click_events\n", + "0 6516 2791.0\n", + "1 8462 656.0\n", + "2 11391 934.0\n", + "3 23629 NaN\n", + "4 23698 910.0\n" + ] + } + ], + "source": [ + "# Importing the pandas library\n", + "\n", + "import pandas as pd\n", + "\n", + "# Read data from a CSV file that has no header row.\n", + "# The columns are labeled 'id_student' and 'click_events'.\n", + "\n", + "df = pd.read_csv(\"/Users/mscdatascience/Documents/assignment-PDS/mohammad_alsuulaimani_uwe_23086369_2023/task2b.csv\", names=['id_student', 'click_events'])\n", + "\n", + "# Display the first five rows of the DataFrame to quickly examine its structure and content.\n", + "\n", + "print(df.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.3 - Merge the data from two Dataframes" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Unnamed: 0 id_student gender region highest_education \\\n", + "0 0 11391 M East Anglian Region HE Qualification \n", + "1 1 28400 F Scotland HE Qualification \n", + "2 2 31604 F South East Region A Level or
Equivalent \n", + "3 3 32885 F West Midlands Region Lower Than A Level \n", + "4 4 38053 M Wales A Level or Equivalent \n", + "\n", + " age_band disability final_result score click_events \n", + "0 55<= N Pass 82.0 934.0 \n", + "1 35-55 N Pass 67.0 1435.0 \n", + "2 35-55 N Pass 76.0 2158.0 \n", + "3 0-35 N Pass 55.0 1034.0 \n", + "4 35-55 N Pass 68.0 2445.0 \n" + ] + } + ], + "source": [ + "# Importing the pandas library\n", + "\n", + "import pandas as pd\n", + "\n", + "# Read the two CSV files into Dataframe1 and Dataframe2.\n", + "\n", + "\n", + "Dataframe1 = pd.read_csv('task2a.csv')\n", + "Dataframe2 = pd.read_csv('task2b.csv', names=['id_student', 'click_events'])\n", + "\n", + "# Merging Dataframe1 and Dataframe2 into a new DataFrame.\n", + "# An 'inner' merge keeps only the rows whose 'id_student' value appears in both DataFrames.\n", + "\n", + "merged_data_frame = pd.merge(Dataframe1, Dataframe2, on='id_student', how='inner')\n", + "\n", + "# Display the first five rows of the merged DataFrame.\n", + "\n", + "print(merged_data_frame.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.4 - Remove any rows that contain missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Unnamed: 0 id_student gender region highest_education \\\n", + "0 0 11391 M East Anglian Region HE Qualification \n", + "1 1 28400 F Scotland HE Qualification \n", + "2 2 31604 F South East Region A Level or Equivalent \n", + "3 3 32885 F West Midlands Region Lower Than A Level \n", + "4 4 38053 M Wales A Level or Equivalent \n", + "\n", + " age_band disability final_result score click_events \n", + "0 55<= N Pass 82.0 934.0 \n", + "1 35-55 N Pass 67.0 1435.0 \n", + "2 35-55 N Pass
76.0 2158.0 \n", + "3 0-35 N Pass 55.0 1034.0 \n", + "4 35-55 N Pass 68.0 2445.0 \n" + ] + } + ], + "source": [ + "\n", + "# Removing rows containing missing values\n", + "cleaned_data_frame = merged_data_frame.dropna()\n", + "\n", + "# Displaying the cleaned new DataFrame\n", + "print(cleaned_data_frame.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.5 - Filter out unnecessary rows" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Unnamed: 0 id_student gender region highest_education \\\n", + "0 0 11391 M East Anglian Region HE Qualification \n", + "1 1 28400 F Scotland HE Qualification \n", + "2 2 31604 F South East Region A Level or Equivalent \n", + "3 3 32885 F West Midlands Region Lower Than A Level \n", + "4 4 38053 M Wales A Level or Equivalent \n", + "\n", + " age_band disability final_result score click_events \n", + "0 55<= N Pass 82.0 934.0 \n", + "1 35-55 N Pass 67.0 1435.0 \n", + "2 35-55 N Pass 76.0 2158.0 \n", + "3 0-35 N Pass 55.0 1034.0 \n", + "4 35-55 N Pass 68.0 2445.0 \n" + ] + } + ], + "source": [ + "# Filtering unnecessary rows where 'click_events' is smaller than 10\n", + "filtered_data_frame = cleaned_data_frame[cleaned_data_frame['click_events'] >= 10]\n", + "\n", + "# Displaying the filtered DataFrame\n", + "print(filtered_data_frame.head())\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.6 - Rename the score column" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Unnamed: 0 id_student 
gender region highest_education \\\n", + "0 0 11391 M East Anglian Region HE Qualification \n", + "1 1 28400 F Scotland HE Qualification \n", + "2 2 31604 F South East Region A Level or Equivalent \n", + "3 3 32885 F West Midlands Region Lower Than A Level \n", + "4 4 38053 M Wales A Level or Equivalent \n", + "\n", + " age_band disability final_result final_mark click_events \n", + "0 55<= N Pass 82.0 934.0 \n", + "1 35-55 N Pass 67.0 1435.0 \n", + "2 35-55 N Pass 76.0 2158.0 \n", + "3 0-35 N Pass 55.0 1034.0 \n", + "4 35-55 N Pass 68.0 2445.0 \n" + ] + } + ], + "source": [ + "# Renaming the 'score' column to 'final_mark'\n", + "renamed_data_frame = filtered_data_frame.rename(columns={'score': 'final_mark'})\n", + "\n", + "# Displaying the DataFrame with the renamed column\n", + "print(renamed_data_frame.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.7 - Remove unnecessary column(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Unnamed: 0 id_student gender age_band disability final_mark click_events\n", + "0 0 11391 M 55<= N 82.0 934.0\n", + "1 1 28400 F 35-55 N 67.0 1435.0\n", + "2 2 31604 F 35-55 N 76.0 2158.0\n", + "3 3 32885 F 0-35 N 55.0 1034.0\n", + "4 4 38053 M 35-55 N 68.0 2445.0\n" + ] + } + ], + "source": [ + "# Removing unnecessary columns from 'renamed_data_frame' by using the 'drop' method.\n", + "# The result will be stored in 'final_data_frame', which no longer includes the 'region', 'final_result' and 'highest_education' columns.\n", + "\n", + "final_data_frame = renamed_data_frame.drop(columns=['region', 'final_result', 'highest_education'])\n", + "\n", + "# Displaying the Final DataFrame\n", + "\n", + "print(final_data_frame.head())" + ] + }, + { + "cell_type": "markdown", +
"metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.8 - Write the DataFrame data to a CSV file" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Saving the 'final_data_frame' to a CSV file called 'updated.csv'.\n", + "# By using the 'index=False' parameter I can ensure that the CSV file does not include row indices.\n", + "\n", + "final_data_frame.to_csv('updated.csv', index=False)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.9 - Investigate the effects of age-group on attainment and engagement" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " click_events final_mark\n", + "age_band \n", + "0-35 1616.472655 72.503923\n", + "35-55 2193.000267 75.035810\n", + "55<= 3574.864865 77.718919\n" + ] + } + ], + "source": [ + "# Calculating the engagement and final mark for each age group.\n", + "average_by_age = final_data_frame.groupby('age_band').agg({'click_events': 'mean', 'final_mark': 'mean'})\n", + "\n", + "# The DataFrame will be displayed, showing the engagement and final mark, for each age group.\n", + "print(average_by_age)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.10 - Present the results of the age-group investigation using an appropriate visualisation" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 2 Axes>" + ] + }, + 
"metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Creating the plot\n", + "fig, ax1 = plt.subplots()\n", + "\n", + "# Plotting the average final mark for each age band\n", + "ax1.bar(average_by_age.index, average_by_age['final_mark'], color='b', alpha=0.6, label='Final Mark')\n", + "\n", + "# Making the y-axis label and tick labels match the line color.\n", + "ax1.set_ylabel('Average Final Mark', color='b')\n", + "for tl in ax1.get_yticklabels():\n", + " tl.set_color('b')\n", + "\n", + "# Setting x-axis label and title\n", + "ax1.set_xlabel('Age Band')\n", + "plt.title('Average Engagement and Final Mark by Age Band')\n", + "\n", + "# Creating another y-axis for the average click_events\n", + "ax2 = ax1.twinx()\n", + "ax2.plot(average_by_age.index, average_by_age['click_events'], color='r', label='Click Events')\n", + "ax2.set_ylabel('Average Click Events', color='r')\n", + "for tl in ax2.get_yticklabels():\n", + " tl.set_color('r')\n", + "\n", + "# Showing the plot\n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Requirement FR2.11 - Investigate the effects of engagement on attainment" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'seaborn'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/Users/mscdatascience/Documents/assignment-PDS/mohammad_alsuulaimani_uwe_23086369_2023/UFCFVQ-15-M Programming Task 2 Template.ipynb Cell 33\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a 
href='vscode-notebook-cell:/Users/mscdatascience/Documents/assignment-PDS/mohammad_alsuulaimani_uwe_23086369_2023/UFCFVQ-15-M%20Programming%20Task%202%20Template.ipynb#X44sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mseaborn\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39msns\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/mscdatascience/Documents/assignment-PDS/mohammad_alsuulaimani_uwe_23086369_2023/UFCFVQ-15-M%20Programming%20Task%202%20Template.ipynb#X44sZmlsZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mmatplotlib\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpyplot\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mplt\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/mscdatascience/Documents/assignment-PDS/mohammad_alsuulaimani_uwe_23086369_2023/UFCFVQ-15-M%20Programming%20Task%202%20Template.ipynb#X44sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m \u001b[39m# Creating a scatter plot to investigate the correlation between engagement and final mark\u001b[39;00m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'seaborn'" + ] + } + ], + "source": [ + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "# Creating a scatter plot to investigate the correlation between engagement and final mark\n", + "sns.scatterplot(x='click_events', y='final_mark', data=final_data_frame)\n", + "\n", + "# Adding title and labels\n", + "plt.xlabel('Click Events')\n", + "plt.ylabel('Final Mark ')\n", + "\n", + "# Showing the plot\n", + "plt.show()\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Adherence to good coding style" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "# Process Development Report for Task 2\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write here" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### MARK: \n", + "#### FEEDBACK: " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + }, + "vscode": { + "interpreter": { + "hash": "9ef62a9e119055cb3f7e8378d4eaf3b008dbaf8b8298b9c44f87df8240f3e8bc" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/UFCFVQ-15-M Programming Task 2 Template.ipynb b/UFCFVQ-15-M Programming Task 2 Template.ipynb index fa24b42bb65b3be868a4dad17ec925eaeb667997..588ced36d9465fc6b6275e93d54e415ff9be6208 100644 --- a/UFCFVQ-15-M Programming Task 2 Template.ipynb +++ b/UFCFVQ-15-M Programming Task 2 Template.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -121,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -182,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -231,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -280,7 +280,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -328,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 21, "metadata": {}, 
"outputs": [ { @@ -372,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -399,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -439,11 +439,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "<Figure size 640x480 with 2 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# add code here" + "import matplotlib.pyplot as plt\n", + "\n", + "# Creating the plot\n", + "fig, ax1 = plt.subplots()\n", + "\n", + "# Plotting the grade for each age group\n", + "\n", + "ax1.bar(average_by_age.index, average_by_age['final_mark'], color='b', alpha=0.6, label='Final Mark')\n", + "\n", + "# Ensuring the y axis label and tick labels are consistent, with the line color.\n", + "\n", + "ax1.set_ylabel('Average Final Mark', color='b')\n", + "for tl in ax1.get_yticklabels():\n", + " tl.set_color('b')\n", + "\n", + "# Setting x-axis label and title\n", + "ax1.set_xlabel('Age Band')\n", + "plt.title('Average Engagement and Final Mark by Age Band')\n", + "\n", + "# Creating another y axis to represent the average number of click events\n", + "\n", + "ax2 = ax1.twinx()\n", + "ax2.plot(average_by_age.index, average_by_age['click_events'], color='r', label='Click Events')\n", + "ax2.set_ylabel('Average Click Events', color='r')\n", + "for tl in ax2.get_yticklabels():\n", + " tl.set_color('r')\n", + "\n", + "# Displaying the plot\n", + "plt.show()\n", + "\n", + "\n" ] }, { @@ -466,9 +507,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# add code here" - ] + "source": [] }, { "cell_type": "markdown",