From e812c236002d921b4dc83aa709f40def705242a7 Mon Sep 17 00:00:00 2001 From: am2-liyanaarac <akash2.liyanaarachchi@live.uwe.ac.uk> Date: Thu, 20 Apr 2023 12:36:40 +0100 Subject: [PATCH] Task C Reflection report added --- UFCFVQ-15-M_Python_Programming_Template.ipynb | 103 ++++++++++-------- ..._Programming_With_Libraries_Template.ipynb | 67 +++++++++--- 2 files changed, 109 insertions(+), 61 deletions(-) diff --git a/UFCFVQ-15-M_Python_Programming_Template.ipynb b/UFCFVQ-15-M_Python_Programming_Template.ipynb index f955381..e6cd0c7 100644 --- a/UFCFVQ-15-M_Python_Programming_Template.ipynb +++ b/UFCFVQ-15-M_Python_Programming_Template.ipynb @@ -37,7 +37,7 @@ "source": [ "# Python Programming (Task B)\n", "\n", - "## Student Id: " + "## Student Id: 22074847" ] }, { @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 10, "metadata": { "deletable": false }, @@ -60,7 +60,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "The geometric mean of the list is: 40.40574036744275\n" + "The geometric mean is: 40.40574036744275\n" ] } ], @@ -71,15 +71,15 @@ " :param num_lst: A list of positive integers or floats.\n", " :returns float: The geometric mean of the numbers in the list.\n", " \"\"\"\n", - " # Initialize the product of the numbers\n", - " product = 1\n", + " # product of numbers\n", + " prod = 1\n", "\n", - " # Iterate through the list of numbers and multiply each number\n", + " # loop through the numbers and multiply each number\n", " for number in num_lst:\n", - " product *= number\n", + " prod *= number\n", "\n", - " # Calculate the nth root of the product and return it\n", - " return product ** (1 / len(num_lst))\n", + " # Return the nth root of the product\n", + " return prod ** (1 / len(num_lst))\n", "\n", "# Test the function with the provided list of numbers\n", "test_number_list = [64, 9, 90, 28, 46, 95, 34, 28, 86, 62, 14, 77, 99, 80,\n", @@ -88,7 +88,7 @@ " 48, 27, 13, 41, 13, 28, 17, 64]\n", "\n", "result = 
geometric_mean(test_number_list)\n", - "print(f\"The geometric mean of the list is: {result}\")\n" + "print(f\"The geometric mean is: {result}\")\n" ] }, { @@ -112,7 +112,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 19, "metadata": { "deletable": false }, @@ -121,13 +121,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Column name: age\n", + "Column: age\n", "Data: [16.0, 27.0, 26.0, 25.0, 29.0, 29.0, 22.0, 35.0, 44.0, 31.0, 76.0, 40.0, 31.0, 23.0, 39.0, 25.0, 54.0, 24.0, 57.0, 21.0, 42.0, 21.0, 36.0, 26.0, 49.0, 54.0, 26.0, 48.0, 33.0, 21.0, 41.0, 48.0, 36.0, 41.0, 29.0, 27.0, 45.0, 32.0, 35.0, 36.0, 35.0, 40.0, 18.0, 34.0, 39.0, 21.0, 62.0, 43.0, 44.0, 29.0, 35.0, 50.0, 18.0, 25.0, 31.0, 29.0, 49.0, 23.0, 45.0, 26.0, 35.0, 34.0, 46.0, 29.0, 39.0, 28.0, 51.0, 67.0, 53.0, 25.0, 30.0, 24.0, 35.0, 43.0, 24.0, 29.0, 38.0, 31.0, 36.0, 23.0, 38.0, 53.0, 24.0, 26.0, 28.0, 34.0, 28.0, 40.0, 51.0, 44.0, 25.0, 56.0, 37.0, 58.0, 39.0, 37.0, 35.0, 26.0, 47.0, 31.0, 60.0, 32.0, 45.0, 42.0, 17.0, 22.0, 33.0, 18.0, 39.0, 59.0, 33.0, 58.0, 58.0, 47.0, 41.0, 64.0, 45.0, 53.0, 24.0, 48.0, 29.0, 40.0, 34.0, 21.0, 26.0, 45.0, 25.0, 17.0, 24.0, 42.0, 29.0, 42.0, 30.0, 29.0, 39.0, 63.0, 49.0, 41.0, 27.0, 30.0, 60.0, 77.0, 19.0, 37.0, 54.0, 29.0, 30.0, 24.0, 21.0, 44.0, 40.0, 32.0, 22.0, 43.0, 52.0, 27.0, 34.0, 20.0, 25.0, 24.0, 20.0, 46.0, 42.0, 43.0, 41.0, 59.0, 25.0, 42.0, 64.0, 22.0, 24.0, 63.0, 56.0, 60.0, 54.0, 37.0, 22.0, 39.0, 45.0, 57.0, 42.0, 19.0, 26.0, 34.0, 69.0, 64.0, 35.0, 40.0, 19.0, 27.0, 37.0, 17.0, 39.0, 74.0, 42.0, 47.0, 43.0, 46.0, 44.0, 31.0, 47.0, 41.0, 43.0, 40.0, 32.0, 31.0, 20.0, 20.0, 33.0, 22.0, 41.0, 41.0, 32.0, 16.0, 29.0, 42.0, 29.0, 47.0, 53.0, 18.0, 47.0, 34.0, 36.0, 63.0, 36.0, 27.0, 28.0, 33.0, 32.0, 42.0, 31.0, 17.0, 28.0, 24.0, 71.0, 51.0, 28.0, 53.0, 54.0, 45.0, 33.0, 48.0, 34.0, 23.0, 35.0, 33.0, 32.0, 52.0, 30.0, 23.0, 35.0, 42.0, 37.0, 56.0, 36.0, 27.0, 30.0, 31.0, 46.0, 51.0, 72.0, 28.0, 63.0, 28.0, 
33.0, 24.0, 27.0, 24.0, 28.0, 28.0, 17.0, 46.0, 52.0, 39.0, 49.0, 30.0, 51.0, 16.0, 18.0, 22.0, 40.0, 61.0, 52.0, 51.0, 36.0, 36.0, 59.0, 17.0, 18.0, 41.0, 33.0, 25.0, 23.0, 47.0, 58.0, 47.0, 34.0, 28.0, 37.0, 87.0, 39.0, 27.0, 35.0, 36.0, 24.0, 26.0, 34.0, 51.0, 49.0, 41.0, 54.0, 36.0, 26.0, 35.0, 22.0, 27.0, 42.0, 32.0, 32.0, 25.0, 26.0, 53.0, 26.0, 40.0, 55.0, 29.0, 31.0, 19.0, 57.0, 40.0, 35.0, 35.0, 39.0, 37.0, 36.0, 62.0, 43.0, 32.0, 34.0, 37.0, 37.0, 33.0, 35.0, 40.0, 21.0, 30.0, 23.0, 26.0, 39.0, 33.0, 34.0, 37.0, 26.0, 24.0, 25.0, 31.0, 49.0, 59.0, 50.0, 37.0, 28.0, 26.0, 23.0, 32.0, 24.0, 42.0, 34.0, 68.0, 31.0, 83.0, 35.0, 29.0, 50.0, 56.0, 43.0, 38.0, 27.0, 36.0, 55.0, 36.0, 68.0, 61.0, 46.0, 47.0, 26.0, 37.0, 22.0, 18.0, 39.0, 49.0, 23.0, 47.0, 32.0, 45.0, 51.0, 31.0, 54.0, 31.0, 23.0, 29.0, 28.0, 31.0, 24.0, 27.0, 57.0, 39.0, 38.0, 34.0, 39.0, 20.0, 35.0, 36.0, 38.0, 33.0, 57.0, 38.0, 72.0, 37.0, 47.0, 43.0, 37.0, 75.0, 21.0, 20.0, 29.0, 37.0, 41.0, 22.0, 23.0, 64.0, 34.0, 49.0, 32.0, 25.0, 39.0, 53.0, 27.0, 36.0, 20.0, 39.0, 19.0, 34.0, 36.0, 34.0, 31.0, 45.0, 34.0, 31.0, 28.0, 57.0, 29.0, 50.0, 40.0, 35.0, 53.0, 59.0, 18.0, 28.0, 52.0, 38.0, 48.0]\n" ] } ], "source": [ - "def read_csv_column(file, index, convert_to_numbers=True):\n", + "def read_column(file, index, convert_to_numbers=True):\n", " \"\"\"\n", " :param file: The path of the CSV file.\n", " :param index: The index of the column to read (0-based).\n", @@ -136,9 +136,7 @@ " \"\"\"\n", "\n", " try:\n", - " # Open the CSV file\n", " with open(file, 'r') as csvfile:\n", - " # Initialize the data list\n", " csv_column_data = []\n", "\n", " # Read the file content line by line\n", @@ -150,18 +148,16 @@ "\n", " # Iterate through the remaining lines\n", " for line in lines:\n", - " # Split the line by commas\n", + " # Split the line by commas and get the specified column\n", " row = line.strip().split(',')\n", - "\n", - " # Get the value from the specified column\n", " value = row[index]\n", 
"\n", - " # Convert the value to a number if the flag is set to True\n", + " # Convert to a number if required\n", " if convert_to_numbers:\n", " try:\n", " value = float(value)\n", " except ValueError:\n", - " pass # If the conversion fails, keep the value as a string\n", + " pass # If fails, keep the value as a string\n", "\n", " # Add the value to the data list\n", " csv_column_data.append(value)\n", @@ -170,13 +166,14 @@ " except FileNotFoundError:\n", " print(f\"Error: File {file} not found.\")\n", " return None, None\n", + " except Exception as e:\n", + " print(f\"Something went wrong: {e}\")\n", + " return None, None\n", "\n", - "# Test the function using the 'task1.csv' file and a specified column index\n", - "file_path = 'task1.csv'\n", - "column_index = 0\n", - "column_name, data = read_csv_column(file_path, column_index)\n", + "# Test the function\n", + "column_name, data = read_column('task1.csv', 0)\n", "\n", - "print(f\"Column name: {column_name}\")\n", + "print(f\"Column: {column_name}\")\n", "print(f\"Data: {data}\")\n" ] }, @@ -201,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 190, + "execution_count": 12, "metadata": { "deletable": false }, @@ -215,11 +212,11 @@ } ], "source": [ - "def read_all_csv_data(file_name, conversion_indicators):\n", + "def read_all_data(file_name, conversion_indicators):\n", " \"\"\"\n", " This function will return all columns data as a dictionary\n", " :param file_name: CSV file path\n", - " :param conversion_indicators: indicator to check column values should be converted from strings to numbers\n", + " :param conversion_indicators: indicator to check column values needs be converted from strings to numbers\n", " :return: dictionary\n", " \"\"\"\n", " #initializing the data dictionary\n", @@ -232,7 +229,7 @@ "\n", " for index in range(num_columns):\n", " convert = conversion_indicators[index]\n", - " col_name, col_data = read_csv_column(file_name, index, convert)\n", + " col_name, col_data = 
read_column(file_name, index, convert)\n", "\n", " #add data to the dictionary\n", " dict_data[col_name] = col_data\n", @@ -244,7 +241,7 @@ "\n", "file_path = 'task1.csv'\n", "conversion_flags = [True, True, True, True, True, True, True, True, True, True]\n", - "all_csv_data = read_all_csv_data(file_path, conversion_flags)\n", + "all_csv_data = read_all_data(file_path, conversion_flags)\n", "\n", "print(all_csv_data)" ] @@ -280,29 +277,50 @@ }, { "cell_type": "code", - "execution_count": 194, + "execution_count": 17, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Spear-man's Rank Correlation Coefficient for `pop` and `share_white`: 0.07665516130573191\n" + "Spear-man's Rank Correlation Coefficient for `age` and `pop`: 0.0295488534854752\n" ] } ], "source": [ + "def arithmatic_mean(numbers):\n", + " \"\"\"\n", + " This function will take a list of number and will return arithmatic mean\n", + " :param numbers: list of numbers\n", + " :return: float\n", + " \"\"\"\n", + "\n", + " # Handle exception for divide by zero error\n", + " if len(numbers) == 0:\n", + " raise ValueError(\"Your list should have at least one number\")\n", + "\n", + " # initialize sum\n", + " total = 0\n", + "\n", + " for num in numbers:\n", + " total += num\n", + "\n", + " return total / len(numbers)\n", + "\n", "def rank_list(lst):\n", " \"\"\"\n", - " This function returns a list of ranks for the input list, taking duplicates into account.\n", - " :param lst: A list of data that needed to be ranked\n", + " Returns a list of ranks for the input list, taking duplicates into account.\n", + " :param lst: A list of numbers\n", " :return list: A list of ranked values\n", " \"\"\"\n", " ranks = {}\n", " for i, val in enumerate(sorted(lst), 1):\n", + " #create a new list for each key if it doesn't already exist and append the rank of the corresponding value to it.\n", " ranks.setdefault(val, []).append(i)\n", "\n", - " # get the average rank of each item using dictionary 
comprehension\n", - " avg_ranks = {v: sum(r) / len(r) for v, r in ranks.items()}\n", + " avg_ranks = {}\n", + " for index, rank in ranks.items():\n", + " avg_ranks[index] = arithmatic_mean(rank)\n", "\n", " return [avg_ranks[val] for val in lst]\n", "\n", @@ -337,8 +355,8 @@ "\n", "# Read two columns of data from the CSV file\n", "file_path = 'task1.csv'\n", - "column1_name, column1_data = read_csv_column(file_path, 1, True)\n", - "column2_name, column2_data = read_csv_column(file_path, 2, True)\n", + "column1_name, column1_data = read_column(file_path, 0, True)\n", + "column2_name, column2_data = read_column(file_path, 1, False)\n", "\n", "# Calculate the Spear-man's Rank Correlation Coefficient for the two columns\n", "try:\n", @@ -362,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 189, + "execution_count": 14, "metadata": { "deletable": false }, @@ -384,7 +402,7 @@ " \"\"\"\n", " # Read the data from the CSV file\n", " con_indicators = [True, True, True, True, True, True, True, True, True, True]\n", - " all_column_data = read_all_csv_data(csv_file, con_indicators)\n", + " all_column_data = read_all_data(csv_file, con_indicators)\n", "\n", " # Get the column names\n", " column_names = list(all_column_data.keys())\n", @@ -415,8 +433,7 @@ " return all_correlation_coefficients\n", "\n", "# Test the function\n", - "file = 'task1.csv'\n", - "result = generate_all_correlation_coefficients(file)\n", + "result = generate_all_correlation_coefficients('task1.csv')\n", "print(result)\n" ] }, @@ -441,7 +458,7 @@ }, { "cell_type": "code", - "execution_count": 205, + "execution_count": 15, "metadata": { "deletable": false }, diff --git a/UFCFVQ-15-M_Python_Programming_With_Libraries_Template.ipynb b/UFCFVQ-15-M_Python_Programming_With_Libraries_Template.ipynb index ced66f4..97dd2bd 100644 --- a/UFCFVQ-15-M_Python_Programming_With_Libraries_Template.ipynb +++ b/UFCFVQ-15-M_Python_Programming_With_Libraries_Template.ipynb @@ -9,7 +9,7 @@ "# UFCFVQ-15-M Programming 
for Data Science (Spring 2023)\n", "# Python Programming with Libraries (Task C)\n", "\n", - "## Student Id: " + "## Student Id: 22074847" ] }, { @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": { "deletable": false }, @@ -113,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": { "deletable": false }, @@ -170,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": { "deletable": false }, @@ -227,7 +227,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, "metadata": { "deletable": false }, @@ -244,18 +244,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Correlation Coefficient between Click Events and Score: 0.28 \n", + "Correlation Coefficient between Click Events and Scores: 0.28 \n", "\n", "Explanation of findings: \n", " \n", " It was difficult to discern a clear pattern in the data when I attempted to visualise the relationship between click events and scores using various types of plots due to the high density of data points. However, I utilised a scatter plot with each dot's transparency set to 0.2. Because of this, I can observe the Visualization's density as well as the data's distribution.\n", - " \n", + "\n", " As can be seen in the graph, there is a high density between 0 and 5000 click events, and the students earned more than 60% of marks on their exams. This visualisation illustrates a weakly positive correlation between click events and scores. This indicates that as students' engagement (as measured by click events) increases, their achievement (as measured by scores) tends to increase as well, though not significantly.\n", - " \n", + "\n", " According to the plot, many students have superior grades (>60) despite having a small number of click events. 
Nonetheless, there are a few students with greater click events who scored above 80%.\n", - " \n", + "\n", " According to the calculated correlation coefficient of 0.28, a positive but faint correlation exists between click events and scores.\n", - " \n", + "\n", " In conclusion, the weak positive correlation between click events and scores indicates that there may be a weak relationship between engagement and achievement.\n", "\n" ] @@ -312,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 5, "metadata": { "deletable": false }, @@ -321,10 +321,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Independent Two-Sample t-test Statistic: 123.15732942753041\n", "P-value: 0.0000 \n", "\n", - "Samples are likely drawn from different distributions (reject H0 - SIGNIFICANT) \n", + "reject H0. This means there are not enough evidence to say that Click events do not have significant effect on scores\n", "\n", "Explanation of findings: \n", " \n", @@ -396,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 6, "metadata": { "deletable": false }, @@ -415,13 +414,13 @@ "text": [ "Explanation of findings: \n", " \n", - " To find out the correlation between gender and scores, I used a box plot instead of a scatter plot. The reason for choosing box plots is that, box plots are better suited for visualizing the distribution of a continuous variable across different categories of a categorical variable, such as comparing scores across gender, because they provide a clearer and more informative view of the data distribution, handle overlapping data points more effectively, and scale well with large datasets. 
Box plots are designed to display the distribution of data for categorical variables, making it easy to compare medians, interquartile ranges, and potential outliers across different.\n", + " Box plots were chosen because they are better at showing the distribution of a continuous variable across different categories of a categorical variable, like comparing scores by gender. This is because box plots give a clearer and more informative picture of how the data is distributed, handle overlapping data points better, and work well with large datasets.Box plots are made to show how the data for category variables are spread out. This makes it easy to compare medians, inter quartile ranges, and possible outliers across different groups.\n", "\n", " As you can see in the box plots, the median score of female is slightly higher than median score of male. since the medians are not significantly different, we don't have strong evidence to conclude that female students getting more score than male students.\n", "\n", " On other hand, the boxes itself are almost identical and it is not indicating a big difference in score variability between genders. According to the box plot, males have more outlines than females. It indicates that there are more individual data points for males that falls outside the typical range of scores.As I can see in the boxplot, the bottom whisker for females is slightly longer than males, indicating that the range of lower scores is somewhat larger for females.\n", "\n", - " Overall, this observations shows that, although there are some differences between the genders in terms of score distribution, the overall patterns are quite similar. 
\n", + " Overall, this observations shows that, although there are some differences between the genders in terms of score distribution, the overall patterns are quite similar.\n", "\n" ] } @@ -472,7 +471,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 7, "metadata": { "deletable": false }, @@ -563,7 +562,39 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "add markdown text here" + "\n", + "# Development Process Report\n", + "\n", + "## Introduction\n", + "\n", + "The objective of this task was to analyse student performance data using Python libraries such as NumPy, Pandas, Matplotlib, and Seaborn. I merged two CSV files, cleaned the merged data, filtered out redundant rows, and investigated relationships between variables such as click events, gender, and educational score.\n", + "\n", + "## Merging and Cleaning Data\n", + "\n", + "The first step was to merge the datasets from task2a.csv and task2b.csv into a single DataFrame. I utilized Pandas to read the CSV files and merge them using the `merge()` function. Cleaning the data by removing missing values and unnecessary columns is also a critical step in data preparation. It helps to ensure that the data is accurate, complete, and consistent, which is essential for meaningful analysis. It demonstrates my understanding of data quality and how to handle data issues. Removing duplicate rows based on the `id_student` column is also a crucial step in data preparation. It helps to ensure that the analysis is based on unique observations and avoids biases that could arise from double-counting data.\n", + "\n", + "## Investigating Relationships\n", + "\n", + "### Engagement and Attainment\n", + "\n", + "The task examined engagement (click_events) and attainment (score). I started with scatter, hexbin, and density plots. These visualisations failed to show a relationship owing to data density. 
I used a scatter plot with semi-transparent dots to reveal the density and spread of the data.\n", + "\n", + "The Pearson correlation coefficient showed a modest positive association between click_events and score, confirming my observations. This suggests that there is a relationship between engagement and attainment, but it is not strong.\n", + "\n", + "An independent two-sample t-test tested the hypothesis that engagement affects attainment. The p-value was reported as 0.0000, showing a statistically significant difference between the two samples. However, correlation does not imply causality. The t-test assumed normality and equal variances in the two groups. If these assumptions are not met, a non-parametric test like the Mann-Whitney U test may be more reliable.\n", + "\n", + "\n", + "### Gender and Attainment\n", + "\n", + "I used box plots to investigate the effects of gender on levels of attainment. Since `gender` is a categorical variable, a box plot is the most suitable visualization. Due to the density of data points, many of the outliers overlapped. I could combine a violin plot with a swarm plot overlay to better visualize the distribution of scores and potential outliers.\n", + "\n", + "I performed an independent two-sample t-test to test whether there is any statistically significant difference between the attainment of male and female students. The resulting p-value was 0.5073, indicating that there is not enough evidence to conclude that there is a statistically significant difference in the attainment levels of male and female students.\n", + "\n", + "## Reflection\n", + "\n", + "Throughout the development process, I faced challenges in visualizing and interpreting dense data. 
I iteratively experimented with different visualization techniques and statistical tests to analyze the relationships between variables effectively.\n", + "\n", + "In future analyses, it would be beneficial to explore other factors that may contribute to student performance and test alternative statistical methods to better understand the relationships between variables." ] }, { -- GitLab