{ "cells": [ { "cell_type": "markdown", "id": "5d835106-2c35-4478-a134-0b3ad99e5bcd", "metadata": {}, "source": [ "a.\tLoad the dataset into a Pandas dataframe and display the first five rows and inspect the column names and their data types." ] }, { "cell_type": "code", "execution_count": 2, "id": "9a1eb9db-7d22-4c9a-bdf0-13c242885d65", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "First five rows:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexchest painrest bpcholesterolfbsrest ecgmax heart rateex anginaoldpeakslope STvesselsthaltarget
052101252120116801.02230
153101402031015513.10030
270101451740112512.60030
361101482030116100.02130
462001382941110601.91320
\n", "
" ], "text/plain": [ " age sex chest pain rest bp cholesterol fbs rest ecg max heart rate \\\n", "0 52 1 0 125 212 0 1 168 \n", "1 53 1 0 140 203 1 0 155 \n", "2 70 1 0 145 174 0 1 125 \n", "3 61 1 0 148 203 0 1 161 \n", "4 62 0 0 138 294 1 1 106 \n", "\n", " ex angina oldpeak slope ST vessels thal target \n", "0 0 1.0 2 2 3 0 \n", "1 1 3.1 0 0 3 0 \n", "2 1 2.6 0 0 3 0 \n", "3 0 0.0 2 1 3 0 \n", "4 0 1.9 1 3 2 0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Column names and data types:\n", "age int64\n", "sex int64\n", "chest pain int64\n", "rest bp int64\n", "cholesterol int64\n", "fbs int64\n", "rest ecg int64\n", "max heart rate int64\n", "ex angina int64\n", "oldpeak float64\n", "slope ST int64\n", "vessels int64\n", "thal int64\n", "target int64\n", "dtype: object\n" ] } ], "source": [ "import pandas as pd\n", "\n", "# Load the dataset. The path is relative to the notebook's working directory, so\n", "# heart.csv must sit next to this notebook (a machine-specific absolute path\n", "# would only work on the original author's computer).\n", "file_path = \"heart.csv\"\n", "df = pd.read_csv(file_path)\n", "\n", "# Display first five rows\n", "print(\"First five rows:\")\n", "display(df.head())\n", "\n", "# Inspect column names and data types\n", "print(\"Column names and data types:\")\n", "print(df.dtypes)" ] }, { "cell_type": "markdown", "id": "475f0086-b2e3-458f-8eb8-9c7d99d3cfaa", "metadata": {}, "source": [ "b.\tCheck the shape of the dataframe and identify missing values, if any." 
] }, { "cell_type": "code", "execution_count": 4, "id": "7a9e0821-b85d-4fcb-aea8-31b602ec2033", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape of dataframe: (1025, 14)\n", "Missing values:\n", "age 0\n", "sex 0\n", "chest pain 0\n", "rest bp 0\n", "cholesterol 0\n", "fbs 0\n", "rest ecg 0\n", "max heart rate 0\n", "ex angina 0\n", "oldpeak 0\n", "slope ST 0\n", "vessels 0\n", "thal 0\n", "target 0\n", "dtype: int64\n" ] } ], "source": [ "# Check shape and missing values\n", "print(\"Shape of dataframe:\", df.shape)\n", "print(\"Missing values:\")\n", "print(df.isnull().sum())" ] }, { "cell_type": "code", "execution_count": 5, "id": "1fa12942-12ce-4f77-925e-e4aef36a9406", "metadata": {}, "outputs": [], "source": [ "# The data set has 1025 observations and 13 input variables and 1 output variable. There are no missing values in the dataset." ] }, { "cell_type": "markdown", "id": "a8a7b2dd-2609-45b1-be1a-fb344c1d1085", "metadata": {}, "source": [ "c.\tRename the column rest bp to Resting_Blood_Pressure." ] }, { "cell_type": "code", "execution_count": 7, "id": "bd630d77-e836-4852-8fc0-c15361addfba", "metadata": {}, "outputs": [], "source": [ "# Rename 'rest bp' to 'Resting_Blood_Pressure'.\n", "# rename() returns a new DataFrame, so the result is assigned back to df. By default it\n", "# skips labels that are not present, so re-running this cell after the rename is harmless.\n", "df = df.rename(columns={'rest bp': 'Resting_Blood_Pressure'})" ] }, { "cell_type": "markdown", "id": "59906a77-e35f-42ea-89cc-9989c5f346fd", "metadata": {}, "source": [ "d.\tReplace spaces in column names with underscores." ] }, { "cell_type": "code", "execution_count": 9, "id": "78ce10d2-29be-427d-8b3f-7e44c967f32b", "metadata": {}, "outputs": [], "source": [ "# Replace spaces in column names with underscores\n", "df.columns = df.columns.str.replace(\" \", \"_\")" ] }, { "cell_type": "markdown", "id": "1515cacd-0128-4a85-935a-2c3e796febd9", "metadata": {}, "source": [ "e.\tClassify all the variables in the dataset. 
Using .dtypes method, check the data type of all columns in the dataframe. Convert target to a categorical variable, and ensure cholesterol is a continuous variable." ] }, { "cell_type": "code", "execution_count": 11, "id": "d216b92c-3ed6-4066-8960-c3253d05ab80", "metadata": {}, "outputs": [], "source": [ "# Categorical - nominal variables: sex, ex angina, thal\n", "# Categorical - ordinal variables: chest pain, fbs, rest ecg, slope ST\n", "# Numerical - discrete variables: age, rest bp, vessels\n", "# Numerical - continuous variables: cholesterol, max heart rate, oldpeak" ] }, { "cell_type": "code", "execution_count": 12, "id": "0c8bc2b7-afe3-4dbe-b7c4-72992070ac18", "metadata": {}, "outputs": [], "source": [ "# Convert data types to match the classification above\n", "# target (as required) and the nominal variables become pandas categoricals\n", "for column in ['target', 'sex', 'ex_angina', 'thal']:\n", "    df[column] = df[column].astype('category')\n", "\n", "# Make sure the continuous variables hold numeric values; anything that cannot be\n", "# parsed as a number becomes NaN (errors='coerce')\n", "for column in ['cholesterol', 'max_heart_rate']:\n", "    df[column] = pd.to_numeric(df[column], errors='coerce')" ] }, { "cell_type": "markdown", "id": "7e60db58-b9fd-4bbf-8799-fcae29dfa280", "metadata": {}, "source": [ "f.\tSelect and display the cholesterol levels of the first four patients." ] }, { "cell_type": "code", "execution_count": 14, "id": "216921ee-9617-47e8-b25b-65ded69adbd5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Cholesterol levels of first four patients:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cholesterol
0212
1203
2174
3203
\n", "
" ], "text/plain": [ " cholesterol\n", "0 212\n", "1 203\n", "2 174\n", "3 203" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Take the first four patients, then keep only the cholesterol column\n", "# (double brackets keep the result a one-column DataFrame)\n", "print(\"Cholesterol levels of first four patients:\")\n", "df.head(4)[['cholesterol']]" ] }, { "cell_type": "markdown", "id": "4aa137c2-4b38-4cb7-af55-2b88cc66748c", "metadata": {}, "source": [ "g.\tSelect the first four rows of columns age, cholesterol, and max heart rate." ] }, { "cell_type": "code", "execution_count": 16, "id": "1c31f9c3-071b-41ee-8917-02b9856edf30", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "First four rows of age, cholesterol, and max_heart_rate:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agecholesterolmax_heart_rate
052212168
153203155
270174125
361203161
\n", "
" ], "text/plain": [ " age cholesterol max_heart_rate\n", "0 52 212 168\n", "1 53 203 155\n", "2 70 174 125\n", "3 61 203 161" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Select the first four rows of specific columns.\n", "# head(4) picks rows by position; a label slice such as .loc[:3] is inclusive and only\n", "# yields four rows while the index is the default 0, 1, 2, ...\n", "print(\"First four rows of age, cholesterol, and max_heart_rate:\")\n", "display(df[['age', 'cholesterol', 'max_heart_rate']].head(4))" ] }, { "cell_type": "markdown", "id": "ff7244e4-3055-4357-b0cb-d2c66a707997", "metadata": {}, "source": [ "h.\tCombine non-consecutive columns age, sex, and max heart rate into a new dataframe." ] }, { "cell_type": "code", "execution_count": 18, "id": "30a2673c-3731-4554-9d7c-6aed86b26bc1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Combined dataframe:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexmax_heart_rate
0521168
1531155
2701125
3611161
4620106
\n", "
" ], "text/plain": [ " age sex max_heart_rate\n", "0 52 1 168\n", "1 53 1 155\n", "2 70 1 125\n", "3 61 1 161\n", "4 62 0 106" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Combine non-consecutive columns into a new dataframe.\n", "# .copy() makes combined_df independent of df, so later edits to it neither touch df\n", "# nor trigger a SettingWithCopyWarning.\n", "combined_df = df[['age', 'sex', 'max_heart_rate']].copy()\n", "print(\"Combined dataframe:\")\n", "combined_df.head()" ] }, { "cell_type": "markdown", "id": "e7a5cbd8-273c-4d7a-bbcb-1fc141567573", "metadata": {}, "source": [ "i.\tGenerate a random sample of 10 patients from the dataset. Oversample patients older than 60 years for a new sample." ] }, { "cell_type": "code", "execution_count": 20, "id": "21938ebe-a2b7-4999-8e32-eeff0a0b3d4d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Random sample of 10 patients:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexchest_painResting_Blood_Pressurecholesterolfbsrest_ecgmax_heart_rateex_anginaoldpeakslope_STvesselsthaltarget
52762001242090116300.02021
35953021282160011500.02001
44755101602890014510.81130
3150011202440116201.12021
62148101302561015010.02230
59074011202690012110.22121
9056410120246009612.20120
73767101202290012912.61230
7648121242551117500.02221
94870101451740112512.60030
\n", "
" ], "text/plain": [ " age sex chest_pain Resting_Blood_Pressure cholesterol fbs rest_ecg \\\n", "527 62 0 0 124 209 0 1 \n", "359 53 0 2 128 216 0 0 \n", "447 55 1 0 160 289 0 0 \n", "31 50 0 1 120 244 0 1 \n", "621 48 1 0 130 256 1 0 \n", "590 74 0 1 120 269 0 0 \n", "905 64 1 0 120 246 0 0 \n", "737 67 1 0 120 229 0 0 \n", "76 48 1 2 124 255 1 1 \n", "948 70 1 0 145 174 0 1 \n", "\n", " max_heart_rate ex_angina oldpeak slope_ST vessels thal target \n", "527 163 0 0.0 2 0 2 1 \n", "359 115 0 0.0 2 0 0 1 \n", "447 145 1 0.8 1 1 3 0 \n", "31 162 0 1.1 2 0 2 1 \n", "621 150 1 0.0 2 2 3 0 \n", "590 121 1 0.2 2 1 2 1 \n", "905 96 1 2.2 0 1 2 0 \n", "737 129 1 2.6 1 2 3 0 \n", "76 175 0 0.0 2 2 2 1 \n", "948 125 1 2.6 0 0 3 0 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Draw 10 patients at random from the whole dataset\n", "sample_df = df.sample(\n", "    n=10,\n", "    random_state=42,  # fixed seed, so the same 10 patients are drawn on every run\n", ")\n", "print(\"Random sample of 10 patients:\")\n", "sample_df" ] }, { "cell_type": "code", "execution_count": 21, "id": "09146aac-32c8-4af4-a327-a1da154a48bc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Oversampled patients older than 60 years:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexchest_painResting_Blood_Pressurecholesterolfbsrest_ecgmax_heart_rateex_anginaoldpeakslope_STvesselsthaltarget
41370101303220010902.41320
100266101122120013210.12120
43165001502250011401.01330
2966710120237017101.01020
69866101122120013210.12120
8968101441931114103.41230
41370101303220010902.41320
48765101352540012702.81130
79268101441931114103.41230
37566101602280013802.32011
\n", "
" ], "text/plain": [ " age sex chest_pain Resting_Blood_Pressure cholesterol fbs rest_ecg \\\n", "413 70 1 0 130 322 0 0 \n", "1002 66 1 0 112 212 0 0 \n", "431 65 0 0 150 225 0 0 \n", "296 67 1 0 120 237 0 1 \n", "698 66 1 0 112 212 0 0 \n", "89 68 1 0 144 193 1 1 \n", "413 70 1 0 130 322 0 0 \n", "487 65 1 0 135 254 0 0 \n", "792 68 1 0 144 193 1 1 \n", "375 66 1 0 160 228 0 0 \n", "\n", " max_heart_rate ex_angina oldpeak slope_ST vessels thal target \n", "413 109 0 2.4 1 3 2 0 \n", "1002 132 1 0.1 2 1 2 0 \n", "431 114 0 1.0 1 3 3 0 \n", "296 71 0 1.0 1 0 2 0 \n", "698 132 1 0.1 2 1 2 0 \n", "89 141 0 3.4 1 2 3 0 \n", "413 109 0 2.4 1 3 2 0 \n", "487 127 0 2.8 1 1 3 0 \n", "792 141 0 3.4 1 2 3 0 \n", "375 138 0 2.3 2 0 1 1 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Oversample patients older than 60: draw 10 rows with replacement from that subgroup,\n", "# so the same patient can appear more than once\n", "is_over_60 = df['age'] > 60\n", "older_patients = df[is_over_60].sample(n=10, replace=True, random_state=42)\n", "print(\"Oversampled patients older than 60 years:\")\n", "older_patients" ] }, { "cell_type": "code", "execution_count": null, "id": "af7951eb-c375-4484-8848-593609ac2495", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:base] *", "language": "python", "name": "conda-base-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 5 }