# first i will import the packages needed in the project.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency
%matplotlib inline


# Load the raw no-show appointments CSV from the working directory and
# preview its first ten rows.
df=pd.read_csv('./noshowappointments-kagglev2-may-2016.csv')
df.head(10)


# Report the dataset dimensions. An f-string is used so the message has
# exactly one space between words (the comma-separated form printed
# "rows and  14" with a doubled space).
n_rows, n_cols = df.shape
print(f'There are {n_rows} rows and {n_cols} columns')

There are 110527 rows and  14 columns


# Inspect the data type pandas inferred for each column.
df.dtypes

PatientId         float64
AppointmentID       int64
Gender             object
ScheduledDay       object
AppointmentDay     object
Age                 int64
Neighbourhood      object
Scholarship         int64
Hipertension        int64
Diabetes            int64
Alcoholism          int64
Handcap             int64
SMS_received        int64
No-show            object
dtype: object


# Count the missing values in each column.
df.isna().sum()

PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64


# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0


# Summary statistics for every numeric column (Age included).
df.describe()


# PatientId and AppointmentID are not needed for this analysis, so remove them.
df.drop(['PatientId', 'AppointmentID'], axis=1, inplace=True)


# Preview a few rows to confirm the two columns are gone.
df.head(n=3)


# Reduce both timestamps to calendar days while keeping pandas' native
# datetime64 dtype. Using `.dt.date` here would turn the columns into
# object-dtype Python `date` values and disable vectorised datetime operations.
for day_col in ('ScheduledDay', 'AppointmentDay'):
    parsed = pd.to_datetime(df[day_col])
    # Drop the timezone marker, then truncate the time of day to midnight.
    df[day_col] = parsed.dt.tz_localize(None).dt.normalize()
df[['ScheduledDay', 'AppointmentDay']].head()


# Waiting_time is the number of whole days between the day the appointment
# was scheduled and the day of the appointment itself.
day_gap = df['AppointmentDay'].sub(df['ScheduledDay'])
df['Waiting_time'] = day_gap.dt.days
df.head(n=3)


# Summary statistics of the waiting time, in days.
df.Waiting_time.describe()

count    110527.000000
mean         10.183702
std          15.254996
min          -6.000000
25%           0.000000
50%           4.000000
75%          15.000000
max         179.000000
Name: Waiting_time, dtype: float64


# List the distinct values of each indicator column to check they are binary.
for flag_col in ('Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism'):
    print(f'"{flag_col}" unique values: {df[flag_col].unique()}')

"Scholarship" unique values: [0 1]
"Hipertension" unique values: [1 0]
"Diabetes" unique values: [0 1]
"Alcoholism" unique values: [0 1]


# The four indicator columns only hold 0 and 1, so store them as booleans.
for flag_col in ['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism']:
    df[flag_col] = df[flag_col].astype(bool)


# Preview the converted columns.
df.loc[:, ['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism']].head()


# Confirm the new dtypes.
df.loc[:, ['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   Scholarship   110527 non-null  bool 
 1   Hipertension  110527 non-null  bool 
 2   Diabetes      110527 non-null  bool 
 3   Alcoholism    110527 non-null  bool 
dtypes: bool(4)
memory usage: 431.9 KB


# Summary (count / unique / top / freq) of the boolean indicator columns.
df.loc[:, ['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism']].describe()


# List the distinct values of the handicap and SMS columns.
for label, column in (('handicap', 'Handcap'), ('sms_received', 'SMS_received')):
    print(f'"{label}" unique values: {df[column].unique()}')

"handicap" unique values: [0 1 2 3 4]
"sms_received" unique values: [0 1]


# SMS_received only holds 0 and 1, so convert it to a boolean column.
df['SMS_received'] = df['SMS_received'].astype(bool)


# Correct the misspelled column name; its integer values are left unchanged.
df.rename({'Handcap': 'Handicap'}, axis='columns', inplace=True)


# Confirm the dtypes of the two columns.
df.loc[:, ['Handicap', 'SMS_received']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   Handicap      110527 non-null  int64
 1   SMS_received  110527 non-null  bool 
dtypes: bool(1), int64(1)
memory usage: 971.6 KB


# Give the target column an identifier-friendly name, then list its raw values.
df.rename({'No-show': 'no_show'}, axis='columns', inplace=True)
print(f'"no_show" unique values: {df["no_show"].unique()}')

"no_show" unique values: ['No' 'Yes']


# Encode the target as a boolean ('Yes' -> True, anything else -> False)
# with a single vectorised comparison instead of a per-row lambda.
df['no_show'] = df['no_show'] == 'Yes'
print(f'"no_show" unique values: {df.no_show.unique()}')

"no_show" unique values: [False  True]


# Confirm that the column is now boolean.
df.no_show.dtype

dtype('bool')


# Derive the positive outcome: show_up is the logical negation of no_show.
df['show_up'] = ~df['no_show']
df.loc[:, ['no_show', 'show_up']].head()


# no_show is now redundant, so drop it and list the remaining columns.
df.drop('no_show', axis=1, inplace=True)
df.columns

Index(['Gender', 'ScheduledDay', 'AppointmentDay', 'Age', 'Neighbourhood',
       'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handicap',
       'SMS_received', 'Waiting_time', 'show_up'],
      dtype='object')


# Preview the frame after the type conversions.
df.head(n=5)


# Summary statistics of the remaining numeric columns.
df.describe()


# Histogram of patient ages, used to spot implausible values.
age_axes = df['Age'].hist(bins=50)
age_axes.set_title('Histogram of Age')
age_axes.set_xlabel('Age')
age_axes.set_ylabel('Count');


# A negative age is invalid; list the affected rows before removing them.
df[df['Age'] < 0]


df.drop(index=df.index[df['Age'] < 0], inplace=True)


# Age distribution after the clean-up:

df['Age'].describe()

count    110526.000000
mean         37.089219
std          23.110026
min           0.000000
25%          18.000000
50%          37.000000
75%          55.000000
max         115.000000
Name: Age, dtype: float64


# A negative waiting time means the appointment day precedes the scheduling
# day; list those rows before removing them.
df[df['Waiting_time'] < 0]


df.drop(index=df.index[df['Waiting_time'] < 0], inplace=True)


# Waiting-time distribution after the clean-up.
df['Waiting_time'].describe()

count    110521.000000
mean         10.184345
std          15.255153
min           0.000000
25%           0.000000
50%           4.000000
75%          15.000000
max         179.000000
Name: Waiting_time, dtype: float64


# List the distinct gender codes to confirm there are no unexpected values.
print(f'"gender" unique values: {df.Gender.unique()}')

"gender" unique values: ['F' 'M']


# Print every distinct neighbourhood name in sorted order, one per line,
# so misspellings or near-duplicates can be spotted by eye.
neighbourhood_values = np.sort(df.Neighbourhood.unique())
print('\n'.join(neighbourhood_values))

AEROPORTO
ANDORINHAS
ANTÔNIO HONÓRIO
ARIOVALDO FAVALESSA
BARRO VERMELHO
BELA VISTA
BENTO FERREIRA
BOA VISTA
BONFIM
CARATOÍRA
CENTRO
COMDUSA
CONQUISTA
CONSOLAÇÃO
CRUZAMENTO
DA PENHA
DE LOURDES
DO CABRAL
DO MOSCOSO
DO QUADRO
ENSEADA DO SUÁ
ESTRELINHA
FONTE GRANDE
FORTE SÃO JOÃO
FRADINHOS
GOIABEIRAS
GRANDE VITÓRIA
GURIGICA
HORTO
ILHA DAS CAIEIRAS
ILHA DE SANTA MARIA
ILHA DO BOI
ILHA DO FRADE
ILHA DO PRÍNCIPE
ILHAS OCEÂNICAS DE TRINDADE
INHANGUETÁ
ITARARÉ
JABOUR
JARDIM CAMBURI
JARDIM DA PENHA
JESUS DE NAZARETH
JOANA D´ARC
JUCUTUQUARA
MARIA ORTIZ
MARUÍPE
MATA DA PRAIA
MONTE BELO
MORADA DE CAMBURI
MÁRIO CYPRESTE
NAZARETH
NOVA PALESTINA
PARQUE INDUSTRIAL
PARQUE MOSCOSO
PIEDADE
PONTAL DE CAMBURI
PRAIA DO CANTO
PRAIA DO SUÁ
REDENÇÃO
REPÚBLICA
RESISTÊNCIA
ROMÃO
SANTA CECÍLIA
SANTA CLARA
SANTA HELENA
SANTA LUÍZA
SANTA LÚCIA
SANTA MARTHA
SANTA TEREZA
SANTO ANDRÉ
SANTO ANTÔNIO
SANTOS DUMONT
SANTOS REIS
SEGURANÇA DO LAR
SOLON BORGES
SÃO BENEDITO
SÃO CRISTÓVÃO
SÃO JOSÉ
SÃO PEDRO
TABUAZEIRO
UNIVERSITÁRIO
VILA RUBIM


# Neighbourhood with the fewest appointments in the cleaned data.
df.Neighbourhood.value_counts().idxmin()

'PARQUE INDUSTRIAL'

# Display the cleaned frame.
df


# Persist the cleaned data, without the index, for the analysis stage.
df.to_csv('./medical_appointment_no_show_cleaned.csv', index=False)


# Reload the cleaned data for the exploratory analysis. The date columns are
# named explicitly: `infer_datetime_format` alone parses nothing unless
# `parse_dates` is given, and it is deprecated in pandas 2.x.
data = pd.read_csv('./medical_appointment_no_show_cleaned.csv',
                   parse_dates=['ScheduledDay', 'AppointmentDay'])


data.head(3)


# Attendance counts, the same split as percentages, and a horizontal bar
# chart of the counts.
show_up_counts = data['show_up'].value_counts()
print(show_up_counts)
print(data['show_up'].value_counts(normalize=True).mul(100).astype(str) + '%')
show_up_counts.sort_values().plot.barh()

True     88207
False    22314
Name: show_up, dtype: int64
True     79.81017182254956%
False    20.18982817745044%
Name: show_up, dtype: object

<AxesSubplot:>


# Histogram of patient ages.
data['Age'].plot.hist(title="The distribution of Patient's Age");


# Box plot of patient ages, to show the spread and any outliers.
data['Age'].plot.box();


# Contingency table of gender against attendance.
chisqt = pd.crosstab(data['Gender'], data['show_up'], margins=False)
chisqt


# Bar chart of attendance counts for each gender.
title = 'A bar chart showing gender with respect to showing up for appointment'
chisqt.plot(kind='bar', title=title);


# Chi-square test of independence. The test is run once; element 1 of the
# result is the p-value.
print('p-value:', chi2_contingency(chisqt)[1].round(4))

p-value: 0.172


# Contingency table of neighbourhood against attendance.
chisqt = pd.crosstab(data['Neighbourhood'], data['show_up'], margins=False)
chisqt.head()


# Horizontal bar chart of attendance counts for each neighbourhood; the tall
# figure leaves room for one pair of bars per neighbourhood.
title = 'A bar chart showing neighbourhood with respect to showing up for appointment'
chisqt.plot(kind='barh', title=title, figsize=(10, 30));


# Chi-square test of independence. The test is run once; element 1 of the
# result is the p-value.
# NOTE(review): some neighbourhoods have very few appointments, so the
# chi-square approximation may be unreliable for them - consider pooling
# rare categories.
print('p-value:', chi2_contingency(chisqt)[1].round(4))

p-value: 0.0


# Contingency table of alcoholism against attendance.
chisqt = pd.crosstab(data['Alcoholism'], data['show_up'], margins=False)
chisqt.head()


# Bar chart of attendance counts by alcoholism status.
title = 'A bar chart showing Alcoholism with respect to showing up for appointment'
chisqt.plot(kind='bar', title=title, figsize=(20, 8), fontsize=14);


# Chi-square test of independence. The test is run once; element 1 of the
# result is the p-value.
print('p-value:', chi2_contingency(chisqt)[1].round(4))

p-value: 0.9694


# Mean Handicap value for each attendance group.
hc = data['Handicap'].groupby(data['show_up']).mean()
hc

show_up
False    0.020212
True     0.022742
Name: Handicap, dtype: float64


# Bar chart of the mean Handicap value per attendance group. The title names
# the quantity actually plotted (Handicap, not waiting time).
hc.plot(kind='bar', title='Average Handicap for showing up for appointment');


# Contingency table of Handicap level against attendance.
chisqt = pd.crosstab(data['Handicap'], data['show_up'], margins=False)
print(chisqt.head())

# Chi-square test of independence. The test is run once; element 1 of the
# result is the p-value.
# NOTE(review): Handicap levels 3 and 4 contain only a handful of patients,
# so the chi-square approximation may be unreliable - consider merging the
# higher levels.
print('p-value:', chi2_contingency(chisqt)[1].round(4))

# Bar chart of attendance counts for each Handicap level.

title = 'A bar chart showing Handicap with respect to showing up for appointment'
chisqt.plot(kind='bar', title=title, figsize=(20, 8), fontsize=14);

show_up   False   True
Handicap              
0         21909  86373
1           364   1676
2            37    146
3             3     10
4             1      2
p-value: 0.1123


# Contingency table of hypertension against attendance.
chisqt = pd.crosstab(data['Hipertension'], data['show_up'], margins=False)
chisqt.head()


# Bar chart of attendance counts by hypertension status.
title = 'A bar chart showing Hipertension with respect to showing up for appointment'
chisqt.plot(kind='bar', title=title, figsize=(15, 5));


# Chi-square test of independence. The test is run once; element 1 of the
# result is the p-value.
print('p-value:', chi2_contingency(chisqt)[1].round(4))

p-value: 0.0


# Contingency table of diabetes against attendance.
chisqt = pd.crosstab(data['Diabetes'], data['show_up'], margins=False)
chisqt.head()


# Bar chart of attendance counts by diabetes status.
title = 'A bar chart showing Diabetes with respect to showing up for appointment'
chisqt.plot(kind='bar', title=title, figsize=(15, 5));


# Chi-square test of independence. The test is run once; element 1 of the
# result is the p-value.
print('p-value:', chi2_contingency(chisqt)[1].round(4))

p-value: 0.0


# Mean waiting time, rounded to whole days, for each attendance group.
wt = data['Waiting_time'].groupby(data['show_up']).mean().round()
wt

show_up
False    16.0
True      9.0
Name: Waiting_time, dtype: float64


# Horizontal bar chart of the mean waiting time per attendance group.
wt.plot.barh(title='Average waiting for showing up for appointment');


# Contingency table of scholarship against attendance.
chisqt = pd.crosstab(data['Scholarship'], data['show_up'], margins=False)
chisqt.head()


# Bar chart of attendance counts by scholarship status.

title = 'A bar chart showing scholarship with respect to showing up for appointment'
chisqt.plot(kind='bar', title=title, figsize=(20, 8));


# Chi-square test of independence. The test is run once; element 1 of the
# result is the p-value.
print('p-value:', chi2_contingency(chisqt)[1].round(4))

p-value: 0.0


# Patients for whom none of the indicator columns is set and whose Handicap
# level is below 1.
# NOTE(review): this subset also requires SMS_received to be False, which is
# broader than "no scholarship, ailments or alcoholism" - confirm intended.
flag_columns = ['Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'SMS_received']
no_flags_set = ~data[flag_columns].any(axis=1)
no_illment = data[no_flags_set & (data.Handicap < 1)]
no_illment.head()


# Contingency table of scholarship against attendance within the subset.
# NOTE(review): Scholarship is False for every row of this subset, so the
# table has a single row - confirm whether a different column was intended.
chisqt = pd.crosstab(index=no_illment['Scholarship'], columns=no_illment['show_up'])
chisqt.head()


# Attendance counts for the subset, then the same split as percentages.
subset_counts = no_illment['show_up'].value_counts()
print(subset_counts)
no_illment['show_up'].value_counts(normalize=True).mul(100).astype(str) + '%'

True     42394
False     8642
Name: show_up, dtype: int64

True      83.06685476918254%
False    16.933145230817463%
Name: show_up, dtype: object


# Horizontal bar chart of the subset's attendance counts, smallest first.
no_illment['show_up'].value_counts().sort_values().plot.barh()

<AxesSubplot:>


# Contingency table of SMS reminders against attendance.
chisqt = pd.crosstab(data['SMS_received'], data['show_up'], margins=False)
chisqt.head()


# Bar chart of attendance counts by whether an SMS was received. The title
# names the column actually plotted (SMS_received).

title = 'A bar chart showing SMS received with respect to showing up for appointment'
chisqt.plot(kind='bar', title=title, figsize=(15, 5));


# Chi-square test of independence. The test is run once; element 1 of the
# result is the p-value.
print('p-value:', chi2_contingency(chisqt)[1].round(4))

p-value: 0.0

	PatientId	AppointmentID	Gender	ScheduledDay	AppointmentDay	Age	Neighbourhood	Hipertension	Diabetes	No-show
0	2.987250e+13	5642903	F	2016-04-29T18:38:08Z	2016-04-29T00:00:00Z	62	JARDIM DA PENHA	1	0	No
1	5.589978e+14	5642503	M	2016-04-29T16:08:27Z	2016-04-29T00:00:00Z	56	JARDIM DA PENHA	0	0	No
2	4.262962e+12	5642549	F	2016-04-29T16:19:04Z	2016-04-29T00:00:00Z	62	MATA DA PRAIA	0	0	No
3	8.679512e+11	5642828	F	2016-04-29T17:29:31Z	2016-04-29T00:00:00Z	8	PONTAL DE CAMBURI	0	0	No
4	8.841186e+12	5642494	F	2016-04-29T16:07:23Z	2016-04-29T00:00:00Z	56	JARDIM DA PENHA	1	1	No
5	9.598513e+13	5626772	F	2016-04-27T08:36:51Z	2016-04-29T00:00:00Z	76	REPÚBLICA	1	0	No
6	7.336882e+14	5630279	F	2016-04-27T15:05:12Z	2016-04-29T00:00:00Z	23	GOIABEIRAS	0	0	Yes
7	3.449833e+12	5630575	F	2016-04-27T15:39:58Z	2016-04-29T00:00:00Z	39	GOIABEIRAS	0	0	Yes
8	5.639473e+13	5638447	F	2016-04-29T08:02:16Z	2016-04-29T00:00:00Z	21	ANDORINHAS	0	0	No
9	7.812456e+13	5629123	F	2016-04-27T12:48:25Z	2016-04-29T00:00:00Z	19	CONQUISTA	0	0	No

	ScheduledDay	AppointmentDay
0	2016-04-29	2016-04-29
1	2016-04-29	2016-04-29
2	2016-04-29	2016-04-29
3	2016-04-29	2016-04-29
4	2016-04-29	2016-04-29

	PatientId	AppointmentID	Age	Scholarship	Hipertension	Diabetes	Alcoholism	Handcap	SMS_received
count	1.105270e+05	1.105270e+05	110527.000000	110527.000000	110527.000000	110527.000000	110527.000000	110527.000000	110527.000000
mean	1.474963e+14	5.675305e+06	37.088874	0.098266	0.197246	0.071865	0.030400	0.022248	0.321026
std	2.560949e+14	7.129575e+04	23.110205	0.297675	0.397921	0.258265	0.171686	0.161543	0.466873
min	3.921784e+04	5.030230e+06	-1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	4.172614e+12	5.640286e+06	18.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	3.173184e+13	5.680573e+06	37.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
75%	9.439172e+13	5.725524e+06	55.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000
max	9.999816e+14	5.790484e+06	115.000000	1.000000	1.000000	1.000000	1.000000	4.000000	1.000000

	Scholarship	Hipertension	Diabetes	Alcoholism
0	False	True	False	False
1	False	False	False	False
2	False	False	False	False
3	False	False	False	False
4	False	True	True	False

	Scholarship	Hipertension	Diabetes	Alcoholism
count	110527	110527	110527	110527
unique	2	2	2	2
top	False	False	False	False
freq	99666	88726	102584	107167

	Gender	ScheduledDay	AppointmentDay	Age	Neighbourhood	Scholarship	Hipertension	Diabetes	Alcoholism	Handicap	SMS_received	Waiting_time	show_up
27033	M	2016-05-10	2016-05-09	38	RESISTÊNCIA	False	False	False	False	1	False	-1	False
55226	F	2016-05-18	2016-05-17	19	SANTO ANTÔNIO	False	False	False	False	1	False	-1	False
64175	F	2016-05-05	2016-05-04	22	CONSOLAÇÃO	False	False	False	False	0	False	-1	False
71533	F	2016-05-11	2016-05-05	81	SANTO ANTÔNIO	False	False	False	False	0	False	-6	False
72362	M	2016-05-04	2016-05-03	7	TABUAZEIRO	False	False	False	False	0	False	-1	False

show_up	False	True
Neighbourhood
AEROPORTO	1	7
ANDORINHAS	521	1741
ANTÔNIO HONÓRIO	50	221
ARIOVALDO FAVALESSA	62	220
BARRO VERMELHO	91	332

show_up	False	True
Gender
F	14591	57245
M	7723	30962

Medical Appointment No Shows Analysis

Table of Contents

Introduction¶

About Dataset¶

Dataset link: https://www.kaggle.com/joniarroba/noshowappointments¶

Context¶

Data Dictionary¶

Questions for Analysis¶

Importation of packages¶

Data Wrangling¶

General Properties¶

Observations¶

Data Cleaning: fixing column types, column names and invalid values¶

Working with the ScheduledDay and AppointmentDay¶

Converting scholarship, hipertension, diabetes, and alcoholism¶

Processing handicap and sms_received¶

Converting no_show into show_up¶

Remaining columns...¶

Column Data Types Conclusion¶

Fixing Column Values¶

Fixing age values¶

Confirming validity of gender¶

Confirming validity of neighbourhood¶

I would like to see the head and tail of our data table¶

End Data Wrangling¶

Exploratory Data Analysis¶

Data Overview¶

Research Question 1. Distribution of Patient's Age¶

Note: Pearson’s chi-square test of independence will be used to test each hypothesis.¶

Research Question 2: Is there a relationship between Gender and showing up for appointments?¶

Decision¶

Conclusion¶

Research Question 3: Is there a relationship between Neighbourhood and Showing up for Appointments?¶

Decision¶

Conclusion¶

Research Question 4: Is there a relationship between Alcoholism and Showing up for Appointments?¶

Decision¶

Conclusion¶

Research Question 5: Is there a relationship between individual ailments and showing up for appointments?¶

For Handicap¶

Decision¶

Conclusion¶

For Hipertension¶

Decision¶

Conclusion¶

Diabetes¶

Decision¶

Conclusion¶

Research Question 6: Does waiting time affect showing up for appointments?¶

Observation¶

Research Question 7: Is there a relationship between scholarship and showing up for appointments?¶

Decision¶

Conclusion¶

Research Question 8: Is there a relationship between the absence of scholarship, ailments and alcoholism and showing up for appointments?¶

Research Question 9: Is there a relationship between SMS Received and showing up for appointments?¶

Conclusions¶

Navigation¶

Dataset link: https://www.kaggle.com/joniarroba/noshowappointments ¶