Author: Sultan Albogami
Last Updated: 3/31/2020
Description: An initial investigation of state-level and county-level COVID-19 data, aiming to discover patterns, spot anomalies, test hypotheses, and check assumptions with the help of summary statistics and graphical representations.
Importing Libraries
import os
# !pip install numpy, run only for the first time.
import numpy as np
# !pip install pandas
import pandas as pd
# !pip install matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from matplotlib import style
style.use('ggplot')
Reading Data
# Switch to the notebook home directory so the relative data path below resolves.
os.chdir(r"/home/jovyan/")
# Load the state-level CSV into the working frame.
df = pd.read_csv(filepath_or_buffer=r'util/data/us-states-03-30-20.csv')
# Preview the first rows, then report the (rows, columns) dimensions.
df.head()
df.shape
# The frame holds one row per state per date, so plotting it directly draws a
# single line that jumps between states on every date. Add the states together
# per date first so each date contributes exactly one point per series.
# Grouping sorts the date keys; the dates are compared as strings elsewhere in
# this file ('2020-03-30'), so the sorted order is presumably chronological.
daily_totals = df.groupby('date')[['cases', 'deaths']].sum().reset_index()

# Draw both series on the same axes.
ax = plt.gca()
daily_totals.plot(kind='line', x='date', y=['cases', 'deaths'], figsize=(12, 8), ax=ax)
plt.ylabel('Count')
plt.title('Increase of cases and deaths over time')
plt.show()
# Sum the cases and deaths over every row belonging to each state.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names (['cases', 'deaths']) is deprecated and rejected by newer pandas.
# NOTE(review): other cells in this file read a single date's rows as the totals
# "as of" that date; if the rows are running totals, summing over all dates
# overstates the counts -- confirm the intended aggregation.
latest_sum = df.groupby('state')[['cases', 'deaths']].sum()
# Sort in descending order of cases, using deaths to break ties.
latest_sum = latest_sum.sort_values(by=['cases', 'deaths'], ascending=False)
latest_sum.head(10)
# Plot the ten leading states as grouped bars (one bar per column).
latest_sum.head(10).plot(kind='bar', figsize=(10, 6))
plt.ylabel('Count')
plt.title('Top 10 states with the most number of cases and deaths as of 03-30-2020')
plt.show()
Total Number of Cases and Deaths as of 2020-03-30
# Add up cases and deaths across all states for each date.
# The columns are selected with a list because tuple-style selection on a
# GroupBy (['cases', 'deaths']) is deprecated and rejected by newer pandas.
latest_total = df.groupby('date')[['cases', 'deaths']].sum().reset_index()
# Keep only the row for the most recent date present in the data.
latest_total = latest_total[latest_total['date'] == latest_total['date'].max()].reset_index(drop=True)
latest_total
# Keep only the rows dated 2020-03-30, restricted to the columns used below.
# Copy the selection so the new column is written to an independent frame and
# pandas has no reason to warn about assigning to a slice of df.
present_stats = df.loc[df['date'] == '2020-03-30', ['date', 'state', 'cases', 'deaths']].copy()
# Deaths expressed as a percentage of cases for each state.
present_stats['death percentage'] = (present_stats['deaths'] / present_stats['cases']) * 100
# Sort in descending order: cases first, then deaths, then death percentage.
present_stats = present_stats.sort_values(by=['cases', 'deaths', 'death percentage'], ascending=False)
present_stats.head(10)
# Plot the death percentage of the first ten rows.
# NOTE(review): the rows are ranked by case count, so this chart shows the ten
# states with the most cases rather than the ten highest percentages; confirm
# whether the title or the sort key reflects the intent.
present_stats.head(10).plot(kind='bar', x='state', y='death percentage', figsize=(10, 6))
# Set the plot title (the duplicated word "death" has been removed).
plt.title('Top 10 states with the highest death percentage as of 03-30-2020')
# Render the figure explicitly, as the other matplotlib cells do.
plt.show()
# !pip install plotly
# !conda install psutil --yes
import plotly.express as px
# Stacked bar chart of reported cases per date, one colour per state.
# `labels` is keyed by column name; the previous key 'y' matched none of the
# columns passed here, so the mapping never applied. It is now keyed by the
# plotted column.
fig = px.bar(
    df,
    x='date',
    y='cases',
    color='state',
    labels={'cases': 'cases'},
    hover_data=['state'],
    title='Evolution of Reported COVID-19 Cases in the United States',
)
fig.show()
# Stacked bar chart of reported deaths per date, one colour per state.
# `labels` is keyed by column name; the previous {'y': 'cases'} matched none of
# the columns passed here and named the wrong measure for this chart. It is now
# keyed by the plotted column.
fig = px.bar(
    df,
    x='date',
    y='deaths',
    color='state',
    labels={'deaths': 'deaths'},
    hover_data=['state'],
    title='Evolution of Reported COVID-19 Deaths in the United States',
)
fig.show()
# Treemap of COVID-19 cases, nested by state and then by date.
# Rows are ordered from most to fewest cases and re-indexed before plotting.
cases_descending = df.sort_values(by='cases', ascending=False).reset_index(drop=True)
fig = px.treemap(
    cases_descending,
    path=["state", "date"],
    values="cases",
    height=700,
    title='Number of COVID-19 Cases by State and Date',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
# Show the label, text and value on each tile of the first trace.
fig.data[0].textinfo = 'label+text+value'
fig.show()
# Treemap of COVID-19 deaths, nested by state and then by date.
# Rows are ordered from most to fewest deaths and re-indexed before plotting.
deaths_descending = df.sort_values(by='deaths', ascending=False).reset_index(drop=True)
fig = px.treemap(
    deaths_descending,
    path=["state", "date"],
    values="deaths",
    height=700,
    title='Number of deaths from COVID-19 by State and Date',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
# Show the label, text and value on each tile of the first trace.
fig.data[0].textinfo = 'label+text+value'
fig.show()
# Replace the working frame with the county-level CSV (path relative to the
# current working directory).
df = pd.read_csv(filepath_or_buffer='util/data/us-counties-03-30-20.csv')
# Preview the first rows, then report the (rows, columns) dimensions.
df.head()
df.shape
# Sum the cases and deaths over every row belonging to each county name.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# names (['cases', 'deaths']) is deprecated and rejected by newer pandas.
# NOTE(review): grouping on the county name alone merges any counties that
# share a name; if the file also identifies the state, confirm whether it
# should be part of the key.
latest_sum = df.groupby('county')[['cases', 'deaths']].sum()
# Sort in descending order of cases, using deaths to break ties.
latest_sum = latest_sum.sort_values(by=['cases', 'deaths'], ascending=False)
latest_sum.head(10)
# Plot the ten leading counties as grouped bars (one bar per column).
latest_sum.head(10).plot(kind='bar', figsize=(10, 6))
plt.ylabel('Count')
plt.title('Top 10 counties with the most number of cases and deaths as of 03-30-2020')
plt.show()
# Stacked bar chart of reported cases per date, one colour per county.
# `labels` is keyed by column name; the previous key 'y' matched none of the
# columns passed here, so the mapping never applied. It is now keyed by the
# plotted column.
fig = px.bar(
    df,
    x='date',
    y='cases',
    color='county',
    labels={'cases': 'cases'},
    hover_data=['county'],
    title='Evolution of Reported COVID-19 Cases in the United States Counties',
)
fig.show()
# Stacked bar chart of reported deaths per date, one colour per county.
# `labels` is keyed by column name; the previous {'y': 'cases'} matched none of
# the columns passed here and named the wrong measure for this chart. It is now
# keyed by the plotted column.
fig = px.bar(
    df,
    x='date',
    y='deaths',
    color='county',
    labels={'deaths': 'deaths'},
    hover_data=['county'],
    title='Evolution of Reported COVID-19 Deaths in the United States Counties',
)
fig.show()
# Tree Map Visualization of COVID-19 Cases by County and Date
# This chart previously sized its tiles by "deaths" and carried the deaths
# title, making it a duplicate of the deaths treemap. It now sizes tiles by
# "cases" and is titled accordingly, matching the row ordering used here.
fig = px.treemap(
    df.sort_values(by='cases', ascending=False).reset_index(drop=True),
    path=["county", "date"],
    values="cases",
    height=700,
    title='Number of COVID-19 Cases by County and Date',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
# Show the label, text and value on each tile of the first trace.
fig.data[0].textinfo = 'label+text+value'
fig.show()
# Treemap of COVID-19 deaths, nested by county and then by date.
# Rows are ordered from most to fewest deaths and re-indexed before plotting.
deaths_descending = df.sort_values(by='deaths', ascending=False).reset_index(drop=True)
fig = px.treemap(
    deaths_descending,
    path=["county", "date"],
    values="deaths",
    height=700,
    title='Number of deaths from COVID-19 by County and Date',
    color_discrete_sequence=px.colors.qualitative.Prism,
)
# Show the label, text and value on each tile of the first trace.
fig.data[0].textinfo = 'label+text+value'
fig.show()