# import the necessary libraries
import numpy as np
import pandas as pd
import os
# Visualisation libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
import pycountry
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
import chart_studio.plotly as py
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')
#py.init_notebook_mode(connected=True)
#Geographical Plotting
import folium
from folium import Choropleth, Circle, Marker
from folium import plugins
from folium.plugins import HeatMap, MarkerCluster
#Racing Bar Chart
import bar_chart_race as bcr
from IPython.display import HTML
# Default figure size and colour scheme for the matplotlib plots below
plt.rcParams['figure.figsize'] = (8, 5)
plt.style.use("fivethirtyeight")

# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))
# Silence library warnings so they do not clutter the output
import warnings
warnings.filterwarnings('ignore')
# Daily city-level measurements, read from the working directory
city_day = pd.read_csv('city_day.csv')
# The other files of the dataset (hourly city data, station list, daily and
# hourly station data) are not loaded:
#city_hour = pd.read_csv('../input/air-quality-data-in-india/city_hour.csv')
#station = pd.read_csv('../input/air-quality-data-in-india/stations.csv')
#station_day = pd.read_csv('../input/air-quality-data-in-india/station_day.csv')
#station_hour = pd.read_csv('../input/air-quality-data-in-india/station_hour.csv')
# City lookup table; it is merged on 'City' and read for 'Lat'/'Long' when the
# map is built further down
cities_db = pd.read_csv('Indian Cities Database.csv')
We have daily and hourly city data as well as daily and hourly station data. "Station" refers to the continuous pollution monitoring stations operated and maintained by the Central Pollution Control Board (CPCB) and the State Pollution Control Boards. Let's begin by analyzing the daily data of the various cities to get the big picture. The dataset and the libraries needed for the analysis have been imported above.
# Show a heading and the first rows of the daily city data, then print the
# frame summary produced by DataFrame.info()
display("CITY DAILY DATA")
display(city_day.head())
city_day.info()
It is always a good idea to look at the missing values as a percentage of the total number of values.
# Missing values
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints how many columns ``df`` has and how many of them contain missing
    values, then returns a table indexed by column name with two columns:
    'Missing Values' (count of nulls) and '% of Total Values' (nulls as a
    percentage of the row count, rounded to one decimal). Columns whose
    percentage is zero are left out and the rest are ordered from the highest
    percentage to the lowest.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # `keys` names the two columns directly instead of renaming 0 and 1 later
    report = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    has_missing = report.iloc[:, 1] != 0
    report = (
        report[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print(f"Your selected dataframe has {df.shape[1]} columns.\n"
          f"There are {report.shape[0]} columns that have missing values.")
    return report
# Columns of the daily city data that contain missing values
missing_values = missing_values_table(city_day)
missing_values.style.background_gradient(cmap='Greens')

# Number of rows recorded for each city
cities = city_day['City'].value_counts()
print(f'Total number of cities in the dataset : {len(cities)}')
print(cities.index)

# Parse the `Date` strings into datetime64 values
city_day['Date'] = pd.to_datetime(city_day['Date'])
first_day = city_day['Date'].min()
last_day = city_day['Date'].max()
print(f"The available data is between {first_day} and {last_day}")
Let's now work with the data to see what patterns and insights we can uncover from it.
# Combine benzene, toluene and xylene into a single BTX column. A plain `+`
# yields NaN for a row as soon as one of the three readings is missing.
city_day['BTX'] = city_day['Benzene'] + city_day['Toluene'] + city_day['Xylene']
# DataFrame.drop() returns a new frame; without the assignment the three
# source columns silently stay in city_day.
city_day = city_day.drop(columns=['Benzene', 'Toluene', 'Xylene'])
# Combined particulate matter (NaN when either reading is missing)
city_day['Particulate_Matter'] = city_day['PM2.5'] + city_day['PM10']
We will select a few prominent ones. Let's create a list called pollutants containing the names of the major pollutants responsible for air pollution.
# Column names of the pollutants of interest
pollutants = ['PM2.5','PM10','NO2', 'CO', 'SO2','O3', 'BTX']
def trend_plot(dataframe, value):
    """Plot the yearly trend and the monthly seasonality of one column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs a datetime `Date` column and a column named `value`. The helper
        columns `year` and `month` are added to this frame in place.
    value : str
        Name of the column to plot.

    The left panel is a box plot of `value` per year. The right panel is a
    point plot of `value` per month abbreviation that leaves out the years
    2015 and 2020.
    """
    # Work on the frame that was passed in instead of a module-level `df`
    dataframe['year'] = dataframe['Date'].dt.year
    dataframe['month'] = dataframe['Date'].dt.strftime('%b')

    fig, axes = plt.subplots(1, 2, figsize=(14, 6), dpi=80)
    sns.boxplot(x='year', y=value, data=dataframe, ax=axes[0])

    # NOTE(review): 2015 and 2020 are presumably excluded because they are
    # only partially covered by the data - confirm.
    seasonal = dataframe.loc[~dataframe['year'].isin([2015, 2020]), :]
    # Pin the plot to the right-hand axes rather than relying on whichever
    # axes pyplot currently treats as active.
    sns.pointplot(x='month', y=value, data=seasonal, ax=axes[1])

    axes[0].set_title('Year-wise Box Plot \n(The Trend)', fontsize=18)
    axes[1].set_title('Month-wise Plot \n(The Seasonality)', fontsize=18)
    plt.show()
# Move the current index into a regular column (with the default integer
# index this adds a column named `index`)
city_day.reset_index(inplace=True)
# Plot from a copy so that the helper columns written while plotting do not
# end up in city_day
df = city_day.copy()
for value in ('NO2', 'SO2', 'BTX', 'PM2.5', 'PM10'):
    trend_plot(df, value)
def max_polluted_city(pollutant):
    """Return a styled table of the ten cities with the highest mean `pollutant`.

    Reads the module-level `city_day` frame, averages the `pollutant` column
    per city, sorts the averages in descending order, rounds them to two
    decimals and returns the first ten rows as a Styler with a 'GnBu'
    background gradient.
    """
    city_means = city_day.groupby('City')[[pollutant]].mean()
    ranking = city_means.sort_values(by=pollutant, ascending=False).reset_index()
    ranking[pollutant] = ranking[pollutant].round(2)
    return ranking.head(10).style.background_gradient(cmap='GnBu')
#source: https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side
from IPython.display import display_html
def display_side_by_side(*args):
    """Display several pandas Styler objects next to each other.

    Every argument is converted to HTML and the fragments are concatenated.
    Each occurrence of the text 'table' in the result is then rewritten to
    carry an inline display style before the HTML is shown with
    `display_html(..., raw=True)`.
    """
    # Styler.render() was deprecated in pandas 1.4 and removed in 2.0;
    # Styler.to_html() is its replacement. render() is kept as a fallback
    # for pandas versions that predate to_html().
    html_str = ''.join(
        styled.to_html() if hasattr(styled, 'to_html') else styled.render()
        for styled in args
    )
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)
# One top-ten table per pollutant, shown next to each other
pm2_5, pm10, no2, so2, co, btx = (
    max_polluted_city(pollutant)
    for pollutant in ('PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'BTX')
)
display_side_by_side(pm2_5, pm10, no2, so2, co, btx)
from plotly.subplots import make_subplots

# Median PM2.5 and PM10 per city, each sorted from the highest value down
x2 = city_day[['PM2.5','City']].groupby(["City"]).median().sort_values(by='PM2.5',ascending=False).reset_index()
x3 = city_day[['PM10','City']].groupby(["City"]).median().sort_values(by='PM10',ascending=False).reset_index()

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("PM2.5","PM10"))
fig.add_trace(go.Bar(y=x2['PM2.5'], x=x2["City"],
                     marker=dict(color=x2['PM2.5'], coloraxis="coloraxis")),
              1, 1)
# The PM10 bars must take their labels from x3: x2 is sorted by PM2.5, so its
# city order does not line up with the PM10 values.
fig.add_trace(go.Bar(y=x3['PM10'], x=x3["City"],
                     marker=dict(color=x3['PM10'], coloraxis="coloraxis")),
              1, 2)
fig.update_layout(coloraxis=dict(colorscale='blues'), showlegend=False,plot_bgcolor='white')
fig.update_xaxes(ticks="outside", tickwidth=2,tickangle=45, tickcolor='crimson', ticklen=10,title_text="cities")
fig.update_yaxes(title_text="ug / m3", row=1, col=1)
fig.update_yaxes(title_text="ug / m3", row=1, col=2)
fig.show()
from plotly.subplots import make_subplots

# Median CO, NO2 and SO2 per city, each sorted from the highest value down
x4 = city_day[['CO','City']].groupby(["City"]).median().sort_values(by='CO',ascending=False).reset_index()
x5 = city_day[['NO2','City']].groupby(["City"]).median().sort_values(by='NO2',ascending=False).reset_index()
x6 = city_day[['SO2','City']].groupby(["City"]).median().sort_values(by='SO2',ascending=False).reset_index()

fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("CO","NO2",'SO2'))
fig.add_trace(go.Bar(y=x4['CO'], x=x4["City"],
                     marker=dict(color=x4['CO'], coloraxis="coloraxis")),
              1, 1)
fig.add_trace(go.Bar(y=x5['NO2'], x=x5["City"],
                     marker=dict(color=x5['NO2'], coloraxis="coloraxis")),
              1, 2)
# The SO2 bars must take their labels from x6: x5 is sorted by NO2, so its
# city order does not line up with the SO2 values.
fig.add_trace(go.Bar(y=x6['SO2'], x=x6["City"],
                     marker=dict(color=x6['SO2'], coloraxis="coloraxis")),
              1, 3)
fig.update_layout(coloraxis=dict(colorscale='blues'), showlegend=False,plot_bgcolor='white')
fig.update_xaxes(ticks="outside", tickwidth=2,tickangle=45, tickcolor='crimson', ticklen=10,title_text="cities")
# NOTE(review): all three panels are labelled "ug / m3"; confirm that CO is
# reported in the same unit as NO2 and SO2 in this dataset.
fig.update_yaxes(title_text="ug / m3", row=1, col=1)
fig.update_yaxes(title_text="ug / m3", row=1, col=2)
fig.update_yaxes(title_text="ug / m3", row=1, col=3)
fig.show()
Let's now see how the lockdown has affected the AQI levels in the prominent cities of India. For this we will consider only the data from 2019 onwards.
An air quality index (AQI) is used by government agencies to communicate to the public how polluted the air currently is or how polluted it is forecast to become. There are six AQI categories, namely Good, Satisfactory, Moderately polluted, Poor, Very Poor, and Severe. The proposed AQI will consider eight pollutants (PM10, PM2.5, NO2, SO2, CO, O3, NH3, and Pb) for which short-term (up to 24-hourly averaging period) National Ambient Air Quality Standards are prescribed. Based on the measured ambient concentrations, the corresponding standards and the likely health impact, a sub-index is calculated for each of these pollutants. The worst sub-index reflects the overall AQI. Likely health impacts for the different AQI categories and pollutants have also been suggested, with primary inputs from the medical experts in the group. The AQI values and corresponding ambient concentrations (health breakpoints), as well as the associated likely health impacts for the identified eight pollutants, are as follows (source: Wikipedia):
# Show the image file 'aqi cont.png' (loaded from the working directory) at
# 1200 x 1200; presumably the AQI breakpoint table - confirm against the file
from IPython.display import Image
Image(filename='aqi cont.png', width=1200, height=1200)
# Cities compared in the lockdown analysis (this rebinds the name `cities`)
cities = ['Ahmedabad','Delhi','Bengaluru','Mumbai','Hyderabad','Chennai']
# Keep the rows dated 2019-01-01 or later for the selected cities
filtered_city_day = city_day[city_day['Date'] >= '2019-01-01']
AQI = filtered_city_day[filtered_city_day.City.isin(cities)][['Date','City','AQI','AQI_Bucket']]
AQI.head()
# One row per date and one column per city. Gaps are back-filled within each
# city's column; fillna(method='bfill') is deprecated since pandas 2.1, and
# DataFrame.bfill() is the equivalent call.
AQI_pivot = AQI.pivot(index='Date', columns='City', values='AQI').bfill()
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One AQI bar chart per city, stacked vertically and sharing one colour axis
city_rows = ("Ahmedabad","Bengaluru","Chennai","Delhi",'Hyderabad','Mumbai')
fig = make_subplots(rows=6, cols=1, subplot_titles=city_rows)
for row, city_name in enumerate(city_rows, start=1):
    fig.add_trace(
        go.Bar(x=AQI_pivot.index, y=AQI_pivot[city_name],
               marker=dict(color=AQI_pivot[city_name], coloraxis="coloraxis")),
        row, 1)

fig.update_layout(coloraxis=dict(colorscale='Temps'),showlegend=False,title_text="AQI Levels")
fig.update_layout(plot_bgcolor='white')

# Vertical line at 2020-03-25 spanning the full height of the figure
lockdown_line = dict(
    type='line',
    yref='paper', y0=0, y1=1,
    xref='x', x0='2020-03-25', x1='2020-03-25',
)
fig.update_layout(width=800, height=1200, shapes=[lockdown_line])
fig.show()
#Source code for racing barchart: https://github.com/dexplo/bar_chart_race
# Keep only the rows dated after 2019-12-31 and animate them
after_2019 = AQI_pivot.index > '2019-12-31'
AQI_2020 = AQI_pivot[after_2019]
bcr_html = bcr.bar_chart_race(
    df=AQI_2020,
    filename=None,
    period_length=300,
    orientation='v',
    figsize=(6, 4),
    bar_label_size=7,
    tick_label_size=7,
    title='AQI levels in 2020',
)
bcr_html
# Mean AQI per city for the date ranges before and after 2020-03-25
AQI_beforeLockdown = AQI_pivot.loc['2020-01-01':'2020-03-25']
AQI_afterLockdown = AQI_pivot.loc['2020-03-26':'2020-05-01']
for window in (AQI_beforeLockdown, AQI_afterLockdown):
    print(window.mean())
from matplotlib.patches import Circle, Wedge, Rectangle
def degree_range(n):
    """Split the half circle from 0 to 180 degrees into `n` equal sectors.

    Returns a tuple ``(bounds, mid_points)``. `bounds` is an (n, 2) array with
    the start and end angle of every sector; `mid_points` holds the angle at
    the centre of each sector.
    """
    edges = np.linspace(0, 180, n + 1, endpoint=True)
    lower, upper = edges[:-1], edges[1:]
    centres = lower + ((upper - lower) / 2.)
    return np.column_stack((lower, upper)), centres
def rot_text(ang):
    """Return the text rotation, in degrees, for a label placed at angle `ang`.

    The rotation is ``ang - 90``, so a label at 90 degrees is not rotated.
    Accepts a scalar or an array-like of angles in degrees.
    """
    # The earlier degrees -> radians -> degrees round trip (with a `* pi / pi`
    # no-op in the middle) reduces to this subtraction.
    return np.subtract(ang, 90.0)
from matplotlib.patches import Circle, Wedge, Rectangle
def gauge(labels=None,
          colors='jet_r', arrow=1, title='', fname=False):
    """Draw a half-circle gauge with one coloured sector per label.

    Parameters
    ----------
    labels : sequence of str, optional
        Sector captions; the first one is drawn on the left. Defaults to
        ['GOOD', 'SATISFACTORY', 'MODERATE', 'POOR', 'VERY POOR', 'EXTREME'].
    colors : str or list
        Either the name of a matplotlib colormap, which is sampled into one
        colour per label, or a list with exactly one colour per label in
        label order.
    arrow : int
        1-based position of the label the needle points at.
    title : str
        Text written in the box below the dial.
    fname : str or False
        When truthy, the figure is also saved to this path at 200 dpi.

    Raises
    ------
    ValueError
        If `arrow` is outside ``1..len(labels)`` or a colour list does not
        have one entry per label.
    """
    if labels is None:
        labels = ['GOOD','SATISFACTORY','MODERATE','POOR','VERY POOR','EXTREME']
    N = len(labels)

    # Reject positions below 1 as well; they used to surface as an IndexError
    # when the needle angle was looked up.
    if not 1 <= arrow <= N:
        raise ValueError(
            "The category ({}) must be between 1 and the number of "
            "labels ({})".format(arrow, N))

    if isinstance(colors, str):
        # `cm` is never imported in this file; pyplot exposes the same lookup.
        cmap = plt.get_cmap(colors, N)
        # Sampled colours are applied in colormap order starting from the
        # right-hand sector (the earlier code reversed them twice, which has
        # the same net effect).
        colors = cmap(np.arange(N)).tolist()
    elif isinstance(colors, list):
        if len(colors) != N:
            raise ValueError(
                "number of colors {} not equal to number of "
                "categories {}".format(len(colors), N))
        # Sectors are drawn from right to left, so reverse to label order
        colors = colors[::-1]

    #begin the plotting
    fig, ax = plt.subplots()
    ang_range, mid_points = degree_range(N)
    labels = labels[::-1]

    #arc: a white sector with a coloured outer ring on top of it
    for ang, c in zip(ang_range, colors):
        ax.add_patch(Wedge((0., 0.), .4, *ang, facecolor='w', lw=2))
        ax.add_patch(Wedge((0., 0.), .4, *ang, width=0.10, facecolor=c, lw=2, alpha=0.5))

    #Labels
    for mid, lab in zip(mid_points, labels):
        ax.text(0.35 * np.cos(np.radians(mid)), 0.35 * np.sin(np.radians(mid)), lab,
                horizontalalignment='center', verticalalignment='center', fontsize=14,
                fontweight='bold', rotation=rot_text(mid))

    # Title box below the dial
    r = Rectangle((-0.4, -0.1), 0.8, 0.1, facecolor='w', lw=2)
    ax.add_patch(r)
    ax.text(0, -0.05, title, horizontalalignment='center',
            verticalalignment='center', fontsize=22, fontweight='bold')

    #arrow: position 1 maps to the last (left-most) sector midpoint
    pos = mid_points[abs(arrow - N)]
    ax.arrow(0, 0, 0.225 * np.cos(np.radians(pos)), 0.225 * np.sin(np.radians(pos)),
             width=0.04, head_width=0.09, head_length=0.1, fc='k', ec='k')
    ax.add_patch(Circle((0, 0), radius=0.02, facecolor='k'))
    ax.add_patch(Circle((0, 0), radius=0.01, facecolor='w', zorder=11))

    ax.set_frame_on(False)
    ax.axes.set_xticks([])
    ax.axes.set_yticks([])
    ax.axis('equal')
    plt.tight_layout()
    if fname:
        fig.savefig(fname, dpi=200)
# Captions and colours shared by every gauge
aqi_labels = ['Good','Satisfactory','Moderate','Poor','Very Poor','Extreme']
aqi_colors = ['#007A00','#90EE90','#ffff00','#FF9900','#ff0000','#CC0000']

# (heading, [(needle position, gauge title), ...]) for each city.
# NOTE(review): the needle positions are hard-coded; presumably read off the
# before/after mean AQI values - confirm.
gauge_plan = [
    ("Ahmedabad's AQI levels", [(5, 'Ahmedabad AQI before Lockdown'),
                                (3, 'Ahmedabad AQI After Lockdown')]),
    ("Delhi's AQI levels", [(4, "Delhi's AQI before Lockdown"),
                            (3, "Delhi's AQI after Lockdown")]),
    ("Mumbai's AQI levels", [(3, "Mumbai's AQI before Lockdown"),
                             (3, "Mumbai's AQI after Lockdown")]),
]
for heading, needles in gauge_plan:
    display(heading)
    for bucket, caption in needles:
        gauge(labels=aqi_labels, colors=aqi_colors, arrow=bucket, title=caption)
# January-May windows of 2019 and 2020, indexed by date.
# The windows are built without inplace operations: fillna(inplace=True) on a
# .loc selection triggers SettingWithCopyWarning, and fillna(method='bfill')
# is deprecated since pandas 2.1 in favour of DataFrame.bfill().
# NOTE(review): the back-fill runs over the rows of all cities together; if
# the rows are grouped by city, a gap at the end of one city's rows is filled
# from the next city - confirm whether a per-city fill is wanted.
start_date1 = '2019-01-01'
end_date1 = '2019-05-01'
mask1 = (city_day['Date'] >= start_date1) & (city_day['Date'] <= end_date1)
pollutants_filtered_2019 = city_day.loc[mask1].bfill().set_index('Date')

start_date2 = '2020-01-01'
end_date2 = '2020-05-01'
mask2 = (city_day['Date'] >= start_date2) & (city_day['Date'] <= end_date2)
pollutants_filtered_2020 = city_day.loc[mask2].bfill().set_index('Date')

# Pollutant columns of the selected cities for each window
df1 = pollutants_filtered_2019[pollutants_filtered_2019.City.isin(cities)][['City','NO2','SO2','PM2.5','CO']]
df2 = pollutants_filtered_2020[pollutants_filtered_2020.City.isin(cities)][['City','NO2','SO2','PM2.5','CO']]
def _pollutant_figure(frame, city, title):
    """Build the NO2 / SO2 / PM2.5 / CO line figure for one city of `frame`."""
    # Filter once and take x from the same rows as y; using the index of the
    # whole frame pairs one city's values with dates of every city.
    city_rows = frame[frame['City'] == city]
    traces = (
        ('NO2', 'NO2', dict(dash='solid', color='green'), {}),
        ('SO2', 'SO2', dict(dash='dot', color='red'), {}),
        ('PM2.5', 'Particulate_Matter', dict(dash='dashdot', color='dodgerblue'), {}),
        ('CO', 'CO', dict(dash='longdashdot'), {'mode': 'lines'}),
    )
    fig = go.Figure()
    for column, label, line, extra in traces:
        fig.add_trace(go.Scatter(x=city_rows.index, y=city_rows[column],
                                 line=line, name=label, **extra))
    fig.update_layout(title_text=title, plot_bgcolor='white')
    fig.update_xaxes(rangeslider_visible=True, zeroline=True, zerolinewidth=1,
                     zerolinecolor='Black')
    return fig


def pollution_comparison(city):
    """Show the pollutant levels of `city` for the 2019 and the 2020 window.

    Reads the module-level frames `df1` (2019) and `df2` (2020) and shows one
    figure per frame, each with the NO2, SO2, PM2.5 and CO series of `city`
    and a range slider on the x axis.
    """
    for frame, year_label in ((df1, ' 2019 '), (df2, ' 2020 ')):
        _pollutant_figure(frame, city, city + year_label).show()
# Show the 2019 / 2020 comparison for each selected city
for city_name in ('Ahmedabad', 'Bengaluru', 'Chennai', 'Delhi', 'Mumbai', 'Hyderabad'):
    pollution_comparison(city_name)
# Mean AQI per city for each window
df11 = pollutants_filtered_2019[['City','AQI']]
df22 = pollutants_filtered_2020[['City','AQI']]
df_2019_coord = df11.groupby('City')['AQI'].mean().reset_index()
df_2020_coord = df22.groupby('City')['AQI'].mean().reset_index()

# Attach the city lookup columns and round the mean AQI to whole numbers
df_2019_AQI = df_2019_coord.merge(cities_db, on='City')
df_2019_AQI['AQI'] = df_2019_AQI['AQI'].round(0)
df_2020_AQI = df_2020_coord.merge(cities_db, on='City')
df_2020_AQI['AQI'] = df_2020_AQI['AQI'].round(0)
# Two synchronised maps: 2019 markers on the left, 2020 markers on the right
m = plugins.DualMap(location=(22.9734, 78.6569), tiles=None, zoom_start=5)
folium.TileLayer('openstreetmap').add_to(m)
fg_1 = folium.FeatureGroup(name='2019').add_to(m.m1)
fg_2 = folium.FeatureGroup(name='2020').add_to(m.m2)


def _add_aqi_markers(frame, group, fill_colour):
    """Add one circle marker per row of `frame` to the feature group `group`.

    `frame` needs the columns 'Lat', 'Long', 'AQI' and 'City'; the popup shows
    the capitalised city name and the AQI value.
    """
    for lat, lon, value, name in zip(frame['Lat'], frame['Long'], frame['AQI'], frame['City']):
        popup_html = ('<strong>City</strong>: ' + str(name).capitalize() + '<br>'
                      '<strong>AQI(Average)</strong>: ' + str(value) + '<br>')
        # `icon=` is not passed here: it is a Marker argument and not a
        # documented CircleMarker option.
        folium.CircleMarker([lat, lon],
                            radius=10,
                            popup=popup_html,
                            fill_color=fill_colour,
                            fill_opacity=0.7).add_to(group)


_add_aqi_markers(df_2019_AQI, fg_1, 'red')
_add_aqi_markers(df_2020_AQI, fg_2, 'orange')
folium.LayerControl(collapsed=False).add_to(m)
m
We can see a slight drop in the concentration of pollutants such as SO2, NO2 and CO, as well as of particulate matter (PM), because of the lockdown.
Because of the lower concentration of air pollutants, there were fewer cases of asthma and other air-pollution-related problems in 2020 than in previous years. So the lockdown was a blessing in disguise for the Air Quality Index.