import numpy as nb
import pandas as pd
# Load the raw earthquake catalogue and preview the first rows.
df1 = pd.read_csv('earthquakes-dataset.csv')
df1.head()

# Demo: pandas can parse a free-form, human-written date string.
date_demo = '21st of July 2000'
date_demo
date_demo_datetime = pd.to_datetime(date_demo)
date_demo_datetime

# Join the Date and Time text columns and parse the result into a single
# Datetime column; errors='coerce' turns unparseable rows into NaT.
combined_text = df1['Date'] + ' ' + df1['Time']
df1['Datetime'] = pd.to_datetime(combined_text, errors='coerce')
df1.head()

# Inspect the first parsed timestamp, then format it with strftime codes.
first_stamp = df1['Datetime'][0]
first_stamp
first_stamp.strftime('%A or %a and in %B')

# Use the parsed timestamps as the row index.
df1 = df1.set_index('Datetime')
df1.head()

# Compare the dtypes before and after casting Depth to integers.
df1.dtypes
df1['Depth'] = df1['Depth'].astype(int)
df1.dtypes
Using pandas, read the CSV file again, combine the Date and Time columns into a single Date_Time column, and make it the index.
# Re-load the data and use the combined Date + Time text as the index, named
# 'Date_Time'.  The two columns are combined explicitly instead of through a
# nested-list `parse_dates=[['Date', 'Time']]`, because that column-combining
# form of read_csv is deprecated in recent pandas releases.
df1 = pd.read_csv('earthquakes-dataset.csv')
# pop() removes each source column from the frame while returning its values.
date_time_text = df1.pop('Date') + ' ' + df1.pop('Time')
df1.index = pd.Index(date_time_text, name='Date_Time')
df1.head()
df1.index
# Parse the index text into timestamps; malformed entries become NaT instead
# of raising.
df1.index = pd.to_datetime(df1.index, errors='coerce')
df1.index
df1.info()
Drop any column (axis=1) that has null values
# Keep only the columns that are completely populated.
df1 = df1.dropna(axis=1)
df1.info()

# Narrow the frame down to the columns used in the analysis.
analysis_columns = ['Latitude', 'Longitude', 'Type', 'Depth', 'Magnitude']
df2 = df1[analysis_columns]
df2.sample(5)
df2.Type.unique()

# Extremes of magnitude and depth among rows typed 'Earthquake'.
quake_rows = df2[df2.Type == 'Earthquake']
quake_rows.Magnitude.max()
quake_rows.Depth.max()
quake_rows.Depth.min()
df2.Type.value_counts()

# Daily mean and three-month standard deviation of earthquake magnitude.
quake_magnitude = df2['Magnitude'][df2.Type == 'Earthquake']
quake_magnitude.resample('1D').mean()
quake_magnitude.resample('3M').std()

# Look up the rows holding specific magnitude and depth values.
df2.loc[df2['Magnitude'] == 9.1]
df2.loc[df2['Depth'] == -1.1]

# Count events typed 'Nuclear Explosion' per calendar year, in year order.
event_years = df2.index.year
event_years[df2.Type == 'Nuclear Explosion'].value_counts().sort_index()
import matplotlib as mpl
import matplotlib.pyplot as plt
Enable static images of plots inside the notebook
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Histogram of all magnitudes with matplotlib's default binning.
plt.hist(df2['Magnitude'])
# The same histogram, now with axis labels and a title.
plt.hist(df2['Magnitude'])
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')
# Use 20 bins and outline each bar in black.
plt.hist(df2['Magnitude'], bins=20, edgecolor='black')
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')
# Pass explicit bin edges: steps of 0.1 from magnitude 6 up to 7.
plt.hist(df2['Magnitude'], bins=[6,6.1,6.2,6.3,6.4,6.5,6.6,6.7,6.8,6.9,7], edgecolor='black')
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')
Wrap the histogram code in a reusable function.
def myplot(bins):
    """Draw a labelled histogram of earthquake magnitudes.

    Reads the module-level ``df2`` frame and plots its 'Magnitude' column
    with pyplot, then sets the axis labels and the title.

    Args:
        bins: Forwarded to ``plt.hist`` as its ``bins`` argument (for
            example a bin count or a sequence of bin edges).
    """
    # Use the caller-supplied binning rather than a fixed bin count.
    plt.hist(df2['Magnitude'], bins=bins, edgecolor='black')
    plt.xlabel('Magnitude')
    plt.ylabel('Number of Earthquakes')
    plt.title('1965-2016 Earthquakes')


myplot(20)
Press Shift+Tab repeatedly to bring up the documentation for a function.
# Zoom the histogram in on the strongest events: 20 bins across 8 to 9.1.
plt.hist(df2['Magnitude'], bins=20, edgecolor='black', range=[8,9.1])
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')
# Yearly maximum earthquake magnitude.  The mask is built from df2 itself so
# the filter always shares the index of the series it is applied to.
plt.plot(df2['Magnitude'][df2.Type=='Earthquake'].resample('1Y').max())
Investigate the shape mismatch error by checking the shape of each part
# Compare the number of distinct years with the number of per-year
# earthquake counts, then look for null years in the index.
index_years = df2.index.year
index_years.unique().shape
index_years[df2.Type == 'Earthquake'].value_counts().sort_index().shape
year_is_null = index_years.isnull()
year_is_null.sum()
df2.loc[year_is_null]
The nulls are causing the mismatch, so drop them.
# Move the index into an ordinary column so that dropna() also removes rows
# whose timestamp is null, then restore 'Date_Time' as the index.
flattened = df2.reset_index()
flattened = flattened.dropna()
df2 = flattened.set_index('Date_Time')
Now try the plot again.
# Count events per year once for each event type, and take both the x
# positions and the heights from that single Series so they always line up
# (same length, same year order).
quakes_per_year = df2.index.year[df2.Type=='Earthquake'].value_counts().sort_index()
nukes_per_year = df2.index.year[df2.Type=='Nuclear Explosion'].value_counts().sort_index()

plt.bar(quakes_per_year.index, quakes_per_year)
plt.bar(nukes_per_year.index, nukes_per_year)
plt.title('Nukes per year')
plt.scatter(quakes_per_year.index, quakes_per_year)
# Earthquakes per year, one marker per year.  Marker sizes must have one
# value per plotted point, so each year is sized by the mean depth of that
# year's earthquakes rather than by the per-event Depth column.
quake_rows = df2[df2.Type == 'Earthquake']
quakes_per_year = quake_rows.index.year.value_counts().sort_index()
mean_depth_per_year = quake_rows['Depth'].groupby(quake_rows.index.year).mean()
plt.scatter(quakes_per_year.index, quakes_per_year,
            s=mean_depth_per_year, c='green', alpha=0.7)

# Magnitude against depth for every event.
plt.scatter(df2['Magnitude'], df2['Depth'])

# Map view: position by longitude/latitude, marker size scaled from
# magnitude, marker colour taken from depth.
plt.figure(figsize=(19,10))
plt.scatter(df2['Longitude'], df2['Latitude'], s=df2['Magnitude']*10, c=df2['Depth'])
# Earthquakes per year, one marker per year.  Marker sizes must have one
# value per plotted point, so each year is sized by the mean depth of that
# year's earthquakes rather than by the per-event Depth column.
quake_rows = df2[df2.Type == 'Earthquake']
quakes_per_year = quake_rows.index.year.value_counts().sort_index()
mean_depth_per_year = quake_rows['Depth'].groupby(quake_rows.index.year).mean()
plt.scatter(quakes_per_year.index, quakes_per_year,
            s=mean_depth_per_year, c='green', alpha=0.7)

# Empty scatters draw nothing but register legend entries that show three
# reference marker sizes.
for area in [10, 50, 100]:
    plt.scatter([], [], c='k', s=area, label=str(area))
plt.legend(scatterpoints=1, title='I am legend', loc='lower center')