import numpy as nb
import pandas as pd
df1 = pd.read_csv('earthquakes-dataset.csv')
df1.head()
Date Time Latitude Longitude Type Depth Depth Error Depth Seismic Stations Magnitude Magnitude Type ... Magnitude Seismic Stations Azimuthal Gap Horizontal Distance Horizontal Error Root Mean Square ID Source Location Source Magnitude Source Status
0 01/02/1965 13:44:18 19.246 145.616 Earthquake 131.6 NaN NaN 6.0 MW ... NaN NaN NaN NaN NaN ISCGEM860706 ISCGEM ISCGEM ISCGEM Automatic
1 01/04/1965 11:29:49 1.863 127.352 Earthquake 80.0 NaN NaN 5.8 MW ... NaN NaN NaN NaN NaN ISCGEM860737 ISCGEM ISCGEM ISCGEM Automatic
2 01/05/1965 18:05:58 -20.579 -173.972 Earthquake 20.0 NaN NaN 6.2 MW ... NaN NaN NaN NaN NaN ISCGEM860762 ISCGEM ISCGEM ISCGEM Automatic
3 01/08/1965 18:49:43 -59.076 -23.557 Earthquake 15.0 NaN NaN 5.8 MW ... NaN NaN NaN NaN NaN ISCGEM860856 ISCGEM ISCGEM ISCGEM Automatic
4 01/09/1965 13:32:50 11.938 126.427 Earthquake 15.0 NaN NaN 5.8 MW ... NaN NaN NaN NaN NaN ISCGEM860890 ISCGEM ISCGEM ISCGEM Automatic

5 rows × 21 columns

date_demo = '21st of July 2000'
date_demo
'21st of July 2000'
date_demo_datetime = pd.to_datetime(date_demo)
date_demo_datetime
Timestamp('2000-07-21 00:00:00')
# Join the Date and Time strings with a space and parse the result into a
# single timestamp column. errors='coerce' turns values that cannot be parsed
# into NaT instead of raising.
df1['Datetime'] = pd.to_datetime(df1['Date'] + ' ' + df1['Time'], errors = 'coerce')
df1.head()
Date Time Latitude Longitude Type Depth Depth Error Depth Seismic Stations Magnitude Magnitude Type ... Azimuthal Gap Horizontal Distance Horizontal Error Root Mean Square ID Source Location Source Magnitude Source Status Datetime
0 01/02/1965 13:44:18 19.246 145.616 Earthquake 131.6 NaN NaN 6.0 MW ... NaN NaN NaN NaN ISCGEM860706 ISCGEM ISCGEM ISCGEM Automatic 1965-01-02 13:44:18
1 01/04/1965 11:29:49 1.863 127.352 Earthquake 80.0 NaN NaN 5.8 MW ... NaN NaN NaN NaN ISCGEM860737 ISCGEM ISCGEM ISCGEM Automatic 1965-01-04 11:29:49
2 01/05/1965 18:05:58 -20.579 -173.972 Earthquake 20.0 NaN NaN 6.2 MW ... NaN NaN NaN NaN ISCGEM860762 ISCGEM ISCGEM ISCGEM Automatic 1965-01-05 18:05:58
3 01/08/1965 18:49:43 -59.076 -23.557 Earthquake 15.0 NaN NaN 5.8 MW ... NaN NaN NaN NaN ISCGEM860856 ISCGEM ISCGEM ISCGEM Automatic 1965-01-08 18:49:43
4 01/09/1965 13:32:50 11.938 126.427 Earthquake 15.0 NaN NaN 5.8 MW ... NaN NaN NaN NaN ISCGEM860890 ISCGEM ISCGEM ISCGEM Automatic 1965-01-09 13:32:50

5 rows × 22 columns

df1['Datetime'][0]
Timestamp('1965-01-02 13:44:18')
df1['Datetime'][0].strftime('%A or %a and in %B')
'Saturday or Sat and in January'
df1 = df1.set_index(['Datetime'])
df1.head()
Date Time Latitude Longitude Type Depth Depth Error Depth Seismic Stations Magnitude Magnitude Type ... Magnitude Seismic Stations Azimuthal Gap Horizontal Distance Horizontal Error Root Mean Square ID Source Location Source Magnitude Source Status
Datetime
1965-01-02 13:44:18 01/02/1965 13:44:18 19.246 145.616 Earthquake 131.6 NaN NaN 6.0 MW ... NaN NaN NaN NaN NaN ISCGEM860706 ISCGEM ISCGEM ISCGEM Automatic
1965-01-04 11:29:49 01/04/1965 11:29:49 1.863 127.352 Earthquake 80.0 NaN NaN 5.8 MW ... NaN NaN NaN NaN NaN ISCGEM860737 ISCGEM ISCGEM ISCGEM Automatic
1965-01-05 18:05:58 01/05/1965 18:05:58 -20.579 -173.972 Earthquake 20.0 NaN NaN 6.2 MW ... NaN NaN NaN NaN NaN ISCGEM860762 ISCGEM ISCGEM ISCGEM Automatic
1965-01-08 18:49:43 01/08/1965 18:49:43 -59.076 -23.557 Earthquake 15.0 NaN NaN 5.8 MW ... NaN NaN NaN NaN NaN ISCGEM860856 ISCGEM ISCGEM ISCGEM Automatic
1965-01-09 13:32:50 01/09/1965 13:32:50 11.938 126.427 Earthquake 15.0 NaN NaN 5.8 MW ... NaN NaN NaN NaN NaN ISCGEM860890 ISCGEM ISCGEM ISCGEM Automatic

5 rows × 21 columns

df1.dtypes
Date                           object
Time                           object
Latitude                      float64
Longitude                     float64
Type                           object
Depth                         float64
Depth Error                   float64
Depth Seismic Stations        float64
Magnitude                     float64
Magnitude Type                 object
Magnitude Error               float64
Magnitude Seismic Stations    float64
Azimuthal Gap                 float64
Horizontal Distance           float64
Horizontal Error              float64
Root Mean Square              float64
ID                             object
Source                         object
Location Source                object
Magnitude Source               object
Status                         object
dtype: object
# Cast Depth from float64 to an integer dtype (the dtypes listing that follows
# shows int64).
# NOTE(review): astype(int) discards the fractional part, e.g. the 131.6 depth
# in the first row would become 131 — confirm that loss of precision is wanted.
df1['Depth'] = df1['Depth'].astype(int)
df1.dtypes
Date                           object
Time                           object
Latitude                      float64
Longitude                     float64
Type                           object
Depth                           int64
Depth Error                   float64
Depth Seismic Stations        float64
Magnitude                     float64
Magnitude Type                 object
Magnitude Error               float64
Magnitude Seismic Stations    float64
Azimuthal Gap                 float64
Horizontal Distance           float64
Horizontal Error              float64
Root Mean Square              float64
ID                             object
Source                         object
Location Source                object
Magnitude Source               object
Status                         object
dtype: object

Using pandas, re-read the CSV, combine the Date and Time columns into a single parsed Date_Time column, and make that column the index.

# Re-read the file, asking read_csv to merge Date and Time into one Date_Time
# column (nested-list form of parse_dates) and to use it as the index.
# The index still comes back as plain strings (dtype=object in the df1.index
# output below), so it is converted explicitly with pd.to_datetime afterwards.
# NOTE(review): the nested-list form of parse_dates is deprecated in recent
# pandas releases — confirm against the installed version.
df1 = pd.read_csv('earthquakes-dataset.csv', index_col=0, parse_dates=[['Date','Time']])
df1.head()
Latitude Longitude Type Depth Depth Error Depth Seismic Stations Magnitude Magnitude Type Magnitude Error Magnitude Seismic Stations Azimuthal Gap Horizontal Distance Horizontal Error Root Mean Square ID Source Location Source Magnitude Source Status
Date_Time
01/02/1965 13:44:18 19.246 145.616 Earthquake 131.6 NaN NaN 6.0 MW NaN NaN NaN NaN NaN NaN ISCGEM860706 ISCGEM ISCGEM ISCGEM Automatic
01/04/1965 11:29:49 1.863 127.352 Earthquake 80.0 NaN NaN 5.8 MW NaN NaN NaN NaN NaN NaN ISCGEM860737 ISCGEM ISCGEM ISCGEM Automatic
01/05/1965 18:05:58 -20.579 -173.972 Earthquake 20.0 NaN NaN 6.2 MW NaN NaN NaN NaN NaN NaN ISCGEM860762 ISCGEM ISCGEM ISCGEM Automatic
01/08/1965 18:49:43 -59.076 -23.557 Earthquake 15.0 NaN NaN 5.8 MW NaN NaN NaN NaN NaN NaN ISCGEM860856 ISCGEM ISCGEM ISCGEM Automatic
01/09/1965 13:32:50 11.938 126.427 Earthquake 15.0 NaN NaN 5.8 MW NaN NaN NaN NaN NaN NaN ISCGEM860890 ISCGEM ISCGEM ISCGEM Automatic
df1.index
Index(['01/02/1965 13:44:18', '01/04/1965 11:29:49', '01/05/1965 18:05:58',
       '01/08/1965 18:49:43', '01/09/1965 13:32:50', '01/10/1965 13:36:32',
       '01/12/1965 13:32:25', '01/15/1965 23:17:42', '01/16/1965 11:32:37',
       '01/17/1965 10:43:17',
       ...
       '12/24/2016 03:58:55', '12/25/2016 14:22:27', '12/25/2016 14:32:13',
       '12/27/2016 23:20:56', '12/28/2016 08:18:01', '12/28/2016 08:22:12',
       '12/28/2016 09:13:47', '12/28/2016 12:38:51', '12/29/2016 22:30:19',
       '12/30/2016 20:08:28'],
      dtype='object', name='Date_Time', length=23412)
# Convert the string index into a DatetimeIndex. errors='coerce' maps entries
# that fail to parse to NaT rather than raising.
df1.index = pd.to_datetime(df1.index, errors='coerce')
df1.index
DatetimeIndex(['1965-01-02 13:44:18', '1965-01-04 11:29:49',
               '1965-01-05 18:05:58', '1965-01-08 18:49:43',
               '1965-01-09 13:32:50', '1965-01-10 13:36:32',
               '1965-01-12 13:32:25', '1965-01-15 23:17:42',
               '1965-01-16 11:32:37', '1965-01-17 10:43:17',
               ...
               '2016-12-24 03:58:55', '2016-12-25 14:22:27',
               '2016-12-25 14:32:13', '2016-12-27 23:20:56',
               '2016-12-28 08:18:01', '2016-12-28 08:22:12',
               '2016-12-28 09:13:47', '2016-12-28 12:38:51',
               '2016-12-29 22:30:19', '2016-12-30 20:08:28'],
              dtype='datetime64[ns]', name='Date_Time', length=23412, freq=None)
df1.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23412 entries, 1965-01-02 13:44:18 to 2016-12-30 20:08:28
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Latitude                    23412 non-null  float64
 1   Longitude                   23412 non-null  float64
 2   Type                        23412 non-null  object 
 3   Depth                       23412 non-null  float64
 4   Depth Error                 4461 non-null   float64
 5   Depth Seismic Stations      7097 non-null   float64
 6   Magnitude                   23412 non-null  float64
 7   Magnitude Type              23409 non-null  object 
 8   Magnitude Error             327 non-null    float64
 9   Magnitude Seismic Stations  2564 non-null   float64
 10  Azimuthal Gap               7299 non-null   float64
 11  Horizontal Distance         1604 non-null   float64
 12  Horizontal Error            1156 non-null   float64
 13  Root Mean Square            17352 non-null  float64
 14  ID                          23412 non-null  object 
 15  Source                      23412 non-null  object 
 16  Location Source             23412 non-null  object 
 17  Magnitude Source            23412 non-null  object 
 18  Status                      23412 non-null  object 
dtypes: float64(12), object(7)
memory usage: 3.6+ MB

Drop any column (axis=1) that has null values

df1 = df1.dropna(axis=1)
df1.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23412 entries, 1965-01-02 13:44:18 to 2016-12-30 20:08:28
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Latitude          23412 non-null  float64
 1   Longitude         23412 non-null  float64
 2   Type              23412 non-null  object 
 3   Depth             23412 non-null  float64
 4   Magnitude         23412 non-null  float64
 5   ID                23412 non-null  object 
 6   Source            23412 non-null  object 
 7   Location Source   23412 non-null  object 
 8   Magnitude Source  23412 non-null  object 
 9   Status            23412 non-null  object 
dtypes: float64(4), object(6)
memory usage: 2.0+ MB

df2 = df1[['Latitude','Longitude','Type','Depth','Magnitude']]
df2.sample(5)
Latitude Longitude Type Depth Magnitude
Date_Time
2015-03-02 02:50:48 -59.561 -150.6522 Earthquake 13.0 5.5
2005-01-23 03:32:27 -13.882 66.1910 Earthquake 10.0 5.5
2010-07-20 19:38:10 27.022 53.8610 Earthquake 10.0 5.8
1989-02-10 16:59:21 6.317 92.2690 Earthquake 41.7 5.8
1975-01-17 09:30:42 -17.905 -174.5810 Earthquake 153.0 5.8
df2.Type.unique()
array(['Earthquake', 'Nuclear Explosion', 'Explosion', 'Rock Burst'],
      dtype=object)
df2[df2.Type=='Earthquake'].Magnitude.max()
9.1
df2[df2.Type=='Earthquake'].Depth.max()
700.0
df2[df2.Type=='Earthquake'].Depth.min()
-1.1
df2.Type.value_counts()
Earthquake           23232
Nuclear Explosion      175
Explosion                4
Rock Burst               1
Name: Type, dtype: int64
df2['Magnitude'][df2.Type=='Earthquake'].resample('1D').mean()
Date_Time
1965-01-02    6.00
1965-01-03     NaN
1965-01-04    5.80
1965-01-05    6.20
1965-01-06     NaN
              ... 
2016-12-26     NaN
2016-12-27    5.60
2016-12-28    5.65
2016-12-29    6.30
2016-12-30    5.50
Name: Magnitude, Length: 18991, dtype: float64
df2['Magnitude'][df2.Type=='Earthquake'].resample('3M').std()
Date_Time
1965-01-31    0.683318
1965-04-30    0.466179
1965-07-31    0.458874
1965-10-31    0.503535
1966-01-31    0.291472
                ...   
2016-01-31    0.497722
2016-04-30    0.445377
2016-07-31    0.388641
2016-10-31    0.406118
2017-01-31    0.530923
Name: Magnitude, Length: 209, dtype: float64
df2.loc[df2['Magnitude']==9.1]
Latitude Longitude Type Depth Magnitude
Date_Time
2004-12-26 00:58:53 3.295 95.982 Earthquake 30.0 9.1
2011-03-11 05:46:24 38.297 142.373 Earthquake 29.0 9.1
df2.loc[df2['Depth']==-1.1]
Latitude Longitude Type Depth Magnitude
Date_Time
1992-06-28 12:00:45 34.131 -116.408 Earthquake -1.1 5.77
# Count nuclear explosions per calendar year, ordered by year.
# NOTE(review): the years print as floats (1966.0, ...) — presumably because
# the index still contains NaT entries at this point; confirm.
df2.index.year[df2.Type=='Nuclear Explosion'].value_counts().sort_index()
1966.0     1
1968.0     2
1969.0     1
1970.0     1
1973.0     6
1974.0     6
1975.0     9
1976.0    10
1977.0     8
1978.0    15
1979.0    13
1980.0    13
1981.0     8
1982.0     6
1983.0    10
1984.0    14
1985.0     6
1986.0     1
1987.0    16
1988.0    10
1989.0     7
1990.0     4
1991.0     1
1992.0     1
1993.0     1
1994.0     2
1995.0     2
1996.0     1
Name: Date_Time, dtype: int64
import matplotlib as mpl
import matplotlib.pyplot as plt

Enable static images of plots inside the notebook

%matplotlib inline
plt.hist(df2['Magnitude'])
(array([1.4107e+04, 5.6570e+03, 1.8100e+03, 1.1000e+03, 3.9300e+02,
        2.1400e+02, 1.0400e+02, 1.9000e+01, 5.0000e+00, 3.0000e+00]),
 array([5.5 , 5.86, 6.22, 6.58, 6.94, 7.3 , 7.66, 8.02, 8.38, 8.74, 9.1 ]),
 <a list of 10 Patch objects>)
plt.hist(df2['Magnitude'])
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')
Text(0.5, 1.0, '1965-2016 Earthquakes')
plt.hist(df2['Magnitude'], bins=20, edgecolor='black')
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')
Text(0.5, 1.0, '1965-2016 Earthquakes')
# Zoom in on magnitudes 6 to 7 by passing explicit bin edges 0.1 apart
# instead of a bin count.
magnitude_edges = [6, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7]
plt.hist(df2['Magnitude'], bins=magnitude_edges, edgecolor='black')
plt.ylabel('Number of Earthquakes')
plt.xlabel('Magnitude')
plt.title('1965-2016 Earthquakes')
Text(0.5, 1.0, '1965-2016 Earthquakes')

Wrap the histogram code in a reusable function.

def myplot(bins):
    """Plot a labelled histogram of the ``Magnitude`` column of ``df2``.

    ``bins`` is forwarded unchanged to ``plt.hist``, so it can be a bin count
    (e.g. ``20``) or a sequence of bin edges (e.g. ``[6, 6.1, ..., 7]``).
    """
    # Honour the caller's bins; this was hard-coded to 20, which silently
    # ignored the argument.
    plt.hist(df2['Magnitude'], bins=bins, edgecolor='black')
    plt.xlabel('Magnitude')
    plt.ylabel('Number of Earthquakes')
    plt.title('1965-2016 Earthquakes')
myplot(20)

Use shift-tab multiple times to get documentation on a function

plt.hist(df2['Magnitude'], bins=20, edgecolor='black', range=[8,9.1])
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')
Text(0.5, 1.0, '1965-2016 Earthquakes')
# Yearly maximum earthquake magnitude. The mask is built from df2's own Type
# column (it was df1.Type) so it always has the same rows as the Series it
# filters, even after df2 has been filtered independently of df1.
plt.plot(df2['Magnitude'][df2.Type=='Earthquake'].resample('1Y').max())
[<matplotlib.lines.Line2D at 0x122a829d0>]

Investigate the shape-mismatch error by checking the shape of each array being plotted.

df2.index.year.unique().shape
(53,)
df2.index.year[df2.Type=='Earthquake'].value_counts().sort_index().shape
(52,)
df2.index.year.isnull().sum()
3
df2.loc[df2.index.year.isnull()]
Latitude Longitude Type Depth Magnitude
Date_Time
NaT 8.017 124.075 Earthquake 623.0 5.6
NaT -32.998 -71.766 Earthquake 33.0 5.6
NaT 36.344 142.344 Earthquake 10.1 5.8

The three rows with a null (NaT) timestamp are causing the mismatch, so drop them.

# Keep only the rows whose timestamp parsed successfully. Filtering on the
# index directly avoids the reset_index/set_index round trip and does not
# discard rows merely because some other column happens to hold a NaN.
df2 = df2[df2.index.notna()]

Now try the plot again.

# Count earthquakes per year once and plot that single Series, so each bar's
# position (the year) and height (the count) are guaranteed to belong together.
# Previously the x values came from the years of every event type, which only
# lined up with the earthquake counts when both had the same length and order.
quakes_per_year = df2.index.year[df2.Type=='Earthquake'].value_counts().sort_index()
plt.bar(quakes_per_year.index, quakes_per_year.values)
<BarContainer object of 52 artists>
# Count nuclear explosions per year once and use the same Series for both the
# bar positions and the bar heights. Building x with unique() (order of first
# appearance) and y with sort_index() (sorted by year) only pairs up correctly
# when the rows are already in chronological order.
nukes_per_year = df2.index.year[df2.Type=='Nuclear Explosion'].value_counts().sort_index()
plt.bar(nukes_per_year.index, nukes_per_year.values)
plt.title('Nukes per year')
Text(0.5, 1.0, 'Nukes per year')
# Earthquakes per year as a scatter plot. Years and counts are taken from one
# Series so every point's x and y are guaranteed to belong together, instead
# of relying on unique() and sort_index() happening to agree on the order.
quake_counts = df2.index.year[df2.Type=='Earthquake'].value_counts().sort_index()
plt.scatter(quake_counts.index, quake_counts.values)
<matplotlib.collections.PathCollection at 0x1231509d0>
# Earthquakes per year, with marker area showing that year's mean depth.
# The size array needs one entry per plotted point. Passing the whole
# per-event df2['Depth'] column did not match the yearly points, and its
# negative depth is what triggered the sqrt RuntimeWarning.
quakes = df2[df2.Type=='Earthquake']
yearly = quakes.groupby(quakes.index.year)['Depth'].agg(['size', 'mean'])
# Marker areas must be non-negative, so clip any negative mean depth to 0.
plt.scatter(yearly.index, yearly['size'], s=yearly['mean'].clip(lower=0),
        c='green', alpha=0.7)
/Users/stevebailey/opt/anaconda3/lib/python3.7/site-packages/matplotlib/collections.py:857: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor

<matplotlib.collections.PathCollection at 0x1231937d0>
plt.scatter(df2['Magnitude'],df2['Depth'])
<matplotlib.collections.PathCollection at 0x123129690>
# Map-style view: one marker per event at (longitude, latitude). Marker area
# scales with magnitude and marker colour encodes depth.
plt.figure(figsize=(19, 10))
marker_sizes = df2['Magnitude'] * 10
plt.scatter(df2['Longitude'], df2['Latitude'], s=marker_sizes, c=df2['Depth'])
<matplotlib.collections.PathCollection at 0x125591d90>
# Earthquakes per year, with marker area showing that year's mean depth.
# The size array needs one entry per plotted point; the full per-event
# df2['Depth'] column did not match the yearly points, and its negative depth
# caused the sqrt RuntimeWarning.
quakes = df2[df2.Type=='Earthquake']
yearly = quakes.groupby(quakes.index.year)['Depth'].agg(['size', 'mean'])
# Marker areas must be non-negative, so clip any negative mean depth to 0.
plt.scatter(yearly.index, yearly['size'], s=yearly['mean'].clip(lower=0),
        c='green', alpha=0.7)
# Empty scatters draw nothing; they only create legend entries that show
# reference marker sizes. (The unused `area = df2['Depth']` assignment, which
# the loop variable immediately overwrote, has been removed.)
for area in [10, 50, 100]:
    plt.scatter([], [], c='k', s=area, label=str(area))
plt.legend(scatterpoints=1, title='I am legend', loc='lower center')
/Users/stevebailey/opt/anaconda3/lib/python3.7/site-packages/matplotlib/collections.py:857: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor

<matplotlib.legend.Legend at 0x1266bbf90>