import numpy as nb

import pandas as pd

df1 = pd.read_csv('earthquakes-dataset.csv')

df1.head()

	Date	Time	Latitude	Longitude	Type	Depth	Depth Error	Depth Seismic Stations	Magnitude	Magnitude Type	...	Magnitude Seismic Stations	Azimuthal Gap	Horizontal Distance	Horizontal Error	Root Mean Square	ID	Source	Location Source	Magnitude Source	Status
0	01/02/1965	13:44:18	19.246	145.616	Earthquake	131.6	NaN	NaN	6.0	MW	...	NaN	NaN	NaN	NaN	NaN	ISCGEM860706	ISCGEM	ISCGEM	ISCGEM	Automatic
1	01/04/1965	11:29:49	1.863	127.352	Earthquake	80.0	NaN	NaN	5.8	MW	...	NaN	NaN	NaN	NaN	NaN	ISCGEM860737	ISCGEM	ISCGEM	ISCGEM	Automatic
2	01/05/1965	18:05:58	-20.579	-173.972	Earthquake	20.0	NaN	NaN	6.2	MW	...	NaN	NaN	NaN	NaN	NaN	ISCGEM860762	ISCGEM	ISCGEM	ISCGEM	Automatic
3	01/08/1965	18:49:43	-59.076	-23.557	Earthquake	15.0	NaN	NaN	5.8	MW	...	NaN	NaN	NaN	NaN	NaN	ISCGEM860856	ISCGEM	ISCGEM	ISCGEM	Automatic
4	01/09/1965	13:32:50	11.938	126.427	Earthquake	15.0	NaN	NaN	5.8	MW	...	NaN	NaN	NaN	NaN	NaN	ISCGEM860890	ISCGEM	ISCGEM	ISCGEM	Automatic

5 rows × 21 columns

date_demo = '21st of July 2000'

date_demo

'21st of July 2000'

date_demo_datetime = pd.to_datetime(date_demo)

date_demo_datetime

Timestamp('2000-07-21 00:00:00')

# Join the Date and Time strings and parse them into a single timestamp column.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
df1['Datetime'] = pd.to_datetime(df1['Date'] + ' ' + df1['Time'], errors = 'coerce')

df1.head()

	Date	Time	Latitude	Longitude	Type	Depth	Depth Error	Depth Seismic Stations	Magnitude	Magnitude Type	...	Azimuthal Gap	Horizontal Distance	Horizontal Error	Root Mean Square	ID	Source	Location Source	Magnitude Source	Status	Datetime
0	01/02/1965	13:44:18	19.246	145.616	Earthquake	131.6	NaN	NaN	6.0	MW	...	NaN	NaN	NaN	NaN	ISCGEM860706	ISCGEM	ISCGEM	ISCGEM	Automatic	1965-01-02 13:44:18
1	01/04/1965	11:29:49	1.863	127.352	Earthquake	80.0	NaN	NaN	5.8	MW	...	NaN	NaN	NaN	NaN	ISCGEM860737	ISCGEM	ISCGEM	ISCGEM	Automatic	1965-01-04 11:29:49
2	01/05/1965	18:05:58	-20.579	-173.972	Earthquake	20.0	NaN	NaN	6.2	MW	...	NaN	NaN	NaN	NaN	ISCGEM860762	ISCGEM	ISCGEM	ISCGEM	Automatic	1965-01-05 18:05:58
3	01/08/1965	18:49:43	-59.076	-23.557	Earthquake	15.0	NaN	NaN	5.8	MW	...	NaN	NaN	NaN	NaN	ISCGEM860856	ISCGEM	ISCGEM	ISCGEM	Automatic	1965-01-08 18:49:43
4	01/09/1965	13:32:50	11.938	126.427	Earthquake	15.0	NaN	NaN	5.8	MW	...	NaN	NaN	NaN	NaN	ISCGEM860890	ISCGEM	ISCGEM	ISCGEM	Automatic	1965-01-09 13:32:50

5 rows × 22 columns

df1['Datetime'][0]

Timestamp('1965-01-02 13:44:18')

df1['Datetime'][0].strftime('%A or %a and in %B')

'Saturday or Sat and in January'

df1 = df1.set_index(['Datetime'])

df1.head()

	Date	Time	Latitude	Longitude	Type	Depth	Depth Error	Depth Seismic Stations	Magnitude	Magnitude Type	...	Magnitude Seismic Stations	Azimuthal Gap	Horizontal Distance	Horizontal Error	Root Mean Square	ID	Source	Location Source	Magnitude Source	Status
Datetime
1965-01-02 13:44:18	01/02/1965	13:44:18	19.246	145.616	Earthquake	131.6	NaN	NaN	6.0	MW	...	NaN	NaN	NaN	NaN	NaN	ISCGEM860706	ISCGEM	ISCGEM	ISCGEM	Automatic
1965-01-04 11:29:49	01/04/1965	11:29:49	1.863	127.352	Earthquake	80.0	NaN	NaN	5.8	MW	...	NaN	NaN	NaN	NaN	NaN	ISCGEM860737	ISCGEM	ISCGEM	ISCGEM	Automatic
1965-01-05 18:05:58	01/05/1965	18:05:58	-20.579	-173.972	Earthquake	20.0	NaN	NaN	6.2	MW	...	NaN	NaN	NaN	NaN	NaN	ISCGEM860762	ISCGEM	ISCGEM	ISCGEM	Automatic
1965-01-08 18:49:43	01/08/1965	18:49:43	-59.076	-23.557	Earthquake	15.0	NaN	NaN	5.8	MW	...	NaN	NaN	NaN	NaN	NaN	ISCGEM860856	ISCGEM	ISCGEM	ISCGEM	Automatic
1965-01-09 13:32:50	01/09/1965	13:32:50	11.938	126.427	Earthquake	15.0	NaN	NaN	5.8	MW	...	NaN	NaN	NaN	NaN	NaN	ISCGEM860890	ISCGEM	ISCGEM	ISCGEM	Automatic

5 rows × 21 columns

df1.dtypes

Date                           object
Time                           object
Latitude                      float64
Longitude                     float64
Type                           object
Depth                         float64
Depth Error                   float64
Depth Seismic Stations        float64
Magnitude                     float64
Magnitude Type                 object
Magnitude Error               float64
Magnitude Seismic Stations    float64
Azimuthal Gap                 float64
Horizontal Distance           float64
Horizontal Error              float64
Root Mean Square              float64
ID                             object
Source                         object
Location Source                object
Magnitude Source               object
Status                         object
dtype: object

# Cast Depth from float64 to an integer dtype; fractional depths such as 131.6
# lose their decimal part in the process.
df1['Depth'] = df1['Depth'].astype(int)

df1.dtypes

Date                           object
Time                           object
Latitude                      float64
Longitude                     float64
Type                           object
Depth                           int64
Depth Error                   float64
Depth Seismic Stations        float64
Magnitude                     float64
Magnitude Type                 object
Magnitude Error               float64
Magnitude Seismic Stations    float64
Azimuthal Gap                 float64
Horizontal Distance           float64
Horizontal Error              float64
Root Mean Square              float64
ID                             object
Source                         object
Location Source                object
Magnitude Source               object
Status                         object
dtype: object

Using pandas, read the CSV again with read_csv, combine the Date and Time columns into a single Date_Time column, and make it the index.

# Combining columns through a nested parse_dates list is deprecated in recent
# pandas, so build the Date_Time index explicitly. The index is left as the
# combined "Date Time" strings, which is what the nested parse_dates call
# produced for this file.
df1 = pd.read_csv('earthquakes-dataset.csv')
date_time = df1.pop('Date') + ' ' + df1.pop('Time')
df1 = df1.set_index(date_time.rename('Date_Time'))

df1.head()

	Latitude	Longitude	Type	Depth	Depth Error	Depth Seismic Stations	Magnitude	Magnitude Type	Magnitude Error	Magnitude Seismic Stations	Azimuthal Gap	Horizontal Distance	Horizontal Error	Root Mean Square	ID	Source	Location Source	Magnitude Source	Status
Date_Time
01/02/1965 13:44:18	19.246	145.616	Earthquake	131.6	NaN	NaN	6.0	MW	NaN	NaN	NaN	NaN	NaN	NaN	ISCGEM860706	ISCGEM	ISCGEM	ISCGEM	Automatic
01/04/1965 11:29:49	1.863	127.352	Earthquake	80.0	NaN	NaN	5.8	MW	NaN	NaN	NaN	NaN	NaN	NaN	ISCGEM860737	ISCGEM	ISCGEM	ISCGEM	Automatic
01/05/1965 18:05:58	-20.579	-173.972	Earthquake	20.0	NaN	NaN	6.2	MW	NaN	NaN	NaN	NaN	NaN	NaN	ISCGEM860762	ISCGEM	ISCGEM	ISCGEM	Automatic
01/08/1965 18:49:43	-59.076	-23.557	Earthquake	15.0	NaN	NaN	5.8	MW	NaN	NaN	NaN	NaN	NaN	NaN	ISCGEM860856	ISCGEM	ISCGEM	ISCGEM	Automatic
01/09/1965 13:32:50	11.938	126.427	Earthquake	15.0	NaN	NaN	5.8	MW	NaN	NaN	NaN	NaN	NaN	NaN	ISCGEM860890	ISCGEM	ISCGEM	ISCGEM	Automatic

df1.index

Index(['01/02/1965 13:44:18', '01/04/1965 11:29:49', '01/05/1965 18:05:58',
       '01/08/1965 18:49:43', '01/09/1965 13:32:50', '01/10/1965 13:36:32',
       '01/12/1965 13:32:25', '01/15/1965 23:17:42', '01/16/1965 11:32:37',
       '01/17/1965 10:43:17',
       ...
       '12/24/2016 03:58:55', '12/25/2016 14:22:27', '12/25/2016 14:32:13',
       '12/27/2016 23:20:56', '12/28/2016 08:18:01', '12/28/2016 08:22:12',
       '12/28/2016 09:13:47', '12/28/2016 12:38:51', '12/29/2016 22:30:19',
       '12/30/2016 20:08:28'],
      dtype='object', name='Date_Time', length=23412)

# Convert the string index into a DatetimeIndex; entries that cannot be parsed
# become NaT rather than raising an error.
df1.index = pd.to_datetime(df1.index, errors='coerce')

df1.index

DatetimeIndex(['1965-01-02 13:44:18', '1965-01-04 11:29:49',
               '1965-01-05 18:05:58', '1965-01-08 18:49:43',
               '1965-01-09 13:32:50', '1965-01-10 13:36:32',
               '1965-01-12 13:32:25', '1965-01-15 23:17:42',
               '1965-01-16 11:32:37', '1965-01-17 10:43:17',
               ...
               '2016-12-24 03:58:55', '2016-12-25 14:22:27',
               '2016-12-25 14:32:13', '2016-12-27 23:20:56',
               '2016-12-28 08:18:01', '2016-12-28 08:22:12',
               '2016-12-28 09:13:47', '2016-12-28 12:38:51',
               '2016-12-29 22:30:19', '2016-12-30 20:08:28'],
              dtype='datetime64[ns]', name='Date_Time', length=23412, freq=None)

df1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23412 entries, 1965-01-02 13:44:18 to 2016-12-30 20:08:28
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Latitude                    23412 non-null  float64
 1   Longitude                   23412 non-null  float64
 2   Type                        23412 non-null  object 
 3   Depth                       23412 non-null  float64
 4   Depth Error                 4461 non-null   float64
 5   Depth Seismic Stations      7097 non-null   float64
 6   Magnitude                   23412 non-null  float64
 7   Magnitude Type              23409 non-null  object 
 8   Magnitude Error             327 non-null    float64
 9   Magnitude Seismic Stations  2564 non-null   float64
 10  Azimuthal Gap               7299 non-null   float64
 11  Horizontal Distance         1604 non-null   float64
 12  Horizontal Error            1156 non-null   float64
 13  Root Mean Square            17352 non-null  float64
 14  ID                          23412 non-null  object 
 15  Source                      23412 non-null  object 
 16  Location Source             23412 non-null  object 
 17  Magnitude Source            23412 non-null  object 
 18  Status                      23412 non-null  object 
dtypes: float64(12), object(7)
memory usage: 3.6+ MB

# axis=1 drops columns, not rows: every column that contains at least one
# missing value is removed.
df1 = df1.dropna(axis=1)

df1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23412 entries, 1965-01-02 13:44:18 to 2016-12-30 20:08:28
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Latitude          23412 non-null  float64
 1   Longitude         23412 non-null  float64
 2   Type              23412 non-null  object 
 3   Depth             23412 non-null  float64
 4   Magnitude         23412 non-null  float64
 5   ID                23412 non-null  object 
 6   Source            23412 non-null  object 
 7   Location Source   23412 non-null  object 
 8   Magnitude Source  23412 non-null  object 
 9   Status            23412 non-null  object 
dtypes: float64(4), object(6)
memory usage: 2.0+ MB

df2 = df1[['Latitude','Longitude','Type','Depth','Magnitude']]

df2.sample(5)

	Latitude	Longitude	Type	Depth	Magnitude
Date_Time
2015-03-02 02:50:48	-59.561	-150.6522	Earthquake	13.0	5.5
2005-01-23 03:32:27	-13.882	66.1910	Earthquake	10.0	5.5
2010-07-20 19:38:10	27.022	53.8610	Earthquake	10.0	5.8
1989-02-10 16:59:21	6.317	92.2690	Earthquake	41.7	5.8
1975-01-17 09:30:42	-17.905	-174.5810	Earthquake	153.0	5.8

df2.Type.unique()

array(['Earthquake', 'Nuclear Explosion', 'Explosion', 'Rock Burst'],
      dtype=object)

df2[df2.Type=='Earthquake'].Magnitude.max()

9.1

df2[df2.Type=='Earthquake'].Depth.max()

700.0

df2[df2.Type=='Earthquake'].Depth.min()

-1.1

df2.Type.value_counts()

Earthquake           23232
Nuclear Explosion      175
Explosion                4
Rock Burst               1
Name: Type, dtype: int64

df2['Magnitude'][df2.Type=='Earthquake'].resample('1D').mean()

Date_Time
1965-01-02    6.00
1965-01-03     NaN
1965-01-04    5.80
1965-01-05    6.20
1965-01-06     NaN
              ... 
2016-12-26     NaN
2016-12-27    5.60
2016-12-28    5.65
2016-12-29    6.30
2016-12-30    5.50
Name: Magnitude, Length: 18991, dtype: float64

df2['Magnitude'][df2.Type=='Earthquake'].resample('3M').std()

Date_Time
1965-01-31    0.683318
1965-04-30    0.466179
1965-07-31    0.458874
1965-10-31    0.503535
1966-01-31    0.291472
                ...   
2016-01-31    0.497722
2016-04-30    0.445377
2016-07-31    0.388641
2016-10-31    0.406118
2017-01-31    0.530923
Name: Magnitude, Length: 209, dtype: float64

df2.loc[df2['Magnitude']==9.1]

	Latitude	Longitude	Type	Depth	Magnitude
Date_Time
2004-12-26 00:58:53	3.295	95.982	Earthquake	30.0	9.1
2011-03-11 05:46:24	38.297	142.373	Earthquake	29.0	9.1

df2.loc[df2['Depth']==-1.1]

	Latitude	Longitude	Type	Depth	Magnitude
Date_Time
1992-06-28 12:00:45	34.131	-116.408	Earthquake	-1.1	5.77

df2.index.year[df2.Type=='Nuclear Explosion'].value_counts().sort_index()

1966.0     1
1968.0     2
1969.0     1
1970.0     1
1973.0     6
1974.0     6
1975.0     9
1976.0    10
1977.0     8
1978.0    15
1979.0    13
1980.0    13
1981.0     8
1982.0     6
1983.0    10
1984.0    14
1985.0     6
1986.0     1
1987.0    16
1988.0    10
1989.0     7
1990.0     4
1991.0     1
1992.0     1
1993.0     1
1994.0     2
1995.0     2
1996.0     1
Name: Date_Time, dtype: int64

import matplotlib as mpl
import matplotlib.pyplot as plt

Enable static images of plots inside the notebook

%matplotlib inline

plt.hist(df2['Magnitude'])

(array([1.4107e+04, 5.6570e+03, 1.8100e+03, 1.1000e+03, 3.9300e+02,
        2.1400e+02, 1.0400e+02, 1.9000e+01, 5.0000e+00, 3.0000e+00]),
 array([5.5 , 5.86, 6.22, 6.58, 6.94, 7.3 , 7.66, 8.02, 8.38, 8.74, 9.1 ]),
 <a list of 10 Patch objects>)

plt.hist(df2['Magnitude'])
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')

Text(0.5, 1.0, '1965-2016 Earthquakes')

plt.hist(df2['Magnitude'], bins=20, edgecolor='black')
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')

Text(0.5, 1.0, '1965-2016 Earthquakes')

# Passing a list as bins gives explicit bin edges: ten bins of width 0.1
# covering magnitudes from 6 to 7.
plt.hist(df2['Magnitude'], bins=[6,6.1,6.2,6.3,6.4,6.5,6.6,6.7,6.8,6.9,7], edgecolor='black')
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')

Text(0.5, 1.0, '1965-2016 Earthquakes')

Wrap the histogram code in a reusable function.

def myplot(bins):
    """Plot a histogram of the ``Magnitude`` column of the module-level ``df2``.

    Args:
        bins: Bin specification forwarded to ``plt.hist`` (for example an
            integer bin count or a sequence of bin edges).
    """
    # Forward the caller's choice; previously the argument was ignored and the
    # histogram always used 20 bins.
    plt.hist(df2['Magnitude'], bins=bins, edgecolor='black')
    plt.xlabel('Magnitude')
    plt.ylabel('Number of Earthquakes')
    plt.title('1965-2016 Earthquakes')

myplot(20)

Press Shift+Tab repeatedly in Jupyter to view the documentation for a function.

# range restricts the histogram to magnitudes between 8 and 9.1; the 20 bins
# are spread across that interval only.
plt.hist(df2['Magnitude'], bins=20, edgecolor='black', range=[8,9.1])
plt.xlabel('Magnitude')
plt.ylabel('Number of Earthquakes')
plt.title('1965-2016 Earthquakes')

Text(0.5, 1.0, '1965-2016 Earthquakes')

# Largest earthquake magnitude per calendar year. The row filter is taken from
# df2 itself so the mask and the data always come from the same frame.
quake_magnitudes = df2.loc[df2.Type == 'Earthquake', 'Magnitude']
plt.plot(quake_magnitudes.resample('1Y').max())

[<matplotlib.lines.Line2D at 0x122a829d0>]

Investigate the shape mismatch error by checking the shape of each part

df2.index.year.unique().shape

(53,)

df2.index.year[df2.Type=='Earthquake'].value_counts().sort_index().shape

(52,)

df2.index.year.isnull().sum()

df2.loc[df2.index.year.isnull()]

	Latitude	Longitude	Type	Depth	Magnitude
Date_Time
NaT	8.017	124.075	Earthquake	623.0	5.6
NaT	-32.998	-71.766	Earthquake	33.0	5.6
NaT	36.344	142.344	Earthquake	10.1	5.8

The null (NaT) index entries are causing the shape mismatch, so drop those rows.

# Keep only rows whose Date_Time index is a valid timestamp. Filtering on the
# index alone avoids discarding rows because of a null in some other column.
df2 = df2[df2.index.notna()]

Now try the bar chart again.

# Number of earthquakes per year. Both the x positions and the bar heights are
# taken from the same Series, so each year is always paired with its own count
# regardless of row order or of years that contain no earthquakes.
quakes_per_year = df2.index.year[df2.Type=='Earthquake'].value_counts().sort_index()
plt.bar(quakes_per_year.index, quakes_per_year.values)

<BarContainer object of 52 artists>

# Number of nuclear explosions per year. Years and counts come from one Series
# so that each bar height is matched to the correct year.
nukes_per_year = df2.index.year[df2.Type=='Nuclear Explosion'].value_counts().sort_index()
plt.bar(nukes_per_year.index, nukes_per_year.values)
plt.title('Nukes per year')

Text(0.5, 1.0, 'Nukes per year')

# Earthquake count per year as a scatter plot. x and y are read from the same
# Series so every point pairs a year with its own count.
quakes_per_year = df2.index.year[df2.Type=='Earthquake'].value_counts().sort_index()
plt.scatter(quakes_per_year.index, quakes_per_year.values)

<matplotlib.collections.PathCollection at 0x1231509d0>

# One point per year: x = year, y = number of earthquakes, marker size = mean
# earthquake depth in that year.
# The marker sizes must have one entry per plotted point; passing the raw
# per-event Depth column did not line up with the yearly points and included
# negative depths, which triggered the "invalid value encountered in sqrt"
# warning. Negative means are clipped to 0 because a marker area cannot be
# negative.
quakes = df2[df2.Type=='Earthquake']
yearly = quakes.groupby(quakes.index.year)['Depth'].agg(['count', 'mean'])
plt.scatter(yearly.index, yearly['count'], s=yearly['mean'].clip(lower=0), c='green', alpha=0.7)

/Users/stevebailey/opt/anaconda3/lib/python3.7/site-packages/matplotlib/collections.py:857: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor

<matplotlib.collections.PathCollection at 0x1231937d0>

plt.scatter(df2['Magnitude'],df2['Depth'])

<matplotlib.collections.PathCollection at 0x123129690>

# Plot every event at its longitude/latitude: marker area scales with
# magnitude and marker colour encodes depth.
plt.figure(figsize=(19, 10))
plt.scatter(
    x=df2['Longitude'],
    y=df2['Latitude'],
    s=df2['Magnitude'] * 10,
    c=df2['Depth'],
)

<matplotlib.collections.PathCollection at 0x125591d90>

# One point per year: x = year, y = number of earthquakes, marker size = mean
# earthquake depth in that year (clipped at 0, since a marker area cannot be
# negative). Sizes are aggregated per year so they line up with the points.
quakes = df2[df2.Type=='Earthquake']
yearly = quakes.groupby(quakes.index.year)['Depth'].agg(['count', 'mean'])
plt.scatter(yearly.index, yearly['count'], s=yearly['mean'].clip(lower=0), c='green', alpha=0.7)
# Draw empty scatter series purely to create legend entries that show what
# marker areas of 10, 50 and 100 look like.
for marker_area in [10,50,100]:
    plt.scatter([],[],c='k',s=marker_area,label=str(marker_area))
plt.legend(scatterpoints=1, title='I am legend', loc='lower center')

/Users/stevebailey/opt/anaconda3/lib/python3.7/site-packages/matplotlib/collections.py:857: RuntimeWarning: invalid value encountered in sqrt
  scale = np.sqrt(self._sizes) * dpi / 72.0 * self._factor

<matplotlib.legend.Legend at 0x1266bbf90>

Keys	Action
`?`	Open this help
`n`	Next page
`p`	Previous page
`s`	Search