GBIF Data Download and Methods for 2019¶

This notebook downloads migration data from GBIF for the year of 2019, and runs through the steps and methods to create a month-by-month migration map with an interative slider.

In [1]:
# Important: If you haven't already, you need to install pygbif using the command below
# Run this command in the terminal to install: pip install pygbif

# Import packages
import os # For file paths, data management, interacting with the operating system
import pathlib # Object oriented file path management
import time # Time how long operations take in Python
import zipfile # Deal with zip files
from getpass import getpass # Security login for downloading GBIF data
from glob import glob # Search for specific file extensions

# Libraries for data analysis 
import pandas as pd # Data analysis for tabular data
import geopandas as gpd # Data analysis for geospatial data
import pygbif.occurrences as occ # Get occurrence data
import pygbif.species as species # Get species data
import requests # Download data

# Libraries for Dynamic mapping
import cartopy.crs as ccrs #Coordinate reference systems 
import panel as pn #customize the dynamic map
import hvplot.pandas #make the holoview plot with geo-dataframe
import calendar # Change month numbers to month names
In [2]:
# Define data directory in user's home directory
data_dir_2019 = os.path.join(
    # Home directory
    pathlib.Path.home(),
    # Earth analytics data directory
    'earth-analytics',
    'data',
    # Project directory
    'redstart-migration-2019',
)
print(data_dir_2019)
# Make the directory using the path defined above
os.makedirs(data_dir_2019, exist_ok=True)
C:\Users\warno\earth-analytics\data\redstart-migration-2019
In [3]:
# GBIF LOGIN CODE CHUNK
# Code credit: Elsa Culler
# This code allows us to download data in Python from gbif.org
# This code ASKS for your credentials for gbif.org
# and saves it for the rest of the session.
# NEVER put your credentials into your code

# GBIF needs a username, password, and email 
# All 3 need to match the account
reset = False

# Request and store username
if (not ('GBIF_USER'  in os.environ)) or reset:
    os.environ['GBIF_USER'] = input('GBIF username:')

# Securely request and store password
if (not ('GBIF_PWD'  in os.environ)) or reset:
    os.environ['GBIF_PWD'] = getpass('GBIF password:')
    
# Request and store account email address
if (not ('GBIF_EMAIL'  in os.environ)) or reset:
    os.environ['GBIF_EMAIL'] = input('GBIF email:')
In [4]:
# Search for the American Redstart (Setophaga ruticilla)
backbone = species.name_backbone(name='Setophaga ruticilla')

# Save the species key
species_key = backbone["speciesKey"]
backbone
Out[4]:
{'usageKey': 2489985,
 'scientificName': 'Setophaga ruticilla (Linnaeus, 1758)',
 'canonicalName': 'Setophaga ruticilla',
 'rank': 'SPECIES',
 'status': 'ACCEPTED',
 'confidence': 99,
 'matchType': 'EXACT',
 'kingdom': 'Animalia',
 'phylum': 'Chordata',
 'order': 'Passeriformes',
 'family': 'Parulidae',
 'genus': 'Setophaga',
 'species': 'Setophaga ruticilla',
 'kingdomKey': 1,
 'phylumKey': 44,
 'classKey': 212,
 'orderKey': 729,
 'familyKey': 5263,
 'genusKey': 2489984,
 'speciesKey': 2489985,
 'class': 'Aves'}

Submit download request to GBIF and download occurrence data for year of interest

In [5]:
gbif_pattern_2019 = os.path.join(data_dir_2019, '*.csv')
if not glob(gbif_pattern_2019):
    # Only submit one request
    if not 'GBIF_DOWNLOAD_KEY' in os.environ:
        # Submit query to GBIF
        gbif_query = occ.download([
            f"speciesKey = { species_key }",
            "hasCoordinate = True",
            "year = 2009",
        ])
        os.environ['GBIF_DOWNLOAD_KEY'] = gbif_query[0]
        print("Submitted query to GBIF")
    
    # Wait for download to build
    download_key = os.environ['GBIF_DOWNLOAD_KEY']
    wait = occ.download_meta(download_key)['status']
    while not  wait=='SUCCEEDED':
        wait = occ.download_meta(download_key)['status']
        time.sleep(5)
    print("Building download")

    # Download GBIF data
    print("Downloading GBIF Data")
    download_info = occ.download_get(
        os.environ['GBIF_DOWNLOAD_KEY'],
        path=data_dir_2019)
    print("Finished Downloading GBIF Data")
    
    # Unzip GBIF data
    with zipfile.ZipFile(download_info['path']) as download_zip:
        download_zip.extractall(path=data_dir_2019)
    print("Unzipping GBIF data")

# Find extracted .csv file path and take 1st result
gbif_path_2019 = glob(gbif_pattern_2019)[0]

Read in the downloaded data from GBIF

In [6]:
with open(gbif_path_2019) as f:
    for i in range(2):
        print(f.readline().strip())
gbifID	datasetKey	occurrenceID	kingdom	phylum	class	order	family	genus	species	infraspecificEpithet	taxonRank	scientificName	verbatimScientificName	verbatimScientificNameAuthorship	countryCode	locality	stateProvince	occurrenceStatus	individualCount	publishingOrgKey	decimalLatitude	decimalLongitude	coordinateUncertaintyInMeters	coordinatePrecision	elevation	elevationAccuracy	depth	depthAccuracy	eventDate	day	month	year	taxonKey	speciesKey	basisOfRecord	institutionCode	collectionCode	catalogNumber	recordNumber	identifiedBy	dateIdentified	license	rightsHolder	recordedBy	typeStatus	establishmentMeans	lastInterpreted	mediaType	issue
2350139097	36f15a36-6b45-442e-9c31-cd633423aee0	9c9a443e-bd18-46bd-a90a-ea22ac073dcc	Animalia	Chordata	Aves	Passeriformes	Parulidae	Setophaga	Setophaga ruticilla		SPECIES	Setophaga ruticilla (Linnaeus, 1758)	Setophaga ruticilla		US	320 N Clark	Illinois	PRESENT		7b8aff00-a9f8-11d8-944b-b8a03c50a862	41.88803	-87.63103	100.0						2019-05-16	16	5	2019	2489985	2489985	PRESERVED_SPECIMEN	FMNH	Birds	517833	4094266			CC0_1_0	The Field Museum of Natural History	D. Cohen			2025-11-25T13:05:56.559Z		COORDINATE_ROUNDED;INSTITUTION_MATCH_FUZZY;COLLECTION_MATCH_FUZZY

Load the GBIF Data

In [7]:
# Load the GBIF data, this time defining the delimiter (tabular) represents as '\t'
# We also set the index to gbifID, and select the columns of interest (lat, long, and month)
gbif_df = pd.read_csv(
    gbif_path_2019,
    delimiter='\t',
    index_col='gbifID',
    usecols=['gbifID','decimalLongitude','decimalLatitude','month']
)
gbif_df.head()
Out[7]:
decimalLatitude decimalLongitude month
gbifID
2350139097 41.888030 -87.631030 5.0
3881981512 41.877678 -87.636623 9.0
2421785940 41.886963 -87.622959 5.0
2598325430 41.880675 -87.634880 8.0
2421786429 41.886191 -87.624431 5.0

Converting the dataframe to a geodataframe

In [8]:
# Converting the dataframe to a geodataframe
# Using geopandas library
gbif_gdf = (
    gpd.GeoDataFrame(
        gbif_df, 
        geometry=gpd.points_from_xy(
            gbif_df.decimalLongitude, 
            gbif_df.decimalLatitude), 
        crs="EPSG:4326") # Manually set the crs to match the ecoregions data
    # Select the desired columns
    [['month','geometry']]
)
gbif_gdf
Out[8]:
month geometry
gbifID
2350139097 5.0 POINT (-87.63103 41.88803)
3881981512 9.0 POINT (-87.63662 41.87768)
2421785940 5.0 POINT (-87.62296 41.88696)
2598325430 8.0 POINT (-87.63488 41.88068)
2421786429 5.0 POINT (-87.62443 41.88619)
... ... ...
2432448138 9.0 POINT (-109.0521 25.6036)
2432420665 6.0 POINT (-76.3351 42.3306)
2432426881 6.0 POINT (-76.3669 42.3473)
2432422656 6.0 POINT (-76.3351 42.3306)
3056502334 8.0 POINT (-96.34039 30.62109)

308536 rows × 2 columns

Set up the ecoregion boundary URL

In [9]:
# Ecoregions data from GeographyRealm, RESOLVE, and OneEarth: 
# GeographyRealm: https://www.geographyrealm.com/terrestrial-ecoregions-gis-data/
# OneEarth: https://www.oneearth.org/announcing-the-release-of-ecoregion-snapshots/
eco_url = (
    "https://storage.googleapis.com/teow2016/Ecoregions2017.zip")

# Set up a path to save the data to your machine
eco_dir = os.path.join(data_dir_2019, 'resolve_ecoregions') 
# Make ecoregions directory
os.makedirs(eco_dir, exist_ok=True)

# Join ecoregions shapefile path
eco_path = os.path.join(eco_dir, 'ecoregions.shp')

# Only download once
if not os.path.exists(eco_path):
    my_gdf = gpd.read_file(eco_url)
    my_gdf.to_file(eco_path)

Open the downloaded ecoregion boundaries

In [10]:
# Opening the download and renaming some of the column names
eco_gdf = (
    gpd.read_file(eco_path)
    [['OBJECTID','ECO_NAME','SHAPE_AREA','geometry']]
    .rename(columns={
        'OBJECTID': 'ecoregion_id',
        'ECO_NAME': 'name',
        'SHAPE_AREA': 'area'
    })
    .set_index('ecoregion_id')
)

# Plot the ecoregions quickly to check download
eco_gdf.plot()
eco_gdf.head()
Out[10]:
name area geometry
ecoregion_id
1.0 Adelie Land tundra 0.038948 MULTIPOLYGON (((158.7141 -69.60657, 158.71264 ...
2.0 Admiralty Islands lowland rain forests 0.170599 MULTIPOLYGON (((147.28819 -2.57589, 147.2715 -...
3.0 Aegean and Western Turkey sclerophyllous and m... 13.844952 MULTIPOLYGON (((26.88659 35.32161, 26.88297 35...
4.0 Afghan Mountains semi-desert 1.355536 MULTIPOLYGON (((65.48655 34.71401, 65.52872 34...
5.0 Ahklun and Kilbuck Upland Tundra 8.196573 MULTIPOLYGON (((-160.26404 58.64097, -160.2673...
No description has been provided for this image

Join the ecoregion and GBIF data

To identify the ecoregion for each observation, we need to join the eco_gdf and gbif_gdf together. The joined data is a new variable called gbif_ecoregion_gdf

In [11]:
gbif_ecoregion_gdf = (
    eco_gdf
    # Match the CRS of the GBIF data and the ecoregions
    .to_crs(gbif_gdf.crs)
    # Find ecoregion for each observation
    .sjoin(
        gbif_gdf,
        how='inner', 
        predicate='contains')
)
gbif_ecoregion_gdf
Out[11]:
name area geometry gbifID month
ecoregion_id
13.0 Alberta-British Columbia foothills forests 17.133639 MULTIPOLYGON (((-119.53979 55.81661, -119.5443... 2791146151 5.0
13.0 Alberta-British Columbia foothills forests 17.133639 MULTIPOLYGON (((-119.53979 55.81661, -119.5443... 2700037181 5.0
13.0 Alberta-British Columbia foothills forests 17.133639 MULTIPOLYGON (((-119.53979 55.81661, -119.5443... 2746629347 6.0
13.0 Alberta-British Columbia foothills forests 17.133639 MULTIPOLYGON (((-119.53979 55.81661, -119.5443... 2703878404 6.0
13.0 Alberta-British Columbia foothills forests 17.133639 MULTIPOLYGON (((-119.53979 55.81661, -119.5443... 5564237904 5.0
... ... ... ... ... ...
844.0 Lesser Antillean dry forests 0.053407 MULTIPOLYGON (((-61.65263 12.23089, -61.63773 ... 2844359951 4.0
844.0 Lesser Antillean dry forests 0.053407 MULTIPOLYGON (((-61.65263 12.23089, -61.63773 ... 2834405031 10.0
844.0 Lesser Antillean dry forests 0.053407 MULTIPOLYGON (((-61.65263 12.23089, -61.63773 ... 2720187298 10.0
844.0 Lesser Antillean dry forests 0.053407 MULTIPOLYGON (((-61.65263 12.23089, -61.63773 ... 2701542161 10.0
844.0 Lesser Antillean dry forests 0.053407 MULTIPOLYGON (((-61.65263 12.23089, -61.63773 ... 2659622087 3.0

299909 rows × 5 columns

Count observations in each ecoregion each month

In [12]:
# Count the observations in each ecoregion each month
occurrence_df = (
    gbif_ecoregion_gdf
    # Reset index
    .reset_index()
    # For each ecoregion, for each month...
    .groupby(['ecoregion_id','month'])
    # ...count the number of occurrences
    # creating a new column 'occurrences' that counts all the gbif IDs in a group
    .agg(
        occurrences=('gbifID', 'count'),
        area=('area','first')) 
)

Normalize by area

This step gives us a new column called "density" which shows the number of occurrences divided by the ecoregion area. Not all ecoregions are the same size, so this step accounts for this. The density column gives us a count of the number of occurences for a given unit area.

In [13]:
#Normalize by area
occurrence_df['density'] = (
    occurrence_df.occurrences
    / occurrence_df.area
)

# Get rid of rare observations (possible misidentification?)
occurrence_df = occurrence_df[occurrence_df.occurrences>1]

Take the mean by ecoregion

Gives us the mean number of occurrences in an ecoregion.

In [14]:
# Take the mean by ecoregion
mean_occurrences_by_ecoregion = (
    occurrence_df
    .groupby('ecoregion_id')
    .mean()
)
mean_occurrences_by_ecoregion
Out[14]:
occurrences area density
ecoregion_id
13.0 22.000000 17.133639 1.284024
17.0 848.714286 7.958751 106.639134
23.0 26.600000 3.346216 7.949278
33.0 1273.363636 16.637804 76.534359
34.0 640.142857 18.674884 34.278278
... ... ... ...
833.0 97.777778 0.610793 160.083203
838.0 23.000000 4.286144 5.366129
839.0 240.666667 35.905513 6.702777
843.0 6.000000 0.022340 268.576193
844.0 5.000000 0.053407 93.621001

166 rows × 3 columns

Take the mean by month

Gives us the mean number of occurrences in a given month

In [15]:
# Take the mean by month
mean_occurrences_by_month = (
    occurrence_df
    .groupby('month')
    .mean()
)
mean_occurrences_by_month
Out[15]:
occurrences area density
month
1.0 82.592105 4.905595 105.226842
2.0 70.460526 5.727130 71.482392
3.0 73.945205 5.178642 96.490903
4.0 118.237113 8.370935 180.450796
5.0 1114.688679 15.329932 112.975132
6.0 675.935484 23.504186 40.413337
7.0 368.000000 25.489154 21.327626
8.0 250.512821 19.117869 23.015322
9.0 385.723214 13.356664 75.044759
10.0 181.285714 8.352464 131.956897
11.0 62.831325 6.413926 78.131438
12.0 54.321429 6.182757 131.461003

Normalize by space and time for sampling effort

The goal of this step is to remove systematic differences such as some months have more sampling efforts because more birdwatchers are outside, and some regions are more heavily surveyed.

The goal of the new column, "norm_occurrences" is to give an occurrence count that is normalized for space and time differences.

In [16]:
# Normalize by space and time for sampling effort
occurrence_df['norm_occurrences'] = (
    occurrence_df[['density']]
    / mean_occurrences_by_ecoregion[['density']]
    / mean_occurrences_by_month[['density']]
)
occurrence_df
Out[16]:
occurrences area density norm_occurrences
ecoregion_id month
13.0 5.0 23 17.133639 1.342388 0.009254
6.0 43 17.133639 2.509683 0.048364
7.0 29 17.133639 1.692577 0.061806
8.0 12 17.133639 0.700377 0.023700
9.0 3 17.133639 0.175094 0.001817
... ... ... ... ... ...
839.0 9.0 19 35.905513 0.529167 0.001052
10.0 3 35.905513 0.083553 0.000094
843.0 12.0 6 0.022340 268.576193 0.007607
844.0 4.0 6 0.053407 112.345201 0.006650
10.0 4 0.053407 74.896801 0.006063

1009 rows × 4 columns

In [17]:
occurrence_df.norm_occurrences.plot.hist()
Out[17]:
<Axes: ylabel='Frequency'>
No description has been provided for this image

Figure 1: This histogram plot shows that there is a higher frequency of normalized ecoregions with low occurrence values, and fewer ecoregions with higher occurrence values.

In [18]:
# Plotting a scatter with regular "occurrences" by month
occurrence_df.reset_index().plot.scatter(
    x='month', y='occurrences', c='ecoregion_id', logy=True
)
Out[18]:
<Axes: xlabel='month', ylabel='occurrences'>
No description has been provided for this image

Figure 2: Plot showing the un-normalized occurrence data of the number of bird observations.

In [19]:
# Plotting a scatter with normalized occurrences by month
occurrence_df.reset_index().plot.scatter(
    x='month', y='norm_occurrences', c='ecoregion_id', logy=True
)
Out[19]:
<Axes: xlabel='month', ylabel='norm_occurrences'>
No description has been provided for this image

Figure 3: Plot showing normallized occurrence data across the months.

In [20]:
# Simplify the geometry to speed up processing
eco_gdf.geometry = eco_gdf.simplify(.1, preserve_topology=False)

# Change the CRS to Mercator for mapping
eco_gdf = eco_gdf.to_crs(ccrs.Mercator())

# Check that the plot runs in a reasonable amount of time
eco_gdf.hvplot(geo=True, crs=ccrs.Mercator())
Out[20]:

Create an interactive plot of the migration data

Note: The plot will not display in the HTML version of this notebook.

In [21]:
# Join the occurrences with the plotting GeoDataFrame
occurrence_gdf = eco_gdf.join(occurrence_df[['norm_occurrences']])

# Remove any empty geometry rows. These can cause errors if they persist in the data. 
occurrence_gdf = occurrence_gdf[~occurrence_gdf.geometry.is_empty]

# Get the plot bounds so they don't change with the slider
# xmin, ymin, xmax, ymax = gbif_gdf.to_crs(ccrs.Mercator()).total_bounds

# This code above will plot the entire range of the data (even other continents/unusual sightings)
# I am just interested in the South-North American data, so I set the bounds manually.

xmin = -35000000.0
xmax = -2000000.0
ymax = 11000000.0
ymin = -4000000.0

# List month names on the slider
month_widget = pn.widgets.DiscreteSlider(
    options={
        calendar.month_name[month_num]: month_num
        for month_num in range(1,13)
    }
)

# Plot occurrence by ecoregion and month
migration_plot = (
    occurrence_gdf
    .hvplot(
        c='norm_occurrences',
        groupby='month',
        # Use background tiles
        geo=True, crs=ccrs.Mercator(), tiles='CartoLight',
        title="American Redstart Migration, 2019",
        xlim=(xmin, xmax), ylim=(ymin, ymax),
        frame_height=220,
        frame_width=220,
        widgets={'month': month_widget},
        widget_location='bottom'
    )
)

# Save the plot
migration_plot.save('redstart-migration-2019.html', embed=True)
# Display the plot
migration_plot
                                               
WARNING:W-1005 (FIXED_SIZING_MODE): 'fixed' sizing mode requires width and height to be set: figure(id='p11482', ...)

Out[21]:
BokehModel(combine_events=True, render_bundle={'docs_json': {'d0ef0dde-bf35-4167-b5ae-c00884d0885d': {'version…
In [22]:
occurrence_gdf.to_file("redstart_occurrence_2019.geojson", driver="GeoJSON")
INFO:Created 988 records