# Important: If you haven't already, you need to install pygbif using the command below
# Run this command in the terminal to install: pip install pygbif

# Import packages
import os # For file paths, data management, interacting with the operating system
import pathlib # Object oriented file path management
import time # Time how long operations take in Python
import zipfile # Deal with zip files
from getpass import getpass # Security login for downloading GBIF data
from glob import glob # Search for specific file extensions

# Libraries for data analysis 
import pandas as pd # Data analysis for tabular data
import geopandas as gpd # Data analysis for geospatial data
import pygbif.occurrences as occ # Get occurrence data
import pygbif.species as species # Get species data
import requests # Download data

# Libraries for Dynamic mapping
import cartopy.crs as ccrs #Coordinate reference systems 
import panel as pn #customize the dynamic map
import hvplot.pandas #make the holoview plot with geo-dataframe
import calendar # Change month numbers to month names

# Define data directory in user's home directory
data_dir = os.path.join(
    # Home directory
    pathlib.Path.home(),
    # Earth analytics data directory
    'earth-analytics',
    'data',
    # Project directory
    'baltimore-oriole-migration',
)
print(data_dir)
# Make the directory using the path defined above
os.makedirs(data_dir, exist_ok=True)

C:\Users\warno\earth-analytics\data\baltimore-oriole-migration

# GBIF LOGIN CODE CHUNK
# Code credit: Elsa Culler
# This code allows us to download data in Python from gbif.org
# This code ASKS for your credentials for gbif.org
# and saves it for the rest of the session.
# NEVER put your credentials into your code

# GBIF needs a username, password, and email 
# All 3 need to match the account
reset = False

# Request and store username
if (not ('GBIF_USER'  in os.environ)) or reset:
    os.environ['GBIF_USER'] = input('GBIF username:')

# Securely request and store password
if (not ('GBIF_PWD'  in os.environ)) or reset:
    os.environ['GBIF_PWD'] = getpass('GBIF password:')
    
# Request and store account email address
if (not ('GBIF_EMAIL'  in os.environ)) or reset:
    os.environ['GBIF_EMAIL'] = input('GBIF email:')

# Search for the Baltimore Oriole (Icterus galbula)
backbone = species.name_backbone(name='Icterus galbula')

# Save the species key
species_key = backbone["speciesKey"]
backbone

{'usageKey': 2484340,
 'scientificName': 'Icterus galbula (Linnaeus, 1758)',
 'canonicalName': 'Icterus galbula',
 'rank': 'SPECIES',
 'status': 'ACCEPTED',
 'confidence': 99,
 'matchType': 'EXACT',
 'kingdom': 'Animalia',
 'phylum': 'Chordata',
 'order': 'Passeriformes',
 'family': 'Icteridae',
 'genus': 'Icterus',
 'species': 'Icterus galbula',
 'kingdomKey': 1,
 'phylumKey': 44,
 'classKey': 212,
 'orderKey': 729,
 'familyKey': 6176,
 'genusKey': 2484282,
 'speciesKey': 2484340,
 'class': 'Aves'}

gbif_pattern = os.path.join(data_dir, '*.csv')
if not glob(gbif_pattern):
    # Only submit one request
    if not 'GBIF_DOWNLOAD_KEY' in os.environ:
        # Submit query to GBIF
        gbif_query = occ.download([
            f"speciesKey = { species_key }",
            "hasCoordinate = True",
            "year = 2024",
        ])
        os.environ['GBIF_DOWNLOAD_KEY'] = gbif_query[0]
    
    # Wait for download to build
    download_key = os.environ['GBIF_DOWNLOAD_KEY']
    wait = occ.download_meta(download_key)['status']
    while not  wait=='SUCCEEDED':
        wait = occ.download_meta(download_key)['status']
        time.sleep(5)

    # Download GBIF data
    download_info = occ.download_get(
        os.environ['GBIF_DOWNLOAD_KEY'],
        path=data_dir)
    
    # Unzip GBIF data
    with zipfile.ZipFile(download_info['path']) as download_zip:
        download_zip.extractall(path=data_dir)

# Find extracted .csv file path and take 1st result
gbif_path = glob(gbif_pattern)[0]

with open(gbif_path) as f:
    for i in range(2):
        print(f.readline().strip())

gbifID	datasetKey	occurrenceID	kingdom	phylum	class	order	family	genus	species	infraspecificEpithet	taxonRank	scientificName	verbatimScientificName	verbatimScientificNameAuthorship	countryCode	locality	stateProvince	occurrenceStatus	individualCount	publishingOrgKey	decimalLatitude	decimalLongitude	coordinateUncertaintyInMeters	coordinatePrecision	elevation	elevationAccuracy	depth	depthAccuracy	eventDate	day	month	year	taxonKey	speciesKey	basisOfRecord	institutionCode	collectionCode	catalogNumber	recordNumber	identifiedBy	dateIdentified	license	rightsHolder	recordedBy	typeStatus	establishmentMeans	lastInterpreted	mediaType	issue
4923513378	36f15a36-6b45-442e-9c31-cd633423aee0	b7ea4555-d6e0-49d9-adaa-ba3f5cad4d85	Animalia	Chordata	Aves	Passeriformes	Icteridae	Icterus	Icterus galbula	galbula	SUBSPECIES	Icterus galbula galbula (Linnaeus, 1758)	Icterus galbula galbula		US	McCormick Place	Illinois	PRESENT		7b8aff00-a9f8-11d8-944b-b8a03c50a862	41.852812	-87.611721	2454.0						2023-05-16	16	5	2023	5229901	2484340	PRESERVED_SPECIMEN	FMNH	Birds	530909	4623183			CC0_1_0	The Field Museum of Natural History	D. E. Willard			2025-10-07T19:21:28.597Z		COORDINATE_ROUNDED;INSTITUTION_MATCH_FUZZY;COLLECTION_MATCH_FUZZY

# Load the GBIF data, this time defining the delimiter (tabular) represents as '\t'
# We also set the index to gbifID, and select the columns of interest (lat, long, and month)
gbif_df = pd.read_csv(
    gbif_path,
    delimiter='\t',
    index_col='gbifID',
    usecols=['gbifID','decimalLongitude','decimalLatitude','month']
)
gbif_df.head()

# Converting the dataframe to a geodataframe
# Using geopandas library
gbif_gdf = (
    gpd.GeoDataFrame(
        gbif_df, 
        geometry=gpd.points_from_xy(
            gbif_df.decimalLongitude, 
            gbif_df.decimalLatitude), 
        crs="EPSG:4326") # Manually set the crs to match the ecoregions data
    # Select the desired columns
    [['month','geometry']]
)
gbif_gdf

# Ecoregions data from GeographyRealm, RESOLVE, and OneEarth: 
# GeographyRealm: https://www.geographyrealm.com/terrestrial-ecoregions-gis-data/
# OneEarth: https://www.oneearth.org/announcing-the-release-of-ecoregion-snapshots/
eco_url = (
    "https://storage.googleapis.com/teow2016/Ecoregions2017.zip")

# Set up a path to save the data to your machine
eco_dir = os.path.join(data_dir, 'resolve_ecoregions') 
# Make ecoregions directory
os.makedirs(eco_dir, exist_ok=True)

# Join ecoregions shapefile path
eco_path = os.path.join(eco_dir, 'ecoregions.shp')

# Only download once
if not os.path.exists(eco_path):
    my_gdf = gpd.read_file(eco_url)
    my_gdf.to_file(eco_path)

# Opening the download and renaming some of the column names
eco_gdf = (
    gpd.read_file(eco_path)
    [['OBJECTID','ECO_NAME','SHAPE_AREA','geometry']]
    .rename(columns={
        'OBJECTID': 'ecoregion_id',
        'ECO_NAME': 'name',
        'SHAPE_AREA': 'area'
    })
    .set_index('ecoregion_id')
)

# Plot the ecoregions quickly to check download
eco_gdf.plot()
eco_gdf.head()

gbif_ecoregion_gdf = (
    eco_gdf
    # Match the CRS of the GBIF data and the ecoregions
    .to_crs(gbif_gdf.crs)
    # Find ecoregion for each observation
    .sjoin(
        gbif_gdf,
        how='inner', 
        predicate='contains')
)
gbif_ecoregion_gdf

# Count the observations in each ecoregion each month
occurrence_df = (
    gbif_ecoregion_gdf
    # Reset index
    .reset_index()
    # For each ecoregion, for each month...
    .groupby(['ecoregion_id','month'])
    # ...count the number of occurrences
    # creating a new column 'occurrences' that counts all the gbif IDs in a group
    .agg(
        occurrences=('gbifID', 'count'),
        area=('area','first')) 
)

#Normalize by area
occurrence_df['density'] = (
    occurrence_df.occurrences
    / occurrence_df.area
)

# Get rid of rare observations (possible misidentification?)
occurrence_df = occurrence_df[occurrence_df.occurrences>1]

# Take the mean by ecoregion
mean_occurrences_by_ecoregion = (
    occurrence_df
    .groupby('ecoregion_id')
    .mean()
)
mean_occurrences_by_ecoregion

# Take the mean by month
mean_occurrences_by_month = (
    occurrence_df
    .groupby('month')
    .mean()
)
mean_occurrences_by_month

# Normalize by space and time for sampling effort
occurrence_df['norm_occurrences'] = (
    occurrence_df[['density']]
    / mean_occurrences_by_ecoregion[['density']]
    / mean_occurrences_by_month[['density']]
)
occurrence_df

occurrence_df.norm_occurrences.plot.hist()

<Axes: ylabel='Frequency'>

# Plotting a scatter with regular "occurrences" by month
occurrence_df.reset_index().plot.scatter(
    x='month', y='occurrences', c='ecoregion_id', logy=True
)

<Axes: xlabel='month', ylabel='occurrences'>

# Plotting a scatter with normalized occurrences by month
occurrence_df.reset_index().plot.scatter(
    x='month', y='norm_occurrences', c='ecoregion_id', logy=True
)

<Axes: xlabel='month', ylabel='norm_occurrences'>

# Simplify the geometry to speed up processing
eco_gdf.geometry = eco_gdf.simplify(.1, preserve_topology=False)

# Change the CRS to Mercator for mapping
eco_gdf = eco_gdf.to_crs(ccrs.Mercator())

# Check that the plot runs in a reasonable amount of time
eco_gdf.hvplot(geo=True, crs=ccrs.Mercator())

# Join the occurrences with the plotting GeoDataFrame
occurrence_gdf = eco_gdf.join(occurrence_df[['norm_occurrences']])

# Remove any empty geometry rows. These can cause errors if they persist in the data. 
occurrence_gdf = occurrence_gdf[~occurrence_gdf.geometry.is_empty]

# Get the plot bounds so they don't change with the slider
# xmin, ymin, xmax, ymax = gbif_gdf.to_crs(ccrs.Mercator()).total_bounds

# This code above will plot the entire range of the data (even other continents/unusual sightings)
# I am just interested in the South-North American data, so I set the bounds manually.

xmin = -14821585.49397232
xmax = -4000000.0
ymax = 8826427.537488604
ymin = -1000000.0

# List month names on the slider
month_widget = pn.widgets.DiscreteSlider(
    options={
        calendar.month_name[month_num]: month_num
        for month_num in range(1,13)
    }
)

# Plot occurrence by ecoregion and month
migration_plot = (
    occurrence_gdf
    .hvplot(
        c='norm_occurrences',
        groupby='month',
        # Use background tiles
        geo=True, crs=ccrs.Mercator(), tiles='CartoLight',
        title="Baltimore Oriole Migration, 2024",
        xlim=(xmin, xmax), ylim=(ymin, ymax),
        frame_height=500,
        widgets={'month': month_widget},
        widget_location='bottom'
    )
)

# Save the plot
migration_plot.save('baltimore-oriole-migration.html', embed=True)
# Display the plot
migration_plot

WARNING:W-1005 (FIXED_SIZING_MODE): 'fixed' sizing mode requires width and height to be set: figure(id='p204203', ...)

BokehModel(combine_events=True, render_bundle={'docs_json': {'cd833894-6a91-45d1-ab89-a745994ab6d8': {'version…

	decimalLatitude	decimalLongitude	month
gbifID
4923513378	41.852812	-87.611721	5.0
4444338249	29.757400	-95.392000	4.0
4158425811	41.913727	-82.509377	5.0
4059974834	56.351650	12.815890	3.0
5139450263	56.349510	12.807660	1.0

	month	geometry
gbifID
4923513378	5.0	POINT (-87.61172 41.85281)
4444338249	4.0	POINT (-95.392 29.7574)
4158425811	5.0	POINT (-82.50938 41.91373)
4059974834	3.0	POINT (12.81589 56.35165)
5139450263	1.0	POINT (12.80766 56.34951)
...	...	...
4173218595	5.0	POINT (-83.4423 44.2588)
4173220499	5.0	POINT (-83.1897 41.6275)
4173225614	5.0	POINT (-83.4423 44.2588)
4173224607	5.0	POINT (-83.4423 44.2588)
4444336253	4.0	POINT (-95.6255 30.2757)

	name	area	geometry
ecoregion_id
1.0	Adelie Land tundra	0.038948	MULTIPOLYGON (((158.7141 -69.60657, 158.71264 ...
2.0	Admiralty Islands lowland rain forests	0.170599	MULTIPOLYGON (((147.28819 -2.57589, 147.2715 -...
3.0	Aegean and Western Turkey sclerophyllous and m...	13.844952	MULTIPOLYGON (((26.88659 35.32161, 26.88297 35...
4.0	Afghan Mountains semi-desert	1.355536	MULTIPOLYGON (((65.48655 34.71401, 65.52872 34...
5.0	Ahklun and Kilbuck Upland Tundra	8.196573	MULTIPOLYGON (((-160.26404 58.64097, -160.2673...

	name	area	geometry	gbifID	month
ecoregion_id
13.0	Alberta-British Columbia foothills forests	17.133639	MULTIPOLYGON (((-119.53979 55.81661, -119.5443...	5395228674	6.0
13.0	Alberta-British Columbia foothills forests	17.133639	MULTIPOLYGON (((-119.53979 55.81661, -119.5443...	4843921873	6.0
13.0	Alberta-British Columbia foothills forests	17.133639	MULTIPOLYGON (((-119.53979 55.81661, -119.5443...	4845489589	6.0
13.0	Alberta-British Columbia foothills forests	17.133639	MULTIPOLYGON (((-119.53979 55.81661, -119.5443...	4844073404	6.0
13.0	Alberta-British Columbia foothills forests	17.133639	MULTIPOLYGON (((-119.53979 55.81661, -119.5443...	4620230968	7.0
...	...	...	...	...	...
839.0	Northern Rockies conifer forests	35.905513	POLYGON ((-119.99977 54.53117, -119.8914 54.45...	5317118606	8.0
839.0	Northern Rockies conifer forests	35.905513	POLYGON ((-119.99977 54.53117, -119.8914 54.45...	5491326849	8.0
839.0	Northern Rockies conifer forests	35.905513	POLYGON ((-119.99977 54.53117, -119.8914 54.45...	4779690761	8.0
839.0	Northern Rockies conifer forests	35.905513	POLYGON ((-119.99977 54.53117, -119.8914 54.45...	5315148847	5.0
839.0	Northern Rockies conifer forests	35.905513	POLYGON ((-119.99977 54.53117, -119.8914 54.45...	4804308520	6.0

	occurrences	area	density
ecoregion_id
13.0	23.250000	17.133639	1.356980
17.0	1820.714286	7.958751	228.768855
23.0	8.166667	3.346216	2.440568
33.0	1304.750000	16.637804	78.420808
34.0	1145.875000	18.674884	61.359151
...	...	...	...
796.0	144.500000	49.311356	2.930359
809.0	27.142857	4.295824	6.318429
810.0	37.888889	5.968650	6.347984
833.0	4.400000	0.610793	7.203744
839.0	32.000000	35.905513	0.891228

Study of Annual Migration Patterns of the Baltimore Oriole¶

Background on the Baltimore Oriole (Icterus galbula)¶

Migration Mapping¶

Migration Data from GBIF¶

Ecoregion Data¶

Create data directory in user's home directory¶

Download Data from GBIF¶

Search for the Baltimore Oriole (Icterus galbula)¶

Submit download request to GBIF and download occurrence data for year of interest¶

Read in the downloaded data from GBIF¶

Load the GBIF Data¶

Converting the dataframe to a geodataframe¶

Set up the ecoregion boundary URL¶

Open the downloaded ecoregion boundaries¶

Join the ecoregion and GBIF data¶

Count observations in each ecoregion each month¶

Normalize by area¶

Take the mean by ecoregion¶

Take the mean by month¶

Normalize by space and time for sampling effort¶

Plotting geographically¶

Create an interactive plot of the migration data¶

	occurrences	area	density
month
1.0	186.096774	7.103442	119.720622
2.0	167.935484	5.971543	114.267011
3.0	147.676923	5.911391	119.012923
4.0	339.815789	9.274769	57.863214
5.0	4020.181818	17.520827	324.009395
6.0	1846.558140	24.536460	149.367654
7.0	731.162162	22.870137	60.125886
8.0	656.714286	19.313669	60.287617
9.0	214.259740	13.187489	46.109531
10.0	149.589041	8.409373	63.454302
11.0	126.456140	8.570266	64.232304
12.0	139.112676	7.586825	63.587655