Loading Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ─────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ───────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(raster)
## Loading required package: sp
## 
## Attaching package: 'raster'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(terra) 
## terra 1.7.65
## 
## Attaching package: 'terra'
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
library(geosphere)
library(leaflet)
library(leaflet.extras2)

Elk Data Source

Our main data set is the Elk GPS collar data from the National Elk Refuge (2006-2015), published by the Northern Rocky Mountain Science Center in 2018. The data follows 17 adult female elk, captured in the National Elk Refuge, on their migration to Yellowstone National Park. Elk that did not migrate to Yellowstone National Park were excluded from the data set. The data is available here.

Cleaning Elk Data

The elk data contains a unique ID for each of the 17 elk, along with a date-time variable and coordinates. To clean the elk data, we drop the tz timezone variable because it is homogeneous. We drop the utm_x and utm_y variables because they encode the same locations in a different coordinate system, and we already have latitude and longitude. We create the dist_km variable, which measures the distance traveled in kilometers between consecutive observations, using the distHaversine function from the geosphere package. The Haversine distance is the length of the shortest path between two points on the surface of a sphere.
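For intuition, here is a minimal sketch of what distHaversine computes (the two points below are illustrative, not taken from the elk data):

p1 = c(-110.70, 43.65)  # longitude, latitude
p2 = c(-110.67, 44.10)

# distHaversine() returns meters, so we divide by 1000 to get kilometers
geosphere::distHaversine(p1, p2) / 1000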

elk = 
  read_csv('raw_data/Elk GPS collar data from National Elk Refuge 2006-2015.csv') |> 
  janitor::clean_names() |> 
  dplyr::mutate(
    # pull day and hour out of the date-time stamp
    day = day(dt),
    hour = hour(dt),
    # distance (km) from the previous fix, computed only when the
    # previous row belongs to the same elk
    dist_km = 
      ifelse(
        elk_id == lag(elk_id),
        geosphere::distHaversine(cbind(long, lat), cbind(lag(long), lag(lat)))/1000,
        NA)
    ) |> 
  dplyr::select(
    elk_id,
    dt,
    year, 
    month,
    day,
    hour, 
    lat,
    long,
    dist_km
  )
## Rows: 104913 Columns: 11
## ── Column specification ───────────────────────────────────────────
## Delimiter: ","
## chr  (1): TZ
## dbl  (9): Elk_ID, UTM_X, UTM_Y, Zone, Lat, Long, month, year, t
## dttm (1): DT
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
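Note that the ifelse(elk_id == lag(elk_id), ...) construction assumes the rows are already ordered by elk and time. An equivalent sketch under the same assumption computes the lag within each elk explicitly, so the first fix of each elk gets an NA distance without any cross-elk comparison (dist_km2 is a hypothetical name, used to avoid clobbering the column above):

elk_grouped = 
  elk |> 
  dplyr::arrange(elk_id, dt) |> 
  dplyr::group_by(elk_id) |> 
  dplyr::mutate(
    # lag() restarts within each group, so no elk_id comparison is needed
    dist_km2 = geosphere::distHaversine(
      cbind(long, lat), 
      cbind(dplyr::lag(long), dplyr::lag(lat)))/1000) |> 
  dplyr::ungroup()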

Land Cover Cleaning

Our second data set is the land cover data for the state of Wyoming, published by the GAP Analysis Project in 2019. This data set uses satellite imagery from 2011 to classify land cover at a thirty-meter resolution. While our elk migration data ranges from 2006 to 2015, we assume that land cover in the form of water or vegetation stays approximately constant over that period. The data is available here.

We use the terra package to import the data and reproject it into latitude and longitude coordinates (EPSG:4326). We save a cleaned, cropped version of the land cover data. For the purpose of this website, we won’t evaluate the code chunk below.

land = rast('raw_data/land_cover/land_cover.tif')

# Reprojecting in latitude and longitude
land_coord = project(land, "EPSG:4326")

plot(land_coord)

# Subset to the relevant area: the elk bounding box padded by 10%
# of its range on each side
min_long = elk |> pull(long) |> min()
max_long = elk |> pull(long) |> max()
rng_long = abs(min_long - max_long)
lowerleftlon = min_long - 0.1 * rng_long 
upperrightlon = max_long + 0.1 * rng_long
min_lat = elk |> pull(lat) |> min()
max_lat = elk |> pull(lat) |> max()
rng_lat = abs(min_lat - max_lat)
lowerleftlat = min_lat - 0.1 * rng_lat
upperrightlat = max_lat + 0.1 * rng_lat

# cropping with terra's ext(), since land_coord is a SpatRaster
# (raster::extent() is the raster-package equivalent)
small_land_coord = crop(
  land_coord, 
  ext(lowerleftlon, upperrightlon, lowerleftlat, upperrightlat))

plot(small_land_coord)

# writing raster
terra::writeRaster(small_land_coord, 'clean_data/land_cover.tif', overwrite=TRUE)

Using terra’s extract function, we get the land cover at the relevant points of the analysis, i.e., the locations of the elk.
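As a minimal synthetic illustration of what extract() returns here, a data frame with an ID column plus one column per raster layer (the toy raster and points below are made up):

land_demo = terra::rast(nrows = 10, ncols = 10, xmin = 0, xmax = 1, ymin = 0, ymax = 1)
terra::values(land_demo) <- 1:100
names(land_demo) <- 'land_cover'
pts_demo = data.frame(longitude = c(0.05, 0.95), latitude = c(0.95, 0.05))
terra::extract(land_demo, pts_demo)  # columns: ID, land_cover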

small_land_coord = rast('clean_data/land_cover.tif')

temp_elk = 
  elk |> 
  mutate(
    longitude = long,
    latitude  = lat) |> 
  dplyr::select(
    longitude,
    latitude
  )

elk_land_cover = terra::extract(x = small_land_coord, y = temp_elk)

We add the extracted land cover values to the elk data frame and save it.

elk = 
  elk |> 
  mutate(land_cover = elk_land_cover$land_cover)

elk |> write.csv('clean_data/elk.csv', row.names =  FALSE)

Cleaning Water Quality Data

The water quality data is sourced from the Greater Yellowstone Network and published by the National Park Service. The data contains readings on a variety of water quality measures, including mineral composition, flow speed, and temperature. The two locations relevant to this analysis are GRTE_SNR01 and GRTE_SNR02, where the elk migrate. The data is available here.

Locations

This data set gives the locations that water was sampled from. We drop the org_code variable because it is homogeneous.

water_quality_locations = 
  read_csv('raw_data/water_quality/Locations.csv')|> 
  janitor::clean_names() |> 
  dplyr::select(
    location_id,
    location_name,
    park_code, 
    location_type,
    latitude,
    longitude
  )
## Rows: 18 Columns: 40
## ── Column specification ───────────────────────────────────────────
## Delimiter: ","
## chr (38): Org_Code, Park_Code, Location_ID, Location_Name, Location_Type, La...
## dbl  (2): Latitude, Longitude
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Results

Here we read in the results of the water quality sampling.

water_quality_results = 
  read_csv('raw_data/water_quality/Results.csv')|> 
  janitor::clean_names()
## Warning: One or more parsing issues, call `problems()` on your data frame
## for details, e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 35397 Columns: 89
## ── Column specification ───────────────────────────────────────────
## Delimiter: ","
## chr  (87): Org_Code, Project_ID, Location_ID, Activity_ID, Activity_Type, Me...
## date  (2): Activity_Start_Date, Analysis_Start_Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

We find the most commonly observed water quality characteristics.

common_obs = 
  water_quality_results |> 
  drop_na(result_text) |> 
  group_by(characteristic_name) |> 
  summarize(n = n()) |> 
  arrange(desc(n)) |> 
  # keep characteristics with more than 875 readings, dropping the
  # free-text weather comments
  filter(
    n > 875, 
    characteristic_name != "Weather Comments (text)") |> 
  pull(characteristic_name)

We filter for the most common observations and for readings that we can use, listed in the acceptable_readings variable. We replace non-detected values with zero and select the relevant columns.

acceptable_readings = c("Detected and Quantified", "Not Detected", "Present Below Quantification Limit")

water_quality_results =
  water_quality_results |> 
  filter(
    characteristic_name %in% common_obs,
    result_detection_condition %in% acceptable_readings) |> 
  mutate(
    # non-detected values are recorded as "NULL"; treat them as zero
    result_text = stringr::str_replace(result_text, "NULL", "0"),
    result_unit = stringr::str_replace(result_unit, "None", ""),
    # fold the unit into the characteristic name, e.g. "Arsenic mg/l"
    characteristic_name = paste0(characteristic_name, " ", result_unit) |> trimws(),
    year = year(activity_start_date),
    month = month(activity_start_date),
    day = day(activity_start_date)
    ) |> 
  dplyr::select(
    location_id,
    activity_id,
    activity_type,
    activity_start_date,
    year,
    month,
    day,
    characteristic_name,
    result_text
  ) 

Combining Water Data

We filter for the GRTE_SNR01 and GRTE_SNR02 locations, the two locations relevant to our analysis. We recode the categorical flow readings (LOW, NORMAL, ABOVE NORMAL, FLOOD) to an ordinal 1-4 scale so they can be aggregated numerically, then aggregate the data at the month level by taking the minimum, mean, and maximum readings. We save the water quality data in raw, cleaned, and wide formats for ease of use.

water_quality = 
  water_quality_locations |> 
  left_join(water_quality_results) |> 
  filter(location_id %in% c('GRTE_SNR01', 'GRTE_SNR02'))
## Joining with `by = join_by(location_id)`
water_quality2 = 
  water_quality |> 
  mutate(
    # recode flow categories to an ordinal scale; 'ABOVE NORMAL' must be
    # replaced before 'NORMAL', which is a substring of it
    result_text = stringr::str_replace(result_text, 'LOW', '1'),
    result_text = stringr::str_replace(result_text, 'ABOVE NORMAL', '3'),
    result_text = stringr::str_replace(result_text, 'NORMAL', '2'),
    result_text = stringr::str_replace(result_text, 'FLOOD', '4')
  ) |> 
  group_by(
    location_id,
    location_name,
    year,
    month,
    characteristic_name
  ) |> 
  summarize(
    monthly_mean = mean(as.numeric(result_text), na.rm = TRUE),
    monthly_min = min(as.numeric(result_text), na.rm = TRUE),
    monthly_max = max(as.numeric(result_text), na.rm = TRUE)
  ) 
## `summarise()` has grouped output by 'location_id',
## 'location_name', 'year', 'month'. You can override using the
## `.groups` argument.
water_quality |> write.csv('clean_data/water_quality.csv', row.names = FALSE)
water_quality2 |> write.csv('clean_data/water_quality2.csv', row.names = FALSE)
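A side note on the recoding above: the order of the str_replace() calls matters because 'NORMAL' is a substring of 'ABOVE NORMAL'. Replacing 'NORMAL' first would corrupt the longer label:

stringr::str_replace("ABOVE NORMAL", "NORMAL", "2")        # "ABOVE 2"; as.numeric() would give NA
stringr::str_replace("ABOVE NORMAL", "ABOVE NORMAL", "3")  # "3", as intended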


clean_water = 
  water_quality2 |> 
  pivot_wider(
    names_from = characteristic_name,
    values_from = c('monthly_mean', 'monthly_min', 'monthly_max')
  )

write.csv(clean_water, 'clean_data/clean_water.csv', row.names = FALSE)
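With multiple values_from columns, pivot_wider() names the new columns as value_name combinations, e.g. monthly_mean_Arsenic mg/l in the combined data below. A toy illustration with made-up values:

toy = tibble::tibble(
  location = c('a', 'a'),
  characteristic_name = c('x', 'y'),
  monthly_mean = c(1, 2),
  monthly_max = c(3, 4))

tidyr::pivot_wider(
  toy,
  names_from = characteristic_name,
  values_from = c(monthly_mean, monthly_max))
# columns: location, monthly_mean_x, monthly_mean_y, monthly_max_x, monthly_max_y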

Reading in Weather Data

To explore the relationship between temperature, precipitation, and migration, we use data from the Global Historical Climatology Network. The data is available here.

We read in the data, filter for the four stations closest to the elk, restrict to the relevant date range, and average the temperature and snowfall readings across the stations. Since we only have weather data at four points in the general area of the elk, averaging the four stations gives a single representative daily reading.

# closest four stations
four_stations <- 
  c("SNAKE RIVER STATION, WY US", "MORAN 5 WNW, WY US", "BURRO HILL WYOMING, WY US", "MOOSE 1 NNE, WY US")

weather = read_csv("raw_data/raw_weather_data.csv") |> 
  janitor::clean_names() |> 
  filter(
    name %in% four_stations,
    date >= '2006-03-01', 
    date <= '2015-08-25') |> 
  # average each day's readings across the four stations
  group_by(date) |> 
  summarize(
    tavg = mean(tavg, na.rm = TRUE),
    tmin = mean(tmin, na.rm = TRUE),
    tmax = mean(tmax, na.rm = TRUE),
    prcp = mean(prcp, na.rm = TRUE),
    snow = mean(snow, na.rm = TRUE),
    snwd = mean(snwd, na.rm = TRUE)
  ) |> 
  mutate(
    year = year(date),
    month = month(date),
    day = day(date)
  )
## Warning: One or more parsing issues, call `problems()` on your data frame
## for details, e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 54745 Columns: 31
## ── Column specification ───────────────────────────────────────────
## Delimiter: ","
## chr   (2): STATION, NAME
## dbl  (21): LATITUDE, LONGITUDE, ELEVATION, AWND, DAPR, MDPR, PRCP, SNOW, SNW...
## lgl   (7): MDSF, WT02, WT03, WT04, WT05, WT06, WT11
## date  (1): DATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
weather |> write.csv('clean_data/weather.csv', row.names = FALSE)
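One caveat with this averaging: on days when every station is missing a measurement, mean() with na.rm = TRUE is taken over an empty vector and returns NaN rather than NA:

# removing the NAs leaves an empty vector, and mean() of an
# empty numeric vector is NaN
mean(c(NA_real_, NA_real_), na.rm = TRUE)  # NaN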

Combining All Data

We combine the data for ease of comparison. We begin with the elk data and left join the weather data by date. We then find the closest water quality location to each elk position and join on the relevant water quality data. Finally, we save the combined data.

all_data = 
  elk |> 
  left_join(weather |> dplyr::select(-date)) |> 
  mutate(
    # distances (km) to the two water quality stations; the coordinates
    # are hard-coded from the water quality locations data
    dist_to_GRTE_SNR01 = distHaversine(cbind(long, lat), c(-110.6716, 44.10177))/1000,
    dist_to_GRTE_SNR02 = distHaversine(cbind(long, lat), c(-110.7159, 43.65261))/1000,
    # assign each observation to its nearest station
    location_id = 
      ifelse(dist_to_GRTE_SNR01 < dist_to_GRTE_SNR02, 'GRTE_SNR01', 'GRTE_SNR02')
  ) |> 
  left_join(clean_water) |> 
  dplyr::select(-dist_to_GRTE_SNR01, -dist_to_GRTE_SNR02)
## Joining with `by = join_by(year, month, day)`
## Joining with `by = join_by(year, month, location_id)`
all_data  |> drop_na(location_name)
## # A tibble: 36,754 × 78
##    elk_id dt                   year month   day  hour   lat  long dist_km
##     <dbl> <dttm>              <dbl> <dbl> <int> <int> <dbl> <dbl>   <dbl>
##  1    572 2006-06-30 19:00:00  2006     7    30    19  44.1 -110.  1.13  
##  2    572 2006-07-01 00:01:00  2006     7     1     0  44.1 -110.  0.117 
##  3    572 2006-07-01 05:00:00  2006     7     1     5  44.1 -110.  0.962 
##  4    572 2006-07-01 10:01:00  2006     7     1    10  44.1 -110.  0.156 
##  5    572 2006-07-01 15:01:00  2006     7     1    15  44.1 -110.  0.369 
##  6    572 2006-07-01 20:00:00  2006     7     1    20  44.1 -110.  0.182 
##  7    572 2006-07-02 01:01:00  2006     7     2     1  44.1 -110.  0.127 
##  8    572 2006-07-02 06:01:00  2006     7     2     6  44.1 -110.  0.800 
##  9    572 2006-07-02 11:03:00  2006     7     2    11  44.1 -110.  0.0780
## 10    572 2006-07-02 16:00:00  2006     7     2    16  44.1 -110.  0.719 
## # ℹ 36,744 more rows
## # ℹ 69 more variables: land_cover <dbl>, tavg <dbl>, tmin <dbl>, tmax <dbl>,
## #   prcp <dbl>, snow <dbl>, snwd <dbl>, location_id <chr>, location_name <chr>,
## #   `monthly_mean_Arsenic mg/l` <dbl>, `monthly_mean_Calcium mg/l` <dbl>,
## #   `monthly_mean_Chloride mg/l` <dbl>,
## #   `monthly_mean_Dissolved oxygen (DO) mg/l` <dbl>,
## #   `monthly_mean_Flow cfs` <dbl>, …
all_data |> write.csv('clean_data/all_data.csv', row.names =  FALSE)
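As a quick sanity check, a sketch of counting how many elk observations actually matched a water quality record, using the location_name column created by the join:

all_data |> 
  dplyr::summarize(
    total_obs = dplyr::n(),
    with_water_data = sum(!is.na(location_name)))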