Back to Article
Tracer selection using design specific mixtures
Download Source

Tracer selection using design specific mixtures

Author

Alex Koiter

Load Libraries

In [1]:
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggfortify)
library(patchwork)

% classified function

In [2]:
classify <- function(data, model, by = "site", type = "percent") {
  if(type == "count") x <- table(as.data.frame(data)[,"site"], model$class)
  if(type == "percent") x <- diag(prop.table(table(as.data.frame(data)[,"site"], model$class), 1)) * 100
  x
}

Load in data

In [3]:
geo_results <- read.csv(here::here("./notebooks/Geochemistry analysis - Copy 2.csv")) %>%
  pivot_longer(cols = Ag:Zr, names_to = "Fingerprint", values_to = "value") %>%
  filter(Fingerprint %in% c("Ag", "Al", "As","B","Ba","Be","Bi","Ca","Cd","Ce","Co", "Cr", "Cs", "Cu", "Fe", "Ga", "Hf", "Hg", "In", "K", "La", "Li", "Mg", "Mn", "Mo", "Nb", "Ni", "P", "Pb", "Rb", "S", "Sb", "Sc", "Se", "Sn", "Sr", "Te", "Th", "Tl", "U", "V", "Y", "Zn", "Zr")) %>% # excludes fingerprints that are below level of detection
  dplyr::select(-X) %>% # don't need this column
  filter(sample_design %in% c("Grid", "Transect", "Likely to erode")) 

col_results <- read.csv(here::here("./notebooks/final results revised.csv")) %>%
  pivot_longer(cols = X:B, names_to = "Fingerprint", values_to = "value") %>%
  dplyr::select(-X.1) %>% # don't need this column
  filter(sample_design %in% c("Grid", "Transect", "Likely to erode")) %>%
  mutate(Fingerprint = paste0(Fingerprint, "_col")) # appended _col as some of the colour coefficients have the same id eg Boron = B and Blue also = B

f_order <- c("Ag", "Al", "As","B","Ba","Be","Bi","Ca","Cd","Ce","Co", "Cr", "Cs", "Cu", "Fe", "Ga", "Hf", "Hg", "In", "K", "La", "Li", "Mg", "Mn", "Mo", "Nb", "Ni", "P", "Pb", "Rb", "S", "Sb", "Sc", "Se", "Sn", "Sr", "Te", "Th", "Tl", "U", "V", "Y", "Zn", "Zr", "R_col", "G_col", "B_col", "x_col", "y_col", "Y_col", "X_col", "Z_col","L_col", "a_col", "b_col", "u_col", "v_col", "c_col", "h_col")

# Bind data sets
results <- geo_results %>%
  bind_rows(col_results) 

Virtual mixtures

In [4]:
proportions <- seq(0, 1, 0.05)

mixtures <- results %>%
  group_by(Fingerprint, site, sample_design) %>%
  summarise(avg =  mean(value)) %>%
  pivot_wider(names_from = site, values_from = avg) %>%
  group_by(sample_design) %>%
  mutate(mix_ag = map(Agriculture, ~.x * (1 - proportions)),
         mix_forest = map(Forest, ~.x * proportions)) %>%
  group_by(Fingerprint, Agriculture, Forest, sample_design) %>%
  summarize(mix = map2(mix_ag, mix_forest, ~data.frame(mix = .x + .y, prop_forest = proportions))) %>%
  unnest(mix) %>%
  pivot_wider(id_cols = c("Fingerprint", "Agriculture", "Forest", "sample_design"), names_from = prop_forest, values_from = mix) 
`summarise()` has grouped output by 'Fingerprint', 'site'. You can override
using the `.groups` argument.
`summarise()` has grouped output by 'Fingerprint', 'Agriculture', 'Forest'. You
can override using the `.groups` argument.

Range test

In [5]:
range <- results %>%
  group_by(site, sample_design, Fingerprint) %>%
  summarise(upper_lim = quantile(value, 0.75),
            lower_lim = quantile(value, 0.25)) %>%
  group_by(Fingerprint, sample_design) %>%
  summarise(lower_lim = min(lower_lim),
            upper_lim = max(upper_lim)) %>%
  ungroup() %>%
  left_join(mixtures) %>%
  dplyr::select(-c(Agriculture, Forest)) %>%
  group_by(Fingerprint, sample_design) %>%
  mutate(pass = all(round(`0`:`1`, 5) >= round(lower_lim, 5)) & all(round(`0`:`1`, 5) <= round(upper_lim, 5)))  ## rounding as differences as small as 10^-18 were being removed otherwise
`summarise()` has grouped output by 'site', 'sample_design'. You can override
using the `.groups` argument.
`summarise()` has grouped output by 'Fingerprint'. You can override using the
`.groups` argument.
Joining with `by = join_by(Fingerprint, sample_design)`
grid_range <- filter(range, pass == TRUE , sample_design == "Grid") %>%
  arrange(factor(Fingerprint, levels = f_order)) %>%
  pull(Fingerprint)

transect_range <- filter(range, pass == TRUE , sample_design == "Transect") %>%
  arrange(factor(Fingerprint, levels = f_order)) %>%
  pull(Fingerprint)

likely_range <- filter(range, pass == TRUE , sample_design == "Likely to erode") %>%
  arrange(factor(Fingerprint, levels = f_order)) %>%
  pull(Fingerprint)

range_results <- filter(range, pass == TRUE) %>%
  dplyr::select(Fingerprint, sample_design)

Mann-W test

In [6]:
wilcox_test <- range_results %>%
  inner_join(results) %>%
  nest(data = c(-sample_design, -Fingerprint)) %>%
  mutate(test = map(data, ~wilcox.test(value ~ site, data = ., exact = FALSE))) %>%
  mutate(p_value = map_dbl(test, ~ .$p.value),
         statistic =  map_dbl(test, ~ .$statistic))
Joining with `by = join_by(Fingerprint, sample_design)`
grid_wilcox <- filter(wilcox_test, p_value < 0.01 , sample_design == "Grid") %>%
  arrange(factor(Fingerprint, levels = f_order))
grid_wilcox
# A tibble: 49 × 6
# Groups:   Fingerprint, sample_design [49]
   Fingerprint sample_design data              test     p_value statistic
   <chr>       <chr>         <list>            <list>     <dbl>     <dbl>
 1 Ag          Grid          <tibble [98 × 3]> <htest> 8.43e-15     2282.
 2 Al          Grid          <tibble [98 × 3]> <htest> 1.48e-17     2401 
 3 As          Grid          <tibble [98 × 3]> <htest> 8.37e-17     2372.
 4 Be          Grid          <tibble [98 × 3]> <htest> 1.47e-17     2401 
 5 Bi          Grid          <tibble [98 × 3]> <htest> 2.85e-16     2349 
 6 Ca          Grid          <tibble [98 × 3]> <htest> 8.60e- 8     1954.
 7 Cd          Grid          <tibble [98 × 3]> <htest> 1.54e- 7     1939 
 8 Ce          Grid          <tibble [98 × 3]> <htest> 1.28e- 8     2002.
 9 Co          Grid          <tibble [98 × 3]> <htest> 3.35e-11     2134.
10 Cr          Grid          <tibble [98 × 3]> <htest> 1.16e-17     2400 
# ℹ 39 more rows
pull(grid_wilcox, Fingerprint)
 [1] "Ag"    "Al"    "As"    "Be"    "Bi"    "Ca"    "Cd"    "Ce"    "Co"   
[10] "Cr"    "Cs"    "Cu"    "Fe"    "Ga"    "Hf"    "Hg"    "In"    "K"    
[19] "La"    "Li"    "Mg"    "Nb"    "Ni"    "Rb"    "S"     "Sb"    "Sc"   
[28] "Se"    "Sn"    "Sr"    "Te"    "Th"    "U"     "V"     "Y"     "Zr"   
[37] "G_col" "B_col" "x_col" "Y_col" "X_col" "Z_col" "L_col" "a_col" "b_col"
[46] "u_col" "v_col" "c_col" "h_col"
transect_wilcox <- filter(wilcox_test, p_value < 0.01 , sample_design == "Transect") %>%
  arrange(factor(Fingerprint, levels = f_order))
transect_wilcox
# A tibble: 38 × 6
# Groups:   Fingerprint, sample_design [38]
   Fingerprint sample_design data              test       p_value statistic
   <chr>       <chr>         <list>            <list>       <dbl>     <dbl>
 1 Ag          Transect      <tibble [28 × 3]> <htest> 0.000319        176 
 2 Al          Transect      <tibble [28 × 3]> <htest> 0.00000734      196 
 3 As          Transect      <tibble [28 × 3]> <htest> 0.00000920      195 
 4 B           Transect      <tibble [28 × 3]> <htest> 0.00000106      196 
 5 Be          Transect      <tibble [28 × 3]> <htest> 0.00000734      196 
 6 Bi          Transect      <tibble [28 × 3]> <htest> 0.0000229       190 
 7 Ca          Transect      <tibble [28 × 3]> <htest> 0.000310        177 
 8 Cd          Transect      <tibble [28 × 3]> <htest> 0.00930         155 
 9 Ce          Transect      <tibble [28 × 3]> <htest> 0.00152         168.
10 Co          Transect      <tibble [28 × 3]> <htest> 0.000433        175 
# ℹ 28 more rows
pull(transect_wilcox, Fingerprint)
 [1] "Ag"    "Al"    "As"    "B"     "Be"    "Bi"    "Ca"    "Cd"    "Ce"   
[10] "Co"    "Cr"    "Cs"    "Cu"    "Fe"    "Ga"    "In"    "La"    "Li"   
[19] "Nb"    "Ni"    "S"     "Sb"    "Sc"    "Se"    "Sn"    "Sr"    "Te"   
[28] "U"     "V"     "Y"     "B_col" "x_col" "Z_col" "a_col" "b_col" "u_col"
[37] "c_col" "h_col"
likely_wilcox <- filter(wilcox_test, p_value < 0.01 , sample_design == "Likely to erode") %>%
  arrange(factor(Fingerprint, levels = f_order))
likely_wilcox
# A tibble: 48 × 6
# Groups:   Fingerprint, sample_design [48]
   Fingerprint sample_design   data              test     p_value statistic
   <chr>       <chr>           <list>            <list>     <dbl>     <dbl>
 1 Ag          Likely to erode <tibble [16 × 3]> <htest> 0.000612        64
 2 Al          Likely to erode <tibble [16 × 3]> <htest> 0.000923        64
 3 As          Likely to erode <tibble [16 × 3]> <htest> 0.000931        64
 4 B           Likely to erode <tibble [16 × 3]> <htest> 0.000325        64
 5 Ba          Likely to erode <tibble [16 × 3]> <htest> 0.000830         0
 6 Be          Likely to erode <tibble [16 × 3]> <htest> 0.000923        64
 7 Bi          Likely to erode <tibble [16 × 3]> <htest> 0.000830        64
 8 Ca          Likely to erode <tibble [16 × 3]> <htest> 0.00741          6
 9 Cd          Likely to erode <tibble [16 × 3]> <htest> 0.000923        64
10 Co          Likely to erode <tibble [16 × 3]> <htest> 0.00710         58
# ℹ 38 more rows
pull(likely_wilcox, Fingerprint)
 [1] "Ag"    "Al"    "As"    "B"     "Ba"    "Be"    "Bi"    "Ca"    "Cd"   
[10] "Co"    "Cr"    "Cs"    "Cu"    "Fe"    "Ga"    "Hg"    "In"    "K"    
[19] "Li"    "Mg"    "Mo"    "Nb"    "Ni"    "Pb"    "Rb"    "Sb"    "Sc"   
[28] "Se"    "Sn"    "Sr"    "Th"    "Tl"    "U"     "V"     "Y"     "Zn"   
[37] "R_col" "G_col" "x_col" "y_col" "Y_col" "X_col" "L_col" "a_col" "b_col"
[46] "u_col" "v_col" "c_col"
wilcox_results <- filter(wilcox_test, p_value < 0.01) %>%
  dplyr::select(Fingerprint, sample_design)

DFA

In [7]:
dfa_test <- wilcox_results %>%
  inner_join(results) %>%
  ungroup() %>%
  nest(data = c(-sample_design)) %>%
  mutate(newdata = map(data, ~pivot_wider(., names_from = Fingerprint, values_from = value))) %>%
  mutate(newdata = map(newdata, ~dplyr::select(., -sample_number))) %>%
  mutate(test = map(newdata, ~ klaR::greedy.wilks(site ~ ., data =. , niveau = 0.1, na.action = "na.omit"))) %>%
  mutate(Fingerprint = map(test, ~ .$results$vars)) %>%
  mutate(wilks = map(test, ~.x$results$Wilks.lambda))
Joining with `by = join_by(Fingerprint, sample_design)`
grid_dfa <- dfa_test %>%
  unnest(Fingerprint) %>%
  filter(sample_design == "Grid") %>%
  pull(Fingerprint)
grid_dfa
 [1] "Li"    "a_col" "Fe"    "Co"    "Hg"    "x_col" "Cs"    "La"    "Ni"   
[10] "Nb"    "h_col" "b_col" "Rb"    "Ca"    "Sr"    "c_col"
transect_dfa <- 
  dfa_test %>%
  unnest(Fingerprint) %>%
  filter(sample_design == "Transect") %>%
  pull(Fingerprint)
transect_dfa
[1] "Li" "Cu" "Ca" "Be" "Co"
likely_dfa <- 
  dfa_test %>%
  unnest(Fingerprint) %>%
  filter(sample_design == "Likely to erode") %>%
  pull(Fingerprint)
likely_dfa
[1] "Li" "Sc" "Sn"

PCA

In [8]:
df_grid <- results %>% 
  filter(sample_design == "Grid") %>%
  filter(Fingerprint %in% grid_dfa) %>%
  pivot_wider(names_from = Fingerprint, values_from = value) %>%
  dplyr::select(-sample_design, -sample_number)

df_transect<- results %>%
  filter(sample_design == "Transect") %>%
  filter(Fingerprint %in% transect_dfa) %>%
  pivot_wider(names_from = Fingerprint, values_from = value) %>%
  dplyr::select(-sample_design, -sample_number)

df_likely <- results %>%
  filter(sample_design == "Likely to erode") %>%
  filter(Fingerprint %in% likely_dfa) %>%
  pivot_wider(names_from = Fingerprint, values_from = value) %>%
  dplyr::select(-sample_design, -sample_number)

pca_grid <- prcomp(select(df_grid, 2:ncol(df_grid)), scale. = TRUE)

autoplot(pca_grid)

p1 <- autoplot(pca_grid, 
               data = df_grid, 
               colour = 'site',
               frame = TRUE, 
               frame.type = 'norm') +
  theme_bw() +
  scale_colour_viridis_d(begin = 0, end = 0.6) +
  scale_fill_viridis_d(begin = 0, end = 0.6) +
  theme(legend.title = element_blank()) +
  labs(title = "Grid")
p1

pca_transect <- prcomp(select(df_transect, 2:ncol(df_transect)) , scale. = TRUE)
autoplot(pca_transect)

p2 <- autoplot(pca_transect, 
               data = df_transect, 
               colour = 'site', 
               frame = TRUE, 
               frame.type = 'norm') +
  theme_bw() +
  scale_colour_viridis_d(begin = 0, end = 0.6) +
  scale_fill_viridis_d(begin = 0, end = 0.6) +
  theme(legend.title = element_blank()) +
  labs(title = "Transect")
p2

pca_likely <- prcomp(select(df_likely, 2:ncol(df_likely)) , scale. = TRUE)
autoplot(pca_likely)

p3 <- autoplot(pca_likely, 
               data = df_likely, 
               colour = 'site', 
               frame = TRUE, 
               frame.type = 'norm') +
  theme_bw() +
  scale_colour_viridis_d(begin = 0, end = 0.6) +
  scale_fill_viridis_d(begin = 0, end = 0.6) +
  theme(legend.title = element_blank())+
  labs(title = "Likely to erode")
p3

p4 <- p1 + p2 + p3+
  plot_layout(guides = 'collect') & theme(legend.position = "bottom")
p4

#ggsave(filename = "Figures R1/design_pca.png", plot = p4, width = 174, height = 100, units = "mm", dpi = 600)

Assess accuracy of discrimination

In [9]:
grid.lda <- MASS::lda(site ~ ., data = df_grid, na.action = "na.omit", CV = T)
transect.lda <- MASS::lda(site ~ ., data = df_transect, na.action = "na.omit", CV = T)
likely.lda <- MASS::lda(site ~ ., data = df_likely, na.action = "na.omit", CV = T)

#' ## percent correct for each category 
#' ### Grid 
table(as.data.frame(df_grid)[,"site"], grid.lda$class)
             
              Agriculture Forest
  Agriculture          49      0
  Forest                0     49
diag(prop.table(table(as.data.frame(df_grid)[,"site"], grid.lda$class), 1))
Agriculture      Forest 
          1           1 
#' ### Transect
table(as.data.frame(df_transect)[,"site"], transect.lda$class)
             
              Agriculture Forest
  Agriculture          14      0
  Forest                0     14
diag(prop.table(table(as.data.frame(df_transect)[,"site"], transect.lda$class), 1))
Agriculture      Forest 
          1           1 
#' ### Likely to erode
table(as.data.frame(df_likely)[,"site"], likely.lda$class)
             
              Agriculture Forest
  Agriculture           8      0
  Forest                0      8
diag(prop.table(table(as.data.frame(df_likely)[,"site"], likely.lda$class), 1))
Agriculture      Forest 
          1           1 
#' ##  total percent correct  
#' ### Grid 
sum(diag(prop.table(table(as.data.frame(df_grid)[,"site"], grid.lda$class))))
[1] 1
#' ### Transect
sum(diag(prop.table(table(as.data.frame(df_transect)[,"site"], transect.lda$class))))
[1] 1
#' ### Likely to erode
sum(diag(prop.table(table(as.data.frame(df_likely)[,"site"], likely.lda$class))))
[1] 1

Percent Classified correctly and Wilks Lamda

Grid design

In [10]:
grid_classify <- tibble(n = 1:length(grid_dfa),
            params = map(n, ~grid_dfa[1:.x])) %>%
  mutate(params = map_chr(params, ~paste0(.x, collapse = " + ")),
         f = map(params, ~as.formula(paste0("site ~ ", .x))),
         m = map(f, ~MASS::lda(.x, data = df_grid, na.action = "na.omit", CV = TRUE)),
         c = map_df(m, ~classify(data = df_grid, model = .x))) %>%
  select(-n, -f, -m) %>%
  mutate(wilks = dfa_test$wilks[dfa_test$sample_design == "Grid"][[1]])

grid_classify
# A tibble: 16 × 3
   params                                                  c$Agriculture   wilks
   <chr>                                                           <dbl>   <dbl>
 1 Li                                                                100 0.0620 
 2 Li + a_col                                                        100 0.0441 
 3 Li + a_col + Fe                                                   100 0.0276 
 4 Li + a_col + Fe + Co                                              100 0.0233 
 5 Li + a_col + Fe + Co + Hg                                         100 0.0217 
 6 Li + a_col + Fe + Co + Hg + x_col                                 100 0.0188 
 7 Li + a_col + Fe + Co + Hg + x_col + Cs                            100 0.0178 
 8 Li + a_col + Fe + Co + Hg + x_col + Cs + La                       100 0.0145 
 9 Li + a_col + Fe + Co + Hg + x_col + Cs + La + Ni                  100 0.0134 
10 Li + a_col + Fe + Co + Hg + x_col + Cs + La + Ni + Nb             100 0.0128 
11 Li + a_col + Fe + Co + Hg + x_col + Cs + La + Ni + Nb …           100 0.0123 
12 Li + a_col + Fe + Co + Hg + x_col + Cs + La + Ni + Nb …           100 0.0114 
13 Li + a_col + Fe + Co + Hg + x_col + Cs + La + Ni + Nb …           100 0.0110 
14 Li + a_col + Fe + Co + Hg + x_col + Cs + La + Ni + Nb …           100 0.00970
15 Li + a_col + Fe + Co + Hg + x_col + Cs + La + Ni + Nb …           100 0.00925
16 Li + a_col + Fe + Co + Hg + x_col + Cs + La + Ni + Nb …           100 0.00886
# ℹ 1 more variable: c$Forest <dbl>

Transect design

In [11]:
transect_classify <- tibble(n = 1:length(transect_dfa),
                        params = map(n, ~transect_dfa[1:.x])) %>%
  mutate(params = map_chr(params, ~paste0(.x, collapse = " + ")),
         f = map(params, ~as.formula(paste0("site ~ ", .x))),
         m = map(f, ~MASS::lda(.x, data = df_transect, na.action = "na.omit", CV = TRUE)),
         c = map_df(m, ~classify(data = df_transect, model = .x))) %>%
  select(-n, -f, -m) %>%
  mutate(wilks = dfa_test$wilks[dfa_test$sample_design == "Transect"][[1]])

transect_classify
# A tibble: 5 × 3
  params                 c$Agriculture $Forest  wilks
  <chr>                          <dbl>   <dbl>  <dbl>
1 Li                               100     100 0.0479
2 Li + Cu                          100     100 0.0345
3 Li + Cu + Ca                     100     100 0.0262
4 Li + Cu + Ca + Be                100     100 0.0208
5 Li + Cu + Ca + Be + Co           100     100 0.0162

Likely to erode design

In [12]:
likely_classify <- tibble(n = 1:length(likely_dfa),
                            params = map(n, ~likely_dfa[1:.x])) %>%
  mutate(params = map_chr(params, ~paste0(.x, collapse = " + ")),
         f = map(params, ~as.formula(paste0("site ~ ", .x))),
         m = map(f, ~MASS::lda(.x, data = df_likely, na.action = "na.omit", CV = TRUE)),
         c = map_df(m, ~classify(data = df_likely, model = .x))) %>%
  select(-n, -f, -m) %>%
  mutate(wilks = dfa_test$wilks[dfa_test$sample_design == "Likely to erode"][[1]])

likely_classify
# A tibble: 3 × 3
  params       c$Agriculture $Forest   wilks
  <chr>                <dbl>   <dbl>   <dbl>
1 Li                     100     100 0.0237 
2 Li + Sc                100     100 0.00835
3 Li + Sc + Sn           100     100 0.00611

Plotting

In [13]:
plotting3 <- results %>%
  filter(Fingerprint %in% unique(c(grid_dfa, likely_dfa, transect_dfa))) %>%
  group_by(Fingerprint, sample_design, site) %>%
  summarise(avg = mean(value),
            sd = sd(value)) %>%
  rename("Source" = site) %>%
  mutate(Fingerprint = as.factor(Fingerprint)) %>%
  mutate(Fingerprint = fct_recode(Fingerprint, "italic(`a*`)" = "a_col", "italic(`b*`)" = "b_col", "italic(`c*`)" = "c_col", "italic(`h*`)" = "h_col", "italic(`x*`)" = "x_col", "`Ca (%)`" = "Ca", "`Fe (%)`" = "Fe"))
`summarise()` has grouped output by 'Fingerprint', 'sample_design'. You can
override using the `.groups` argument.
p1 <- ggplot(data = plotting3, aes(x = sample_design, y = avg, colour = Source)) +
  geom_point(position = position_dodge(width = 0.5)) +
  geom_errorbar(data = plotting3, aes(ymax = avg + sd, ymin = avg -sd, x = sample_design), position = position_dodge(width = 0.5), width = 0.5) +
  scale_colour_viridis_d(begin = 0, end = 0.6) +
  theme_bw() +
  labs(y = "Concentation (ppm) or value") +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
        legend.position = "bottom",
        legend.title = element_blank()) +
  facet_wrap(~Fingerprint, ncol = 4, scales = "free_y", labeller = "label_parsed")

p1

#ggsave(plot = p1, filename = "Figures R1/design_means_sds.png", height = 150, width = 175, units = "mm", dpi = 600)