Chapter 4 Template for Munging Data and Creating Graphs

The following is a basic outline of how we used R to munge data and generate graphs

4.1 Load Data Munging Functions

For a particular treatment, extract the value for the data from the filename using string manipulation

get_treatment_val <- function(filename, treatment_name){
  #Input: a filename string and a string representing which treatment you are looking for
  #       Ex: "HostVals_VT0.6_SEED10.data" and "VT"
  #Output: A string representing the value for the given treatment
  #       Ex: "0.6"
  number_pattern <- "[\\d][._][\\d]*"
  treatment_pattern <- str_c(c(treatment_name, number_pattern), sep="", collapse="")
  treatment <- str_extract(filename, treatment_pattern)
  treatment_val <- NA
  
  decimal_pattern <- "[\\d].[\\d]+"
  integer_pattern <- "[\\d]+"
  if(str_detect(treatment, "\\.")){
    treatment_val <- as.numeric(str_extract(treatment, decimal_pattern))
  } else {
    treatment_val <- as.numeric(str_extract(treatment, integer_pattern))
  }
  
  treatment_val 
}

Retrieve the dataset for a particular filename, adding columns for each treatment

get_data <- function(filename, folder, treatments){
  #Input: A datafile name as a string, the folder where the data is located, and a list of treatment names
  #       Ex: "HostVals_VT0.6_SEED10.data", "VertTrans_Data/", and c("VT")
  #Output: A tibble containing all data in the file with a column added for the Seed and for all treatment names
  #       Ex: The columns added for the above data would be SEED with all values at 10 and 
  #           VT with all values at 0.6
  seed <- str_extract(filename, "SEED[\\d]+.")
  seed_val <- str_extract(seed, "[\\d]+")
  
  full_filename <- str_c(c(folder, filename), sep="", collapse="")
  data <- read_csv(full_filename) %>% mutate(SEED = seed_val)
  
  for (i in c(1:length(treatments))){
    data <- data %>% mutate(!!treatments[i] := as.factor(get_treatment_val(filename, treatments[i])))
  }
  data
}

For all files of a particular type (e.g. HostVals), create one dataset that can be used for time series graphs

combine_time_data <- function(filenames, folder, treatments){
  #Input: A list of relevant datafile names, the folder where the data is, and a list of treatment names
  #       Ex: c("HostVals_VT0.6_SEED10.data", "HostVals_VT0.8_SEED10.data"), "VertTrans_Data/", 
  #           and c("VT")
  #Output: A tibble with all data from all datafiles given, with columns added for the seed and 
  #       all treatment names
  all_data <- get_data(filenames[1], folder, treatments)
  
  if(length(filenames) > 1){
    for (i in c(2:length(filenames))){
      add_data <- get_data(filenames[i], folder, treatments)
      all_data <- full_join(all_data, add_data)
    }
  }
  all_data %>% mutate(SEED = as.factor(SEED))
}

Take the time series data and turn it into a format that can be used for stacked histograms

combine_histogram_data <- function(time_data, separate_by, bins){
  #Input: The time series data produced by combine_time_data, the column name 
  #       which the stacked histogram will be facet wrapped by,
  #       and 1:1 list for the desired names of histogram bins
  #       Ex: lysischances, "PLR", and
  #           lysis_histogram_bins <- c(Hist_0.0 = "0 to 0.2 (Highly lysogenic)",
                          # Hist_0.1 = "0 to 0.2 (Highly lysogenic)",
                          # Hist_0.2 = "0.2 to 0.4 (Moderately lysogenic)",
                          # Hist_0.3 = "0.2 to 0.4 (Moderately lysogenic)",
                          # Hist_0.4 = "0.4 to 0.6 (Temperate)",
                          # Hist_0.5 = "0.4 to 0.6 (Temperate)",
                          # Hist_0.6 = "0.6 to 0.8 (Moderately lytic)",
                          # Hist_0.7 = "0.6 to 0.8 (Moderately lytic)",
                          # Hist_0.8 = "0.8 to 1.0 (Highly lytic)",
                          # Hist_0.9 = "0.8 to 1.0 (Highly lytic)")
  #Output: A tibble with 4 columns: update, treatment, Histogram_bins, and count. 
  #       This is the format needed to create a stacked histogram.
  
  #collect initial data and pivot longer
  initial_data <- time_data %>%
    pivot_longer(
      cols = starts_with("Hist_"),
      names_to = "Histogram_bin",
      values_to = "Bin_count"
    ) %>%
    mutate(Histogram_bin = as.factor(Histogram_bin))
  
  #collapse the bins
  collapsed_data <- initial_data
  collapsed_data$Histogram_factor <- dplyr::recode(collapsed_data$Histogram_bin, !!!bins)
  
  #aggregate the counts
  collapsed_data <- collapsed_data %>%
    mutate(count = Bin_count,
           treatment = get(separate_by)) %>%
    select(treatment, update, Histogram_factor, count)
  
  final_data <- aggregate(list(count = collapsed_data$count), 
                          list(update = collapsed_data$update,
                               treatment = collapsed_data$treatment,
                               Histogram_bins = collapsed_data$Histogram_factor), sum)
  
  final_data 
}

Filter the data to only the last update

last_update <- function(time_data){
  time_data %>% filter(update == max(time_data$update))
}

4.2 Gather Settings and Treatments

Note: the following is an example of how to use the above functions

General settings

folder <- "Data/" #Folder where datafiles are 
treatments <- c("PLR", "COI") #Names of the treatments being tested - should match what is in filenames

Stacked histogram settings

#separate_by vals indicate what the stacked histograms should be facet wrapped by
separate_by <- "SEED" 
#How stacked histogram bins should be collapsed and renamed
histogram_bins <- c(Hist_0.0 = "0 to 0.2 (First bin)", 
                    Hist_0.1 = "0 to 0.2 (First bin)",
                    Hist_0.2 = "0.2 to 0.4 (Second bin)",
                    Hist_0.3 = "0.2 to 0.4 (Second bin)",
                    Hist_0.4 = "0.4 to 0.6 (Third bin)",
                    Hist_0.5 = "0.4 to 0.6 (Third bin)",
                    Hist_0.6 = "0.6 to 0.8 (Fourth bin)",
                    Hist_0.7 = "0.6 to 0.8 (Fourth bin)",
                    Hist_0.8 = "0.8 to 1.0 (Fifth bin)",
                    Hist_0.9 = "0.8 to 1.0 (Fifth bin)")

4.3 Collect and Munge Data

Gather filenames

all_filenames <- list.files(folder)

hostval_filenames <- str_subset(all_filenames, "HostVals")
symvals_filenames <- str_subset(all_filenames, "SymVals")

Combine time series data for all subsets of datafiles

hostvals <- combine_time_data(hostval_filenames, folder, treatments)
symvals <- combine_time_data(phagevals_filenames, folder, treatments)

Rearrange time series data into stacked histogram data

symval_histdata <- combine_histogram_data(symvals, separate_by, histogram_bins)

Extract information about average genome values at the end of the runs

final_hostvals <- last_update(hostvals)
final_symvals <- last_update(symvals)

4.4 Analyze Data

4.4.1 Summary Stats

hostvals_summary_stats <- final_hostvals %>%
  group_by(PLR, COI) %>%
  summarise(final_intval = mean(mean_intval))

hostvals_summary_stats

4.5 Create graphs

4.5.1 Graph templates

Time Series template

#All values with this format: <VAL_NAME> need to be filled in
timeseries_graph <- ggplot(data= <TIME_SERIES_DATASET_NAME>,
                         aes(x=update, y= <Y_AXIS_VAL>, 
                             group=<GROUP_VAL_NAME>, colour=<GROUP_VAL_NAME>)) + 
  ylab(<Y_AXIS_LABEL>) + xlab("Updates") + 
  stat_summary(aes(color=<GROUP_VAL_NAME>, fill=<GROUP_VAL_NAME>),
               fun.data="mean_cl_boot", geom=c("smooth"), se=TRUE) + 
  theme(panel.background = element_rect(fill='white', colour='black')) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  guides(fill="none") + 
  scale_colour_manual(values=plasma(nlevels(<TIME_SERIES_DATASET_NAME>$<GROUP_VAL_NAME>)+2)) + 
  scale_fill_manual(values=plasma(nlevels(<TIME_SERIES_DATASET_NAME>$<GROUP_VAL_NAME>)+2))

#Facet wrapping is optional, but can be used if multiple treatments are being used
timeseries_graph + facet_wrap(<GROUP_VAL2_NAME>)

Stacked Histogram template

#All values with this format: <VAL_NAME> need to be filled in
stackedhistogram <- ggplot(data = <HISTOGRAM_DATASET_NAME>, 
                                       aes(update, count)) + 
  geom_area(aes(fill=Histogram_bins), position='stack') +
  ylab(<Y_AXIS_LABEL>) + xlab("Evolutionary time (in updates)") +
  scale_fill_manual(<LEGEND_TITLE>,values=plasma(nlevels(<HISTOGRAM_DATASET_NAME>$Histogram_bins)+2)) +
  facet_wrap(~treatment) + 
  theme(panel.background = element_rect(fill='light grey', colour='black')) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + 
  guides(fill="none") + guides(fill = guide_legend())

stackedhistogram

Boxplot template

#All values with this format: <VAL_NAME> need to be filled in
boxplot <- ggplot(data = <FINAL_INTVAL_DATASET_NAME>, 
                  aes(x = <TREATMENT_NAME>, y = mean_intval,
                                  group=<TREATMENT_NAME>, color=<TREATMENT_NAME>)) + 
  geom_boxplot(
    width = .25, 
    outlier.shape = NA
  ) +
  geom_point(
    size = 1.3,
    alpha = .3,
    position = position_jitter(
      seed = 1, width = .1
    )
  ) + 
  coord_cartesian(xlim = c(1.2, NA), clip = "off") +
  ylab("Mean Interaction Value") + xlab(<TREATMENT_LABEL>) +
  theme(panel.background = element_rect(fill='white', colour='black')) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  guides(fill="none") +
  scale_colour_manual(values=plasma(nlevels(<FINAL_INTVAL_DATASET_NAME>$<TREATMENT_NAME>)+2)) + 
  scale_fill_manual(values=plasma(nlevels(<FINAL_INTVAL_DATASET_NAME>$<TREATMENT_NAME>)+2)) +
  scale_y_continuous(labels = scales::percent) +
  theme(legend.position = "none")

boxplot

4.5.2 Examples

Host count

hostcount_plot <- ggplot(data=hostvals,
                         aes(x=update, y=count, 
                             group=PLR, colour=PLR)) + 
  ylab("Host Count") + xlab("Updates") + 
  stat_summary(aes(color=PLR, fill=PLR),
               fun.data="mean_cl_boot", geom=c("smooth"), se=TRUE) + 
  theme(panel.background = element_rect(fill='white', colour='black')) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  guides(fill=FALSE) + scale_colour_manual(values=plasma(nlevels(hostvals$PLR)+2)) + 
  scale_fill_manual(values=plasma(nlevels(hostvals$PLR)+2))

hostcount_plot + facet_wrap(~COI)

Sym interaction vals plot

symintval_plot <- ggplot(data=symvals, aes(x=update, y=mean_intval,
                                               group=PLR, color=PLR)) +
  ylab("Symbiont Interaction value") + xlab("Updates") + 
  stat_summary(aes(color=PLR, fill=PLR),
               fun.data="mean_cl_boot", geom=c("smooth"), se=TRUE) + 
  theme(panel.background = element_rect(fill='white', colour='black')) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  guides(fill=FALSE) + scale_colour_manual(values=plasma(nlevels(symvals$PLR)+2)) + 
  scale_fill_manual(values=plasma(nlevels(symvals$PLR)+2))

symintval_plot + facet_wrap(~COI)

Sym interaction vals stacked histogram

symval_stackedhistogram <- ggplot(symval_histdata, 
                                       aes(update, count)) + 
  geom_area(aes(fill=Histogram_bins), position='stack') +
  ylab("Count of Symbionts with Phenotype") + xlab("Evolutionary time (in updates)") +
  scale_fill_manual("Interaction Val\n Phenotypes",values=plasma(nlevels(symval_histdata$Histogram_bins)+2)) +
  facet_wrap(~treatment) + 
  theme(panel.background = element_rect(fill='light grey', colour='black')) + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + 
  guides(fill=FALSE) + guides(fill = guide_legend())

symval_stackedhistogram

Sym interaction vals boxplot

boxplot <- ggplot(data = final_symvals, 
                  aes(x = PLR, y = mean_intval,
                                  group=PLR, color=PLR)) + 
  geom_boxplot(
    width = .25, 
    outlier.shape = NA
  ) +
  geom_point(
    size = 1.3,
    alpha = .3,
    position = position_jitter(
      seed = 1, width = .1
    )
  ) + 
  coord_cartesian(xlim = c(1.2, NA), clip = "off") +
  ylab("Mean Interaction Value") + xlab("Prophage Loss Rate") +
  theme(panel.background = element_rect(fill='white', colour='black')) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  guides(fill="none") +
  scale_colour_manual(values=plasma(nlevels(final_symvals$PLR)+2)) + 
  scale_fill_manual(values=plasma(nlevels(final_symvals$PLR)+2)) +
  scale_y_continuous(labels = scales::percent) +
  theme(legend.position = "none")

boxplot