Chapter 4 Template for Munging Data and Creating Graphs
The following is a basic outline of how we used R to munge data and generate graphs
4.1 Load Data Munging Functions
For a particular treatment, extract the value for the data from the filename using string manipulation
get_treatment_val <- function(filename, treatment_name){
#Input: a filename string and a string representing which treatment you are looking for
# Ex: "HostVals_VT0.6_SEED10.data" and "VT"
#Output: A string representing the value for the given treatment
# Ex: "0.6"
number_pattern <- "[\\d][._][\\d]*"
treatment_pattern <- str_c(c(treatment_name, number_pattern), sep="", collapse="")
treatment <- str_extract(filename, treatment_pattern)
treatment_val <- NA
decimal_pattern <- "[\\d].[\\d]+"
integer_pattern <- "[\\d]+"
if(str_detect(treatment, "\\.")){
treatment_val <- as.numeric(str_extract(treatment, decimal_pattern))
} else {
treatment_val <- as.numeric(str_extract(treatment, integer_pattern))
}
treatment_val
}Retrieve the dataset for a particular filename, adding columns for each treatment
get_data <- function(filename, folder, treatments){
#Input: A datafile name as a string, the folder where the data is located, and a list of treatment names
# Ex: "HostVals_VT0.6_SEED10.data", "VertTrans_Data/", and c("VT")
#Output: A tibble containing all data in the file with a column added for the Seed and for all treatment names
# Ex: The columns added for the above data would be SEED with all values at 10 and
# VT with all values at 0.6
seed <- str_extract(filename, "SEED[\\d]+.")
seed_val <- str_extract(seed, "[\\d]+")
full_filename <- str_c(c(folder, filename), sep="", collapse="")
data <- read_csv(full_filename) %>% mutate(SEED = seed_val)
for (i in c(1:length(treatments))){
data <- data %>% mutate(!!treatments[i] := as.factor(get_treatment_val(filename, treatments[i])))
}
data
}For all files of a particular type (e.g. HostVals), create one dataset that can be used for time series graphs
combine_time_data <- function(filenames, folder, treatments){
#Input: A list of relevant datafile names, the folder where the data is, and a list of treatment names
# Ex: c("HostVals_VT0.6_SEED10.data", "HostVals_VT0.8_SEED10.data"), "VertTrans_Data/",
# and c("VT")
#Output: A tibble with all data from all datafiles given, with columns added for the seed and
# all treatment names
all_data <- get_data(filenames[1], folder, treatments)
if(length(filenames) > 1){
for (i in c(2:length(filenames))){
add_data <- get_data(filenames[i], folder, treatments)
all_data <- full_join(all_data, add_data)
}
}
all_data %>% mutate(SEED = as.factor(SEED))
}Take the time series data and turn it into a format that can be used for stacked histograms
combine_histogram_data <- function(time_data, separate_by, bins){
#Input: The time series data produced by combine_time_data, the column name
# which the stacked histogram will be facet wrapped by,
# and 1:1 list for the desired names of histogram bins
# Ex: lysischances, "PLR", and
# lysis_histogram_bins <- c(Hist_0.0 = "0 to 0.2 (Highly lysogenic)",
# Hist_0.1 = "0 to 0.2 (Highly lysogenic)",
# Hist_0.2 = "0.2 to 0.4 (Moderately lysogenic)",
# Hist_0.3 = "0.2 to 0.4 (Moderately lysogenic)",
# Hist_0.4 = "0.4 to 0.6 (Temperate)",
# Hist_0.5 = "0.4 to 0.6 (Temperate)",
# Hist_0.6 = "0.6 to 0.8 (Moderately lytic)",
# Hist_0.7 = "0.6 to 0.8 (Moderately lytic)",
# Hist_0.8 = "0.8 to 1.0 (Highly lytic)",
# Hist_0.9 = "0.8 to 1.0 (Highly lytic)")
#Output: A tibble with 4 columns: update, treatment, Histogram_bins, and count.
# This is the format needed to create a stacked histogram.
#collect initial data and pivot longer
initial_data <- time_data %>%
pivot_longer(
cols = starts_with("Hist_"),
names_to = "Histogram_bin",
values_to = "Bin_count"
) %>%
mutate(Histogram_bin = as.factor(Histogram_bin))
#collapse the bins
collapsed_data <- initial_data
collapsed_data$Histogram_factor <- dplyr::recode(collapsed_data$Histogram_bin, !!!bins)
#aggregate the counts
collapsed_data <- collapsed_data %>%
mutate(count = Bin_count,
treatment = get(separate_by)) %>%
select(treatment, update, Histogram_factor, count)
final_data <- aggregate(list(count = collapsed_data$count),
list(update = collapsed_data$update,
treatment = collapsed_data$treatment,
Histogram_bins = collapsed_data$Histogram_factor), sum)
final_data
}Filter the data to only the last update
last_update <- function(time_data){
time_data %>% filter(update == max(time_data$update))
}4.2 Gather Settings and Treatments
Note: the following is an example of how to use the above functions
General settings
folder <- "Data/" #Folder where datafiles are
treatments <- c("PLR", "COI") #Names of the treatments being tested - should match what is in filenamesStacked histogram settings
#separate_by vals indicate what the stacked histograms should be facet wrapped by
separate_by <- "SEED"
#How stacked histogram bins should be collapsed and renamed
histogram_bins <- c(Hist_0.0 = "0 to 0.2 (First bin)",
Hist_0.1 = "0 to 0.2 (First bin)",
Hist_0.2 = "0.2 to 0.4 (Second bin)",
Hist_0.3 = "0.2 to 0.4 (Second bin)",
Hist_0.4 = "0.4 to 0.6 (Third bin)",
Hist_0.5 = "0.4 to 0.6 (Third bin)",
Hist_0.6 = "0.6 to 0.8 (Fourth bin)",
Hist_0.7 = "0.6 to 0.8 (Fourth bin)",
Hist_0.8 = "0.8 to 1.0 (Fifth bin)",
Hist_0.9 = "0.8 to 1.0 (Fifth bin)")4.3 Collect and Munge Data
Gather filenames
all_filenames <- list.files(folder)
hostval_filenames <- str_subset(all_filenames, "HostVals")
symvals_filenames <- str_subset(all_filenames, "SymVals")Combine time series data for all subsets of datafiles
hostvals <- combine_time_data(hostval_filenames, folder, treatments)
symvals <- combine_time_data(phagevals_filenames, folder, treatments)Rearrange time series data into stacked histogram data
symval_histdata <- combine_histogram_data(symvals, separate_by, histogram_bins)Extract information about average genome values at the end of the runs
final_hostvals <- last_update(hostvals)
final_symvals <- last_update(symvals)4.4 Analyze Data
4.4.1 Summary Stats
hostvals_summary_stats <- final_hostvals %>%
group_by(PLR, COI) %>%
summarise(final_intval = mean(mean_intval))
hostvals_summary_stats4.5 Create graphs
4.5.1 Graph templates
Time Series template
#All values with this format: <VAL_NAME> need to be filled in
timeseries_graph <- ggplot(data= <TIME_SERIES_DATASET_NAME>,
aes(x=update, y= <Y_AXIS_VAL>,
group=<GROUP_VAL_NAME>, colour=<GROUP_VAL_NAME>)) +
ylab(<Y_AXIS_LABEL>) + xlab("Updates") +
stat_summary(aes(color=<GROUP_VAL_NAME>, fill=<GROUP_VAL_NAME>),
fun.data="mean_cl_boot", geom=c("smooth"), se=TRUE) +
theme(panel.background = element_rect(fill='white', colour='black')) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
guides(fill="none") +
scale_colour_manual(values=plasma(nlevels(<TIME_SERIES_DATASET_NAME>$<GROUP_VAL_NAME>)+2)) +
scale_fill_manual(values=plasma(nlevels(<TIME_SERIES_DATASET_NAME>$<GROUP_VAL_NAME>)+2))
#Facet wrapping is optional, but can be used if multiple treatments are being used
timeseries_graph + facet_wrap(<GROUP_VAL2_NAME>) Stacked Histogram template
#All values with this format: <VAL_NAME> need to be filled in
stackedhistogram <- ggplot(data = <HISTOGRAM_DATASET_NAME>,
aes(update, count)) +
geom_area(aes(fill=Histogram_bins), position='stack') +
ylab(<Y_AXIS_LABEL>) + xlab("Evolutionary time (in updates)") +
scale_fill_manual(<LEGEND_TITLE>,values=plasma(nlevels(<HISTOGRAM_DATASET_NAME>$Histogram_bins)+2)) +
facet_wrap(~treatment) +
theme(panel.background = element_rect(fill='light grey', colour='black')) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
guides(fill="none") + guides(fill = guide_legend())
stackedhistogramBoxplot template
#All values with this format: <VAL_NAME> need to be filled in
boxplot <- ggplot(data = <FINAL_INTVAL_DATASET_NAME>,
aes(x = <TREATMENT_NAME>, y = mean_intval,
group=<TREATMENT_NAME>, color=<TREATMENT_NAME>)) +
geom_boxplot(
width = .25,
outlier.shape = NA
) +
geom_point(
size = 1.3,
alpha = .3,
position = position_jitter(
seed = 1, width = .1
)
) +
coord_cartesian(xlim = c(1.2, NA), clip = "off") +
ylab("Mean Interaction Value") + xlab(<TREATMENT_LABEL>) +
theme(panel.background = element_rect(fill='white', colour='black')) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
guides(fill="none") +
scale_colour_manual(values=plasma(nlevels(<FINAL_INTVAL_DATASET_NAME>$<TREATMENT_NAME>)+2)) +
scale_fill_manual(values=plasma(nlevels(<FINAL_INTVAL_DATASET_NAME>$<TREATMENT_NAME>)+2)) +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none")
boxplot4.5.2 Examples
Host count
hostcount_plot <- ggplot(data=hostvals,
aes(x=update, y=count,
group=PLR, colour=PLR)) +
ylab("Host Count") + xlab("Updates") +
stat_summary(aes(color=PLR, fill=PLR),
fun.data="mean_cl_boot", geom=c("smooth"), se=TRUE) +
theme(panel.background = element_rect(fill='white', colour='black')) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
guides(fill=FALSE) + scale_colour_manual(values=plasma(nlevels(hostvals$PLR)+2)) +
scale_fill_manual(values=plasma(nlevels(hostvals$PLR)+2))
hostcount_plot + facet_wrap(~COI)Sym interaction vals plot
symintval_plot <- ggplot(data=symvals, aes(x=update, y=mean_intval,
group=PLR, color=PLR)) +
ylab("Symbiont Interaction value") + xlab("Updates") +
stat_summary(aes(color=PLR, fill=PLR),
fun.data="mean_cl_boot", geom=c("smooth"), se=TRUE) +
theme(panel.background = element_rect(fill='white', colour='black')) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
guides(fill=FALSE) + scale_colour_manual(values=plasma(nlevels(symvals$PLR)+2)) +
scale_fill_manual(values=plasma(nlevels(symvals$PLR)+2))
symintval_plot + facet_wrap(~COI)Sym interaction vals stacked histogram
symval_stackedhistogram <- ggplot(symval_histdata,
aes(update, count)) +
geom_area(aes(fill=Histogram_bins), position='stack') +
ylab("Count of Symbionts with Phenotype") + xlab("Evolutionary time (in updates)") +
scale_fill_manual("Interaction Val\n Phenotypes",values=plasma(nlevels(symval_histdata$Histogram_bins)+2)) +
facet_wrap(~treatment) +
theme(panel.background = element_rect(fill='light grey', colour='black')) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
guides(fill=FALSE) + guides(fill = guide_legend())
symval_stackedhistogramSym interaction vals boxplot
boxplot <- ggplot(data = final_symvals,
aes(x = PLR, y = mean_intval,
group=PLR, color=PLR)) +
geom_boxplot(
width = .25,
outlier.shape = NA
) +
geom_point(
size = 1.3,
alpha = .3,
position = position_jitter(
seed = 1, width = .1
)
) +
coord_cartesian(xlim = c(1.2, NA), clip = "off") +
ylab("Mean Interaction Value") + xlab("Prophage Loss Rate") +
theme(panel.background = element_rect(fill='white', colour='black')) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
guides(fill="none") +
scale_colour_manual(values=plasma(nlevels(final_symvals$PLR)+2)) +
scale_fill_manual(values=plasma(nlevels(final_symvals$PLR)+2)) +
scale_y_continuous(labels = scales::percent) +
theme(legend.position = "none")
boxplot