Remember the basic rules of tidy data structure:

  1. One column per type of information
  2. One row per observation
  3. One value in each cell
install.packages("tidyr")
library(tidyr)
library(dplyr)

Pivot data from wide to long

Copy the link to the Western Ghats tree data from the datasets page

# Download the tab-delimited macroplot table and load it as a data frame.
raw_data <- read.csv(
  "http://esapubs.org/archive/ecol/E091/216/Macroplot_data_Rev.txt",
  sep = "\t"
)

View data

Lead a discussion on what the correct (tidy) structure for these data should be

# Stack the girth columns into name/value pairs: the old column name goes
# into `stem` and the cell value into `girth`. Here the columns to stack are
# chosen as the range TreeGirth1 through TreeGirth5.
clean_data <- raw_data %>%
  pivot_longer(
    cols = TreeGirth1:TreeGirth5,
    names_to = "stem",
    values_to = "girth"
  )

# Alternative selection: stack every column except PlotID and SpCode.
clean_data <- raw_data %>%
  pivot_longer(
    cols = -c(PlotID, SpCode),
    names_to = "stem",
    values_to = "girth"
  )

View data

# Reshape to long form, then keep only rows with a non-zero girth.
# filter() also drops rows where the comparison evaluates to NA.
clean_data <- raw_data %>%
  pivot_longer(
    cols = -c(PlotID, SpCode),
    names_to = "stem",
    values_to = "girth"
  ) %>%
  filter(girth != 0)

Extract

# Same pipeline as before, plus one step: overwrite `stem` with the single
# character captured after the "TreeGirth" prefix of the old column name.
clean_data <- raw_data %>%
  pivot_longer(
    cols = -c(PlotID, SpCode),
    names_to = "stem",
    values_to = "girth"
  ) %>%
  filter(girth != 0) %>%
  extract(col = stem, into = 'stem', regex = 'TreeGirth(.)')

Separate

# Full cleaning pipeline. The last step splits SpCode by position: the first
# four characters become `genus` and the remainder becomes `species`.
clean_data <- raw_data %>%
  pivot_longer(
    cols = -c(PlotID, SpCode),
    names_to = "stem",
    values_to = "girth"
  ) %>%
  filter(girth != 0) %>%
  extract(col = stem, into = 'stem', regex = 'TreeGirth(.)') %>%
  separate(col = SpCode, into = c('genus', 'species'), sep = 4)

Unite and Pivot Wider

# Count the rows of clean_data for each PlotID/genus/species combination.
# summarize() only peels off the last grouping variable, so ungroup() is
# needed to stop the leftover PlotID/genus grouping from leaking into the
# reshaping steps below.
stem_counts <- clean_data %>%
  group_by(PlotID, genus, species) %>%
  summarize(count = n()) %>%
  ungroup()

# Step 1: paste genus and species together into a single species_id column.
stem_counts_wide <- stem_counts %>%
  unite(species_id, genus, species)

# Step 2: additionally spread the counts so that every species_id becomes
# its own column. Combinations that never occur come out as NA.
stem_counts_wide <- stem_counts %>%
  unite(species_id, genus, species) %>%
  pivot_wider(names_from = species_id, values_from = count)

# Step 3: same reshape, but fill the missing combinations with 0.
stem_counts_wide <- stem_counts %>%
  unite(species_id, genus, species) %>%
  pivot_wider(names_from = species_id,
              values_from = count,
              values_fill = list(count = 0))

Completing data with gaps

# Load the example table with gaps and print it for inspection.
gappy_data <- read.csv("http://www.datacarpentry.org/semester-biology/data/gappy-data.csv")
gappy_data

# The results are stored under their own name rather than `clean_data`:
# the stem-count code that follows still needs `clean_data` to be the tidied
# tree table with PlotID, genus and species columns.

# Step 1: carry the last non-missing Species value down into NA cells.
gappy_clean <- gappy_data %>%
  fill(Species)

# Step 2: additionally add a row for every Species/Individual combination
# that is absent from the data.
gappy_clean <- gappy_data %>%
  fill(Species) %>%
  complete(Species, Individual)
# Count rows per PlotID/genus/species, then add the combinations that are
# missing from the data with a count of 0.
# NOTE: expects `clean_data` to be the tidied tree table (it must contain
# PlotID, genus and species columns).
stem_counts <- clean_data %>%
  group_by(PlotID, genus, species) %>%
  summarize(count = n()) %>%
  # summarize() leaves the result grouped by PlotID and genus. complete()
  # works within groups and cannot complete a grouping column, so the
  # grouping has to be dropped first.
  ungroup() %>%
  # nesting() limits genus/species to pairs that actually occur in the data;
  # each such pair is then crossed with every PlotID.
  complete(PlotID, nesting(genus, species), fill = list(count = 0))