################################################################################
# R scripts to create figures on archaeozoological data from mainland Southeast
# Asian prehistoric sites. 
# 
# written by Cyler Conrad, Department of Anthropology, University of New Mexico
# cylerc@unm.edu
#
# Results described in:
# Conrad, C. (in press). Archaeozoology in Mainland Southeast Asia: Changing
# Methodology and Pleistocene to Holocene Forager Subsistence Patterns in 
# Thailand and Peninsular Malaysia. Open Quaternary. 
#
# NOTES
# All data required to perform the analyses can be found at
# http://hdl.handle.net/1928/25699 (Conrad 2015). The script was run using R 
# version 3.1.1 on Mac OS X 10.8.5
#
# Reference: 
# Conrad, C. 2015. Faunal Data for Pleistocene-Holocene Archaeological Sites in 
# Thailand and Peninsular Malaysia [dataset]. University of New Mexico. 
# http://hdl.handle.net/1928/25699
#
# LICENSE
# 
# The MIT License (MIT)
# 
# Copyright (c) 2015 Cyler Conrad
#   
#   Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#   
#   The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
################################################################################
# set the base directory for knitr to the directory above this one
library(knitr)
opts_knit$set(root.dir = '../')
################################################################################
# CHUNK 1: load libraries
################################################################################

library(ggplot2) #Version 1.0.0
library(reshape2)
library(gridExtra)
## Loading required package: grid
# define a custom plot theme to avoid repeating it...
theme_new <- theme_set(theme_bw())
theme_new <- theme_update(
          axis.text.x = element_text(vjust=0.5, color="black", size=20, angle=90),
          axis.text.y = element_text(vjust=0.5, color="black", size=20), 
          axis.title.y = element_text(vjust=1.0, color="black", size=20, angle=90),
          axis.title.x = element_text(vjust=0.5, color="black", size=20),
          strip.text.x = element_text(size=25),
          legend.position = "bottom", 
          legend.text = element_text(size=20), 
          legend.title = element_text(size=20)) 
################################################################################
# CHUNK 1-Figure 2: Decadal division of faunal publications using differential 
# methodological recording techniques during the 1930s–2010s. Download data file 
# "data/figure2.csv" from Conrad (2015). This file includes counts per decade for 
# each recording technique used in archaeozoological publications. This figure 
# is constructed by reshaping the data into long format, then using 
# facet_wrap() to construct four plots, one per recording technique. 
################################################################################


# Set absolute path to csv file on your computer
fn <- "data/figure2.csv" 
sea <- read.csv(fn, stringsAsFactors = FALSE, check.names = FALSE)
sea
##   Faunal Publications NISP MNI Presence/Absence Descriptive
## 1           1930-1940    0   0                0           1
## 2           1960-1970    1   0                1           0
## 3           1970-1980    0   0                1           0
## 4           1980-1990    0   2                1           2
## 5           1990-2000    3   5                1           0
## 6           2000-2010    3   0                0           3
## 7        2010-Present    4   0                0           0
str(sea)
## 'data.frame':    7 obs. of  5 variables:
##  $ Faunal Publications: chr  "1930-1940" "1960-1970" "1970-1980" "1980-1990" ...
##  $ NISP               : int  0 1 0 0 3 3 4
##  $ MNI                : int  0 0 0 2 5 0 0
##  $ Presence/Absence   : int  0 1 1 1 1 0 0
##  $ Descriptive        : int  1 0 0 2 0 3 0
sea.long <- melt(sea, value.name="Count")
## Using Faunal Publications as id variables
sea.long  
##    Faunal Publications         variable Count
## 1            1930-1940             NISP     0
## 2            1960-1970             NISP     1
## 3            1970-1980             NISP     0
## 4            1980-1990             NISP     0
## 5            1990-2000             NISP     3
## 6            2000-2010             NISP     3
## 7         2010-Present             NISP     4
## 8            1930-1940              MNI     0
## 9            1960-1970              MNI     0
## 10           1970-1980              MNI     0
## 11           1980-1990              MNI     2
## 12           1990-2000              MNI     5
## 13           2000-2010              MNI     0
## 14        2010-Present              MNI     0
## 15           1930-1940 Presence/Absence     0
## 16           1960-1970 Presence/Absence     1
## 17           1970-1980 Presence/Absence     1
## 18           1980-1990 Presence/Absence     1
## 19           1990-2000 Presence/Absence     1
## 20           2000-2010 Presence/Absence     0
## 21        2010-Present Presence/Absence     0
## 22           1930-1940      Descriptive     1
## 23           1960-1970      Descriptive     0
## 24           1970-1980      Descriptive     0
## 25           1980-1990      Descriptive     2
## 26           1990-2000      Descriptive     0
## 27           2000-2010      Descriptive     3
## 28        2010-Present      Descriptive     0
ggplot(sea.long, aes(`Faunal Publications`, Count, group = 1)) +
  geom_point(size = 4) +
  geom_line() +
  facet_wrap(~ variable) 

# there are no dimensions specified at http://www.openquaternary.com/about/submissions/#Figures & Tables
# so we'll just do what looks good...
fig_width <- 300 # play with this number
ggsave(filename = "figures/fig_2.png",
       dpi = 300, units = "mm",
       height = fig_width/1.6, width =  fig_width)
################################################################################
# CHUNK 2-Figure 3: Grouped classifications from Thailand sites expressing total NISP 
# counts per site (left column), and total NISP counts for all sites (right column).
# Download data file "data/figure3.csv" from Conrad (2015). This file 
# includes total NISP counts for all Thai sites in this dataset, grouped by 
# highest taxon specific values. To create this figure the data is reshaped 
# to a long format and wrapped to express NISP counts per taxonomic classification 
# in each site, and total for all sites. The plot gives total counts per site on
# the left half, with total aggregated NISPs appearing on the right half. 
# Colored points denote sites.
################################################################################


fn <- "data/figure3.csv"
thai <- read.csv(fn, stringsAsFactors = FALSE, skip = 1, check.names=FALSE)
thai
##   Taxonomic Classification Occurences  NISP Tham Lod Lang Kamnan
## 1              Testudines          14 14454      746         121
## 2                Varanidae          7  3966        1           2
## 3                Cervidae          42  3000     2234          11
## 4                  Primate         29  2275       23          NA
## 5                  Suidae          13   897      356          NA
## 6                 Bovidae          26   662      561           9
## 7                  Pisces           7   527       27          NA
##   Lang Rongrien Khao Toh Chong Moh Khiew II Thung Nong Nien
## 1          1221            215        11951             200
## 2            NA             39         3582             342
## 3           118              6          344             287
## 4            NA             25         1960             267
## 5            55             NA          367             119
## 6            11              1           75               5
## 7             9              7          100             384
str(thai)
## 'data.frame':    7 obs. of  9 variables:
##  $ Taxonomic Classification: chr  "Testudines " "Varanidae" "Cervidae " "Primate" ...
##  $ Occurences              : int  14 7 42 29 13 26 7
##  $ NISP                    : int  14454 3966 3000 2275 897 662 527
##  $ Tham Lod                : int  746 1 2234 23 356 561 27
##  $ Lang Kamnan             : int  121 2 11 NA NA 9 NA
##  $ Lang Rongrien           : int  1221 NA 118 NA 55 11 9
##  $ Khao Toh Chong          : int  215 39 6 25 NA 1 7
##  $ Moh Khiew II            : int  11951 3582 344 1960 367 75 100
##  $ Thung Nong Nien         : int  200 342 287 267 119 5 384
thai$`Taxonomic Classification order` <- thai$`Taxonomic Classification`
thai$`Taxonomic Classification` <- factor(thai$`Taxonomic Classification`, 
                                levels = rev(thai$`Taxonomic Classification order`))

thai.long <- melt(thai
                  , measure.vars = c("NISP", "Tham Lod", "Lang Kamnan", 
                                     "Lang Rongrien", "Khao Toh Chong", 
                                     "Moh Khiew II", "Thung Nong Nien")
                  , variable.name = "Site"
                  , value.name = "NISP"
                  , na.rm = TRUE)
thai.long$Total.ind <- factor(ifelse((thai.long$Site == "NISP"), "Total", "Sites"))

p1 <- ggplot(thai.long, aes(x = NISP, y = `Taxonomic Classification`)) +
  guides(colour=guide_legend(nrow=3,byrow=TRUE)) +
  geom_point(aes(colour = Site), alpha = 0.75, size=5) + 
  facet_grid(. ~ Total.ind)
p1

# save plot
fig_width <- 300 # play with this number
ggsave(filename = "figures/fig_3.png",
       dpi = 300, units = "mm",
       height = fig_width/1.6, width =  fig_width)
################################################################################
# CHUNK 3-Figure 4: Grouped classifications from Thailand sites expressing total NISP 
# counts per site (left column), and total NISP counts for all sites (right column) 
# without Moh Khiew Cave II. Download data file "data/figure4.csv" from Conrad (2015).  
# This file includes total NISP counts for all Thai sites in this dataset, 
# excluding Moh Khiew Cave II. This csv file is also grouped by highest taxon 
# specific values.To create this figure the data is reshaped like prior. The plot 
# gives total counts per site on the left half, with total aggregated NISPs 
# appearing on the right half. Colored points denote sites.
################################################################################


fn <- "data/figure4.csv"
thai2 <- read.csv(fn, stringsAsFactors = FALSE, skip = 1, check.names=FALSE)
thai2
##   Taxonomic Classification Occurences NISP Tham Lod Lang Kamnan
## 1                Cervidae          42 2656     2234          11
## 2              Testudines          14 2503      746         121
## 3                 Bovidae          26  587      561           9
## 4                  Suidae          13  530      356          NA
## 5                  Pisces           7  427       27          NA
## 6                Varanidae          7  384        1           2
## 7                 Primates         29  315       23          NA
##   Lang Rongrien Khao Toh Chong Thung Nong Nien
## 1           118              6             287
## 2          1221            215             200
## 3            11              1               5
## 4            55             NA             119
## 5             9              7             384
## 6            NA             39             342
## 7            NA             25             267
str(thai2)
## 'data.frame':    7 obs. of  8 variables:
##  $ Taxonomic Classification: chr  "Cervidae " "Testudines " "Bovidae " "Suidae " ...
##  $ Occurences              : int  42 14 26 13 7 7 29
##  $ NISP                    : int  2656 2503 587 530 427 384 315
##  $ Tham Lod                : int  2234 746 561 356 27 1 23
##  $ Lang Kamnan             : int  11 121 9 NA NA 2 NA
##  $ Lang Rongrien           : int  118 1221 11 55 9 NA NA
##  $ Khao Toh Chong          : int  6 215 1 NA 7 39 25
##  $ Thung Nong Nien         : int  287 200 5 119 384 342 267
thai2$`Taxonomic Classification order` <- thai2$`Taxonomic Classification`
thai2$`Taxonomic Classification` <- factor(thai2$`Taxonomic Classification`, 
                                levels = rev(thai2$`Taxonomic Classification order`))

thai2.long <- melt(thai2
                  , measure.vars = c("NISP", "Tham Lod", "Lang Kamnan", 
                                     "Lang Rongrien", "Khao Toh Chong", 
                                     "Thung Nong Nien")
                  , variable.name = "Site"
                  , value.name = "NISP"
                  , na.rm = TRUE)
thai2.long$Total.ind <- factor(ifelse((thai2.long$Site == "NISP"), "Total", "Sites"))

p2 <- ggplot(thai2.long, aes(x = NISP, y = `Taxonomic Classification`))
p2 <- p2 + geom_point(aes(colour = Site), alpha = 0.75, size=5)
p2 <- p2 + facet_grid(. ~ Total.ind) +  guides(colour=guide_legend(nrow=3,byrow=TRUE)) 
p2

# save plot
fig_width <- 300 # play with this number
ggsave(filename = "figures/fig_4.png",
       dpi = 300, units = "mm",
       height = fig_width/1.6, width =  fig_width)
################################################################################
# CHUNK 4-Figure 5: Grouped classifications from Thailand sites expressing total NISP 
# counts per site (left column), and total NISP counts for all sites 
# (right column) without Testudines. Download data file "data/figure5.csv" from 
# Conrad (2015). This file includes total NISP counts for all Thai sites in 
# this dataset, excluding Testudines. This csv file is also grouped by highest 
# taxon specific values. To create this figure the data is reshaped like prior. 
# The plot gives total counts per site on the left half, with total aggregated 
# NISPs appearing on the right half. Colored points denote sites.
################################################################################


fn <- "data/figure5.csv"
thai3 <- read.csv(fn, stringsAsFactors = FALSE, skip = 1, check.names=FALSE)
thai3
##   Taxonomic Classification Occurences NISP Tham Lod Lang Kamnan
## 1                Varanidae          7 3966        1           2
## 2                Cervidae          42 3000     2234          11
## 3                  Primate         29 2275       23          NA
## 4                  Suidae          13  897      356          NA
## 5                 Bovidae          26  662      561           9
## 6                  Pisces           7  527       27          NA
##   Lang Rongrien Khao Toh Chong Moh Khiew II Thung Nong Nien
## 1            NA             39         3582             342
## 2           118              6          344             287
## 3            NA             25         1960             267
## 4            55             NA          367             119
## 5            11              1           75               5
## 6             9              7          100             384
str(thai3)
## 'data.frame':    6 obs. of  9 variables:
##  $ Taxonomic Classification: chr  "Varanidae" "Cervidae " "Primate" "Suidae " ...
##  $ Occurences              : int  7 42 29 13 26 7
##  $ NISP                    : int  3966 3000 2275 897 662 527
##  $ Tham Lod                : int  1 2234 23 356 561 27
##  $ Lang Kamnan             : int  2 11 NA NA 9 NA
##  $ Lang Rongrien           : int  NA 118 NA 55 11 9
##  $ Khao Toh Chong          : int  39 6 25 NA 1 7
##  $ Moh Khiew II            : int  3582 344 1960 367 75 100
##  $ Thung Nong Nien         : int  342 287 267 119 5 384
thai3$`Taxonomic Classification order` <- thai3$`Taxonomic Classification`
thai3$`Taxonomic Classification` <- factor(thai3$`Taxonomic Classification`, 
                                levels = rev(thai3$`Taxonomic Classification order`))

thai3.long <- melt(thai3
                  , measure.vars = c("NISP", "Tham Lod", "Lang Kamnan", 
                                     "Lang Rongrien", "Khao Toh Chong", 
                                     "Moh Khiew II", "Thung Nong Nien")
                  , variable.name = "Site"
                  , value.name = "NISP"
                  , na.rm = TRUE)
thai3.long$Total.ind <- factor(ifelse((thai3.long$Site == "NISP"), "Total", "Sites"))

p3 <- ggplot(thai3.long, aes(x = NISP, y = `Taxonomic Classification`))
p3 <- p3 + geom_point(aes(colour = Site), alpha = 0.75, size=5)
p3 <- p3 + facet_grid(. ~ Total.ind) +  guides(colour=guide_legend(nrow=3,byrow=TRUE)) 
p3

# save plot
fig_width <- 300 # play with this number
ggsave(filename = "figures/fig_5.png",
       dpi = 300, units = "mm",
       height = fig_width/1.6, width =  fig_width)
################################################################################
# CHUNK 5-Figure 6: Grouped classifications from Peninsular Malaysian sites expressing 
# total NISP counts per site (left column), and total NISP counts for all sites 
# (right column). Download data file "data/figure6.csv" from Conrad (2015). This file 
# includes total NISP counts for all Peninsula Malaysia sites in this dataset.
# This csv file is also grouped by highest taxon specific values. 
# To create this figure the data is reshaped like prior. The plot gives total 
# counts per site on the left half, with total aggregated NISPs appearing 
# on the right half. Colored points denote sites.
################################################################################


fn <- "data/figure6.csv"
ma <- read.csv(fn, stringsAsFactors = FALSE, skip = 1, check.names=FALSE)
ma
##   Taxonomic Classification Occurences Total NISP Gua Gunung Runtuh
## 1                   Suidae          9        212                26
## 2               Testudines         14        212                 4
## 3                  Primate         18         76                24
## 4                  Pisces           7         60                NA
## 5                Cervidae          10         54                13
## 6             Varanus sp.           4         24                 1
## 7                  Bovidae          6          6                 1
##   Gua Peraling Gua Kechil Gua Tenggek Gua Sagu
## 1          152         28          NA        6
## 2           23         22          11      152
## 3           50         NA          NA        2
## 4           43          9          NA        8
## 5           16          7          NA       18
## 6           14         NA          NA        9
## 7           NA         NA           1        4
str(ma)
## 'data.frame':    7 obs. of  8 variables:
##  $ Taxonomic Classification: chr  "Suidae" "Testudines" "Primate" "Pisces " ...
##  $ Occurences              : int  9 14 18 7 10 4 6
##  $ Total NISP              : int  212 212 76 60 54 24 6
##  $ Gua Gunung Runtuh       : int  26 4 24 NA 13 1 1
##  $ Gua Peraling            : int  152 23 50 43 16 14 NA
##  $ Gua Kechil              : int  28 22 NA 9 7 NA NA
##  $ Gua Tenggek             : int  NA 11 NA NA NA NA 1
##  $ Gua Sagu                : int  6 152 2 8 18 9 4
ma$`Taxonomic Classification order` <- ma$`Taxonomic Classification`
ma$`Taxonomic Classification` <- factor(ma$`Taxonomic Classification`, 
                                 levels = rev(ma$`Taxonomic Classification order`))

ma.long <- melt(ma
                 , measure.vars = c("Total NISP", "Gua Gunung Runtuh", 
                                    "Gua Peraling", "Gua Kechil", "Gua Tenggek", 
                                    "Gua Sagu")
                 , variable.name = "Site"
                 , value.name = "NISP"
                 , na.rm = TRUE)
ma.long$Total.ind <- factor(ifelse((ma.long$Site == "Total NISP"), "Total", "Sites"))

p4 <- ggplot(ma.long, aes(x = NISP, y = `Taxonomic Classification`))
p4 <- p4 + geom_point(aes(colour = Site), alpha = 0.75, size=5)
p4 <- p4 + facet_grid(. ~ Total.ind) +  guides(colour=guide_legend(nrow=3,byrow=TRUE)) 
p4

# save plot
fig_width <- 300 # play with this number
ggsave(filename = "figures/fig_6.png",
       dpi = 300, units = "mm",
       height = fig_width/1.6, width = fig_width)
################################################################################
# CHUNK 6-Figure 7: Squared chord distance values for Thailand and Peninsular 
# Malaysian sites. S=Surface context. This group of data is arranged in its final 
# form to include eight different plots into one. For these plots, download all 
# site specific csv files "data/figure7..." in Conrad (2015), then use the function 
# list.files () to upload into R. These files include squared chord distance 
# values, quantified between contexts, for each site. The code is constructed to 
# input the values, plot each site and loop all sites together into one plot. 
################################################################################


fig7_file_names <- list.files(path = "data", pattern = "figure7", full.names = TRUE) 
fig7_data <- lapply(fig7_file_names, read.csv)
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'data/
## figure7_KT.csv'
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'data/
## figure7_LK.csv'
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'data/
## figure7_LR.csv'
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'data/
## figure7_MC.csv'
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'data/
## figure7_MKI.csv'
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'data/
## figure7_SK.csv'
## Warning in read.table(file = file, header = header, sep = sep, quote
## = quote, : incomplete final line found by readTableHeader on 'data/
## figure7_TL.csv'
pattern_to_remove <- c("figure7_", "data/", ".csv")
names(fig7_data) <- gsub(paste0(pattern_to_remove, collapse = "|"), "", fig7_file_names)
my_plots <- vector("list", length = length(fig7_data))
for(i in seq_along(fig7_data)){
  p <- ggplot(fig7_data[[i]], aes(Context, SCD, group = 1)) +
    geom_point(position = "identity", na.rm=FALSE, size=4) +
    geom_line() +
    labs(title = names(fig7_data)[i]) + xlab("") +ylab("") + ylim(0,2) +
    coord_flip() +
    theme_bw() +
    theme(axis.text.x = element_text(vjust=0.5, color="black", size=30),
          axis.text.y = element_text(vjust=0.5, color="black", size=30), 
          axis.title.y = element_text(vjust=2.0, color="black", size=30),
          axis.title.x = element_text(vjust=0.5, color="black", size=30),
          strip.text.x = element_text(size=30),
          plot.title = element_text(vjust=0.5, color="black", size=30))
  my_plots[[i]] <- p
}


# to Save it
g1 <- do.call("arrangeGrob", c(my_plots, ncol=4))
# view it
g1

# save plot
fig_width <- 600 # play with this number
ggsave(filename = "figures/fig_7.png", g1,
       dpi = 300, units = "mm",
       height = fig_width, width = fig_width)