# DiebackGenoVar.R
#
# Load the SNP genotype table and the sample metadata.
# Both files are read relative to the current working directory; the former
# `setwd("./")` call was a no-op and has been dropped.
#
# 510Mb with 11* 20k low filtered SNPs.
#https://drive.google.com/file/d/11u73E9TXYNt3Q_an4thZch7AWWYY56It/view?usp=sharing
dieback <- read.csv("2000-ChrALL.csv", na.strings = ".") # "." marks a missing call
dieback.meta <- read.csv("DiebackMeta.csv")

# The first five columns hold the SNP name columns: keep a copy of them, then
# strip them so that `dieback` contains genotype columns only.
dieback.snp <- dieback[, 1:5]
dieback <- dieback[, -(1:5)]
dieback[1:5, 1:5] # quick peek at the top-left corner of the genotype table

# Have a look at the raw data: first (up to) 2000 SNP rows across all samples.
# head() is used so a table with fewer than 2000 rows is not padded with NA rows.
image(as.matrix(head(dieback, 2000)), xlab = "SNPs", ylab = "samples")

# Cut bad samples: count the non-missing SNP calls in every sample (column)
snp.call.per.sample <- colSums(!is.na(dieback))

hist(snp.call.per.sample, breaks = 50)
threshold <- 30000 # minimum number of called SNPs for a sample to be kept
abline(v = threshold)
# drop = FALSE keeps a data frame even if only one sample passes the threshold
dieback <- dieback[, snp.call.per.sample > threshold, drop = FALSE]
dim(dieback)

# Cut bad SNPs: keep only SNPs (rows) whose number of called samples lies
# strictly between the two cut-offs below.
samp.call.per.snp <- ncol(dieback) - rowSums(is.na(dieback))
hist(samp.call.per.snp)

low.cut <- 200   # snps mostly missing
high.cut <- 850  # repetitive, overcalled
abline(v = c(high.cut, low.cut))

keepers <- (samp.call.per.snp > low.cut) & (samp.call.per.snp < high.cut)
table(keepers)
dieback <- dieback[keepers, ]
# Genotype filters: per-SNP counts of each genotype code (0, 1, 2) and of all
# non-missing calls, used below as proportions of the called samples.
snp0 <- rowSums(dieback == 0, na.rm = TRUE) # very few homozygous ref
snp1 <- rowSums(dieback == 1, na.rm = TRUE) # some surplus hets, likely repetitive to exclude
snp2 <- rowSums(dieback == 2, na.rm = TRUE) # many homozygous alt, ref is rare, need to exclude
snpT <- rowSums(!is.na(dieback))

hist(snp2 / snpT, breaks = 20)
homo.alt <- 0.9
abline(v = homo.alt, col = "red")

hist(snp0 / snpT, breaks = 50) # this is what we expect,  SNPs are not called when they are the same as ref
# NOTE(review): homo.ref is only drawn on the histogram; it is not part of the
# keep.allele filter below - confirm that this is intended.
homo.ref <- 0.025
abline(v = homo.ref)

hist(snp1 / snpT, breaks = 20)
het <- 0.4
abline(v = het)

# Keep SNPs whose het fraction and homozygous-alt fraction are both below their
# cut-offs (het should be less than 50% max).
keep.allele <- snp1 / snpT < het & snp2 / snpT < homo.alt
# Only the summary table is printed; the bare `keep.allele` statement that used
# to follow dumped the whole logical vector to the console.
table(keep.allele)
#keep.allele
#FALSE  TRUE
#79076 30602
dieback <- dieback[keep.allele, ]

# old method
#kinship <- cor(dieback,use = "pair")^2

# Pairwise distance between samples (columns of dieback), stored as a full
# square matrix.
# big distance mat cal! takes time!!
kin <- as.matrix(dist(t(dieback)))
save(kin, file = "dieback.kin.RData", compress = TRUE)
save(dieback, file = "dieback.RData", compress = TRUE)

# If already created, start from here instead of recomputing.
# The file names must match the save() calls above exactly: the previous
# ".Rdata" spelling fails on case-sensitive file systems.
load("dieback.kin.RData")
load("dieback.RData")

#


# ---- Sample metadata, species clean-up, optional subsetting, clustering ----
# The metadata lookup now runs BEFORE the species clean-up. Previously
# `species.dieback` was used here but only defined further down, so a fresh
# top-to-bottom run failed with "object 'species.dieback' not found".

# kin must describe exactly the samples (columns) currently in dieback
stopifnot(ncol(dieback) == nrow(kin), ncol(dieback) == ncol(kin))

# Get sample names for analysis: the text before the first "_S" in each
# genotype column name. Taking the first piece per name stays aligned even if a
# name does not split into exactly two pieces.
samp.names <- vapply(strsplit(names(dieback), split = "_S"), `[`, character(1), 1L)

# Row of the metadata table belonging to each genotype column (NA if no match)
#meta.samp <- match(dieback.meta$sample, samp.names)
m.samp <- match(samp.names, dieback.meta$sample)
species.dieback <- dieback.meta$Species.ID[m.samp]

# Clean up species names and store in species.clean.dieback
table(species.dieback, exclude = NULL)
# as.character() so the replacements also work if Species.ID was read as a factor
species.clean.dieback <- as.character(species.dieback)
species.clean.dieback[which(species.dieback == "Eucalyptus pauciflora subsp niphophila")] <- "E. niphophila"
species.clean.dieback[which(species.dieback == "Eucalyptus pauciflora subsp pauciflora")] <- "E. pauciflora"
# NOTE(review): "Eucalptus" is kept exactly as written - presumably it matches
# the spelling used in the metadata file; confirm.
species.clean.dieback[which(species.dieback == "Eucalptus pauciflora hybrid pauc x niph")] <- "E. pauciflora x E. niphophila"
table(species.clean.dieback, exclude = NULL)

# Then index the samples you want to keep, e.g. pauci.
# For just pauci and outgroup, reduce dieback and kin accordingly.
# Set subset.to.pauci to FALSE to keep all samples; pauci.index then selects
# every sample, so the code below works unchanged either way.
subset.to.pauci <- TRUE
if (subset.to.pauci) {
  pauci.index <- which(species.clean.dieback %in% c("E. pauciflora", "E. niphophila", "E. stellulata", "E. pauciflora x E. niphophila"))
} else {
  pauci.index <- seq_along(species.clean.dieback)
}
if (length(pauci.index) < 2) {
  stop("Fewer than two samples selected; cannot cluster.", call. = FALSE)
}

dim(kin)
dieback <- dieback[, pauci.index, drop = FALSE]
kin <- kin[pauci.index, pauci.index, drop = FALSE]
dim(kin)

# Keep the per-sample vectors aligned with the reduced dieback / kin
samp.names <- samp.names[pauci.index]
m.samp <- m.samp[pauci.index]
species.dieback <- species.dieback[pauci.index]
species.clean.dieback <- species.clean.dieback[pauci.index]

# Hierarchical clustering of the (subsetted) distance matrix, and the distance
# matrix redrawn in clustering order.
dieback.clust <- hclust(as.dist(kin))

image(kin[dieback.clust$order, dieback.clust$order])

# Per-sample metadata and plot labels, in the same order as names(dieback)
sample.dieback <- dieback.meta$sample[m.samp]
batch.dieback <- as.numeric(factor(dieback.meta$run[m.samp]))
lab.name <- paste(batch.dieback, species.clean.dieback, names(dieback))


# Geographic distance between samples, compared with the genetic distance.
# install.packages("geodist")
library(geodist)

# Two-column coordinate matrix (longitude, latitude), one row per sample
long.lat <- cbind(
  dieback.meta$Longtitude[m.samp],
  dieback.meta$Latitude[m.samp]
)
geo.mat <- geodist(long.lat)

image(geo.mat)
# geo.mat is divided by 1000 so the axes read in km
hist(geo.mat / 1000, xlab = "km apart")
# doesn't make sense across subgenera
plot(
  geo.mat / 1000, kin,
  pch = ".",
  xlim = c(0, 50),
  xlab = "km"
)

# something like this?
#dend <- color_labels(as.dendrogram(dieback.clust, labels = lab.name),col = batch.dieback )

# Plain hclust tree written to a wide PDF, labelled batch / species / column name
pdf("dieback.hclust220.pauci.pdf", width = 26)
plot(dieback.clust, cex = 0.2, labels = lab.name)
#plot(dend)
dev.off()

#install.packages('dendextend')
library(dendextend)

# Colour by batch: one colour index per batch (offset by 1), samples without a
# batch get colour 1.
lab.col <- as.numeric(factor(batch.dieback)) + 1
lab.col[is.na(lab.col)] <- 1

# Build the dendrogram BEFORE plotting it; the earlier plot(dend) call ran
# before `dend` existed and has been removed.
dend <- as.dendrogram(dieback.clust)
# as.dendrogram() ignores a `labels` argument for hclust objects, so the labels
# are assigned explicitly. Labels and colours are both given in leaf order.
labels(dend) <- paste(species.clean.dieback, names(dieback))[order.dendrogram(dend)]
labels_colors(dend) <- lab.col[order.dendrogram(dend)]
plot(dend)


# Squared pairwise correlation between the first (up to) 400 SNPs, computed
# across samples using pairwise-complete observations.
# head() avoids NA rows when fewer than 400 SNPs remain after filtering.
ld.block <- cor(t(head(dieback, 400)), use = "pairwise.complete.obs")^2

dim(ld.block)
image(ld.block)


# Dendrogram with leaf labels coloured by cleaned species name.
# install.packages("dendextend")
library(dendextend)
library(Polychrome)

# Colour palette with 19 entries: 32 Glasbey colours with 13 positions left out
palette19 <- glasbey.colors(32)[setdiff(1:32, c(1, 5, 10, 19, 21, 25:32))]
pie(rep(1, 19), col = palette19)

# One integer code per species class; samples with NA species get code 19
colors_to_use <- as.numeric(as.factor(species.clean.dieback))
colors_to_use[is.na(colors_to_use)] <- 19
# Shuffle the palette, then look up each sample's colour by its species code
hues_to_use <- sample(palette19)[colors_to_use]
table(hues_to_use)

# Create a dendrogram object so it can be coloured with dendextend, then put
# the colour and label vectors into dendrogram (leaf) order.
dend <- as.dendrogram(dieback.clust)
leaf.order <- order.dendrogram(dend)
colors_to_use <- colors_to_use[leaf.order]
hues_to_use <- hues_to_use[leaf.order]
lab.name <- paste(species.clean.dieback, names(dieback))[leaf.order]
labs_to_use <- lab.name
# paste(sample.dieback," ",species.clean.dieback)[order.dendrogram(dend)]

# Now create the output
pdf("dieback.hclust.200k.with_sp_cols.pauci.pdf", width = 30)

labels_cex(dend) <- 0.2
labels(dend) <- labs_to_use
labels_colors(dend) <- hues_to_use
plot(dend)
dev.off()


# To create a circularized dendrogram from `dend`, reusing the leaf labels
# (labs_to_use) and leaf colours (hues_to_use) prepared earlier in the script.
#install.packages("circlize")
library(circlize)
# Needs unique labels (per the original note), so fix: every label that
# duplicates an earlier one is replaced by its position index in the vector.
labs_to_use[duplicated(labs_to_use)]<-which(duplicated(labs_to_use))
labels(dend)<-labs_to_use


# Close the currently active graphics device before drawing.
# NOTE(review): dev.off() errors when no device is open - confirm a device is
# always active at this point when the script is run top to bottom.
dev.off()
#png("dieback.hclust220circle.with_sp_cols.png",width=10,height=10,units="cm",res=5000)
# Found it better to draw in the plot window and then save a pdf from there,
# rather than outputting directly to a pdf or image file.
# Zero plot margins and zero circos padding/track margins.
par(mar=c(0,0,0,0),oma=c(0,0,0,0))
circos.par(cell.padding = c(0, 0),track.margin=c(0,0))
labels_cex(dend)<-.1
labels(dend)<-labs_to_use
labels_colors(dend)<-hues_to_use
# First variant: facing = "inside".
circlize_dendrogram(dend,labels=TRUE,labels_track_height = .2, dend_track_height = 0.5,branches_lwd=.1,facing="inside",track.height=.2)
# or the second variant: facing = "outside".
# NOTE(review): both calls run back to back; confirm whether the circos.par()
# settings above still apply to the second call.
circlize_dendrogram(dend,labels=TRUE,labels_track_height = .2, dend_track_height = 0.5,branches_lwd=.1,facing="outside",track.height=.2)

# Close the device used for the circular plot.
dev.off()


# Curating species on outliers: list the samples whose distance to the first
# sample (row 1 of kin) exceeds the threshold.
hist(kin)
thresh <- 120
far.from.first <- which(kin[1, ] > thresh)

kin[far.from.first, 1:5]               # their distances to the first five samples
rownames(kin)[far.from.first]          # their sample names
species.clean.dieback[far.from.first]  # their cleaned species labels