#----------------------------------------------------------------
# genetic stratification
#----------------------------------------------------------------
# this script calculates:
# (1) fraction of genetic variance explained by each multidimennsional scaling (MDS) dimension
# (2) MDS plot
# files required
# *.mds file (from PLINK)
# a file with phenotypic information (e.g. breed, affection status, sex, etc.), with FID and IID fields
# -------- edit this ---------------------
# specify *.mds and file path
mds.path <- "D:/jqo/Rscripts/disease.mds"
pheno.path <- "D:/jqo/Rscripts/gwas.pheno"
# ----------------------------------------
#----------------------------------------------------------------
# prepare files
#----------------------------------------------------------------
# import
mds.file <- read.delim(mds.path, sep = "")
pheno.file <- read.delim(pheno.path, sep = "")
# merge files
mds <- merge(pheno.file, mds.file, by = c("FID", "IID"))
# number of samples and columns where MDS dimensions start
samples <- length(mds$IID)
# pheno.file adds n colums + 1 from the "SOL" field in the mds.file
start.dimensions <- ncol(pheno.file) + 2
# variance for each dimension
var.mds <- data.frame(apply(mds[, start.dimensions:(start.dimensions + samples - 1)], 2,
function(x)
{c(variance = var(x, na.rm = TRUE))}))
names(var.mds) <- "varC"
# total variance
all.var.mds <- sum(var.mds[, 1])
# relative variance
var.mds <- transform(var.mds, rel.varC = varC / all.var.mds)
#----------------------------------------------------------------
# plots
#----------------------------------------------------------------
# relative genetic fraction
windows()
plot(var.mds[, 2] * 100,
type = "b",
ylab = "Relative genetic fraction (%)",
xlab = "MDS dimension (Ci)")
# MDS plot for C1 and C2
rel.var.C1 <- paste("C1 (", round(var.mds[1, 2], digit = 3) * 100 , "%)")
rel.var.C2 <- paste("C2 (", round(var.mds[2, 2], digit = 3) * 100 , "%)")
windows()
plot(mds$C1, mds$C2,
col = mds$disease,
pch = 16,
xlab = rel.var.C1,
ylab = rel.var.C2)