#we download the data from https://tcga-data.nci.nih.gov/docs/publications/gbm_exp/
# for 202 samples

type = read.table("E:/backup (2017.7.10)/from desktop/xiaoying/realdata/Broad202.txt", sep = "\t", header = TRUE, 
                    check.names = FALSE, stringsAsFactors = FALSE)
type = read.table("E:/backup (2017.7.10)/from desktop/xiaoying/realdata/UNC202.txt", sep = "\t", header = TRUE, 
                  check.names = FALSE, stringsAsFactors = FALSE)

#type = read.table("E:/backup (2017.7.10)/from desktop/xiaoying/realdata/LBL202.txt", sep = "\t", header = TRUE, 
                  #check.names = FALSE, stringsAsFactors = FALSE)
dim(type)
head(type)
type[3217,]
vc20 <- c("EGFR","PDGFRA","FGFR3","PIK3C2B","PIK3R1","PIK3R3","PIK3IP1","AKTIP","NFIB","CDKN3","CDK4","CDKN1A","CDKN2C","CCND2","CASP1","CASP4","RASGRP3",
          "RRAS","IDH1","FOXM1")

indices <- which(type[,1] == "EGFR")

indices <- which(type[,1] == vc20)

gene20 <- matrix(0,202,20)
dim(gene20)
for (i in 1:20)
{
  indices <- which(type[,1] == vc20[i])
  gene20[,i] <- as.numeric(type[indices,][,2:203])
}
gene20

#length(as.numeric(type[indices,][,2:202]))

p = 20
n = 202

#center_colmeans <- function(x) {
#  xcenter = colMeans(x)
#  x - rep(xcenter, rep.int(nrow(x), ncol(x)))
#}

#subset.centered <- center_colmeans(gene20)

x <- as.matrix(gene20)

dim(x)
cov(x)
solve(cov(x))
max(solve(cov(x)))
min(solve(cov(x)))

color20 <- color[[1]]
color20

##########

vc10 <- c("TP53","PTEN","NF1","EGFR","IDH1","PIK3R1","RB1","ERBB2","PIK3CA","PDGFRA")

gene10 <- matrix(0,202,10)
dim(gene10)
for (i in 1:10)
{
  indices <- which(type[,1] == vc10[i])
  gene10[,i] <- as.numeric(type[indices,][,2:203])
}
gene10
p = 10
n = 173

#center_colmeans <- function(x) {
#  xcenter = colMeans(x)
#  x - rep(xcenter, rep.int(nrow(x), ncol(x)))
#}

#subset.centered <- center_colmeans(gene10)
x <- as.matrix(gene10)

cov(x)
solve(cov(x))
max(solve(cov(x)))
min(solve(cov(x)))

color10 <- color[[1]]
color10

####################################

s11861 = read.table("E:/backup (2017.7.10)/from desktop/xiaoying/realdata/unifiedScaled.txt", sep = "\t", header = TRUE, 
                     check.names = FALSE, stringsAsFactors = FALSE)


x1 = s11861

mydata <- matrix(0,dim(x1)[2],dim(x1)[1])
for(i in 1:n)
{
  mydata[i,] = as.numeric(x1[,i])  
}

dim(mydata)
n=202
p=50

set.seed(1000)

ccr <- sample(1:11861,p,replace=F)
ccr

x <- mydata[,ccr]

dim(x)
str(x)
cov(x)
solve(cov(x))

######################################


#x <- gene10

#indices <- which(type[,1] == "A2BP1")
#indices

#for 840 genes and 173 samples

subtype = read.table("E:/backup (2017.7.10)/from desktop/xiaoying/realdata/TCGA_unified_CORE_ClaNC840.txt", sep = "\t", header = TRUE, 
                     check.names = FALSE, stringsAsFactors = FALSE)

subtype = read.table("TCGA_unified_CORE_ClaNC840.txt", sep = "\t", header = TRUE, 
                     check.names = FALSE, stringsAsFactors = FALSE)
getwd()
dir()
#setwd("E:/backup (2017.7.10)/from desktop/xiaoying/realdata/") 

dim(subtype)
head(subtype)
subtype[1:20,1:3]
subtype[1,2:10]

realdata <- subtype[c(-1),c(-1,-2)]
dim(realdata)
head(realdata)

n = 173
p = 100

x1 <- realdata[1:840,1:n]
dim(x1)
#str(x1)
mydata <- matrix(0,dim(x1)[2],dim(x1)[1])
for(i in 1:n)
{
mydata[i,] = as.numeric(x1[,i])  
}

set.seed(1000)

ccr <- sample(1:840,p,replace=F)
ccr
subtype[2:840,1][569]
vcrandom <- subtype[2:840,1][ccr] # the names of selected genes
mydata[,569]
subGSE <- mydata[,ccr] ## the subset data I am going to do the clustering

x <- as.matrix(subGSE)
x <- exp(x) # the real data is log (see verhaak 2010 p107)

dim(x)
str(x)
cov(x)
solve(cov(x))

v <- MBVB(solve(cov(x)),0.5)
v[[1]]

colorrandom <- color[[1]]
colorrandom

write.table(color[[1]], file="E:/backup (2017.7.10)/from desktop/xiaoying/simulations/gene100.txt", append=T, row.names=FALSE, col.names=FALSE, sep=" ",eol = "\n")
colorrandom <- read.table("E:/backup (2017.7.10)/from desktop/xiaoying/simulations/gene100.txt", header=F)
colorrandom <- as.matrix(colorrandom)
write.table(color[[1]], file="gene100.txt", append=T, row.names=FALSE, col.names=FALSE, sep=" ",eol = "\n")
colorrandom100 <- read.table("gene100.txt", header=F)
colorrandom100 <- as.matrix(colorrandom100)





#the end








# define the filename
filename <- "Sonar.csv"
# load the CSV file from the local directory
dataset <- read.csv("E:/backup (2017.7.10)/from desktop/xiaoying/realdata/realdata_sonar/sonar.all-data", header=FALSE)
# preview the first 5 rows
head(dataset)
dim(dataset)
dataset[,61]
subset <- dataset[,1:60]
head(subset)
dim(subset)
p <- dim(subset)[2]
n <- dim(subset)[1]

center_colmeans <- function(x) {
  xcenter = colMeans(x)
  x - rep(xcenter, rep.int(nrow(x), ncol(x)))
}

subset.centered <- center_colmeans(subset)
x <- as.matrix(subset.centered)
head(x)
dim(x)
cov(x)
solve(cov(x))

write.table(best, file="realdata.txt", append=T, row.names=FALSE, col.names=FALSE, sep=" ",eol = "\n")
realeatimated <- read.table("realdata.txt", header=F)
realeatimated


cov(subset)
ek <- solve(cov(x))
max(ek)
min(ek)








source("http://bioconductor.org/biocLite.R")

biocLite()
install.packages("GSVAdata")

#install.packages("scales")
#library(scales)
#subset.rescaled <- subset-min(subset)/(max(subset)-min(subset))
#head(subset.rescaled)


ek <- solve(cov(subset.centered))
dim(ek)
max(ek)
min(ek)