WGCNA - k821209/pipelines GitHub Wiki

데이터 클린업 및 준비

# Set up the working directory.
getwd()
workingDir <- "."
setwd(workingDir)

library(WGCNA)

# Keep strings as character vectors rather than factors when data frames
# are built.
options(stringsAsFactors = FALSE)


# Load the two input tables (one per data set).
femData <- read.csv("LiverFemale3600.csv")
maleData <- read.csv("LiverMale3600.csv")

# Quick look at the dimensions and column names of both tables
# (caution: the name listings are longish).
dim(femData)
names(femData)
dim(maleData)
names(maleData)

# Number of data sets (two sample groups here).
nSets <- 2

# Display names for each set; used later in plot titles and legends.
setLabels <- c("Female liver", "Male liver")
shortLabels <- c("Female", "Male")

# Build the multi-set expression list. In the input tables the actual
# expression values start at column 9, so columns 1:8 are dropped and the
# remainder is transposed before being stored as `data`.
multiExpr <- vector(mode = "list", length = nSets)

multiExpr[[1]] <- list(data = as.data.frame(t(femData[-c(1:8)])))
# Column names come from the substanceBXH column of the input table.
names(multiExpr[[1]]$data) <- femData$substanceBXH
# Row names are the original column headers of the retained columns.
rownames(multiExpr[[1]]$data) <- names(femData)[-c(1:8)]

multiExpr[[2]] <- list(data = as.data.frame(t(maleData[-c(1:8)])))
names(multiExpr[[2]]$data) <- maleData$substanceBXH
rownames(multiExpr[[2]]$data) <- names(maleData)[-c(1:8)]

# Check that the data has the correct format for many functions operating on
# multiple sets.
exprSize <- checkSets(multiExpr)

# Check that all genes and samples have sufficiently low numbers of missing
# values; anything flagged as bad is removed below.
gsg <- goodSamplesGenesMS(multiExpr, verbose = 3)
gsg$allOK

if (!gsg$allOK) {
  # Print information about the removed genes:
  if (sum(!gsg$goodGenes) > 0) {
    printFlush(paste("Removing genes:",
                     paste(names(multiExpr[[1]]$data)[!gsg$goodGenes],
                           collapse = ", ")))
  }
  for (set in seq_len(exprSize$nSets)) {
    if (sum(!gsg$goodSamples[[set]]) > 0) {
      printFlush(paste("In set", setLabels[set], "removing samples",
                       paste(rownames(multiExpr[[set]]$data)[!gsg$goodSamples[[set]]],
                             collapse = ", ")))
    }
    # Remove the offending genes and samples
    multiExpr[[set]]$data <-
      multiExpr[[set]]$data[gsg$goodSamples[[set]], gsg$goodGenes]
  }
  # Update exprSize
  exprSize <- checkSets(multiExpr)
}

# Cluster the samples of each set separately (average-linkage hierarchical
# clustering on the distance matrix of the rows of `data`).
sampleTrees <- list()
for (set in seq_len(nSets)) {
  sampleTrees[[set]] <- hclust(dist(multiExpr[[set]]$data), method = "average")
}

# Visualize the clustering results.
# pdf(file = "Plots/SampleClustering.pdf", width = 12, height = 12);
# If the line above fails with
# "cannot open file './Plots/SampleClustering.pdf'", write to the working
# directory instead, as done here.
pdf("test.pdf", width = 12, height = 12)
par(mfrow = c(2, 1))
par(mar = c(0, 4, 2, 0))
for (set in seq_len(nSets)) {
  plot(sampleTrees[[set]],
       main = paste("Sample clustering on all genes in", setLabels[set]),
       xlab = "", sub = "", cex = 0.7)
}
dev.off()

# Choose the tree cut height by inspecting the dendrogram plot.
# Choose the "base" cut height for the female data set
baseHeight <- 16

# One cut height per set.
# Adjust the cut height for the male data set for the number of samples
cutHeights <- c(baseHeight,
                baseHeight * exprSize$nSamples[2] / exprSize$nSamples[1])

# Re-plot the dendrograms including the cut lines.
# NOTE(review): the original author observed that the cut line does not show
# up for the second set; lowering its cutHeights value makes it visible.
# Create the output directory first so pdf() can open the file.
dir.create("Plots", showWarnings = FALSE)
pdf(file = "Plots/SampleClustering.pdf", width = 12, height = 12)
par(mfrow = c(2, 1))
par(mar = c(0, 4, 2, 0))
for (set in seq_len(nSets)) {
  plot(sampleTrees[[set]],
       main = paste("Sample clustering on all genes in", setLabels[set]),
       xlab = "", sub = "", cex = 0.7)
  abline(h = cutHeights[set], col = "red")
}
dev.off()


# 
# Drop outlier samples from every set using the cut heights chosen earlier.
for (set in seq_len(nSets)) {
  # Find clusters cut by the line
  labels <- cutreeStatic(sampleTrees[[set]], cutHeight = cutHeights[set])
  # Keep the largest one (labeled by the number 1)
  keep <- (labels == 1)
  multiExpr[[set]]$data <- multiExpr[[set]]$data[keep, ]
}
collectGarbage()

# Check the size of the leftover data
exprSize <- checkSets(multiExpr)
exprSize

# Data input is complete: save it for the following parts.
# nGenes and nSamples are never assigned earlier in this script, so derive
# them from exprSize here; otherwise save() stops on the missing objects.
nGenes <- exprSize$nGenes
nSamples <- exprSize$nSamples

# Traits is not created by this script either. It is saved only when it
# already exists in the workspace (e.g. loaded separately by the user).
saveNames <- c("multiExpr", "nGenes", "nSamples", "setLabels", "shortLabels",
               "exprSize")
if (exists("Traits")) {
  saveNames <- c("Traits", saveNames)
}
save(list = saveNames, file = "Consensus-dataInput.RData")

Network construction and consensus module detection

# Register the working directory.
getwd()
workingDir <- "."
setwd(workingDir)

library(WGCNA)

# The following setting is important, do not omit.
options(stringsAsFactors = FALSE)

# Allow multi-threading within WGCNA.
# Caution: skip this line if you run RStudio or other third-party R
# environments.
enableWGCNAThreads()

# Load the data written in part 1.
lnames <- load(file = "Consensus-dataInput.RData")

# load() restores the saved objects into the workspace under their own names;
# lnames only holds the list of those names, not the data itself.
lnames

# Number of sets that were prepared.
nSets <- checkSets(multiExpr)$nSets


## Network construction
# Choosing the soft-thresholding power: candidate powers to try.
powers <- c(seq(4, 10, by = 1), seq(12, 20, by = 2))

# Pre-allocate one slot per set; each is filled in the loop below.
powerTables <- vector(mode = "list", length = nSets)

# Call the network topology analysis function for each set in turn.
# pickSoftThreshold() is given the candidate powers and the expression data;
# the second element of its result is stored as `data`
# (presumably the fit-indices table - confirm against the WGCNA docs).
for (set in seq_len(nSets)) {
  powerTables[[set]] <- list(
    data = pickSoftThreshold(multiExpr[[set]]$data,
                             powerVector = powers,
                             verbose = 2)[[2]]
  )
}
collectGarbage()

# Visualization settings: one colour per set.
colors <- c("black", "red")

# Will plot these columns of the returned scale free analysis tables
plotCols <- c(2, 5, 6, 7)
colNames <- c("Scale Free Topology Model Fit", "Mean connectivity",
              "Median connectivity", "Max connectivity")

# Get the minima and maxima of the plotted points across all sets.
# Row 1 holds the minimum and row 2 the maximum; one column per plotted
# quantity. Entries start as NA and are folded in with na.rm = TRUE.
ylim <- matrix(NA, nrow = 2, ncol = length(plotCols))
for (set in seq_len(nSets)) {
  for (col in seq_along(plotCols)) {
    ylim[1, col] <- min(ylim[1, col],
                        powerTables[[set]]$data[, plotCols[col]],
                        na.rm = TRUE)
    ylim[2, col] <- max(ylim[2, col],
                        powerTables[[set]]$data[, plotCols[col]],
                        na.rm = TRUE)
  }
}


# Plot the quantities in the chosen columns vs. the soft thresholding power
sizeGrWindow(8, 6)
pdf("scaleFreeAnalysis.pdf", width = 8, height = 6)
par(mfcol = c(2, 2))
par(mar = c(4.2, 4.2, 2.2, 0.5))
cex1 <- 0.7
for (col in seq_along(plotCols)) {
  for (set in seq_len(nSets)) {
    if (set == 1) {
      # The first set opens an empty panel (type = "n") with shared y limits;
      # the points of every set are then added as text labels.
      plot(powerTables[[set]]$data[, 1],
           -sign(powerTables[[set]]$data[, 3]) * powerTables[[set]]$data[, 2],
           xlab = "Soft Threshold (power)", ylab = colNames[col], type = "n",
           ylim = ylim[, col], main = colNames[col])
      addGrid()
    }
    if (col == 1) {
      # First panel: column 2 multiplied by the negated sign of column 3.
      text(powerTables[[set]]$data[, 1],
           -sign(powerTables[[set]]$data[, 3]) * powerTables[[set]]$data[, 2],
           labels = powers, cex = cex1, col = colors[set])
      legend("bottomright", legend = setLabels, col = colors, pch = 20)
    } else {
      # Remaining panels: the selected column as-is.
      text(powerTables[[set]]$data[, 1],
           powerTables[[set]]$data[, plotCols[col]],
           labels = powers, cex = cex1, col = colors[set])
      legend("topright", legend = setLabels, col = colors, pch = 20)
    }
  }
}
dev.off()

# Based on the plot above, power 6 is chosen and the consensus network is
# built with it.
net <- blockwiseConsensusModules(
  multiExpr,
  power = 6,
  minModuleSize = 30,
  deepSplit = 2,
  pamRespectsDendro = FALSE,
  mergeCutHeight = 0.25,
  numericLabels = TRUE,
  minKMEtoStay = 0,
  saveTOMs = TRUE,
  verbose = 5
)

# Network visualization: show the modules as colours under the gene tree.
consMEs <- net$multiMEs
moduleLabels <- net$colors
# Convert the numeric labels to color labels
moduleColors <- labels2colors(moduleLabels)
# Use the first dendrogram stored in the result.
consTree <- net$dendrograms[[1]]

sizeGrWindow(8, 6)
pdf("ConsensusDendrogram-auto.pdf", width = 8, height = 6)
plotDendroAndColors(consTree, moduleColors,
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Consensus gene dendrogram and module colors")

dev.off()

# Save the network construction results for the following parts.
save(consMEs, moduleLabels, moduleColors, consTree,
     file = "Consensus-NetworkConstruction-auto.RData")

sample group specific module 가져오기

# Working directory and package setup.
getwd()
workingDir <- "."
setwd(workingDir)
library(WGCNA)

# The following setting is important, do not omit.
options(stringsAsFactors = FALSE)

# Load the data saved in the first part; lnames receives the names of the
# loaded variables.
lnames <- load(file = "Consensus-dataInput.RData")
lnames

# Load the results of network analysis, tutorial part 2.a
lnames <- load(file = "Consensus-NetworkConstruction-auto.RData")
lnames