首页 > 美文鉴赏

GEO数据挖掘-第二期-三阴性乳腺癌（TNBC）

更新时间:2023-07-07 06:14:12 阅读：评论：0

GEO数据挖掘-第二期-三阴性乳腺癌（TNBC）这个数据集一共有三篇文章对它进行了数据挖掘：

1. PMID: 25208879

2. PMID: 26921331

3. PMID: 30175120

我们今天实践最后一个。

文章标题

Identification of Key Genes and Pathways in Triple-Negative Breast Cancer by Integrated Bioinformatics Analysis

关键词

三阴性乳腺癌

疾病：

疾病：三阴性乳腺癌

不表达下面的受体

1.estrogen receptor (ER)

2.progesterone receptor(PR)

3.human epidermal growth factor receptor 2 (Her2)

◆◆◆◆◆

数据库编号：GSE76275

GEO数据库编号

实验设计

实验组：198个三阴性乳腺癌肿瘤样本

对照组：67个非三阴性乳腺癌肿瘤样本

◆◆◆◆◆

GEO数据挖掘过程

第一步.

R包下载安装：跳过---大家可回看第一期

第二步.

数据集下载：跳过---大家可回看第一期

第三步.

提取挖掘的数据集：

比较198个三阴性乳腺癌肿瘤样本和67个非三阴性乳腺癌肿瘤样本

library("GEOquery")

## Pull the expression matrix and the sample annotation table out of the
## series object.
## NOTE(review): `gt` is not created in this section; it is presumably the
## list produced by the download step that the article skips -- confirm it is
## loaded before running this part.
gt <- gt[[1]]
exprSet <- exprs(gt)
pdata <- pData(gt)
chl <- ncol(pdata)
# Column 67 of the sample table is read as the group label.
# NOTE(review): hard-coded position; verify it against colnames(pdata).
group_list <- as.character(pdata[[67]])

## Quick look at what was extracted.
dim(exprSet)
exprSet[seq_len(5), seq_len(5)]
table(group_list)

## Select the study samples: columns whose label contains 'not TN' go first,
## every remaining column is treated as TN and appended after them.
not_tn_columns <- grep("not TN", group_list)
not_TN_expr <- exprSet[, not_tn_columns]
already_taken <- colnames(exprSet) %in% colnames(not_TN_expr)
TN_expr <- exprSet[, !already_taken]
exprSet <- cbind(not_TN_expr, TN_expr)

## Rebuild the group labels so they follow the new column order.
group_list <- rep(
  c("not_TN", "TN"),
  times = c(ncol(not_TN_expr), ncol(TN_expr))
)

## Sanity checks on the reordered data.
dim(exprSet)
exprSet[seq_len(5), seq_len(5)]
table(group_list)

save(exprSet, group_list, file = "exprSet_by_group.Rdata")

第四步.

对平台文件的注释数据处理

GPL570的一个探针对应了多个基因。

## Build the probe -> gene lookup table from the platform annotation stored
## in the feature data of the series object.
GPL <- gt@featureData@data
colnames(GPL)
View(GPL)

## Keep columns 1 and 11 of the annotation and drop probes whose second
## column (the gene annotation) is empty or missing.
## NOTE(review): columns 1 and 11 are presumably the probe ID and the gene
## symbol for this platform -- confirm with colnames(GPL).
ids <- GPL[, c(1, 11)]
ids <- ids[!is.na(ids[, 2]) & ids[, 2] != "", ]

## One probe can be annotated with several genes separated by ' /// ';
## expand those into one (id, gene) row per gene.
library("plyr")
a <- strsplit(as.character(ids[, 2]), " /// ")
# as.character() guarantees that mapply() names the result by probe ID, and
# SIMPLIFY = FALSE guarantees a list even when every probe has the same
# number of genes; ldply() then always yields the columns (.id, X1, X2).
tmp <- mapply(cbind, as.character(ids[, 1]), a, SIMPLIFY = FALSE)
df <- ldply(tmp, data.frame)
ID2gene <- df[, 2:3]
colnames(ID2gene) <- c("id", "gene")
dim(ID2gene)

save(ID2gene, file = "ID2gene.Rdata")

第五步.

去除没有基因注释的探针

这个数据集不需要进行log处理。

取相同基因的表达数据的最大值这一步，运行时间长，大家可以想下有没有优雅的代码解决这个问题。

## Keep only the probes that appear in the lookup table, then align ID2gene
## row by row with the expression matrix. match() returns the first hit, so
## a probe annotated with several genes is labelled with its first gene.
exprSet <- exprSet[rownames(exprSet) %in% ID2gene[, 1], ]
ID2gene <- ID2gene[match(rownames(exprSet), ID2gene[, 1]), ]

dim(exprSet)
dim(ID2gene)
# Genes covered by the largest number of probes.
tail(sort(table(ID2gene[, 2])), n = 12L)

## When several probes map to the same gene, keep the probe with the highest
## mean expression across samples, then use gene names as row names.
MAX <- by(
  exprSet, ID2gene[, 2],
  function(x) rownames(x)[which.max(rowMeans(x))]
)
MAX <- as.character(MAX)
exprSet <- exprSet[rownames(exprSet) %in% MAX, ]
rownames(exprSet) <- as.character(
  ID2gene[match(rownames(exprSet), ID2gene[, 1]), 2]
)

dim(exprSet)
exprSet[1:5, 1:5]

save(exprSet, group_list, file = "final_exprSet.Rdata")

这步结束我们就得到了最后的数据集。

第六步.

PCA图

## PCA of the samples coloured by group, written to 'pca_plot.png'.
# autoplot() needs the prcomp method from ggfortify; nothing earlier in this
# script loads it (ggfortify also attaches ggplot2, which provides theme_bw()).
library("ggfortify")

# Samples become rows and genes become columns; the group label is appended
# as the last column so autoplot() can colour by it.
data <- as.data.frame(t(exprSet))
data$group <- group_list

png("pca_plot.png", res = 100)
# print() makes sure the plot is drawn even when the script is source()d.
print(
  autoplot(
    prcomp(data[, seq_len(ncol(data) - 1L)]),  # every column except 'group'
    data = data, colour = "group"
  ) + theme_bw()
)
dev.off()

第七步.

差异分析-热图

library("limma")

## Design matrix without intercept: one indicator column per group, named
## after the factor levels; rows are named after the samples.
design <- model.matrix(~ 0 + factor(group_list))
colnames(design) <- levels(factor(group_list))
rownames(design) <- colnames(exprSet)
design

## Contrast of interest: TN minus not_TN.
contrast.matrix <- makeContrasts("TN-not_TN", levels = design)
contrast.matrix

load("./ID2gene.Rdata")

## Fit the linear model, apply the contrast, moderate the statistics and
## export the full table of results.
fit <- lmFit(exprSet, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)
nrDEG <- topTable(fit2, coef = 1, number = Inf)
write.table(nrDEG, file = "nrDEG.out")

head(nrDEG)

## Heatmap of the 100 genes with the lowest and the 100 genes with the
## highest logFC, written to 'heatmap.png'.
library("pheatmap")

nrDEG_Z <- nrDEG[order(nrDEG$logFC), ]   # ascending logFC
nrDEG_F <- nrDEG[order(-nrDEG$logFC), ]  # descending logFC
# head() instead of [1:100] so that fewer than 100 rows does not yield NAs.
choo_gene <- c(head(rownames(nrDEG_Z), 100), head(rownames(nrDEG_F), 100))

choo_matrix <- exprSet[choo_gene, ]
# Standardise each gene (row) across samples, then clip to [-1, 1].
choo_matrix <- t(scale(t(choo_matrix)))
choo_matrix[choo_matrix > 1] <- 1
choo_matrix[choo_matrix < -1] <- -1

# Column annotation: one row per sample, labelled with its group.
annotation_col <- data.frame(CellType = factor(group_list))
rownames(annotation_col) <- colnames(exprSet)

pheatmap(
  choo_matrix,
  fontsize = 2,
  annotation_col = annotation_col,
  show_rownames = FALSE,
  annotation_legend = FALSE,
  cluster_cols = FALSE,
  filename = "heatmap.png"
)

第八步.

差异分析-火山图

## Volcano plot of the differential expression results, written to
## 'volcano.png'; the labelled table is saved to 'nrDEG.Rdata'.
library("ggplot2")

## Data-driven cutoff (mean + 2 SD of |logFC|), printed for reference only ...
logFC_cutoff <- with(nrDEG, mean(abs(logFC)) + 2 * sd(abs(logFC)))
logFC_cutoff
## ... and then overridden by the fixed threshold used for the figure.
logFC_cutoff <- 1.5

## Label every gene as UP / DOWN / NOT using p-value < 0.01 and the cutoff.
nrDEG$change <- as.factor(ifelse(
  nrDEG$P.Value < 0.01 & abs(nrDEG$logFC) > logFC_cutoff,
  ifelse(nrDEG$logFC > logFC_cutoff, "UP", "DOWN"),
  "NOT"
))
save(nrDEG, file = "nrDEG.Rdata")

# Title parts are separated by line breaks so the three sentences do not run
# together.
this_tile <- paste0(
  "Cutoff for logFC is ", round(logFC_cutoff, 3),
  "\nThe number of up gene is ", sum(nrDEG$change == "UP", na.rm = TRUE),
  "\nThe number of down gene is ", sum(nrDEG$change == "DOWN", na.rm = TRUE)
)

volcano <- ggplot(
  data = nrDEG,
  aes(x = logFC, y = -log10(P.Value), color = change)
) +
  geom_point(alpha = 0.4, size = 1.75) +
  theme_bw(base_size = 15) +
  xlab("log2 fold change") +
  ylab("-log10 p-value") +
  ggtitle(this_tile) +
  theme(plot.title = element_text(size = 15, hjust = 0.5)) +
  # Named values keep the colours attached to the right label even when one
  # of the levels is absent from the data.
  scale_colour_manual(values = c(DOWN = "blue", NOT = "black", UP = "red"))

print(volcano)
ggsave(volcano, filename = "volcano.png")
# Close the device opened by print(); guarded because dev.off() raises an
# error when only the null device is left.
if (dev.cur() > 1L) {
  dev.off()
}

本文发布于:2023-07-07 06:14:12，感谢您对本站的认可！

本文链接：https://www.wtabcd.cn/fanwen/fan/89/1071274.html

上一篇：雅培抗-HCV操作规程 (4)

下一篇：Python小白的数学建模课-16.最短路径算法

标签：数据基因样本阴性乳腺癌探针表达

留言与评论（共有 0 条评论）