R语⾔源码注释之kmeans()
最近在学习数据挖掘的⼀些基础知识,涉及到的⼀些算法的实现会通过阅读R语⾔源码的⽅式学习,学习后在源码基础上做中⽂注释,并发布出来,供他⼈使⽤。
R中,运⾏kmeans()后,执⾏代码如下(R语⾔源码可以上官⽹下载):
kmeans <-
function(x, centers, iter.max = 10L, nstart = 1L,
algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"),
trace = FALSE)
{
//.Mimax为R中定义的变量,赋值为int的最⼤值:2147483647
.Mimax <- .Machine$integer.max
学业生涯规划书
//定义⼀个函数,供后续调⽤
do_one <- function(nmeth) {
switch(nmeth,蚯蚓是什么动物
{ # 1 : Hartigan-Wong
//默认选择执⾏第⼀个算法
isteps.Qtran <- as.integer(min(.Mimax, 50 * m))
iTran <- c(isteps.Qtran, integer(max(0,k-1)))
Z <- .Fortran(C_kmns, x, m, p,
centers = centers,
as.integer(k), c1 = integer(m), c2 = integer(m),
nc = integer(k), double(k), double(k), ncp=integer(k),
D = double(m), iTran = iTran, live = integer(k),
iter = iter.max, wss = double(k),曹操代表作
ifault = as.integer(trace))
switch(Z$ifault,
## 1:
stop("empty cluster: try a better t of initial centers",
call. = FALSE),
## 2:
Z$iter <- max(Z$iter, iter.max+1L), # -> and warn below
## 3:
stop("number of cluster centres must lie between 1 and nrow(x)",
call.=FALSE),
## 4: {new @ 2013-06-30; maybe better fix (in Fortran) ?}
warning(gettextf("Quick-TRANSfer stage steps exceeded maximum (= %d)",
isteps.Qtran),
call.=FALSE)
)
},
{ # 2 : Lloyd-Forgy
//这⾥由于"Hartigan-Wong"属于Fortran调⽤,⽆源码,改为看"Lloyd"算法,实际上都差不多
Z <- .C(C_kmeans_Lloyd, x, m, p,
centers = centers, k,
c1 = integer(m), iter = iter.max,
nc = integer(k), wss = double(k))
},
{ # 3 : MacQueen
Z <- .C(C_kmeans_MacQueen, x, m, p,
centers = as.double(centers), k,
c1 = integer(m), iter = iter.max,
nc = integer(k), wss = double(k))
})
if(m23 <- any(nmeth == c(2L, 3L))) {
if(any(Z$nc == 0))
warning("empty cluster: try a better t of initial centers",
call. = FALSE)
}
}
if(Z$iter > iter.max) {
warning(sprintf(ngettext(iter.max,
"did not converge in %d iteration",
小猪变干净了故事"did not converge in %d iterations"),
iter.max), call. = FALSE, domain = NA)
if(m23) Z$ifault <- 2L
}
if(nmeth %in% c(2L, 3L)) {
if(any(Z$nc == 0))
warning("empty cluster: try a better t of initial centers",
call. = FALSE)
}
Z
}
//x为输⼊的需要聚类的数据,as.matrix将x转换成维数相同的矩阵
x <- as.matrix(x)
## as.integer(<too large>) gives NA ==> not allowing too large nrow() / ncol():
//取矩阵化后的x的⾏数和列数,如果不矩阵化,则取值为null
//is.na判断⾏列值是否为空值,如果是,则stop
m <- as.integer(nrow(x)); if(is.na(m)) stop("invalid nrow(x)")
p <- as.integer(ncol(x)); if(is.na(p)) stop("invalid ncol(x)")
//判断centers参数是否为空,如果是,则stop
//center为中⼼点个数
if(missing(centers))
stop("'centers' must be a number or a matrix")
//给nmeth赋值,不同参数algorithm对应不同值
//match.arg(x)中,x为函数⼊参,如果⼊参为空,则默认取第⼀个值
//本例中,algorithm可以从c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen")中取值
//由于调⽤kmeans是不设置,则algorithm为空,则默认取第⼀个,即"Hartigan-Wong"
nmeth <- switch(match.arg(algorithm),
"Hartigan-Wong" = 1L,
"Lloyd" = 2L, "Forgy" = 2L,
"MacQueen" = 3L)
//de是获取参数的数据类型
//length获取centers的数值个数,如果centers为单个数值则返回1,为向量则返回数值个数if(length(centers) == 1L) {
k <- centers
## we need to avoid duplicates here
//nstart为⼊参,值为1L
if(nstart == 1L)
//sample随机抽样,sample.int抽取int类型的随机抽样
//这⾥sample.int(m, k)代表从m个⾏号中,随机抽取k个作为中⼼点
//所以centers为矩阵化后的三个随机⾏的的数值
//这⾥为kmeans算法第⼀步
centers <- x[sample.int(m, k), , drop = FALSE]
//duplicated()找出重复出现的元素,为了防⽌上边centers的取值重复
if(nstart >= 2L || any(duplicated(centers))) {
cn <- unique(x)
mm <- nrow(cn)
活动音乐
if(mm < k)
stop("more cluster centers than distinct data points.")
centers <- cn[sample.int(mm, k), , drop=FALSE]
}
} el {
centers <- as.matrix(centers)
centers <- as.matrix(centers)
if(any(duplicated(centers)))
stop("initial centers are not distinct")
cn <- NULL
k <- nrow(centers)
if(m < k)
stop("more cluster centers than data points")
}
/
/k还是中⼼点数,为空则stop
k <- as.integer(k)
if(is.na(k)) stop("'invalid value of 'k'")
//如果k == 1,则nmeth对应Hartigan-Wong算法
if (k == 1L) nmeth <- 3L # Hartigan-Wong, (Fortran) needs k > 1
//iter.max是⼊参,值是10L,如果为空则stop
iter.max <- as.integer(iter.max)
if(is.na(iter.max) || iter.max < 1L) stop("'iter.max' must be positive")
//x有⼏列,center就应该有⼏列,也就是说x与centers的维数应该相同,否则stop
if(ncol(x) != ncol(centers))
stop("must have same number of columns in 'x' and 'centers'")
/
/设置数值类型
//调⽤do_one函数,返回Z,⼊参algrithm为空时,nmeth默认为"Hartigan-Wong"
//这⾥由于"Hartigan-Wong"属于Fortran调⽤,改为看"Lloyd"算法,实际上都差不多
Z <- do_one(nmeth)
best <- sum(Z$wss)
if(nstart >= 2L && !is.null(cn))
for(i in2:nstart) {
centers <- cn[sample.int(mm, k), , drop=FALSE]
ZZ <- do_one(nmeth)
if((z <- sum(ZZ$wss)) < best) {
Z <- ZZ
best <- z
}
}
centers <- matrix(Z$centers, k)
dimnames(centers) <- list(1L:k, dimnames(x)[[2L]])
cluster <- Z$c1
if(!is.null(rn <- rownames(x)))
names(cluster) <- rn
totss <- sum(scale(x, scale = FALSE)^2)
structure(list(cluster = cluster, centers = centers, totss = totss,
withinss = Z$wss, tot.withinss = best,
betweenss = totss - best, size = Z$nc,
iter = Z$iter, ifault = Z$ifault),
class = "kmeans")
}
以下为R语⾔中聚类函数kmeans_Lloyd()源码注释(R语⾔源码可以上官⽹下载):
看见黄鼠狼
//运⽤该函数⾸先要对数据矩阵x有⼀个准确的判断
//x的每⼀⾏为⼀个向量,它代表某个事物
//⽽x的每⼀列代表这个向量的维,它表⽰本⾏对应的事物的元素
//这是对x的要求
void kmeans_Lloyd(double *x, int *pn, int *pp, double *cen, int *pk, int *cl,
int *pmaxiter, int *nc, double *wss)
{
int n = *pn, k = *pk, p = *pp, maxiter = *pmaxiter;
int iter, i, j, c, it, inew = 0;
double best, dd, tmp;
Rboolean updated;
//数组初始化为-1,n等于m
for(i = 0; i < n; i++) cl[i] = -1;
//maxiter赋值为最⼤迭代次数,即iter.max
杭州的特产for(iter = 0; iter < maxiter; iter++) {
updated = FALSE;
/
/遍历n⾏(即m⾏数据),每⾏数据为⼀个向量,向量维数为p
for(i = 0; i < n; i++) {
/* find nearest centre for each point */
//R_PosInf = ML_POSINF = (1.0 / 0.0) 即⽆穷⼤的意思
best = R_PosInf;
//k为中⼼点个数
for(j = 0; j < k; j++) {
dd = 0.0;
//p = ncol(x)
//计算第i个p维向量与第j个中⼼点的距离
for(c = 0; c < p; c++) {
/
/k和n表⽰⾏数,x[i+n*c]表⽰第i个向量的第c维数据,cen[j+k*c]表⽰第j个向量的第c维数据,即中⼼点向量 tmp = x[i+n*c] - cen[j+k*c];
//两个向量之间的距离 - 欧⼏⾥德距离
dd += tmp * tmp;
}
//判断x数据矩阵中第i个向量离k个中⼼点的距离,距离哪个中⼼点最近则inew赋值为哪个中⼼点对应的簇
if(dd < best) {
best = dd;
inew = j+1;
}
}
//c1[i]记录x矩阵中第i个向量属于哪个中⼼点对应的簇
if(cl[i] != inew) {
updated = TRUE;
cl[i] = inew;
}
}
if(!updated) break;
/* update each centre */
//升级中⼼点
//将k⾏p列的矩阵,即k个p维向量,也即k个中⼼点向量初始化为0.0
for(j = 0; j < k*p; j++) cen[j] = 0.0;
/
/初始化k元数组,⽤来记录n⾏数据中,分别属于k个簇的向量个数
for(j = 0; j < k; j++) nc[j] = 0;
//n为数据矩阵的⾏数,即向量个数
for(i = 0; i < n; i++) {
//⽤来记录n⾏数据中,分别属于k个簇的向量个数
it = cl[i] - 1; nc[it]++;
//p为数据矩阵x的列数,如果每⾏作为⼀个向量,则p为向量的维数
//以下循环⽤cen[it+c*k]记录第c列的所有数值中,分别属于it标号的簇的值的和
for(c = 0; c < p; c++) cen[it+c*k] += x[i+c*n];
}
//注意C语法:1%3表⽰1除以3,等于0,余1;2%3表2除以3,等于0,余2;0%3等于0,余0
/
/j = 0时,cen[0] /= nc[0 % 3],即cen[0] /= nc[0]
//j = 1时,cen[1] /= nc[1 % 3],即cen[1] /= nc[1]
金牛女狮子男//j = 2时,cen[2] /= nc[2 % 3],即cen[2] /= nc[2]
//意义:n⾏向量中,第j列中所有数据,属于簇标号0-k的值的和cen[j],除以n⾏中属于簇标号0-k的向量的个数nc[j%k],取均值
for(j = 0; j < k*p; j++) cen[j] /= nc[j % k];
}
*pmaxiter = iter + 1;
for(j = 0; j < k; j++) wss[j] = 0.0;
for(i = 0; i < n; i++) {
it = cl[i] - 1;
for(c = 0; c < p; c++) {
tmp = x[i+n*c] - cen[it+k*c];
wss[it] += tmp * tmp;
}
}
}
后续的学习有APL,aprior和分类决策树,APL和apriori暂时不打算做注释,分类决策树会在RWeka包的源码上做注释,RWeka为java语⾔编写。
再以后的计划未定。
有问题可以留⾔交流。