首页 > 美文阅读

R语言源码注释之kmeans（）

更新时间:2023-05-15 00:42:59 阅读：评论：0

R语⾔源码注释之kmeans（）

最近在学习数据挖掘的⼀些基础知识，涉及到的⼀些算法的实现会通过阅读R语⾔源码的⽅式学习，学习后在源码基础上做中⽂注释，并发布出来，供他⼈使⽤。

R中，运⾏kmeans()后，执⾏代码如下（R语⾔源码可以上官⽹下载）：　

kmeans <-

function(x, centers, iter.max = 10L, nstart = 1L,

algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"),

trace = FALSE)

{

//.Mimax为R中定义的变量，赋值为int的最⼤值:2147483647

.Mimax <- .Machine$integer.max

学业生涯规划书

//定义⼀个函数，供后续调⽤

do_one <- function(nmeth) {

switch(nmeth,蚯蚓是什么动物

{ # 1 : Hartigan-Wong

//默认选择执⾏第⼀个算法

isteps.Qtran <- as.integer(min(.Mimax, 50 * m))

iTran <- c(isteps.Qtran, integer(max(0,k-1)))

Z <- .Fortran(C_kmns, x, m, p,

centers = centers,

as.integer(k), c1 = integer(m), c2 = integer(m),

nc = integer(k), double(k), double(k), ncp=integer(k),

D = double(m), iTran = iTran, live = integer(k),

iter = iter.max, wss = double(k),曹操代表作

ifault = as.integer(trace))

switch(Z$ifault,

## 1:

stop("empty cluster: try a better t of initial centers",

call. = FALSE),

## 2:

Z$iter <- max(Z$iter, iter.max+1L), # -> and warn below

## 3:

stop("number of cluster centres must lie between 1 and nrow(x)",

call.=FALSE),

## 4: {new @ 2013-06-30; maybe better fix (in Fortran) ?}

warning(gettextf("Quick-TRANSfer stage steps exceeded maximum (= %d)",

isteps.Qtran),

call.=FALSE)

)

{ # 2 : Lloyd-Forgy

//这⾥由于"Hartigan-Wong"属于Fortran调⽤，⽆源码，改为看"Lloyd"算法，实际上都差不多

Z <- .C(C_kmeans_Lloyd, x, m, p,

centers = centers, k,

c1 = integer(m), iter = iter.max,

nc = integer(k), wss = double(k))

{ # 3 : MacQueen

Z <- .C(C_kmeans_MacQueen, x, m, p,

centers = as.double(centers), k,

c1 = integer(m), iter = iter.max,

nc = integer(k), wss = double(k))

})

if(m23 <- any(nmeth == c(2L, 3L))) {

if(any(Z$nc == 0))

warning("empty cluster: try a better t of initial centers",

call. = FALSE)

}

if(Z$iter > iter.max) {

warning(sprintf(ngettext(iter.max,

"did not converge in %d iteration",

小猪变干净了故事"did not converge in %d iterations"),

iter.max), call. = FALSE, domain = NA)

if(m23) Z$ifault <- 2L

}

if(nmeth %in% c(2L, 3L)) {

if(any(Z$nc == 0))

warning("empty cluster: try a better t of initial centers",

call. = FALSE)

}

//x为输⼊的需要聚类的数据，as.matrix将x转换成维数相同的矩阵

x <- as.matrix(x)

## as.integer(<too large>) gives NA ==> not allowing too large nrow() / ncol():

//取矩阵化后的x的⾏数和列数，如果不矩阵化，则取值为null

//is.na判断⾏列值是否为空值，如果是，则stop

m <- as.integer(nrow(x)); if(is.na(m)) stop("invalid nrow(x)")

p <- as.integer(ncol(x)); if(is.na(p)) stop("invalid ncol(x)")

//判断centers参数是否为空，如果是，则stop

//center为中⼼点个数

if(missing(centers))

stop("'centers' must be a number or a matrix")

//给nmeth赋值，不同参数algorithm对应不同值

//match.arg(x)中，x为函数⼊参，如果⼊参为空，则默认取第⼀个值

//本例中，algorithm可以从c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen")中取值

//由于调⽤kmeans是不设置，则algorithm为空，则默认取第⼀个，即"Hartigan-Wong"

nmeth <- switch(match.arg(algorithm),

"Hartigan-Wong" = 1L,

"Lloyd" = 2L, "Forgy" = 2L,

"MacQueen" = 3L)

//de是获取参数的数据类型

//length获取centers的数值个数，如果centers为单个数值则返回1，为向量则返回数值个数if(length(centers) == 1L) {

k <- centers

## we need to avoid duplicates here

//nstart为⼊参，值为1L

if(nstart == 1L)

//sample随机抽样，sample.int抽取int类型的随机抽样

//这⾥sample.int(m, k)代表从m个⾏号中，随机抽取k个作为中⼼点

//所以centers为矩阵化后的三个随机⾏的的数值

//这⾥为kmeans算法第⼀步

centers <- x[sample.int(m, k), , drop = FALSE]

//duplicated()找出重复出现的元素，为了防⽌上边centers的取值重复

if(nstart >= 2L || any(duplicated(centers))) {

cn <- unique(x)

mm <- nrow(cn)

活动音乐

if(mm < k)

stop("more cluster centers than distinct data points.")

centers <- cn[sample.int(mm, k), , drop=FALSE]

}

} el {

centers <- as.matrix(centers)

if(any(duplicated(centers)))

stop("initial centers are not distinct")

cn <- NULL

k <- nrow(centers)

if(m < k)

stop("more cluster centers than data points")

}

/k还是中⼼点数，为空则stop

k <- as.integer(k)

if(is.na(k)) stop("'invalid value of 'k'")

//如果k == 1，则nmeth对应Hartigan-Wong算法

if (k == 1L) nmeth <- 3L # Hartigan-Wong, (Fortran) needs k > 1

//iter.max是⼊参，值是10L，如果为空则stop

iter.max <- as.integer(iter.max)

if(is.na(iter.max) || iter.max < 1L) stop("'iter.max' must be positive")

//x有⼏列，center就应该有⼏列，也就是说x与centers的维数应该相同，否则stop

if(ncol(x) != ncol(centers))

stop("must have same number of columns in 'x' and 'centers'")

/设置数值类型

//调⽤do_one函数，返回Z，⼊参algrithm为空时，nmeth默认为"Hartigan-Wong"

//这⾥由于"Hartigan-Wong"属于Fortran调⽤，改为看"Lloyd"算法，实际上都差不多

Z <- do_one(nmeth)

best <- sum(Z$wss)

if(nstart >= 2L && !is.null(cn))

for(i in2:nstart) {

centers <- cn[sample.int(mm, k), , drop=FALSE]

ZZ <- do_one(nmeth)

if((z <- sum(ZZ$wss)) < best) {

Z <- ZZ

best <- z

}

centers <- matrix(Z$centers, k)

dimnames(centers) <- list(1L:k, dimnames(x)[[2L]])

cluster <- Z$c1

if(!is.null(rn <- rownames(x)))

names(cluster) <- rn

totss <- sum(scale(x, scale = FALSE)^2)

structure(list(cluster = cluster, centers = centers, totss = totss,

withinss = Z$wss, tot.withinss = best,

betweenss = totss - best, size = Z$nc,

iter = Z$iter, ifault = Z$ifault),

class = "kmeans")

}

以下为R语⾔中聚类函数kmeans_Lloyd()源码注释（R语⾔源码可以上官⽹下载）：

看见黄鼠狼

//运⽤该函数⾸先要对数据矩阵x有⼀个准确的判断

//x的每⼀⾏为⼀个向量，它代表某个事物

//⽽x的每⼀列代表这个向量的维，它表⽰本⾏对应的事物的元素

//这是对x的要求

void kmeans_Lloyd(double *x, int *pn, int *pp, double *cen, int *pk, int *cl,

int *pmaxiter, int *nc, double *wss)

{

int n = *pn, k = *pk, p = *pp, maxiter = *pmaxiter;

int iter, i, j, c, it, inew = 0;

double best, dd, tmp;

Rboolean updated;

//数组初始化为-1，n等于m

for(i = 0; i < n; i++) cl[i] = -1;

//maxiter赋值为最⼤迭代次数，即iter.max

杭州的特产for(iter = 0; iter < maxiter; iter++) {

updated = FALSE;

/遍历n⾏(即m⾏数据)，每⾏数据为⼀个向量，向量维数为p

for(i = 0; i < n; i++) {

/* find nearest centre for each point */

//R_PosInf = ML_POSINF = (1.0 / 0.0) 即⽆穷⼤的意思

best = R_PosInf;

//k为中⼼点个数

for(j = 0; j < k; j++) {

dd = 0.0;

//p = ncol(x)

//计算第i个p维向量与第j个中⼼点的距离

for(c = 0; c < p; c++) {

/k和n表⽰⾏数，x[i+n*c]表⽰第i个向量的第c维数据，cen[j+k*c]表⽰第j个向量的第c维数据，即中⼼点向量 tmp = x[i+n*c] - cen[j+k*c];

//两个向量之间的距离 - 欧⼏⾥德距离

dd += tmp * tmp;

}

//判断x数据矩阵中第i个向量离k个中⼼点的距离，距离哪个中⼼点最近则inew赋值为哪个中⼼点对应的簇

if(dd < best) {

best = dd;

inew = j+1;

}

//c1[i]记录x矩阵中第i个向量属于哪个中⼼点对应的簇

if(cl[i] != inew) {

updated = TRUE;

cl[i] = inew;

}

if(!updated) break;

/* update each centre */

//升级中⼼点

//将k⾏p列的矩阵，即k个p维向量，也即k个中⼼点向量初始化为0.0

for(j = 0; j < k*p; j++) cen[j] = 0.0;

/初始化k元数组，⽤来记录n⾏数据中，分别属于k个簇的向量个数

for(j = 0; j < k; j++) nc[j] = 0;

//n为数据矩阵的⾏数，即向量个数

for(i = 0; i < n; i++) {

//⽤来记录n⾏数据中，分别属于k个簇的向量个数

it = cl[i] - 1; nc[it]++;

//p为数据矩阵x的列数，如果每⾏作为⼀个向量，则p为向量的维数

//以下循环⽤cen[it+c*k]记录第c列的所有数值中，分别属于it标号的簇的值的和

for(c = 0; c < p; c++) cen[it+c*k] += x[i+c*n];

}

//注意C语法:1%3表⽰1除以3，等于0，余1；2%3表2除以3，等于0，余2；0%3等于0，余0

/j = 0时，cen[0] /= nc[0 % 3]，即cen[0] /= nc[0]

//j = 1时，cen[1] /= nc[1 % 3]，即cen[1] /= nc[1]

金牛女狮子男//j = 2时，cen[2] /= nc[2 % 3]，即cen[2] /= nc[2]

//意义:n⾏向量中，第j列中所有数据，属于簇标号0-k的值的和cen[j]，除以n⾏中属于簇标号0-k的向量的个数nc[j%k]，取均值

for(j = 0; j < k*p; j++) cen[j] /= nc[j % k];

}

*pmaxiter = iter + 1;

for(j = 0; j < k; j++) wss[j] = 0.0;

for(i = 0; i < n; i++) {

it = cl[i] - 1;

for(c = 0; c < p; c++) {

tmp = x[i+n*c] - cen[it+k*c];

wss[it] += tmp * tmp;

}

后续的学习有APL，aprior和分类决策树，APL和apriori暂时不打算做注释，分类决策树会在RWeka包的源码上做注释，RWeka为java语⾔编写。

再以后的计划未定。

有问题可以留⾔交流。

本文发布于:2023-05-15 00:42:59，感谢您对本站的认可！

本文链接：https://www.wtabcd.cn/fanwen/fan/82/634219.html

上一篇：双减政策下开家长会小学小学双减家长会班主任发言稿(3篇)

下一篇：双减后家长会发言稿班主任(7篇)

标签：向量源码数据

留言与评论（共有 0 条评论）