翻翻过去那场雪
用 data.table 很容易实现……

```r
library(data.table)
setorder(setDT(d), -x)[, head(.SD, 5), keyby = grp]
```

或者

```r
setorder(setDT(d), grp, -x)[, head(.SD, 5), by = grp]
```

或者(对于大数据集应该更快,因为避免了对每个组调用 .SD)

```r
setorder(setDT(d), grp, -x)[, indx := seq_len(.N), by = grp][indx <= 5]
```

编辑:下面是 dplyr 与 data.table 的性能对比(如果有人感兴趣)

```r
set.seed(123)
d <- data.frame(
  x = runif(1e6),
  grp = sample(1e4, 1e6, TRUE))
library(dplyr)
library(microbenchmark)
library(data.table)
dd <- copy(d)

microbenchmark(
  top_n = {d %>% group_by(grp) %>% top_n(n = 5, wt = x)},
  dohead = {d %>% arrange_(~ desc(x)) %>% group_by_(~ grp) %>% do(head(., n = 5))},
  slice = {d %>% arrange_(~ desc(x)) %>% group_by_(~ grp) %>% slice(1:5)},
  filter = {d %>% arrange(desc(x)) %>% group_by(grp) %>% filter(row_number() <= 5L)},
  data.table1 = setorder(setDT(dd), -x)[, head(.SD, 5L), keyby = grp],
  data.table2 = setorder(setDT(dd), grp, -x)[, head(.SD, 5L), grp],
  data.table3 = setorder(setDT(dd), grp, -x)[, indx := seq_len(.N), grp][indx <= 5L],
  times = 10,
  unit = "relative"
)
#        expr        min         lq      mean     median        uq       max neval
#       top_n  24.246401  24.492972 16.300391  24.441351 11.749050  7.644748    10
#      dohead 122.891381 120.329722 77.763843 115.621635 54.996588 34.114738    10
#       slice  27.365711  26.839443 17.714303  26.433924 12.628934  7.899619    10
#      filter  27.755171  27.225461 17.936295  26.363739 12.935709  7.969806    10
# data.table1  13.753046  16.631143 10.775278  16.330942  8.359951  5.077140    10
# data.table2  12.047111  11.944557  7.862302  11.653385  5.509432  3.642733    10
# data.table3   1.000000   1.000000  1.000000   1.000000  1.000000  1.000000    10
```

补充一个速度稍快的 data.table 解决方案:

```r
set.seed(123L)
d <- data.frame(
  x = runif(1e8),
  grp = sample(1e4, 1e8, TRUE))
setDT(d)
setorder(d, grp, -x)
dd <- copy(d)
library(microbenchmark)
microbenchmark(
  data.table3 = d[, indx := seq_len(.N), grp][indx <= 5L],
  data.table4 = dd[dd[, .I[seq_len(.N) <= 5L], grp]$V1],
  times = 10L
)
```

计时输出:

```
Unit: milliseconds
        expr      min       lq     mean   median        uq      max neval
 data.table3 826.2148 865.6334 950.1380 902.1689 1006.1237 1260.129    10
 data.table4 729.3229 783.7000 859.2084 823.1635  966.8239 1014.397    10
```