按组获取最高值

这是一个示例数据框:


# Example data: 90 uniform random values of x, split into 3 groups
# (factor grp) of 30 consecutive rows each.
d <- data.frame(
  x   = runif(90),
  grp = gl(3, 30)
)

我想要 d 的一个子集:对 grp 的每个取值,取出 x 值最大的 5 行。


使用base-R,我的方法是这样的:


# Sort all rows by x, largest first; split() keeps this row order
# inside each group.
ordered <- d[order(d$x, decreasing = TRUE), ]
# One data frame per level of grp.
splits <- split(ordered, ordered$grp)
# head() defaults to n = 6, so ask explicitly for the top 5 rows per group.
heads <- lapply(splits, head, n = 5)
# Stack the per-group pieces back into a single data frame.
do.call(rbind, heads)
##              x grp
## 1.19 0.8879631   1
## 1.4  0.8844818   1
## 1.12 0.8596197   1
## 1.26 0.8481809   1
## 1.18 0.8461516   1
## 2.31 0.9751049   2
## 2.34 0.9269764   2
## 2.57 0.8964114   2
## 2.58 0.8896466   2
## 2.45 0.8888834   2
## 3.74 0.9884852   3
## 3.73 0.9837653   3
## 3.83 0.9375398   3
## 3.64 0.9229036   3
## 3.69 0.8021373   3

使用 dplyr,我原以为这样写就可以:


# First dplyr attempt: sort by x descending, group by grp, then head().
# head() is not applied per group here; the result is only the first
# 5 rows of the whole table.
d %>%

  arrange_(~ desc(x)) %>%

  group_by_(~ grp) %>%

  head(n = 5)

但它只返回前5行。


把 head 换成 top_n,则会返回整个 d。


# Second dplyr attempt: same pipeline with top_n() in place of head().
# No wt argument is given, and this call ends up returning all of d.
d %>%

  arrange_(~ desc(x)) %>%

  group_by_(~ grp) %>%

  top_n(n = 5)

我如何获得正确的子集?


波斯汪
浏览 497回答 3
3回答

哔哔one

来自 ?top_n 中关于 wt 参数的说明:“用于排序的变量 [...] 默认为 tbl 中的最后一个变量”。数据集中的最后一个变量是 “grp”,它并不是你想要用来排名的变量,这就是你的 top_n 尝试“返回整个 d”的原因。因此,如果希望在数据集中按 “x” 排名,就需要指定 wt = x。

set.seed(123)
d <- data.frame(
  x   = runif(90),
  grp = gl(3, 30))

d %>%
  group_by(grp) %>%
  top_n(n = 5, wt = x)
#            x grp
# 1  0.9404673   1
# 2  0.9568333   1
# 3  0.8998250   1
# 4  0.9545036   1
# 5  0.9942698   1
# 6  0.9630242   2
# 7  0.9022990   2
# 8  0.8578277   2
# 9  0.7989248   2
# 10 0.8950454   2
# 11 0.8146400   3
# 12 0.8123895   3
# 13 0.9849570   3
# 14 0.8930511   3
# 15 0.8864691   3

翻翻过去那场雪

用 data.table 也很容易……

library(data.table)
setorder(setDT(d), -x)[, head(.SD, 5), keyby = grp]

或者

setorder(setDT(d), grp, -x)[, head(.SD, 5), by = grp]

或者(对于大数据集应该更快,因为避免了对每个组调用 .SD)

setorder(setDT(d), grp, -x)[, indx := seq_len(.N), by = grp][indx <= 5]

编辑:下面是 dplyr 与 data.table 的比较(如果有人感兴趣)

set.seed(123)
d <- data.frame(
  x   = runif(1e6),
  grp = sample(1e4, 1e6, TRUE))

library(dplyr)
library(microbenchmark)
library(data.table)
dd <- copy(d)

microbenchmark(
  top_n = {d %>%
             group_by(grp) %>%
             top_n(n = 5, wt = x)},
  dohead = {d %>%
              arrange_(~ desc(x)) %>%
              group_by_(~ grp) %>%
              do(head(., n = 5))},
  slice = {d %>%
             arrange_(~ desc(x)) %>%
             group_by_(~ grp) %>%
             slice(1:5)},
  filter = {d %>%
              arrange(desc(x)) %>%
              group_by(grp) %>%
              filter(row_number() <= 5L)},
  data.table1 = setorder(setDT(dd), -x)[, head(.SD, 5L), keyby = grp],
  data.table2 = setorder(setDT(dd), grp, -x)[, head(.SD, 5L), grp],
  data.table3 = setorder(setDT(dd), grp, -x)[, indx := seq_len(.N), grp][indx <= 5L],
  times = 10,
  unit = "relative")

#        expr        min         lq      mean     median        uq       max neval
#       top_n  24.246401  24.492972 16.300391  24.441351 11.749050  7.644748    10
#      dohead 122.891381 120.329722 77.763843 115.621635 54.996588 34.114738    10
#       slice  27.365711  26.839443 17.714303  26.433924 12.628934  7.899619    10
#      filter  27.755171  27.225461 17.936295  26.363739 12.935709  7.969806    10
# data.table1  13.753046  16.631143 10.775278  16.330942  8.359951  5.077140    10
# data.table2  12.047111  11.944557  7.862302  11.653385  5.509432  3.642733    10
# data.table3   1.000000   1.000000  1.000000   1.000000  1.000000  1.000000    10

再补充一个稍快一些的 data.table 解决方案:

set.seed(123L)
d <- data.frame(
    x   = runif(1e8),
    grp = sample(1e4, 1e8, TRUE))
setDT(d)
setorder(d, grp, -x)
dd <- copy(d)

library(microbenchmark)
microbenchmark(
    data.table3 = d[, indx := seq_len(.N), grp][indx <= 5L],
    data.table4 = dd[dd[, .I[seq_len(.N) <= 5L], grp]$V1],
    times = 10L)

计时输出:

Unit: milliseconds
        expr      min       lq     mean   median        uq      max neval
 data.table3 826.2148 865.6334 950.1380 902.1689 1006.1237 1260.129    10
 data.table4 729.3229 783.7000 859.2084 823.1635  966.8239 1014.397    10

汪汪一只猫

你需要在 do 中调用 head。在下面的代码中,. 表示当前分组(参见 do 帮助页面中对 ... 参数的说明)。

d %>%
  arrange_(~ desc(x)) %>%
  group_by_(~ grp) %>%
  do(head(., n = 5))

如 akrun 所述,slice 是另一种选择。

d %>%
  arrange_(~ desc(x)) %>%
  group_by_(~ grp) %>%
  slice(1:5)
打开App,查看更多内容
随时随地看视频慕课网APP