根据特定条件对数据进行分组，并在 R 或 Python 中查找持续时间

# Summarise each uninterrupted run of qualifying edit rows in `df` into one
# row (start, end, duration), then label every distinct
# Subject / Recipient / Length combination with a letter.
# NOTE(review): assumes dplyr is attached and `df` is defined earlier in the
#   script -- confirm against the surrounding code.
df %>%
  # The original data was loaded as factors, which have their uses, but
  #   converting those to characters will be simpler to work with here.
  mutate_if(is.factor, as.character) %>%
  # Replace NA in Subject & Recipient with an empty string, and trim excess
  #   spaces from the start and end. One of the recipients is " ", which is
  #   assumed to be functionally the same as blank.
  mutate_at(
    c("Subject", "Recipient"),
    ~ if_else(is.na(.), "", stringr::str_trim(.))
  ) %>%
  filter(Subject != "") %>%
  mutate(Date = as.POSIXct(Date, format = "%m/%d/%Y %H:%M:%OS")) %>%
  # cond flags the rows of interest: an edit in the out/draft folder with an
  #   empty Message.
  mutate(cond = Edit & Folder %in% c("out", "draft") & Message == "") %>%
  # Every row that fails cond bumps the counter, so each unbroken run of
  #   cond rows shares a single segment id.
  mutate(segment = cumsum(!cond)) %>%
  # EDIT: keep only the rows matching cond; the rows that break a run were
  #   needed just to advance the segment counter.
  filter(cond) %>%
  # Get summary stats for each segment.
  group_by(Subject, Recipient, Length, segment) %>%
  summarize(
    Start = min(Date),
    End = max(Date),
    # Fixed units keep durations comparable across rows; `End - Start`
    #   would pick secs/mins/hours separately for every group.
    Duration = difftime(End, Start, units = "secs")
  ) %>%
  # Flag the first segment of every Subject / Recipient / Length combination.
  #   Re-grouping explicitly avoids relying on summarize()'s default of
  #   dropping only the last grouping level. Using row_number() rather than
  #   comparing against lag(..., default = "") keeps this correct when
  #   Recipient is "" and avoids mixing a character default with the integer
  #   Length column.
  group_by(Subject, Recipient, Length) %>%
  mutate(new_group = as.integer(row_number() == 1L)) %>%
  ungroup() %>%
  # Running count of group starts -> letter label. LETTERS has 26 entries,
  #   so a 27th group onward would be labelled NA.
  mutate(group = LETTERS[cumsum(new_group)])

#> # A tibble: 3 x 9
#>   Subject Recipient                   Length segment Start               End                 Duration new_group group
#>   <chr>   <chr>                        <int>   <int> <dttm>              <dttm>              <drtn>       <int> <chr>
#> 1 hey     sarah@mail.com,gee@mail.com     80       0 2020-01-02 01:00:10 2020-01-02 01:00:30 20 secs          1 A
#> 2 hey     sarah@mail.com,gee@mail.com     80       3 2020-01-02 01:02:00 2020-01-02 01:02:05  5 secs          0 A
#> 3 hey     sarah@mail.com,gee@mail.com     80       7 2020-01-02 01:03:00 2020-01-02 01:03:20 20 secs          0 A

根据特定条件对数据进行分组，并在 R 或 Python 中查找持续时间

1回答