• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

R语言进行数据预处理

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

R语言进行数据预处理（data wrangling）

li_volleyball

2016年3月22日

data wrangling with R
packages:tidyr dplyr

Ground rules

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Open the iris and mtcars data sets in the data viewer.
View(iris)
View(mtcars)
# `am:1` is a column range running from `am` back to column position 1,
# so the selected columns come out in reverse order (am, vs, ..., cyl, mpg).
select(mtcars,am:1)
##                     am vs  qsec    wt drat  hp  disp cyl  mpg
## Mazda RX4            1  0 16.46 2.620 3.90 110 160.0   6 21.0
## Mazda RX4 Wag        1  0 17.02 2.875 3.90 110 160.0   6 21.0
## Datsun 710           1  1 18.61 2.320 3.85  93 108.0   4 22.8
## Hornet 4 Drive       0  1 19.44 3.215 3.08 110 258.0   6 21.4
## Hornet Sportabout    0  0 17.02 3.440 3.15 175 360.0   8 18.7
## Valiant              0  1 20.22 3.460 2.76 105 225.0   6 18.1
## Duster 360           0  0 15.84 3.570 3.21 245 360.0   8 14.3
## Merc 240D            0  1 20.00 3.190 3.69  62 146.7   4 24.4
## Merc 230             0  1 22.90 3.150 3.92  95 140.8   4 22.8
## Merc 280             0  1 18.30 3.440 3.92 123 167.6   6 19.2
## Merc 280C            0  1 18.90 3.440 3.92 123 167.6   6 17.8
## Merc 450SE           0  0 17.40 4.070 3.07 180 275.8   8 16.4
## Merc 450SL           0  0 17.60 3.730 3.07 180 275.8   8 17.3
## Merc 450SLC          0  0 18.00 3.780 3.07 180 275.8   8 15.2
## Cadillac Fleetwood   0  0 17.98 5.250 2.93 205 472.0   8 10.4
## Lincoln Continental  0  0 17.82 5.424 3.00 215 460.0   8 10.4
## Chrysler Imperial    0  0 17.42 5.345 3.23 230 440.0   8 14.7
## Fiat 128             1  1 19.47 2.200 4.08  66  78.7   4 32.4
## Honda Civic          1  1 18.52 1.615 4.93  52  75.7   4 30.4
## Toyota Corolla       1  1 19.90 1.835 4.22  65  71.1   4 33.9
## Toyota Corona        0  1 20.01 2.465 3.70  97 120.1   4 21.5
## Dodge Challenger     0  0 16.87 3.520 2.76 150 318.0   8 15.5
## AMC Javelin          0  0 17.30 3.435 3.15 150 304.0   8 15.2
## Camaro Z28           0  0 15.41 3.840 3.73 245 350.0   8 13.3
## Pontiac Firebird     0  0 17.05 3.845 3.08 175 400.0   8 19.2
## Fiat X1-9            1  1 18.90 1.935 4.08  66  79.0   4 27.3
## Porsche 914-2        1  0 16.70 2.140 4.43  91 120.3   4 26.0
## Lotus Europa         1  1 16.90 1.513 3.77 113  95.1   4 30.4
## Ford Pantera L       1  0 14.50 3.170 4.22 264 351.0   8 15.8
## Ferrari Dino         1  0 15.50 2.770 3.62 175 145.0   6 19.7
## Maserati Bora        1  0 14.60 3.570 3.54 335 301.0   8 15.0
## Volvo 142E           1  1 18.60 2.780 4.11 109 121.0   4 21.4
# Same selection written with the pipe: mtcars is passed to select() as its
# first argument, so the result is identical to select(mtcars, am:1).
mtcars %>% select(am:1)
##                     am vs  qsec    wt drat  hp  disp cyl  mpg
## Mazda RX4            1  0 16.46 2.620 3.90 110 160.0   6 21.0
## Mazda RX4 Wag        1  0 17.02 2.875 3.90 110 160.0   6 21.0
## Datsun 710           1  1 18.61 2.320 3.85  93 108.0   4 22.8
## Hornet 4 Drive       0  1 19.44 3.215 3.08 110 258.0   6 21.4
## Hornet Sportabout    0  0 17.02 3.440 3.15 175 360.0   8 18.7
## Valiant              0  1 20.22 3.460 2.76 105 225.0   6 18.1
## Duster 360           0  0 15.84 3.570 3.21 245 360.0   8 14.3
## Merc 240D            0  1 20.00 3.190 3.69  62 146.7   4 24.4
## Merc 230             0  1 22.90 3.150 3.92  95 140.8   4 22.8
## Merc 280             0  1 18.30 3.440 3.92 123 167.6   6 19.2
## Merc 280C            0  1 18.90 3.440 3.92 123 167.6   6 17.8
## Merc 450SE           0  0 17.40 4.070 3.07 180 275.8   8 16.4
## Merc 450SL           0  0 17.60 3.730 3.07 180 275.8   8 17.3
## Merc 450SLC          0  0 18.00 3.780 3.07 180 275.8   8 15.2
## Cadillac Fleetwood   0  0 17.98 5.250 2.93 205 472.0   8 10.4
## Lincoln Continental  0  0 17.82 5.424 3.00 215 460.0   8 10.4
## Chrysler Imperial    0  0 17.42 5.345 3.23 230 440.0   8 14.7
## Fiat 128             1  1 19.47 2.200 4.08  66  78.7   4 32.4
## Honda Civic          1  1 18.52 1.615 4.93  52  75.7   4 30.4
## Toyota Corolla       1  1 19.90 1.835 4.22  65  71.1   4 33.9
## Toyota Corona        0  1 20.01 2.465 3.70  97 120.1   4 21.5
## Dodge Challenger     0  0 16.87 3.520 2.76 150 318.0   8 15.5
## AMC Javelin          0  0 17.30 3.435 3.15 150 304.0   8 15.2
## Camaro Z28           0  0 15.41 3.840 3.73 245 350.0   8 13.3
## Pontiac Firebird     0  0 17.05 3.845 3.08 175 400.0   8 19.2
## Fiat X1-9            1  1 18.90 1.935 4.08  66  79.0   4 27.3
## Porsche 914-2        1  0 16.70 2.140 4.43  91 120.3   4 26.0
## Lotus Europa         1  1 16.90 1.513 3.77 113  95.1   4 30.4
## Ford Pantera L       1  0 14.50 3.170 4.22 264 351.0   8 15.8
## Ferrari Dino         1  0 15.50 2.770 3.62 175 145.0   6 19.7
## Maserati Bora        1  0 14.60 3.570 3.54 335 301.0   8 15.0
## Volvo 142E           1  1 18.60 2.780 4.11 109 121.0   4 21.4
# Small demo data frame used by the examples that follow:
#   A    - labels "x1" ... "x6"
#   B    - odd numbers 1, 3, ..., 11
#   c    - integers 1 ... 6
#   date - dates written as "YYYY-MM-DD" strings
example1 <- data.frame(
  A = paste0("x", 1:6),
  B = seq(1, 11, 2),
  c = 1:6,
  date = c(
    "2000-08-15", "1998-07-15", "1995-06-04",
    "1997-07-01", "1999-06-01", "1996-06-25"
  )
)
example1
##    A  B c       date
## 1 x1  1 1 2000-08-15
## 2 x2  3 2 1998-07-15
## 3 x3  5 3 1995-06-04
## 4 x4  7 4 1997-07-01
## 5 x5  9 5 1999-06-01
## 6 x6 11 6 1996-06-25
# 整洁数据（tidy data）的三条原则：
# 每个变量占一列
# 每个观测值占一行
# 每一类观测单元各自构成一张表
# separate(): split the `date` column at each "-" into three new columns
# Y, m and d. The result is only printed here, not stored.
separate(example1,date,c("Y","m","d"),sep="-")
##    A  B c    Y  m  d
## 1 x1  1 1 2000 08 15
## 2 x2  3 2 1998 07 15
## 3 x3  5 3 1995 06 04
## 4 x4  7 4 1997 07 01
## 5 x5  9 5 1999 06 01
## 6 x6 11 6 1996 06 25
# Store the separated result (same separate() call, written with the pipe)
# so it can be fed to unite().
example12<-example1 %>% separate(date,c("Y","m","d"),sep="-")
# unite(): join columns Y and m with "-" into a single new column YM;
# in the printed result YM takes the place of Y and m, and d is kept.
unite(example12,"YM",Y,m,sep="-")
##    A  B c      YM  d
## 1 x1  1 1 2000-08 15
## 2 x2  3 2 1998-07 15
## 3 x3  5 3 1995-06 04
## 4 x4  7 4 1997-07 01
## 5 x5  9 5 1999-06 01
## 6 x6 11 6 1996-06 25
# select(): keep only the named columns, here A and B.
select(example1,A,B)
##    A  B
## 1 x1  1
## 2 x2  3
## 3 x3  5
## 4 x4  7
## 5 x5  9
## 6 x6 11
# A leading minus drops a column: keep everything except A.
select(example1,-A)
##    B c       date
## 1  1 1 2000-08-15
## 2  3 2 1998-07-15
## 3  5 3 1995-06-04
## 4  7 4 1997-07-01
## 5  9 5 1999-06-01
## 6 11 6 1996-06-25
# A name range keeps every column from B through date, in table order.
select(example1,B:date)
##    B c       date
## 1  1 1 2000-08-15
## 2  3 2 1998-07-15
## 3  5 3 1995-06-04
## 4  7 4 1997-07-01
## 5  9 5 1999-06-01
## 6 11 6 1996-06-25
# starts_with(): keep columns whose name begins with "d" (only `date` here).
select(example1,starts_with("d"))
##         date
## 1 2000-08-15
## 2 1998-07-15
## 3 1995-06-04
## 4 1997-07-01
## 5 1999-06-01
## 6 1996-06-25
# ends_with(): keep columns whose name ends with "e" (only `date` here).
select(example1,ends_with("e"))
##         date
## 1 2000-08-15
## 2 1998-07-15
## 3 1995-06-04
## 4 1997-07-01
## 5 1999-06-01
## 6 1996-06-25
# contains(): keep columns whose name contains "a". The match ignores case,
# which is why the upper-case column `A` is returned together with `date`.
select(example1,contains("a"))
##    A       date
## 1 x1 2000-08-15
## 2 x2 1998-07-15
## 3 x3 1995-06-04
## 4 x4 1997-07-01
## 5 x5 1999-06-01
## 6 x6 1996-06-25
# filter(): keep only the rows where B is at least 6.
filter(example1,B>=6)
##    A  B c       date
## 1 x4  7 4 1997-07-01
## 2 x5  9 5 1999-06-01
## 3 x6 11 6 1996-06-25
# Comma-separated conditions must all hold: B >= 6 AND A is one of
# x1, x4, x5. x1 is dropped because its B is 1.
filter(example1,B>=6,A%in%c("x1","x4","x5"))
##    A B c       date
## 1 x4 7 4 1997-07-01
## 2 x5 9 5 1999-06-01
# mutate(): add a new column `ratio` computed row-wise as B / c.
mutate(example1,ratio=B/c)
##    A  B c       date    ratio
## 1 x1  1 1 2000-08-15 1.000000
## 2 x2  3 2 1998-07-15 1.500000
## 3 x3  5 3 1995-06-04 1.666667
## 4 x4  7 4 1997-07-01 1.750000
## 5 x5  9 5 1999-06-01 1.800000
## 6 x6 11 6 1996-06-25 1.833333
# A column created earlier in the same mutate() call (`ratio`) can be used
# immediately by a later expression.
# NOTE(review): `inverse` is computed as ratio - 1, not 1 / ratio; confirm
# whether the column name or the formula reflects what was intended.
mutate(example1,ratio=B/c,inverse=ratio-1)
##    A  B c       date    ratio   inverse
## 1 x1  1 1 2000-08-15 1.000000 0.0000000
## 2 x2  3 2 1998-07-15 1.500000 0.5000000
## 3 x3  5 3 1995-06-04 1.666667 0.6666667
## 4 x4  7 4 1997-07-01 1.750000 0.7500000
## 5 x5  9 5 1999-06-01 1.800000 0.8000000
## 6 x6 11 6 1996-06-25 1.833333 0.8333333
# An unnamed expression becomes a column named after its own text
# (`cumsum(B)`); the values are the running total of B.
mutate(example1,cumsum(B))
##    A  B c       date cumsum(B)
## 1 x1  1 1 2000-08-15         1
## 2 x2  3 2 1998-07-15         4
## 3 x3  5 3 1995-06-04         9
## 4 x4  7 4 1997-07-01        16
## 5 x5  9 5 1999-06-01        25
## 6 x6 11 6 1996-06-25        36
# Cumulative helpers side by side: running sum, running mean, and running
# any()/all() of the test B > 6. cumany() turns TRUE from the first row
# where B > 6; cumall() stays FALSE because the very first row fails the test.
mutate(example1,cumsum(B),cummean(B),cumany(B>6),cumall(B>6))
##    A  B c       date cumsum(B) cummean(B) cumany(B > 6) cumall(B > 6)
## 1 x1  1 1 2000-08-15         1          1         FALSE         FALSE
## 2 x2  3 2 1998-07-15         4          2         FALSE         FALSE
## 3 x3  5 3 1995-06-04         9          3         FALSE         FALSE
## 4 x4  7 4 1997-07-01        16          4          TRUE         FALSE
## 5 x5  9 5 1999-06-01        25          5          TRUE         FALSE
## 6 x6 11 6 1996-06-25        36          6          TRUE         FALSE
# Running minimum and running maximum of B. B is increasing here, so
# cummin() stays at 1 while cummax() follows B.
mutate(example1,cummin(B),cummax(B))
##    A  B c       date cummin(B) cummax(B)
## 1 x1  1 1 2000-08-15         1         1
## 2 x2  3 2 1998-07-15         1         3
## 3 x3  5 3 1995-06-04         1         5
## 4 x4  7 4 1997-07-01         1         7
## 5 x5  9 5 1999-06-01         1         9
## 6 x6 11 6 1996-06-25         1        11
# between(): logical flag for rows whose B falls in the range 4 to 8
# (TRUE for B = 5 and B = 7 in this data).
mutate(example1,between(B,4,8))
##    A  B c       date between(B, 4, 8)
## 1 x1  1 1 2000-08-15            FALSE
## 2 x2  3 2 1998-07-15            FALSE
## 3 x3  5 3 1995-06-04             TRUE
## 4 x4  7 4 1997-07-01             TRUE
## 5 x5  9 5 1999-06-01            FALSE
## 6 x6 11 6 1996-06-25            FALSE
# cume_dist(): cumulative distribution of B, i.e. the share of values that
# are less than or equal to the current one (1/6 for the smallest of 6 values).
mutate(example1,cume_dist(B))
##    A  B c       date cume_dist(B)
## 1 x1  1 1 2000-08-15    0.1666667
## 2 x2  3 2 1998-07-15    0.3333333

鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
R语言高性能编程,优化(一)发布时间:2022-07-18
下一篇:
R语言包_dplyr_1发布时间:2022-07-18
热门推荐
热门话题
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap