library(ggplot2)

# Load the diamonds dataset and preview its first rows.
data(diamonds)
head(diamonds)

# Adjust the size at which ggplot output is rendered in a Jupyter notebook.
options(repr.plot.width = 7, repr.plot.height = 4)

# Bar plot of the diamonds data with the `cut` field on the x-axis.
ggplot(diamonds, aes(x = cut)) +
  geom_bar()

library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

# Tabulate the number of diamonds in each `cut` category.
count(diamonds, cut)

# Histogram of `carat` using bins that are 0.5 wide.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

# Count rows per 0.5-wide `carat` interval.
count(diamonds, cut_width(carat, 0.5))

# Keep only the diamonds below 3 carats and preview the subset.
smaller <- filter(diamonds, carat < 3)
head(smaller)

# Histogram of `carat` for the subset, 0.1-wide bins.
ggplot(smaller, aes(x = carat)) +
  geom_histogram(binwidth = 0.1)

# Frequency polygons of `carat`, one coloured line per `cut`.
ggplot(smaller, aes(x = carat, colour = cut)) +
  geom_freqpoly(binwidth = 0.1)

# Histogram of `carat` for the sub-3-carat diamonds, with bars coloured by `cut`.
# The original call ended with a dangling `position = ` (an empty argument),
# which R treats as "missing" and silently replaces with the default. The
# position adjustment is now spelled out as "stack", the geom_histogram()
# default, so the rendered plot is unchanged but the intent is explicit.
# NOTE(review): `colour` only sets the bar outline; if filled bars per cut were
# intended, `fill = cut` would be the aesthetic to use -- confirm with author.
ggplot(data = smaller, mapping = aes(x = carat, colour = cut)) +
  geom_histogram(binwidth = 0.1, position = "stack")

# Histogram of `carat` on the full diamonds data with very narrow 0.01 bins.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.01)

# Load the faithful dataset and plot the distribution of `eruptions`
# using 0.25-wide bins.
data(faithful)

ggplot(faithful, aes(x = eruptions)) +
  geom_histogram(binwidth = 0.25)

# Histogram of the `y` column, 0.5-wide bins.
ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5)

# Same histogram with the vertical axis limited to counts between 0 and 50
# via coord_cartesian(), so that short bars become visible.
ggplot(data = diamonds, mapping = aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))

# Collect the rows whose `y` value is below 3 or above 20, keep only the
# price and dimension columns, and sort them by `y`.
unusual <- diamonds %>%
  filter(y < 3 | y > 20) %>%
  select(price, x, y, z) %>%
  arrange(y)

# Show up to ten of those rows.
head(unusual, n = 10)

# Copy of diamonds in which `y` values below 3 or above 20 are replaced by NA.
diamonds2 <- mutate(diamonds, y = ifelse(y < 3 | y > 20, NA, y))
head(diamonds2)

# Scatter plot of `x` against `y` on the cleaned data.
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point()

Warning message:
“Removed 9 rows containing missing values (geom_point).”

# Same scatter plot, with missing values removed via na.rm = TRUE.
ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)

# Frequency polygons of `price` (bin width 500), one coloured line per `cut`;
# the y-axis shows raw counts.
ggplot(diamonds, aes(x = price, colour = cut)) +
  geom_freqpoly(binwidth = 500)

# Frequency polygons of `price` (bin width 500) per `cut`, with density on the
# y-axis instead of raw counts.
# The computed variable is referenced with after_stat(density); the older
# `..density..` dot-dot notation is deprecated in current ggplot2 releases and
# emits a deprecation warning there.
ggplot(data = diamonds, mapping = aes(x = price, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)

# Box plot of `price` for each `cut` category.
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot()

# The same box plot with the axes swapped by coord_flip().
ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot() +
  coord_flip()

# Count plot of the `cut` / `color` combinations.
ggplot(diamonds, aes(x = cut, y = color)) +
  geom_count()

# Tile plot of the same combinations: count rows per (color, cut) pair and
# map the resulting count column `n` to the tile fill.
ggplot(count(diamonds, color, cut), aes(x = color, y = cut, fill = n)) +
  geom_tile()

# Scatter plot of `price` against `carat`.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point()

# Same scatter plot with nearly transparent points (alpha = 1/100).
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 100)

# 2-D binned view of `carat` vs `price` for the sub-3-carat subset.
ggplot(smaller, aes(x = carat, y = price)) +
  geom_bin2d()

carat	cut	color	clarity	depth	table	price	x	y	z
0.23	Ideal	E	SI2	61.5	55	326	3.95	3.98	2.43
0.21	Premium	E	SI1	59.8	61	326	3.89	3.84	2.31
0.23	Good	E	VS1	56.9	65	327	4.05	4.07	2.31
0.29	Premium	I	VS2	62.4	58	334	4.20	4.23	2.63
0.31	Good	J	SI2	63.3	58	335	4.34	4.35	2.75
0.24	Very Good	J	VVS2	62.8	57	336	3.94	3.96	2.48

cut	n
Fair	1610
Good	4906
Very Good	12082
Premium	13791
Ideal	21551

cut_width(carat, 0.5)	n
[-0.25,0.25]	785
(0.25,0.75]	29498
(0.75,1.25]	15977
(1.25,1.75]	5313
(1.75,2.25]	2002
(2.25,2.75]	322
(2.75,3.25]	32
(3.25,3.75]	5
(3.75,4.25]	4
(4.25,4.75]	1
(4.75,5.25]	1

carat	cut	color	clarity	depth	table	price	x	y	z
0.23	Ideal	E	SI2	61.5	55	326	3.95	3.98	2.43
0.21	Premium	E	SI1	59.8	61	326	3.89	3.84	2.31
0.23	Good	E	VS1	56.9	65	327	4.05	4.07	2.31
0.29	Premium	I	VS2	62.4	58	334	4.20	4.23	2.63
0.31	Good	J	SI2	63.3	58	335	4.34	4.35	2.75
0.24	Very Good	J	VVS2	62.8	57	336	3.94	3.96	2.48

price	x	y	z
5139	0.00	0.0	0.00
6381	0.00	0.0	0.00
12800	0.00	0.0	0.00
15686	0.00	0.0	0.00
18034	0.00	0.0	0.00
2130	0.00	0.0	0.00
2130	0.00	0.0	0.00
2075	5.15	31.8	5.12
12210	8.09	58.9	8.06

[DataMining] 2. 시각화를 통한 탐색적 데이터 분석(EDA)

데이터 시각화를 통한 탐색적 데이터 분석

ggplot2

기본 데이터 ( Diamonds )

ggplot의 기초

1. 히스토그램 (Histogram)

2. Freqpoly

다른 변수들에서 분포의 특성을 찾아보기

faithful data

3. Outlier(이상점)

세로축을 제한하여 그리기

4. Missing Value(결측치)

5. Count vs Density

6. Box plot

7. 두 개의 이산 변수

8. 두 개의 연속 변수

'BIGDATA > R' 카테고리의 다른 글

티스토리툴바

[DataMining] 3. 데이터 전처리 (0)	2017.09.25
[DataMining] 1. R Basic Programming (0)	2017.09.09
#12. 고급 시각화 (0)	2016.07.13
#11. 기초 시각화 [ R 내장 함수 ] (0)	2016.07.12
#10 .기술통계 [ 예제 ] (0)	2016.07.12