데이터 다루기 2 [ 실제 데이터 ]
실습은 알까기2 를 바탕으로 하였습니다.
1. hflights
# Load the `hflights` dataset and take a first look at its structure.
# The package is installed only when it is missing, so re-running the script
# does not trigger a fresh download every time.
if (!requireNamespace("hflights", quietly = TRUE)) {
  install.packages("hflights")
}
library("hflights")
# Report the dimensions instead of auto-printing the whole data frame, which
# would flood the console with rows.
dim(hflights)
str(hflights)
head(hflights)
names(hflights)
# Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime
# 5424 2011 1 1 6 1400 1500 AA 428 N576AA 60
# 5425 2011 1 2 7 1401 1501 AA 428 N557AA 60
# 5426 2011 1 3 1 1352 1502 AA 428 N541AA 70
# 5427 2011 1 4 2 1403 1513 AA 428 N403AA 70
# 5428 2011 1 5 3 1405 1507 AA 428 N492AA 62
# 5429 2011 1 6 4 1359 1503 AA 428 N262AA 64
# AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
# 5424 40 -10 0 IAH DFW 224 7 13 0 0
# 5425 45 -9 1 IAH DFW 224 6 9 0 0
# 5426 48 -8 -8 IAH DFW 224 5 17 0 0
# 5427 39 3 3 IAH DFW 224 9 22 0 0
# 5428 44 -3 5 IAH DFW 224 9 9 0 0
# 5429 45 -7 -1 IAH DFW 224 6 13 0 0
# Count the number of flights per destination.
TableOfDest <- table(hflights$Dest)
barplot(TableOfDest)
# Number of distinct destinations (the original run reported 116).
length(TableOfDest)
# Smallest and largest per-destination counts (reported as 1 and 9820).
c(min(TableOfDest), max(TableOfDest))
# Destinations with more than 6000 flights.
Over6000 <- TableOfDest[which(TableOfDest > 6000)]
Over6000
# Destination(s) with the highest count (reported as DAL, 9820 flights).
TableOfDest[which(TableOfDest == max(TableOfDest))]
# Destination(s) with the lowest count (reported as AGS, 1 flight).
TableOfDest[which(TableOfDest == min(TableOfDest))]
# NOTE(review): `-6` removes the sixth entry by position; confirm which
# destination this is meant to exclude.
barplot(Over6000[-6])
2. Cancer
# Number of patients by age.
# Read the patient records, keeping text columns as plain character vectors.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
DF <- read.csv("example_cancer.csv", stringsAsFactors = FALSE)
head(DF)
# age sex height weight dateOfoperation cancerStaging hospitalization diseaseCode
# 1 75 남 161 64 2011-06-22 I 48 C187
# 2 52 여 176.6 75.3 2011-05-19 IV 17 C187
# 3 67 여 154 65.6 2011-05-31 III 10 C187
# 4 62 남 162 57 2011-06-21 I 11 C187
# 5 70 남 171 65 2011-06-09 II 10 C182
# 6 76 여 171 87 2011-06-16 III 10 C189
# Bin ages into decades. `right = FALSE` makes every bin [lower, upper), so an
# age of exactly 20 is counted under "20s"; the default right-closed bins
# such as (10,20] would have put it under "10s".
DegreeAge <- table(cut(
  DF$age,
  breaks = (1:11) * 10,
  labels = c(
    "10s", "20s", "30s", "40s", "50s",
    "60s", "70s", "80s", "90s", "100s"
  ),
  right = FALSE
))
barplot(DegreeAge)
library("ggplot2")
# Install ggthemes only when it is not already available.
if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes")
}
library("ggthemes")
# Frequency polygon of age using a bin width of 10.
De <- ggplot(data = DF, aes(x = age))
De +
  geom_freqpoly(binwidth = 10, size = 1.4, colour = "orange") +
  theme_wsj()
3. Coffee
# data.table resembles data.frame; the original note says it is faster and
# widely used for large data. Install it only when it is missing.
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
library("data.table")
library("ggplot2")
# fread() reads the CSV (the original note says faster than read.csv).
# `data.table = FALSE` asks for a plain data.frame result. `TRUE` is spelled
# out because `T` is an ordinary variable that can be reassigned.
DF <- fread(
  "example_coffee.csv",
  header = TRUE,
  stringsAsFactors = TRUE,
  data.table = FALSE
)
head(DF)
str(DF)
class(DF)
# Drop the address, closure/cessation date and zip columns.
DF <- subset(
  DF,
  select = -c(
    adress, adressBystreet, dateOfclosure, startdateOfcessation,
    duedateOfcessation, dateOfreOpen, zip
  )
)
names(DF)
# Earliest and latest opening years, ignoring missing values.
range(DF$yearOfStart, na.rm = TRUE)
# Row(s) whose opening year equals the earliest year on record.
DF[which(DF$yearOfStart == min(DF$yearOfStart, na.rm = TRUE)), , drop = FALSE]
unique(DF$stateOfbusiness)
# Oldest coffee shop that is still operating: keep only rows whose business
# state is "운영중", then repeat the same lookup on that subset.
DFFilter <- DF[which(DF$stateOfbusiness == "운영중"), , drop = FALSE]
range(DFFilter$yearOfStart, na.rm = TRUE)
DFFilter[
  which(DFFilter$yearOfStart == min(DFFilter$yearOfStart, na.rm = TRUE)),
  ,
  drop = FALSE
]
# Number of cafes opened in each year.
table(DF$yearOfStart)
barplot(table(DF$yearOfStart))
# Same bar chart with ggplot2. qplot() is deprecated, and the bar geom counts
# rows per distinct x value, so the former `binwidth = 1` argument had no
# effect and is dropped.
ggplot(DF, aes(x = yearOfStart)) +
  geom_bar()
# Business status broken down by opening year.
Freq <- table(DF$stateOfbusiness, DF$yearOfStart)
# Keep 1997 onwards. Column names are compared as numbers so the filter does
# not depend on string collation; `drop = FALSE` keeps the two-dimensional
# table even if only one year matches.
Freq <- Freq[, which(as.numeric(colnames(Freq)) >= 1997), drop = FALSE]
# Column-wise proportions: share of each status within a year
# (see help("prop.table")).
PFreq <- prop.table(Freq, margin = 2)
colnames(Freq)
# NOTE(review): rows 1 and 2 of the table are taken by position and labelled
# Open / Close; confirm this matches the level order of `stateOfbusiness`.
NewDF <- data.frame(
  Time = colnames(Freq),
  Open = as.vector(Freq[1, ]),
  Close = as.vector(Freq[2, ]),
  POpen = as.vector(PFreq[1, ]),
  PClose = as.vector(PFreq[2, ])
)
# NOTE(review): row 19 is removed by position; confirm which year this is
# meant to exclude.
NewDF <- NewDF[-19, ]
# Two series over the years: `Close` (blue) and `Open` (red).
GP <- ggplot(NewDF, aes(x = factor(Time), y = Close, group = 1))
GP +
  geom_line(colour = "steelblue1", size = 1) +
  geom_point(colour = "steelblue", size = 3) +
  geom_line(aes(y = Open), colour = "tomato2", size = 1) +
  geom_point(aes(y = Open), colour = "red", size = 3) +
  theme_bw()
4. Size of Coffee
# Work on a copy of the site-size column.
Size <- DF$sizeOfsite
summary(Size)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 0.00 28.12 50.00 75.53 93.75 24080.00 19
plot(Size) # Unusual points above 10000 stand out.
# Remove outliers: values above 10000 become NA.
Size[which(Size > 10000)] <- NA
summary(Size)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 0.00 28.12 50.00 75.02 93.75 1406.00 20
# Zero sizes become NA as well, then every NA is dropped.
Size[which(Size == 0)] <- NA
Size <- Size[!is.na(Size)]
summary(Size)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.25 30.00 51.92 77.23 95.30 1406.00
# Class intervals of width 20, from 0 up to 1440.
DegreeOfSize <- table(cut(Size, breaks = seq(0, 72 * 20, by = 20)))
library("ggplot2")
library("ggthemes")
# Plot the cleaned `Size` vector rather than the raw `DF$sizeOfsite` column,
# so the outlier, zero and NA removal done on `Size` is reflected in the
# chart. The column is named `sizeOfsite` so the axis label is unchanged.
GE <- ggplot(data = data.frame(sizeOfsite = Size), aes(x = sizeOfsite))
GE +
  geom_freqpoly(binwidth = 10, size = 1.2, colour = "orange") +
  scale_x_continuous(limits = c(0, 300), breaks = seq(0, 300, 20)) +
  theme_wsj()
# First raw values of the original column, for reference.
head(DF$sizeOfsite)
5. Population Of Korea
############### National census data: cleanup (preprocessing) ################
# Read the population table, keeping text columns as character vectors.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
DF <- read.csv("example_population.csv", stringsAsFactors = FALSE)
str(DF)
head(DF, 5)
# Provinces city Population Households PersInHou Male Female SexRatio
# 2 서울특별시 종로구 155695 72882 2.14 76962 78733 0.98
# 3 서울특별시 중구 126817 59614 2.13 63292 63525 1.00
# 4 서울특별시 용산구 235186 108138 2.17 114119 121067 0.94
# 5 서울특별시 성동구 298145 126915 2.35 148265 149880 0.99
# Install stringr only when it is not already available.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library("stringr")
library("ggplot2")
# str_split_fixed(string, split pattern, number of pieces)
# Split column 1 at the first "(" and keep only the text before it.
temp <- str_split_fixed(DF[, 1], "\\(", 2)
# Split that text at the first space into two pieces: province and city.
NewCity <- str_split_fixed(temp[, 1], " ", 2)
colnames(NewCity) <- c("Provinces", "city")
head(NewCity)
# Replace the original first column with the two new columns.
DF <- data.frame(NewCity, DF[, 2:7])
# Cells holding exactly one space become NA, and rows with any NA are dropped.
# NOTE(review): presumably this removes province-level total rows that have
# no city name; confirm against the raw file.
DF[DF == " "] <- NA
DF <- DF[complete.cases(DF), ]
head(DF, 10)
# Strip commas from columns 3 to 8 and convert them to numeric.
# gsub() is already vectorised, so each column is converted in one call
# instead of element by element.
DF[3:8] <- lapply(DF[3:8], function(column) {
  as.numeric(gsub(",", "", column, fixed = TRUE))
})
str(DF)
# Make the province column a factor, then sum population per province.
# The total is computed once, after the conversion.
DF[, 1] <- factor(DF[, 1])
ProPopul <- tapply(DF$Population, DF$Provinces, sum)
# Bar chart of population by province; rows that share a province are
# stacked into the same bar.
Graph <- ggplot(DF, aes(x = Provinces, y = Population, fill = Provinces))
Graph +
  geom_bar(stat = "identity") +
  theme_wsj()
'BIGDATA > R' 카테고리의 다른 글
#07. 특강2. [ 평균에 대한 추론 ] (0) | 2016.07.03 |
---|---|
#06. 특강1. [ 경영통계 데이터 요약 및 정리 ] (0) | 2016.07.03 |
#04. 도수분포 [ 연속, 명목 ] (0) | 2016.06.30 |
#03. 데이터 다루기. (0) | 2016.06.27 |
#02. 외부데이터 호출. (0) | 2016.06.26 |