# 예제로 개념 잡기 (Learning the concepts through examples)
# Load the salary data set. Cells containing "-" are read as NA and text
# columns are converted to factors.
DF <- read.csv(
  "data/example_salary.csv",
  stringsAsFactors = TRUE,
  na.strings = "-"
)
head(DF)
# Sample output (career "1년미만" = under 1 year, sex "남" = male):
# age salary specialSalary workingTime numberOfWorker career sex scale
# 1 -19 1346534 151840 169.5 15042 1년미만 남 -1.28886999
# 2 20-24 1584214 115375 180.1 74251 1년미만 남 -0.91757018
# 3 25-29 1922043 268058 178.0 143338 1년미만 남 -0.38981924
# 4 30-34 2130988 335710 180.8 103585 1년미만 남 -0.06340878
# 5 35-39 2414345 352816 181.4 65385 1년미만 남 0.37924689
# 6 40-44 2372214 233728 182.9 55422 1년미만 남 0.31343053
summary(DF)
colnames(DF) <- c(
  "age", "salary", "specialSalary", "workingTime",
  "numberOfWorker", "career", "sex"
)
# detach() raises an error when DF is not on the search path (for example in
# a fresh session), so only detach a copy that was attached earlier.
if ("DF" %in% search()) {
  detach(DF)
}
# attach() is kept because later statements in this script refer to the
# columns by their bare names (salary, sex, career, ...).
attach(DF)
Mean <- mean(salary, na.rm = TRUE)
Mid <- median(salary, na.rm = TRUE)
Range <- range(salary, na.rm = TRUE)
# Row(s) holding the highest salary. The maximum is taken from Range rather
# than typed in as a literal (4064286 in the sample data).
w <- which(DF$salary == Range[2])
DF[w, ]
Qnt <- quantile(salary, na.rm = TRUE)
#' Summarise a numeric vector of monthly salaries.
#'
#' @param x Numeric vector; NA values are ignored by every statistic.
#' @return A list with four elements, in this order: the mean, the median,
#'   the range (minimum, maximum) and the quantiles (0%, 25%, 50%, 75%, 100%).
#'   The element names are unchanged from the original so that existing
#'   callers keep working (the third name keeps its original spelling).
SelfSummary <- function(x) {
  # Every statistic is computed from the argument `x`. The earlier body read
  # the attached `salary` column, so the argument was silently ignored.
  Result <- list(
    평균월급 = mean(x, na.rm = TRUE),
    중앙값월급 = median(x, na.rm = TRUE),
    월급번위 = range(x, na.rm = TRUE),
    월급사분위 = quantile(x, na.rm = TRUE)
  )
  return(Result)
}
# Summarise the salary column; the surrounding parentheses make the assigned
# value visible so the list is displayed exactly as a bare `Result` would be.
(Result <- SelfSummary(x = salary))
# $평균월급
# [1] 2171578
# $중앙값월급
# [1] 2120345
# $월급번위
# [1] 1117605 4064286
# $월급사분위
# 0% 25% 50% 75% 100%
# 1117605 1689658 2120345 2519221 4064286
############## Group-wise means #############
# tapply(data, grouping variable, function): mean salary for each sex.
# Columns are read from DF directly instead of relying on attach().
temp <- tapply(DF$salary, DF$sex, mean, na.rm = TRUE)
# Sample output (남 = male, 여 = female):
# 남 여
# 2477332 1865823
class(temp) # array
library(reshape2)
library(ggplot2)
# melt() converts the array into a data frame. The result gets its own name
# so that it does not shadow the melt() function.
sex_means <- melt(temp)
# Var1 value
# 1 남 2477332
# 2 여 1865823
ggplot(sex_means, aes(x = Var1, y = value, fill = Var1)) +
  geom_bar(stat = "identity")
# Standard deviation of salary by sex
tapply(DF$salary, DF$sex, sd, na.rm = TRUE)
# 남 여
# 646470.7 468270.6
# Salary range (minimum, maximum) by sex
tapply(DF$salary, DF$sex, range, na.rm = TRUE)
# Mean salary for each career-length group
career_mean <- tapply(DF$salary, DF$career, mean, na.rm = TRUE)
career_mean
# Reuse the table computed above instead of running tapply() a second time.
melt <- melt(career_mean)
# Line chart of the group means drawn in polar coordinates; the y axis starts
# at 0 and ends at the largest group mean.
ggplot(melt, aes(x = Var1, y = value, group = 1)) +
  geom_line(colour = "skyblue2", size = 2) +
  coord_polar() +
  ylim(0, max(melt$value))
# Standard deviation of salary for each career-length group
tapply(DF$salary, DF$career, sd, na.rm = TRUE)
# Salary range (minimum, maximum) for each career-length group; each element
# of `temp` is a length-2 vector.
temp <- tapply(DF$salary, DF$career, range, na.rm = TRUE)
# Lowest salary in each group (group labels as in the sample data)
temp[[1]][1] # 1~3년미만   (1 to under 3 years)
temp[[2]][1] # 10년이상    (10 years or more)
temp[[3]][1] # 1년미만     (under 1 year)
temp[[4]][1] # 3~5년미만   (3 to under 5 years)
temp[[5]][1] # 5~10년미만  (5 to under 10 years)

# Return the row(s) of DF that hold the lowest salary of the i-th career
# group. The match is restricted to that group: comparing salary alone would
# also pick up a row from another group that happens to have the same value.
lowest_row <- function(i) {
  group_label <- dimnames(temp)[[1]][i]
  DF[which(DF$career == group_label & DF$salary == temp[[i]][1]), ]
}
a1 <- lowest_row(1)
a2 <- lowest_row(2)
a3 <- lowest_row(3)
a4 <- lowest_row(4)
a5 <- lowest_row(5)
list <- list(a1, a2, a3, a4, a5)
# Sample contents of `list` (sex "여" = female):
# [[1]]
# age salary specialSalary workingTime numberOfWorker career sex
# 70 60- 1172399 299639 151.2 30253 1~3년미만 여
#
# [[2]]
# age salary specialSalary workingTime numberOfWorker career sex
# 92 20-24 1685204 1970720 179.4 1886 10년이상 여
#
# [[3]]
# age salary specialSalary workingTime numberOfWorker career sex
# 60 60- 1117605 10667 148 18737 1년미만 여
#
# [[4]]
# age salary specialSalary workingTime numberOfWorker career sex
# 80 60- 1245540 423826 155.1 21106 3~5년미만 여
#
# [[5]]
# age salary specialSalary workingTime numberOfWorker career sex
# 90 60- 1548036 806919 169.1 20282 5~10년미만 여
############# Computing the IQR #################
# summary() on a numeric vector leaves NA values out of its quantiles by
# itself, so no na.rm argument is passed to it.
Salary <- summary(DF$salary)
# IQR by hand: third quartile minus first quartile, picked by name instead of
# by position in the summary.
distIQR <- Salary[["3rd Qu."]] - Salary[["1st Qu."]]
# The same quantity from the built-in helper
iqr <- IQR(DF$salary, na.rm = TRUE)
############ Finding outliers ######
# Load the cancer data set; cells containing "기록없음" are read as NA and
# text columns stay character.
DF <- read.csv(
  "data/example_cancer.csv",
  stringsAsFactors = FALSE,
  na.strings = "기록없음"
)
str(DF)
# Sample output (sex "남" = male, "여" = female):
# 'data.frame': 18310 obs. of 8 variables:
# $ age : int 75 52 67 62 70 76 55 72 64 71 ...
# $ sex : chr "남" "여" "여" "남" ...
# $ height : num 161 177 154 162 171 ...
# $ weight : num 64 75.3 65.6 57 65 87 77 55 67 55.5 ...
# $ dateOfoperation: chr "2011-06-22" "2011-05-19" "2011-05-31" "2011-06-21" ...
# $ cancerStaging : chr "I" "IV" "III" "I" ...
# $ hospitalization: int 48 17 10 11 10 10 12 18 15 35 ...
# $ diseaseCode : chr "C187" "C187" "C187" "C187" ...
# Remove a previously attached DF from the search path, if there is one;
# an unguarded detach() errors when nothing named DF is attached.
if ("DF" %in% search()) {
  detach(DF)
}
attach(DF)
# na.rm = TRUE keeps the mean consistent with the NA handling of the
# quantile-based statistics below.
mean(DF$age, na.rm = TRUE)
summary(DF$age)
boxplot(DF$age) # roughly: people under about 30 or over 100 show up as outliers
grid() # draw grid lines on the plot
distIQR <- IQR(DF$age, na.rm = TRUE)
# 17
posIQR <- quantile(DF$age, probs = c(0.25, 0.75), na.rm = TRUE)
# 25% 75%
# 55 72
# Fences: extend 1.5 * IQR below the 25% point and above the 75% point.
DownWhisker <- posIQR[[1]] - distIQR * 1.5
UpWhisker <- posIQR[[2]] + distIQR * 1.5
# Rows whose age falls outside the fences
Outlier <- subset(DF, subset = (DF$age < DownWhisker | DF$age > UpWhisker))
########### Plotting standardized salary (standardized-score chart)
library(ggplot2)
# Reload the salary data; cells containing "-" are read as NA.
DF <- read.csv(
  "data/example_salary.csv",
  stringsAsFactors = FALSE,
  na.strings = "-"
)
colnames(DF) <- c(
  "age", "salary", "specialSalary", "workingTime",
  "numberOfWorker", "career", "sex"
)
# scale() centres and scales the salary column; rows with NA salary get NA.
Scale <- scale(DF$salary)
# Scale <- matrix(Scale[!is.na(Scale)])
DF <- cbind(DF, scale = Scale)
# Keep only the rows that have a standardized score.
DF2 <- DF[!is.na(DF$scale), ]
# Plot from DF2: it was previously built but never used, so the rows with a
# missing score were handed to ggplot as well.
g1 <- ggplot(DF2, aes(x = scale, y = age))
# Horizontal segment from x = 0 to each point
g2 <- geom_segment(aes(yend = age), xend = 0)
g3 <- g1 + g2 +
  geom_point(size = 7, aes(colour = sex, shape = career)) +
  theme_minimal() +
  theme(text = element_text(size = 20))
g3
# 'BIGDATA > R' 카테고리의 다른 글
# #12. 고급 시각화 (0) | 2016.07.13 |
# ---|---|
# #11. 기초 시각화 [ R 내장 함수 ] (0) | 2016.07.12 |
# #09. 기술통계 (0) | 2016.07.10 |
# #08. 특강3 [ 모비율에 대한 검정 ] (0) | 2016.07.04 |
# #07. 특강2. [ 평균에 대한 추론 ] (0) | 2016.07.03 |