#install.packages('caret')

library(caret)

Loading required package: lattice
Loading required package: ggplot2

# Load the automobile data set used by every step below.
car <- read.csv("../0909/automobile.csv")

head(car)

# Dummy variables: expand the categorical `wheels` column into indicator
# columns (the formula's left-hand side, city_mpg, is the outcome).
dummies <- dummyVars(city_mpg ~ wheels, data = car)

head(predict(dummies, newdata = car))

# Near-zero-variance predictors. Keep the indices that nearZeroVar() returns
# and look the names up from them, instead of hard-coding a column position
# copied from a previous run.
nzv.idx <- nearZeroVar(car)
nzv.idx

names(car)[nzv.idx]

# Inspect the column that was flagged in the recorded run (column 9).
car$engine_location

names(car)

# Continuous variables: a logical mask of the numeric columns. vapply() is
# used so the result is always a logical vector, whatever the input.
is.cont <- vapply(car, is.numeric, logical(1))
is.cont

names(car)[is.cont]

cont.vars <- names(car)[is.cont]

head(car[cont.vars])

# Correlation matrix of the continuous variables; computed once and reused.
cont.cor <- cor(car[cont.vars])
cont.cor

# Indices (into cont.vars) that findCorrelation() returns for cutoff 0.9.
high.cor.idx <- findCorrelation(cont.cor, cutoff = 0.9)
high.cor.idx

# Names of those variables, derived from the result rather than a fixed index.
cont.vars[high.cor.idx]

# Centering and scaling: estimate the parameters from the data.
cs.pre <- preProcess(car, method = c("center", "scale"))

cs.pre

Created from 159 samples and 26 variables

Pre-processing:
  - centered (16)
  - ignored (10)
  - scaled (16)

# Apply the fitted center/scale object to the data and preview the first rows.
head(predict(object = cs.pre, newdata = car))

# Estimate a Yeo-Johnson transformation from the data, then print the fitted
# object to show its summary.
tr.pre <- preProcess(x = car, method = "YeoJohnson")
tr.pre

Created from 159 samples and 22 variables

Pre-processing:
  - ignored (10)
  - Yeo-Johnson transformation (12)

Lambda estimates for Yeo-Johnson transformation:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-1.1260 -0.6780 -0.3015 -0.1524  0.1700  1.3301

# Apply the fitted Yeo-Johnson object to the data and preview the first rows.
head(predict(tr.pre, newdata = car))

# Fix the RNG seed so the partition below is reproducible when shown here;
# the original author notes this is not needed in real-world use.
set.seed(1234)

# Data splitting: request 80% of the rows (p = 0.8), based on `price`, in a
# single resample (times = 1). `list = FALSE` is spelled out in full because
# `F` is an ordinary variable that can be reassigned.
idx <- createDataPartition(car$price, p = 0.8, list = FALSE, times = 1)

as.vector(idx)

# Rows selected by idx form the training set; all remaining rows the test set.
train.data <- car[idx, ]
test.data <- car[-idx, ]

dim(train.data)

dim(test.data)

head(test.data)

symboling	normalized_losses	maker	fuel	aspiration	doors	body	wheels	engine_location	wheel_base	⋯	engine_size	fuel_system	bore	stroke	compression_ratio	horsepower	peak_rpm	city_mpg	highway_mpg	price
2	164	audi	gas	std	four	sedan	fwd	front	99.8	⋯	109	mpfi	3.19	3.4	10.0	102	5500	24	30	13950
2	164	audi	gas	std	four	sedan	4wd	front	99.4	⋯	136	mpfi	3.19	3.4	8.0	115	5500	18	22	17450
1	158	audi	gas	std	four	sedan	fwd	front	105.8	⋯	136	mpfi	3.19	3.4	8.5	110	5500	19	25	17710
1	158	audi	gas	turbo	four	sedan	fwd	front	105.8	⋯	131	mpfi	3.13	3.4	8.3	140	5500	17	20	23875
2	192	bmw	gas	std	two	sedan	rwd	front	101.2	⋯	108	mpfi	3.50	2.8	8.8	101	5800	23	29	16430
0	192	bmw	gas	std	four	sedan	rwd	front	101.2	⋯	108	mpfi	3.50	2.8	8.8	101	5800	23	29	16925

symboling	normalized_losses	wheel_base	length	width	height	curb_weight	engine_size	bore	stroke	compression_ratio	horsepower	peak_rpm	city_mpg	highway_mpg	price
2	164	99.8	176.6	66.2	54.3	2337	109	3.19	3.4	10.0	102	5500	24	30	13950
2	164	99.4	176.6	66.4	54.3	2824	136	3.19	3.4	8.0	115	5500	18	22	17450
1	158	105.8	192.7	71.4	55.7	2844	136	3.19	3.4	8.5	110	5500	19	25	17710
1	158	105.8	192.7	71.4	55.9	3086	131	3.13	3.4	8.3	140	5500	17	20	23875
2	192	101.2	176.8	64.8	54.3	2395	108	3.50	2.8	8.8	101	5800	23	29	16430
0	192	101.2	176.8	64.8	54.3	2395	108	3.50	2.8	8.8	101	5800	23	29	16925

	symboling	normalized_losses	wheel_base	length	width	height	curb_weight	engine_size	bore	stroke	compression_ratio	horsepower	peak_rpm	city_mpg	highway_mpg	price
symboling	1.000000000	0.51834364	-0.52059057	-0.33625676	-0.2191860	-0.47518487	-0.2518800	-0.1094533	-0.25646934	-0.02128509	-0.13831574	-0.003949088	0.19910557	0.08954963	0.14983007	-0.1627943
normalized_losses	0.518343643	1.00000000	-0.06008568	0.03554071	0.1097262	-0.41370154	0.1258579	0.2078196	-0.03155814	0.06333048	-0.12725910	0.290510553	0.23769662	-0.23552348	-0.18856420	0.2027613
wheel_base	-0.520590573	-0.06008568	1.00000000	0.87153448	0.8149912	0.55576713	0.8101815	0.6492056	0.57815853	0.16744868	0.29143145	0.516947529	-0.28923445	-0.58065720	-0.61174990	0.7344189
length	-0.336256756	0.03554071	0.87153448	1.00000000	0.8383385	0.49925137	0.8712911	0.7259533	0.64631755	0.12107308	0.18481418	0.672063296	-0.23407384	-0.72454445	-0.72459867	0.7609522
width	-0.219185970	0.10972620	0.81499125	0.83833846	1.0000000	0.29270580	0.8705945	0.7792534	0.57255416	0.19661872	0.25875169	0.681871757	-0.23221605	-0.66668439	-0.69333851	0.8433705
height	-0.475184870	-0.41370154	0.55576713	0.49925137	0.2927058	1.00000000	0.3670518	0.1110826	0.25483608	-0.09131269	0.23330821	0.034317135	-0.24586416	-0.19973748	-0.22613562	0.2448363
curb_weight	-0.251879975	0.12585792	0.81018149	0.87129108	0.8705945	0.36705181	1.0000000	0.8886261	0.64579158	0.17384442	0.22472399	0.790095392	-0.25998788	-0.76215523	-0.78933796	0.8936391
engine_size	-0.109453297	0.20781961	0.64920558	0.72595331	0.7792534	0.11108260	0.8886261	1.0000000	0.59573688	0.29968307	0.14109671	0.812072626	-0.28468581	-0.69913926	-0.71409510	0.8414956
bore	-0.256469345	-0.03155814	0.57815853	0.64631755	0.5725542	0.25483608	0.6457916	0.5957369	1.00000000	-0.10258113	0.01511908	0.560239168	-0.31226891	-0.59044028	-0.59085039	0.5338904
stroke	-0.021285092	0.06333048	0.16744868	0.12107308	0.1966187	-0.09131269	0.1738444	0.2996831	-0.10258113	1.00000000	0.24358681	0.148803798	-0.01131191	-0.02005506	-0.01293438	0.1606643
compression_ratio	-0.138315742	-0.12725910	0.29143145	0.18481418	0.2587517	0.23330821	0.2247240	0.1410967	0.01511908	0.24358681	1.00000000	-0.162305245	-0.41676855	0.27833158	0.22148258	0.2093615
horsepower	-0.003949088	0.29051055	0.51694753	0.67206330	0.6818718	0.03431713	0.7900954	0.8120726	0.56023917	0.14880380	-0.16230524	1.000000000	0.07405682	-0.83721415	-0.82794105	0.7598739
peak_rpm	0.199105574	0.23769662	-0.28923445	-0.23407384	-0.2322160	-0.24586416	-0.2599879	-0.2846858	-0.31226891	-0.01131191	-0.41676855	0.074056824	1.00000000	-0.05292904	-0.03277717	-0.1719161
city_mpg	0.089549633	-0.23552348	-0.58065720	-0.72454445	-0.6666844	-0.19973748	-0.7621552	-0.6991393	-0.59044028	-0.02005506	0.27833158	-0.837214155	-0.05292904	1.00000000	0.97199880	-0.6922731
highway_mpg	0.149830066	-0.18856420	-0.61174990	-0.72459867	-0.6933385	-0.22613562	-0.7893380	-0.7140951	-0.59085039	-0.01293438	0.22148258	-0.827941051	-0.03277717	0.97199880	1.00000000	-0.7200901
price	-0.162794281	0.20276130	0.73441894	0.76095218	0.8433705	0.24483625	0.8936391	0.8414956	0.53389035	0.16066434	0.20936147	0.759873945	-0.17191607	-0.69227306	-0.72009010	1.0000000

symboling	normalized_losses	maker	fuel	aspiration	doors	body	wheels	engine_location	wheel_base	⋯	engine_size	fuel_system	bore	stroke	compression_ratio	horsepower	peak_rpm	city_mpg	highway_mpg	price
1.0595642	1.202423	audi	gas	std	four	sedan	fwd	front	0.2972180	⋯	-0.3357239	mpfi	-0.4119383	0.5549495	-0.04142772	0.2006447	0.8291132	-0.4136385	-0.3222945	0.4260517
1.0595642	1.202423	audi	gas	std	four	sedan	4wd	front	0.2198099	⋯	0.5506615	mpfi	-0.4119383	0.5549495	-0.55563599	0.6238413	0.8291132	-1.3977060	-1.5608401	1.0215069
0.2214015	1.034126	audi	gas	std	four	sedan	fwd	front	1.4583399	⋯	0.5506615	mpfi	-0.4119383	0.5549495	-0.42708392	0.4610734	0.8291132	-1.2336948	-1.0963855	1.0657407
0.2214015	1.034126	audi	gas	turbo	four	sedan	fwd	front	1.4583399	⋯	0.3865161	mpfi	-0.6363753	0.5549495	-0.47850475	1.4376810	0.8291132	-1.5617173	-1.8704765	2.1145925
1.0595642	1.987808	bmw	gas	std	two	sedan	rwd	front	0.5681465	⋯	-0.3685530	mpfi	0.7476527	-1.4797233	-0.34995268	0.1680912	1.4732289	-0.5776497	-0.4771127	0.8479742
-0.6167613	1.987808	bmw	gas	std	four	sedan	rwd	front	0.5681465	⋯	-0.3685530	mpfi	0.7476527	-1.4797233	-0.34995268	0.1680912	1.4732289	-0.5776497	-0.4771127	0.9321886

symboling	normalized_losses	maker	fuel	aspiration	doors	body	wheels	engine_location	wheel_base	⋯	engine_size	fuel_system	bore	stroke	compression_ratio	horsepower	peak_rpm	city_mpg	highway_mpg	price
1.8832956	2.734490	audi	gas	std	four	sedan	fwd	front	99.8	⋯	0.8836456	mpfi	1.144342	3.4	10.0	1.703635	172.1545	2.871724	3.759902	1.291944
1.8832956	2.734490	audi	gas	std	four	sedan	4wd	front	99.4	⋯	0.8846235	mpfi	1.144342	3.4	8.0	1.713120	172.1545	2.652102	3.405769	1.292072
0.9647145	2.725423	audi	gas	std	four	sedan	fwd	front	105.8	⋯	0.8846235	mpfi	1.144342	3.4	8.5	1.709675	172.1545	2.693482	3.550559	1.292080
0.9647145	2.725423	audi	gas	turbo	four	sedan	fwd	front	105.8	⋯	0.8844744	mpfi	1.135284	3.4	8.3	1.727436	172.1545	2.608318	3.298929	1.292218
1.8832956	2.771846	bmw	gas	std	two	sedan	rwd	front	101.2	⋯	0.8835994	mpfi	1.188540	2.8	8.8	1.702829	177.0659	2.839331	3.720730	1.292040
0.0000000	2.771846	bmw	gas	std	four	sedan	rwd	front	101.2	⋯	0.8835994	mpfi	1.188540	2.8	8.8	1.702829	177.0659	2.839331	3.720730	1.292056

[DataMining] 3. 데이터 전처리

Machine Learning의 종류

데이터 전처리

caret

더미 변수 (Dummy Variables)

분산이 0인 예측 변수

연속 변수 고르기

상관관계가 높은 변수 찾기

Centering과 Scaling

데이터 변환

데이터 분할

'BIGDATA > R' 카테고리의 다른 글

티스토리툴바

[DataMining] 2. 시각화를 통한 탐색적 데이터 분석(EDA) (0)	2017.09.16
[DataMining] 1. R Basic Programming (0)	2017.09.09
#12. 고급 시각화 (0)	2016.07.13
#11. 기초 시각화 [ R 내장 함수 ] (0)	2016.07.12
#10. 기술통계 [ 예제 ] (0)	2016.07.12

	wheels.4wd	wheels.fwd	wheels.rwd
1	0	1	0
2	1	0	0
3	0	1	0
4	0	1	0
5	0	0	1
6	0	0	1