Table of Contents
0. Data
1. Linear Regression
a. Splitting into train/test sets
b. Reshaping to 2-D arrays
c. Checking the score
d. Switching to 3 neighbors, then finding a and b in ax + b
e. Drawing the fitted line, the scatter plot, and the (50 cm, 1241.8 g) point
f. Checking the score (R**2)
2. Polynomial Regression
a. 2-D arrays
b. Training
c. Drawing the graph
d. Checking the score
3. Multiple Regression
a. scikit-learn transformers
b. Standardization
c. Ridge
1) Applying Ridge and checking the score
2) Plotting against alpha and picking a value
3) Applying it
d. Lasso
1) Applying Lasso and checking the score
2) Plotting against alpha and picking a value
3) Applying it
Data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
perch_length = np.array(
[8.4, 13.7, 15.0, 16.2, 17.4, 18.0, 18.7, 19.0, 19.6, 20.0,
21.0, 21.0, 21.0, 21.3, 22.0, 22.0, 22.0, 22.0, 22.0, 22.5,
22.5, 22.7, 23.0, 23.5, 24.0, 24.0, 24.6, 25.0, 25.6, 26.5,
27.3, 27.5, 27.5, 27.5, 28.0, 28.7, 30.0, 32.8, 34.5, 35.0,
36.5, 36.0, 37.0, 37.0, 39.0, 39.0, 39.0, 40.0, 40.0, 40.0,
40.0, 42.0, 43.0, 43.0, 43.5, 44.0]
)
perch_weight = np.array(
[5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0,
110.0, 115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0,
130.0, 150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0,
197.0, 218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0,
514.0, 556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0,
820.0, 850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0,
1000.0, 1000.0]
)
df = pd.read_csv('https://bit.ly/perch_csv_data')
perch_full = df.to_numpy()
Linear Regression
from sklearn.model_selection import train_test_split
# split into train and test sets
train_input, test_input, train_target, test_target = train_test_split(perch_length, perch_weight, random_state = 42)
# reshape to 2-D arrays
train_input = train_input.reshape(-1,1)
test_input = test_input.reshape(-1,1)
print(train_input.shape, test_input.shape)
# the inputs start out as 1-D arrays, but scikit-learn expects a 2-D array of shape (n_samples, n_features)
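As a quick aside, reshape(-1, 1) turns a 1-D array into a single-column 2-D array, with -1 letting NumPy infer the row count; a minimal standalone example:
import numpy as np
v = np.array([1, 2, 3, 4])   # 1-D array, shape (4,)
m = v.reshape(-1, 1)         # 2-D column, shape (4, 1); -1 infers the number of rows
print(v.shape, m.shape)      # → (4,) (4, 1)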
# score of the k-NN regression model
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor()
knr.fit(train_input, train_target)
print(knr.score(test_input, test_target)) # →0.99
# switch the model to 3 neighbors and predict a 50 cm perch
knr.n_neighbors = 3
knr.fit(train_input, train_target)
print(knr.predict([[50]])) # → 1033.33
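The 1033.33 is just the average weight of the 3 nearest training samples: k-NN can only average its neighbors' targets, so it cannot extrapolate beyond the training range. A quick check with the fitted knr from above (kneighbors returns the distances and training-set indices of the nearest neighbors):
distances, indexes = knr.kneighbors([[50]])
print(np.mean(train_target[indexes]))  # → 1033.33..., identical to the prediction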
# predict a 50 cm perch with linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(train_input, train_target)
print(lr.predict([[50]])) #→1241.83
print(lr.coef_, lr.intercept_) #→[39.0] -709
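The prediction above can be reproduced by hand from these two values, confirming the model is weight = a * length + b:
# manual check of the fitted line: a*50 + b matches lr.predict([[50]])
print(lr.coef_[0] * 50 + lr.intercept_)  # → 1241.83...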
# draw the fitted line, the training scatter plot, and the (50 cm, 1241.8 g) point
plt.scatter(train_input, train_target)
plt.plot([15,50], [15*lr.coef_ + lr.intercept_, 50*lr.coef_+ lr.intercept_])
plt.scatter(50, 1241.8, marker = '^')
plt.show()
# check the scores (R**2)
print(lr.score(train_input, train_target)) #0.93
print(lr.score(test_input, test_target)) #0.82
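For reference, score() here is the coefficient of determination, R**2 = 1 - SS_res/SS_tot; a small sketch recomputing the test score by hand (r2_score is scikit-learn's helper for the same quantity):
from sklearn.metrics import r2_score
pred = lr.predict(test_input)
print(r2_score(test_target, pred))  # same value as lr.score(test_input, test_target)
print(1 - np.sum((test_target - pred)**2) / np.sum((test_target - np.mean(test_target))**2))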
Polynomial Regression
# 2-D arrays for polynomial regression
train_poly = np.column_stack((train_input**2, train_input))
test_poly = np.column_stack((test_input**2, test_input))
print(train_poly.shape, test_poly.shape)
# training
lr = LinearRegression()
lr.fit(train_poly, train_target)
print(lr.predict([[50**2, 50]]))
print(lr.coef_, lr.intercept_)
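Again the prediction can be verified by hand; note the column order set above means coef_[0] multiplies length**2 and coef_[1] multiplies length:
# manual check of weight = 1.01*length^2 - 21.6*length + 116.05
print(lr.coef_[0] * 50**2 + lr.coef_[1] * 50 + lr.intercept_)  # ≈ 1574, matching predict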
# draw the graph
point = np.arange(15,50)
plt.scatter(train_input, train_target)
plt.plot(point, 1.01*point**2 -21.6*point + 116.05 )
plt.scatter(50, 1574, marker = '^')
plt.show()
# check the scores
print(lr.score(train_poly, train_target))
print(lr.score(test_poly, test_target))
Multiple Regression
# split the 3-feature data into train and test sets
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(perch_full, perch_weight, random_state = 42)
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(include_bias = False) # drop the constant bias feature of 1
poly.fit(train_input)
train_poly = poly.transform(train_input)
test_poly = poly.transform(test_input)
print(train_poly.shape) # → (42, 9): 9 features (the 3 originals, their squares, and the 3 pairwise products)
#poly.get_feature_names_out()
#→ array(['x0', 'x1', 'x2', 'x0^2', 'x0 x1', 'x0 x2', 'x1^2', 'x1 x2','x2^2'], dtype=object)
#poly.fit([[2,3]])
#print(poly.transform([[2,3]]))
#→[[2,3,4,6,9]]
# train the multiple regression model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(train_poly, train_target)
#print(lr.score(train_poly, train_target)) →0.99
#print(lr.score(test_poly, test_target)) →0.97
# adding features up to the 5th power
poly = PolynomialFeatures(degree = 5, include_bias = False)
poly.fit(train_input)
train_poly = poly.transform(train_input)
test_poly = poly.transform(test_input)
print(train_poly.shape) # → (42, 55): as many as 55 features
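With 55 features but only 42 training samples, plain linear regression memorizes the training set and collapses on the test set, which is why regularization comes next. A quick check of the overfitting (reusing LinearRegression as above):
lr = LinearRegression()
lr.fit(train_poly, train_target)
print(lr.score(train_poly, train_target))  # essentially perfect on the training set
print(lr.score(test_poly, test_target))    # typically far below 0 here: severe overfitting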
# set up standardization
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(train_poly)
train_scaled = ss.transform(train_poly)
test_scaled = ss.transform(test_poly)
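Ridge and lasso penalize coefficient sizes, so the features must be on a common scale first; StandardScaler subtracts each feature's training mean and divides by its training standard deviation. A quick sanity check on the result:
print(train_scaled.mean(axis=0).round(6))  # ≈ 0 for every feature
print(train_scaled.std(axis=0).round(6))   # ≈ 1 for every feature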
Multiple Regression - Ridge
# ridge with its default alpha
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(train_scaled, train_target)
print(ridge.score(train_scaled, train_target))
print(ridge.score(test_scaled, test_target))
# search for a good alpha value for ridge
train_score = []
test_score = []
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    ridge = Ridge(alpha = alpha)
    ridge.fit(train_scaled, train_target)
    train_score.append(ridge.score(train_scaled, train_target))
    test_score.append(ridge.score(test_scaled, test_target))
plt.plot(np.log10(alpha_list), train_score)
plt.plot(np.log10(alpha_list), test_score)
plt.xlabel('alpha')
plt.ylabel('R^2')
plt.show()
# → best at alpha = 0.1 (log10(alpha) = -1 on the graph)
ridge = Ridge(alpha = 0.1)
ridge.fit(train_scaled, train_target)
print(ridge.score(train_scaled, train_target))
print(ridge.score(test_scaled, test_target))
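Instead of reading the graph by eye, the best alpha can also be pulled straight from the recorded scores (a small sketch using the test_score list built above):
best_alpha = alpha_list[np.argmax(test_score)]
print(best_alpha)  # → 0.1, matching the graph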
Multiple Regression - Lasso
# lasso with its default alpha
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(train_scaled, train_target)
print(lasso.score(train_scaled, train_target))
print(lasso.score(test_scaled, test_target))
# compute the scores for each alpha
train_score = []
test_score = []
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]
for alpha in alpha_list:
    lasso = Lasso(alpha = alpha, max_iter = 10000)
    lasso.fit(train_scaled, train_target)
    train_score.append(lasso.score(train_scaled, train_target))
    test_score.append(lasso.score(test_scaled, test_target))
# plot the scores to find the best alpha
plt.plot(np.log10(alpha_list), train_score)
plt.plot(np.log10(alpha_list), test_score)
plt.xlabel('alpha')
plt.ylabel('R^2')
plt.show()
# → best at alpha = 10 (log10(alpha) = 1 on the graph)
lasso = Lasso(alpha = 10)
lasso.fit(train_scaled, train_target)
print(lasso.score(train_scaled, train_target))
print(lasso.score(test_scaled, test_target))
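Unlike ridge, lasso drives some coefficients exactly to zero, effectively performing feature selection; counting the zeros in the fitted model shows how many of the 55 features it dropped:
print(np.sum(lasso.coef_ == 0))  # number of features lasso eliminated outright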
※ Source: 혼자 공부하는 머신러닝 딥러닝