2주 전에 캐글문제 하나 풀고싶어서 주택가격예측 대회에 들어갔다.
들어가서 그냥 평균값으로만 전부 때려박으면 몇 점 나올까 궁금해서 해봤더니
0.4점 나오길래, 1점이 만점이 아닌가 봤더니 0점에 근접할수록 높은 점수였다. 아 그렇구나 하고 종료했었는데,
원래 하던거 끝난 기념으로 3시간동안 노래들으면서 끄적여봤다.
import numpy as np
import pandas as pd
import os
# List every file under the dataset directory so the expected CSVs can be
# confirmed before loading. The inner loop and the print must be nested,
# otherwise the script does not even parse.
for dirname, _, filenames in os.walk('./house_prices'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the train/test CSVs and pull the regression target out of the
# training frame.
train_data = pd.read_csv('./house_prices/train.csv')
test_data = pd.read_csv('./house_prices/test.csv')
train_target = train_data["SalePrice"]

print(train_data.shape)
print(test_data.shape)
print("target : ", train_target)
캐글에서 데이터셋 가져오고,
# Keep untouched copies of the raw inputs. A plain assignment would only
# create a second name for the same object, so any later in-place change
# would silently alter the "backup" as well.
train_data_backUp = train_data.copy()
test_data_backUP = test_data.copy()
train_target_backUp = train_target.copy()

# The target must not be part of the feature matrix.
train_data = train_data.drop(labels=["SalePrice"], axis=1)

# Outside a notebook a bare describe() call is discarded, so print the
# summaries explicitly.
print(train_data.describe())
print(test_data.describe())

print(np.shape(train_data))
print(np.shape(train_target))
print(np.shape(test_data))
from sklearn.model_selection import train_test_split

# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
# The names below are kept because later code refers to them, but note what
# each one actually holds:
#   x_train -> training features      y_test  -> held-out features
#   x_label -> training targets       y_label -> held-out targets
x_train, y_test, x_label, y_label = train_test_split(
    train_data,
    train_target,
    test_size=0.4,
    shuffle=True,
    random_state=1004,
)

print("------")
print(np.shape(x_train))
print(np.shape(y_test))
print(np.shape(x_label))
print(np.shape(y_label))
# describe must be called; printing the attribute only shows the bound
# method's repr instead of the summary statistics.
print(x_train.describe())
print(x_label.describe())
데이터 나눠주고,
# Standardize both feature splits with statistics taken from the training
# split only. Previously the training split was centred with its own mean but
# scaled by the std of the full data set, and the held-out split was scaled by
# its own statistics, so the two splits ended up on different scales.
# numeric_only=True keeps mean()/std() from raising on the string columns.
split_columns = x_train.columns
split_mean = x_train.mean(axis=0, numeric_only=True)
split_std = x_train.std(axis=0, numeric_only=True)

# Scale the numeric columns, then restore the full column layout; the
# non-numeric columns come back as all-NaN.
x_train = ((x_train[split_mean.index] - split_mean) / split_std).reindex(columns=split_columns)
y_test = ((y_test[split_mean.index] - split_mean) / split_std).reindex(columns=split_columns)
print(x_train.describe())

# After standardization 0 is the column mean, so filling with 0 is mean
# imputation for missing values and neutralizes the non-numeric columns.
# (A dropna after this would be a no-op, so it is omitted.)
x_train = x_train.fillna(0)
print(x_train.describe())
y_test = y_test.fillna(0)
데이터 전처리 간단하게 해주었다.
그러다가 그냥 데이터 안나누고 하나로 하고 간단히 해야겠다고 생각들어서
# Standardize the full training feature frame (zero mean, unit variance per
# numeric column). numeric_only=True keeps mean()/std() from raising on the
# string columns.
train_columns = train_data.columns
train_mean = train_data.mean(axis=0, numeric_only=True)
train_std = train_data.std(axis=0, numeric_only=True)

# Scale the numeric columns, then restore the original column layout so the
# feature count stays the same; non-numeric columns come back as all-NaN.
train_data = ((train_data[train_mean.index] - train_mean) / train_std).reindex(columns=train_columns)

# 0 is the column mean after standardization: mean-impute missing values and
# neutralize the non-numeric columns. (A dropna after this would be a no-op.)
train_data = train_data.fillna(0)
print(train_data.describe())
이렇게 train 데이터셋 전부 넣어볼려고 처리해놓았다.
그다음에 모델도 간단하게 정의해보았다.
from tensorflow.keras import models
from tensorflow.keras import layers
def build_model(input_dim=None):
    """Build and compile a small fully connected regression network.

    Args:
        input_dim: Number of input features. When omitted, the width of the
            module-level ``train_data`` frame is used, which is what the
            function always did before this parameter existed.

    Returns:
        A compiled Sequential model with two hidden ReLU layers (128 and 64
        units) and a single linear output unit, using the rmsprop optimizer,
        MSE loss and MAE as the reported metric.
    """
    if input_dim is None:
        input_dim = train_data.shape[1]

    model = models.Sequential()
    model.add(layers.Dense(128, activation='relu', input_shape=(input_dim,)))
    model.add(layers.Dense(64, activation='relu'))
    # Single unit without activation: the network outputs one unbounded value.
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model
# Train on the complete (already standardized) training frame.
model = build_model()
history = model.fit(
    x=train_data,
    y=train_target,
    batch_size=16,
    epochs=1000,
    verbose=1,
)

# NOTE(review): these figures are measured on the same rows the model was
# fitted on, so despite the "val_" names they are training error, not a
# validation score.
val_mse, val_mae = model.evaluate(x=train_data, y=train_target, verbose=1)

# Per-epoch MAE recorded by fit().
mae_history = history.history['mae']
너무 빨리돌아서 에포크 1000해놓았다.
학습시키고 확인해보니
import matplotlib.pyplot as plt

# Plot the recorded MAE per epoch, with epochs numbered from 1.
epoch_numbers = range(1, len(mae_history) + 1)
plt.plot(epoch_numbers, mae_history)
plt.xlabel('Epochs')
plt.ylabel('mae_history')
plt.show()
loss: 751364928.0000, mae: 13097.6797 이정도 나온 것 같다.
그리고 그냥 여기다가 테스트 데이터셋을 넣었다.
테스트 데이터셋도 전처리하고
# Preprocess the test features the same way as the training features.
# Previously the test set was zero-filled first and then scaled with its own
# mean/std, so the model saw inputs on a different scale than it was trained
# on. Use the statistics of the raw training features instead.
test_data1 = test_data_backUP
reference_features = train_data_backUp.drop(labels=["SalePrice"], axis=1)
reference_mean = reference_features.mean(axis=0, numeric_only=True)
reference_std = reference_features.std(axis=0, numeric_only=True)

# Scale the numeric columns, then restore the full column layout so the
# feature count stays what the model expects; non-numeric columns come back
# as all-NaN.
test_data1 = (
    (test_data1[reference_mean.index] - reference_mean) / reference_std
).reindex(columns=test_data_backUP.columns)
test_data1 = test_data1.astype(float)

# 0 is the training mean after standardization: mean-impute missing values and
# neutralize the non-numeric columns.
test_data1 = test_data1.fillna(0)

# Id is a row identifier, not a feature. Test Ids lie outside the training Id
# range, so hold the column at its neutral value instead of extrapolating.
if "Id" in test_data1.columns:
    test_data1["Id"] = 0.0

print(test_data1.describe())
이것저것 확인해보고 데이터맞추고 예측해보니
# Run the trained model on the preprocessed test features, then show the
# shape of the result followed by the raw predictions.
predict = model.predict(test_data1)
print(predict.shape)
print(predict)
๊ฐ์ด ๋์๊ธธ๋ csvํ์ผ๋ก ๋ง๋ค์๋ค.
# Pair each test Id with its predicted price. The prediction column is named
# "SalePrice" (it used to be written under the header "0") and the (n, 1)
# prediction array is flattened to one value per row.
testR11 = pd.DataFrame(test_data_backUP["Id"])
testR22 = pd.DataFrame({"SalePrice": np.ravel(predict)})
result11 = pd.concat([testR11, testR22], axis=1)
print(result11)

# index=False keeps the row index out of the file.
result11.to_csv("210813_2.csv", index=False)
๋ง๋ค๊ณ ์ ์ถํด๋ณด๋
์ด๋ ๊ฒ ๋์๋ค. ์์ ์ํ๊ณ ์ผํ์ฑ์ผ๋ก ๋์จ๋๋ก ๋ฃ๊ณ ๋๋ด๋ดค๋๋ฐ,
๊ทธ๋ฅ ํ๊ท ๊ฐ๋ฃ๋๊ฑฐ๋ณด๋ค ๊ฐ๋จํ๊ฒ ๋ชจ๋ธ๋ง๋ค๊ณ ๋ค๋ฅธ ์์ฑ๋ค ์๋ฐ์ง๊ณ ๋๋ ค๋ด๋ ์๋์ ์ผ๋ก ๋๊ฒ ๋์ค๋ ๊ฒ ๊ฐ๋ค.
์ฌ๊ธฐ์ ๊ฐ ์์ฑ๋ค์ ๊ด๊ณ๋ฅผ ๋ฐ์ง๊ณ , ๋ฌธ์์ด ์์ฑ๋ค๋ floatํ์ผ๋ก ๋ฐ๊พธ๊ณ ๋๋ฆฌ๋ฉด ํจ์ฌ ๋๊ฒ๋์ฌ๊ฒ๊ฐ๋ค.
ํ ์ผ๋ ๋๋ฌ๊ณ , ๊ธ์์ผ์ด๋ผ ํ๋ฃจ์ด๊ฒธ ๋๋ ค๋ดค๋๋ฐ ์ฌ๋ฐ์๋ฐ.