%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the used-phone listings and inspect the schema and first rows.
df = pd.read_csv('/home/jaeyoon89/python-data-analysis/data/used_mobile_phone.csv')
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line after the report.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4951 entries, 0 to 4950
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   create_date    4951 non-null   object 
 1   price          4951 non-null   float64
 2   text           4951 non-null   object 
 3   phone_model    4951 non-null   object 
 4   factory_price  4951 non-null   int64  
 5   maker          4951 non-null   object 
 6   price_index    4951 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 270.9+ KB
None


# Derive a 'month' feature by keeping the first seven characters of create_date.
df['month'] = [created[:7] for created in df['create_date']]

# Count the number of listings in each month.
df['month'].value_counts()

2016-10    2956
2017-03    1311
2016-08     107
2016-09     105
2016-04     102
2016-05      89
2016-06      76
2016-07      74
2016-03      70
2016-02      61
Name: month, dtype: int64


# Count listings per day (first ten characters of create_date) and plot the counts.
day_strings = df['create_date'].apply(lambda created: created[:10])
df_day = pd.to_datetime(day_strings).value_counts()
df_day.plot()
plt.show()


# Explore the distribution of price with a histogram.
df['price'].hist(bins="auto")

<AxesSubplot:>


# Mean and standard deviation of price per phone_model, broadcast back to every row.
# The lambdas are kept on purpose: they call NumPy's functions on each group as-is.
price_by_model = df.groupby('phone_model')['price']
df_price_model_mean = price_by_model.transform(lambda prices: np.mean(prices))
df_price_model_std = price_by_model.transform(lambda prices: np.std(prices))

# z-score of every listing: how far its price sits above or below the mean of
# its own phone model, measured in that model's standard deviations.
price_deviation = df['price'] - df_price_model_mean
df_price_model_z_score = price_deviation / df_price_model_std
df_price_model_z_score.hist(bins="auto")

<AxesSubplot:>


# Explore the distribution of the factory_price feature.
df['factory_price'].hist(bins="auto")

# Scatter-plot factory_price against price to look at how the two relate.
df.plot.scatter(x='factory_price', y='price')

<AxesSubplot:xlabel='factory_price', ylabel='price'>


# Number of listings per phone model, followed by summary statistics of those counts.
model_counts = df['phone_model'].value_counts()
count_summary = model_counts.describe()
print(count_summary)

# Box plot of the per-model listing counts.
plt.boxplot(model_counts)

count      64.000000
mean       77.359375
std       143.432786
min        10.000000
25%        23.000000
50%        35.000000
75%        90.500000
max      1002.000000
Name: phone_model, dtype: float64

{'whiskers': [<matplotlib.lines.Line2D at 0x7f0e363db128>,
  <matplotlib.lines.Line2D at 0x7f0e363db400>],
 'caps': [<matplotlib.lines.Line2D at 0x7f0e363db6d8>,
  <matplotlib.lines.Line2D at 0x7f0e363db9b0>],
 'boxes': [<matplotlib.lines.Line2D at 0x7f0e363cd390>],
 'medians': [<matplotlib.lines.Line2D at 0x7f0e363dbc88>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f0e363dbf60>],
 'means': []}


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


# Keep the modelling columns, one-hot encode the categorical ones, then split
# into train and test sets (30% held out).
selected_columns = ['price','phone_model','factory_price','maker','price_index','month']
df = df[selected_columns]
df = pd.get_dummies(df, columns=['phone_model','maker','month'])
y = df['price']
X = df.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


# Train a random forest regressor and predict on both splits.
# The criterion argument is deliberately omitted: the spelling 'mse' was renamed
# to 'squared_error' and later removed from scikit-learn, while the default
# criterion is that same squared-error loss on every version.
forest = RandomForestRegressor(n_estimators=1000)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)


# Evaluate the fitted model: mean squared error and R^2 on train and test.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'MSE train: {mse_train:.3f}, test: {mse_test:.3f}')
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print(f'R^2 train: {r2_train:.3f}, test: {r2_test:.3f}')

MSE train: 10629606583.939, test: 13869786617.291
R^2 train: 0.781, test: 0.683


# Bar chart of the model's feature importances, largest first.
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
bar_positions = range(X.shape[1])
plt.bar(bar_positions, importances[indices])

# Pair each feature name with its importance and show the ten largest.
feat_labels = X.columns.tolist()
feature = list(zip(feat_labels, forest.feature_importances_))
sorted(feature, key=lambda pair: pair[1], reverse=True)[:10]

[('factory_price', 0.4059412771341484),
 ('maker_apple', 0.2969665704508786),
 ('phone_model_galaxy s3 3g 8gb', 0.02214335755126495),
 ('phone_model_iphone se 64gb', 0.02184903571534052),
 ('price_index', 0.02067440244383609),
 ('phone_model_galaxy s4 32gb', 0.017086545532462798),
 ('month_2017-03', 0.01478287257051149),
 ('maker_samsung', 0.014712789035793653),
 ('phone_model_galaxy s6 32gb', 0.012582924738080153),
 ('month_2016-05', 0.01099365095990527)]


# Print the features whose name contains "month", most important first.
ranked_features = sorted(feature, key=lambda pair: pair[1], reverse=True)
for name, importance in ranked_features:
    if "month" in name:
        print((name, importance))

('month_2017-03', 0.01478287257051149)
('month_2016-05', 0.01099365095990527)
('month_2016-09', 0.008403642222469207)
('month_2016-04', 0.00754796183598632)
('month_2016-10', 0.006461208993586404)
('month_2016-06', 0.004434834144756636)
('month_2016-08', 0.0036054588147535877)
('month_2016-07', 0.002716414447187438)
('month_2016-03', 0.0024471929326292602)
('month_2016-02', 0.0010583458515554083)


# Reload the data from the CSV file, replacing the one-hot encoded frame built above.
df = pd.read_csv('/home/jaeyoon89/python-data-analysis/data/used_mobile_phone.csv')


from datetime import datetime
import time


# create_date 피처를 수치적으로 계산하기 위해 unixtime으로 변환하는 함수를 정의한다.
def date_to_unixtime(date_str):
    """Convert a 'YYYY-MM-DD' date string to a Unix timestamp.

    The string is parsed with the '%Y-%m-%d' format and handed to
    ``time.mktime``, which interprets the time tuple as local time.

    Args:
        date_str: Date text in 'YYYY-MM-DD' form.

    Returns:
        The timestamp as a float, as returned by ``time.mktime``.

    Raises:
        ValueError: If ``date_str`` does not match the expected format.
    """
    parsed = datetime.strptime(date_str, '%Y-%m-%d')
    return time.mktime(parsed.timetuple())


# Build a recency score from create_date. First convert the date part (the
# first ten characters) of every value to a Unix timestamp.
df['create_unixtime'] = [date_to_unixtime(created[:10]) for created in df['create_date']]


# Min-max scale the timestamps: the smallest maps to 0 and the largest to 1.
oldest = df['create_unixtime'].min()
newest = df['create_unixtime'].max()
df['create_time_score'] = (df['create_unixtime'] - oldest) / (newest - oldest)
df[['create_date','create_unixtime', 'create_time_score']].head()


# phone_model_storage: the text after the last space of phone_model.
df['phone_model_storage'] = df['phone_model'].apply(lambda name: name.rpartition(" ")[2])

# phone_model_detail: everything before that last space.
df['phone_model_detail'] = df['phone_model'].apply(lambda name: name.rpartition(" ")[0])
df[['phone_model_storage', 'phone_model_detail']].head()


# Listing counts per full phone_model value ...
model_counts = df['phone_model'].value_counts()

# ... and per phone_model_detail value.
model_detail_counts = df['phone_model_detail'].value_counts()
data = [model_counts, model_detail_counts]

# Compare the two count distributions side by side as box plots.
mpl_fig, ax = plt.subplots()
ax.boxplot(data)

{'whiskers': [<matplotlib.lines.Line2D at 0x7f0e310b57f0>,
  <matplotlib.lines.Line2D at 0x7f0e310b5ac8>,
  <matplotlib.lines.Line2D at 0x7f0e310c3c18>,
  <matplotlib.lines.Line2D at 0x7f0e310c3eb8>],
 'caps': [<matplotlib.lines.Line2D at 0x7f0e310b5da0>,
  <matplotlib.lines.Line2D at 0x7f0e310c30b8>,
  <matplotlib.lines.Line2D at 0x7f0e31051208>,
  <matplotlib.lines.Line2D at 0x7f0e310514e0>],
 'boxes': [<matplotlib.lines.Line2D at 0x7f0e310b5588>,
  <matplotlib.lines.Line2D at 0x7f0e310c3860>],
 'medians': [<matplotlib.lines.Line2D at 0x7f0e310c3390>,
  <matplotlib.lines.Line2D at 0x7f0e31051780>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f0e310c3668>,
  <matplotlib.lines.Line2D at 0x7f0e31051a58>],
 'means': []}


# z-score of price within each phone_model_detail group: how far a listing's
# price is above or below its group mean, in units of the group's std.
grouped_price = df.groupby('phone_model_detail')['price']
df['price_by_group'] = grouped_price.transform(
    lambda prices: (prices - prices.mean()) / prices.std())

# Histogram of the group-wise z-scores.
ax = df['price_by_group'].hist(bins="auto")

# The 5th and 95th percentiles of the z-score are named lower_bound and upper_bound.
lower_bound = df['price_by_group'].quantile(0.05)
upper_bound = df['price_by_group'].quantile(0.95)

# Mark both bounds on the histogram with dashed red lines.
for bound in (lower_bound, upper_bound):
    ax.axvline(x=bound, color='r', linestyle='dashed', linewidth=2)

# Print lower_bound, then upper_bound.
print(lower_bound)
print(upper_bound)

-1.3966616903783375
1.666982156397844


# lower_bound보다 낮으면 0, upper_bound보다 높으면 2, 그 중간이면 1로 가격의 상태를 분류하는 함수를 정의한다.
def get_price_level(price, lower, upper):
    """Classify a price against a lower and an upper threshold.

    Args:
        price: The price to classify.
        lower: Threshold at or below which the price is level "0".
        upper: Threshold at or above which the price is level "2".

    Returns:
        "0" if ``price <= lower``; otherwise "2" if ``price >= upper``;
        otherwise "1". The lower check is made first, so it wins when both hold.
    """
    if price <= lower:
        return "0"
    return "2" if price >= upper else "1"

# Per phone_model_detail group, take the 5th and 95th percentile of price as
# that group's thresholds, then label every row with get_price_level().
detail_prices = df.groupby('phone_model_detail')['price']
df['price_lower'] = detail_prices.transform(lambda prices: prices.quantile(0.05))
df['price_upper'] = detail_prices.transform(lambda prices: prices.quantile(0.95))
df['price_level'] = [
    get_price_level(price, lower, upper)
    for price, lower, upper in zip(df['price'], df['price_lower'], df['price_upper'])
]
df[['price', 'price_lower', 'price_upper', 'price_level', 'text']].head()


import pickle
import re

# Load the Joonggonara stop-word dictionary from its pickle file.
stopwords_path = '/home/jaeyoon89/python-data-analysis/data/used_mobile_phone_stopwords.pkl'
with open(stopwords_path, 'rb') as stopwords_file:
    stopwords = pickle.load(stopwords_file)
# Show the first ten registered stop words.
print(stopwords[:10])

['거래', '입니', '판매', '아이폰', '갤럭시', '골드', '팝', '만원', '폰', '시']


# Count how many listings fall into each price_level.
df['price_level'].value_counts()

1    4296
0     334
2     321
Name: price_level, dtype: int64


from konlpy.tag import Okt

# + 를 제외한 특수문자를 제거하고, 숫자형태의 문자를 제거한다.
def text_cleaning(text):
    """Strip special characters and digits from a listing text.

    A character is kept when it is alphanumeric or one of '+', ',' or ' ',
    and it is not a digit.

    Args:
        text: The raw text.

    Returns:
        The filtered text, with the surviving characters in their original order.
    """
    kept_symbols = '+, '
    return ''.join(
        ch for ch in text
        if (ch.isalnum() or ch in kept_symbols) and not ch.isdigit()
    )


# 불용어에 등장하지 않는 형태소만을 추출하여 반환하는 함수이다.
def get_pos(x):
    """Return the morphemes of ``x`` that are not in the stop-word list.

    Args:
        x: Text to tag with the Okt tagger.

    Returns:
        A list with the first element (the token text) of every item returned
        by ``Okt().pos(x)``, in order, skipping tokens found in the
        module-level ``stopwords``.
    """
    # This function is applied to every row of the data set, so the tagger is
    # built once and cached on the function instead of being rebuilt per call.
    tagger = getattr(get_pos, '_tagger', None)
    if tagger is None:
        tagger = get_pos._tagger = Okt()
    # `stopwords` is a list; a set makes each membership test constant-time.
    excluded = set(stopwords)
    return [token[0] for token in tagger.pos(x) if token[0] not in excluded]


# Clean every listing text, then try the morpheme extraction on the first one.
df['text'] = df['text'].apply(text_cleaning)
first_text = df['text'][0]
result = get_pos(first_text)
print(result)

['+', '애플', '라이트', '팝니다', '+', '애플', '라이트', '팝니다', '+', '애플', '라이트', '팝니다', '리퍼', '기간', '만료', '되어서', '징', '하게', '되었습니다', '상태', '초', 'a', '급', '스', '없습니다', '+', '애플', '라이트', '팝니다', '+', '애플', '라이트', '팝니다', '리퍼', '기간', '만료', '되어서', '징', '하게', '되었습니다', '상태', '초', 'a', '급', '스', '없습니다', '징', '애플', '라이트', '홈', '버튼', '링', '카메라', '링', '볼륨', '버튼', '슬립', '버튼', '검금', '심플', '튀지', '않게', '이쁘게', '했구요', '유심', '꽂고', '바로', '사용', '하시면', '됩니다', '사람', '이냐', '자주', '물어보고', '실제', '더욱', '이쁩니다', '밤', '영롱하게', '맥북', '뒷', '사과', '로고', '비춰지고', '요전', '넘어가기', '위해', '합니다', '가능합니다', '박스', '어머니', '버리시고', '이어폰', '충전기', '정품', '드립니다', '직거래', '우선', '순', '위로', '정', '싶으시면', '선', '입금', '택배', '발송', '해드리겠습니다', '믿으시면', '직거래', '하시길', '추천', '해요', '안전', '합니다', '서울시', '강남구', '역삼동', '차병원', '사거리', '근처', '가격']


from collections import Counter
from itertools import chain

# Apply get_pos() to every text and flatten the per-text token lists into one
# corpus. chain.from_iterable flattens in linear time; sum(lists, []) copies
# the growing accumulator on every addition, which is quadratic.
corpus = list(chain.from_iterable(df['text'].apply(get_pos)))

# Keep the 2500 most frequent morphemes of the corpus.
counter = Counter(corpus)
common_words = [key for key, _ in counter.most_common(2500)]
common_words

['입니다',
 '직거래',
 's',
 '합니다',
 '택배',
 '사용',
 '급',
 '상태',
 '팝니다',
 '가능합니다',
 '정상',
 '사진',
 '가격',
 '+',
 '케이스',
 'a',
 '주세요',
 '해지',
 '삭제',
 '제품',
 '있습니다',
 '박스',
 '가능',
 '직접',
 '액정',
 '배터리',
 '성품',
 '필름',
 '리퍼',
 '충전기',
 '없습니다',
 '풀',
 '개통',
 '유심',
 '즈',
 '안전',
 '스',
 '하기',
 '신청',
 '드립니다',
 '통신사',
 '구입',
 '약정',
 '이어폰',
 '공기',
 '새',
 '기변',
 '포함',
 '모델',
 '선택',
 '됩니다',
 '확인',
 '기간',
 '기스',
 '그레이',
 '찍힘',
 '방법',
 '바로',
 '할인',
 '제',
 '시기',
 '스페이스',
 '희망',
 '번호',
 '중고나라',
 '본체',
 '같이',
 '생활',
 '잘',
 '무',
 '퀵',
 '글',
 '된',
 '않을',
 '공식',
 '앱',
 '확정',
 '기능',
 '다운',
 '양',
 '호환',
 '케이블',
 '받기',
 '미',
 '이메일',
 '작성',
 '부분',
 '금지',
 '될수',
 '식아이디',
 '허위',
 '임의',
 '통보',
 '채우지',
 '핸드폰',
 '편한',
 '전혀',
 '충전',
 '내용',
 '연락처',
 '단말기',
 '부산',
 '대구',
 '미사',
 '정품',
 '중고',
 '문제',
 '없이',
 '이상',
 '보호',
 '방문',
 '없는',
 '외관',
 '외',
 '인천',
 '깨끗합니다',
 'x',
 '초기',
 '하나',
 '참고',
 '이구',
 '요금',
 '개월',
 '이나',
 '사이트',
 '팔아요',
 '하시면',
 '했습니다',
 '동',
 '카페',
 '보내',
 '카톡',
 '블로그',
 '링크',
 '싸이',
 '강퇴',
 '삼성',
 '거치',
 '있는',
 '호선',
 '모든',
 '입금',
 '처리',
 '없음',
 '뒷',
 '핑크',
 '카메라',
 '거주지',
 '공',
 '하여',
 '역도',
 '재판매',
 '유도',
 '선',
 '교체',
 '강화유리',
 '경매',
 '만만',
 '부탁드립니다',
 '구성',
 '필수',
 '차대',
 '재시',
 '없고',
 '전체',
 '파손',
 '다른',
 '가능하며',
 '작동',
 '가능한',
 '교환',
 '드리겠습니다',
 '좋습니다',
 '기계',
 '생각',
 '그대로',
 '추가',
 '약간',
 '살짝',
 '바랍니다',
 '테두리',
 '풀박',
 '미개',
 '거의',
 '부담',
 '쪽',
 '조금',
 '비',
 '주시',
 'as',
 '하지',
 '싸게',
 '때',
 '하겠습니다',
 '봉',
 '완전',
 '상품',
 '댓글',
 '착불',
 '부착',
 '때문',
 '금액',
 '아주',
 '폴더',
 '원하시면',
 '와인',
 '할',
 '하며',
 '수',
 '하단',
 '현재',
 '거주',
 '한번',
 '버튼',
 '더',
 '가능하고',
 '정말',
 '있고',
 '엘지',
 '커버',
 '블루',
 '번',
 '원합니다',
 '기본',
 '해주세요',
 '터치',
 '그냥',
 '하는',
 '용감',
 '시간',
 '전부',
 '대전',
 '되어',
 '통화',
 '센터',
 '있구요',
 '쿨',
 '깨끗한',
 '앞',
 '새거',
 '미국',
 '같습니다',
 '광주',
 '근처',
 '없구요',
 '하실',
 '착',
 '방식',
 '환불',
 '애플',
 '아래',
 '서비스',
 '젤리',
 '유리',
 '가능하구요',
 '선호',
 '않습니다',
 '수원',
 '모서리',
 '곳',
 '달',
 '하세요',
 '스마트폰',
 '화면',
 '경기도',
 '신품',
 '강화',
 '제트',
 '드려요',
 '특',
 '부품',
 '발송',
 '년월',
 '상단',
 '유플러스',
 '새것',
 '이후',
 '풀셋',
 '좋은',
 '사양',
 '홍',
 '역',
 '개봉',
 '테스트',
 '수리',
 '있어요',
 '아무',
 '투명',
 '잔기스',
 '흠집',
 '미노트',
 '분실',
 '천안',
 '받은',
 '나머지',
 '점',
 '하자',
 '하시고',
 '저렴하게',
 '않은',
 '신분',
 '유',
 '용량',
 '하시는',
 '전면',
 '언락폰',
 '양호',
 '금',
 '하였습니다',
 '보시다시피',
 '별도',
 '하구요',
 '방탄',
 '스그',
 '여분',
 '본',
 '하셔도',
 '무상',
 '따로',
 '세이프',
 '좀',
 '와이파이',
 '무선',
 '제외',
 '종',
 '되었습니다',
 '천원',
 '프로',
 '눌',
 '매트',
 '배송',
 '약',
 '절충',
 '했구요',
 '우선',
 '대리점',
 '인터넷',
 '없으며',
 '수수료',
 '다시',
 '초',
 '끼',
 '평일',
 '삽니다',
 '고장',
 '이번',
 '부근',
 '겁니다',
 '아님',
 '매장',
 '해드립니다',
 '보조',
 '하던',
 '인식',
 '불량',
 '포장',
 '부천',
 '걸',
 '군데',
 '베가',
 '언제',
 '월일',
 '첨부',
 '전주',
 '항상',
 '갤',
 '처분',
 '불가',
 '비는',
 '매우',
 '남음',
 '없어요',
 '참조',
 '국내',
 '언',
 '가입',
 '잔',
 '지문',
 '있어서',
 '장소',
 '절대',
 '밑',
 '해외',
 '되는',
 '용인',
 '적용',
 '오시',
 '옵티머스',
 '있으며',
 '유니크로',
 '무음',
 '실사',
 '분만',
 '좋아요',
 '팔',
 '안녕하세요',
 '하면서',
 '답변',
 '작은',
 '젠더',
 '가능해요',
 '아직',
 '당연히',
 '넥서스',
 '최초',
 '불입',
 '삼',
 '베터리',
 '드릴게요',
 '개인',
 '오른쪽',
 '아이디',
 '밧데리',
 '드릴께요',
 '스크래치',
 '보관',
 '관심',
 '있지만',
 '선불',
 '변경',
 '큰',
 '되고',
 '팩',
 '공장',
 '급처',
 '꼭',
 '예약',
 '혹시',
 '감사합니다',
 '출구',
 '이용',
 '되구요',
 '안심',
 '내장',
 '펜',
 '먼저',
 '했던',
 '버전',
 '아닙니다',
 '메인보드',
 '안산',
 '한지',
 '안됩니다',
 '주말',
 '사절',
 '스마트',
 '하면',
 '용이',
 '한국',
 '문',
 '최상',
 '오늘',
 '겔럭시',
 '맥스',
 '하게',
 '너무',
 '필요하시면',
 '자국',
 '일본',
 '물',
 '홈',
 '보기',
 '지금',
 '있음',
 'nbsp',
 '최대한',
 '홍콩',
 '저녁',
 '답장',
 '보니',
 '가지',
 '있는데',
 '궁금하신',
 '쓰던',
 '보고',
 '왼쪽',
 '깨끗하고',
 '택포',
 '쪽지',
 '있으면',
 '작년',
 '기타',
 '반품',
 '뒤',
 '많은',
 '사항',
 '만료',
 '락',
 '문의사항',
 '아이언',
 '자세한',
 '상처',
 '그랜드',
 '범퍼',
 '붙여서',
 '진행',
 '청주',
 'schw',
 '알파',
 '남아있습니다',
 '분당',
 '위해',
 '배송비',
 '망',
 '기준',
 '카드',
 '드리구요',
 '네오',
 '사실',
 '임',
 '보이는',
 '오후',
 '사파이어',
 '새로',
 '조건',
 '침수',
 '인근',
 '해요',
 '자세히',
 '처음',
 '강남',
 '환영',
 '이어팟',
 '물건',
 '원하시는',
 '등록',
 '이미지',
 '일산',
 '이동',
 '또한',
 '징',
 '비닐',
 '찍힘이',
 '완납',
 '무료',
 '단자',
 '이외',
 '연',
 '금제',
 '구성은',
 '찍힌',
 '의사',
 '중화역',
 '있으나',
 '하니',
 '부',
 '의정부',
 '있으니',
 '일반',
 '남았습니다',
 '칩',
 '톡',
 '엑스페리아',
 '후시',
 '받지',
 '붙여',
 '빠른',
 '락폰',
 '개호환',
 '현상',
 '대개',
 '얼마',
 '제거',
 '감안',
 '했는데',
 '하였고',
 '안전거래도',
 '팜',
 '하려고',
 '올립니다',
 '예정',
 '유지',
 '목포',
 '월희',
 '알뜰폰',
 '둘다',
 '성남',
 '악세사리',
 '공공',
 '물품',
 '크게',
 '울산',
 '안양',
 '심',
 '같은',
 '집',
 '어댑터',
 '않았습니다',
 'iphone',
 '검수',
 '정중히',
 '레드',
 '전화기',
 '빼',
 '화웨이',
 '경기',
 '내년',
 '이력',
 '업무',
 '샤오미',
 '짐',
 '용기',
 '일단',
 '받고',
 '각',
 '있으시면',
 '광역시',
 '창원',
 '결과',
 '가죽',
 '설정',
 '하셔서',
 '어디',
 '이면',
 '지프로',
 '했고',
 '결제',
 '데이터',
 '전원',
 '미세한',
 'aa',
 '가능하니',
 '있으',
 '써서',
 '받습니다',
 '기존',
 '등등',
 '없네요',
 '되도록',
 '무기',
 '실기',
 '자부',
 '아시겠지만',
 '세트',
 '반',
 '눈',
 '우측',
 '언락',
 '올해',
 '빨리',
 '제조',
 '++',
 '주변',
 '않으며',
 '봅니다',
 '지원',
 '함',
 '상관없이',
 '입니다구',
 '이유',
 '가서',
 '위주',
 '않고',
 '출시',
 '화이트골드',
 '오닉스',
 '티',
 '내부',
 '쓰실',
 '없어서',
 'slte',
 '현금',
 '챙겨',
 '말씀',
 '보증',
 '에누리',
 'sktg',
 '전용',
 '신용',
 '헬로모바일',
 '통일',
 '실제',
 '드리고',
 '깨끗하게',
 '스피커',
 '총',
 '흰색',
 '우체국택배',
 '보면',
 '통신',
 '스카이',
 '쓰시',
 '밖에',
 '원래',
 '정보',
 '물론',
 '날',
 '마지막',
 '되어있습니다',
 '받았습니다',
 '애플스토어',
 '믿고',
 '흔적',
 '되면',
 '통',
 '드림',
 '실리콘',
 '선물',
 '위치',
 '조정',
 '회사',
 'plus',
 '근무',
 '바',
 '내놓습니다',
 '상해',
 '붙이',
 '있어',
 '지장',
 '되지',
 '셋',
 '부평',
 '편입',
 '김포',
 '의무',
 '안전하게',
 '일이',
 '시크릿',
 '칠',
 '거리',
 '인치',
 '역시',
 '흥정',
 '찔러',
 '빼고',
 '순천',
 'c',
 '비밀번호',
 '잠금',
 'sphw',
 '부탁',
 'cj',
 '직',
 '되며',
 '성능',
 '최상급',
 'aaa',
 '사서',
 '해지한',
 '램',
 '잠실',
 '스크레치',
 '걱정',
 '늦어요',
 '법적',
 '찍힘은',
 '순',
 '불로',
 '없지만',
 'lgsu',
 '다녀서',
 '넣어',
 '쓰고',
 '카카오',
 '받아서',
 '일주일',
 '지하철',
 '직구',
 '무조건',
 '평택',
 '협의',
 '루나',
 '잔기',
 '좋구요',
 '끝',
 '죄송합니다',
 '법',
 '빠르게',
 '하다가',
 '찍어',
 '안나',
 '로만',
 '브라운',
 '육',
 '티타늄',
 '안전한',
 '책임집니다',
 '되서',
 'note',
 '색',
 '티탄',
 '여기',
 '일체',
 '메모리',
 '측면',
 '옆',
 '드릴수',
 '오셔서',
 '외부',
 'gold',
 '빼고는',
 '우체국',
 '깨짐',
 '고객',
 '좋음',
 '특성',
 '기단',
 'lt',
 'dmb',
 'lglu',
 '좋겠습니다',
 '계속',
 '위쪽',
 '알',
 '하는데',
 '깨끗이',
 '하루',
 '드릴',
 '롬',
 '애지중지',
 '정',
 '서울시',
 '편이',
 '관계',
 '신규',
 '년도',
 '하였으며',
 '업',
 '어플',
 '지난',
 '쓴',
 '방수',
 '조회',
 '관',
 '하셔야',
 '시오',
 '슈피겐',
 '앞뒤',
 '추천',
 'lgsh',
 '서구',
 '상관없습니다',
 '가능하십니다',
 'm',
 '울',
 '별로',
 '이상무',
 '고속',
 '마시고',
 '등급',
 '저촉',
 '이기',
 '됐습니다',
 'smnk',
 '끝났습니다',
 '취급',
 '요청',
 '동봉',
 '팬택',
 '수준',
 '맨',
 '깨끗함',
 '세용',
 '동대문구',
 '깨진',
 '보입니다',
 '강동',
 '사설',
 '계양구',
 '업자',
 '퀄컴',
 '바꾸게',
 '본인',
 '키',
 '되있습니다',
 '이내',
 '진동',
 '같은거',
 '사은',
 '않아',
 '스티커',
 '깔끔한',
 '뽁뽁',
 '송파',
 '여부',
 '홍대',
 '롤리팝',
 '핀',
 'im',
 '깨끗',
 '블루투스',
 '씌우고',
 '깔끔합니다',
 '라인',
 '했어요',
 '남아',
 '쓰셔도',
 '할부',
 '값',
 '라이트닝',
 '해드리겠습니다',
 '보이지',
 '남겨주세요',
 '하다',
 '샤베트',
 'p',
 '검색',
 '끼워서',
 '인하',
 '남은',
 '받을',
 '관련',
 '드리고요',
 '경산',
 '끼우면',
 '사기',
 '영',
 '광양',
 '퀵서비스',
 '사람',
 '넣어서',
 '아니니',
 '마세요',
 '상의',
 '짜리',
 '현',
 '낮',
 '깨끗해요',
 '여러',
 '잭',
 '흠',
 '됨',
 '끝난',
 '설치',
 '있는거',
 '가장',
 '진짜',
 '광대역',
 '없다고',
 '찍힘이나',
 '원하구요',
 'ok',
 'lgkh',
 '자급',
 '아이',
 '경남',
 '오전',
 '부탁드려요',
 '일괄',
 '가셔서',
 '올려',
 '하시기',
 '대신',
 '촬영',
 '전북',
 '바꾸면서',
 '저장',
 '한글',
 '시세',
 '상자',
 '여수',
 '칠이사이',
 '답터',
 '착용',
 '이전',
 '멀쩡합니다',
 '연결',
 '해보니',
 '최저',
 '당일',
 '편의점',
 'shves',
 '써',
 '회',
 '알리',
 '드리며',
 '없고요',
 '파는',
 '있네요',
 '아예',
 '중랑구',
 '일자',
 '있고요',
 '월말',
 '쓰다가',
 '여서',
 '와같이',
 '받아',
 '소니',
 '스타일',
 '휘',
 '긁',
 '되었고',
 '요구',
 '가구',
 '뭐',
 '강동구',
 '실물',
 '넣고',
 '붙어있는',
 '되요',
 '가능하고요',
 '정식',
 '강남역',
 '눌러서',
 '점검',
 '일대',
 '보험',
 '밤',
 '둘',
 ...]


# Re-check the class sizes of price_level before sampling.
df['price_level'].value_counts()

1    4296
0     334
2     321
Name: price_level, dtype: int64


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# 빈출 형태소를 제외한 모든 형태소를 제거하는 함수를 정의한다.
def get_common_pos(x):
    """Return only the morphemes of ``x`` that are in ``common_words``.

    Args:
        x: Text to tag with the Okt tagger.

    Returns:
        A list with the first element (the token text) of every item returned
        by ``Okt().pos(x)``, in order, keeping only tokens found in the
        module-level ``common_words``.
    """
    # Used as a per-document tokenizer, so the tagger is built once and cached
    # on the function instead of being rebuilt for every document.
    tagger = getattr(get_common_pos, '_tagger', None)
    if tagger is None:
        tagger = get_common_pos._tagger = Okt()
    # `common_words` is a list; a set avoids scanning it for every token.
    allowed = set(common_words)
    return [token[0] for token in tagger.pos(x) if token[0] in allowed]

# Randomly sample the three price levels at a 1:3:1 ratio ('0' : '1' : '2').
# The base size is the smaller of the two extreme classes instead of a
# hard-coded row count, so the sampling still works if the class sizes change.
negative_rows = df[df['price_level']=='0']
neutral_rows = df[df['price_level']=='1']
positive_rows = df[df['price_level']=='2']
base_size = min(len(negative_rows), len(positive_rows))
negative_random = negative_rows.sample(base_size, random_state=30)
neutral_random = neutral_rows.sample(base_size*3, random_state=30)
positive_random = positive_rows.sample(base_size, random_state=30)

# Stack the samples into one data set. pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0; row order and index labels are unchanged.
df_sample = pd.concat([negative_random, neutral_random, positive_random])

# Turn the texts into TF-IDF features: token counts first, then TF-IDF weighting.
index_vectorizer = CountVectorizer(tokenizer=get_common_pos)
X = index_vectorizer.fit_transform(df_sample['text'].tolist())
tfidf_vectorizer = TfidfTransformer()
X = tfidf_vectorizer.fit_transform(X)

# Build the train/test sets for the price-level classifier (20% held out).
y = df_sample['price_level']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=30)
print(x_train.shape)
print(x_test.shape)

(1284, 2476)
(321, 2476)


from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Train a non-linear (RBF-kernel) SVM classifier and report its test accuracy.
svm = SVC(C=10.0, kernel='rbf', gamma=0.10, random_state=0)
svm.fit(x_train, y_train)
y_pred_ksvc = svm.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_ksvc)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.77


from sklearn.metrics import confusion_matrix

# Print the confusion matrix of the SVM predictions on the test split.
confmat = confusion_matrix(y_test, y_pred_ksvc)
print(confmat)

[[ 30  28   4]
 [ 10 181  10]
 [  2  21  35]]


# Create the 'product_status' feature by classifying every listing text with
# the trained SVM.
X = index_vectorizer.transform(df['text'].tolist())
X = tfidf_vectorizer.transform(X)
# Assign the prediction array directly so values are matched to rows by
# position. Wrapping it in pd.Series first would align on index labels and
# fill NaN wherever df's index is not exactly 0..n-1.
df['product_status'] = svm.predict(X)


# Prepare the data for the random forest: keep the modelling columns and
# one-hot encode the categorical ones.
model_columns = ['price', 'factory_price', 'maker', 'price_index', 'create_time_score',
                 'phone_model_storage', 'phone_model_detail', 'product_status']
categorical_columns = ['maker','phone_model_storage','phone_model_detail','product_status']
df = df[model_columns]
df = pd.get_dummies(df, columns=categorical_columns)


# Split into train and test sets (30% held out).
y = df['price']
X = df.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


# Train the random forest on the engineered features and evaluate it.
# The criterion argument is deliberately omitted: the spelling 'mse' was renamed
# to 'squared_error' and later removed from scikit-learn, while the default
# criterion is that same squared-error loss on every version.
forest = RandomForestRegressor(n_estimators=1000)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
# Report mean squared error and R^2 for the train and test splits.
print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

MSE train: 4773645749.137, test: 11017774616.937
R^2 train: 0.902, test: 0.748


# Plot the feature importances as dots, one per feature in column order.
importances = forest.feature_importances_
plt.plot(importances, "o")

[<matplotlib.lines.Line2D at 0x7f0ddd8be780>]


# Pair each feature name with its importance and show the ten largest.
feat_labels = X.columns.tolist()
forest_importances = forest.feature_importances_
feature = list(zip(feat_labels, forest_importances))
sorted(feature, key=lambda pair: pair[1], reverse=True)[:10]

[('maker_apple', 0.2543507464906023),
 ('factory_price', 0.25333310231260026),
 ('create_time_score', 0.11654417957554435),
 ('phone_model_storage_16gb', 0.06025384434282711),
 ('product_status_2', 0.047092572747126506),
 ('phone_model_detail_galaxy s7 edge', 0.029231983949269756),
 ('phone_model_storage_64gb', 0.026883102768965418),
 ('phone_model_detail_galaxy note5', 0.0247083709924326),
 ('phone_model_detail_galaxy s7', 0.022464167900807866),
 ('product_status_0', 0.021591495211777917)]


# Scatter plot of the actual test prices against the predicted prices.
plt.scatter(y_test.values, y_test_pred)

<matplotlib.collections.PathCollection at 0x7f0e312376d8>


from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each hyper-parameter explored by the randomized search.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# 'auto' was removed from scikit-learn; for a regressor it meant "use every
# feature", which the fraction 1.0 expresses on old and new versions alike.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)  # None leaves the tree depth unlimited
bootstrap = [True, False]

# Build the RandomizedSearchCV object: 100 sampled combinations, 3-fold CV.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'bootstrap': bootstrap}
forest = RandomForestRegressor()
optimal_forest = RandomizedSearchCV(estimator=forest,
                                    param_distributions=random_grid,
                                    n_iter=100,
                                    cv=3,
                                    verbose=2,
                                    random_state=42,
                                    n_jobs=-1)

# Run the search on the training split only. Fitting on every row would let
# the refitted best model see the test rows it is scored on afterwards, which
# inflates the test metrics.
X = df.loc[:, df.columns != 'price']
y = df['price']
optimal_forest.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits

RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)


# Print the parameter combination that scored best among the sampled candidates.
print(optimal_forest.best_params_)

{'n_estimators': 2000, 'max_features': 'auto', 'max_depth': 10, 'bootstrap': False}


# Predict used-phone prices with the tuned model and evaluate it on both splits.
y_train_pred = optimal_forest.predict(X_train)
y_test_pred = optimal_forest.predict(X_test)
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print(f'MSE train: {mse_train:.3f}, test: {mse_test:.3f}')
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print(f'R^2 train: {r2_train:.3f}, test: {r2_test:.3f}')

# Bar chart of the tuned model's feature importances, largest first.
importances = optimal_forest.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]
bar_positions = range(X.shape[1])
plt.bar(bar_positions, importances[indices])

MSE train: 8808363320.130, test: 7645657594.144
R^2 train: 0.818, test: 0.825

<BarContainer object of 64 artists>


# Pair each feature name with the tuned model's importance and show the ten largest.
feat_labels = X.columns.tolist()
best_importances = optimal_forest.best_estimator_.feature_importances_
feature = list(zip(feat_labels, best_importances))
sorted(feature, key=lambda pair: pair[1], reverse=True)[:10]

[('factory_price', 0.29661887725382863),
 ('maker_apple', 0.2756146378767071),
 ('phone_model_storage_16gb', 0.07214587829278914),
 ('create_time_score', 0.06581894080008098),
 ('product_status_2', 0.06258234575279699),
 ('phone_model_detail_galaxy s7 edge', 0.02707710135203515),
 ('phone_model_detail_iphone 6', 0.025031998840692824),
 ('product_status_0', 0.024499299222839332),
 ('phone_model_detail_galaxy s4', 0.02053025327910071),
 ('product_status_1', 0.01380911645897503)]


# Scatter plot of the actual test prices against the predicted prices to see
# how closely the two agree.
plt.scatter(y_test.values, y_test_pred)

<matplotlib.collections.PathCollection at 0x7f0eb189dbe0>

	create_date	price	text	phone_model	factory_price	maker	price_index
0	2017-03-19 4 35 00 PM	550000.0	아이폰6플러스 블랙+애플라이트 64기가 팝니다 아이폰6플러스 블랙+애플라이트 64...	iphone 6 64gb	924000	apple	95.96
1	2016-10-26 12 08 00 PM	380000.0	갤럭시s6엣지 32기가 팝니다 직거래 갤럭시s6엣지 32기가 품명 갤럭시s6엣지제...	galaxy s6 edge 32gb	979000	samsung	103.05
2	2016-10-25 12 52 00 PM	300000.0	갤럭시s6 풀박스로 팝니다~~~ 새상품급 실기스조차 없어요 직접거래 구매한지 1...	galaxy s6 32gb	854000	samsung	103.05
3	2017-03-23 11 14 00 PM	290000.0	sk g5 티탄 폰 단품판매합니다 직접거래 sk g5 티탄 폰 단품판매합니다 올...	lg g5 32gb	836000	lg	95.96
4	2016-04-11 7 35 00 PM	280000.0	sony 엑스페리아 c5 ultra e5506 16gb 미사용 새제품 팝니다 1...	lg u 32gb	396000	lg	102.59

	create_date	create_unixtime	create_time_score
0	2017-03-19 4 35 00 PM	1.489849e+09	0.985612
1	2016-10-26 12 08 00 PM	1.477408e+09	0.640288
2	2016-10-25 12 52 00 PM	1.477321e+09	0.637890
3	2017-03-23 11 14 00 PM	1.490195e+09	0.995204
4	2016-04-11 7 35 00 PM	1.460300e+09	0.165468

	price	price_lower	price_upper	price_level	text
0	550000.0	180000.0	680000.0	1	아이폰6플러스 블랙+애플라이트 64기가 팝니다 아이폰6플러스 블랙+애플라이트 64...
1	380000.0	180000.0	414000.0	1	갤럭시s6엣지 32기가 팝니다 직거래 갤럭시s6엣지 32기가 품명 갤럭시s6엣지제...
2	300000.0	150000.0	349000.0	1	갤럭시s6 풀박스로 팝니다~~~ 새상품급 실기스조차 없어요 직접거래 구매한지 1...
3	290000.0	100000.0	500000.0	1	sk g5 티탄 폰 단품판매합니다 직접거래 sk g5 티탄 폰 단품판매합니다 올...
4	280000.0	18000.0	400000.0	1	sony 엑스페리아 c5 ultra e5506 16gb 미사용 새제품 팝니다 1...

이것이 데이터 분석이다 with 파이썬 ch4-2(강남역 맛집 리뷰로 알아보는 감성 분류) (0)	2021.05.20
이것이 데이터 분석이다 with 파이썬ch2-1,2( 나무위키 최근 변경 페이지 키워드 분석하기) (1)	2021.05.19
이것이 데이터 분석이다 with 파이썬 ch5-2(구매 데이터를 분석하여 상품 추천하기) (0)	2021.04.24
이것이 데이터 분석이다 with 파이썬 ch4-1(타이타닉 생존자 가려내기) (0)	2021.04.16
이것이 데이터 분석이다 with 파이썬 ch3-3(미래에 볼 영화의 평점 예측하기) (0)	2021.04.15

speed&direction

티스토리 뷰

이것이 데이터 분석이다 with 파이썬 ch5-1(중고나라 휴대폰 거래가격 예측하기)

ch.05 데이터 종합 분석 예제¶

5.1 중고나라 휴대폰 거래가격 예측하기¶

step.1 탐색적 분석: 중고나라 데이터 분석하기¶

step.2 피처 엔지니어링: 예측 모델 계산하기¶

Step.3 예측 : 중고 휴대폰 거래가 예측하기¶

'이것이 데이터분석이다 with 파이썬' 카테고리의 다른 글

티스토리툴바

	phone_model_storage	phone_model_detail
0	64gb	iphone 6
1	32gb	galaxy s6 edge
2	32gb	galaxy s6
3	32gb	lg g5
4	32gb	lg u

« 2025/06 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30