In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import urllib.request
import json
import re
from bs4 import BeautifulSoup
In [17]:
%matplotlib inline
In [5]:
url = 'https://play.google.com/store/apps/details?id=com.venticake.retrica&hl=ko#details-reviews'
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)'}
req = urllib.request.Request(url, headers=headers)
data = urllib.request.urlopen(req).read().decode('utf-8')
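Since requests is already imported, the same fetch can be written more compactly; a minimal equivalent sketch:

# Equivalent fetch using requests (already imported above).
resp = requests.get(url, headers=headers)
resp.raise_for_status()  # fail fast on a non-200 response
data = resp.text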
In [6]:
bs = BeautifulSoup(data, 'html.parser')
div_list = bs.find_all('div', class_="details-section-body expandable")
div_root_list = bs.find_all('div', class_="details-wrapper apps")
- Get the number of voters for each review score
- Count per 1-5 star rating, plus the average and the total number of reviewers
In [7]:
# Get the number of voters for each review score
# Count per 1-5 star rating, plus the average and the total number of reviewers
for div_root in div_root_list:
    rating = div_root.find_all('div', class_="rating-box")
    if len(rating) > 0:
        total_num = rating[0].find_all('span', class_="reviews-num")[0].text
        avg = rating[0].find_all("div", class_="score")[0].text
        score_detail = rating[0].find_all("div", class_="rating-histogram")[0].find_all('div', class_="rating-bar-container")
        total_number = {}
        for score in score_detail:
            label = score.find('span', class_="bar-label").text.strip()
            label_number = score.find('span', class_="bar-number").text
            total_number[label] = label_number
        total_number['avg'] = avg
        total_number['total'] = total_num
In [8]:
total_series = pd.Series(total_number)
total_series.to_csv("total_series.csv")
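To reload the saved scores later, the CSV can be read back into a Series; a minimal sketch, assuming the header-less two-column format that Series.to_csv wrote at the time:

# Read the saved score summary back as a Series.
total_series = pd.read_csv("total_series.csv", index_col=0, header=None).iloc[:, 0]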
In [9]:
total_series
Out[9]:
In [10]:
url = "https://play.google.com/store/getreviews?authuser=0"
id = 'com.venticake.retrica'
token = 'ZLqR3TmB64y6koyq8uj1tqqiQ4k:14191636750027'
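The token is not a fixed value: it is embedded in the JavaScript of the app page fetched earlier, so it has to be scraped per session. A hedged sketch that pulls anything shaped like the token above (base64-like string, colon, digits) out of data — the exact regex is an assumption, and the page markup has changed over time:

# Hypothetical token extraction: match the <base64-ish>:<digits> shape seen above.
m = re.search(r"([\w\-]{20,}:\d{10,})", data)
if m:
    token = m.group(1)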
In [11]:
# Fetch the reviews page by page from the getreviews endpoint.
def GetReviews(url, id, token, pages):
    param = {'reviewType': '0', 'pageNum': '10000', 'id': '', 'reviewSortOrder': '4', 'xhr': '1', 'token': '', 'hl': 'ko'}
    param['id'] = id
    param['token'] = token
    review_date_all = []
    review_star_all = []
    review_user_all = []
    review_title_all = []
    review_body_all = []
    for i in range(1, pages):
        param['pageNum'] = i
        res = requests.post(url, data=param)
        print(" page : {page}, code : {code}".format(page=i, code=res.status_code))
        if res.status_code == 400 or i == pages - 1:
            # The endpoint answers 400 once we page past the last review.
            review_df = pd.DataFrame({'DATE': review_date_all, 'STAR': review_star_all, 'TITLE': review_title_all, 'USER': review_user_all, 'BODY': review_body_all})
            columns_list = ['DATE', 'USER', 'STAR', 'TITLE', 'BODY']
            return review_df[columns_list]
        else:
            body = res.text[6:]  # strip Google's ")]}'" anti-JSON-hijacking prefix before the JSON payload
            res_json = json.loads(body)
            bs = BeautifulSoup(res_json[0][2], 'html.parser')
            review_lists = bs.find_all('div', class_="single-review")
            for j in range(1, len(review_lists)):
                review_date = review_lists[j].find('span', class_="review-date").text  # review date
                review_star = int(re.findall(r'\d+', review_lists[j].find('div', class_="tiny-star star-rating-non-editable-container")['aria-label'])[1])  # pull the digits out of the aria-label rating text
                review_title = review_lists[j].find('span', class_="review-title").text  # title
                review_body = review_lists[j].find('div', class_="review-body with-review-wrapper").text  # review body
                review_body = review_body.replace(review_title, "")
                review_body = review_body.replace("전체 리뷰", "")  # strip the "Full Review" link text
                review_body = review_body.strip()
                review_user = review_lists[j].find('span', class_="author-name").text.strip()  # reviewer name
                review_date_all.append(review_date)
                review_star_all.append(review_star)
                review_user_all.append(review_user)
                review_title_all.append(review_title)
                review_body_all.append(review_body)
In [12]:
DF = GetReviews(url,id,token,10000)
In [13]:
DF['DATE'] = pd.to_datetime(DF['DATE'], format='%Y년 %m월 %d일')
DF = DF.sort_values(by='DATE', ascending=True).reset_index(drop=True)
In [14]:
DF.head()
Out[14]:
In [15]:
DF.to_csv("Retrica_review.csv")
In [18]:
sns.factorplot('STAR', kind='count', data=DF)  # A surprising number of users gave 1 star: reviewers tend to be either completely disappointed or completely satisfied.
Out[18]:
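The same distribution as raw counts, for a quick numeric check alongside the plot:

# Count reviews per star rating, lowest to highest.
DF['STAR'].value_counts().sort_index()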
In [19]:
Under_three = DF[DF['STAR']<=3]
Over_three = DF[DF['STAR']>3]
In [20]:
len(Under_three) + len(Over_three)
Out[20]:
In [21]:
len(DF)
Out[21]:
In [22]:
under_text = Under_three['BODY']
In [27]:
under_text[2499]
Out[27]:
In [29]:
re.sub('[^가-힣\s]',"",under_text[2499])
Out[29]:
In [30]:
under_text = under_text.apply(lambda x:re.sub('[^가-힣\s]',"",x))
In [31]:
from konlpy.tag import Twitter
from sklearn.feature_extraction.text import CountVectorizer
In [34]:
tagger = Twitter()  # note: this class was renamed Okt in newer konlpy releases
In [35]:
def get_word(doc):
    nouns = tagger.nouns(doc)
    return [noun for noun in nouns if len(noun) > 1]  # keep only nouns longer than one character
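A quick sanity check of the tokenizer on a sample review-like sentence (the exact nouns returned depend on the installed konlpy dictionary):

# e.g. returns nouns such as ['카메라', '어플'], depending on the dictionary
get_word("카메라 어플이 자꾸 멈춰요")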
In [36]:
cv = CountVectorizer(tokenizer=get_word, max_features=200)
tdf = cv.fit_transform(under_text)
words = cv.get_feature_names()
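At 200 features the sparse term-document matrix is small enough to expand into a DataFrame for inspection; a minimal sketch:

# One row per review, one column per extracted noun.
tdm = pd.DataFrame(tdf.toarray(), columns=words)
tdm.head()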
In [38]:
words[:10]
Out[38]:
In [39]:
import numpy as np
count_mat = tdf.sum(axis=0)
count_mat
Out[39]:
In [40]:
count = np.squeeze(np.asarray(count_mat))
word_count = list(zip(words, count))
In [41]:
word_count[:10]
Out[41]:
In [42]:
word_count = sorted(word_count, key=lambda t:t[1], reverse=True)
In [44]:
word_count[:5]
Out[44]:
In [45]:
# Keep only words mentioned more than 20 times (otherwise there is too much content).
word_count2 = []
for item in word_count:
    if item[1] > 20:
        word_count2.append(item)
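The same filter can be written as a one-line comprehension:

word_count2 = [item for item in word_count if item[1] > 20]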
In [46]:
from wordcloud import WordCloud
In [47]:
wc = WordCloud(font_path='C:\\Windows\\Fonts\\malgun.ttf', background_color='white', width=400, height=300)
cloud = wc.generate_from_frequencies(dict(word_count2))
plt.figure(figsize=(12,9))
plt.imshow(cloud)
plt.axis('off')
plt.show()
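To keep the image for the post, the cloud can also be written straight to a file; a minimal sketch (the filename is arbitrary):

# WordCloud can render directly to a PNG without matplotlib.
wc.to_file("retrica_wordcloud.png")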