In [2]:
import requests
In [3]:
url = 'https://techcrunch.com/2017/03/08/a-new-affordable-naming-startup-for-startups/'
res = requests.get(url)
In [4]:
res # Success
Out[4]:
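Printing res only displays its status; an explicit success check can be added with raise_for_status(), a minimal sketch:

res.raise_for_status()    # raises requests.HTTPError for 4xx/5xx responses
print(res.status_code)    # 200 on success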
In [5]:
res.text[:200] # The response body as text (first 200 characters).
Out[5]:
In [6]:
import lxml.html
from bs4 import BeautifulSoup
In [10]:
root = lxml.html.fromstring(res.text)
- How to access HTML elements
  - class: use "."
  - id: use "#"
- Accessing elements by class value
  - select with "." (see the selector sketch below)
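A minimal sketch of the two selector forms with lxml's cssselect, reusing the class and id names that appear later in this notebook:

# Class selectors start with ".", id selectors start with "#".
by_class = root.cssselect('.article-entry')     # every element with class="article-entry"
by_id = root.cssselect('#speakable-summary')    # the element with id="speakable-summary"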
In [14]:
entries = root.cssselect('.article-entry')
entries
Out[14]:
In [15]:
len(entries) # Only one tag contains the article body.
Out[15]:
In [16]:
article = entries[0]
content = article.text_content() # Extract the article body as plain text.
content[:100]
Out[16]:
- Accessing elements by id value
  - select with "#"
In [23]:
root.cssselect('#speakable-summary')[0].text_content()[:100]
Out[23]:
In [24]:
bs = BeautifulSoup(res.text, 'html.parser')
type(bs)
Out[24]:
In [25]:
bs.find_all("div", class_='article-entry')  # find_all is the preferred modern spelling of findAll
Out[25]:
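BeautifulSoup also accepts CSS selectors through select(); a minimal sketch, equivalent to the find_all call above:

# select() takes the same CSS selector syntax used with lxml's cssselect.
entries_bs = bs.select('div.article-entry')   # same elements as find_all("div", class_='article-entry')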
In [26]:
h2s = bs.find_all('h2')
len(h2s)
Out[26]:
In [27]:
h2s
Out[27]:
In [29]:
res = requests.get('https://techcrunch.com/startups/')
root = lxml.html.fromstring(res.text)
titles = root.cssselect('h2 a')
len(titles)
Out[29]:
In [30]:
i = 0
for title in titles:
    i += 1
    print("- " + str(i) + " " + title.text)
In [31]:
titles[0].attrib['href']
Out[31]:
In [32]:
for title in titles:
    print(title.attrib['href'])
In [34]:
articles = []
for title in titles:
    url = title.attrib['href']
    res_a = requests.get(url)
    articles.append(res_a.text)
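When fetching many article pages in a row, it can help to check the status code and pause briefly between requests. A hedged sketch of the same loop with those additions (the 1-second delay is an arbitrary choice):

import time

articles = []
for title in titles:
    url = title.attrib['href']
    res_a = requests.get(url)
    if res_a.status_code == 200:   # keep only successful responses
        articles.append(res_a.text)
    time.sleep(1)                  # be polite to the server between requests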
In [35]:
len(articles)
Out[35]:
In [38]:
articles[0][:100]
Out[38]:
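Each downloaded page can be parsed the same way as the single article above. A minimal sketch that extracts the body text of every fetched article, assuming each page uses the same article-entry class:

bodies = []
for html in articles:
    page = lxml.html.fromstring(html)
    entry = page.cssselect('.article-entry')
    if entry:                                  # skip pages without an article body
        bodies.append(entry[0].text_content())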
In [39]:
import tqdm
In [41]:
subTitles = root.cssselect('.river li .block .byline a')
In [44]:
for subTitle in tqdm.tqdm_notebook(subTitles[:10]):
    print(subTitle.text)
    print("https://techcrunch.com" + subTitle.attrib['href'])