12. 网络爬虫之提取

标签元素

Beautiful Soup 库的理解

bs4 库的便利功能

HTML 页面的友好输出

soup.prettify()

信息标记与提取方法

实例: 中国大学排名定向爬虫

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import requests
import bs4
from bs4 import BeautifulSoup

def getHTMLText(url):
    """Fetch ``url`` and return the response body as text.

    The request is sent with a browser-like ``User-Agent`` header and a
    10-second timeout. The response encoding is set from
    ``response.apparent_encoding`` before the text is decoded.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or the literal string ``"产生HTTPError"`` when
        the request fails (connection error, timeout, or an error HTTP
        status reported by ``raise_for_status``).
    """
    # Send a browser-like User-Agent header with the request.
    kv = {'User-Agent':'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132'}
    try:
        response = requests.get(url, timeout=10, headers=kv)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        # Only request-related failures are turned into the sentinel string;
        # KeyboardInterrupt, SystemExit and programming errors propagate.
        return "产生HTTPError"

def fillUnivList(ulist, html):
    """Parse ``html`` and append one tuple per table row to ``ulist``.

    Each appended tuple holds the ``.string`` of the first three ``<td>``
    cells of a row found directly under the document's first ``<tbody>``.

    Args:
        ulist: List that is extended in place with the extracted tuples.
        html: HTML text to parse.

    Returns:
        None. ``ulist`` is left unchanged when the document has no
        ``<tbody>``.
    """
    document = BeautifulSoup(html, "html.parser")
    table_body = document.tbody
    if table_body is None:
        # No table body at all, e.g. ``html`` is an error message rather
        # than a real page; there is nothing to extract.
        return
    for item in table_body.children:
        # ``children`` also yields whitespace/text nodes; only tags are rows.
        if not isinstance(item, bs4.element.Tag):
            continue
        tds = item.find_all('td')
        if len(tds) < 3:
            # Skip rows that do not carry the three expected cells.
            continue
        ulist.append((tds[0].string, tds[1].string, tds[2].string))

def printUnivList(ulist, num):
    """Print a header line followed by at most ``num`` rows of ``ulist``.

    Args:
        ulist: Sequence of 3-item rows; each row is printed as three
            centred, tab-separated columns of width 10.
        num: Maximum number of rows to print. When ``ulist`` holds fewer
            rows, only the available ones are printed; a value of zero or
            less prints the header only.
    """
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    # Pad the middle column with the ideographic (full-width) space U+3000
    # so that mixed Chinese/ASCII output lines up. chr(12288) is used rather
    # than a literal so the character cannot be lost to copy/paste.
    char_blank = chr(12288)
    print(tplt.format("排名", "学校", "省市", char_blank))
    # Slicing never raises when fewer than ``num`` rows exist; max() keeps a
    # negative ``num`` from being interpreted as "all but the last rows".
    for u in ulist[:max(num, 0)]:
        print(tplt.format(u[0], u[1], u[2], char_blank))


def main():
    """Download the 2018 ranking page, extract its rows and print 30 of them."""
    ranking_url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html"
    universities = []
    page_text = getHTMLText(ranking_url)
    fillUnivList(universities, page_text)
    printUnivList(universities, 30)


# Run the crawler when this file is executed.
main()

输出结果:
rank