# 抓取结果保存为python模块(Python3.0) — save scraped results as a Python module (Python 3)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import re
import urllib.parse
import urllib.request
# Search keyword to query on fanfou.com; percent-encoded for use in the URL.
key_word = '番茄操蛋'
key_enc = urllib.parse.quote(key_word)
# Ignore this part if you are not behind a proxy (original note: 不用代理的无视).
# NOTE(review): the proxy address is hard-coded to a LAN host — adjust or remove as needed.
proxy_support = urllib.request.ProxyHandler({"http" : "http://192.168.60.250:8080"})
opener = urllib.request.build_opener(proxy_support)
# Installs the proxy opener globally: every urllib.request call below goes through it.
urllib.request.install_opener(opener)
def load(url):
    """Fetch the fanfou.com page at path *url* and return its HTML as text.

    :param url: path component (e.g. ``/search?q=...``) appended to the site root.
    :return: response body decoded as UTF-8.
    :raises urllib.error.URLError: on network failure.
    """
    # Fix: the original never closed the response when read()/decode() raised;
    # the context manager guarantees the socket is released on every path.
    with urllib.request.urlopen('http://fanfou.com' + url) as resp:
        return resp.read().decode("utf-8")
# One status entry in a search-result page: captures the author's user id,
# nickname, avatar URL, the status text, the status id and its timestamp.
user_re = re.compile(r'<a href="/(?P<uid>[^"]+?)" title="(?P<nick>[^"]+?)" class="avatar"><img src="(?P<avatar>[^"]+?)".+?<span class="content">(?P<content>.+?)</span>.+?<a href="/statuses/(?P<mid>[^"]+?)" class="time" title="(?P<time>[^"]+?)">')
# The "next page" (下一页) link; its href is used to continue the crawl.
next_re = re.compile(r'<a href="(?P<url>[^"]+?)">下一页</a>')
# Accumulates one dict per scraped status (also mirrored to the output file).
collected_data = []  # fix: dropped stray trailing semicolon
import codecs
# Output file is a UTF-8 Python source named after the keyword
# (e.g. output_番茄操蛋.py). It stays open for the whole crawl; parse_page()
# appends one dict literal per status and the list is closed at the end.
f = codecs.open('output_'+key_word+'.py','w','utf-8')
# Module header plus the opening bracket of a list literal.
# NOTE(review): the written file starts with a blank line, so the shebang is
# inert there — harmless, since the file is only meant to be imported/eval'd.
f.write('''
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
[
''')
def parse_page(text):
    """Extract all statuses from one search-result page.

    Each match is appended to ``collected_data`` and written to the output
    file ``f`` as a dict literal followed by ``,\\n``.

    :param text: HTML of one search-result page.
    :return: the relative URL of the next page to fetch (pagination keyed on
             the last status id seen), or ``None`` when the crawl is done.
    """
    # Fix: the original left ``mid`` unbound when a page contained a
    # "next page" link but no status matches, raising NameError below.
    mid = None
    for m in user_re.finditer(text):
        data = m.groupdict()
        f.write(str(data))
        f.write(',\n')
        collected_data.append(data)
        print(m.group('mid'), '@', m.group('time'))
        mid = m.group('mid')  # pagination continues from the last status id
    print('---')
    m = next_re.search(text)
    if m and mid is not None:
        return '/search?q=' + key_enc + '&noframe=yes&m=' + mid
    return None
# Drive the crawl: fetch each page, let parse_page() emit its statuses and
# hand back the next URL; stop when there is no further page, then close
# the list literal in the output module.
url = '/search?q=' + key_enc + '&noframe=yes'
while url is not None:
    page_html = load(url)
    url = parse_page(page_html)
    print(url)
f.write(']\n')
f.close()
# 0 人次吐槽:  (blog-page residue: "0 comments" — not part of the script)
# 发表评论      (blog-page residue: "post a comment")