Monday, May 25, 2009

[Boredom] Digging up Fanfou's old posts

A Fanfou search scraper written out of sheer boredom. Fanfou fooled me at first: the search URL has a p parameter that caps the maximum number of results.
The scraped results are saved as a Python module (Python 3.0); a sketch for reading the output back follows the listing.


#!/usr/bin/env python
# -*- coding: UTF-8 -*-

import urllib.request
import urllib.parse
import codecs
import re

key_word = '番茄操蛋'
key_enc = urllib.parse.quote(key_word)

# Skip this block if you are not behind an HTTP proxy.
proxy_support = urllib.request.ProxyHandler({"http": "http://192.168.60.250:8080"})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

def load(url):
    # Fetch a page from fanfou.com and return it as a decoded string.
    f = urllib.request.urlopen('http://fanfou.com' + url)
    resp = f.read().decode("utf-8")
    f.close()
    return resp

# One status in the search results: uid, nick, avatar, content, mid, time.
user_re = re.compile(r'<a href="/(?P<uid>[^"]+?)" title="(?P<nick>[^"]+?)" class="avatar"><img src="(?P<avatar>[^"]+?)".+?<span class="content">(?P<content>.+?)</span>.+?<a href="/statuses/(?P<mid>[^"]+?)" class="time" title="(?P<time>[^"]+?)">')
# The "下一页" (next page) link.
next_re = re.compile(r'<a href="(?P<url>[^"]+?)">下一页</a>')

collected_data = []

# Results are written incrementally to a Python module named after the keyword.
f = codecs.open('output_' + key_word + '.py', 'w', 'utf-8')
f.write('''
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
[
''')
def parse_page(text):
    # Save every status on the page and remember the last message id (mid),
    # which is then used as the paging cursor for the next request.
    mid = None
    for m in user_re.finditer(text):
        data = m.groupdict()
        f.write(str(data))
        f.write(',\n')
        collected_data.append(data)
        print(m.group('mid'), '@', m.group('time'))
        mid = m.group('mid')
    print('---')

    m = next_re.search(text)
    if m and mid:
        return '/search?q=' + key_enc + '&noframe=yes&m=' + mid
    else:
        return None

# Start from the first page of search results and follow the paging cursor.
url = '/search?q=' + key_enc + '&noframe=yes'
while url:
    text = load(url)
    url = parse_page(text)
    print(url)

f.write(']\n')
f.close()
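
To read the scraped results back, here is a minimal sketch (my own addition, not part of the original post): it assumes the file layout written above, strips the comment lines, and parses the remaining list literal with ast.literal_eval.

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Sketch only: reads the output_<keyword>.py file written by the scraper above
# back into a list of dicts.  The file name and ast.literal_eval are assumptions.

import ast
import codecs

def load_results(path):
    with codecs.open(path, 'r', 'utf-8') as fp:
        # Drop the shebang / coding comment lines; the rest is one list literal.
        body = ''.join(line for line in fp if not line.lstrip().startswith('#'))
    return ast.literal_eval(body)

if __name__ == '__main__':
    results = load_results('output_番茄操蛋.py')
    print(len(results), 'statuses collected')
    for r in results[:3]:
        print(r['time'], r['nick'], r['content'])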
