pycurl 参考文档:http://pycurl.io/docs/latest/index.html 。文档是英文的,但读起来并不吃力,照着示例做问题不大。
# coding=utf-8
import pycurl
import re
import StringIO

"""Simple rule: do not call encode on a str, do not call decode on a unicode.
(If s is a str encoded as code_A: s.decode('code_A').encode('code_B'))"""


class PySpider():
    """Thin wrapper around a single pycurl.Curl handle for simple GET/POST
    requests with persistent cookies, redirect following and timeouts.

    Both GetData and PostData return a ``(page, charset)`` tuple, where
    ``charset`` is the list of charset declarations found in the body.
    """

    def __init__(self):
        self.c = pycurl.Curl()
        # Persist cookies to a file so consecutive requests share a session.
        self.c.setopt(pycurl.COOKIEFILE, "cookie_file_name")
        self.c.setopt(pycurl.COOKIEJAR, "cookie_file_name")
        # Redirect handling: follow 302s automatically, at most 5 hops.
        self.c.setopt(pycurl.FOLLOWLOCATION, 1)
        self.c.setopt(pycurl.MAXREDIRS, 5)
        # Timeouts: 60s to establish the connection, 120s for the transfer.
        self.c.setopt(pycurl.CONNECTTIMEOUT, 60)
        self.c.setopt(pycurl.TIMEOUT, 120)
        # Proxy support: uncomment and adjust if you need a proxy.
        # self.c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
        # self.c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')

    # ----------------------------- GET ----------------------------- #
    def GetData(self, url):
        """Fetch ``url`` with GET and return ``(page, charset)``."""
        headers = [
            'Accept:*/*',
            'User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0',
        ]
        buf = StringIO.StringIO()
        self.c.setopt(self.c.WRITEFUNCTION, buf.write)
        self.c.setopt(pycurl.URL, url)
        self.c.setopt(self.c.HTTPHEADER, headers)
        self.c.perform()
        the_page = buf.getvalue()
        charset = re.findall("""charset=([a-zA-Z0-9_\S][^"^>^']*)""", the_page)
        buf.close()
        return the_page, charset

    # ----------------------------- POST ---------------------------- #
    def PostData(self, url, data):
        """POST ``data`` (an urlencoded/JSON string) to ``url`` and return
        ``(page, charset)``."""
        headers = [
            'User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0',
        ]
        buf = StringIO.StringIO()
        self.c.setopt(self.c.WRITEFUNCTION, buf.write)
        # Switch the handle to POST and attach the payload.
        self.c.setopt(pycurl.POST, 1)
        self.c.setopt(pycurl.POSTFIELDS, data)
        self.c.setopt(pycurl.URL, url)
        # NOTE(review): pycurl.HEADER=True makes curl prepend the *response*
        # headers to the body; it does not send request headers (HTTPHEADER
        # below does that). Confirm the header-prefixed body is intended.
        self.c.setopt(pycurl.HEADER, True)
        self.c.setopt(self.c.HTTPHEADER, headers)
        self.c.perform()
        # BUG FIX: read the body *before* scanning it for a charset; the
        # original referenced `the_page` before assignment (NameError).
        the_page = buf.getvalue()
        charset = re.findall('charset=([a-zA-Z0-9_\S][^"^>]*)', the_page)
        buf.close()
        return the_page, charset
稍微封装了一下,就得到了上面的代码。将上面的代码保存到 Spider.py 文件中,用法如下:
from Spider import PySpiderimport jsonspider = PySpider()#GET方法html = spider.GetData('http://www.baidu.com') print html,charset#post方法postdate = json.dumps({ 'cityListName':'gz', 'trade':'',})html = spider.PostData('http://qy.m.58.com/m_entlist/ajax_listinfo/2') print html