不得不说,Python 真是一个神奇的东西:学三天就能爬网站,真香!
完整代码
# -*- coding: utf-8 -*-
"""Crawl free HTTP proxies and download wallpapers from ivsky.com.

Created on Wed May 26 17:53:13 2021

@author: 19088

The script offers three interactive modes (see ``main``): crawl proxy
lists, re-validate the stored proxy pool, and download wallpapers.
"""

import http.client
import os
import pickle
import random
import re
import sys
import urllib.parse
import urllib.request

# Browser-like User-Agent sent with every request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
)

# Dotted-quad IPv4 address (each octet 0-255).
_IP = (
    r"((25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)\.){3}"
    r"(25[0-5]|2[0-4]\d|[0-1]\d\d|\d\d|\d)"
)
# A table cell holding only an IP address / only a number (the port).
_IP_CELL_RE = re.compile(r"<td>\s*(?P<ip>" + _IP + r")\s*</td>")
_PORT_CELL_RE = re.compile(r"<td>\s*(?P<port>\d+)\s*</td>")

# Patterns used to pull link names and targets out of HTML fragments.
_ANCHOR_RE = re.compile(r'<a\s*.*?\s*</a>', re.I)
_PARAGRAPH_ANCHOR_RE = re.compile(r'<p>\s*?<a\s*.*?</p>', re.I)
_NAME_RE = re.compile(r'"\s*?>\s*?.*?</a>', re.I)
_TITLE_RE = re.compile(r'title=".*?"', re.I)
_HREF_RE = re.compile(r'href=".*?"', re.I)


class getHttpAgents:
    """Keep a pool of HTTP proxies and open URLs through them.

    Attributes:
        attArray: list of known proxies as ``"ip:port"`` strings.
        myagent: proxy forced for the next requests; ``""`` means "pick a
            random one from ``attArray`` (or connect directly if empty)".
    """

    AGENT_FILE = "agent.pkl"  # where the proxy pool is persisted
    MAX_ATTEMPTS = 10  # retries performed by openUrl when istry is true
    CHECK_URL = "https://www.baidu.com/"  # page fetched to validate a proxy

    # Proxy list sites: (url or url template, page encoding).
    SINGLE_PAGE_SOURCES = (
        ("http://ip.yqie.com/ipproxy.htm", "utf-8"),
    )
    PAGED_SOURCES = (
        ("https://www.89ip.cn/index_{}.html", "utf-8"),
        ("http://www.66ip.cn/{}.html", "gb2312"),
        ("http://www.ip3366.net/?stype=1&page={}", "gb2312"),
        ("http://www.kxdaili.com/dailiip/1/{}.html", "utf-8"),
    )

    def __init__(self):
        """Load the previously stored proxy pool, if any."""
        self.attArray = self.__loadAgentList()
        self.myagent = ""

    def openUrl(self, url, istry=1, timeout=15):
        """Fetch ``url`` and return the raw (undecoded) response body.

        Args:
            url: address to open.
            istry: when true, retry up to ``MAX_ATTEMPTS`` times and raise
                on final failure; when false, make a single attempt and
                return ``None`` on failure.
            timeout: socket timeout in seconds for each attempt.

        Returns:
            The response body as ``bytes``, or ``None`` when ``istry`` is
            false and the single attempt failed.

        Raises:
            ValueError: every attempt failed and ``istry`` is true.
        """
        attempts = self.MAX_ATTEMPTS if istry else 1
        last_error = None
        for attempt in range(1, attempts + 1):
            # A forced proxy wins; otherwise draw a fresh random one each try.
            ip = self.myagent.strip()
            if not ip and self.attArray:
                ip = random.choice(self.attArray)
            try:
                if ip:
                    print("以{}访问 {}".format(ip, url))
                    # Register the proxy for both schemes: with only "http"
                    # an https URL silently bypasses the proxy.
                    handler = urllib.request.ProxyHandler(
                        {"http": ip, "https": ip}
                    )
                    opener = urllib.request.build_opener(handler)
                    opener.addheaders = [("User-Agent", _USER_AGENT)]
                    response = opener.open(url, timeout=timeout)
                else:
                    request = urllib.request.Request(url)
                    request.add_header("User-Agent", _USER_AGENT)
                    response = urllib.request.urlopen(request, timeout=timeout)
                with response:
                    return response.read()
            except (OSError, http.client.HTTPException, ValueError) as err:
                # URLError/HTTPError/timeouts are OSError subclasses.
                last_error = err
                if istry:
                    print("第{}次尝试连接!".format(attempt))
                else:
                    print("{} 无法使用".format(ip))
        if istry:
            raise ValueError(
                "failed to open {} after {} attempts".format(url, attempts)
            ) from last_error
        return None

    def checkMyIpPool(self):
        """Re-test every pooled proxy and keep (and store) only working ones."""
        usable = self._filter_usable(list(self.attArray))
        self.__writeAgentList(usable)
        self.__setAgents(usable)

    def getAgents(self, html):
        """Extract proxies from a proxy-list page and add working ones to the pool.

        Args:
            html: decoded page text containing ``<td>ip</td>`` and
                ``<td>port</td>`` cells.

        Returns:
            The list of all ``"ip:port"`` candidates found on the page, or
            ``None`` when the IP and port counts differ or when no usable
            proxy is known afterwards.
        """
        ips = [match.group("ip") for match in _IP_CELL_RE.finditer(html)]
        ports = [match.group("port") for match in _PORT_CELL_RE.finditer(html)]
        if len(ips) != len(ports):
            print("注意: ip和端口参数不匹配!")
            return None

        # dict() keeps one port per IP, dropping repeated addresses.
        agentList = [
            "{}:{}".format(ip, port) for ip, port in dict(zip(ips, ports)).items()
        ]
        known = list(self.attArray)
        usable = self._filter_usable(agentList, announce=True)

        # New proxies first, then the existing pool, without duplicates.
        merged = list(dict.fromkeys(usable + known))
        if not merged:
            return None
        self.__writeAgentList(merged)
        self.__setAgents(merged)
        return agentList

    def _filter_usable(self, candidates, announce=False):
        """Return the candidates through which CHECK_URL can be fetched."""
        usable = []
        try:
            for ip in candidates:
                self.setMyIp(ip)
                if self.__getMyIp():
                    usable.append(ip)
                    if announce:
                        print("{} 可以使用".format(ip))
        finally:
            # Never leave a (possibly dead) test proxy forced for later calls.
            self.setMyIp("")
        return usable

    def __setAgents(self, ipArray):
        """Replace the in-memory proxy pool."""
        self.attArray = ipArray

    def setMyIp(self, ip):
        """Force ``ip`` ("ip:port") for subsequent requests; "" clears it."""
        self.myagent = ip

    def __writeAgentList(self, agentList):
        """Persist the proxy pool, overwriting any previous file."""
        # "wb" truncates, so the file always holds exactly one pickle.
        with open(self.AGENT_FILE, "wb") as f:
            pickle.dump(agentList, f)
        print("存储{}条代理".format(len(agentList)))

    def __loadAgentList(self):
        """Load the stored proxy pool; return [] if missing or unreadable."""
        if not os.path.exists(self.AGENT_FILE):
            return []
        # SECURITY: pickle can execute arbitrary code while loading. Only
        # load an agent file that this program wrote itself.
        try:
            with open(self.AGENT_FILE, "rb") as f:
                agentlist = pickle.load(f)
        except (pickle.UnpicklingError, EOFError, OSError):
            return []
        if not isinstance(agentlist, list):
            return []
        print("加载{}条代理".format(len(agentlist)))
        return agentlist

    def __getMyIp(self, ip=""):
        """Fetch CHECK_URL with the current proxy; return its text or None.

        ``ip`` is unused and kept only for signature compatibility.
        """
        raw = self.openUrl(self.CHECK_URL, 0)
        if not raw:
            return None
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return None

    def crawlingAgents(self, index):
        """Crawl all known proxy list sites.

        Args:
            index: number of pages to read from each paged source.
        """
        for url, encoding in self.SINGLE_PAGE_SOURCES:
            self._crawl_page(url, encoding)
        for template, encoding in self.PAGED_SOURCES:
            for page in range(1, index + 1):
                self._crawl_page(template.format(page), encoding)

    def _crawl_page(self, url, encoding):
        """Download one proxy-list page and harvest its proxies (best effort)."""
        print(url)
        try:
            self.setMyIp("")  # no forced proxy: use a random pooled one
            # Only ASCII digits are extracted, so undecodable bytes are harmless.
            html = self.openUrl(url).decode(encoding, errors="replace")
            self.getAgents(html)
        except Exception:
            # Per-source boundary: one broken site must not stop the crawl.
            print("{} 爬取失败".format(url))


class downLoadPictures:
    """Download wallpapers from ivsky.com listing pages.

    Attributes:
        sortKey: search-keyword dictionary (never filled by this class).
        urlLoad: ``getHttpAgents`` instance used for every request.
        bzmenuDict: wallpaper category name -> link, filled by readPages.
        sortscreenDict: screen-size category name -> link.
        littleSignDict: sub-category name -> link.
    """

    BASE_URL = "https://www.ivsky.com/"

    def __init__(self):
        self.sortKey = {}
        self.urlLoad = getHttpAgents()
        self.bzmenuDict = {}
        self.sortscreenDict = {}
        self.littleSignDict = {}

    def getPictures(self, url):
        """Download every picture of every picture set listed on ``url``.

        Files are written to the current working directory as
        ``<name>.jpg``. A failing set or picture is reported and skipped.

        Raises:
            ValueError: the listing page itself could not be opened.
        """
        listing_html = self.urlLoad.openUrl(url)
        folders = self.readPages(listing_html)
        if not folders:
            print("获取图片集失败!")
            return

        for folder_href in folders.values():
            if not folder_href:
                continue
            folder_url = urllib.parse.urljoin(self.BASE_URL, folder_href)
            try:
                pictures = self.readFolders(self.urlLoad.openUrl(folder_url))
            except ValueError:
                print("{} 下载失败".format(folder_url))
                continue
            for name, picture_href in (pictures or {}).items():
                try:
                    self._download_picture(name, picture_href)
                except (ValueError, OSError):
                    print("{} 下载失败".format(name))

    def _download_picture(self, name, href):
        """Resolve one picture page to its image file and save it."""
        if not href:
            return
        page_url = urllib.parse.urljoin(self.BASE_URL, href)
        download_url = self.readPictures(self.urlLoad.openUrl(page_url))
        if not download_url:
            return
        data = self.urlLoad.openUrl(download_url)
        if not data:
            return
        with open(self._safe_file_name(name) + ".jpg", "wb") as f:
            f.write(data)

    @staticmethod
    def _safe_file_name(name):
        """Replace characters that are invalid in file names with ``_``."""
        return re.sub(r'[\\/:*?"<>|]', "_", name)

    def getHrefMap(self, html, isPicture=0, isFolder=0):
        """Map link names to link targets for the anchors in ``html``.

        Args:
            html: decoded HTML fragment.
            isPicture: when true, only anchors wrapped in ``<p>...</p>``
                are considered.
            isFolder: when true, ``_<n>`` (1-based position) is appended to
                every name so equally named links do not overwrite each other.

        Returns:
            dict of name -> href value. The name is the anchor text with
            spaces removed, or the ``title`` attribute if the text is empty.
        """
        finder = _PARAGRAPH_ANCHOR_RE if isPicture else _ANCHOR_RE
        hrefDict = {}
        for position, found in enumerate(finder.finditer(html), start=1):
            anchor = found.group()

            name = ""
            name_match = _NAME_RE.search(anchor)
            if name_match:
                # Drop the leading '">' and the trailing '</a>'.
                name = name_match.group().replace(" ", "")[2:-4]
                if not name:
                    title_match = _TITLE_RE.search(anchor)
                    if title_match:
                        # Drop 'title="' and the closing quote.
                        name = title_match.group()[7:-1].replace(" ", "")

            url = ""
            href_match = _HREF_RE.search(anchor)
            if href_match:
                # Drop 'href="' and the closing quote.
                url = href_match.group()[6:-1].replace(" ", "")

            if isFolder:
                name += "_" + str(position)
            hrefDict[name] = url
        return hrefDict

    @staticmethod
    def _find_section(pattern, html, error_message):
        """Return the first ``pattern`` match in ``html`` or print the error."""
        found = re.search(pattern, html, re.I)
        if not found:
            print(error_message)
            return None
        return found.group()

    def readPages(self, html):
        """Parse a listing page: category menus and the picture-set links.

        Fills ``bzmenuDict``, ``sortscreenDict`` and ``littleSignDict``.

        Args:
            html: raw page bytes (UTF-8).

        Returns:
            dict of picture-set name -> link, or ``None`` if ``html`` is
            empty or any expected section is missing.
        """
        if not html:
            return None
        html = html.decode("utf-8")

        section = self._find_section(
            r'<ul\s*class="bzmenu".*?</ul>', html, "匹配壁纸分类出错!")
        if section is None:
            return None
        self.bzmenuDict = self.getHrefMap(section)

        section = self._find_section(
            r'<ul\s*class="sall_dd".*?</ul>', html, "匹配屏幕尺寸分类失败!")
        if section is None:
            return None
        self.sortscreenDict = self.getHrefMap(section)

        section = self._find_section(
            r'<div\s*class="sline".*?</div>', html, "匹配小分类失败")
        if section is None:
            return None
        self.littleSignDict = self.getHrefMap(section)

        section = self._find_section(
            r'<ul\s*class="ali".*?</ul>', html, "匹配图片集地址失败!")
        if section is None:
            return None
        return self.getHrefMap(section, 1)

    def readFolders(self, html):
        """Parse a picture-set page into picture name -> picture page link.

        Args:
            html: raw page bytes (UTF-8).

        Returns:
            dict (names carry a ``_<n>`` suffix), or ``None`` if ``html``
            is empty or the picture list is missing.
        """
        if not html:
            return None
        section = self._find_section(
            r'<ul\s*class="pli".*?</ul>', html.decode("utf-8"), "匹配小分类失败")
        if section is None:
            return None
        return self.getHrefMap(section, 1, 1)

    def readPictures(self, html):
        """Parse a single-picture page and return the full-size image URL.

        Args:
            html: raw page bytes (UTF-8).

        Returns:
            The download URL, or ``None`` if ``html`` is empty, the picture
            container is missing, or it holds no ``src='...'`` attribute.
        """
        if not html:
            return None
        section = self._find_section(
            r'<div\s*class="pic".*?</div>', html.decode("utf-8"), "匹配小分类失败")
        if section is None:
            return None
        src_match = re.search(r"src='.*?'", section, re.I)
        if not src_match:
            return None
        # Drop "src='" and the closing quote.
        url = src_match.group()[5:-1].replace(" ", "")
        # Turn the preview address into the download address.
        url = url.replace("img-pre", "img-picdown")
        url = url.replace("pre", "pic")
        return "https:" + url


class UrlUser:
    """Interactive front end tying the proxy pool and the downloader together."""

    def __init__(self):
        self.agent = getHttpAgents()
        self.downPicture = downLoadPictures()

    @staticmethod
    def _ask_count(prompt):
        """Prompt for a number; return it, or None (after a message) if absent."""
        found = re.search(r"\d+", input(prompt))
        if not found:
            print("输入页数有误!")
            return None
        return int(found.group())

    def downPictures(self):
        """Ask for a target directory and page count, then download pictures."""
        dirPath = input("请输入保存路径:")
        try:
            os.makedirs(dirPath, exist_ok=True)
        except OSError:
            pass  # reported by the isdir check below
        if not os.path.isdir(dirPath):
            print("savePath is wrong!")
            sys.exit()
        os.chdir(dirPath)  # pictures are written to the working directory

        pages = self._ask_count("爬取前多少页的图片?\n")
        if pages is None:
            return
        # The category is fixed here; readPages collects the links of all
        # other categories should a different one be wanted.
        template = r"https://www.ivsky.com/bizhi/nvxing_1920x1080/index_{}.html"
        for page in range(1, pages + 1):
            url = template.format(page)
            print(url)
            try:
                self.downPicture.getPictures(url)
            except Exception:
                # Per-page boundary: keep going with the next page.
                print("打开出错!")

    def downAgents(self):
        """Ask for a page count and crawl the proxy list sites."""
        pages = self._ask_count("爬取前多少页的代理?\n")
        if pages is None:
            return
        self.agent.crawlingAgents(pages)

    def checkPool(self):
        """Re-validate the stored proxy pool."""
        self.agent.checkMyIpPool()


def main():
    """Show the menu, read the mode and run it."""
    print("*" * 20)
    print("1.爬取代理\n")
    print("2.检查代理\n")
    print("3.爬取图片")
    print("*" * 20)

    mode = input("请输入数字选择处理模式:\n")
    found = re.search(r"\d+", mode)
    choice = int(found.group()) if found else None
    if choice not in (1, 2, 3):
        print("模式选择错误!")
        sys.exit()

    user = UrlUser()
    if choice == 1:
        user.downAgents()
    elif choice == 2:
        user.checkPool()
    else:
        user.downPictures()
    print("爬取完毕!")


if __name__ == "__main__":
    main()