from bs4 import BeautifulSoup
import re
import requests
import traceback

def GetHTMltext(url, code='utf-8'):
    """Fetch a page and return its decoded text, or an empty string on failure."""
    try:
        kv = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.0.0",
              "cookie": "qgqp_b_id=1fa9098b5972ea7e3036dbdc3ff568a6; st_si=45750356129814; st_asi=delete; td_cookie=802729162; HAList=ty-1-600588-%u7528%u53CB%u7F51%u7EDC; st_pvi=40869652126604; st_sp=2023-08-25%2015%3A45%3A59; st_inirUrl=https%3A%2F%2Fcn.bing.com%2F; st_sn=4; st_psi=20230825161637105-113200301321-4929148757"}
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except Exception:
        # Return an empty string so callers can detect and skip failed pages.
        return ""

def getstockList(lst, stockURl):
    """Collect stock codes (e.g. 'sh600588', 'sz000001') from every <a> href on the list page."""
    html = GetHTMltext(stockURl, 'GB2312')
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r'[s][hz]\d{6}', href)[0])
        except Exception:
            # No href attribute, or no stock code in it; skip this link.
            continue
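
# Illustration of what the pattern above extracts ('/sh600588.html' is an assumed
# sample href for demonstration, not taken from the real page):
#   re.findall(r'[s][hz]\d{6}', '/sh600588.html')  ->  ['sh600588']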

def getstockInfo(lst, stockURl, fpath):
    """For each stock code, fetch its detail page and append the parsed fields to fpath."""
    count = 0
    for stock in lst:
        url = stockURl + stock + ".html"
        html = GetHTMltext(url)
        try:
            if html == "":
                # Page could not be fetched; skip this stock.
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock_top clearfix'})
            name = soup.find('h1').string.strip()
            infoDict.update({"股票名称": name.split()[0]})  # "股票名称" = stock name
            Keylist = stockInfo.find_all('dt')
            valuelist = stockInfo.find_all('dd')
            # Pair each <dt> label with its <dd> value.
            for i in range(len(Keylist)):
                key = Keylist[i].string
                val = valuelist[i].string
                infoDict[key] = val
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                count = count + 1
                print('\rProgress: {:.2f}%'.format(count * 100 / len(lst)), end='')
        except Exception:
            count = count + 1
            print('\rProgress: {:.2f}%'.format(count * 100 / len(lst)), end='')
            traceback.print_exc()
            continue
return "" def main2(): stock_list_url = 'https://quote.eastmoney.com/stocklist.html' stock_info_url = 'https://xueqiu.com/S/BIDU' output_file = 'D://BaiduGupiao.txt' slist = [] getstockList(slist,stock_list_url) getstockInfo(slist,stock_info_url,output_file)