第一次写出 Python 的爬虫小项目,挺开心的,这也是对自己这段时间学习成果的认可。主要功能是爬取 PC 端虎牙直播 LOL 分类下的主播名称和主播人气,然后对数据进行排序。
项目虽小,但也算是在 Python 学习中迈出的一大步。

爬虫代码如下:

import re
from urllib import request

class Spider():
    """Scrape streamer names and popularity from the Huya LoL category page.

    The page is downloaded with ``urllib``, each streamer card is cut out
    with regular expressions, and the resulting records are printed in
    descending order of popularity. ``go()`` is the only public entry point.
    """

    url = 'https://www.huya.com/g/lol'

    # Dedicated crawler frameworks also exist: BeautifulSoup, Scrapy.

    # Sample of the markup the patterns below are written against:
    # <span class="txt">
    #   <span class="avatar fl">
    #         <img data-original="https://huyaimg.msstatic.com/avatar/1094/63/f20eec58c49c79f9925e88c60463e0_180_135.jpg" src="//a.msstatic.com/huya/main/assets/img/default/84x84.jpg" data-default-img="84x84" alt="虎丶牙莎莉" title="虎丶牙莎莉">
    #         <i class="nick" title="虎丶牙莎莉">虎丶牙莎莉</i>
    #     </span>
    #             <span class="num"><i class="num-icon"></i><i class="js-num">360.0万</i></span>
    #   </span>
    # </span>

    # The trailing `?` makes each quantifier non-greedy so a match stops at
    # the first closing tag. Raw strings keep `\s`/`\S` from being treated
    # as (invalid) string escapes.
    root_pattern = r'<span class="txt">([\s\S]*?)</li>'
    name_pattern = r'<i class="nick" title="[\s\S]*?">([\s\S]*?)</i>'
    number_pattern = r'<i class="js-num">([\s\S]*?)</i>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return the body decoded as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url) as response:
            raw = response.read()
        return raw.decode('utf-8')

    def __analysis(self, htmls):
        """Extract raw name/number matches from the page text.

        Returns a list of dicts ``{'name': [...], 'number': [...]}`` where
        each value is the list returned by ``re.findall`` for one fragment
        matched by ``root_pattern``.
        """
        return [
            {
                'name': re.findall(Spider.name_pattern, fragment),
                'number': re.findall(Spider.number_pattern, fragment),
            }
            for fragment in re.findall(Spider.root_pattern, htmls)
        ]

    def __refine(self, anchors):
        """Collapse each record's match lists to their first element.

        Records whose name or number list is empty are skipped, because a
        fragment without both fields cannot be ranked or displayed.
        """
        return [
            {'name': anchor['name'][0], 'number': anchor['number'][0]}
            for anchor in anchors
            if anchor['name'] and anchor['number']
        ]

    def __sort(self, anchors):
        """Return the records ordered from most to least popular."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert a popularity string such as ``'360.0万'`` to a float.

        A value containing ``'万'`` is multiplied by 10000. A string with no
        digits yields ``0.0`` so that sorting never compares against ``None``.
        """
        text = anchor['number']
        # Accept a leading zero so that '0.5万' parses as 0.5 rather than 5.
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match is None:
            return 0.0
        number = float(match.group())
        if '万' in text:
            number *= 10000
        # Return on every path; plain numbers without '万' are valid keys too.
        return number

    def __show(self, anchors):
        """Print one ranking line per record, numbered from 1."""
        for rank, anchor in enumerate(anchors, start=1):
            print(f"排名{rank} : {anchor['name']}---------人气{anchor['number']}人")

    def go(self):
        """Run the pipeline: fetch, parse, refine, sort, print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)
      
# Script entry: build the crawler and run it once. go() downloads the page,
# so executing this file performs a network request and prints the ranking.
spider = Spider()
spider.go()

本文由 来鹏飞 创作,采用 知识共享署名 3.0,可自由转载、引用,但需署名作者且注明文章出处。

只有地板了

  1. 周帅
    周帅

    加油阿飞!

添加新评论