This article shows how to write Python crawler scripts that grab the article lists from user homepages on Jianshu (简书), 博客园 (cnblogs), and CSDN. The code was written for practical use and is commented in detail, so the implementation logic can be followed by reading it directly.
The first script crawls the article list from a Jianshu user homepage and appends each entry as a Markdown-style link to `JsIndex.txt`:

```python
# -*- coding:utf-8 -*-
import urllib.request
from lxml import etree


class CrawlJs:
    def getArticle(self, url):
        """Request the page with a browser User-Agent and return the raw HTML."""
        print('Starting crawl')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'
        }
        request = urllib.request.Request(url, headers=headers)
        content = urllib.request.urlopen(request).read()
        return content

    def save(self, content):
        """Extract article titles and links with XPath and append them to a file."""
        xml = etree.HTML(content)
        title = xml.xpath('//div[@class="content"]/a[@class="title"]/text()')
        link = xml.xpath('//div[@class="content"]/a[@class="title"]/@href')
        for t, l in zip(title, link):
            with open('JsIndex.txt', 'a') as f:
                f.write(f'[{t}]({l})\n')
        print('Crawl finished')


if __name__ == '__main__':
    page = int(input('Enter the total number of pages to crawl: '))
    for num in range(page):
        url = f'http://www.jianshu.com/u/c475403112ce?order_by=shared_at&page={num + 1}'
        js = CrawlJs()
        content = js.getArticle(url)
        js.save(content)
```

The second script targets a 博客园 (cnblogs) homepage. The structure is identical; only the XPath expressions, the output file, and the page URL change:

```python
# -*- coding:utf-8 -*-
import urllib.request
from lxml import etree


class CrawlJs:
    def getArticle(self, url):
        print('Starting crawl')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'
        }
        request = urllib.request.Request(url, headers=headers)
        content = urllib.request.urlopen(request).read()
        return content

    def save(self, content):
        xml = etree.HTML(content)
        # cnblogs post titles live under div.postTitle
        title = xml.xpath('//div[@class="postTitle"]/a/text()')
        link = xml.xpath('//div[@class="postTitle"]/a/@href')
        for t, l in zip(title, link):
            with open('bokeyuan.txt', 'a') as f:
                f.write(f'{t} {l}\n')
        print('Crawl finished')


if __name__ == '__main__':
    page = int(input('Enter the total number of pages to crawl: '))
    for num in range(page):
        url = f'http://www.cnblogs.com/zhouxinfei/default.html?page={num + 1}'
        js = CrawlJs()
        content = js.getArticle(url)
        js.save(content)
```

Both scripts follow the same two-step approach:

- `urllib.request` sends the HTTP request with a browser User-Agent header to simulate a normal browser visit.
- `lxml.etree` parses the returned HTML and extracts the article titles and links we need via XPath.

The third script does the same for a CSDN blog. On CSDN the title is the second text node inside the link element, and the extra `if not title` check stops processing once an empty page is reached:

```python
# -*- coding:utf-8 -*-
import urllib.request
from lxml import etree


class CrawlJs:
    def getArticle(self, url):
        print('Starting crawl')
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'
        }
        request = urllib.request.Request(url, headers=headers)
        content = urllib.request.urlopen(request).read()
        return content

    def save(self, content):
        xml = etree.HTML(content)
        # On CSDN the title text is the second text node inside h4 > a
        title = xml.xpath('//div[@class="article-list"]/div/h4/a/text()[2]')
        link = xml.xpath('//div[@class="article-list"]/div/h4/a/@href')
        if not title:
            return
        for t, l in zip(title, link):
            with open('csdn.txt', 'a') as f:
                f.write(f'{t.strip()} {l}\n')
        print('Crawl finished')


if __name__ == '__main__':
    page = int(input('Enter the total number of pages to crawl: '))
    for num in range(page):
        url = f'https://blog.csdn.net/xc_zhou/article/list/{num + 1}'
        js = CrawlJs()
        content = js.getArticle(url)
        js.save(content)
```

All of the scripts above support crawling multiple pages; adjust the page range and output paths to match your own needs.
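Since the three scripts differ only in their XPath expressions, URL template, and output file, they can be folded into a single configurable crawler. The sketch below is one possible refactoring rather than part of the original article; the `SITES` table and the `crawl` function are illustrative names, and the XPath expressions and URLs are simply copied from the scripts above, built on the same `urllib.request` + `lxml` approach.

```python
# -*- coding:utf-8 -*-
# Illustrative refactoring (not from the original article): one crawler
# parameterised by site-specific XPath expressions, URL template and output file.
import urllib.request
from lxml import etree

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'
}

# Hypothetical site table; the XPath expressions and URL templates are the
# same ones used in the three scripts above.
SITES = {
    'jianshu': {
        'url': 'http://www.jianshu.com/u/c475403112ce?order_by=shared_at&page={page}',
        'title': '//div[@class="content"]/a[@class="title"]/text()',
        'link': '//div[@class="content"]/a[@class="title"]/@href',
        'out': 'JsIndex.txt',
    },
    'cnblogs': {
        'url': 'http://www.cnblogs.com/zhouxinfei/default.html?page={page}',
        'title': '//div[@class="postTitle"]/a/text()',
        'link': '//div[@class="postTitle"]/a/@href',
        'out': 'bokeyuan.txt',
    },
    'csdn': {
        'url': 'https://blog.csdn.net/xc_zhou/article/list/{page}',
        'title': '//div[@class="article-list"]/div/h4/a/text()[2]',
        'link': '//div[@class="article-list"]/div/h4/a/@href',
        'out': 'csdn.txt',
    },
}


def crawl(site, pages):
    """Fetch `pages` list pages for the given site and append title/link pairs to a file."""
    cfg = SITES[site]
    for page in range(1, pages + 1):
        request = urllib.request.Request(cfg['url'].format(page=page), headers=HEADERS)
        html = urllib.request.urlopen(request).read()
        xml = etree.HTML(html)
        titles = xml.xpath(cfg['title'])
        links = xml.xpath(cfg['link'])
        if not titles:          # empty page: stop early, as in the CSDN script
            break
        with open(cfg['out'], 'a', encoding='utf-8') as f:
            for t, l in zip(titles, links):
                f.write(f'{t.strip()} {l}\n')


if __name__ == '__main__':
    crawl('jianshu', pages=2)   # example call; pick any site key and page count
```

With this layout, adapting the crawler to another blog platform only requires adding one more entry to the site table, assuming the target page can still be parsed with plain XPath.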