Crawling WeChat Official Account Articles
Views: 6210
Published: 2019-06-21

This article is about 5213 characters long; reading it takes roughly 17 minutes.

Sogou's WeChat search aggregates Official Account profiles and their articles. The crawler below scrapes those search results through a proxy pool, switching to a fresh proxy whenever Sogou answers with its anti-spider 302 redirect, and stores each parsed article in MongoDB.
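The code assumes a proxy pool service is already running locally and that its endpoint (PROXY_POOL_URL in config.py) returns a bare host:port string as plain text; that URL and response format are assumptions here, so adapt them to whichever pool you actually run. A minimal sanity check before starting the crawl:

import requests

# Assumed proxy-pool endpoint (matches config.py below); expected to return "host:port" as plain text.
PROXY_POOL_URL = 'http://127.0.0.1:5555/random'

resp = requests.get(PROXY_POOL_URL)
print(resp.status_code, resp.text)  # e.g. 200 and "12.34.56.78:8888"

# Route one request through that proxy the same way spider.py does.
proxies = {'http': 'http://' + resp.text.strip()}
test = requests.get('http://weixin.sogou.com/', headers={'User-Agent': 'Mozilla/5.0'},
                    proxies=proxies, allow_redirects=False)
print(test.status_code)  # 200 means the proxy works; 302 means Sogou has already flagged it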

 

spider.py

from urllib.parse import urlencode

import pymongo
import requests
from lxml.etree import XMLSyntaxError
from requests.exceptions import ConnectionError
from pyquery import PyQuery as pq

from config import *

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]

base_url = 'http://weixin.sogou.com/weixin?'

headers = {
    'Cookie': 'IPLOC=CN1100; SUID=194E796A2E08990A000000005B114E85; SUV=1527860869604056; ABTEST=1|1527860872|v1; SNUID=9FCBFCEF8680EB12510E6A9C86088B29; weixinIndexVisited=1; JSESSIONID=aaaqa95rD87Zu9-CJwlnw; sct=5; ppinf=5|1527862844|1529072444|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8Y3J0OjEwOjE1Mjc4NjI4NDR8cmVmbmljazoyNzolRTclOEUlOEIlRTclOTAlQjMlRTYlOUQlQjB8dXNlcmlkOjQ0Om85dDJsdUh5bE5VSDJEVWNuSHBDWnVOVG9sN2tAd2VpeGluLnNvaHUuY29tfA; pprdig=EZE8CVVtoUTqmCoJj6bEWwKngY4di5UpGDFImTA9-1qrMK_tIJEtUyGR9_0Jcv5Xw1EuqLO9BNFvAKQv5DOQvmCWh-jxudk7SGv89NuhCLow7dxPysoOtLSI-keSaKVLKT82Vhg7rDBg0SlQ3y2uiG53lBUWL0wLVw4D_f_7MLg; sgid=17-35315605-AVsRVjwpV4ichpAzPibp6olGY; ppmdig=1527862844000000243bdb95cb03e086685bb1de06087c32',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.7 Safari/537.36'
}

proxy = None


def get_proxy():
    """Fetch one proxy (host:port) from the proxy pool."""
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def get_html(url, count=1):
    """Request a Sogou page; on the anti-spider 302, switch to a proxy and retry."""
    print('Crawling', url)
    print('Trying Count', count)
    global proxy
    if count >= MAX_COUNT:
        print('Tried Too Many Counts')
        return None
    try:
        if proxy:
            proxies = {
                'http': 'http://' + proxy
            }
            response = requests.get(url, allow_redirects=False, headers=headers, proxies=proxies)
        else:
            response = requests.get(url, allow_redirects=False, headers=headers)
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # 302 means Sogou has flagged the current IP: grab a new proxy and retry.
            # The retry counter is deliberately reset here, so the crawler keeps
            # switching proxies until one of them gets through.
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                return get_html(url)
            else:
                print('Get Proxy Failed')
                return None
    except ConnectionError as e:
        print('Error Occurred', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)


def get_index(keyword, page):
    """Download one page of Sogou WeChat search results for the keyword."""
    data = {
        'query': keyword,
        'type': 2,
        'page': page
    }
    queries = urlencode(data)
    url = base_url + queries
    html = get_html(url)
    return html


def parse_index(html):
    """Yield the article links found on a search-result page."""
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')


def get_detail(url):
    """Download an article page (fetched directly, without proxy or custom headers)."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def parse_detail(html):
    """Extract title, content, publish date, account nickname and WeChat ID from an article page."""
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        date = doc('#publish_time').text()
        nickname = doc('#js_profile_qrcode > div > strong').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat
        }
    except XMLSyntaxError:
        return None


def save_to_mongo(data):
    """Upsert the article into the 'articles' collection, keyed by title."""
    if db['articles'].update({'title': data['title']}, {'$set': data}, True):
        print('Saved to Mongo', data['title'])
    else:
        print('Saved to Mongo Failed', data['title'])


def main():
    for page in range(1, 101):
        html = get_index(KEYWORD, page)
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                article_html = get_detail(article_url)
                if article_html:
                    article_data = parse_detail(article_html)
                    print(article_data)
                    if article_data:
                        save_to_mongo(article_data)


if __name__ == '__main__':
    main()

 

config.py
PROXY_POOL_URL = 'http://127.0.0.1:5555/random'
KEYWORD = 'python'
MONGO_URI = 'localhost'
MONGO_DB = 'weixin'
MAX_COUNT = 5
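With the proxy pool and a local MongoDB instance running, start the crawl with python spider.py. A minimal sketch for checking what was stored afterwards (the database and collection names come from config.py and spider.py; the queries themselves are standard pymongo 3.7+):

import pymongo

# Same connection settings as config.py.
client = pymongo.MongoClient('localhost')
db = client['weixin']

# How many articles were saved, and a peek at the first few.
print(db['articles'].count_documents({}))
for doc in db['articles'].find().limit(3):
    print(doc['title'], doc['date'], doc['nickname'])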

 

Reposted from: https://www.cnblogs.com/wanglinjie/p/9231559.html
