38e7c284e8
- Updated `.gitignore` to streamline ignored files and added logging for common sites. - Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings. - Refined `README.md` to clarify project structure and usage instructions. - Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support. - Refactored multiple spider scripts to utilize a session-based approach for HTTP requests, improving error handling and proxy management. - Updated `export_lawyers_excel.py` to include a default timestamp for data exports.
120 lines
5.6 KiB
Python
120 lines
5.6 KiB
Python
import os

# Database connection settings.
#
# Each value can be overridden through an environment variable; the literal
# fallbacks keep the module behaving exactly as before when none is set.
# NOTE(review): the fallback credentials are committed in plain text. Rotate
# them and supply the real values via the environment - TODO confirm with the
# deployment owner.
DB_CONFIG = {
    "host": os.environ.get("LAWYER_DB_HOST", "8.134.219.222"),  # database host
    "user": os.environ.get("LAWYER_DB_USER", "lawyer"),  # database user name
    "password": os.environ.get("LAWYER_DB_PASSWORD", "CTxr8yGwsSX3NdfJ"),  # database password
    "database": os.environ.get("LAWYER_DB_NAME", "lawyer"),  # database (schema) name
    "charset": "utf8mb4",
}

# Settings specific to the Weixin (WeChat) spider.
#
# NOTE(review): TOKEN, FINGERPRINT and COOKIE look like values captured from a
# logged-in browser session and are committed in plain text. They presumably
# expire and should be treated as secrets - TODO confirm and move out of VCS.
WEIXIN_CONFIG = {
    "TOKEN": "756858506",  # your token
    "FINGERPRINT": "1caa5fc52ac489e20a175e153dd3ef21",
    # Cookie name -> value pairs, kept exactly as captured.
    "COOKIE": {
        "appmsglist_action_3258147150": "card",
        "mm_lang": "zh_CN",
        "ts_uid": "8295434560",
        "markHashId_L": "417c7f0e-5d9f-4048-b844-28f78ed2a838",
        "_qimei_uuid42": "19b0d0b0c2d100de3df57d2afbc5018a9b4ae103e1",
        "_qimei_i_3": "59c5508a935b04dac7c1ab340fd172b5a5eba4f7160d5683e2867a5a7094713e616364943989e2a29e9f",
        "_qimei_h38": "b885c955f8e9995f103aac140200000421811e",
        "_qimei_i_1": "4ddd76d09d525588c892fb6653d17ae9feebf2f0125852d3e78e2c582493206c616333973981e3dd83abc2e0",
        "_qpsvr_localtk": "0.2780749298744084",
        "RK": "ZGEMOpzbOS",
        "ptcz": "90084a2b43c84a92d1b9082da98fd0e92369fcde4f2edbbc85661539c7917055",
        "pac_uid": "0_HXj3iphPm0Y4a",
        "_qimei_fingerprint": "bd1870aaecd7a9bb84aa53b9ad9a2c55",
        "rewardsn": "",
        "wxtokenkey": "777",
        "omgid": "0_HXj3iphPm0Y4a",
        "sig_login": "h01218fdccf5b63c15a6c5edb19ce20d0481c52723ee44ab56b9fc1415ff39c9ff0dd2000e12f1de8ae",
        "ua_id": "QXSOTQUjDFjoH63yAAAAAPILc15EwzRTwdqntEiCGSE=",
        "mp_token": "1331492699",
        "appletToken": "2105598806",
        "__wx_phantom_mark__": "breQbE92JS",
        "mmad_session": "2bd2e1824d701b521c16fa35de0378e55273ce93a68ac0cc9ca30e8ad5b2e9f6fc419dd5fed1cd17f0a57fc3c327e03ccf325c1e1e97dde41374a9d8067d9aa700c8b87a29b0d3caf7f949761d8f4eeb56a1e3ddbc5a5d3a573e5b83971cd92e11de1c56c245721266e7088080fefde3",
        "pgv_info": "ssid=s5739471549",
        "pgv_pvid": "2616937300",
        "_gcl_au": "1.1.954868153.1769494261",
        "wxuin": "69676812527831",
        "_clck": "3258147150|1|g35|0",
        "uuid": "e07aa2889db56b1901e1fb6b1286d9a7",
        "rand_info": "CAESIBnfIxLJoUVe5wP4SI/ADWnrnPUBlJDb4yyA7Et1+ZfF",
        "slave_bizuin": "3258147150",
        "data_bizuin": "3258147150",
        "bizuin": "3258147150",
        "data_ticket": "kv+SnLJADgPlcKQPIbYnfbEAxogpIMfAo/n0/HjtChnfDmQSogWvkO82/mUtzpcc",
        "slave_sid": "eFNMcEZ3bnhvRkppZVNkTDE4dFFnM0ZzdFM1REhpemZORHRnVnlnRHhKU29vY1ZBY0dJZkFHcXB5Nko4aV9pbVlnRTBRVDE0NzdIUDF4T3NTSDVzdXBJS2d3WFFuR3hiMWVVbG5ZTURfYmh3YTFTallIb2JXOWpyTWxXS25jbVFRVmtXWHVaWGdCN2lqZzVm",
        "slave_user": "gh_fe76760560d0",
        "xid": "34f577adf2c28e5b9f04de93c614c5c4",
        "_clsk": "639w4k|1769742296130|3|1|mp.weixin.qq.com/weheat-agent/payload/record",
    },
    "COUNT": 20,  # number of items per page
    # Maximum requests per second; raising it is faster but increases the
    # risk of tripping the site's risk control.
    "REQUESTS_PER_SECOND": 8,
    "PAGE_DELAY": 0.8,  # seconds to wait after each page is collected
    "CITY_DELAY": 0.3,  # seconds to wait after each city is collected
}

# Common HTTP request headers (desktop Chrome user agent, sent as an
# XMLHttpRequest-style request).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
    "X-Requested-With": "XMLHttpRequest",
}

# Settings for the Lawtime spider: it sends its own headers with a mobile
# (iPhone Safari) user agent.
LAWTIME_CONFIG = {
    "HEADERS": {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
    },
}

# Redis settings - used for the crawl index and for resuming from a checkpoint.
# NOTE(review): the keys look like redis-py client keyword arguments - confirm
# against the code that consumes this dict.
REDIS_CONFIG = {
    "host": "127.0.0.1",
    "port": 6379,
    "password": "",
    "db": 0,  # use database 0
    "decode_responses": True,  # decode responses automatically
    "socket_timeout": 5,  # connection timeout
    "socket_connect_timeout": 5,  # connection-establishment timeout
    "health_check_interval": 30,  # health-check interval
    "retry_on_timeout": True,  # retry on timeout
    "max_connections": 20,  # maximum number of connections
}

# Redis key-name templates. Placeholders in braces ({spider_name},
# {session_id}) are meant to be filled in by the caller, e.g. via str.format.
REDIS_KEYS = {
    "spider_progress": "lawyer:spider:progress:{spider_name}",  # spider progress
    "url_processed": "lawyer:url:processed:{spider_name}",  # set of processed URLs
    "url_failed": "lawyer:url:failed:{spider_name}",  # set of failed URLs
    "spider_stats": "lawyer:stats:{spider_name}",  # per-spider statistics
    "global_stats": "lawyer:global:stats",  # global statistics
    "session_info": "lawyer:session:{session_id}",  # session information
    "url_queue": "lawyer:queue:{spider_name}",  # URL queue
    "duplicate_filter": "lawyer:duplicate:{spider_name}",  # de-duplication filter
}

# MongoDB settings - used for log storage.
MONGO_CONFIG = {
    "uri": "mongodb://127.0.0.1:27017/",
    "database": "lawyer",
    # Logical log category -> collection name.
    "collections": {
        "logs": "logs",  # general logs
        "spider_logs": "spider_logs",  # spider-specific logs
        "error_logs": "error_logs",  # error logs
        "system_logs": "system_logs",  # system logs
        "performance_logs": "performance_logs",  # performance logs
    },
    # NOTE(review): these look like MongoClient keyword options - confirm
    # against the code that consumes this dict.
    "options": {
        "maxPoolSize": 10,  # maximum connections in the pool
        "minPoolSize": 1,  # minimum connections in the pool
        "maxIdleTimeMS": 30000,  # maximum idle time
        "serverSelectionTimeoutMS": 5000,  # server-selection timeout
        "connectTimeoutMS": 10000,  # connection timeout
        "socketTimeoutMS": 30000,  # socket timeout
    },
}
|