Files
lawyers/config.py
T
hello-dd-code 38e7c284e8 feat: enhance project configuration and improve data export functionality
- Updated `.gitignore` to streamline ignored files and added logging for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to utilize a session-based approach for HTTP requests, improving error handling and proxy management.
- Updated `export_lawyers_excel.py` to include a default timestamp for data exports.
2026-03-18 10:02:25 +08:00

120 lines
5.6 KiB
Python

import os

# Database connection settings.
#
# Every credential-bearing value can be overridden through an environment
# variable so real secrets do not have to live in version control:
#   LAWYER_DB_HOST, LAWYER_DB_USER, LAWYER_DB_PASSWORD, LAWYER_DB_NAME
# The literal fallbacks are the previously hard-coded values; they are kept
# only so existing deployments keep working unchanged.
# SECURITY: the fallback password has been committed to the repository and
# should be treated as compromised -- rotate it and supply the replacement
# via LAWYER_DB_PASSWORD.
DB_CONFIG = {
    "host": os.environ.get("LAWYER_DB_HOST", "8.134.219.222"),  # database address
    "user": os.environ.get("LAWYER_DB_USER", "lawyer"),  # database user name
    "password": os.environ.get("LAWYER_DB_PASSWORD", "CTxr8yGwsSX3NdfJ"),  # database password
    "database": os.environ.get("LAWYER_DB_NAME", "lawyer"),  # database (schema) name
    "charset": "utf8mb4",
}
# Settings specific to the Weixin (WeChat) spider.
# NOTE(review): TOKEN, FINGERPRINT and the COOKIE entries look like values
# captured from a logged-in mp.weixin.qq.com session; presumably they expire
# and have to be refreshed by hand -- confirm with the spider that reads them.
WEIXIN_CONFIG = {
    "TOKEN": "756858506",  # your token
    "FINGERPRINT": "1caa5fc52ac489e20a175e153dd3ef21",
    # Cookie name -> value pairs; insertion order is kept as in the original.
    "COOKIE": {
        "appmsglist_action_3258147150": "card",
        "mm_lang": "zh_CN",
        "ts_uid": "8295434560",
        "markHashId_L": "417c7f0e-5d9f-4048-b844-28f78ed2a838",
        "_qimei_uuid42": "19b0d0b0c2d100de3df57d2afbc5018a9b4ae103e1",
        "_qimei_i_3": "59c5508a935b04dac7c1ab340fd172b5a5eba4f7160d5683e2867a5a7094713e616364943989e2a29e9f",
        "_qimei_h38": "b885c955f8e9995f103aac140200000421811e",
        "_qimei_i_1": "4ddd76d09d525588c892fb6653d17ae9feebf2f0125852d3e78e2c582493206c616333973981e3dd83abc2e0",
        "_qpsvr_localtk": "0.2780749298744084",
        "RK": "ZGEMOpzbOS",
        "ptcz": "90084a2b43c84a92d1b9082da98fd0e92369fcde4f2edbbc85661539c7917055",
        "pac_uid": "0_HXj3iphPm0Y4a",
        "_qimei_fingerprint": "bd1870aaecd7a9bb84aa53b9ad9a2c55",
        "rewardsn": "",
        "wxtokenkey": "777",
        "omgid": "0_HXj3iphPm0Y4a",
        "sig_login": "h01218fdccf5b63c15a6c5edb19ce20d0481c52723ee44ab56b9fc1415ff39c9ff0dd2000e12f1de8ae",
        "ua_id": "QXSOTQUjDFjoH63yAAAAAPILc15EwzRTwdqntEiCGSE=",
        "mp_token": "1331492699",
        "appletToken": "2105598806",
        "__wx_phantom_mark__": "breQbE92JS",
        "mmad_session": "2bd2e1824d701b521c16fa35de0378e55273ce93a68ac0cc9ca30e8ad5b2e9f6fc419dd5fed1cd17f0a57fc3c327e03ccf325c1e1e97dde41374a9d8067d9aa700c8b87a29b0d3caf7f949761d8f4eeb56a1e3ddbc5a5d3a573e5b83971cd92e11de1c56c245721266e7088080fefde3",
        "pgv_info": "ssid=s5739471549",
        "pgv_pvid": "2616937300",
        "_gcl_au": "1.1.954868153.1769494261",
        "wxuin": "69676812527831",
        "_clck": "3258147150|1|g35|0",
        "uuid": "e07aa2889db56b1901e1fb6b1286d9a7",
        "rand_info": "CAESIBnfIxLJoUVe5wP4SI/ADWnrnPUBlJDb4yyA7Et1+ZfF",
        "slave_bizuin": "3258147150",
        "data_bizuin": "3258147150",
        "bizuin": "3258147150",
        "data_ticket": "kv+SnLJADgPlcKQPIbYnfbEAxogpIMfAo/n0/HjtChnfDmQSogWvkO82/mUtzpcc",
        "slave_sid": "eFNMcEZ3bnhvRkppZVNkTDE4dFFnM0ZzdFM1REhpemZORHRnVnlnRHhKU29vY1ZBY0dJZkFHcXB5Nko4aV9pbVlnRTBRVDE0NzdIUDF4T3NTSDVzdXBJS2d3WFFuR3hiMWVVbG5ZTURfYmh3YTFTallIb2JXOWpyTWxXS25jbVFRVmtXWHVaWGdCN2lqZzVm",
        "slave_user": "gh_fe76760560d0",
        "xid": "34f577adf2c28e5b9f04de93c614c5c4",
        "_clsk": "639w4k|1769742296130|3|1|mp.weixin.qq.com/weheat-agent/payload/record",
    },
    "COUNT": 20,  # number of items per page
    "REQUESTS_PER_SECOND": 8,  # max requests per second (higher is faster but risks risk-control blocking)
    "PAGE_DELAY": 0.8,  # seconds to wait after each page is collected
    "CITY_DELAY": 0.3,  # seconds to wait after each city is collected
}
# Generic request headers.
HEADERS = {
    # Desktop Chrome on Windows; adjacent literals concatenate into one string.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/138.0.0.0 Safari/537.36"
    ),
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
    "X-Requested-With": "XMLHttpRequest",
}
# Lawtime spider configuration.
LAWTIME_CONFIG = {
    "HEADERS": {
        # Mobile Safari (iPhone) User-Agent; adjacent literals concatenate
        # into one string.
        "User-Agent": (
            "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) "
            "Version/13.0.3 Mobile/15E148 Safari/604.1"
        ),
    },
}
# Redis settings -- used for the crawl index and for resuming from a
# checkpoint.
# NOTE(review): the key names match redis-py client keyword arguments;
# presumably this dict is unpacked into the client constructor -- confirm.
REDIS_CONFIG = {
    "host": "127.0.0.1",
    "port": 6379,
    "password": "",
    "db": 0,  # use logical database 0
    "decode_responses": True,  # decode responses automatically
    "socket_timeout": 5,  # socket timeout (presumably seconds, for socket operations)
    "socket_connect_timeout": 5,  # timeout for establishing the connection
    "health_check_interval": 30,  # health-check interval
    "retry_on_timeout": True,  # retry when a timeout occurs
    "max_connections": 20,  # maximum number of connections
}
# Redis key-name templates. Placeholders in braces ({spider_name},
# {session_id}) are meant to be filled in with str.format by the caller.
REDIS_KEYS = {
    "spider_progress": "lawyer:spider:progress:{spider_name}",  # spider progress
    "url_processed": "lawyer:url:processed:{spider_name}",  # set of processed URLs
    "url_failed": "lawyer:url:failed:{spider_name}",  # set of failed URLs
    "spider_stats": "lawyer:stats:{spider_name}",  # per-spider statistics
    "global_stats": "lawyer:global:stats",  # global statistics
    "session_info": "lawyer:session:{session_id}",  # session information
    "url_queue": "lawyer:queue:{spider_name}",  # URL queue
    "duplicate_filter": "lawyer:duplicate:{spider_name}",  # de-duplication filter
}
# MongoDB settings -- used for log storage.
MONGO_CONFIG = {
    "uri": "mongodb://127.0.0.1:27017/",
    "database": "lawyer",
    # Logical log category -> collection name.
    "collections": {
        "logs": "logs",  # general logs
        "spider_logs": "spider_logs",  # spider-specific logs
        "error_logs": "error_logs",  # error logs
        "system_logs": "system_logs",  # system logs
        "performance_logs": "performance_logs",  # performance logs
    },
    # NOTE(review): these names match MongoDB client connection options;
    # presumably they are forwarded to the client constructor -- confirm.
    "options": {
        "maxPoolSize": 10,  # maximum connections in the pool
        "minPoolSize": 1,  # minimum connections in the pool
        "maxIdleTimeMS": 30000,  # maximum idle time
        "serverSelectionTimeoutMS": 5000,  # server selection timeout
        "connectTimeoutMS": 10000,  # connection timeout
        "socketTimeoutMS": 30000,  # socket timeout
    },
}