38e7c284e8
- Updated `.gitignore` to streamline ignored files and added logging for common sites. - Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings. - Refined `README.md` to clarify project structure and usage instructions. - Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support. - Refactored multiple spider scripts to utilize a session-based approach for HTTP requests, improving error handling and proxy management. - Updated `export_lawyers_excel.py` to include a default timestamp for data exports.
356 lines
13 KiB
Python
356 lines
13 KiB
Python
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
import time
|
||
from html import unescape
|
||
from http.cookies import SimpleCookie
|
||
from typing import Dict, Optional
|
||
from urllib.parse import urlencode
|
||
|
||
import requests
|
||
import urllib3
|
||
|
||
# Put this file's directory and its parent on sys.path (presumably so the
# project-local `config` and `utils` imports below resolve when the file is
# run directly as a script).
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
for path in (current_dir, project_root):
    if path in sys.path:
        # Already importable; avoid duplicate entries.
        continue
    sys.path.append(path)
|
||
|
||
import config as project_config
|
||
from utils.rate_limiter import wait_for_request, global_rate_limiter
|
||
|
||
# Search endpoint queried by the spider.
API_ENDPOINT = "https://mp.weixin.qq.com/cgi-bin/videosnap"

# Value written to the `domain` column of stored rows and used in the
# duplicate check.
DOMAIN = "mp.weixin.qq.com"

# Fallback request headers, used when the project config module does not
# define HEADERS.
DEFAULT_HEADERS = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/138.0.0.0 Safari/537.36",
        ]
    ),
    "Accept": "*/*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
    "X-Requested-With": "XMLHttpRequest",
}
|
||
# Built-in defaults for the Weixin spider.
#
# SECURITY: credentials are intentionally empty here. A session token,
# fingerprint and authenticated cookie jar are secrets and must not live in
# source control; supply them through WEIXIN_CONFIG in config.py or through
# the WEIXIN_TOKEN / WEIXIN_FINGERPRINT / WEIXIN_COOKIE environment
# variables. WeixinSpider._validate_runtime_config reports which of them
# are missing and refuses to run without them. Any credentials that were
# previously committed in this dict should be treated as leaked and rotated.
DEFAULT_WEIXIN_CONFIG = {
    "TOKEN": "",
    "FINGERPRINT": "",
    "COOKIE": {},
    # Sent as the `count` query parameter of each search request.
    "COUNT": 20,
    # Assigned to the shared rate limiter's `max_requests`.
    "REQUESTS_PER_SECOND": 5,
    # Seconds slept after each result page.
    "PAGE_DELAY": 5,
    # Seconds slept after each city.
    "CITY_DELAY": 2,
}
|
||
|
||
|
||
def _parse_cookie_value(cookie_value) -> Dict[str, str]:
|
||
if isinstance(cookie_value, dict):
|
||
return {str(key): str(value) for key, value in cookie_value.items()}
|
||
|
||
if not cookie_value:
|
||
return {}
|
||
|
||
if isinstance(cookie_value, str):
|
||
text = cookie_value.strip()
|
||
if not text:
|
||
return {}
|
||
try:
|
||
parsed = json.loads(text)
|
||
except json.JSONDecodeError:
|
||
parsed = None
|
||
if isinstance(parsed, dict):
|
||
return {str(key): str(value) for key, value in parsed.items()}
|
||
|
||
cookie = SimpleCookie()
|
||
cookie.load(text)
|
||
return {key: morsel.value for key, morsel in cookie.items()}
|
||
|
||
return {}
|
||
|
||
|
||
def _load_weixin_config() -> Dict:
    """Assemble the effective Weixin settings.

    Layers, lowest to highest precedence:
      1. ``DEFAULT_WEIXIN_CONFIG``;
      2. ``WEIXIN_CONFIG`` on the project config module, when it is a dict;
      3. non-empty ``WEIXIN_<KEY>`` environment variables.

    Afterwards ``COOKIE`` is normalised to a dict, ``COUNT`` and
    ``REQUESTS_PER_SECOND`` are coerced to int, and ``PAGE_DELAY`` and
    ``CITY_DELAY`` to float. A value that cannot be coerced falls back to
    the built-in default for that key.
    """
    settings = DEFAULT_WEIXIN_CONFIG.copy()

    overrides = getattr(project_config, "WEIXIN_CONFIG", None)
    if isinstance(overrides, dict):
        settings.update(overrides)

    env_keys = (
        "TOKEN",
        "FINGERPRINT",
        "COOKIE",
        "COUNT",
        "REQUESTS_PER_SECOND",
        "PAGE_DELAY",
        "CITY_DELAY",
    )
    for key in env_keys:
        env_value = os.getenv(f"WEIXIN_{key}")
        # Unset and empty variables both leave the lower layers in place.
        if env_value:
            settings[key] = env_value

    settings["COOKIE"] = _parse_cookie_value(settings.get("COOKIE"))

    coercions = (
        (int, ("COUNT", "REQUESTS_PER_SECOND")),
        (float, ("PAGE_DELAY", "CITY_DELAY")),
    )
    for caster, keys in coercions:
        for key in keys:
            try:
                settings[key] = caster(settings[key])
            except (TypeError, ValueError):
                settings[key] = DEFAULT_WEIXIN_CONFIG[key]

    return settings
|
||
|
||
|
||
def _strip_html(text: str) -> str:
|
||
if not text:
|
||
return ""
|
||
return re.sub(r"<[^>]+>", "", unescape(text)).strip()
|
||
|
||
|
||
class WeixinSpider:
    """Requests-based collector for WeChat Channels accounts.

    For every city loaded from the database the spider searches the
    endpoint for "<city>律所", extracts a mobile number from each account's
    signature and stores accounts that have one in the ``lawyer`` table.
    """

    # Mainland mobile number, optionally prefixed with +86/86. The digit
    # boundaries stop the pattern from matching the first 11 digits of a
    # longer digit run (ID numbers, licence numbers, ...).
    _PHONE_RE = re.compile(r"(?<!\d)(?:\+?86[-\s]?)?(1[3-9]\d{9})(?!\d)")

    def __init__(self, db_connection):
        """Load configuration and prepare the HTTP session.

        Args:
            db_connection: Database helper; this class calls its
                ``select_data``, ``is_data_exist`` and ``insert_data``
                methods.
        """
        self.db = db_connection
        self.config = _load_weixin_config()
        self.token = str(self.config.get("TOKEN", "")).strip()
        self.fingerprint = str(self.config.get("FINGERPRINT", "")).strip()
        self.cookies = self.config.get("COOKIE", {})
        # Kept as a string because it is only used as a query parameter.
        self.count = str(self.config.get("COUNT", DEFAULT_WEIXIN_CONFIG["COUNT"]))
        self.page_delay = max(0.0, float(self.config.get("PAGE_DELAY", DEFAULT_WEIXIN_CONFIG["PAGE_DELAY"])))
        self.city_delay = max(0.0, float(self.config.get("CITY_DELAY", DEFAULT_WEIXIN_CONFIG["CITY_DELAY"])))
        max_rps = self.config.get("REQUESTS_PER_SECOND")
        if max_rps:
            # NOTE: this mutates the limiter object imported from
            # utils.rate_limiter, not a per-spider copy.
            global_rate_limiter.max_requests = int(max_rps)

        headers = getattr(project_config, "HEADERS", DEFAULT_HEADERS).copy()
        headers["Referer"] = "https://mp.weixin.qq.com/"
        self.session = requests.Session()
        # Ignore proxy/CA settings from the environment.
        self.session.trust_env = False
        self.session.headers.update(headers)
        # Requests are sent with verify=False; silence the resulting warning.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    def _validate_runtime_config(self) -> bool:
        """Return True when token, fingerprint and cookies are all set.

        When something is missing, prints which keys are absent and how to
        provide them, then returns False.
        """
        required = (
            ("TOKEN", self.token),
            ("FINGERPRINT", self.fingerprint),
            ("COOKIE", self.cookies),
        )
        missing = [name for name, value in required if not value]
        if not missing:
            return True

        print(
            "[微信] 配置不完整,缺少: "
            + ", ".join(missing)
            + "。请在 config.py 的 WEIXIN_CONFIG 中补齐,"
            + "或通过环境变量 WEIXIN_TOKEN / WEIXIN_FINGERPRINT / WEIXIN_COOKIE 提供。"
        )
        return False

    def _load_areas(self):
        """Return city rows from the first area table that yields any.

        Tries ``area_new``, ``area`` and ``area2`` in order. A table whose
        query raises is skipped; the last such error is printed if no table
        produced rows. Returns an empty list when nothing was found.
        """
        condition = "domain='maxlaw' AND level=2"
        tables = ("area_new", "area", "area2")
        last_error = None
        for table in tables:
            try:
                rows = self.db.select_data(table, "province, city", condition) or []
            except Exception as exc:
                # Best effort: remember the failure and try the next table.
                last_error = exc
                continue
            if rows:
                print(f"[微信] 城市来源表: {table}, 城市数: {len(rows)}")
                return rows

        if last_error:
            print(f"[微信] 加载地区数据失败: {last_error}")
        print("[微信] 无城市数据(已尝试 area_new/area/area2)")
        return []

    def _build_query_url(self, query: str, buffer: str) -> str:
        """Build the search URL for *query*.

        Args:
            query: Search keywords.
            buffer: Pagination cursor (the previous response's
                ``last_buff``); empty for the first page.
        """
        params = {
            "action": "search",
            "scene": "1",
            "query": query,
            "count": self.count,
            "buffer": buffer,
            "fingerprint": self.fingerprint,
            "token": self.token,
            "lang": "zh_CN",
            "f": "json",
            "ajax": "1",
        }
        return f"{API_ENDPOINT}?{urlencode(params)}"

    def _extract_phone(self, text: str) -> Optional[str]:
        """Return the first mobile number found in *text*, or None.

        Only a stand-alone 11-digit number (optionally prefixed with
        +86/86) is accepted; the returned value never includes the prefix.
        """
        if not text:
            return None
        match = self._PHONE_RE.search(text)
        return match.group(1) if match else None

    def _parse_name(self, acct: Dict) -> str:
        """Return the account's display name with markup removed.

        Prefers ``highlight_nickname`` and falls back to ``nickname``.
        """
        highlight = _strip_html(acct.get("highlight_nickname", ""))
        if highlight:
            return highlight
        return _strip_html(acct.get("nickname", ""))

    def _store_account(self, acct: Dict, province: str, city: str) -> None:
        """Insert *acct* into the ``lawyer`` table if it carries a new phone.

        Accounts without a phone number in their signature are skipped, as
        are accounts whose phone already exists for this domain. Insert
        failures are printed, not raised.
        """
        phone = self._extract_phone(acct.get("signature", ""))
        if not phone:
            return

        name = self._parse_name(acct)

        # The condition is string-built, but `phone` consists solely of
        # digits matched by _PHONE_RE and DOMAIN is a module constant.
        if self.db.is_data_exist("lawyer", f"phone='{phone}' and domain='{DOMAIN}'"):
            print(f" -- 已存在律师: {name} ({phone})")
            return

        # `auth_info` may be absent or JSON null; only read it when it is
        # actually a dict.
        auth_info = acct.get("auth_info")
        law_firm = auth_info.get("auth_profession") if isinstance(auth_info, dict) else None

        lawyer_data = {
            "phone": phone,
            "province": province,
            "city": city,
            "law_firm": law_firm,
            "url": f"https://channels.weixin.qq.com/finder/creator/feeds?finder_username={acct.get('username', '')}",
            "create_time": int(time.time()),
            "domain": DOMAIN,
            "name": name,
            "params": json.dumps(acct, ensure_ascii=False),
        }

        try:
            inserted_id = self.db.insert_data("lawyer", lawyer_data)
            print(f" -> 新增律师: {lawyer_data['name']} ({phone}), ID: {inserted_id}")
        except Exception as exc:
            print(f" 插入失败 {lawyer_data['name']} ({phone}): {exc}")

    def _search_city(self, province: str, city: str) -> None:
        """Page through the search results for one city and store accounts.

        Paging stops on a request failure, an undecodable response, an API
        error, an empty page, the API reporting no more results, or a
        pagination cursor that did not advance.
        """
        city_name = city.replace('市', '')
        query = f"{city_name}律所"
        print(f"--- [微信] 开始采集城市: {province} - {city_name} ---")

        buffer = ""
        has_more = True
        page_no = 0

        while has_more:
            page_no += 1
            url = self._build_query_url(query, buffer)
            # NOTE(review): the printed URL contains the token and
            # fingerprint; consider redacting it if logs are shared.
            print(f"正在采集 '{query}' 第 {page_no} 页: {url}")

            wait_for_request()
            try:
                response = self.session.get(
                    url,
                    timeout=15,
                    cookies=self.cookies,
                    proxies={},  # explicitly disable proxies
                    # NOTE(review): TLS verification is off while session
                    # cookies are being sent; confirm this is still needed.
                    verify=False,
                )
                response.raise_for_status()
                data = response.json()
            except json.JSONDecodeError:
                # Must precede the RequestException handler: on recent
                # requests versions the JSON error raised by
                # response.json() is also a RequestException and would
                # otherwise be reported as a network failure. Only
                # response.json() raises this, so `response` is bound.
                print("解析返回的JSON失败。返回内容:", response.text[:200])
                break
            except requests.exceptions.RequestException as exc:
                print(f"网络请求失败: {exc}")
                break

            base_resp = data.get("base_resp") or {}
            if base_resp.get("ret") != 0:
                print(f"API返回错误: {base_resp.get('err_msg')}")
                if "invalid ticket" in (base_resp.get('err_msg') or ""):
                    print("Token 或 Cookie 可能失效,请更新配置。")
                break

            accounts = data.get("acct_list") or []
            if not accounts:
                print("本页未找到更多律师信息。")
                break

            for acct in accounts:
                self._store_account(acct, province, city_name)

            has_more = bool(data.get("acct_continue_flag"))
            next_buffer = data.get("last_buff") or ""
            if has_more and (not next_buffer or next_buffer == buffer):
                # Without a fresh cursor the next request would return the
                # same page again, looping forever.
                print("分页游标未更新,停止翻页以避免重复请求。")
                has_more = False
            buffer = next_buffer
            time.sleep(self.page_delay)

        print(f"--- [微信] 城市: {city_name} 采集完成 ---\n")

    def run(self) -> None:
        """Validate the configuration, then crawl every configured city.

        An exception raised while processing one city is printed and does
        not stop the remaining cities.
        """
        print("启动微信视频号律师信息采集...")
        if not self._validate_runtime_config():
            return

        areas = self._load_areas()
        if not areas:
            print("[微信] 未能从 `area_new` 表获取到地区信息。")
            return

        for area in areas:
            province = area.get("province", "")
            city = area.get("city", "")
            if not city:
                continue
            try:
                self._search_city(province, city)
            except Exception as exc:
                # Top-level boundary: keep going with the next city.
                print(f"采集 {province}-{city} 时发生错误: {exc}")
            time.sleep(self.city_delay)

        print("微信视频号律师信息采集完成。")
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: open the project database helper as a context
    # manager and run one full crawl with it.
    from Db import Db

    with Db() as connection:
        WeixinSpider(connection).run()
|