38e7c284e8
- Updated `.gitignore` to streamline ignored files and added logging for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to use a session-based approach for HTTP requests, improving error handling and proxy management.
- Updated `export_lawyers_excel.py` to include a default timestamp for data exports.
225 lines
8.3 KiB
Python
import json
import os
import sys
import time
import random
from typing import Dict, List, Set, Optional

current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)

import requests
from request.proxy_config import get_proxies, report_proxy_status
from Db import Db

DOMAIN = "找法网"
LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"


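# FindlawSpider crawls lawyer listings from m.findlaw.cn's mobile ajax
# endpoint city by city and inserts new records into the lawyer table,
# deduplicating by phone number within the 找法网 domain.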
class FindlawSpider:
    def __init__(self, db_connection):
        self.db = db_connection
        self.session = self._build_session()
        self.cities = self._load_cities()

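    # The session is rebuilt (rather than patched) whenever a block is
    # detected, so every retry starts from a fresh proxy and cookie jar.
    # trust_env=False keeps system-level HTTP(S)_PROXY variables from
    # overriding the proxies supplied by request.proxy_config.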
    def _build_session(self) -> requests.Session:
        report_proxy_status()
        session = requests.Session()
        session.trust_env = False
        proxies = get_proxies()
        if proxies:
            session.proxies.update(proxies)
        else:
            session.proxies.clear()
        session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                "Mobile/15E148 Safari/604.1"
            ),
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "Connection": "close",
        })
        return session

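    # Closing the old session may itself fail on a dead proxy connection;
    # that error is deliberately swallowed since a new session replaces it.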
    def _refresh_session(self) -> None:
        try:
            self.session.close()
        except Exception:
            pass
        self.session = self._build_session()

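    # _get centralizes all HTTP access: a 403 triggers a session refresh plus
    # exponential backoff (2**attempt seconds with random jitter), an SSL
    # error retries once with certificate verification disabled, and any
    # other request failure returns None so callers can skip the page.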
    def _get(self, url: str, referer: str, verify: bool = True, max_retries: int = 3) -> Optional[str]:
        headers = {"Referer": referer}
        for attempt in range(max_retries):
            try:
                resp = self.session.get(url, timeout=15, verify=verify, headers=headers)
                status_code = resp.status_code
                text = resp.text
                resp.close()
                if status_code == 403:
                    if attempt < max_retries - 1:
                        wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
                        print(f"Blocked with 403, retrying in {wait_time}s ({attempt + 1}/{max_retries}): {url}")
                        self._refresh_session()
                        time.sleep(wait_time)
                        continue
                    print(f"Request failed {url}: 403 Forbidden")
                    return None
                if status_code >= 400:
                    raise requests.exceptions.HTTPError(f"{status_code} Error: {url}")
                return text
            except requests.exceptions.SSLError:
                if verify:
                    return self._get(url, referer, verify=False, max_retries=max_retries)
                print(f"SSL error {url}")
                return None
            except requests.exceptions.RequestException as exc:
                print(f"Request failed {url}: {exc}")
                return None
        return None

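    # Deduplication is done per batch: the phones scraped from one page are
    # checked against the lawyer table in chunks of 500 so the SQL IN clause
    # stays a reasonable size, and only unseen numbers are inserted.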
    def _existing_phones(self, phones: List[str]) -> Set[str]:
        if not phones:
            return set()
        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(phones), chunk_size):
                chunk = phones[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [DOMAIN, *chunk])
                for row in cur.fetchall():
                    existing.add(row[0])
        finally:
            cur.close()
        return existing

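    # City rows are expected in one of several area tables; the first table
    # that yields rows wins. If none do, the counts printed below are purely
    # a diagnostic to show which table (if any) matched the condition.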
    def _load_cities(self):
        condition = "domain='findlaw' AND level=2"
        tables = ("area_new", "area2", "area")
        last_error = None
        for table in tables:
            try:
                rows = self.db.select_data(table, "city, province, pinyin", condition) or []
            except Exception as exc:
                last_error = exc
                continue
            if rows:
                missing_pinyin = sum(1 for r in rows if not (r.get("pinyin") or "").strip())
                print(f"[找法网] city source table: {table}, cities: {len(rows)}, missing pinyin: {missing_pinyin}")
                return rows

        if last_error:
            print(f"[找法网] failed to load area data: {last_error}")
        print("[找法网] no city data (tried area_new/area2/area)")
        for table in tables:
            try:
                cnt = self.db.select_data(table, "COUNT(*) AS cnt", condition)
                c = (cnt[0].get("cnt") if cnt else 0) if isinstance(cnt, (list, tuple)) else 0
                print(f"[找法网] check: {table} rows matching condition: {c}")
            except Exception:
                pass
        return []

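    # The ajax endpoint nominally returns JSON, but responses have been seen
    # with a leading BOM, wrapper text around the JSON object, or a JSON
    # string containing JSON (double-encoded), so parsing happens in stages.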
    def _fetch_page(self, url: str, referer: str) -> List[Dict]:
        text = self._get(url, referer, verify=True)
        if not text:
            return []

        try:
            # Some response bodies carry a BOM or a wrapper script; be lenient here.
            text = text.strip().lstrip("\ufeff")
            try:
                data = json.loads(text)
            except ValueError:
                json_start = text.find('{')
                json_end = text.rfind('}')
                if json_start == -1 or json_end == -1:
                    print(f"JSON parse failed {url}: response starts with: {text[:80]!r}")
                    return []
                cleaned = text[json_start:json_end + 1]
                data = json.loads(cleaned)
            if isinstance(data, str):
                try:
                    data = json.loads(data)
                except ValueError:
                    print(f"JSON parse failed {url}: still a string after second decode, starts with: {str(data)[:80]!r}")
                    return []
        except ValueError as exc:
            print(f"JSON parse failed {url}: {exc}")
            return []

        items = data.get("data", {}).get("lawyer_list", [])
        parsed = []
        for item in items:
            phone = (item.get("mobile") or "").replace("-", "")
            parsed.append({
                "name": item.get("username", ""),
                "law_firm": item.get("lawyer_lawroom", ""),
                "province": item.get("areaInfo", {}).get("province", ""),
                "city": item.get("areaInfo", {}).get("city", ""),
                "phone": phone,
                "url": url,
                "domain": DOMAIN,
                "create_time": int(time.time()),
                "params": json.dumps(item, ensure_ascii=False)
            })
        return parsed

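    # Pagination has no explicit page count: each city is walked page by page
    # until a page yields no lawyers, which also means a single failed request
    # ends that city's crawl early.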
    def run(self):
        print("Starting 找法网 crawl...")
        if not self.cities:
            print("No city data")
            return

        for city in self.cities:
            pinyin = city.get("pinyin")
            province = city.get("province", "")
            city_name = city.get("city", "")
            if not pinyin:
                continue
            print(f"Crawling {province}-{city_name}")
            page = 1
            while True:
                url = LIST_TEMPLATE.format(pinyin=pinyin, page=page)
                referer = f"https://m.findlaw.cn/{pinyin}/q_lawyer/"
                print(f" page {page}: {url}")
                items = self._fetch_page(url, referer)
                if not items:
                    break

                phones = [it.get("phone") for it in items if (it.get("phone") or "").strip()]
                existing = self._existing_phones(phones)

                for entry in items:
                    phone = entry.get("phone")
                    if not phone:
                        continue
                    if phone in existing:
                        print(f" -- exists: {entry['name']} ({phone})")
                        continue
                    try:
                        self.db.insert_data("lawyer", entry)
                        print(f" -> added: {entry['name']} ({phone})")
                    except Exception as exc:
                        print(f" insert failed: {exc}")

                page += 1

        print("找法网 crawl finished")


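# Entry point: Db is used as a context manager; assuming its __exit__ closes
# the underlying connection, the crawl cleans up even when it aborts early.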
if __name__ == "__main__":
    with Db() as db:
        spider = FindlawSpider(db)
        spider.run()
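# Run directly with no CLI arguments; proxy settings come from
# request.proxy_config and database credentials from the Db class defaults,
# assuming Db() can connect without explicit parameters.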