重构采集脚本并新增按时间导出Excel

- 统一五个站点采集逻辑与启动脚本
- 新增 dls_fresh 采集流程与日志优化
- 新增 export_lawyers_excel 按时间条件导出
- 默认导出近7天并支持扩展字段解析
- 整理 .gitignore,忽略 data/logs 本地产物
This commit is contained in:
hello-dd-code
2026-03-02 11:46:05 +08:00
parent 03847a4b8e
commit 19cf9ce901
12 changed files with 3545 additions and 1059 deletions
+433 -163
View File
@@ -1,9 +1,16 @@
import argparse
import ast
import hashlib
import json
import os
import random
import re
import sys
import time
import random
from typing import Dict, List, Set, Optional
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
import urllib3
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -13,197 +20,460 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
from request.requests_client import RequestClientError, RequestSSLError, RequestsClient
from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
# Site label in Chinese; identical in value to LEGACY_DOMAIN below.
DOMAIN = "找法网"
# List URL template with `pinyin` and `page` placeholders; same path and query
# string as LIST_URL_TEMPLATE below, which names the first placeholder `city_py`.
LIST_TEMPLATE = "https://m.findlaw.cn/{pinyin}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
# Silence urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Short site identifier recorded under `source.site` in each emitted record.
SITE_NAME = "findlaw"
# Value used for the `domain` column of `lawyer` rows and in the phone de-dup query.
LEGACY_DOMAIN = "找法网"
# Scheme + host that list URLs and Referer headers are built from.
SITE_BASE = "https://m.findlaw.cn"
# JavaScript file that is searched for the `iosProvinces` / `iosCitys` arrays.
CITY_DATA_URL = "https://static.findlawimg.com/minify/?b=js&f=m/public/v1/areaDataTwo.js"
# Per-city list endpoint; placeholders are `city_py` and `page`.
LIST_URL_TEMPLATE = SITE_BASE + "/{city_py}/q_lawyer/p{page}?ajax=1&order=0&sex=-1"
# Mainland-China mobile number: 11 digits, starting with 1 and a second digit 3-9.
PHONE_RE = re.compile(r"1[3-9]\d{9}")


@dataclass
class CityTarget:
    """One city to crawl, together with the province it belongs to."""

    # Province identifier, display name and pinyin slug.
    province_id: str
    province_name: str
    province_py: str
    # City identifier, display name and pinyin slug (the slug is used in list URLs).
    city_id: str
    city_name: str
    city_py: str


def normalize_phone(text: str) -> str:
    """Extract the first mobile number found in *text*.

    All non-digit characters are removed first, so separators such as
    ``-`` or spaces inside a number do not prevent a match.

    Args:
        text: Raw phone value. ``None`` and non-string values (for example an
            integer decoded from JSON) are accepted and converted with ``str``.

    Returns:
        The 11-digit number matched by ``PHONE_RE``, or ``""`` when there is none.
    """
    # str() guards against numeric values, which re.sub would reject with TypeError.
    compact = re.sub(r"\D", "", str(text or ""))
    match = PHONE_RE.search(compact)
    return match.group(0) if match else ""
class FindlawCrawler:
    """Crawl lawyer listings from the findlaw mobile site, city by city.

    Every lawyer found is emitted as a JSON record appended to a JSONL file.
    When a database connection is supplied, records that carry a mobile
    number are also inserted into the ``lawyer`` table, skipping phones that
    are already stored for ``LEGACY_DOMAIN``.
    """

    def __init__(
        self,
        max_pages: int = 9999,
        sleep_seconds: float = 0.1,
        use_proxy: bool = True,
        db_connection=None,
    ):
        """Configure limits, the optional database handle and the HTTP client.

        Args:
            max_pages: Upper bound on list pages fetched per city.
            sleep_seconds: Pause after each emitted record; negative values
                are clamped to 0.
            use_proxy: Forwarded to ``RequestsClient``.
            db_connection: Database wrapper, or ``None`` to skip DB writes.
        """
        self.max_pages = max_pages
        self.sleep_seconds = max(0.0, sleep_seconds)
        self.db = db_connection
        self.client = RequestsClient(
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                    "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                    "Mobile/15E148 Safari/604.1"
                ),
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "X-Requested-With": "XMLHttpRequest",
                "Connection": "close",
            },
            use_proxy=use_proxy,
            retry_total=2,
            retry_backoff_factor=1,
            retry_status_forcelist=(429, 500, 502, 503, 504),
            retry_allowed_methods=("GET",),
        )

    @staticmethod
    def _backoff_delay(attempt: int) -> float:
        """Return an exponential back-off delay (seconds) with random jitter."""
        return (2 ** attempt) + random.uniform(0.2, 0.8)

    def _get_text(
        self,
        url: str,
        timeout: int = 20,
        max_retries: int = 3,
        referer: Optional[str] = None,
    ) -> str:
        """Fetch *url* and return the response body, retrying on failure.

        Args:
            url: Address to request.
            timeout: Per-request timeout passed to the client.
            max_retries: Total number of attempts.
            referer: ``Referer`` header value; ``SITE_BASE`` when omitted.

        Returns:
            The response text of the first attempt with a status below 400.

        Raises:
            RequestClientError: When the last attempt ends with a status of
                400 or above, or when ``max_retries`` is not positive.
            Exception: Whatever the client raised on the last attempt.
        """
        headers = {"Referer": referer or SITE_BASE}

        for attempt in range(max_retries):
            is_last = attempt >= max_retries - 1
            wait_for_request()
            try:
                # NOTE(review): TLS certificate verification is disabled here.
                resp = self.client.get_text(url, timeout=timeout, verify=False, headers=headers)
                code = resp.status_code
                if code == 403 and not is_last:
                    # Blocked: refresh the client before backing off and retrying.
                    self.client.refresh()
                    time.sleep(self._backoff_delay(attempt))
                    continue
                if code >= 500 and not is_last:
                    time.sleep(self._backoff_delay(attempt))
                    continue
                if code >= 400:
                    raise RequestClientError(f"{code} Error: {url}")
                return resp.text
            except Exception:
                # Any failure, including the HTTP-status error raised just
                # above, is retried until the final attempt, which re-raises.
                if is_last:
                    raise
                time.sleep(self._backoff_delay(attempt))

        # Only reachable when max_retries <= 0 (the loop body never ran).
        raise RequestClientError(f"Unknown request error: {url}")

    def _parse_city_js_array(self, script_text: str, var_name: str) -> List[Dict]:
        """Extract the array assigned to ``var <var_name> = [...];`` in a script.

        The array text is parsed as a Python literal first; if that fails it
        is parsed as JSON, which covers ``null`` / ``true`` / ``false``.

        Returns:
            The dict entries of the array, or ``[]`` when the variable is
            missing or its value cannot be parsed as a list.
        """
        pattern = rf"var\s+{re.escape(var_name)}\s*=\s*(\[[\s\S]*?\]);"
        match = re.search(pattern, script_text or "")
        if not match:
            return []

        raw = match.group(1)
        try:
            rows = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            try:
                rows = json.loads(raw)
            except ValueError:
                return []

        if not isinstance(rows, list):
            return []
        # Callers use .get() on every row, so anything that is not a dict is dropped.
        return [row for row in rows if isinstance(row, dict)]

    def discover_cities(self) -> List["CityTarget"]:
        """Download the area script and build one ``CityTarget`` per city.

        Cities missing a pinyin slug, name or id are skipped, as are repeated
        pinyin slugs. When a city's province is unknown, the city's own name
        and slug stand in for the province fields.
        """
        js_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyers/")
        provinces = self._parse_city_js_array(js_text, "iosProvinces")
        cities = self._parse_city_js_array(js_text, "iosCitys")

        province_map: Dict[str, Dict] = {}
        for item in provinces:
            pid = str(item.get("id") or "").strip()
            if pid:
                province_map[pid] = item

        results: List["CityTarget"] = []
        seen_py: Set[str] = set()
        for city in cities:
            city_py = str(city.get("pinyin") or "").strip()
            city_name = str(city.get("value") or "").strip()
            city_id = str(city.get("id") or "").strip()
            province_id = str(city.get("parentId") or "").strip()
            if not city_py or not city_name or not city_id:
                continue
            if city_py in seen_py:
                continue
            seen_py.add(city_py)

            province_row = province_map.get(province_id, {})
            results.append(
                CityTarget(
                    province_id=province_id,
                    province_name=str(province_row.get("value") or city_name).strip(),
                    province_py=str(province_row.get("pinyin") or city_py).strip(),
                    city_id=city_id,
                    city_name=city_name,
                    city_py=city_py,
                )
            )
        return results

    def _parse_list_payload(self, text: str) -> Dict:
        """Decode a list-endpoint response body into a dict.

        Tolerates a leading BOM, a wrapper around the JSON object (the text
        between the first ``{`` and the last ``}`` is used) and a JSON string
        that itself contains JSON.

        Returns:
            The decoded object, or ``{}`` when no JSON object can be located
            or the decoded value is not a dict.

        Raises:
            ValueError: When the text between the outer braces is not valid JSON.
        """
        cleaned = (text or "").strip().lstrip("\ufeff")
        try:
            payload = json.loads(cleaned)
        except ValueError:
            start = cleaned.find("{")
            end = cleaned.rfind("}")
            if start == -1 or end < start:
                return {}
            payload = json.loads(cleaned[start:end + 1])

        if isinstance(payload, str):
            # Double-encoded body: the outer JSON value is a string holding JSON.
            try:
                payload = json.loads(payload)
            except ValueError:
                return {}
        return payload if isinstance(payload, dict) else {}

    def fetch_list_page(self, city_py: str, page: int) -> Tuple[List[Dict], bool, str]:
        """Fetch one list page for a city.

        Returns:
            ``(items, has_more, list_url)``. ``items`` is empty and
            ``has_more`` is ``False`` when the payload's ``errcode`` is not 0.
        """
        list_url = LIST_URL_TEMPLATE.format(city_py=city_py, page=page)
        referer = f"{SITE_BASE}/{city_py}/q_lawyer/"
        payload = self._parse_list_payload(self._get_text(list_url, referer=referer))

        # Compared as text so 0 and "0" are both accepted, like has_more below.
        if str(payload.get("errcode")) != "0":
            return [], False, list_url

        data = payload.get("data") or {}
        if not isinstance(data, dict):
            data = {}
        items = data.get("lawyer_list") or []
        if not isinstance(items, list):
            items = []
        has_more = str(data.get("has_more", "0")) == "1"
        return items, has_more, list_url

    def crawl_city(self, target: "CityTarget") -> Iterable[Dict]:
        """Yield one record per lawyer listed for *target*.

        Paging stops at ``max_pages``, on a failed or empty page, or when the
        endpoint reports that there are no more pages.
        """
        for page in range(1, self.max_pages + 1):
            try:
                items, has_more, list_url = self.fetch_list_page(target.city_py, page)
            except Exception as exc:
                print(f"[list] 失败 {target.city_py} p{page}: {exc}")
                break

            if not items:
                break

            for item in items:
                if not isinstance(item, dict):
                    continue

                own_url = str(item.get("siteask_m") or item.get("site_url") or "").strip()
                has_own_url = own_url.startswith("http")
                # Without a usable detail link, the list page is recorded instead.
                detail_url = own_url if has_own_url else list_url

                phone = normalize_phone(item.get("mobile", ""))
                profile = {
                    "uid": str(item.get("uid") or ""),
                    "name": str(item.get("username") or "").strip(),
                    "law_firm": str(item.get("lawyer_lawroom") or "").strip(),
                    "phone": phone,
                    "lawyer_year": item.get("lawyer_year"),
                    "service_area": str(item.get("service_area") or "").strip(),
                    "address": str(item.get("addr") or "").strip(),
                    "specialties": item.get("professionArr") or [],
                    "answer_count": item.get("ansnum"),
                    "comment_count": item.get("askcommentnum"),
                }

                now = int(time.time())
                uid = profile["uid"]
                if uid:
                    record_key = uid
                elif has_own_url:
                    record_key = detail_url
                else:
                    # The list URL is shared by every item on the page, so it
                    # cannot identify a record; use the item's own fields.
                    record_key = "|".join(
                        (target.city_py, profile["name"], profile["law_firm"], phone)
                    )
                record_id = hashlib.md5(record_key.encode("utf-8")).hexdigest()

                area = item.get("areaInfo") or {}
                if not isinstance(area, dict):
                    area = {}

                yield {
                    "record_id": record_id,
                    "collected_at": now,
                    "source": {
                        "site": SITE_NAME,
                        "list_url": list_url,
                        "detail_url": detail_url,
                        "province": str(area.get("province") or target.province_name),
                        "province_py": target.province_py,
                        "city": str(area.get("city") or target.city_name),
                        "city_py": target.city_py,
                        "page": page,
                    },
                    "list_snapshot": {
                        "uid": uid,
                        "name": profile["name"],
                        "law_firm": profile["law_firm"],
                        "answer_count": profile["answer_count"],
                        "comment_count": profile["comment_count"],
                    },
                    "profile": profile,
                    "raw": item,
                }

                if self.sleep_seconds:
                    time.sleep(self.sleep_seconds)

            if not has_more:
                break

    def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, object]]:
        """Convert a crawl record into a ``lawyer`` table row.

        Returns:
            The row dict, or ``None`` when the record has no valid mobile number.
        """
        source = record.get("source", {}) or {}
        profile = record.get("profile", {}) or {}

        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
            return None

        province = (source.get("province") or "").strip()
        city = (source.get("city") or province).strip()
        return {
            "name": (profile.get("name") or "").strip(),
            "law_firm": (profile.get("law_firm") or "").strip(),
            "province": province,
            "city": city,
            "phone": phone,
            "url": (source.get("detail_url") or source.get("list_url") or "").strip(),
            "domain": LEGACY_DOMAIN,
            "create_time": int(record.get("collected_at") or time.time()),
            "params": json.dumps(record, ensure_ascii=False),
        }

    def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
        """Return the subset of *phones* already stored for ``LEGACY_DOMAIN``.

        Phones are de-duplicated and queried in chunks of 500. Returns an
        empty set when there is no database connection or nothing to check.
        """
        if not self.db or not phones:
            return set()
        deduped = sorted({p for p in phones if p})
        if not deduped:
            return set()

        existing: Set[str] = set()
        cur = self.db.db.cursor()
        try:
            chunk_size = 500
            for i in range(0, len(deduped), chunk_size):
                chunk = deduped[i:i + chunk_size]
                placeholders = ",".join(["%s"] * len(chunk))
                sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
                cur.execute(sql, [LEGACY_DOMAIN, *chunk])
                for row in cur.fetchall():
                    # Accept both mapping-style and sequence-style cursor rows.
                    existing.add(row["phone"] if isinstance(row, dict) else row[0])
        finally:
            cur.close()
        return existing

    def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
        """Insert records with a phone number into the ``lawyer`` table.

        Returns:
            ``(inserted, skipped)``. Rows whose phone already exists (in the
            database or earlier in this batch) and rows whose insert fails
            count as skipped. ``(0, 0)`` without a database connection.
        """
        if not self.db:
            return 0, 0

        rows: List[Dict[str, object]] = []
        for record in records:
            row = self._to_legacy_lawyer_row(record)
            if row:
                rows.append(row)
        if not rows:
            return 0, 0

        existing = self._existing_phones_in_db([row["phone"] for row in rows])
        inserted = 0
        skipped = 0
        for row in rows:
            phone = row.get("phone", "")
            if not phone or phone in existing:
                skipped += 1
                continue
            try:
                self.db.insert_data("lawyer", row)
                existing.add(phone)
                inserted += 1
            except Exception as exc:
                skipped += 1
                print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
        return inserted, skipped

    @staticmethod
    def _load_seen_ids(output_path: str) -> Set[str]:
        """Collect the ``record_id`` values already present in a JSONL file."""
        seen_ids: Set[str] = set()
        with open(output_path, "r", encoding="utf-8") as old_file:
            for line in old_file:
                line = line.strip()
                if not line:
                    continue
                try:
                    item = json.loads(line)
                except ValueError:
                    continue
                # A valid JSON line that is not an object carries no record id.
                if not isinstance(item, dict):
                    continue
                rid = item.get("record_id")
                if rid:
                    seen_ids.add(rid)
        return seen_ids

    def crawl(
        self,
        output_path: str,
        max_cities: int = 0,
        city_filter: Optional[str] = None,
    ) -> None:
        """Crawl the selected cities, appending new records to *output_path*.

        Args:
            output_path: JSONL file to append to. Records whose ``record_id``
                is already in the file are not written again.
            max_cities: Keep only the first N cities when positive.
            city_filter: Case-insensitive substring matched against each
                city's pinyin slug and name.
        """
        cities = self.discover_cities()
        print(f"[discover] 共发现城市 {len(cities)} 个")

        if city_filter:
            key = city_filter.strip().lower()
            cities = [c for c in cities if key in c.city_py.lower() or key in c.city_name.lower()]
            print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")

        if max_cities > 0:
            cities = cities[:max_cities]
            print(f"[discover] 截断城市数 {len(cities)}")

        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

        seen_ids: Set[str] = set()
        if os.path.exists(output_path):
            seen_ids = self._load_seen_ids(output_path)
            print(f"[resume] 已有记录 {len(seen_ids)} 条")

        total_new_json = 0
        total_new_db = 0
        total_skip_db = 0

        with open(output_path, "a", encoding="utf-8") as out:
            for idx, target in enumerate(cities, start=1):
                print(
                    f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
                    f"({target.city_py})"
                )
                # Materialised because the records feed both the file and the DB.
                city_records = list(self.crawl_city(target))

                city_new_json = 0
                for record in city_records:
                    rid = record["record_id"]
                    if rid in seen_ids:
                        continue
                    out.write(json.dumps(record, ensure_ascii=False) + "\n")
                    seen_ids.add(rid)
                    city_new_json += 1
                    total_new_json += 1
                # Flush per city so an interrupted run keeps finished cities.
                out.flush()

                city_new_db, city_skip_db = self._write_records_to_db(city_records)
                total_new_db += city_new_db
                total_skip_db += city_skip_db
                print(
                    f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
                    f"DB新增{city_new_db}条, DB跳过{city_skip_db}条"
                )

        print(
            f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
            f"DB跳过{total_skip_db}条, 输出: {output_path}"
        )
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the crawler's command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what existing callers get.

    Returns:
        Namespace with ``output``, ``max_cities``, ``max_pages``,
        ``city_filter``, ``sleep``, ``direct`` and ``no_db``.
    """
    parser = argparse.ArgumentParser(description="找法网全新采集脚本(重写版)")
    parser.add_argument(
        "--output",
        default="/www/wwwroot/lawyers/data/findlaw_records_all.jsonl",
        help="输出 jsonl 文件路径",
    )
    parser.add_argument(
        "--max-cities",
        type=int,
        default=0,
        help="最多采集多少个城市,0 表示不限",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=9999,
        help="每个城市最多采集多少页",
    )
    parser.add_argument(
        "--city-filter",
        default="",
        help="按城市拼音或城市名过滤,如 beijing",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=0.1,
        help="每条记录采集间隔秒数",
    )
    parser.add_argument(
        "--direct",
        action="store_true",
        help="直连模式,不使用 proxy_settings.json 代理",
    )
    parser.add_argument(
        "--no-db",
        action="store_true",
        help="只输出 JSONL,不写入数据库",
    )
    return parser.parse_args(argv)
def main():
    """Entry point: parse options and run the crawl, with or without a database."""
    args = parse_args()

    def run_crawl(db_connection) -> None:
        # Build the crawler from the CLI options and start crawling.
        crawler = FindlawCrawler(
            max_pages=args.max_pages,
            sleep_seconds=args.sleep,
            use_proxy=not args.direct,
            db_connection=db_connection,
        )
        crawler.crawl(
            output_path=args.output,
            max_cities=args.max_cities,
            city_filter=args.city_filter or None,
        )

    if args.no_db:
        # JSONL-only mode: no database connection is opened at all.
        run_crawl(None)
        return

    with Db() as db:
        run_crawl(db)
# Script entry point: delegate to main(), which opens the database itself
# unless --no-db is given.
if __name__ == "__main__":
    main()