Refactor the crawler scripts and add time-filtered Excel export

- Unify the collection logic and launch scripts of the five sites
- Add the dls_fresh collection flow and improve logging
- Add export_lawyers_excel for exporting by time range
- Default to exporting the last 7 days and support parsing extended fields
- Tidy up .gitignore to exclude local data/logs artifacts
hello-dd-code
2026-03-02 11:46:05 +08:00
parent 03847a4b8e
commit 19cf9ce901
12 changed files with 3545 additions and 1059 deletions
+606 -272
@@ -1,10 +1,18 @@
import argparse
import ast
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urljoin
import urllib3
from bs4 import BeautifulSoup
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -14,312 +22,638 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
from Db import Db
from config import HEADERS
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
SITE_NAME = "hualv"
LEGACY_DOMAIN = "华律"
SITE_BASE = "https://m.66law.cn"
CITY_DATA_URL = "https://cache.66law.cn/dist/main-v2.0.js"
LIST_API_URL = "https://m.66law.cn/findlawyer/rpc/loadlawyerlist/"
PHONE_RE = re.compile(r"1[3-9]\d{9}")
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
YEAR_RE = re.compile(r"执业\s*(\d+)\s*年")
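# PHONE_RE matches mainland-China mobile numbers, EMAIL_RE matches generic e-mail
# addresses, and YEAR_RE pulls the year count out of "执业 N 年" snippets.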
@dataclass
class CityTarget:
province_id: int
province_name: str
city_id: int
city_name: str
def normalize_phone(text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_RE.search(compact)
return match.group(0) if match else ""
def strip_html_tags(text: str) -> str:
return re.sub(r"<[^>]+>", "", text or "").strip()
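# HualvCrawler pipeline: discover province/city ids from the site's main-v2.0.js,
# page through the mobile list API per city, fetch each lawyer's contact page,
# append structured records to a JSONL file, and optionally mirror them into the
# legacy `lawyer` table.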
class HualvCrawler:
def __init__(
self,
max_pages: int = 9999,
sleep_seconds: float = 0.15,
use_proxy: bool = True,
db_connection=None,
):
self.max_pages = max_pages
self.sleep_seconds = max(0.0, sleep_seconds)
self.db = db_connection
self.client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
"Connection": "close",
},
use_proxy=use_proxy,
retry_total=2,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET", "POST"),
)
custom_headers["Connection"] = "close"
return RequestsClient(headers=custom_headers)
def _refresh_session(self) -> None:
self.client.refresh()
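    # _request_text below centralises GET/POST handling: it rate-limits via
    # wait_for_request(), refreshes the session and backs off exponentially on 403,
    # retries 5xx responses, and re-raises the last error once retries are exhausted.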
def _request_text(
self,
method: str,
url: str,
*,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
data: Optional[Dict] = None,
) -> str:
headers = {"Referer": referer}
last_error: Optional[Exception] = None
for attempt in range(max_retries):
wait_for_request()
try:
if method.upper() == "POST":
resp = self.client.post_text(
url,
timeout=timeout,
verify=False,
headers=headers,
data=data,
)
else:
resp = self.client.get_text(
url,
timeout=timeout,
verify=False,
headers=headers,
)
code = resp.status_code
if code == 403:
if attempt < max_retries - 1:
self.client.refresh()
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise RequestClientError(f"{code} Error: {url}")
if code >= 500 and attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
if code >= 400:
raise RequestClientError(f"{code} Error: {url}")
return resp.text
except Exception as exc:
last_error = exc
if attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise
if last_error is not None:
raise last_error
raise RequestClientError(f"Unknown request error: {url}")
def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
return self._request_text(
"GET",
url,
timeout=timeout,
max_retries=max_retries,
referer=referer,
)
def _post_text(
self,
url: str,
*,
data: Dict,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
) -> str:
return self._request_text(
"POST",
url,
timeout=timeout,
max_retries=max_retries,
referer=referer,
data=data,
)
def _extract_spc_location(self, script_text: str) -> List:
        # main-v2.0.js embeds sPCLocation = new Array(...); the cateinfo array follows right after it.
marker = "sPCLocation = new Array("
start = script_text.find(marker)
if start == -1:
marker = "sPCLocation=new Array("
start = script_text.find(marker)
if start == -1:
return []
start += len(marker)
next_marker = script_text.find("cateinfo = new Array(", start)
if next_marker == -1:
next_marker = script_text.find("cateinfo=new Array(", start)
if next_marker != -1:
end = script_text.rfind(");", start, next_marker)
else:
end = script_text.find(");", start)
if end == -1 or end <= start:
return []
raw = "[" + script_text[start:end] + "]"
try:
data = ast.literal_eval(raw)
except Exception:
return []
return data if isinstance(data, list) else []
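    # discover_cities expects sPCLocation rows shaped roughly like
    # [province_id, "省名", [[city_id, "市名"], ...]]; entries that do not match this
    # nested-list layout are skipped (sample shape inferred from the parsing below).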
def discover_cities(self) -> List[CityTarget]:
script_text = self._get_text(CITY_DATA_URL, referer=SITE_BASE + "/findlawyer/")
rows = self._extract_spc_location(script_text)
targets: List[CityTarget] = []
seen: Set[Tuple[int, int]] = set()
for province in rows:
if not isinstance(province, list) or len(province) < 3:
continue
try:
province_id = int(province[0])
except Exception:
continue
province_name = str(province[1] or "").strip()
city_rows = province[2] if isinstance(province[2], list) else []
for city in city_rows:
if not isinstance(city, list) or len(city) < 2:
continue
try:
city_id = int(city[0])
except Exception:
continue
city_name = str(city[1] or "").strip()
if city_id <= 0 or not city_name:
continue
key = (province_id, city_id)
if key in seen:
continue
seen.add(key)
targets.append(
CityTarget(
province_id=province_id,
province_name=province_name,
city_id=city_id,
city_name=city_name,
)
)
return targets
def fetch_list_page(self, target: CityTarget, page: int) -> Tuple[List[Dict], int]:
payload = {
"pid": str(target.province_id),
"cid": str(target.city_id),
"page": str(page),
}
text = self._post_text(
LIST_API_URL,
data=payload,
referer=SITE_BASE + "/findlawyer/",
)
data = json.loads((text or "").strip().lstrip("\ufeff") or "{}")
items = data.get("lawyerList") or data.get("queryLawyerList") or []
if not isinstance(items, list):
items = []
page_count = 0
try:
page_count = int((data.get("lawyerItems") or {}).get("pageCount") or 0)
except Exception:
page_count = 0
return items, page_count
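    # fetch_list_page POSTs {"pid", "cid", "page"} to LIST_API_URL and reads
    # "lawyerList" (falling back to "queryLawyerList") plus "lawyerItems.pageCount"
    # from the JSON response.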
def parse_detail(self, detail_url: str) -> Dict:
contact_url = detail_url.rstrip("/") + "/lawyer_contact.aspx"
html = self._get_text(contact_url, referer=detail_url)
soup = BeautifulSoup(html, "html.parser")
full_text = soup.get_text(" ", strip=True)
name = ""
law_firm = ""
phone = ""
email = ""
address = ""
license_no = ""
practice_years: Optional[int] = None
name_tag = soup.select_one(".logo-box .title b")
if name_tag:
name = name_tag.get_text(strip=True).replace("律师", "").strip()
if not name and soup.title:
match = re.search(r"([^\s,,。_]+?)律师", soup.title.get_text(" ", strip=True))
if match:
name = match.group(1).strip()
phone_candidates = [
soup.select_one(".logo-box .r-bar .tel").get_text(" ", strip=True)
if soup.select_one(".logo-box .r-bar .tel")
else "",
soup.select_one(".lawyer-show ul.info").get_text(" ", strip=True)
if soup.select_one(".lawyer-show ul.info")
else "",
full_text,
]
for candidate in phone_candidates:
phone = normalize_phone(candidate)
if phone:
break
for li in soup.select(".lawyer-show ul.info li"):
li_text = li.get_text(" ", strip=True)
if ("事务所" in li_text or "法律服务所" in li_text) and not law_firm:
law_firm = li_text
if not law_firm:
match = re.search(r'"affiliation":\{"@type":"Organization","name":"([^"]+)"', html)
if match:
law_firm = match.group(1).strip()
match = re.search(r'"identifier":"([^"]+)"', html)
if match:
license_no = match.group(1).strip()
match = re.search(r'"streetAddress":"([^"]+)"', html)
if match:
address = match.group(1).strip()
email_match = EMAIL_RE.search(html)
if email_match:
email = email_match.group(0).strip()
year_match = YEAR_RE.search(full_text)
if year_match:
try:
practice_years = int(year_match.group(1))
except Exception:
practice_years = None
specialties = [node.get_text(strip=True) for node in soup.select(".tag-h38 span")]
specialties = [x for x in specialties if x]
return {
"name": name,
"law_firm": law_firm,
"phone": phone,
"email": email,
"address": address,
"license_no": license_no,
"practice_years": practice_years,
"specialties": specialties,
"detail_url": detail_url,
"contact_url": contact_url,
}
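    # crawl_city yields one dict per lawyer combining a stable record_id (md5 of the
    # lawyer/global user id or detail URL), the source location, a snapshot of the
    # list-API item, the parsed profile, and the raw list item.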
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
seen_details: Set[str] = set()
for page in range(1, self.max_pages + 1):
try:
items, page_count = self.fetch_list_page(target, page)
except Exception as exc:
print(f"[list] 失败 pid={target.province_id} cid={target.city_id} p{page}: {exc}")
break
if not items:
break
for item in items:
detail_url = str(item.get("lawyerUrl") or "").strip()
if not detail_url:
continue
if detail_url.startswith("//"):
detail_url = "https:" + detail_url
if not detail_url.startswith("http"):
detail_url = urljoin(SITE_BASE, detail_url)
if detail_url in seen_details:
continue
seen_details.add(detail_url)
try:
detail = self.parse_detail(detail_url)
except Exception as exc:
print(f"[detail] 失败 {detail_url}: {exc}")
continue
now = int(time.time())
uid = str(item.get("lawyerId") or item.get("globalUserId") or detail_url)
record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
list_name = str(item.get("name") or "").replace("律师", "").strip()
category_text = str(item.get("categoryNames") or "").strip()
category_arr = [x.strip() for x in re.split(r"[、,]", category_text) if x.strip()]
yield {
"record_id": record_id,
"collected_at": now,
"source": {
"site": SITE_NAME,
"province_id": target.province_id,
"province": target.province_name,
"city_id": target.city_id,
"city": target.city_name,
"page": page,
"detail_url": detail_url,
"contact_url": detail.get("contact_url", ""),
},
"list_snapshot": {
"lawyer_id": item.get("lawyerId"),
"name": list_name,
"category_names": category_arr,
"help_count": strip_html_tags(str(item.get("helpCount") or "")),
"comment_score": strip_html_tags(str(item.get("commentScore") or "")),
"response_time": str(item.get("responseTime") or "").strip(),
"year": item.get("year"),
"is_adv": bool(item.get("isAdv")),
},
"profile": {
"name": detail.get("name") or list_name,
"law_firm": detail.get("law_firm") or "",
"phone": detail.get("phone") or "",
"email": detail.get("email") or "",
"address": detail.get("address") or "",
"license_no": detail.get("license_no") or "",
"practice_years": detail.get("practice_years"),
"specialties": detail.get("specialties") or category_arr,
},
"raw": item,
}
if self.sleep_seconds:
time.sleep(self.sleep_seconds)
if page_count > 0 and page >= page_count:
break
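    # _to_legacy_lawyer_row flattens a crawl record into the legacy `lawyer` table
    # schema; records without a valid mobile number are dropped, and the full record
    # is preserved in the `params` JSON column.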
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
source = record.get("source", {}) or {}
profile = record.get("profile", {}) or {}
phone = normalize_phone(profile.get("phone", ""))
if not phone:
return None
province = (source.get("province") or "").strip()
city = (source.get("city") or province).strip()
return {
"name": (profile.get("name") or "").strip(),
"law_firm": (profile.get("law_firm") or "").strip(),
"province": province,
"city": city,
"law_firm": law_firm,
"url": contact_url,
"avatar_url": avatar_url,
"create_time": int(time.time()),
"site_time": site_time,
"domain": DOMAIN,
"name": name,
"params": json.dumps({"source": url}, ensure_ascii=False)
"phone": phone,
"url": (source.get("contact_url") or source.get("detail_url") or "").strip(),
"domain": LEGACY_DOMAIN,
"create_time": int(record.get("collected_at") or time.time()),
"params": json.dumps(record, ensure_ascii=False),
}
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
if not self.db or not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
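    # _write_records_to_db inserts only phones not already stored for the legacy
    # 华律 domain; existing numbers are looked up in 500-value chunks via
    # _existing_phones_in_db to keep the IN (...) clauses bounded.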
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
if not self.db:
return 0, 0
rows: List[Dict[str, str]] = []
for record in records:
row = self._to_legacy_lawyer_row(record)
if row:
rows.append(row)
if not rows:
return 0, 0
existing = self._existing_phones_in_db([row["phone"] for row in rows])
inserted = 0
skipped = 0
for row in rows:
phone = row.get("phone", "")
if not phone or phone in existing:
skipped += 1
continue
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
except Exception as exc:
skipped += 1
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
return inserted, skipped
for item in data["lawyerList"]:
result = self._parse_detail(item.get("lawyerUrl", ""), province_name, city_name)
if not result:
def crawl(
self,
output_path: str,
max_cities: int = 0,
city_filter: Optional[str] = None,
) -> None:
cities = self.discover_cities()
print(f"[discover] 共发现城市 {len(cities)}")
if city_filter:
key = city_filter.strip().lower()
cities = [
c for c in cities
if key in c.city_name.lower() or key in str(c.city_id).lower()
]
print(f"[discover] 过滤后城市 {len(cities)} 个, filter={city_filter}")
if max_cities > 0:
cities = cities[:max_cities]
print(f"[discover] 截断城市数 {len(cities)}")
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
seen_ids: Set[str] = set()
if os.path.exists(output_path):
with open(output_path, "r", encoding="utf-8") as old_file:
for line in old_file:
line = line.strip()
if not line:
continue
try:
self.db.insert_data("lawyer", result)
print(f" -> 新增: {result['name']} ({result['phone']})")
except Exception as exc:
print(f" 插入失败: {exc}")
time.sleep(1)
item = json.loads(line)
except Exception:
continue
rid = item.get("record_id")
if rid:
seen_ids.add(rid)
print(f"[resume] 已有记录 {len(seen_ids)}")
total_new_json = 0
total_new_db = 0
total_skip_db = 0
with open(output_path, "a", encoding="utf-8") as out:
for idx, target in enumerate(cities, start=1):
print(
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
f"(pid={target.province_id}, cid={target.city_id})"
)
city_records = list(self.crawl_city(target))
city_new_json = 0
for record in city_records:
rid = record["record_id"]
if rid in seen_ids:
continue
out.write(json.dumps(record, ensure_ascii=False) + "\n")
seen_ids.add(rid)
city_new_json += 1
total_new_json += 1
city_new_db, city_skip_db = self._write_records_to_db(city_records)
total_new_db += city_new_db
total_skip_db += city_skip_db
print(
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
f"DB新增{city_new_db}条, DB跳过{city_skip_db}"
)
print(
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
f"DB跳过{total_skip_db}条, 输出: {output_path}"
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="华律网全新采集脚本(站点数据直采)")
parser.add_argument(
"--output",
default="/www/wwwroot/lawyers/data/hualv_records_all.jsonl",
help="输出 jsonl 文件路径",
)
parser.add_argument(
"--max-cities",
type=int,
default=0,
help="最多采集多少个城市,0 表示不限",
)
parser.add_argument(
"--max-pages",
type=int,
default=9999,
help="每个城市最多采集多少页",
)
parser.add_argument(
"--city-filter",
default="",
help="按城市名称或城市编码过滤,如 beijing / 110100",
)
parser.add_argument(
"--sleep",
type=float,
default=0.15,
help="详情页请求间隔秒数",
)
parser.add_argument(
"--direct",
action="store_true",
help="直连模式,不使用 proxy_settings.json 代理",
)
parser.add_argument(
"--no-db",
action="store_true",
help="只输出 JSONL,不写入数据库",
)
return parser.parse_args()
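# Example invocations (script filename is illustrative; flags come from parse_args above):
#   python hualv_crawler.py --no-db --city-filter beijing --max-pages 2 --output data/hualv_sample.jsonl
#   python hualv_crawler.py --direct --sleep 0.5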
def main():
args = parse_args()
if args.no_db:
crawler = HualvCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=None,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
return
with Db() as db:
crawler = HualvCrawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=db,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
if __name__ == "__main__":
main()