feat: enhance project configuration and improve data export functionality

- Updated `.gitignore` to streamline the ignore rules and exclude log files for common sites.
- Expanded `config.py` with new configurations for Weixin and Redis, and improved database connection settings.
- Refined `README.md` to clarify project structure and usage instructions.
- Enhanced `requirements.txt` with additional dependencies for MongoDB and Redis support.
- Refactored multiple spider scripts to use a session-based approach for HTTP requests, improving error handling and proxy management (a simplified sketch follows this list).
- Updated `export_lawyers_excel.py` to include a default timestamp for data exports (see the second sketch below).
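
The session-based pattern adopted across the refactored spiders looks roughly like the sketch below. This is a minimal illustration, not the exact implementation: `build_session` and `post_with_retry` are illustrative names, and the real scripts pull proxies from `request.proxy_config.get_proxies` and keep one session per worker thread.

```python
import random
import time

import requests


def build_session(proxies=None):
    """Create a requests.Session with shared headers and optional proxies."""
    session = requests.Session()
    session.trust_env = False  # ignore proxy settings inherited from the environment
    if proxies:
        session.proxies.update(proxies)
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X)",
        "Connection": "close",
    })
    return session


def post_with_retry(session, url, data, max_retries=3, timeout=10):
    """POST with exponential backoff; rebuild the session when a 403 suggests the exit IP is blocked."""
    for attempt in range(max_retries):
        try:
            resp = session.post(url, data=data, timeout=timeout)
            if resp.status_code == 403 and attempt < max_retries - 1:
                time.sleep(2 ** attempt + random.uniform(0.3, 1.0))
                session = build_session(dict(session.proxies) or None)  # refresh the session before retrying
                continue
            resp.raise_for_status()
            return resp.text
        except requests.exceptions.RequestException as exc:
            print(f"request failed: {exc}")
            return None
    return None
```

Closing and rebuilding the session on a 403 gives each retry a fresh connection and, when a proxy pool is configured, a chance to rotate the exit IP.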
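The export default plausibly amounts to a timestamped output name so repeated runs do not overwrite each other. This is an assumption for illustration only; the actual change in `export_lawyers_excel.py` may attach the timestamp differently.

```python
from datetime import datetime


def default_export_path(prefix: str = "lawyers") -> str:
    """Return a default .xlsx filename carrying the current timestamp (illustrative helper)."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{prefix}_{stamp}.xlsx"


if __name__ == "__main__":
    print(default_export_path())  # e.g. lawyers_20260318_100225.xlsx
```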
This commit is contained in:
hello-dd-code
2026-03-18 10:02:25 +08:00
parent c2b77975c1
commit 38e7c284e8
14 changed files with 1665 additions and 3004 deletions
+264 -608
@@ -1,17 +1,11 @@
import argparse
import hashlib
import json
import os
import random
import re
import sys
import time
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Set, Tuple
from urllib.parse import urljoin
import urllib3
from bs4 import BeautifulSoup
import random
from typing import Dict, Optional, List, Set
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
@@ -21,237 +15,165 @@ if request_dir not in sys.path:
if project_root not in sys.path:
sys.path.append(project_root)
from Db import Db
from request.requests_client import RequestClientError, RequestsClient
from utils.rate_limiter import wait_for_request
import requests
import urllib3
from bs4 import BeautifulSoup
from request.proxy_config import get_proxies, report_proxy_status
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
SITE_NAME = "64365"
LEGACY_DOMAIN = "律图"
SITE_BASE = "https://m.64365.com"
AREA_DATA_URL = "https://image.64365.com/ui_v3/m/js/public/area-cate-data.js"
LIST_API_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
from Db import Db
PHONE_RE = re.compile(r"1[3-9]\d{9}")
YEAR_RE = re.compile(r"(\d+)\s*年")
DOMAIN = "律图"
LIST_URL = "https://m.64365.com/findLawyer/rpc/FindLawyer/LawyerRecommend/"
@dataclass
class CityTarget:
area_id: str
province_id: str
province_name: str
province_py: str
city_name: str
city_py: str
@dataclass
class ListCard:
detail_url: str
name: str
specialties: List[str]
score_text: str
service_text: str
def normalize_phone(text: str) -> str:
compact = re.sub(r"\D", "", text or "")
match = PHONE_RE.search(compact)
return match.group(0) if match else ""
class Six4365Crawler:
def __init__(
self,
max_pages: int = 9999,
sleep_seconds: float = 0.1,
use_proxy: bool = True,
db_connection=None,
):
self.max_pages = max_pages
self.sleep_seconds = max(0.0, sleep_seconds)
class Six4365Spider:
def __init__(self, db_connection):
self.db = db_connection
self.client = RequestsClient(
headers={
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Accept": "text/html, */*; q=0.01",
"Connection": "close",
},
use_proxy=use_proxy,
retry_total=2,
retry_backoff_factor=1,
retry_status_forcelist=(429, 500, 502, 503, 504),
retry_allowed_methods=("GET", "POST"),
)
self.session = self._build_session()
self.max_workers = int(os.getenv("SPIDER_WORKERS", "8"))
self._tls = threading.local()
self.cities = self._load_cities()
def _request_text(
self,
method: str,
url: str,
*,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
data: Optional[Dict] = None,
) -> str:
headers = {"Referer": referer}
last_error: Optional[Exception] = None
def _build_session(self) -> requests.Session:
report_proxy_status()
session = requests.Session()
session.trust_env = False
proxies = get_proxies()
if proxies:
session.proxies.update(proxies)
else:
session.proxies.clear()
session.headers.update({
"User-Agent": (
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
"Mobile/15E148 Safari/604.1"
),
"Connection": "close",
})
return session
for attempt in range(max_retries):
wait_for_request()
def _refresh_session(self) -> None:
try:
self.session.close()
except Exception:
pass
self.session = self._build_session()
def _get_thread_session(self) -> requests.Session:
"""requests.Session 不是严格线程安全:每个线程用独立 session(但共享同样代理/headers"""
s = getattr(self._tls, "session", None)
if s is not None:
return s
s = self._build_session()
s.headers.update(dict(self.session.headers))
self._tls.session = s
return s
def _refresh_thread_session(self) -> None:
s = getattr(self._tls, "session", None)
if s is not None:
try:
if method.upper() == "POST":
resp = self.client.post_text(
url,
timeout=timeout,
verify=False,
headers=headers,
data=data,
)
else:
resp = self.client.get_text(
url,
timeout=timeout,
verify=False,
headers=headers,
)
s.close()
except Exception:
pass
self._tls.session = None
code = resp.status_code
if code == 403:
if attempt < max_retries - 1:
self.client.refresh()
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise RequestClientError(f"{code} Error: {url}")
if code >= 500 and attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
if code >= 400:
raise RequestClientError(f"{code} Error: {url}")
return resp.text
def _existing_urls(self, urls: List[str]) -> Set[str]:
"""批量查重,减少 N 次 is_data_exist"""
if not urls:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
# Too many IN parameters can make the query fail, so process in chunks
chunk_size = 500
for i in range(0, len(urls), chunk_size):
chunk = urls[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT url FROM lawyer WHERE url IN ({placeholders})"
cur.execute(sql, chunk)
for row in cur.fetchall():
# pymysql returns rows as tuples by default
existing.add(row[0])
finally:
cur.close()
return existing
def _load_cities(self):
tables = ("area_new", "area2", "area")
last_error = None
for table in tables:
try:
provinces = self.db.select_data(
table,
"id, code, province",
"domain='64365' AND level=1"
) or []
cities = self.db.select_data(
table,
"code, city, province, pid",
"domain='64365' AND level=2"
) or []
except Exception as exc:
last_error = exc
if attempt < max_retries - 1:
time.sleep((2 ** attempt) + random.uniform(0.2, 0.8))
continue
raise
continue
if last_error is not None:
raise last_error
raise RequestClientError(f"Unknown request error: {url}")
if not cities:
continue
def _get_text(self, url: str, *, timeout: int = 20, max_retries: int = 3, referer: str = SITE_BASE) -> str:
return self._request_text(
"GET",
url,
timeout=timeout,
max_retries=max_retries,
referer=referer,
)
province_map = {row.get('id'): row for row in provinces}
data = {}
for city in cities:
province_row = province_map.get(city.get('pid'), {}) or {}
data[str(city.get('code'))] = {
"name": city.get('city'),
"province": city.get('province'),
"province_name": province_row.get('province', city.get('province')),
}
print(f"[律图] 城市来源表: {table}, 城市数: {len(cities)}")
return data
def _post_text(
self,
url: str,
*,
data: Dict,
timeout: int = 20,
max_retries: int = 3,
referer: str = SITE_BASE,
) -> str:
return self._request_text(
"POST",
url,
timeout=timeout,
max_retries=max_retries,
referer=referer,
data=data,
)
if last_error:
print(f"[律图] 加载地区数据失败: {last_error}")
print("[律图] 无城市数据(已尝试 area_new/area2/area")
return {}
def _extract_area_data(self, text: str) -> List[Dict]:
match = re.search(
r"lvtuData\.areaData\s*=\s*(\[[\s\S]*?\])\s*;\s*lvtuData\.categroyData",
text,
re.S,
)
if not match:
return []
raw = match.group(1)
try:
data = json.loads(raw)
except Exception:
return []
return data if isinstance(data, list) else []
def discover_cities(self) -> List[CityTarget]:
text = self._get_text(AREA_DATA_URL, referer=SITE_BASE + "/findlawyer/")
provinces = self._extract_area_data(text)
targets: List[CityTarget] = []
seen_area: Set[str] = set()
for province in provinces:
province_id = str(province.get("id") or "").strip()
province_name = str(province.get("name") or "").strip()
province_py = str(province.get("py") or "").strip()
child_rows = province.get("child") or []
# For regular provinces, child entries are prefecture-level cities; for municipalities they are districts, so the province-level id is used for crawling
if child_rows and any((row.get("child") or []) for row in child_rows):
for city in child_rows:
area_id = str(city.get("id") or "").strip()
city_name = str(city.get("name") or "").strip()
city_py = str(city.get("py") or "").strip()
if not area_id or not city_name:
def _post(self, payload: Dict[str, str], max_retries: int = 3) -> Optional[str]:
for attempt in range(max_retries):
try:
resp = self.session.post(LIST_URL, data=payload, timeout=10, verify=False)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f"403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
self._refresh_session()
time.sleep(wait_time)
continue
if area_id in seen_area:
continue
seen_area.add(area_id)
targets.append(
CityTarget(
area_id=area_id,
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_name=city_name,
city_py=city_py,
)
)
else:
if not province_id or not province_name:
continue
if province_id in seen_area:
continue
seen_area.add(province_id)
targets.append(
CityTarget(
area_id=province_id,
province_id=province_id,
province_name=province_name,
province_py=province_py,
city_name=province_name,
city_py=province_py,
)
)
print("请求失败: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error")
return text
except requests.exceptions.RequestException as exc:
print(f"请求失败: {exc}")
return None
return None
return targets
def _build_payload(self, area_id: str, page: int) -> Dict[str, str]:
ua = self.client.headers.get("User-Agent", "")
def _build_payload(self, city_code: str, page: int) -> Dict[str, str]:
return {
"AdCode": "",
"RegionId": str(area_id),
"RegionId": str(city_code),
"CategoryId": "",
"MaxNumber": "",
"OnlyData": "true",
"IgnoreButton": "",
"LawyerRecommendRequest[AreaId]": str(area_id),
"LawyerRecommendRequest[AreaId]": str(city_code),
"LawyerRecommendRequest[LawCategoryIds]": "",
"LawyerRecommendRequest[LawFirmPersonCount]": "",
"LawyerRecommendRequest[LawFirmScale]": "",
@@ -268,429 +190,163 @@ class Six4365Crawler:
"LawyerRecommendRequest[RefferUrl]": "",
"LawyerRecommendRequest[RequestUrl]": "https://m.64365.com/findlawyer/",
"LawyerRecommendRequest[resource_type_name]": "",
"LawyerRecommendRequest[UserAgent]": ua,
"LawyerRecommendRequest[UserAgent]": self.session.headers["User-Agent"],
"LawyerRecommendRequest[AddLawyerWithNoData]": "false",
"ShowCaseButton": "true",
}
def fetch_list_html(self, target: CityTarget, page: int) -> str:
payload = self._build_payload(target.area_id, page)
return self._post_text(
LIST_API_URL,
data=payload,
referer=SITE_BASE + "/findlawyer/",
)
def parse_list_cards(self, html: str) -> List[ListCard]:
def _parse_list(self, html: str, province: str, city: str) -> int:
soup = BeautifulSoup(html, "html.parser")
cards: List[ListCard] = []
seen: Set[str] = set()
lawyers = soup.find_all("a", class_="lawyer")
if not lawyers:
return 0
for anchor in soup.select("a.lawyer[href]"):
href = (anchor.get("href") or "").strip()
detail_urls: List[str] = []
for lawyer in lawyers:
href = lawyer.get("href")
if not href:
continue
detail_url = urljoin(SITE_BASE, href)
if detail_url in seen:
detail_urls.append(f"{href.rstrip('/')}/info/")
if not detail_urls:
return 0
results: List[Dict[str, str]] = []
with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
futs = [ex.submit(self._parse_detail, u, province, city) for u in detail_urls]
for fut in as_completed(futs):
try:
data = fut.result()
except Exception as exc:
print(f" 详情解析异常: {exc}")
continue
if data:
results.append(data)
if not results:
return len(detail_urls)
existing = self._existing_urls([r.get("url", "") for r in results if r.get("url")])
for data in results:
if not data:
continue
seen.add(detail_url)
url = data.get("url", "")
if not url:
continue
if url in existing:
print(f" -- 已存在URL: {url}")
continue
try:
self.db.insert_data("lawyer", data)
print(f" -> 新增: {data['name']} ({data['phone']})")
except Exception as exc:
print(f" 插入失败 {url}: {exc}")
name = ""
name_tag = anchor.select_one("b.name")
if name_tag:
name = name_tag.get_text(strip=True)
return len(detail_urls)
specialties: List[str] = []
skill_tag = anchor.select_one("div.skill")
if skill_tag:
raw = skill_tag.get_text(" ", strip=True).replace("擅长:", "")
specialties = [x.strip() for x in re.split(r"[、,]", raw) if x.strip()]
def _parse_detail(self, url: str, province: str, city: str) -> Optional[Dict[str, str]]:
html = self._get_detail(url)
if not html:
return None
score_text = ""
score_tag = anchor.select_one("div.info span[title='评分'] em")
if score_tag:
score_text = score_tag.get_text(strip=True)
service_text = ""
service_tag = anchor.select_one("div.info")
if service_tag:
service_text = service_tag.get_text(" ", strip=True)
cards.append(
ListCard(
detail_url=detail_url,
name=name,
specialties=specialties,
score_text=score_text,
service_text=service_text,
)
)
return cards
def parse_detail(self, detail_url: str) -> Dict:
info_url = detail_url.rstrip("/") + "/info/"
html = self._get_text(info_url, referer=detail_url)
soup = BeautifulSoup(html, "html.parser")
base_info = soup.find("ul", class_="intro-basic-bar")
if not base_info:
return None
name = ""
law_firm = ""
phone = ""
practice_years: Optional[int] = None
office_area = ""
address = ""
specialties: List[str] = []
for li in soup.select("ul.intro-basic-bar li"):
label_tag = li.select_one("span.label")
value_tag = li.select_one("div.txt")
if not label_tag or not value_tag:
for li in base_info.find_all("li"):
label = li.find("span", class_="label")
txt = li.find("div", class_="txt")
if not label or not txt:
continue
label_text = label.get_text(strip=True)
if "姓名" in label_text:
name = txt.get_text(strip=True)
if "执业律所" in label_text:
law_firm = txt.get_text(strip=True)
label = label_tag.get_text(" ", strip=True).replace("：", "")
value = value_tag.get_text(" ", strip=True)
more_section = soup.find("div", class_="more-intro-basic")
if more_section:
phone_ul = more_section.find("ul", class_="intro-basic-bar")
if phone_ul:
for li in phone_ul.find_all("li"):
label = li.find("span", class_="label")
txt = li.find("div", class_="txt")
if label and txt and "联系电话" in label.get_text(strip=True):
phone = txt.get_text(strip=True).replace(" ", "")
break
if "姓名" in label and not name:
name = value
elif "执业律所" in label and not law_firm:
law_firm = value
elif "联系电话" in label and not phone:
phone = normalize_phone(value)
elif "执业年限" in label and practice_years is None:
year_match = YEAR_RE.search(value)
if year_match:
try:
practice_years = int(year_match.group(1))
except Exception:
practice_years = None
elif "办公地区" in label and not office_area:
office_area = value
elif "办公地址" in label and not address:
address = value
text = soup.get_text(" ", strip=True)
if not phone:
phone = normalize_phone(text)
if not name and soup.title:
title = soup.title.get_text(" ", strip=True)
match = re.search(r"([^\s_,。]+?)律师", title)
if match:
name = match.group(1).strip()
skill_match = re.search(r"擅长:([^\n]+)", text)
if skill_match:
specialties = [x.strip() for x in re.split(r"[、,]", skill_match.group(1)) if x.strip()]
return {
"name": name,
"law_firm": law_firm,
"phone": phone,
"practice_years": practice_years,
"office_area": office_area,
"address": address,
"specialties": specialties,
"detail_url": detail_url,
"info_url": info_url,
}
def crawl_city(self, target: CityTarget) -> Iterable[Dict]:
seen_detail_urls: Set[str] = set()
page_first_seen: Set[str] = set()
for page in range(1, self.max_pages + 1):
try:
html = self.fetch_list_html(target, page)
except Exception as exc:
print(f"[list] 失败 area={target.area_id} p{page}: {exc}")
break
cards = self.parse_list_cards(html)
if not cards:
break
first_url = cards[0].detail_url
if first_url in page_first_seen:
break
page_first_seen.add(first_url)
for card in cards:
if card.detail_url in seen_detail_urls:
continue
seen_detail_urls.add(card.detail_url)
try:
detail = self.parse_detail(card.detail_url)
except Exception as exc:
print(f"[detail] 失败 {card.detail_url}: {exc}")
continue
now = int(time.time())
uid_match = re.search(r"/lawyer/(\d+)/", card.detail_url)
uid = uid_match.group(1) if uid_match else card.detail_url
record_id = hashlib.md5(uid.encode("utf-8")).hexdigest()
yield {
"record_id": record_id,
"collected_at": now,
"source": {
"site": SITE_NAME,
"province_id": target.province_id,
"province": target.province_name,
"province_py": target.province_py,
"area_id": target.area_id,
"city": target.city_name,
"city_py": target.city_py,
"page": page,
"detail_url": card.detail_url,
"info_url": detail.get("info_url", ""),
},
"list_snapshot": {
"name": card.name,
"specialties": card.specialties,
"score_text": card.score_text,
"service_text": card.service_text,
},
"profile": {
"name": detail.get("name") or card.name,
"law_firm": detail.get("law_firm") or "",
"phone": detail.get("phone") or "",
"practice_years": detail.get("practice_years"),
"office_area": detail.get("office_area") or "",
"address": detail.get("address") or "",
"specialties": detail.get("specialties") or card.specialties,
},
}
if self.sleep_seconds:
time.sleep(self.sleep_seconds)
def _to_legacy_lawyer_row(self, record: Dict) -> Optional[Dict[str, str]]:
source = record.get("source", {}) or {}
profile = record.get("profile", {}) or {}
phone = normalize_phone(profile.get("phone", ""))
if not phone:
phone = phone.replace('-', '').strip()
if not name or not phone:
return None
province = (source.get("province") or "").strip()
city = (source.get("city") or province).strip()
return {
"name": (profile.get("name") or "").strip(),
"law_firm": (profile.get("law_firm") or "").strip(),
data = {
"phone": phone,
"province": province,
"city": city,
"phone": phone,
"url": (source.get("info_url") or source.get("detail_url") or "").strip(),
"domain": LEGACY_DOMAIN,
"create_time": int(record.get("collected_at") or time.time()),
"params": json.dumps(record, ensure_ascii=False),
"law_firm": law_firm,
"url": url,
"domain": DOMAIN,
"name": name,
"create_time": int(time.time()),
"params": json.dumps({"province": province, "city": city}, ensure_ascii=False)
}
return data
def _existing_phones_in_db(self, phones: List[str]) -> Set[str]:
if not self.db or not phones:
return set()
deduped = sorted({p for p in phones if p})
if not deduped:
return set()
existing: Set[str] = set()
cur = self.db.db.cursor()
try:
chunk_size = 500
for i in range(0, len(deduped), chunk_size):
chunk = deduped[i:i + chunk_size]
placeholders = ",".join(["%s"] * len(chunk))
sql = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({placeholders})"
cur.execute(sql, [LEGACY_DOMAIN, *chunk])
for row in cur.fetchall():
existing.add(row[0])
finally:
cur.close()
return existing
def _write_records_to_db(self, records: List[Dict]) -> Tuple[int, int]:
if not self.db:
return 0, 0
rows: List[Dict[str, str]] = []
for record in records:
row = self._to_legacy_lawyer_row(record)
if row:
rows.append(row)
if not rows:
return 0, 0
existing = self._existing_phones_in_db([row["phone"] for row in rows])
inserted = 0
skipped = 0
for row in rows:
phone = row.get("phone", "")
if not phone or phone in existing:
skipped += 1
continue
def _get_detail(self, url: str, max_retries: int = 3) -> Optional[str]:
session = self._get_thread_session()
for attempt in range(max_retries):
try:
self.db.insert_data("lawyer", row)
existing.add(phone)
inserted += 1
except Exception as exc:
skipped += 1
print(f"[db] 插入失败 phone={phone} url={row.get('url', '')}: {exc}")
return inserted, skipped
def crawl(
self,
output_path: str,
max_cities: int = 0,
city_filter: Optional[str] = None,
) -> None:
cities = self.discover_cities()
print(f"[discover] 共发现地区 {len(cities)}")
if city_filter:
key = city_filter.strip().lower()
cities = [
c for c in cities
if key in c.city_name.lower() or key in c.city_py.lower() or key in c.area_id
]
print(f"[discover] 过滤后地区 {len(cities)} 个, filter={city_filter}")
if max_cities > 0:
cities = cities[:max_cities]
print(f"[discover] 截断地区数 {len(cities)}")
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
seen_ids: Set[str] = set()
if os.path.exists(output_path):
with open(output_path, "r", encoding="utf-8") as old_file:
for line in old_file:
line = line.strip()
if not line:
resp = session.get(url, timeout=10, verify=False)
status_code = resp.status_code
text = resp.text
resp.close()
if status_code == 403:
if attempt < max_retries - 1:
wait_time = 2 ** attempt + random.uniform(0.3, 1.0)
print(f" 403被拦截,{wait_time}秒后重试 ({attempt + 1}/{max_retries})")
self._refresh_thread_session()
session = self._get_thread_session()
time.sleep(wait_time)
continue
try:
item = json.loads(line)
except Exception:
continue
rid = item.get("record_id")
if rid:
seen_ids.add(rid)
print(f"[resume] 已有记录 {len(seen_ids)}")
print(" 请求失败: 403 Forbidden")
return None
if status_code >= 400:
raise requests.exceptions.HTTPError(f"{status_code} Error")
return text
except requests.exceptions.RequestException as exc:
print(f" 请求失败: {exc}")
return None
return None
total_new_json = 0
total_new_db = 0
total_skip_db = 0
def run(self):
print("启动律图采集...")
if not self.cities:
print("无城市数据")
return
with open(output_path, "a", encoding="utf-8") as out:
for idx, target in enumerate(cities, start=1):
print(
f"[city {idx}/{len(cities)}] {target.province_name}-{target.city_name} "
f"(area={target.area_id})"
)
city_records = list(self.crawl_city(target))
city_new_json = 0
for record in city_records:
rid = record["record_id"]
if rid in seen_ids:
continue
out.write(json.dumps(record, ensure_ascii=False) + "\n")
seen_ids.add(rid)
city_new_json += 1
total_new_json += 1
city_new_db, city_skip_db = self._write_records_to_db(city_records)
total_new_db += city_new_db
total_skip_db += city_skip_db
print(
f"[city] 采集{len(city_records)}条, JSON新增{city_new_json}条, "
f"DB新增{city_new_db}条, DB跳过{city_skip_db}"
)
print(
f"[done] JSON新增{total_new_json}条, DB新增{total_new_db}条, "
f"DB跳过{total_skip_db}条, 输出: {output_path}"
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="律图全新采集脚本(站点数据直采)")
parser.add_argument(
"--output",
default="/www/wwwroot/lawyers/data/six4365_records_all.jsonl",
help="输出 jsonl 文件路径",
)
parser.add_argument(
"--max-cities",
type=int,
default=0,
help="最多采集多少个地区,0 表示不限",
)
parser.add_argument(
"--max-pages",
type=int,
default=9999,
help="每个地区最多采集多少页",
)
parser.add_argument(
"--city-filter",
default="",
help="按城市名称/拼音/编码过滤",
)
parser.add_argument(
"--sleep",
type=float,
default=0.1,
help="详情页请求间隔秒数",
)
parser.add_argument(
"--direct",
action="store_true",
help="直连模式,不使用 proxy_settings.json 代理",
)
parser.add_argument(
"--no-db",
action="store_true",
help="只输出 JSONL,不写入数据库",
)
return parser.parse_args()
def main():
args = parse_args()
if args.no_db:
crawler = Six4365Crawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=None,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
return
with Db() as db:
crawler = Six4365Crawler(
max_pages=args.max_pages,
sleep_seconds=args.sleep,
use_proxy=not args.direct,
db_connection=db,
)
crawler.crawl(
output_path=args.output,
max_cities=args.max_cities,
city_filter=args.city_filter or None,
)
for city_code, info in self.cities.items():
province = info.get("province_name", "")
city = info.get("name", "")
print(f"采集 {province}-{city}")
page = 1
while True:
payload = self._build_payload(city_code, page)
html = self._post(payload)
if not html:
break
link_count = self._parse_list(html, province, city)
if link_count == 0:
break
page += 1
print("律图采集完成")
if __name__ == "__main__":
main()
with Db() as db:
spider = Six4365Spider(db)
spider.run()