Files
lawyers/one_off_sites/zhongfali_single.py
T
2026-03-06 23:57:43 +08:00

502 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
import argparse
import hashlib
import json
import os
import re
import sys
import time
from typing import Dict, List, Optional, Set, Tuple
import urllib3
# Make the sibling "request" package and the project root importable when this
# script is run directly (python zhongfali_single.py) rather than as a module.
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
request_dir = os.path.join(project_root, "request")
if request_dir not in sys.path:
    sys.path.insert(0, request_dir)
if project_root not in sys.path:
    sys.path.append(project_root)
from Db import Db
from request.requests_client import RequestClientError, RequestsClient

# Requests are issued with verify=False (see main()), so silence the
# InsecureRequestWarning noise urllib3 would otherwise print per request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

SITE_NAME = "zhongfali_single"  # source tag stored on every collected record
LEGACY_DOMAIN = "众法利单页"  # value written to the legacy lawyer.domain column
DEFAULT_URL = "http://m.zhongfali.com/h-pd-552.html#mid=3&groupId=196&desc=false"
DEFAULT_OUTPUT = "/www/wwwroot/lawyers/data/one_off_sites/zhongfali_records_all.jsonl"
# Mainland-China mobile numbers: 11 digits, starting 13x-19x.
PHONE_RE = re.compile(r"1[3-9]\d{9}")
# Captures the JSON object assigned to window.__INITIAL_STATE__ in the page HTML.
INITIAL_STATE_RE = re.compile(r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})</script>", re.S)
def normalize_phone(text: str) -> str:
    """Return the first mainland-China mobile number found in *text*, or ""."""
    digits_only = re.sub(r"\D", "", text or "")
    found = PHONE_RE.search(digits_only)
    if found is None:
        return ""
    return found.group(0)
def split_specialties(text: str) -> List[str]:
    """Split a free-form specialty string into an ordered, de-duplicated list."""
    raw = (text or "").strip()
    if not raw:
        return []
    tokens = (chunk.strip() for chunk in re.split(r"[、,;\s]+", raw))
    # dict.fromkeys preserves first-seen order while dropping duplicates.
    return list(dict.fromkeys(token for token in tokens if token))
def strip_html(text: str) -> str:
    """Drop HTML tags and &nbsp; entities, collapsing whitespace to single spaces."""
    without_tags = re.sub(r"<[^>]+>", " ", text or "")
    without_nbsp = without_tags.replace("&nbsp;", " ")
    return re.sub(r"\s+", " ", without_nbsp).strip()
def extract_specialties_from_remark(remark: str) -> List[str]:
    """Pull the "专业领域: ..." segment out of an HTML remark blob, if present."""
    plain_text = strip_html(remark)
    if not plain_text:
        return []
    segment = re.search(r"专业领域[:]\s*([^。;]+)", plain_text)
    return split_specialties(segment.group(1)) if segment else []
def value_at(values: List[str], index: int) -> str:
    """Safe positional lookup: stripped string at *index*, "" when out of range."""
    if 0 <= index < len(values):
        return str(values[index] or "").strip()
    return ""
def parse_initial_state(html: str) -> Dict:
    """Decode the JSON object assigned to window.__INITIAL_STATE__ in *html*.

    Raises ValueError when the marker is absent; json.loads errors propagate
    unchanged for malformed payloads.
    """
    found = INITIAL_STATE_RE.search(html)
    if found is None:
        raise ValueError("未找到 window.__INITIAL_STATE__")
    return json.loads(found.group(1))
def extract_location_and_name(product_name: str) -> Tuple[str, str, str]:
    """Parse (province, city, lawyer name) out of a product title.

    Fix: the name pattern (2-4 CJK characters before "律师") used to be
    searched in the full title, so trailing city characters leaked into the
    captured name (e.g. "浙江省杭州市张三律师" yielded "州市张三").  The
    already-matched province/city substrings are now blanked out before the
    name search so only the actual name can match.
    """
    text = re.sub(r"\s+", " ", product_name or "").strip()
    province = ""
    city = ""
    lawyer_name = ""
    province_match = re.search(r"([\u4e00-\u9fa5]{2,}省)", text)
    city_match = re.search(r"(?:[\u4e00-\u9fa5]+省)?([\u4e00-\u9fa5]+市)", text)
    if province_match:
        province = province_match.group(1)
    if city_match:
        city = city_match.group(1)
    # Remove the location tokens before looking for the name, so that city
    # suffix characters cannot be captured as part of the lawyer's name.
    name_source = text
    if province:
        name_source = name_source.replace(province, " ")
    if city:
        name_source = name_source.replace(city, " ")
    name_match = re.search(r"([\u4e00-\u9fa5]{2,4})\s*律师", name_source)
    if name_match:
        lawyer_name = name_match.group(1)
    return province, city, lawyer_name
def pick_product_module(state: Dict) -> Optional[Dict]:
    """Locate the page module whose extInfo carries a productInfo payload.

    Modules referenced by currentPageModuleIds are preferred, in page order;
    failing that, any module in the id map with a truthy productInfo wins.
    Returns None when no such module exists.
    """
    module_map = state.get("currentPageModuleIdMap", {}) or {}
    ordered_ids = state.get("currentPageModuleIds", []) or []

    def has_product(candidate) -> bool:
        # Modules may be malformed; only dicts with a truthy productInfo count.
        if not isinstance(candidate, dict):
            return False
        return bool((candidate.get("extInfo", {}) or {}).get("productInfo"))

    for module_id in ordered_ids:
        # Keys may be serialized as strings or kept as ints; try both.
        candidate = module_map.get(str(module_id)) or module_map.get(module_id)
        if has_product(candidate):
            return candidate
    for candidate in module_map.values():
        if has_product(candidate):
            return candidate
    return None
def parse_group_id_from_url(url: str) -> int:
    """Extract the numeric groupId query/fragment parameter from *url*, or 0.

    The previous version wrapped int() in a try/except ValueError, but the
    capture is \\d+ so int() can never fail there; the unreachable branch has
    been removed.
    """
    match = re.search(r"(?:[?&#]|^)groupId=(\d+)", url)
    return int(match.group(1)) if match else 0
def extract_records(url: str, state: Dict) -> List[Dict]:
    """Build normalized lawyer records from a parsed __INITIAL_STATE__.

    Two sources are mined from the product module:
      * the propList value pool, where index i across the parallel columns
        (电话 / 律师所 / 所在地区 / ...) describes one lawyer; and
      * productInfo itself, i.e. the lawyer this detail page belongs to
        (phone stored in the `material` field).
    Phones are de-duplicated within the page; record_id = md5("url|phone").
    """
    module = pick_product_module(state)
    if not module:
        return []
    ext_info = module.get("extInfo", {}) or {}
    product_info = ext_info.get("productInfo", {}) or {}
    product_name = str(product_info.get("name") or "").strip()
    province, city, current_name = extract_location_and_name(product_name)
    group_id = product_info.get("groupId")
    if not group_id:
        # productInfo did not expose a groupId; fall back to the one in the URL.
        group_id = parse_group_id_from_url(url)
    module_id = module.get("id")
    # Column name -> value list; rows align by index across all columns.
    prop_map: Dict[str, List[str]] = {}
    for prop in ext_info.get("propList", []) or []:
        name = str(prop.get("name") or "").strip()
        values = [str(item or "").strip() for item in (prop.get("valueList") or [])]
        if name:
            prop_map[name] = values
    result: List[Dict] = []
    seen_phones: Set[str] = set()
    now = int(time.time())
    phone_values = prop_map.get("电话", [])
    for idx, raw_phone in enumerate(phone_values):
        phone = normalize_phone(raw_phone)
        if not phone or phone in seen_phones:
            continue  # invalid number, or a duplicate within this page
        seen_phones.add(phone)
        law_firm = value_at(prop_map.get("律师所", []), idx)
        area = value_at(prop_map.get("所在地区", []), idx)
        direction = value_at(prop_map.get("主攻方向", []), idx)
        specialty_text = value_at(prop_map.get("专业特长", []), idx)
        license_no = value_at(prop_map.get("执业证号", []), idx)
        address = value_at(prop_map.get("地址", []), idx)
        email = value_at(prop_map.get("电子邮箱", []), idx)
        seat_phone = value_at(prop_map.get("座机", []), idx)
        wechat = value_at(prop_map.get("微信", []), idx)
        qq = value_at(prop_map.get("QQ", []), idx)
        first_practice_date = value_at(prop_map.get("首次执业日期", []), idx)
        # Prefer the structured 主攻方向 column; fall back to 专业特长 text.
        specialties = split_specialties(direction)
        if not specialties:
            specialties = split_specialties(specialty_text)
        record = {
            "record_id": hashlib.md5(f"{url}|{phone}".encode("utf-8")).hexdigest(),
            "collected_at": now,
            "source": {
                "site": SITE_NAME,
                "list_url": url,
                "detail_url": "",  # pool entries have no per-lawyer detail page
                "province": province,
                "province_py": "",
                "city": area or city,  # per-row area wins over page-level city
                "city_py": "",
                "page": 1,
                "group_id": group_id,
                "module_id": module_id,
                "detail_url_status": "unresolved_from_pool",
            },
            "list_snapshot": {
                "name": "",  # pool rows carry no lawyer name
                "law_firm": law_firm,
                "specialties": specialties,
                "answer_count": None,
            },
            "profile": {
                "name": "",
                "law_firm": law_firm,
                "phone": phone,
                "license_no": license_no,
                "practice_years": None,
                "email": email,
                "address": address,
                "specialties": specialties,
            },
            "raw": {
                "source_index": idx,
                "direction": direction,
                "specialty_text": specialty_text,
                "seat_phone": seat_phone,
                "wechat": wechat,
                "qq": qq,
                "first_practice_date": first_practice_date,
            },
        }
        result.append(record)
    # The detail page's own lawyer: phone is stored in productInfo.material.
    current_phone = normalize_phone(str(product_info.get("material") or ""))
    if current_phone and current_phone not in seen_phones:
        seen_phones.add(current_phone)
        remark = str(product_info.get("remark") or "")
        specialties = extract_specialties_from_remark(remark)
        result.append(
            {
                "record_id": hashlib.md5(f"{url}|{current_phone}".encode("utf-8")).hexdigest(),
                "collected_at": now,
                "source": {
                    "site": SITE_NAME,
                    "list_url": url,
                    "detail_url": url,
                    "province": province,
                    "province_py": "",
                    "city": city,
                    "city_py": "",
                    "page": 1,
                    "group_id": group_id,
                    "module_id": module_id,
                },
                "list_snapshot": {
                    "name": current_name,
                    "law_firm": str(product_info.get("prop0") or "").strip(),
                    "specialties": specialties,
                    "answer_count": None,
                },
                "profile": {
                    "name": current_name,
                    "law_firm": str(product_info.get("prop0") or "").strip(),
                    "phone": current_phone,
                    "license_no": str(product_info.get("prop1") or "").strip(),
                    "practice_years": None,
                    "email": "",
                    # NOTE(review): prop0/prop1/prop3 slot meanings are assumed
                    # from usage (firm / license / address) — confirm on site.
                    "address": str(product_info.get("prop3") or "").strip(),
                    "specialties": specialties,
                },
                "raw": {
                    "from_product_info": True,
                    "product_name": product_name,
                    "remark": remark,
                },
            }
        )
    return result
def to_legacy_row(record: Dict) -> Optional[Dict[str, str]]:
    """Map a normalized record onto the legacy `lawyer` table columns.

    Returns None when the record carries no valid mobile number (such rows
    are never inserted).
    """
    source = record.get("source", {}) or {}
    profile = record.get("profile", {}) or {}
    phone = normalize_phone(profile.get("phone", ""))
    if not phone:
        return None
    province = str(source.get("province") or "").strip()
    # The city column falls back to the province when no city was captured.
    city = str(source.get("city") or province).strip()
    row: Dict[str, str] = {
        "name": str(profile.get("name") or "").strip(),
        "law_firm": str(profile.get("law_firm") or "").strip(),
        "province": province,
        "city": city,
        "phone": phone,
        "url": str(source.get("detail_url") or "").strip(),
        "domain": LEGACY_DOMAIN,
        "create_time": int(record.get("collected_at") or time.time()),
        # The full normalized record is kept as JSON for later reprocessing.
        "params": json.dumps(record, ensure_ascii=False),
    }
    return row
def existing_phones_in_db(db: Db, phones: List[str]) -> Set[str]:
    """Return the subset of *phones* already stored under LEGACY_DOMAIN.

    Queries in chunks of 500 so the IN (...) placeholder list stays bounded.
    """
    wanted = sorted({p for p in phones if p})
    if not wanted:
        return set()
    found: Set[str] = set()
    cursor = db.db.cursor()
    try:
        step = 500
        for start in range(0, len(wanted), step):
            batch = wanted[start:start + step]
            marks = ",".join(["%s"] * len(batch))
            query = f"SELECT phone FROM lawyer WHERE domain=%s AND phone IN ({marks})"
            cursor.execute(query, [LEGACY_DOMAIN, *batch])
            found.update(row[0] for row in cursor.fetchall())
    finally:
        cursor.close()
    return found
def write_records_to_db(db: Db, records: List[Dict]) -> Tuple[int, int]:
    """Insert legacy rows for *records*; returns (inserted, skipped).

    Rows whose phone already exists under LEGACY_DOMAIN are skipped; a failed
    insert is counted as skipped and logged rather than raised.
    """
    rows = [row for row in map(to_legacy_row, records) if row]
    if not rows:
        return 0, 0
    existing = existing_phones_in_db(db, [item["phone"] for item in rows])
    inserted = 0
    skipped = 0
    for item in rows:
        phone = item.get("phone", "")
        if not phone or phone in existing:
            skipped += 1
            continue
        try:
            db.insert_data("lawyer", item)
        except Exception as exc:
            skipped += 1
            print(f"[db] 插入失败 phone={phone}: {exc}")
        else:
            # Track the phone so duplicates later in this batch are skipped.
            existing.add(phone)
            inserted += 1
    return inserted, skipped
def lookup_name_map_from_db(db: Db, phones: List[str]) -> Dict[str, str]:
    """Map phone -> newest non-empty name found in the lawyer table.

    Rows are ordered by create_time DESC and only the first (most recent)
    name seen per phone is kept.  Queries run in chunks of 500 phones.
    """
    targets = sorted({p for p in phones if p})
    if not targets:
        return {}
    resolved: Dict[str, str] = {}
    cursor = db.db.cursor()
    try:
        step = 500
        for offset in range(0, len(targets), step):
            batch = targets[offset:offset + step]
            marks = ",".join(["%s"] * len(batch))
            query = (
                "SELECT phone, name, create_time FROM lawyer "
                f"WHERE phone IN ({marks}) AND name<>'' "
                "ORDER BY create_time DESC"
            )
            cursor.execute(query, batch)
            for phone, name, _ in cursor.fetchall():
                if name and phone not in resolved:
                    resolved[phone] = str(name).strip()
    finally:
        cursor.close()
    return resolved
def apply_name_backfill(records: List[Dict], name_map: Dict[str, str]) -> int:
    """Fill blank record names from *name_map* (phone -> name); returns count.

    Only records whose profile name is empty are touched; the profile and
    list_snapshot names are always updated together.
    """
    if not name_map:
        return 0
    updated = 0
    for record in records:
        profile = record.get("profile", {}) or {}
        snapshot = record.get("list_snapshot", {}) or {}
        phone = normalize_phone(profile.get("phone", ""))
        if not phone:
            continue
        candidate = name_map.get(phone, "")
        if not candidate:
            continue
        if str(profile.get("name") or "").strip():
            continue  # never overwrite a name that is already present
        profile["name"] = candidate
        snapshot["name"] = candidate
        # Re-attach in case .get() returned fresh dicts for missing keys.
        record["profile"] = profile
        record["list_snapshot"] = snapshot
        updated += 1
    return updated
def parse_args() -> argparse.Namespace:
    """Parse the scraper's command-line flags.

    --url / --output override the page URL and JSONL destination; --direct
    disables the proxy; --no-db and --skip-name-backfill skip the two
    database phases.
    """
    parser = argparse.ArgumentParser(description="众法利单页律师电话采集")
    parser.add_argument("--url", default=DEFAULT_URL, help="详情页 URL")
    parser.add_argument("--output", default=DEFAULT_OUTPUT, help="输出 jsonl 文件路径")
    parser.add_argument("--direct", action="store_true", help="直连模式,不使用代理")
    parser.add_argument("--no-db", action="store_true", help="仅输出 JSON,不写入数据库")
    parser.add_argument("--skip-name-backfill", action="store_true", help="跳过按手机号回填姓名")
    return parser.parse_args()
def main() -> None:
    """Fetch the page, extract records, append new ones to JSONL, sync to DB."""
    args = parse_args()
    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
    # Mobile Safari UA: the m. host serves the markup this scraper parses.
    client = RequestsClient(
        headers={
            "User-Agent": (
                "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 "
                "Mobile/15E148 Safari/604.1"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
        },
        use_proxy=not args.direct,
        retry_total=2,
        retry_backoff_factor=1,
        retry_status_forcelist=(429, 500, 502, 503, 504),
        retry_allowed_methods=("GET",),
    )
    try:
        # verify=False: the site's TLS chain is not trusted (warnings silenced
        # at module import time).
        resp = client.get_text(args.url, timeout=30, verify=False)
        if resp.status_code >= 400:
            raise RequestClientError(f"{resp.status_code} Error: {args.url}")
        state = parse_initial_state(resp.text)
        records = extract_records(args.url, state)
    finally:
        client.close()
    if not records:
        print("[done] 未采集到有效手机号")
        return
    # Load record_ids already present in the output so reruns only append new
    # records — the JSONL file is treated as an append-only log.
    seen_ids: Set[str] = set()
    if os.path.exists(args.output):
        with open(args.output, "r", encoding="utf-8") as old_file:
            for line in old_file:
                line = line.strip()
                if not line:
                    continue
                try:
                    item = json.loads(line)
                except Exception:
                    continue  # tolerate corrupt lines in the existing log
                record_id = item.get("record_id")
                if record_id:
                    seen_ids.add(record_id)
    json_new = 0
    with open(args.output, "a", encoding="utf-8") as out:
        for record in records:
            record_id = record["record_id"]
            if record_id in seen_ids:
                continue
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
            seen_ids.add(record_id)
            json_new += 1
    db_new = 0
    db_skip = 0
    name_backfill_count = 0
    if not args.skip_name_backfill:
        # Best effort: reuse names the DB already knows for these phones;
        # a failed lookup is reported but never aborts the run.
        try:
            with Db() as db:
                name_map = lookup_name_map_from_db(
                    db,
                    [normalize_phone((record.get("profile", {}) or {}).get("phone", "")) for record in records],
                )
                name_backfill_count = apply_name_backfill(records, name_map)
        except Exception as exc:
            print(f"[name-backfill] 跳过,查询失败: {exc}")
    if not args.no_db:
        with Db() as db:
            db_new, db_skip = write_records_to_db(db, records)
    print(
        f"[done] 采集{len(records)}条, 姓名回填{name_backfill_count}条, JSON新增{json_new}条, "
        f"DB新增{db_new}条, DB跳过{db_skip}条, 输出: {args.output}"
    )
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()