chore: commit remaining local changes
This commit is contained in:
@@ -191,7 +191,7 @@ def extract_records(url: str, state: Dict) -> List[Dict]:
|
||||
"source": {
|
||||
"site": SITE_NAME,
|
||||
"list_url": url,
|
||||
"detail_url": url,
|
||||
"detail_url": "",
|
||||
"province": province,
|
||||
"province_py": "",
|
||||
"city": area or city,
|
||||
@@ -199,6 +199,7 @@ def extract_records(url: str, state: Dict) -> List[Dict]:
|
||||
"page": 1,
|
||||
"group_id": group_id,
|
||||
"module_id": module_id,
|
||||
"detail_url_status": "unresolved_from_pool",
|
||||
},
|
||||
"list_snapshot": {
|
||||
"name": "",
|
||||
@@ -347,12 +348,68 @@ def write_records_to_db(db: Db, records: List[Dict]) -> Tuple[int, int]:
|
||||
return inserted, skipped
|
||||
|
||||
|
||||
def lookup_name_map_from_db(db: Db, phones: List[str]) -> Dict[str, str]:
    """Look up the most recently created non-blank name for each phone.

    Args:
        db: Wrapper whose ``db`` attribute exposes ``cursor()``.
            NOTE(review): the ``%s`` placeholders presume a driver using the
            "format" paramstyle (e.g. a MySQL client) -- confirm.
        phones: Phone numbers to look up. Falsy entries are ignored and
            duplicates are collapsed before querying.

    Returns:
        A mapping of phone -> stripped name taken from the ``lawyer`` table.
        Within each queried chunk rows are ordered by ``create_time``
        descending, so the first usable name seen for a phone wins. Phones
        with no usable name are absent from the mapping.
    """
    deduped = sorted({phone for phone in phones if phone})
    if not deduped:
        return {}

    name_map: Dict[str, str] = {}
    # Query in fixed-size chunks so each IN (...) list stays bounded.
    chunk_size = 500
    cur = db.db.cursor()
    try:
        for i in range(0, len(deduped), chunk_size):
            chunk = deduped[i:i + chunk_size]
            # Only placeholders are interpolated into the SQL text; the phone
            # values themselves are passed as bound parameters.
            placeholders = ",".join(["%s"] * len(chunk))
            sql = (
                "SELECT phone, name, create_time FROM lawyer "
                f"WHERE phone IN ({placeholders}) AND name<>'' "
                "ORDER BY create_time DESC"
            )
            cur.execute(sql, chunk)
            for phone, name, _ in cur.fetchall():
                if phone in name_map:
                    continue
                # Strip before deciding: a whitespace-only name must not
                # claim the slot and shadow an older row with a real name.
                cleaned = str(name).strip() if name else ""
                if cleaned:
                    name_map[phone] = cleaned
    finally:
        cur.close()
    return name_map
|
||||
|
||||
|
||||
def apply_name_backfill(records: List[Dict], name_map: Dict[str, str]) -> int:
    """Fill in missing profile names from a phone -> name mapping.

    A record is touched only when its normalized profile phone has a
    non-empty entry in ``name_map`` and its current profile name is blank.
    For such records the name is written to both ``profile["name"]`` and
    ``list_snapshot["name"]``, and both dicts are stored back on the record.

    Args:
        records: Record dicts, modified in place.
        name_map: Mapping of normalized phone -> name.

    Returns:
        The number of records that received a name.
    """
    if not name_map:
        return 0

    filled = 0
    for rec in records:
        prof = rec.get("profile", {}) or {}
        snapshot = rec.get("list_snapshot", {}) or {}

        number = normalize_phone(prof.get("phone", ""))
        # No phone means no lookup; an empty candidate means nothing to fill.
        candidate = name_map.get(number, "") if number else ""
        if not candidate:
            continue

        # An existing (non-blank) name is never overwritten.
        if str(prof.get("name") or "").strip():
            continue

        prof["name"] = candidate
        snapshot["name"] = candidate
        # Re-attach in case the record held None/missing for either key.
        rec["profile"] = prof
        rec["list_snapshot"] = snapshot
        filled += 1

    return filled
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Define the scraper's command-line options and parse the arguments.

    Returns:
        The parsed namespace with ``url``, ``output``, ``direct``, ``no_db``
        and ``skip_name_backfill`` attributes.
    """
    cli = argparse.ArgumentParser(description="众法利单页律师电话采集")

    # Options that carry a value, each with a module-level default.
    valued_options = (
        ("--url", DEFAULT_URL, "详情页 URL"),
        ("--output", DEFAULT_OUTPUT, "输出 jsonl 文件路径"),
    )
    for flag, fallback, help_text in valued_options:
        cli.add_argument(flag, default=fallback, help=help_text)

    # Boolean switches; each is False unless present on the command line.
    switches = (
        ("--direct", "直连模式,不使用代理"),
        ("--no-db", "仅输出 JSON,不写入数据库"),
        ("--skip-name-backfill", "跳过按手机号回填姓名"),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)

    return cli.parse_args()
|
||||
|
||||
|
||||
@@ -418,12 +475,24 @@ def main() -> None:
|
||||
|
||||
db_new = 0
|
||||
db_skip = 0
|
||||
name_backfill_count = 0
|
||||
if not args.skip_name_backfill:
|
||||
try:
|
||||
with Db() as db:
|
||||
name_map = lookup_name_map_from_db(
|
||||
db,
|
||||
[normalize_phone((record.get("profile", {}) or {}).get("phone", "")) for record in records],
|
||||
)
|
||||
name_backfill_count = apply_name_backfill(records, name_map)
|
||||
except Exception as exc:
|
||||
print(f"[name-backfill] 跳过,查询失败: {exc}")
|
||||
|
||||
if not args.no_db:
|
||||
with Db() as db:
|
||||
db_new, db_skip = write_records_to_db(db, records)
|
||||
|
||||
print(
|
||||
f"[done] 采集{len(records)}条, JSON新增{json_new}条, "
|
||||
f"[done] 采集{len(records)}条, 姓名回填{name_backfill_count}条, JSON新增{json_new}条, "
|
||||
f"DB新增{db_new}条, DB跳过{db_skip}条, 输出: {args.output}"
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user