chore: 暂存本地修改

This commit is contained in:
hello-dd-code
2026-04-28 17:33:51 +08:00
parent ba04fe42fc
commit f67cb30f0d
15 changed files with 1139 additions and 97 deletions
+565
View File
@@ -0,0 +1,565 @@
#!/usr/bin/env python3
import argparse
import re
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
import pymysql
from openpyxl import Workbook, load_workbook
from openpyxl.styles import Font
from config import DB_CONFIG
@dataclass(frozen=True)
class LawyerRecord:
id: int
name: str
phone: str
law_firm: str
province: str
city: str
domain: str
create_time: int
@dataclass(frozen=True)
class PhoneBackfill:
matched_phones: List[str]
records: List[LawyerRecord]
best_name: str
best_law_firm: str
best_domain: str
candidate_names: List[str]
candidate_firms: List[str]
candidate_domains: List[str]
DOMAIN_PRIORITY = {
"华律": 90,
"大律师": 85,
"找法网": 82,
"法律快车": 80,
"律图": 72,
"众法利单页": 68,
"众法利": 66,
"六四三六五": 64,
"智飞律师在线": 40,
"高德地图": 10,
}
GENERIC_FIRMS = {"高德搜索"}
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="按律所名从数据库补手机号并导出对比表")
parser.add_argument("--input", default="man.xlsx", help="原始 xlsx 文件路径")
parser.add_argument(
"--output",
default="man_firm_phone_compare.xlsx",
help="输出 xlsx 文件路径",
)
return parser.parse_args()
def normalize_text(value: object) -> str:
text = str(value or "").strip()
text = text.replace("", "(").replace("", ")")
text = re.sub(r"\s+", "", text)
return text
def normalize_firm(value: object) -> str:
text = normalize_text(value)
text = text.replace("本地大所", "").replace("特色律所", "")
return text
def normalize_name(value: object) -> str:
text = normalize_text(value)
return text.replace("律师", "")
def normalize_province(value: object) -> str:
text = str(value or "").strip()
mapping = {
"北京市": "北京",
"天津市": "天津",
"上海市": "上海",
"重庆市": "重庆",
"内蒙古自治区": "内蒙古",
"广西壮族自治区": "广西",
"宁夏回族自治区": "宁夏",
"新疆维吾尔自治区": "新疆",
"西藏自治区": "西藏",
"香港特别行政区": "香港",
"澳门特别行政区": "澳门",
"新疆生产建设兵团": "新疆",
}
if text in mapping:
return mapping[text]
if text.endswith("") and len(text) > 1:
return text[:-1]
return text
def normalize_city(value: object) -> str:
text = str(value or "").strip()
for suffix in ("", "地区", ""):
if text.endswith(suffix) and len(text) > len(suffix):
return text[: -len(suffix)]
return text
def split_phones(value: object) -> List[str]:
return re.findall(r"1\d{10}", str(value or ""))
def unique_phones(records: Sequence[LawyerRecord]) -> List[str]:
output: List[str] = []
seen = set()
for record in sorted(records, key=lambda item: (item.create_time, item.id), reverse=True):
if record.phone and record.phone not in seen:
seen.add(record.phone)
output.append(record.phone)
return output
def unique_values(records: Sequence[LawyerRecord], attr: str) -> List[str]:
output: List[str] = []
seen = set()
for record in sorted(records, key=lambda item: (item.create_time, item.id), reverse=True):
value = getattr(record, attr, "")
if value and value not in seen:
seen.add(value)
output.append(value)
return output
def phone_record_sort_key(
record: LawyerRecord,
target_name: object,
target_province: object,
target_city: object,
) -> Tuple[int, int, int]:
score = 0
normalized_target_name = normalize_name(target_name)
normalized_target_province = normalize_province(target_province)
normalized_target_city = normalize_city(target_city)
if normalized_target_name:
if normalize_name(record.name) == normalized_target_name:
score += 400
elif record.name:
score -= 40
if record.law_firm and record.law_firm not in GENERIC_FIRMS:
score += 220
elif record.law_firm:
score += 40
if record.name:
score += 100
if normalized_target_city:
if normalize_city(record.city) == normalized_target_city:
score += 45
elif record.city:
score -= 10
if normalized_target_province:
if normalize_province(record.province) == normalized_target_province:
score += 25
elif record.province:
score -= 5
score += DOMAIN_PRIORITY.get(record.domain, 50)
return score, record.create_time, record.id
def compare_result(original_phones: Sequence[str], candidate_phones: Sequence[str]) -> str:
if not candidate_phones:
return "未匹配"
if not original_phones:
return "原手机号为空"
original_set = set(original_phones)
candidate_set = set(candidate_phones)
if original_set == candidate_set:
return "完全一致"
if original_set & candidate_set:
return "候选包含原手机号"
return "不包含原手机号"
def infer_firm_from_address(address: object, ordered_firms: Sequence[str]) -> str:
normalized_address = normalize_text(address)
if not normalized_address:
return ""
for firm in ordered_firms:
if len(firm) < 4:
continue
if firm in normalized_address:
return firm
return ""
def load_db_indexes() -> Tuple[Dict[str, List[LawyerRecord]], List[str], Dict[str, List[LawyerRecord]]]:
conn = pymysql.connect(**DB_CONFIG)
firm_index: Dict[str, List[LawyerRecord]] = defaultdict(list)
phone_index: Dict[str, List[LawyerRecord]] = defaultdict(list)
try:
with conn.cursor() as cur:
cur.execute(
"""
SELECT id, name, phone, law_firm, province, city, domain, create_time
FROM lawyer
WHERE phone IS NOT NULL
AND phone <> ''
"""
)
for row in cur.fetchall():
record = LawyerRecord(
id=int(row[0]),
name=str(row[1] or "").strip(),
phone=str(row[2] or "").strip(),
law_firm=str(row[3] or "").strip(),
province=str(row[4] or "").strip(),
city=str(row[5] or "").strip(),
domain=str(row[6] or "").strip(),
create_time=int(row[7] or 0),
)
phone_index[record.phone].append(record)
normalized_firm = normalize_firm(record.law_firm)
if normalized_firm:
firm_index[normalized_firm].append(record)
finally:
conn.close()
ordered_firms = sorted(firm_index.keys(), key=len, reverse=True)
return firm_index, ordered_firms, phone_index
def build_phone_backfill(
original_phone: object,
name: object,
province: object,
city: object,
phone_index: Dict[str, List[LawyerRecord]],
) -> PhoneBackfill:
def pick_best_name(records: Sequence[LawyerRecord], target_name: object) -> str:
normalized_target_name = normalize_name(target_name)
if normalized_target_name:
for item in records:
if item.name and normalize_name(item.name) == normalized_target_name:
return item.name
for item in records:
if item.name:
return item.name
return ""
records: List[LawyerRecord] = []
seen_ids = set()
for phone in split_phones(original_phone):
for record in phone_index.get(phone, []):
if record.id in seen_ids:
continue
seen_ids.add(record.id)
records.append(record)
sorted_records = sorted(
records,
key=lambda item: phone_record_sort_key(item, name, province, city),
reverse=True,
)
candidate_names = unique_values(sorted_records, "name")
candidate_firms = unique_values(
[item for item in sorted_records if item.law_firm and item.law_firm not in GENERIC_FIRMS],
"law_firm",
)
if not candidate_firms:
candidate_firms = unique_values(
[item for item in sorted_records if item.law_firm],
"law_firm",
)
candidate_domains = unique_values(sorted_records, "domain")
matched_phones = unique_values(sorted_records, "phone")
best_name = pick_best_name(sorted_records, name)
best_law_firm = ""
best_domain = ""
preferred_name = normalize_name(name) or normalize_name(best_name)
for record in sorted_records:
if not record.law_firm or record.law_firm in GENERIC_FIRMS:
continue
if preferred_name and normalize_name(record.name) != preferred_name:
continue
best_law_firm = record.law_firm
best_domain = record.domain
break
if not best_law_firm:
for record in sorted_records:
if record.law_firm and record.law_firm not in GENERIC_FIRMS:
best_law_firm = record.law_firm
best_domain = record.domain
break
if not best_domain and sorted_records:
best_domain = sorted_records[0].domain
return PhoneBackfill(
matched_phones=matched_phones,
records=sorted_records,
best_name=best_name,
best_law_firm=best_law_firm,
best_domain=best_domain,
candidate_names=candidate_names,
candidate_firms=candidate_firms,
candidate_domains=candidate_domains,
)
def match_row(
name: object,
original_phone: object,
law_firm: object,
province: object,
city: object,
address: object,
phone_backfill: PhoneBackfill,
firm_index: Dict[str, List[LawyerRecord]],
ordered_firms: Sequence[str],
) -> Tuple[str, str, List[LawyerRecord]]:
def add_method(part: str, method_parts: List[str]) -> None:
if part and part not in method_parts:
method_parts.append(part)
matched_firm = normalize_firm(law_firm)
used_phone_backfill_firm = False
inferred_from_address = False
if not matched_firm:
matched_firm = normalize_firm(phone_backfill.best_law_firm)
used_phone_backfill_firm = bool(matched_firm)
if not matched_firm:
matched_firm = infer_firm_from_address(address, ordered_firms)
inferred_from_address = bool(matched_firm)
if not matched_firm:
return "", "无可用律所名", []
candidates = firm_index.get(matched_firm, [])
if not candidates:
return matched_firm, "数据库无此律所", []
method_parts = ["律所"]
chosen = list(candidates)
normalized_name = normalize_name(name)
if not normalized_name:
normalized_name = normalize_name(phone_backfill.best_name)
if normalized_name:
name_filtered = [item for item in chosen if normalize_name(item.name) == normalized_name]
if name_filtered:
chosen = name_filtered
add_method("姓名", method_parts)
if len(unique_phones(chosen)) != 1:
normalized_province = normalize_province(province)
normalized_city = normalize_city(city)
if normalized_province and normalized_city:
province_city_filtered = [
item
for item in chosen
if normalize_province(item.province) == normalized_province
and normalize_city(item.city) == normalized_city
]
if province_city_filtered:
chosen = province_city_filtered
add_method("省份", method_parts)
add_method("城市", method_parts)
if len(unique_phones(chosen)) != 1 and normalized_city:
city_filtered = [
item for item in chosen if normalize_city(item.city) == normalized_city
]
if city_filtered:
chosen = city_filtered
add_method("城市", method_parts)
if len(unique_phones(chosen)) != 1 and normalized_province:
province_filtered = [
item
for item in chosen
if normalize_province(item.province) == normalized_province
]
if province_filtered:
chosen = province_filtered
add_method("省份", method_parts)
method = "+".join(method_parts)
if used_phone_backfill_firm:
method = "手机号回填律所|" + method
elif inferred_from_address:
method = "地址推断律所|" + method
return matched_firm, method, chosen
def autosize_columns(ws) -> None:
for column_cells in ws.columns:
values = [str(cell.value or "") for cell in column_cells]
max_length = min(max((len(value) for value in values), default=0), 60)
column_letter = column_cells[0].column_letter
ws.column_dimensions[column_letter].width = max_length + 2
def iter_input_rows(ws) -> Iterable[Tuple[int, List[object]]]:
for row_idx in range(1, ws.max_row + 1):
yield row_idx, [ws.cell(row_idx, col_idx).value for col_idx in range(1, 8)]
def build_output(input_path: str, output_path: str) -> Dict[str, int]:
workbook = load_workbook(input_path)
source_ws = workbook.active
firm_index, ordered_firms, phone_index = load_db_indexes()
out_wb = Workbook()
out_ws = out_wb.active
out_ws.title = "firm_phone_compare"
headers = [
"原始行号",
"原姓名",
"原手机号",
"原律所",
"原省份",
"原城市",
"原地址",
"原备注",
"手机号命中记录数",
"手机号命中手机号",
"手机号补全姓名",
"手机号补全律所",
"手机号补全来源",
"手机号候选姓名",
"手机号候选律所",
"用于匹配的律所",
"匹配方式",
"数据库候选手机号",
"候选数量",
"原手机号对比",
"数据库候选姓名",
"数据库候选省市",
"数据库来源",
]
out_ws.append(headers)
for cell in out_ws[1]:
cell.font = Font(bold=True)
stats = defaultdict(int)
for row_idx, row in iter_input_rows(source_ws):
name, original_phone, law_firm, province, city, address, remark = row
needs_phone_completion = not normalize_firm(law_firm)
phone_backfill = build_phone_backfill(
original_phone=original_phone,
name=name,
province=province,
city=city,
phone_index=phone_index,
)
matched_firm, method, matched_records = match_row(
name=name,
original_phone=original_phone,
law_firm=law_firm,
province=province,
city=city,
address=address,
phone_backfill=phone_backfill,
firm_index=firm_index,
ordered_firms=ordered_firms,
)
candidate_phones = unique_phones(matched_records)
compare = compare_result(split_phones(original_phone), candidate_phones)
candidate_names = unique_values(matched_records, "name")
candidate_domains = unique_values(matched_records, "domain")
city_province_pairs = []
seen_pairs = set()
for record in matched_records:
pair = f"{record.province}-{record.city}".strip("-")
if pair and pair not in seen_pairs:
seen_pairs.add(pair)
city_province_pairs.append(pair)
out_ws.append(
[
row_idx,
name or "",
original_phone or "",
law_firm or "",
province or "",
city or "",
address or "",
remark or "",
len(phone_backfill.records) if needs_phone_completion else "",
" / ".join(phone_backfill.matched_phones) if needs_phone_completion else "",
phone_backfill.best_name if needs_phone_completion else "",
phone_backfill.best_law_firm if needs_phone_completion else "",
phone_backfill.best_domain if needs_phone_completion else "",
" / ".join(phone_backfill.candidate_names) if needs_phone_completion else "",
" / ".join(phone_backfill.candidate_firms) if needs_phone_completion else "",
matched_firm or "",
method or "",
" / ".join(candidate_phones) or "",
len(candidate_phones),
compare,
" / ".join(candidate_names) or "",
" / ".join(city_province_pairs) or "",
" / ".join(candidate_domains) or "",
]
)
if needs_phone_completion and phone_backfill.records:
stats["phone_backfill_hit_rows"] += 1
if needs_phone_completion and phone_backfill.best_name:
stats["phone_backfill_name_rows"] += 1
if needs_phone_completion and phone_backfill.best_law_firm:
stats["phone_backfill_firm_rows"] += 1
if needs_phone_completion and method.startswith("手机号回填律所|"):
stats["phone_backfill_used_for_match_rows"] += 1
if candidate_phones:
stats["matched_rows"] += 1
if len(candidate_phones) == 1:
stats["unique_rows"] += 1
else:
stats["multi_rows"] += 1
else:
stats["unmatched_rows"] += 1
if compare == "完全一致":
stats["same_rows"] += 1
elif compare == "候选包含原手机号":
stats["contains_rows"] += 1
elif compare == "不包含原手机号":
stats["diff_rows"] += 1
elif compare == "原手机号为空":
stats["blank_phone_rows"] += 1
out_ws.freeze_panes = "A2"
autosize_columns(out_ws)
out_wb.save(output_path)
return dict(stats)
def main() -> None:
args = parse_args()
stats = build_output(args.input, args.output)
print(f"已生成: {args.output}")
for key in sorted(stats):
print(f"{key}={stats[key]}")
if __name__ == "__main__":
main()