feat: add Douyin data export functionality to lawyer export script

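- Introduced a new command-line argument `--douyin-only` that restricts the export to Douyin records (rows whose `domain` is `抖音`, or the value of `--domain` when one is given) and appends Douyin-specific fields such as sec_uid, douyin_uid, and user information (Douyin ID, nickname, signature, profile URL).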
- Updated the README to include instructions for exporting Douyin data.
- Enhanced the export logic to accommodate new fields when exporting Douyin-specific data.
This commit is contained in:
hello-dd-code
2026-03-09 21:26:50 +08:00
parent e10437cd90
commit c2b77975c1
3 changed files with 273 additions and 14 deletions
+75 -2
View File
@@ -74,6 +74,11 @@ def parse_args() -> argparse.Namespace:
action="store_true",
help="关闭 params JSON 扩展信息解析(默认开启)",
)
parser.add_argument(
"--douyin-only",
action="store_true",
help="仅导出抖音采集数据(domain=抖音),并追加抖音专用字段",
)
return parser.parse_args()
@@ -109,13 +114,18 @@ def build_query(args: argparse.Namespace) -> (str, List):
where: List[str] = []
params: List = []
if args.douyin_only:
target_domain = args.domain.strip() or "抖音"
where.append("domain = %s")
params.append(target_domain)
if args.start_ts > 0:
where.append("create_time >= %s")
params.append(args.start_ts)
if args.end_ts > 0:
where.append("create_time <= %s")
params.append(args.end_ts)
if args.domain.strip():
if args.domain.strip() and not args.douyin_only:
where.append("domain = %s")
params.append(args.domain.strip())
if args.province.strip():
@@ -161,6 +171,13 @@ def parse_params(params_text: str) -> Dict[str, str]:
else:
specialties_text = ""
user_info = data.get("user_info") or {}
if not isinstance(user_info, dict):
user_info = {}
sec_uid = str(data.get("sec_uid") or "")
douyin_url = f"https://www.douyin.com/user/{sec_uid}" if sec_uid else ""
return {
"email": str(profile.get("email") or ""),
"address": str(profile.get("address") or ""),
@@ -170,10 +187,26 @@ def parse_params(params_text: str) -> Dict[str, str]:
"source_site": str(source.get("site") or ""),
"detail_url": str(source.get("detail_url") or ""),
"list_url": str(source.get("list_url") or ""),
"api_source": str(data.get("api_source") or ""),
"api_url": str(data.get("api_url") or ""),
"city_index": str(data.get("city_index") or ""),
"captured_at": str(data.get("captured_at") or ""),
"sec_uid": sec_uid,
"douyin_uid": str(user_info.get("uid") or ""),
"douyin_unique_id": str(user_info.get("unique_id") or ""),
"douyin_signature": str(user_info.get("signature") or ""),
"douyin_nickname": str(user_info.get("nickname") or ""),
"douyin_url": douyin_url,
}
def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, parse_params_flag: bool) -> int:
def export_to_excel(
rows: List[Dict],
output_path: str,
include_extra: bool,
parse_params_flag: bool,
douyin_only: bool,
) -> int:
wb = Workbook()
ws = wb.active
ws.title = "lawyers"
@@ -204,6 +237,22 @@ def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, par
"list_url",
]
)
if parse_params_flag and douyin_only:
headers.extend(
[
"sec_uid",
"抖音uid",
"抖音号",
"抖音昵称",
"抖音简介",
"抖音主页URL",
"api_source",
"api_url",
"city_index",
"captured_at",
"captured_at_text",
]
)
ws.append(headers)
for cell in ws[1]:
@@ -250,6 +299,29 @@ def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, par
]
)
if parse_params_flag and douyin_only:
captured_at_text = ""
try:
captured_at_text = ts_to_text(int(info.get("captured_at", "") or 0))
except Exception:
captured_at_text = ""
line.extend(
[
info.get("sec_uid", ""),
info.get("douyin_uid", ""),
info.get("douyin_unique_id", ""),
info.get("douyin_nickname", ""),
info.get("douyin_signature", ""),
info.get("douyin_url", ""),
info.get("api_source", ""),
info.get("api_url", ""),
info.get("city_index", ""),
info.get("captured_at", ""),
captured_at_text,
]
)
ws.append(line)
exported += 1
@@ -277,6 +349,7 @@ def main() -> None:
output_path=output_path,
include_extra=args.include_extra,
parse_params_flag=not args.no_parse_params,
douyin_only=args.douyin_only,
)
print(f"[export] 导出完成,共 {count}")