feat: add Douyin data export functionality to lawyer export script
- Introduced a new command-line argument `--douyin-only` to export only Douyin data, including additional fields such as `sec_uid`, `douyin_uid`, and user information.
- Updated the README to include instructions for exporting Douyin data.
- Enhanced the export logic to accommodate the new fields when exporting Douyin-specific data.
This commit is contained in:
@@ -74,6 +74,11 @@ def parse_args() -> argparse.Namespace:
|
||||
action="store_true",
|
||||
help="关闭 params JSON 扩展信息解析(默认开启)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--douyin-only",
|
||||
action="store_true",
|
||||
help="仅导出抖音采集数据(domain=抖音),并追加抖音专用字段",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@@ -109,13 +114,18 @@ def build_query(args: argparse.Namespace) -> (str, List):
|
||||
where: List[str] = []
|
||||
params: List = []
|
||||
|
||||
if args.douyin_only:
|
||||
target_domain = args.domain.strip() or "抖音"
|
||||
where.append("domain = %s")
|
||||
params.append(target_domain)
|
||||
|
||||
if args.start_ts > 0:
|
||||
where.append("create_time >= %s")
|
||||
params.append(args.start_ts)
|
||||
if args.end_ts > 0:
|
||||
where.append("create_time <= %s")
|
||||
params.append(args.end_ts)
|
||||
if args.domain.strip():
|
||||
if args.domain.strip() and not args.douyin_only:
|
||||
where.append("domain = %s")
|
||||
params.append(args.domain.strip())
|
||||
if args.province.strip():
|
||||
@@ -161,6 +171,13 @@ def parse_params(params_text: str) -> Dict[str, str]:
|
||||
else:
|
||||
specialties_text = ""
|
||||
|
||||
user_info = data.get("user_info") or {}
|
||||
if not isinstance(user_info, dict):
|
||||
user_info = {}
|
||||
|
||||
sec_uid = str(data.get("sec_uid") or "")
|
||||
douyin_url = f"https://www.douyin.com/user/{sec_uid}" if sec_uid else ""
|
||||
|
||||
return {
|
||||
"email": str(profile.get("email") or ""),
|
||||
"address": str(profile.get("address") or ""),
|
||||
@@ -170,10 +187,26 @@ def parse_params(params_text: str) -> Dict[str, str]:
|
||||
"source_site": str(source.get("site") or ""),
|
||||
"detail_url": str(source.get("detail_url") or ""),
|
||||
"list_url": str(source.get("list_url") or ""),
|
||||
"api_source": str(data.get("api_source") or ""),
|
||||
"api_url": str(data.get("api_url") or ""),
|
||||
"city_index": str(data.get("city_index") or ""),
|
||||
"captured_at": str(data.get("captured_at") or ""),
|
||||
"sec_uid": sec_uid,
|
||||
"douyin_uid": str(user_info.get("uid") or ""),
|
||||
"douyin_unique_id": str(user_info.get("unique_id") or ""),
|
||||
"douyin_signature": str(user_info.get("signature") or ""),
|
||||
"douyin_nickname": str(user_info.get("nickname") or ""),
|
||||
"douyin_url": douyin_url,
|
||||
}
|
||||
|
||||
|
||||
def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, parse_params_flag: bool) -> int:
|
||||
def export_to_excel(
|
||||
rows: List[Dict],
|
||||
output_path: str,
|
||||
include_extra: bool,
|
||||
parse_params_flag: bool,
|
||||
douyin_only: bool,
|
||||
) -> int:
|
||||
wb = Workbook()
|
||||
ws = wb.active
|
||||
ws.title = "lawyers"
|
||||
@@ -204,6 +237,22 @@ def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, par
|
||||
"list_url",
|
||||
]
|
||||
)
|
||||
if parse_params_flag and douyin_only:
|
||||
headers.extend(
|
||||
[
|
||||
"sec_uid",
|
||||
"抖音uid",
|
||||
"抖音号",
|
||||
"抖音昵称",
|
||||
"抖音简介",
|
||||
"抖音主页URL",
|
||||
"api_source",
|
||||
"api_url",
|
||||
"city_index",
|
||||
"captured_at",
|
||||
"captured_at_text",
|
||||
]
|
||||
)
|
||||
|
||||
ws.append(headers)
|
||||
for cell in ws[1]:
|
||||
@@ -250,6 +299,29 @@ def export_to_excel(rows: List[Dict], output_path: str, include_extra: bool, par
|
||||
]
|
||||
)
|
||||
|
||||
if parse_params_flag and douyin_only:
|
||||
captured_at_text = ""
|
||||
try:
|
||||
captured_at_text = ts_to_text(int(info.get("captured_at", "") or 0))
|
||||
except Exception:
|
||||
captured_at_text = ""
|
||||
|
||||
line.extend(
|
||||
[
|
||||
info.get("sec_uid", ""),
|
||||
info.get("douyin_uid", ""),
|
||||
info.get("douyin_unique_id", ""),
|
||||
info.get("douyin_nickname", ""),
|
||||
info.get("douyin_signature", ""),
|
||||
info.get("douyin_url", ""),
|
||||
info.get("api_source", ""),
|
||||
info.get("api_url", ""),
|
||||
info.get("city_index", ""),
|
||||
info.get("captured_at", ""),
|
||||
captured_at_text,
|
||||
]
|
||||
)
|
||||
|
||||
ws.append(line)
|
||||
exported += 1
|
||||
|
||||
@@ -277,6 +349,7 @@ def main() -> None:
|
||||
output_path=output_path,
|
||||
include_extra=args.include_extra,
|
||||
parse_params_flag=not args.no_parse_params,
|
||||
douyin_only=args.douyin_only,
|
||||
)
|
||||
|
||||
print(f"[export] 导出完成,共 {count} 条")
|
||||
|
||||
Reference in New Issue
Block a user