Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 72 additions & 25 deletions app/api/endpoints/download.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import mimetypes
import os
from pathlib import Path
import re
import zipfile
import subprocess
import tempfile
Expand All @@ -20,20 +23,67 @@
with open(config_path, 'r', encoding='utf-8') as file:
config = yaml.safe_load(file)

# Browser-like request headers sent when a caller does not supply its own.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

# Matches every character that is NOT an ASCII letter, digit, ".", "_" or "-".
SAFE_NAME_PATTERN = re.compile(r"[^A-Za-z0-9._-]")

# Allow-lists consulted when building download directories and file names.
ALLOWED_PLATFORMS = {
    "douyin",
    "tiktok",
    "bilibili",
}
ALLOWED_DATA_TYPES = {
    "video",
    "image",
}
ALLOWED_IMAGE_EXTENSIONS = {
    "jpg",
    "jpeg",
    "png",
    "webp",
    "gif",
}


def sanitize_name(value: str, *, fallback: str) -> str:
    """Reduce ``value`` to a conservative token usable in a file name.

    Surrounding whitespace is trimmed, every character matched by
    ``SAFE_NAME_PATTERN`` is replaced with ``"_"``, and leading/trailing
    ``"."`` and ``"_"`` characters are removed.

    Args:
        value: Raw text; a falsy value (``None`` or ``""``) is treated as empty.
        fallback: Returned when nothing is left after cleaning.

    Returns:
        The cleaned token, or ``fallback`` if the cleaned token is empty.
    """
    trimmed = (value or "").strip()
    cleaned = SAFE_NAME_PATTERN.sub("_", trimmed).strip("._")
    if cleaned:
        return cleaned
    return fallback


def resolve_download_dir(platform: str, data_type: str) -> Path:
    """Return the download directory for a platform/media-type pair.

    Both arguments are passed through ``sanitize_name`` and then checked
    against ``ALLOWED_PLATFORMS`` / ``ALLOWED_DATA_TYPES``. The directory is
    ``<Download_Path>/<platform>_<data_type>`` (``Download_Path`` comes from
    the ``API`` section of the loaded config, defaulting to ``./download``)
    and is created if it does not exist yet.

    Args:
        platform: Platform name supplied with the request data.
        data_type: Media type supplied with the request data.

    Returns:
        The resolved, existing target directory.

    Raises:
        HTTPException: 400 if the sanitised platform or media type is not
            allow-listed; 500 if the resolved target is not located at or
            below the resolved base directory.
    """
    platform_name = sanitize_name(platform, fallback="unknown")
    media_kind = sanitize_name(data_type, fallback="unknown")

    if platform_name not in ALLOWED_PLATFORMS:
        raise HTTPException(status_code=400, detail="Unsupported platform")
    if media_kind not in ALLOWED_DATA_TYPES:
        raise HTTPException(status_code=400, detail="Unsupported media type")

    api_settings = config.get("API", {})
    download_root = Path(api_settings.get("Download_Path", "./download")).resolve()
    destination = (download_root / f"{platform_name}_{media_kind}").resolve()

    # Resolving follows symlinks, so re-check that the final location is still
    # the base directory itself or one of its descendants.
    if not destination.is_relative_to(download_root):
        raise HTTPException(status_code=500, detail="Invalid download path")

    destination.mkdir(parents=True, exist_ok=True)
    return destination


def build_safe_file_path(directory: Path, filename: str) -> Path:
    """Join ``filename`` onto ``directory`` and refuse anything that escapes it.

    The directory and the joined path are both resolved before they are
    compared, so a relative, symlinked or ``..``-containing ``directory`` is
    handled the same way as its canonical form.

    Args:
        directory: Directory the file must live in directly.
        filename: Plain file name (no directory components).

    Returns:
        The resolved path ``directory / filename``.

    Raises:
        HTTPException: 400 if the resolved path is not a direct child of the
            resolved directory (path separators, ``..``, absolute names, an
            empty name, or a symlink pointing elsewhere).
    """
    base_dir = Path(directory).resolve()
    candidate = (base_dir / filename).resolve()
    # ``candidate == base_dir`` covers an empty/"." file name; checking it
    # explicitly matters for a root directory, whose parent is itself.
    if candidate == base_dir or candidate.parent != base_dir:
        raise HTTPException(status_code=400, detail="Unsafe file name")
    return candidate


def get_safe_image_extension(content_type: str | None) -> str:
if not content_type:
return "jpg"
mime_type = content_type.split(";", 1)[0].strip().lower()
extension = mimetypes.guess_extension(mime_type) or ""
extension = extension.lstrip(".").lower()
if extension == "jpe":
extension = "jpg"
if extension not in ALLOWED_IMAGE_EXTENSIONS:
return "jpg"
return extension


async def fetch_data(url: str, headers: dict = None):
    """Fetch ``url`` with a GET request and return the response.

    Args:
        url: Address to request.
        headers: Optional wrapper mapping whose ``'headers'`` entry holds the
            HTTP headers to send. ``DEFAULT_HEADERS`` is used when the
            argument is ``None`` or the mapping has no ``'headers'`` entry.

    Returns:
        The response object returned by ``client.get``.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an unsuccessful
        response.
    """
    # Unwrap exactly once: unwrapping the already-unwrapped mapping a second
    # time would discard the caller's headers in favour of the default.
    request_headers = DEFAULT_HEADERS if headers is None else headers.get('headers', DEFAULT_HEADERS)
    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=request_headers)
        response.raise_for_status()  # Make sure the response is successful
        return response

# 下载视频专用
async def fetch_data_stream(url: str, request:Request , headers: dict = None, file_path: str = None):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
} if headers is None else headers.get('headers')
headers = DEFAULT_HEADERS if headers is None else headers.get('headers', DEFAULT_HEADERS)
async with httpx.AsyncClient() as client:
# 启用流式请求
async with client.stream("GET", url, headers=headers) as response:
Expand Down Expand Up @@ -165,21 +215,18 @@ async def download_file_hybrid(request: Request,
try:
data_type = data.get('type')
platform = data.get('platform')
video_id = data.get('video_id') # 改为使用video_id
file_prefix = config.get("API").get("Download_File_Prefix") if prefix else ''
download_path = os.path.join(config.get("API").get("Download_Path"), f"{platform}_{data_type}")

# 确保目录存在/Ensure the directory exists
os.makedirs(download_path, exist_ok=True)
video_id = sanitize_name(str(data.get('video_id')), fallback='media') # 改为使用video_id
file_prefix = sanitize_name(config.get("API", {}).get("Download_File_Prefix", ""), fallback="") if prefix else ""
download_dir = resolve_download_dir(platform, data_type)

# 下载视频文件/Download video file
if data_type == 'video':
file_name = f"{file_prefix}{platform}_{video_id}.mp4" if not with_watermark else f"{file_prefix}{platform}_{video_id}_watermark.mp4"
file_path = os.path.join(download_path, file_name)
file_path = build_safe_file_path(download_dir, file_name)

# 判断文件是否存在,存在就直接返回
if os.path.exists(file_path):
return FileResponse(path=file_path, media_type='video/mp4', filename=file_name)
return FileResponse(path=str(file_path), media_type='video/mp4', filename=file_name)

# 获取对应平台的headers
if platform == 'tiktok':
Expand All @@ -201,7 +248,7 @@ async def download_file_hybrid(request: Request,
)

# 使用专门的函数合并音视频
success = await merge_bilibili_video_audio(video_url, audio_url, request, file_path, __headers.get('headers'))
success = await merge_bilibili_video_audio(video_url, audio_url, request, str(file_path), __headers.get('headers'))
if not success:
raise HTTPException(
status_code=500,
Expand All @@ -210,7 +257,7 @@ async def download_file_hybrid(request: Request,
else:
# 其他平台的常规处理
url = data.get('video_data').get('nwm_video_url_HQ') if not with_watermark else data.get('video_data').get('wm_video_url_HQ')
success = await fetch_data_stream(url, request, headers=__headers, file_path=file_path)
success = await fetch_data_stream(url, request, headers=__headers, file_path=str(file_path))
if not success:
raise HTTPException(
status_code=500,
Expand All @@ -222,17 +269,17 @@ async def download_file_hybrid(request: Request,
# await out_file.write(response.content)

# 返回文件内容
return FileResponse(path=file_path, filename=file_name, media_type="video/mp4")
return FileResponse(path=str(file_path), filename=file_name, media_type="video/mp4")

# 下载图片文件/Download image file
elif data_type == 'image':
# 压缩文件属性/Compress file properties
zip_file_name = f"{file_prefix}{platform}_{video_id}_images.zip" if not with_watermark else f"{file_prefix}{platform}_{video_id}_images_watermark.zip"
zip_file_path = os.path.join(download_path, zip_file_name)
zip_file_path = build_safe_file_path(download_dir, zip_file_name)

# 判断文件是否存在,存在就直接返回、
if os.path.exists(zip_file_path):
return FileResponse(path=zip_file_path, filename=zip_file_name, media_type="application/zip")
return FileResponse(path=str(zip_file_path), filename=zip_file_name, media_type="application/zip")

# 获取图片文件/Get image file
urls = data.get('image_data').get('no_watermark_image_list') if not with_watermark else data.get(
Expand All @@ -243,22 +290,22 @@ async def download_file_hybrid(request: Request,
response = await fetch_data(url)
index = int(urls.index(url))
content_type = response.headers.get('content-type')
file_format = content_type.split('/')[1]
file_format = get_safe_image_extension(content_type)
file_name = f"{file_prefix}{platform}_{video_id}_{index + 1}.{file_format}" if not with_watermark else f"{file_prefix}{platform}_{video_id}_{index + 1}_watermark.{file_format}"
file_path = os.path.join(download_path, file_name)
image_file_list.append(file_path)
file_path = build_safe_file_path(download_dir, file_name)
image_file_list.append(str(file_path))

# 保存文件/Save file
async with aiofiles.open(file_path, 'wb') as out_file:
async with aiofiles.open(str(file_path), 'wb') as out_file:
await out_file.write(response.content)

# 压缩文件/Compress file
with zipfile.ZipFile(zip_file_path, 'w') as zip_file:
with zipfile.ZipFile(str(zip_file_path), 'w') as zip_file:
for image_file in image_file_list:
zip_file.write(image_file, os.path.basename(image_file))

# 返回压缩文件/Return compressed file
return FileResponse(path=zip_file_path, filename=zip_file_name, media_type="application/zip")
return FileResponse(path=str(zip_file_path), filename=zip_file_name, media_type="application/zip")

# 异常处理/Exception handling
except Exception as e:
Expand Down
2 changes: 1 addition & 1 deletion crawlers/douyin/web/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ TokenManager:
Referer: https://www.douyin.com/
# 你唯一需要修改的地方就是这里的Cookie,然后保存后重启程序即可。
# The only place you need to modify is the Cookie here, and then save and restart the program.
Cookie: __ac_nonce=067d687ac00d70af16eab; __ac_signature=_02B4Z6wo00f018O6kmgAAIDAR1H8JrcivBPDi5bAAJdBcf; ttwid=1%7C46sVJ6G5zO0ZRKBqbFef2B13U3CqP9gLwQEH8IV2y6A%7C1742112685%7Cae649397cca7dde21884d5f8e3e3d53eb2361aa64af04cd6889fa71d7f23344b; UIFID_TEMP=986fab8dfc2c74111fac2b883dbdee67777473ded35e2c4bebbf68cc8b91739da61f6b365ad9795b0aa3a8bddce6cc3e39c5d4fd4bad667aaefd3d3ec08baac66fe3b215343f12d8aae84e0a24048f44; douyin.com; device_web_cpu_core=16; device_web_memory_size=-1; architecture=amd64; hevc_supported=true; IsDouyinActive=true; home_can_add_dy_2_desktop=%220%22; dy_swidth=1835; dy_sheight=1147; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1835%2C%5C%22screen_height%5C%22%3A1147%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A16%2C%5C%22device_memory%5C%22%3A0%2C%5C%22downlink%5C%22%3A%5C%22%5C%22%2C%5C%22effective_type%5C%22%3A%5C%22%5C%22%2C%5C%22round_trip_time%5C%22%3A0%7D%22; strategyABtestKey=%221742112685.842%22; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.5%7D; stream_player_status_params=%22%7B%5C%22is_auto_play%5C%22%3A0%2C%5C%22is_full_screen%5C%22%3A0%2C%5C%22is_full_webscreen%5C%22%3A0%2C%5C%22is_mute%5C%22%3A0%2C%5C%22is_speed%5C%22%3A1%2C%5C%22is_visible%5C%22%3A1%7D%22; xgplayer_user_id=835787001711; fpk1=U2FsdGVkX19Ke0llbjXpGOOr1Jeel/2GnaSJz41VO3mAFs271jC0hG7gdWlk+2pYLM4GF8TVGtwClCJIXsTKUw==; fpk2=2333b8d335abc6e14aef1caed0ae26fc; s_v_web_id=verify_m8bcww86_XfwSCnmj_5i3F_4Joq_8edO_9gRH9JENh07f; csrf_session_id=6f34e666e71445c9d39d8d06a347a13f; FORCE_LOGIN=%7B%22videoConsumedRemainSeconds%22%3A180%7D; biz_trace_id=c34e5eaf; passport_csrf_token=ab84b3e39ad78e719b236035a27379c0; passport_csrf_token_default=ab84b3e39ad78e719b236035a27379c0; __security_mc_1_s_sdk_crypt_sdk=ac2d56c3-44cd-a161; __security_mc_1_s_sdk_cert_key=ccf2bd2d-4718-b8de; __security_mc_1_s_sdk_sign_data_key_web_protect=9995d368-4e45-b17f; 
bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCUHR2ZDlUeGU4UlhPaWdIczFqaStJWityQlF4UWZMKytiL2drbXlYUmNrelNua1lQUjJTRVZHVlo4MWFCU0EvSW4xSnBmbzN3TFlvSnhIZTZTV29DTmc9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoyfQ%3D%3D; bd_ticket_guard_client_web_domain=2; xg_device_score=8.208487995540095; sdk_source_info=7e276470716a68645a606960273f276364697660272927676c715a6d6069756077273f276364697660272927666d776a68605a607d71606b766c6a6b5a7666776c7571273f275e58272927666a6b766a69605a696c6061273f27636469766027292762696a6764695a7364776c6467696076273f275e5827292771273f27303035353c3337343437313234272927676c715a75776a716a666a69273f2763646976602778; bit_env=LVdHnIescW9BCGpo5gGuqIlwNfgj757SBdMhdZXBSWjPWbxp9Nv_B2vUt_LtEvr-ioRv0E9b8N8HWiOHe20JqcUhO4YmpIM6gB83hjgqZfmAhYEbzJR7z2bRViJaPg4xeoyGhwdjwK_Bzogp6uoUs4ov-P4JYzMh78i7jaY5Pzd6h3CaVO-eUKnTiFfUlJo_jmhSfHXGdwkurXwR4lO_UnU4Loqa0YlmDiyi0fPxURFIN5t4Ny6Ua8LLSYcUrBXHlXoQ5G4bQN4XqwuWwT9YauexXbkotU1Jv8pMJUiAhlFIMjbvfTutTSnOXJLoH_JsR_doifURl0wf8CIa_OcYw-A2VglrpbaFU6HDVTKbSRKovzIMY9bUwl_4EAiLBf87g2BU0Uz1MHd_lGNdH3ImEWpLtdRvUsW_KD7q87rPsEGVTceyQ5U3ZlETqoEOwOiggNGu5lL_1O8lt8_7eydeGA%3D%3D; gulu_source_res=eyJwX2luIjoiM2Y3NGJhZDgxMzc3OThkNmVkN2U5ZjM3NDMzNGJkYjMwNzRhYjI0ZWJhMDZkMzdmYWNiNjgzNTY2ZjY0OGUyNCJ9; passport_auth_mix_state=c534f2qcgpohqv4juisp74wq28e90snz
Cookie: PLEASE_REPLACE_WITH_YOUR_OWN_COOKIE

proxies:
http:
Expand Down
6 changes: 3 additions & 3 deletions crawlers/tiktok/web/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ TokenManager:
Referer: https://www.tiktok.com/
# 你唯一需要修改的地方就是这里的Cookie,然后保存后重启程序即可。
# The only place you need to modify is the Cookie here, and then save and restart the program.
Cookie: tt_csrf_token=bwnaRGd9-B-0ce8ntqw9jtGzAdvzTRKNpBl0; ak_bmsc=75A1956756DE42FD14ED069AAE7A8780~000000000000000000000000000000~YAAQXCw+F8jpmBGQAQAAIfGsFBj+ZEGzR/ZeiuPpMtItu0QQUQRmjBX2kADliy6QA9rZSfrxRUZc9zuRrI4/xbIrAwA/nkdguGpa+v3QSn/1sk5uP2aqLVm0eYB/SGNafa2h2QvIPbLNiSCRhgq1GalZJL4+udqDnyBRJWE74nin74bZwrVDvCX1s8M2hWqZ9/jTkdm4sfwON9MdJIEtjAPlddQ4gxoqjPoWhfnrm24dhPT4OjL1B8QP1mgurj7zJGspqD53VcjkAl65gHVxp3dwZ5WbPYpqrh9j8wo2u/Wh6uhX+0HWmkv5yVZyTyYQTl3/ilPp9G4CuIUi84gaPLjNYea9AEnphNX0ywzDa6/yegfqyE6r3wqBBDCrR1xRM98YEB4A5PV7pw==; tt_chain_token=ljZFLdRDfyfDflXMg5XGpg==; tiktok_webapp_theme_auto_dark_ab=1; tiktok_webapp_theme=dark; perf_feed_cache={%22expireTimestamp%22:1718503200000%2C%22itemIds%22:[%227348816520216186158%22%2C%227356022137678810410%22%2C%227349561209340857630%22]}; s_v_web_id=verify_lxe3l432_JnDE5WWo_URef_4WrS_88IM_fd1CqEXZs4dZ; passport_csrf_token=af197f073ed95f4dc2636f24d55566a6; passport_csrf_token_default=af197f073ed95f4dc2636f24d55566a6; ttwid=1%7CuNT4GcgvvOjH8rTETh9d9xti_QDJjlcnSK2V7djIpuc%7C1718333954%7Cf81b989a495aedff91302da4d0a3ab6055dea486fb203a4326b37d5a5346ad0c; msToken=1Mhpyi8MlaZjM6bbLDVUhCj_6C0kEO_1_Nb62ByXLg7wy_vLnBxdMFpKclhf4HYnEjCghk2Gq47ZM5jPj3L1yFxQUZvq4oPLo1b2Wfe_33RE94uIxdiR-eSueWbcYDDgOj1Pn9Wyid5Uf5fzBQ7xxFA=; bm_sv=9ADBA7BE06EC41817F117E2279F1410C~YAAQXCw+F8bsmBGQAQAAzSewFBg2fP3Zd0aky2x7S13D97O64xi8EXhoKORBnPQyCHlh0iSlh63FFjoy6peDWaF3lkWaTly3Z7I7WvWk1GCntnYzpJaSCE5EO2OL38zPWpHcgGWuekluvptHXsheedNEefN4SUHVMt4jJynWNeTKrao0RmNLkH4zGs7QO6+MPCt94QFvNfLjBRr0wVcXlN/hx9m6kcvCyzsBBqEnpugoYvZ0SMA+INsKI5PDfQz1~1; msToken=449_l3kdcLmnEHdDP0uACa5EcPVL1NbpjyVv8yah61EwxIPZRDlGwpGIkpIjH0Tk-CDtoKwFrDdP1v2AOpwmdoIz5oQzPEXCdyfGzcVXCHbwMX1fwPxMHpea5yFPUYEDlNWaCFlgLnejRdWeN5sB_lE=
Cookie: PLEASE_REPLACE_WITH_YOUR_OWN_COOKIE

proxies:
http:
Expand All @@ -14,7 +14,7 @@ TokenManager:
msToken:
# 不要修改下面的内容。
# Do not modify the content below.
url: https://mssdk.tiktokw.us/web/report?msToken=1Ab-7YxR9lUHSem0PraI_XzdKmpHb6j50L8AaXLAd2aWTdoJCYLfX_67rVQFE4UwwHVHmyG_NfIipqrlLT3kCXps-5PYlNAqtdwEg7TrDyTAfCKyBrOLmhMUjB55oW8SPZ4_EkNxNFUdV7MquA==
url: https://mssdk.tiktokw.us/web/report?msToken=PLEASE_REPLACE_WITH_YOUR_OWN_MSTOKEN
magic: 538969122
version: 1
dataType: 8
Expand All @@ -26,6 +26,6 @@ TokenManager:
# Do not modify the content below.
url: https://www.tiktok.com/ttwid/check/
data: '{"aid":1988,"service":"www.tiktok.com","union":false,"unionHost":"","needFid":false,"fid":"","migrate_priority":0}'
cookie: tt_csrf_token=YmksDB6a-h4cT2fF7JpORI2O9UBMCWjsntIc; ttwid=1%7C0FVb9fFc-sjDG2UdJwdC1AirqYozQ0xfbAS4N72vN2Y%7C1713886256%7C78a9d83445b82b73ca8d4e0cf024ea6cdf1329b7f3866c826b0a69a300ebce46; ak_bmsc=51B1D53481A3A4E4D0CEFF2BCF622DA2~000000000000000000000000000000~YAAQ7uIsF6c4j+SOAQAAANmUCxfRGVXZ4D9xnO97l1yDw0OWyomnVkNY7IUKaggUja0kQzFQ+WG4xaxBcPt0AN0n26KeHXGGKgHYpHPUMUBHGHQGDtE4RLyy7U+LPbSJCqVaSDiPuzxHht0YUIbWogvrFmBfkP4ohcmjkZxWtEI9qQ4Whaobb2CFHGdKNt0zlVNBjJQ3uYRAvUe12zSBynQB18y6QhE8goneRkCEw9VIeft2pFIwNQ8tkWWEjDt6wHNaqeND7eASg5WLzYskWbTt6bPAOhSNRLJ38HZrOB5QNg+xxN5uuCSYmjMXCl8SkvQr91pInmOng+V898FLLBQtefs95whvbpfE0mKwBk5Cz2TkkHcUJa/IoC0CLmNqoEk3AtKxpw/J; tt_chain_token=46Xkv2ukMzyJ2e7XU7y0AQ==; bm_sv=A2E67B998DE8E6A4F1C2C02485467446~YAAQ7uIsF6g4j+SOAQAABdqUCxf1J/K4dYG0k7bbw2m5rFujdlSqMoCKDubu4R602nFvbY6zWC5puJczBv3IXwJJRpQxxR03wDCMVlKTCqjQvgDs8BoCuoNQxfY2fdS+F3bKut2lxXPQ2qctqz4kHBrgspJArHn/zu/IuKCIeSzmV4KcyxW6Zvw3/xMRA0MeHgyuHsTRBS+VrFk8Ju2NbJWWC8uSHbLCM/dhFT7/ktw8RE30r24XpQmhLpVTsUSC~1; tiktok_webapp_theme=light; msToken=ySXERzKCE0QUG0cCg6TWLw3wfEB-6kh6kAfuzhzjcQvmV1jBFloSgIsT9xk-QXFVdI99U1Fqb9mhUpIOldoDkjdZwskB8rvt66MHZaHnvBRZRtOKtTYsWT8osDyQXDVZWdPkvyE598h9; passport_csrf_token=1a47d95ebf68fc3648b0018ee75afc9f; passport_csrf_token_default=1a47d95ebf68fc3648b0018ee75afc9f; perf_feed_cache={%22expireTimestamp%22:1714057200000%2C%22itemIds%22:[%227346425092966206766%22%2C%227353812964207594795%22%2C%227343343741916171563%22]}; msToken=yWwG-ITrCnjJbx5ltBa9FTXdCImOJrl-wtQJSQH3afeEumWZcbo_qcrF6F7-NjYcrG6JVxtJiOU208REZeCSgXEZrrs5_65K741fQ7PSzCGOhz6vUyycq3Xvj4Mu-S0kJ6SqyltHnpJp
cookie: PLEASE_REPLACE_WITH_YOUR_OWN_COOKIE
odin_tt:
url: https://www.tiktok.com/passport/web/account/info/?aid=1459&app_language=zh-Hans&app_name=tiktok_web&browser_language=zh-CN&browser_name=Mozilla&browser_online=true&browser_platform=Win32&browser_version=5.0%20%28Windows%20NT%2010.0%3B%20Win64%3B%20x64%29%20AppleWebKit%2F537.36%20%28KHTML%2C%20like%20Gecko%29%20Chrome%2F119.0.0.0%20Safari%2F537.36&channel=tiktok_web&cookie_enabled=true&device_id=7306060721837852167&root_referer=https%3A%2F%2Fwww.tiktok.com%2Flogin%2F