diff --git a/app/api/endpoints/download.py b/app/api/endpoints/download.py index 60b2c96d54..261f86fa6c 100644 --- a/app/api/endpoints/download.py +++ b/app/api/endpoints/download.py @@ -1,4 +1,7 @@ +import mimetypes import os +from pathlib import Path +import re import zipfile import subprocess import tempfile @@ -20,10 +23,59 @@ with open(config_path, 'r', encoding='utf-8') as file: config = yaml.safe_load(file) +DEFAULT_HEADERS = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" +} +SAFE_NAME_PATTERN = re.compile(r"[^A-Za-z0-9._-]") +ALLOWED_PLATFORMS = {"douyin", "tiktok", "bilibili"} +ALLOWED_DATA_TYPES = {"video", "image"} +ALLOWED_IMAGE_EXTENSIONS = {"jpg", "jpeg", "png", "webp", "gif"} + + +def sanitize_name(value: str, *, fallback: str) -> str: + value = SAFE_NAME_PATTERN.sub("_", (value or "").strip()) + value = value.strip("._") + return value or fallback + + +def resolve_download_dir(platform: str, data_type: str) -> Path: + safe_platform = sanitize_name(platform, fallback="unknown") + safe_data_type = sanitize_name(data_type, fallback="unknown") + if safe_platform not in ALLOWED_PLATFORMS: + raise HTTPException(status_code=400, detail="Unsupported platform") + if safe_data_type not in ALLOWED_DATA_TYPES: + raise HTTPException(status_code=400, detail="Unsupported media type") + + base_dir = Path(config.get("API", {}).get("Download_Path", "./download")).resolve() + target_dir = (base_dir / f"{safe_platform}_{safe_data_type}").resolve() + if base_dir != target_dir and base_dir not in target_dir.parents: + raise HTTPException(status_code=500, detail="Invalid download path") + target_dir.mkdir(parents=True, exist_ok=True) + return target_dir + + +def build_safe_file_path(directory: Path, filename: str) -> Path: + candidate = (directory / filename).resolve() + if directory != candidate.parent: + raise HTTPException(status_code=400, detail="Unsafe file name") + return candidate + + +def get_safe_image_extension(content_type: str | None) -> str: + if not content_type: + return "jpg" + mime_type = content_type.split(";", 1)[0].strip().lower() + extension = mimetypes.guess_extension(mime_type) or "" + extension = extension.lstrip(".").lower() + if extension == "jpe": + extension = "jpg" + if extension not in ALLOWED_IMAGE_EXTENSIONS: + return "jpg" + return extension + + async def fetch_data(url: str, headers: dict = None): - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } if headers is None else headers.get('headers') + headers = DEFAULT_HEADERS if headers is None else headers.get('headers', DEFAULT_HEADERS) async with httpx.AsyncClient() as client: response = await client.get(url, headers=headers) response.raise_for_status() # 确保响应是成功的 @@ -31,9 +83,7 @@ async def fetch_data(url: str, headers: dict = None): # 下载视频专用 async def fetch_data_stream(url: str, request:Request , headers: dict = None, file_path: str = None): - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } if headers is None else headers.get('headers') + headers = DEFAULT_HEADERS if headers is None else headers.get('headers', DEFAULT_HEADERS) async with httpx.AsyncClient() as client: # 启用流式请求 async with client.stream("GET", url, headers=headers) as response: @@ -165,21 +215,18 @@ async def download_file_hybrid(request: Request, try: data_type = data.get('type') platform = data.get('platform') - video_id = data.get('video_id') # 改为使用video_id - file_prefix = config.get("API").get("Download_File_Prefix") if prefix else '' - download_path = os.path.join(config.get("API").get("Download_Path"), f"{platform}_{data_type}") - - # 确保目录存在/Ensure the directory exists - os.makedirs(download_path, exist_ok=True) + video_id = sanitize_name(str(data.get('video_id')), fallback='media') # 改为使用video_id + file_prefix = sanitize_name(config.get("API", {}).get("Download_File_Prefix", ""), fallback="") if prefix else "" + download_dir = resolve_download_dir(platform, data_type) # 下载视频文件/Download video file if data_type == 'video': file_name = f"{file_prefix}{platform}_{video_id}.mp4" if not with_watermark else f"{file_prefix}{platform}_{video_id}_watermark.mp4" - file_path = os.path.join(download_path, file_name) + file_path = build_safe_file_path(download_dir, file_name) # 判断文件是否存在,存在就直接返回 if os.path.exists(file_path): - return FileResponse(path=file_path, media_type='video/mp4', filename=file_name) + return FileResponse(path=str(file_path), media_type='video/mp4', filename=file_name) # 获取对应平台的headers if platform == 'tiktok': @@ -201,7 +248,7 @@ async def download_file_hybrid(request: Request, ) # 使用专门的函数合并音视频 - success = await merge_bilibili_video_audio(video_url, audio_url, request, file_path, __headers.get('headers')) + success = await merge_bilibili_video_audio(video_url, audio_url, request, str(file_path), __headers.get('headers')) if not success: raise HTTPException( status_code=500, @@ -210,7 +257,7 @@ async def download_file_hybrid(request: Request, else: # 其他平台的常规处理 url = data.get('video_data').get('nwm_video_url_HQ') if not with_watermark else data.get('video_data').get('wm_video_url_HQ') - success = await fetch_data_stream(url, request, headers=__headers, file_path=file_path) + success = await fetch_data_stream(url, request, headers=__headers, file_path=str(file_path)) if not success: raise HTTPException( status_code=500, @@ -222,17 +269,17 @@ async def download_file_hybrid(request: Request, # await out_file.write(response.content) # 返回文件内容 - return FileResponse(path=file_path, filename=file_name, media_type="video/mp4") + return FileResponse(path=str(file_path), filename=file_name, media_type="video/mp4") # 下载图片文件/Download image file elif data_type == 'image': # 压缩文件属性/Compress file properties zip_file_name = f"{file_prefix}{platform}_{video_id}_images.zip" if not with_watermark else f"{file_prefix}{platform}_{video_id}_images_watermark.zip" - zip_file_path = os.path.join(download_path, zip_file_name) + zip_file_path = build_safe_file_path(download_dir, zip_file_name) # 判断文件是否存在,存在就直接返回、 if os.path.exists(zip_file_path): - return FileResponse(path=zip_file_path, filename=zip_file_name, media_type="application/zip") + return FileResponse(path=str(zip_file_path), filename=zip_file_name, media_type="application/zip") # 获取图片文件/Get image file urls = data.get('image_data').get('no_watermark_image_list') if not with_watermark else data.get( @@ -243,22 +290,22 @@ async def download_file_hybrid(request: Request, response = await fetch_data(url) index = int(urls.index(url)) content_type = response.headers.get('content-type') - file_format = content_type.split('/')[1] + file_format = get_safe_image_extension(content_type) file_name = f"{file_prefix}{platform}_{video_id}_{index + 1}.{file_format}" if not with_watermark else f"{file_prefix}{platform}_{video_id}_{index + 1}_watermark.{file_format}" - file_path = os.path.join(download_path, file_name) - image_file_list.append(file_path) + file_path = build_safe_file_path(download_dir, file_name) + image_file_list.append(str(file_path)) # 保存文件/Save file - async with aiofiles.open(file_path, 'wb') as out_file: + async with aiofiles.open(str(file_path), 'wb') as out_file: await out_file.write(response.content) # 压缩文件/Compress file - with zipfile.ZipFile(zip_file_path, 'w') as zip_file: + with zipfile.ZipFile(str(zip_file_path), 'w') as zip_file: for image_file in image_file_list: zip_file.write(image_file, os.path.basename(image_file)) # 返回压缩文件/Return compressed file - return FileResponse(path=zip_file_path, filename=zip_file_name, media_type="application/zip") + return FileResponse(path=str(zip_file_path), filename=zip_file_name, media_type="application/zip") # 异常处理/Exception handling except Exception as e: diff --git a/crawlers/douyin/web/config.yaml b/crawlers/douyin/web/config.yaml index 5603a79eb1..8f371eee7e 100644 --- a/crawlers/douyin/web/config.yaml +++ b/crawlers/douyin/web/config.yaml @@ -8,7 +8,7 @@ TokenManager: Referer: https://www.douyin.com/ # 你唯一需要修改的地方就是这里的Cookie,然后保存后重启程序即可。 # The only place you need to modify is the Cookie here, and then save and restart the program. - Cookie: __ac_nonce=067d687ac00d70af16eab; __ac_signature=_02B4Z6wo00f018O6kmgAAIDAR1H8JrcivBPDi5bAAJdBcf; ttwid=1%7C46sVJ6G5zO0ZRKBqbFef2B13U3CqP9gLwQEH8IV2y6A%7C1742112685%7Cae649397cca7dde21884d5f8e3e3d53eb2361aa64af04cd6889fa71d7f23344b; UIFID_TEMP=986fab8dfc2c74111fac2b883dbdee67777473ded35e2c4bebbf68cc8b91739da61f6b365ad9795b0aa3a8bddce6cc3e39c5d4fd4bad667aaefd3d3ec08baac66fe3b215343f12d8aae84e0a24048f44; douyin.com; device_web_cpu_core=16; device_web_memory_size=-1; architecture=amd64; hevc_supported=true; IsDouyinActive=true; home_can_add_dy_2_desktop=%220%22; dy_swidth=1835; dy_sheight=1147; stream_recommend_feed_params=%22%7B%5C%22cookie_enabled%5C%22%3Atrue%2C%5C%22screen_width%5C%22%3A1835%2C%5C%22screen_height%5C%22%3A1147%2C%5C%22browser_online%5C%22%3Atrue%2C%5C%22cpu_core_num%5C%22%3A16%2C%5C%22device_memory%5C%22%3A0%2C%5C%22downlink%5C%22%3A%5C%22%5C%22%2C%5C%22effective_type%5C%22%3A%5C%22%5C%22%2C%5C%22round_trip_time%5C%22%3A0%7D%22; strategyABtestKey=%221742112685.842%22; volume_info=%7B%22isUserMute%22%3Afalse%2C%22isMute%22%3Afalse%2C%22volume%22%3A0.5%7D; stream_player_status_params=%22%7B%5C%22is_auto_play%5C%22%3A0%2C%5C%22is_full_screen%5C%22%3A0%2C%5C%22is_full_webscreen%5C%22%3A0%2C%5C%22is_mute%5C%22%3A0%2C%5C%22is_speed%5C%22%3A1%2C%5C%22is_visible%5C%22%3A1%7D%22; xgplayer_user_id=835787001711; fpk1=U2FsdGVkX19Ke0llbjXpGOOr1Jeel/2GnaSJz41VO3mAFs271jC0hG7gdWlk+2pYLM4GF8TVGtwClCJIXsTKUw==; fpk2=2333b8d335abc6e14aef1caed0ae26fc; s_v_web_id=verify_m8bcww86_XfwSCnmj_5i3F_4Joq_8edO_9gRH9JENh07f; csrf_session_id=6f34e666e71445c9d39d8d06a347a13f; FORCE_LOGIN=%7B%22videoConsumedRemainSeconds%22%3A180%7D; biz_trace_id=c34e5eaf; passport_csrf_token=ab84b3e39ad78e719b236035a27379c0; passport_csrf_token_default=ab84b3e39ad78e719b236035a27379c0; __security_mc_1_s_sdk_crypt_sdk=ac2d56c3-44cd-a161; __security_mc_1_s_sdk_cert_key=ccf2bd2d-4718-b8de; __security_mc_1_s_sdk_sign_data_key_web_protect=9995d368-4e45-b17f; bd_ticket_guard_client_data=eyJiZC10aWNrZXQtZ3VhcmQtdmVyc2lvbiI6MiwiYmQtdGlja2V0LWd1YXJkLWl0ZXJhdGlvbi12ZXJzaW9uIjoxLCJiZC10aWNrZXQtZ3VhcmQtcmVlLXB1YmxpYy1rZXkiOiJCUHR2ZDlUeGU4UlhPaWdIczFqaStJWityQlF4UWZMKytiL2drbXlYUmNrelNua1lQUjJTRVZHVlo4MWFCU0EvSW4xSnBmbzN3TFlvSnhIZTZTV29DTmc9IiwiYmQtdGlja2V0LWd1YXJkLXdlYi12ZXJzaW9uIjoyfQ%3D%3D; bd_ticket_guard_client_web_domain=2; xg_device_score=8.208487995540095; sdk_source_info=7e276470716a68645a606960273f276364697660272927676c715a6d6069756077273f276364697660272927666d776a68605a607d71606b766c6a6b5a7666776c7571273f275e58272927666a6b766a69605a696c6061273f27636469766027292762696a6764695a7364776c6467696076273f275e5827292771273f27303035353c3337343437313234272927676c715a75776a716a666a69273f2763646976602778; bit_env=LVdHnIescW9BCGpo5gGuqIlwNfgj757SBdMhdZXBSWjPWbxp9Nv_B2vUt_LtEvr-ioRv0E9b8N8HWiOHe20JqcUhO4YmpIM6gB83hjgqZfmAhYEbzJR7z2bRViJaPg4xeoyGhwdjwK_Bzogp6uoUs4ov-P4JYzMh78i7jaY5Pzd6h3CaVO-eUKnTiFfUlJo_jmhSfHXGdwkurXwR4lO_UnU4Loqa0YlmDiyi0fPxURFIN5t4Ny6Ua8LLSYcUrBXHlXoQ5G4bQN4XqwuWwT9YauexXbkotU1Jv8pMJUiAhlFIMjbvfTutTSnOXJLoH_JsR_doifURl0wf8CIa_OcYw-A2VglrpbaFU6HDVTKbSRKovzIMY9bUwl_4EAiLBf87g2BU0Uz1MHd_lGNdH3ImEWpLtdRvUsW_KD7q87rPsEGVTceyQ5U3ZlETqoEOwOiggNGu5lL_1O8lt8_7eydeGA%3D%3D; gulu_source_res=eyJwX2luIjoiM2Y3NGJhZDgxMzc3OThkNmVkN2U5ZjM3NDMzNGJkYjMwNzRhYjI0ZWJhMDZkMzdmYWNiNjgzNTY2ZjY0OGUyNCJ9; passport_auth_mix_state=c534f2qcgpohqv4juisp74wq28e90snz + Cookie: PLEASE_REPLACE_WITH_YOUR_OWN_COOKIE proxies: http: diff --git a/crawlers/tiktok/web/config.yaml b/crawlers/tiktok/web/config.yaml index ef32371274..ffd97256ad 100644 --- a/crawlers/tiktok/web/config.yaml +++ b/crawlers/tiktok/web/config.yaml @@ -5,7 +5,7 @@ TokenManager: Referer: https://www.tiktok.com/ # 你唯一需要修改的地方就是这里的Cookie,然后保存后重启程序即可。 # The only place you need to modify is the Cookie here, and then save and restart the program. - Cookie: tt_csrf_token=bwnaRGd9-B-0ce8ntqw9jtGzAdvzTRKNpBl0; ak_bmsc=75A1956756DE42FD14ED069AAE7A8780~000000000000000000000000000000~YAAQXCw+F8jpmBGQAQAAIfGsFBj+ZEGzR/ZeiuPpMtItu0QQUQRmjBX2kADliy6QA9rZSfrxRUZc9zuRrI4/xbIrAwA/nkdguGpa+v3QSn/1sk5uP2aqLVm0eYB/SGNafa2h2QvIPbLNiSCRhgq1GalZJL4+udqDnyBRJWE74nin74bZwrVDvCX1s8M2hWqZ9/jTkdm4sfwON9MdJIEtjAPlddQ4gxoqjPoWhfnrm24dhPT4OjL1B8QP1mgurj7zJGspqD53VcjkAl65gHVxp3dwZ5WbPYpqrh9j8wo2u/Wh6uhX+0HWmkv5yVZyTyYQTl3/ilPp9G4CuIUi84gaPLjNYea9AEnphNX0ywzDa6/yegfqyE6r3wqBBDCrR1xRM98YEB4A5PV7pw==; tt_chain_token=ljZFLdRDfyfDflXMg5XGpg==; tiktok_webapp_theme_auto_dark_ab=1; tiktok_webapp_theme=dark; perf_feed_cache={%22expireTimestamp%22:1718503200000%2C%22itemIds%22:[%227348816520216186158%22%2C%227356022137678810410%22%2C%227349561209340857630%22]}; s_v_web_id=verify_lxe3l432_JnDE5WWo_URef_4WrS_88IM_fd1CqEXZs4dZ; passport_csrf_token=af197f073ed95f4dc2636f24d55566a6; passport_csrf_token_default=af197f073ed95f4dc2636f24d55566a6; ttwid=1%7CuNT4GcgvvOjH8rTETh9d9xti_QDJjlcnSK2V7djIpuc%7C1718333954%7Cf81b989a495aedff91302da4d0a3ab6055dea486fb203a4326b37d5a5346ad0c; msToken=1Mhpyi8MlaZjM6bbLDVUhCj_6C0kEO_1_Nb62ByXLg7wy_vLnBxdMFpKclhf4HYnEjCghk2Gq47ZM5jPj3L1yFxQUZvq4oPLo1b2Wfe_33RE94uIxdiR-eSueWbcYDDgOj1Pn9Wyid5Uf5fzBQ7xxFA=; bm_sv=9ADBA7BE06EC41817F117E2279F1410C~YAAQXCw+F8bsmBGQAQAAzSewFBg2fP3Zd0aky2x7S13D97O64xi8EXhoKORBnPQyCHlh0iSlh63FFjoy6peDWaF3lkWaTly3Z7I7WvWk1GCntnYzpJaSCE5EO2OL38zPWpHcgGWuekluvptHXsheedNEefN4SUHVMt4jJynWNeTKrao0RmNLkH4zGs7QO6+MPCt94QFvNfLjBRr0wVcXlN/hx9m6kcvCyzsBBqEnpugoYvZ0SMA+INsKI5PDfQz1~1; msToken=449_l3kdcLmnEHdDP0uACa5EcPVL1NbpjyVv8yah61EwxIPZRDlGwpGIkpIjH0Tk-CDtoKwFrDdP1v2AOpwmdoIz5oQzPEXCdyfGzcVXCHbwMX1fwPxMHpea5yFPUYEDlNWaCFlgLnejRdWeN5sB_lE= + Cookie: PLEASE_REPLACE_WITH_YOUR_OWN_COOKIE proxies: http: @@ -14,7 +14,7 @@ TokenManager: msToken: # 不要修改下面的内容。 # Do not modify the content below. - url: https://mssdk.tiktokw.us/web/report?msToken=1Ab-7YxR9lUHSem0PraI_XzdKmpHb6j50L8AaXLAd2aWTdoJCYLfX_67rVQFE4UwwHVHmyG_NfIipqrlLT3kCXps-5PYlNAqtdwEg7TrDyTAfCKyBrOLmhMUjB55oW8SPZ4_EkNxNFUdV7MquA== + url: https://mssdk.tiktokw.us/web/report?msToken=PLEASE_REPLACE_WITH_YOUR_OWN_MSTOKEN magic: 538969122 version: 1 dataType: 8 @@ -26,6 +26,6 @@ TokenManager: # Do not modify the content below. url: https://www.tiktok.com/ttwid/check/ data: '{"aid":1988,"service":"www.tiktok.com","union":false,"unionHost":"","needFid":false,"fid":"","migrate_priority":0}' - cookie: tt_csrf_token=YmksDB6a-h4cT2fF7JpORI2O9UBMCWjsntIc; ttwid=1%7C0FVb9fFc-sjDG2UdJwdC1AirqYozQ0xfbAS4N72vN2Y%7C1713886256%7C78a9d83445b82b73ca8d4e0cf024ea6cdf1329b7f3866c826b0a69a300ebce46; ak_bmsc=51B1D53481A3A4E4D0CEFF2BCF622DA2~000000000000000000000000000000~YAAQ7uIsF6c4j+SOAQAAANmUCxfRGVXZ4D9xnO97l1yDw0OWyomnVkNY7IUKaggUja0kQzFQ+WG4xaxBcPt0AN0n26KeHXGGKgHYpHPUMUBHGHQGDtE4RLyy7U+LPbSJCqVaSDiPuzxHht0YUIbWogvrFmBfkP4ohcmjkZxWtEI9qQ4Whaobb2CFHGdKNt0zlVNBjJQ3uYRAvUe12zSBynQB18y6QhE8goneRkCEw9VIeft2pFIwNQ8tkWWEjDt6wHNaqeND7eASg5WLzYskWbTt6bPAOhSNRLJ38HZrOB5QNg+xxN5uuCSYmjMXCl8SkvQr91pInmOng+V898FLLBQtefs95whvbpfE0mKwBk5Cz2TkkHcUJa/IoC0CLmNqoEk3AtKxpw/J; tt_chain_token=46Xkv2ukMzyJ2e7XU7y0AQ==; bm_sv=A2E67B998DE8E6A4F1C2C02485467446~YAAQ7uIsF6g4j+SOAQAABdqUCxf1J/K4dYG0k7bbw2m5rFujdlSqMoCKDubu4R602nFvbY6zWC5puJczBv3IXwJJRpQxxR03wDCMVlKTCqjQvgDs8BoCuoNQxfY2fdS+F3bKut2lxXPQ2qctqz4kHBrgspJArHn/zu/IuKCIeSzmV4KcyxW6Zvw3/xMRA0MeHgyuHsTRBS+VrFk8Ju2NbJWWC8uSHbLCM/dhFT7/ktw8RE30r24XpQmhLpVTsUSC~1; tiktok_webapp_theme=light; msToken=ySXERzKCE0QUG0cCg6TWLw3wfEB-6kh6kAfuzhzjcQvmV1jBFloSgIsT9xk-QXFVdI99U1Fqb9mhUpIOldoDkjdZwskB8rvt66MHZaHnvBRZRtOKtTYsWT8osDyQXDVZWdPkvyE598h9; passport_csrf_token=1a47d95ebf68fc3648b0018ee75afc9f; passport_csrf_token_default=1a47d95ebf68fc3648b0018ee75afc9f; perf_feed_cache={%22expireTimestamp%22:1714057200000%2C%22itemIds%22:[%227346425092966206766%22%2C%227353812964207594795%22%2C%227343343741916171563%22]}; msToken=yWwG-ITrCnjJbx5ltBa9FTXdCImOJrl-wtQJSQH3afeEumWZcbo_qcrF6F7-NjYcrG6JVxtJiOU208REZeCSgXEZrrs5_65K741fQ7PSzCGOhz6vUyycq3Xvj4Mu-S0kJ6SqyltHnpJp + cookie: PLEASE_REPLACE_WITH_YOUR_OWN_COOKIE odin_tt: url: https://www.tiktok.com/passport/web/account/info/?aid=1459&app_language=zh-Hans&app_name=tiktok_web&browser_language=zh-CN&browser_name=Mozilla&browser_online=true&browser_platform=Win32&browser_version=5.0%20%28Windows%20NT%2010.0%3B%20Win64%3B%20x64%29%20AppleWebKit%2F537.36%20%28KHTML%2C%20like%20Gecko%29%20Chrome%2F119.0.0.0%20Safari%2F537.36&channel=tiktok_web&cookie_enabled=true&device_id=7306060721837852167&root_referer=https%3A%2F%2Fwww.tiktok.com%2Flogin%2F \ No newline at end of file